libcpp/lex.cc

   1 /* CPP Library - lexical analysis.
   2    Copyright (C) 2000-2022 Free Software Foundation, Inc.
   3    Contributed by Per Bothner, 1994-95.
   4    Based on CCCP program by Paul Rubin, June 1986
   5    Adapted to ANSI C, Richard Stallman, Jan 1987
   6    Broken out to separate file, Zack Weinberg, Mar 2000
   7
   8 This program is free software; you can redistribute it and/or modify it
   9 under the terms of the GNU General Public License as published by the
  10 Free Software Foundation; either version 3, or (at your option) any
  11 later version.
  12
  13 This program is distributed in the hope that it will be useful,
  14 but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16 GNU General Public License for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with this program; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "cpplib.h"
  25 #include "internal.h"
  26
  27 enum spell_type
  28 {
  29   SPELL_OPERATOR = 0,
  30   SPELL_IDENT,
  31   SPELL_LITERAL,
  32   SPELL_NONE
  33 };
  34
  35 struct token_spelling
  36 {
  37   enum spell_type category;
  38   const unsigned char *name;
  39 };
  40
  41 static const unsigned char *const digraph_spellings[] =
  42 { UC"%:", UC"%:%:", UC"<:", UC":>", UC"<%", UC"%>" };
  43
  44 #define OP(e, s) { SPELL_OPERATOR, UC s  },
  45 #define TK(e, s) { SPELL_ ## s,    UC #e },
  46 static const struct token_spelling token_spellings[N_TTYPES] = { TTYPE_TABLE };
  47 #undef OP
  48 #undef TK
  49
  50 #define TOKEN_SPELL(token) (token_spellings[(token)->type].category)
  51 #define TOKEN_NAME(token) (token_spellings[(token)->type].name)
  52
  53 /* ISO 10646 defines the UCS codespace as the range 0-0x10FFFF inclusive.  */
  54 #define UCS_LIMIT 0x10FFFF
  55
  56 static void add_line_note (cpp_buffer *, const uchar *, unsigned int);
  57 static int skip_line_comment (cpp_reader *);
  58 static void skip_whitespace (cpp_reader *, cppchar_t);
  59 static void lex_string (cpp_reader *, cpp_token *, const uchar *);
  60 static void save_comment (cpp_reader *, cpp_token *, const uchar *, cppchar_t);
  61 static void store_comment (cpp_reader *, cpp_token *);
  62 static void create_literal (cpp_reader *, cpp_token *, const uchar *,
  63                             unsigned int, enum cpp_ttype);
  64 static bool warn_in_comment (cpp_reader *, _cpp_line_note *);
  65 static int name_p (cpp_reader *, const cpp_string *);
  66 static tokenrun *next_tokenrun (tokenrun *);
  67
  68 static _cpp_buff *new_buff (size_t);
  69
  70
  71 /* Utility routine:
  72
  73    Compares, the token TOKEN to the NUL-terminated string STRING.
  74    TOKEN must be a CPP_NAME.  Returns 1 for equal, 0 for unequal.  */
  75 int
  76 cpp_ideq (const cpp_token *token, const char *string)
  77 {
  78   if (token->type != CPP_NAME)
  79     return 0;
  80
  81   return !ustrcmp (NODE_NAME (token->val.node.node), (const uchar *) string);
  82 }
  83
  84 /* Record a note TYPE at byte POS into the current cleaned logical
  85    line.  */
  86 static void
  87 add_line_note (cpp_buffer *buffer, const uchar *pos, unsigned int type)
  88 {
  89   if (buffer->notes_used == buffer->notes_cap)
  90     {
  91       buffer->notes_cap = buffer->notes_cap * 2 + 200;
  92       buffer->notes = XRESIZEVEC (_cpp_line_note, buffer->notes,
  93                                   buffer->notes_cap);
  94     }
  95
  96   buffer->notes[buffer->notes_used].pos = pos;
  97   buffer->notes[buffer->notes_used].type = type;
  98   buffer->notes_used++;
  99 }
 100
 101 \f
 102 /* Fast path to find line special characters using optimized character
 103    scanning algorithms.  Anything complicated falls back to the slow
 104    path below.  Since this loop is very hot it's worth doing these kinds
 105    of optimizations.
 106
 107    One of the paths through the ifdefs should provide
 108
 109      const uchar *search_line_fast (const uchar *s, const uchar *end);
 110
 111    Between S and END, search for \n, \r, \\, ?.  Return a pointer to
 112    the found character.
 113
 114    Note that the last character of the buffer is *always* a newline,
 115    as forced by _cpp_convert_input.  This fact can be used to avoid
 116    explicitly looking for the end of the buffer.  */
 117
 118 /* Configure gives us an ifdef test.  */
 119 #ifndef WORDS_BIGENDIAN
 120 #define WORDS_BIGENDIAN 0
 121 #endif
 122
 123 /* We'd like the largest integer that fits into a register.  There's nothing
 124    in <stdint.h> that gives us that.  For most hosts this is unsigned long,
 125    but MS decided on an LLP64 model.  Thankfully when building with GCC we
 126    can get the "real" word size.  */
 127 #ifdef __GNUC__
 128 typedef unsigned int word_type __attribute__((__mode__(__word__)));
 129 #else
 130 typedef unsigned long word_type;
 131 #endif
 132
 133 /* The code below is only expecting sizes 4 or 8.
 134    Die at compile-time if this expectation is violated.  */
 135 typedef char check_word_type_size
 136   [(sizeof(word_type) == 8 || sizeof(word_type) == 4) * 2 - 1];
 137
 138 /* Return X with the first N bytes forced to values that won't match one
 139    of the interesting characters.  Note that NUL is not interesting.  */
 140
 141 static inline word_type
 142 acc_char_mask_misalign (word_type val, unsigned int n)
 143 {
 144   word_type mask = -1;
 145   if (WORDS_BIGENDIAN)
 146     mask >>= n * 8;
 147   else
 148     mask <<= n * 8;
 149   return val & mask;
 150 }
 151
 152 /* Return X replicated to all byte positions within WORD_TYPE.  */
 153
 154 static inline word_type
 155 acc_char_replicate (uchar x)
 156 {
 157   word_type ret;
 158
 159   ret = (x << 24) | (x << 16) | (x << 8) | x;
 160   if (sizeof(word_type) == 8)
 161     ret = (ret << 16 << 16) | ret;
 162   return ret;
 163 }
 164
 165 /* Return non-zero if some byte of VAL is (probably) C.  */
 166
 167 static inline word_type
 168 acc_char_cmp (word_type val, word_type c)
 169 {
 170 #if defined(__GNUC__) && defined(__alpha__)
 171   /* We can get exact results using a compare-bytes instruction.
 172      Get (val == c) via (0 >= (val ^ c)).  */
 173   return __builtin_alpha_cmpbge (0, val ^ c);
 174 #else
 175   word_type magic = 0x7efefefeU;
 176   if (sizeof(word_type) == 8)
 177     magic = (magic << 16 << 16) | 0xfefefefeU;
 178   magic |= 1;
 179
 180   val ^= c;
 181   return ((val + magic) ^ ~val) & ~magic;
 182 #endif
 183 }
 184
 185 /* Given the result of acc_char_cmp is non-zero, return the index of
 186    the found character.  If this was a false positive, return -1.  */
 187
 188 static inline int
 189 acc_char_index (word_type cmp ATTRIBUTE_UNUSED,
 190                 word_type val ATTRIBUTE_UNUSED)
 191 {
 192 #if defined(__GNUC__) && defined(__alpha__) && !WORDS_BIGENDIAN
 193   /* The cmpbge instruction sets *bits* of the result corresponding to
 194      matches in the bytes with no false positives.  */
 195   return __builtin_ctzl (cmp);
 196 #else
 197   unsigned int i;
 198
 199   /* ??? It would be nice to force unrolling here,
 200      and have all of these constants folded.  */
 201   for (i = 0; i < sizeof(word_type); ++i)
 202     {
 203       uchar c;
 204       if (WORDS_BIGENDIAN)
 205         c = (val >> (sizeof(word_type) - i - 1) * 8) & 0xff;
 206       else
 207         c = (val >> i * 8) & 0xff;
 208
 209       if (c == '\n' || c == '\r' || c == '\\' || c == '?')
 210         return i;
 211     }
 212
 213   return -1;
 214 #endif
 215 }
 216
 217 /* A version of the fast scanner using bit fiddling techniques.
 218
 219    For 32-bit words, one would normally perform 16 comparisons and
 220    16 branches.  With this algorithm one performs 24 arithmetic
 221    operations and one branch.  Whether this is faster with a 32-bit
 222    word size is going to be somewhat system dependent.
 223
 224    For 64-bit words, we eliminate twice the number of comparisons
 225    and branches without increasing the number of arithmetic operations.
 226    It's almost certainly going to be a win with 64-bit word size.  */
 227
 228 static const uchar * search_line_acc_char (const uchar *, const uchar *)
 229   ATTRIBUTE_UNUSED;
 230
 231 static const uchar *
 232 search_line_acc_char (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 233 {
 234   const word_type repl_nl = acc_char_replicate ('\n');
 235   const word_type repl_cr = acc_char_replicate ('\r');
 236   const word_type repl_bs = acc_char_replicate ('\\');
 237   const word_type repl_qm = acc_char_replicate ('?');
 238
 239   unsigned int misalign;
 240   const word_type *p;
 241   word_type val, t;
 242
 243   /* Align the buffer.  Mask out any bytes from before the beginning.  */
 244   p = (word_type *)((uintptr_t)s & -sizeof(word_type));
 245   val = *p;
 246   misalign = (uintptr_t)s & (sizeof(word_type) - 1);
 247   if (misalign)
 248     val = acc_char_mask_misalign (val, misalign);
 249
 250   /* Main loop.  */
 251   while (1)
 252     {
 253       t  = acc_char_cmp (val, repl_nl);
 254       t |= acc_char_cmp (val, repl_cr);
 255       t |= acc_char_cmp (val, repl_bs);
 256       t |= acc_char_cmp (val, repl_qm);
 257
 258       if (__builtin_expect (t != 0, 0))
 259         {
 260           int i = acc_char_index (t, val);
 261           if (i >= 0)
 262             return (const uchar *)p + i;
 263         }
 264
 265       val = *++p;
 266     }
 267 }
 268
 269 /* Disable on Solaris 2/x86 until the following problem can be properly
 270    autoconfed:
 271
 272    The Solaris 10+ assembler tags objects with the instruction set
 273    extensions used, so SSE4.2 executables cannot run on machines that
 274    don't support that extension.  */
 275
 276 #if (GCC_VERSION >= 4005) && (__GNUC__ >= 5 || !defined(__PIC__)) && (defined(__i386__) || defined(__x86_64__)) && !(defined(__sun__) && defined(__svr4__))
 277
 278 /* Replicated character data to be shared between implementations.
 279    Recall that outside of a context with vector support we can't
 280    define compatible vector types, therefore these are all defined
 281    in terms of raw characters.  */
 282 static const char repl_chars[4][16] __attribute__((aligned(16))) = {
 283   { '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
 284     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n' },
 285   { '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
 286     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r' },
 287   { '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
 288     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\' },
 289   { '?', '?', '?', '?', '?', '?', '?', '?',
 290     '?', '?', '?', '?', '?', '?', '?', '?' },
 291 };
 292
 293 /* A version of the fast scanner using MMX vectorized byte compare insns.
 294
 295    This uses the PMOVMSKB instruction which was introduced with "MMX2",
 296    which was packaged into SSE1; it is also present in the AMD MMX
 297    extension.  Mark the function as using "sse" so that we emit a real
 298    "emms" instruction, rather than the 3dNOW "femms" instruction.  */
 299
 300 static const uchar *
 301 #ifndef __SSE__
 302 __attribute__((__target__("sse")))
 303 #endif
 304 search_line_mmx (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 305 {
 306   typedef char v8qi __attribute__ ((__vector_size__ (8)));
 307   typedef int __m64 __attribute__ ((__vector_size__ (8), __may_alias__));
 308
 309   const v8qi repl_nl = *(const v8qi *)repl_chars[0];
 310   const v8qi repl_cr = *(const v8qi *)repl_chars[1];
 311   const v8qi repl_bs = *(const v8qi *)repl_chars[2];
 312   const v8qi repl_qm = *(const v8qi *)repl_chars[3];
 313
 314   unsigned int misalign, found, mask;
 315   const v8qi *p;
 316   v8qi data, t, c;
 317
 318   /* Align the source pointer.  While MMX doesn't generate unaligned data
 319      faults, this allows us to safely scan to the end of the buffer without
 320      reading beyond the end of the last page.  */
 321   misalign = (uintptr_t)s & 7;
 322   p = (const v8qi *)((uintptr_t)s & -8);
 323   data = *p;
 324
 325   /* Create a mask for the bytes that are valid within the first
 326      16-byte block.  The Idea here is that the AND with the mask
 327      within the loop is "free", since we need some AND or TEST
 328      insn in order to set the flags for the branch anyway.  */
 329   mask = -1u << misalign;
 330
 331   /* Main loop processing 8 bytes at a time.  */
 332   goto start;
 333   do
 334     {
 335       data = *++p;
 336       mask = -1;
 337
 338     start:
 339       t = __builtin_ia32_pcmpeqb(data, repl_nl);
 340       c = __builtin_ia32_pcmpeqb(data, repl_cr);
 341       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
 342       c = __builtin_ia32_pcmpeqb(data, repl_bs);
 343       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
 344       c = __builtin_ia32_pcmpeqb(data, repl_qm);
 345       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
 346       found = __builtin_ia32_pmovmskb (t);
 347       found &= mask;
 348     }
 349   while (!found);
 350
 351   __builtin_ia32_emms ();
 352
 353   /* FOUND contains 1 in bits for which we matched a relevant
 354      character.  Conversion to the byte index is trivial.  */
 355   found = __builtin_ctz(found);
 356   return (const uchar *)p + found;
 357 }
 358
 359 /* A version of the fast scanner using SSE2 vectorized byte compare insns.  */
 360
 361 static const uchar *
 362 #ifndef __SSE2__
 363 __attribute__((__target__("sse2")))
 364 #endif
 365 search_line_sse2 (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 366 {
 367   typedef char v16qi __attribute__ ((__vector_size__ (16)));
 368
 369   const v16qi repl_nl = *(const v16qi *)repl_chars[0];
 370   const v16qi repl_cr = *(const v16qi *)repl_chars[1];
 371   const v16qi repl_bs = *(const v16qi *)repl_chars[2];
 372   const v16qi repl_qm = *(const v16qi *)repl_chars[3];
 373
 374   unsigned int misalign, found, mask;
 375   const v16qi *p;
 376   v16qi data, t;
 377
 378   /* Align the source pointer.  */
 379   misalign = (uintptr_t)s & 15;
 380   p = (const v16qi *)((uintptr_t)s & -16);
 381   data = *p;
 382
 383   /* Create a mask for the bytes that are valid within the first
 384      16-byte block.  The Idea here is that the AND with the mask
 385      within the loop is "free", since we need some AND or TEST
 386      insn in order to set the flags for the branch anyway.  */
 387   mask = -1u << misalign;
 388
 389   /* Main loop processing 16 bytes at a time.  */
 390   goto start;
 391   do
 392     {
 393       data = *++p;
 394       mask = -1;
 395
 396     start:
 397       t  = data == repl_nl;
 398       t |= data == repl_cr;
 399       t |= data == repl_bs;
 400       t |= data == repl_qm;
 401       found = __builtin_ia32_pmovmskb128 (t);
 402       found &= mask;
 403     }
 404   while (!found);
 405
 406   /* FOUND contains 1 in bits for which we matched a relevant
 407      character.  Conversion to the byte index is trivial.  */
 408   found = __builtin_ctz(found);
 409   return (const uchar *)p + found;
 410 }
 411
 412 #ifdef HAVE_SSE4
 413 /* A version of the fast scanner using SSE 4.2 vectorized string insns.  */
 414
 415 static const uchar *
 416 #ifndef __SSE4_2__
 417 __attribute__((__target__("sse4.2")))
 418 #endif
 419 search_line_sse42 (const uchar *s, const uchar *end)
 420 {
 421   typedef char v16qi __attribute__ ((__vector_size__ (16)));
 422   static const v16qi search = { '\n', '\r', '?', '\\' };
 423
 424   uintptr_t si = (uintptr_t)s;
 425   uintptr_t index;
 426
 427   /* Check for unaligned input.  */
 428   if (si & 15)
 429     {
 430       v16qi sv;
 431
 432       if (__builtin_expect (end - s < 16, 0)
 433           && __builtin_expect ((si & 0xfff) > 0xff0, 0))
 434         {
 435           /* There are less than 16 bytes left in the buffer, and less
 436              than 16 bytes left on the page.  Reading 16 bytes at this
 437              point might generate a spurious page fault.  Defer to the
 438              SSE2 implementation, which already handles alignment.  */
 439           return search_line_sse2 (s, end);
 440         }
 441
 442       /* ??? The builtin doesn't understand that the PCMPESTRI read from
 443          memory need not be aligned.  */
 444       sv = __builtin_ia32_loaddqu ((const char *) s);
 445       index = __builtin_ia32_pcmpestri128 (search, 4, sv, 16, 0);
 446
 447       if (__builtin_expect (index < 16, 0))
 448         goto found;
 449
 450       /* Advance the pointer to an aligned address.  We will re-scan a
 451          few bytes, but we no longer need care for reading past the
 452          end of a page, since we're guaranteed a match.  */
 453       s = (const uchar *)((si + 15) & -16);
 454     }
 455
 456   /* Main loop, processing 16 bytes at a time.  */
 457 #ifdef __GCC_ASM_FLAG_OUTPUTS__
 458   while (1)
 459     {
 460       char f;
 461
 462       /* By using inline assembly instead of the builtin,
 463          we can use the result, as well as the flags set.  */
 464       __asm ("%vpcmpestri\t$0, %2, %3"
 465              : "=c"(index), "=@ccc"(f)
 466              : "m"(*s), "x"(search), "a"(4), "d"(16));
 467       if (f)
 468         break;
 469
 470       s += 16;
 471     }
 472 #else
 473   s -= 16;
 474   /* By doing the whole loop in inline assembly,
 475      we can make proper use of the flags set.  */
 476   __asm (      ".balign 16\n"
 477         "0:     add $16, %1\n"
 478         "       %vpcmpestri\t$0, (%1), %2\n"
 479         "       jnc 0b"
 480         : "=&c"(index), "+r"(s)
 481         : "x"(search), "a"(4), "d"(16));
 482 #endif
 483
 484  found:
 485   return s + index;
 486 }
 487
 488 #else
 489 /* Work around out-dated assemblers without sse4 support.  */
 490 #define search_line_sse42 search_line_sse2
 491 #endif
 492
 493 /* Check the CPU capabilities.  */
 494
 495 #include "../gcc/config/i386/cpuid.h"
 496
 497 typedef const uchar * (*search_line_fast_type) (const uchar *, const uchar *);
 498 static search_line_fast_type search_line_fast;
 499
 500 #define HAVE_init_vectorized_lexer 1
 501 static inline void
 502 init_vectorized_lexer (void)
 503 {
 504   unsigned dummy, ecx = 0, edx = 0;
 505   search_line_fast_type impl = search_line_acc_char;
 506   int minimum = 0;
 507
 508 #if defined(__SSE4_2__)
 509   minimum = 3;
 510 #elif defined(__SSE2__)
 511   minimum = 2;
 512 #elif defined(__SSE__)
 513   minimum = 1;
 514 #endif
 515
 516   if (minimum == 3)
 517     impl = search_line_sse42;
 518   else if (__get_cpuid (1, &dummy, &dummy, &ecx, &edx) || minimum == 2)
 519     {
 520       if (minimum == 3 || (ecx & bit_SSE4_2))
 521         impl = search_line_sse42;
 522       else if (minimum == 2 || (edx & bit_SSE2))
 523         impl = search_line_sse2;
 524       else if (minimum == 1 || (edx & bit_SSE))
 525         impl = search_line_mmx;
 526     }
 527   else if (__get_cpuid (0x80000001, &dummy, &dummy, &dummy, &edx))
 528     {
 529       if (minimum == 1
 530           || (edx & (bit_MMXEXT | bit_CMOV)) == (bit_MMXEXT | bit_CMOV))
 531         impl = search_line_mmx;
 532     }
 533
 534   search_line_fast = impl;
 535 }
 536
 537 #elif (GCC_VERSION >= 4005) && defined(_ARCH_PWR8) && defined(__ALTIVEC__)
 538
 539 /* A vection of the fast scanner using AltiVec vectorized byte compares
 540    and VSX unaligned loads (when VSX is available).  This is otherwise
 541    the same as the AltiVec version.  */
 542
 543 ATTRIBUTE_NO_SANITIZE_UNDEFINED
 544 static const uchar *
 545 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 546 {
 547   typedef __attribute__((altivec(vector))) unsigned char vc;
 548
 549   const vc repl_nl = {
 550     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
 551     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
 552   };
 553   const vc repl_cr = {
 554     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
 555     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
 556   };
 557   const vc repl_bs = {
 558     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
 559     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
 560   };
 561   const vc repl_qm = {
 562     '?', '?', '?', '?', '?', '?', '?', '?',
 563     '?', '?', '?', '?', '?', '?', '?', '?',
 564   };
 565   const vc zero = { 0 };
 566
 567   vc data, t;
 568
 569   /* Main loop processing 16 bytes at a time.  */
 570   do
 571     {
 572       vc m_nl, m_cr, m_bs, m_qm;
 573
 574       data = __builtin_vec_vsx_ld (0, s);
 575       s += 16;
 576
 577       m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
 578       m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
 579       m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
 580       m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
 581       t = (m_nl | m_cr) | (m_bs | m_qm);
 582
 583       /* T now contains 0xff in bytes for which we matched one of the relevant
 584          characters.  We want to exit the loop if any byte in T is non-zero.
 585          Below is the expansion of vec_any_ne(t, zero).  */
 586     }
 587   while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));
 588
 589   /* Restore s to to point to the 16 bytes we just processed.  */
 590   s -= 16;
 591
 592   {
 593 #define N  (sizeof(vc) / sizeof(long))
 594
 595     union {
 596       vc v;
 597       /* Statically assert that N is 2 or 4.  */
 598       unsigned long l[(N == 2 || N == 4) ? N : -1];
 599     } u;
 600     unsigned long l, i = 0;
 601
 602     u.v = t;
 603
 604     /* Find the first word of T that is non-zero.  */
 605     switch (N)
 606       {
 607       case 4:
 608         l = u.l[i++];
 609         if (l != 0)
 610           break;
 611         s += sizeof(unsigned long);
 612         l = u.l[i++];
 613         if (l != 0)
 614           break;
 615         s += sizeof(unsigned long);
 616         /* FALLTHRU */
 617       case 2:
 618         l = u.l[i++];
 619         if (l != 0)
 620           break;
 621         s += sizeof(unsigned long);
 622         l = u.l[i];
 623       }
 624
 625     /* L now contains 0xff in bytes for which we matched one of the
 626        relevant characters.  We can find the byte index by finding
 627        its bit index and dividing by 8.  */
 628 #ifdef __BIG_ENDIAN__
 629     l = __builtin_clzl(l) >> 3;
 630 #else
 631     l = __builtin_ctzl(l) >> 3;
 632 #endif
 633     return s + l;
 634
 635 #undef N
 636   }
 637 }
 638
 639 #elif (GCC_VERSION >= 4005) && defined(__ALTIVEC__) && defined (__BIG_ENDIAN__)
 640
 641 /* A vection of the fast scanner using AltiVec vectorized byte compares.
 642    This cannot be used for little endian because vec_lvsl/lvsr are
 643    deprecated for little endian and the code won't work properly.  */
 644 /* ??? Unfortunately, attribute(target("altivec")) is not yet supported,
 645    so we can't compile this function without -maltivec on the command line
 646    (or implied by some other switch).  */
 647
 648 static const uchar *
 649 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 650 {
 651   typedef __attribute__((altivec(vector))) unsigned char vc;
 652
 653   const vc repl_nl = {
 654     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
 655     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
 656   };
 657   const vc repl_cr = {
 658     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
 659     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
 660   };
 661   const vc repl_bs = {
 662     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
 663     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
 664   };
 665   const vc repl_qm = {
 666     '?', '?', '?', '?', '?', '?', '?', '?',
 667     '?', '?', '?', '?', '?', '?', '?', '?',
 668   };
 669   const vc ones = {
 670     -1, -1, -1, -1, -1, -1, -1, -1,
 671     -1, -1, -1, -1, -1, -1, -1, -1,
 672   };
 673   const vc zero = { 0 };
 674
 675   vc data, mask, t;
 676
 677   /* Altivec loads automatically mask addresses with -16.  This lets us
 678      issue the first load as early as possible.  */
 679   data = __builtin_vec_ld(0, (const vc *)s);
 680
 681   /* Discard bytes before the beginning of the buffer.  Do this by
 682      beginning with all ones and shifting in zeros according to the
 683      mis-alignment.  The LVSR instruction pulls the exact shift we
 684      want from the address.  */
 685   mask = __builtin_vec_lvsr(0, s);
 686   mask = __builtin_vec_perm(zero, ones, mask);
 687   data &= mask;
 688
 689   /* While altivec loads mask addresses, we still need to align S so
 690      that the offset we compute at the end is correct.  */
 691   s = (const uchar *)((uintptr_t)s & -16);
 692
 693   /* Main loop processing 16 bytes at a time.  */
 694   goto start;
 695   do
 696     {
 697       vc m_nl, m_cr, m_bs, m_qm;
 698
 699       s += 16;
 700       data = __builtin_vec_ld(0, (const vc *)s);
 701
 702     start:
 703       m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
 704       m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
 705       m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
 706       m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
 707       t = (m_nl | m_cr) | (m_bs | m_qm);
 708
 709       /* T now contains 0xff in bytes for which we matched one of the relevant
 710          characters.  We want to exit the loop if any byte in T is non-zero.
 711          Below is the expansion of vec_any_ne(t, zero).  */
 712     }
 713   while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));
 714
 715   {
 716 #define N  (sizeof(vc) / sizeof(long))
 717
 718     union {
 719       vc v;
 720       /* Statically assert that N is 2 or 4.  */
 721       unsigned long l[(N == 2 || N == 4) ? N : -1];
 722     } u;
 723     unsigned long l, i = 0;
 724
 725     u.v = t;
 726
 727     /* Find the first word of T that is non-zero.  */
 728     switch (N)
 729       {
 730       case 4:
 731         l = u.l[i++];
 732         if (l != 0)
 733           break;
 734         s += sizeof(unsigned long);
 735         l = u.l[i++];
 736         if (l != 0)
 737           break;
 738         s += sizeof(unsigned long);
 739         /* FALLTHROUGH */
 740       case 2:
 741         l = u.l[i++];
 742         if (l != 0)
 743           break;
 744         s += sizeof(unsigned long);
 745         l = u.l[i];
 746       }
 747
 748     /* L now contains 0xff in bytes for which we matched one of the
 749        relevant characters.  We can find the byte index by finding
 750        its bit index and dividing by 8.  */
 751     l = __builtin_clzl(l) >> 3;
 752     return s + l;
 753
 754 #undef N
 755   }
 756 }
 757
 758 #elif defined (__ARM_NEON) && defined (__ARM_64BIT_STATE)
 759 #include "arm_neon.h"
 760
 761 /* This doesn't have to be the exact page size, but no system may use
 762    a size smaller than this.  ARMv8 requires a minimum page size of
 763    4k.  The impact of being conservative here is a small number of
 764    cases will take the slightly slower entry path into the main
 765    loop.  */
 766
 767 #define AARCH64_MIN_PAGE_SIZE 4096
 768
 769 static const uchar *
 770 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 771 {
 772   const uint8x16_t repl_nl = vdupq_n_u8 ('\n');
 773   const uint8x16_t repl_cr = vdupq_n_u8 ('\r');
 774   const uint8x16_t repl_bs = vdupq_n_u8 ('\\');
 775   const uint8x16_t repl_qm = vdupq_n_u8 ('?');
 776   const uint8x16_t xmask = (uint8x16_t) vdupq_n_u64 (0x8040201008040201ULL);
 777
 778 #ifdef __ARM_BIG_ENDIAN
 779   const int16x8_t shift = {8, 8, 8, 8, 0, 0, 0, 0};
 780 #else
 781   const int16x8_t shift = {0, 0, 0, 0, 8, 8, 8, 8};
 782 #endif
 783
 784   unsigned int found;
 785   const uint8_t *p;
 786   uint8x16_t data;
 787   uint8x16_t t;
 788   uint16x8_t m;
 789   uint8x16_t u, v, w;
 790
 791   /* Align the source pointer.  */
 792   p = (const uint8_t *)((uintptr_t)s & -16);
 793
 794   /* Assuming random string start positions, with a 4k page size we'll take
 795      the slow path about 0.37% of the time.  */
 796   if (__builtin_expect ((AARCH64_MIN_PAGE_SIZE
 797                          - (((uintptr_t) s) & (AARCH64_MIN_PAGE_SIZE - 1)))
 798                         < 16, 0))
 799     {
 800       /* Slow path: the string starts near a possible page boundary.  */
 801       uint32_t misalign, mask;
 802
 803       misalign = (uintptr_t)s & 15;
 804       mask = (-1u << misalign) & 0xffff;
 805       data = vld1q_u8 (p);
 806       t = vceqq_u8 (data, repl_nl);
 807       u = vceqq_u8 (data, repl_cr);
 808       v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
 809       w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
 810       t = vorrq_u8 (v, w);
 811       t = vandq_u8 (t, xmask);
 812       m = vpaddlq_u8 (t);
 813       m = vshlq_u16 (m, shift);
 814       found = vaddvq_u16 (m);
 815       found &= mask;
 816       if (found)
 817         return (const uchar*)p + __builtin_ctz (found);
 818     }
 819   else
 820     {
 821       data = vld1q_u8 ((const uint8_t *) s);
 822       t = vceqq_u8 (data, repl_nl);
 823       u = vceqq_u8 (data, repl_cr);
 824       v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
 825       w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
 826       t = vorrq_u8 (v, w);
 827       if (__builtin_expect (vpaddd_u64 ((uint64x2_t)t) != 0, 0))
 828         goto done;
 829     }
 830
 831   do
 832     {
 833       p += 16;
 834       data = vld1q_u8 (p);
 835       t = vceqq_u8 (data, repl_nl);
 836       u = vceqq_u8 (data, repl_cr);
 837       v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
 838       w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
 839       t = vorrq_u8 (v, w);
 840     } while (!vpaddd_u64 ((uint64x2_t)t));
 841
 842 done:
 843   /* Now that we've found the terminating substring, work out precisely where
 844      we need to stop.  */
 845   t = vandq_u8 (t, xmask);
 846   m = vpaddlq_u8 (t);
 847   m = vshlq_u16 (m, shift);
 848   found = vaddvq_u16 (m);
 849   return (((((uintptr_t) p) < (uintptr_t) s) ? s : (const uchar *)p)
 850           + __builtin_ctz (found));
 851 }
 852
 853 #elif defined (__ARM_NEON)
 854 #include "arm_neon.h"
 855
 /* ARM NEON (32-bit) line scanner.  Returns a pointer to the first byte
    at or after S that is '\n', '\r', '\\' or '?'.  END is not consulted:
    the loop below has no bound check and ends only when one of those
    bytes is found -- presumably callers guarantee a terminator inside
    the buffer (TODO confirm).  Loads are made from 16-byte-aligned
    addresses, so up to 15 bytes before S may be read; matches among
    them are masked off.  */
 856 static const uchar *
 857 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 858 {
       /* One vector per character of interest, every lane holding that
          character, for the byte-wise comparisons below.  */
 859   const uint8x16_t repl_nl = vdupq_n_u8 ('\n');
 860   const uint8x16_t repl_cr = vdupq_n_u8 ('\r');
 861   const uint8x16_t repl_bs = vdupq_n_u8 ('\\');
 862   const uint8x16_t repl_qm = vdupq_n_u8 ('?');
       /* Per-lane bit weights: the eight bytes of each half carry the
          eight distinct bits 0x01 .. 0x80 (lowest address gets 0x01 in
          little-endian lane order -- NOTE(review): confirm for
          big-endian configurations).  */
 863   const uint8x16_t xmask = (uint8x16_t) vdupq_n_u64 (0x8040201008040201ULL);
 864
 865   unsigned int misalign, found, mask;
 866   const uint8_t *p;
 867   uint8x16_t data;
 868
 869   /* Align the source pointer.  */
 870   misalign = (uintptr_t)s & 15;
 871   p = (const uint8_t *)((uintptr_t)s & -16);
 872   data = vld1q_u8 (p);
 873
 874   /* Create a mask for the bytes that are valid within the first
 875      16-byte block.  The Idea here is that the AND with the mask
 876      within the loop is "free", since we need some AND or TEST
 877      insn in order to set the flags for the branch anyway.  */
 878   mask = (-1u << misalign) & 0xffff;
 879
 880   /* Main loop, processing 16 bytes at a time.  */
       /* The first block is already loaded and has its own MASK, so jump
          past the load at the top of the loop body.  */
 881   goto start;
 882
 883   do
 884     {
 885       uint8x8_t l;
 886       uint16x4_t m;
 887       uint32x2_t n;
 888       uint8x16_t t, u, v, w;
 889
           /* Every block after the first is wholly at or after S, so all
              16 result bits are significant.  */
 890       p += 16;
 891       data = vld1q_u8 (p);
 892       mask = 0xffff;
 893
 894     start:
           /* Compare all 16 bytes against each of the four characters and
              OR the four comparison results together.  */
 895       t = vceqq_u8 (data, repl_nl);
 896       u = vceqq_u8 (data, repl_cr);
 897       v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
 898       w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
           /* Reduce each matching lane to its XMASK bit, then add the
              lanes pairwise until each 8-byte half has been summed into
              one 32-bit lane of N.  */
 899       t = vandq_u8 (vorrq_u8 (v, w), xmask);
 900       l = vpadd_u8 (vget_low_u8 (t), vget_high_u8 (t));
 901       m = vpaddl_u8 (l);
 902       n = vpaddl_u16 (m);
 903
           /* Viewing N as one 64-bit value, OR it with itself shifted
              right by 24 so that the upper lane's bits land at bits 8-15
              of lane 0; the bits above 15 are discarded by the AND with
              MASK, leaving one result bit per input byte.  */
 904       found = vget_lane_u32 ((uint32x2_t) vorr_u64 ((uint64x1_t) n,
 905               vshr_n_u64 ((uint64x1_t) n, 24)), 0);
 906       found &= mask;
 907     }
 908   while (!found);
 909
 910   /* FOUND contains 1 in bits for which we matched a relevant
 911      character.  Conversion to the byte index is trivial.  */
 912   found = __builtin_ctz (found);
 913   return (const uchar *)p + found;
 914 }
 915
 916 #else
 917
 918 /* We only have one accelerated alternative.  Use a direct call so that
 919    we encourage inlining.  */
 920
 921 #define search_line_fast  search_line_acc_char
 922
 923 #endif
 924
 925 /* Initialize the lexer if needed.  */
 926
 927 void
 928 _cpp_init_lexer (void)
 929 {
       /* Only configurations that define HAVE_init_vectorized_lexer have
          anything to set up; in every other build this body is empty.  */
 930 #ifdef HAVE_init_vectorized_lexer
 931   init_vectorized_lexer ();
 932 #endif
 933 }
 934
 935 /* Returns with a logical line that contains no escaped newlines or
 936    trigraphs.  This is a time-critical inner loop.

        The line begins at PFILE->buffer->next_line.  On return the
        logical line ends with a '\n' written into the buffer, the
        buffer's line notes describe every escaped newline and trigraph
        that was seen (followed by a sentinel note), and next_line points
        just past the input that was consumed.

        Unless the buffer is marked from_stage3, the scan starts on a
        read-only fast path and drops to a copying slow path only once a
        trigraph has to be replaced or an escaped newline spliced out.  */
 937 void
 938 _cpp_clean_line (cpp_reader *pfile)
 939 {
 940   cpp_buffer *buffer;
 941   const uchar *s;
 942   uchar c, *d, *p;
 943
       /* Start a fresh line: forget the previous line's notes and point
          CUR and LINE_BASE at the first byte of the new line.  */
 944   buffer = pfile->buffer;
 945   buffer->cur_note = buffer->notes_used = 0;
 946   buffer->cur = buffer->line_base = buffer->next_line;
 947   buffer->need_line = false;
 948   s = buffer->next_line;
 949
 950   if (!buffer->from_stage3)
 951     {
           /* Most recent backslash seen on the fast path, if any.  */
 952       const uchar *pbackslash = NULL;
 953
 954       /* Fast path.  This is the common case of an un-escaped line with
 955          no trigraphs.  The primary win here is by not writing any
 956          data back to memory until we have to.  */
 957       while (1)
 958         {
 959           /* Perform an optimized search for \n, \r, \\, ?.  */
 960           s = search_line_fast (s, buffer->rlimit);
 961
 962           c = *s;
 963           if (c == '\\')
 964             {
 965               /* Record the location of the backslash and continue.  */
 966               pbackslash = s++;
 967             }
 968           else if (__builtin_expect (c == '?', 0))
 969             {
 970               if (__builtin_expect (s[1] == '?', false)
 971                    && _cpp_trigraph_map[s[2]])
 972                 {
 973                   /* Have a trigraph.  We may or may not have to convert
 974                      it.  Add a line note regardless, for -Wtrigraphs.  */
 975                   add_line_note (buffer, s, s[2]);
 976                   if (CPP_OPTION (pfile, trigraphs))
 977                     {
 978                       /* We do, and that means we have to switch to the
 979                          slow path.  */
                           /* The replacement character overwrites the first
                              '?'; S is left on the trigraph's last byte so
                              that the slow path resumes with the byte after
                              it.  */
 980                       d = (uchar *) s;
 981                       *d = _cpp_trigraph_map[s[2]];
 982                       s += 2;
 983                       goto slow_path;
 984                     }
 985                 }
 986               /* Not a trigraph.  Continue on fast-path.  */
 987               s++;
 988             }
 989           else
 990             break;
 991         }
 992
 993       /* This must be \r or \n.  We're either done, or we'll be forced
 994          to write back to the buffer and continue on the slow path.  */
 995       d = (uchar *) s;
 996
 997       if (__builtin_expect (s == buffer->rlimit, false))
 998         goto done;
 999
1000       /* DOS line ending? */
1001       if (__builtin_expect (c == '\r', false) && s[1] == '\n')
1002         {
1003           s++;
1004           if (s == buffer->rlimit)
1005             goto done;
1006         }
1007
           /* No backslash anywhere on the line, so the newline cannot be
              escaped.  */
1008       if (__builtin_expect (pbackslash == NULL, true))
1009         goto done;
1010
1011       /* Check for escaped newline.  */
           /* Walk back from the line terminator over is_nvspace characters;
              the newline is escaped only if the character before them is
              the backslash recorded above.  */
1012       p = d;
1013       while (is_nvspace (p[-1]))
1014         p--;
1015       if (p - 1 != pbackslash)
1016         goto done;
1017
1018       /* Have an escaped newline; process it and proceed to
1019          the slow path.  */
           /* The note type is ' ' when whitespace separated the backslash
              from the newline and '\\' otherwise.  D is set to the byte
              before the backslash so that the slow path's first *++D
              overwrites the backslash.  NEXT_LINE temporarily marks the
              splice point: the slow path uses it as the lower bound of its
              backward scan, and it receives its final value at DONE.  */
1020       add_line_note (buffer, p - 1, p != d ? ' ' : '\\');
1021       d = p - 2;
1022       buffer->next_line = p - 1;
1023
1024     slow_path:
1025       while (1)
1026         {
               /* Copy the next input byte down to the write position.  */
1027           c = *++s;
1028           *++d = c;
1029
1030           if (c == '\n' || c == '\r')
1031             {
1032               /* Handle DOS line endings.  */
1033               if (c == '\r' && s != buffer->rlimit && s[1] == '\n')
1034                 s++;
1035               if (s == buffer->rlimit)
1036                 break;
1037
1038               /* Escaped?  */
                   /* Same backward scan as on the fast path, but over the
                      copied text and never reaching back past the previous
                      splice point held in NEXT_LINE.  */
1039               p = d;
1040               while (p != buffer->next_line && is_nvspace (p[-1]))
1041                 p--;
1042               if (p == buffer->next_line || p[-1] != '\\')
1043                 break;
1044
1045               add_line_note (buffer, p - 1, p != d ? ' ': '\\');
1046               d = p - 2;
1047               buffer->next_line = p - 1;
1048             }
1049           else if (c == '?' && s[1] == '?' && _cpp_trigraph_map[s[2]])
1050             {
1051               /* Add a note regardless, for the benefit of -Wtrigraphs.  */
1052               add_line_note (buffer, d, s[2]);
1053               if (CPP_OPTION (pfile, trigraphs))
1054                 {
                       /* Replace the '?' just copied and skip the other two
                          bytes of the trigraph.  */
1055                   *d = _cpp_trigraph_map[s[2]];
1056                   s += 2;
1057                 }
1058             }
1059         }
1060     }
1061   else
1062     {
           /* from_stage3 buffers get no trigraph or escaped-newline
              processing here; only the end of the line is located.  */
1063       while (*s != '\n' && *s != '\r')
1064         s++;
1065       d = (uchar *) s;
1066
1067       /* Handle DOS line endings.  */
1068       if (*s == '\r' && s + 1 != buffer->rlimit && s[1] == '\n')
1069         s++;
1070     }
1071
1072  done:
       /* D is where the logical line ends and S is the last input byte
          consumed for it.  */
1073   *d = '\n';
1074   /* A sentinel note that should never be processed.  */
1075   add_line_note (buffer, d + 1, '\n');
1076   buffer->next_line = s + 1;
1077 }
1078
1079 /* Return true if the trigraph indicated by NOTE should be warned
1080    about in a comment.  */
1081 static bool
1082 warn_in_comment (cpp_reader *pfile, _cpp_line_note *note)
1083 {
1084   const uchar *p;
1085
1086   /* Within comments we don't warn about trigraphs, unless the
1087      trigraph forms an escaped newline, as that may change
1088      behavior.  */
1089   if (note->type != '/')
1090     return false;
1091
1092   /* If -trigraphs, then this was an escaped newline iff the next note
1093      is coincident.  */
1094   if (CPP_OPTION (pfile, trigraphs))
1095     return note[1].pos == note->pos;
1096
1097   /* Otherwise, see if this forms an escaped newline.  */
1098   p = note->pos + 3;
1099   while (is_nvspace (*p))
1100     p++;
1101
1102   /* There might have been escaped newlines between the trigraph and the
1103      newline we found.  Hence the position test.  */
1104   return (*p == '\n' && p < note[1].pos);
1105 }
1106
1107 /* Process the notes created by add_line_note as far as the current
1108    location.  */
1109 void
1110 _cpp_process_line_notes (cpp_reader *pfile, int in_comment)
1111 {
1112   cpp_buffer *buffer = pfile->buffer;
1113
1114   for (;;)
1115     {
1116       _cpp_line_note *note = &buffer->notes[buffer->cur_note];
1117       unsigned int col;
1118
1119       if (note->pos > buffer->cur)
1120         break;
1121
1122       buffer->cur_note++;
1123       col = CPP_BUF_COLUMN (buffer, note->pos + 1);
1124
1125       if (note->type == '\\' || note->type == ' ')
1126         {
1127           if (note->type == ' ' && !in_comment)
1128             cpp_error_with_line (pfile, CPP_DL_WARNING, pfile->line_table->highest_line, col,
1129                                  "backslash and newline separated by space");
1130
1131           if (buffer->next_line > buffer->rlimit)
1132             {
1133               cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line, col,
1134                                    "backslash-newline at end of file");
1135               /* Prevent "no newline at end of file" warning.  */
1136               buffer->next_line = buffer->rlimit;
1137             }
1138
1139           buffer->line_base = note->pos;
1140           CPP_INCREMENT_LINE (pfile, 0);
1141         }
1142       else if (_cpp_trigraph_map[note->type])
1143         {
1144           if (CPP_OPTION (pfile, warn_trigraphs)
1145               && (!in_comment || warn_in_comment (pfile, note)))
1146             {
1147               if (CPP_OPTION (pfile, trigraphs))
1148                 cpp_warning_with_line (pfile, CPP_W_TRIGRAPHS,
1149                                        pfile->line_table->highest_line, col,
1150                                        "trigraph ??%c converted to %c",
1151                                        note->type,
1152                                        (int) _cpp_trigraph_map[note->type]);
1153               else
1154                 {
1155                   cpp_warning_with_line
1156                     (pfile, CPP_W_TRIGRAPHS,
1157                      pfile->line_table->highest_line, col,
1158                      "trigraph ??%c ignored, use -trigraphs to enable",
1159                      note->type);
1160                 }
1161             }
1162         }
1163       else if (note->type == 0)
1164         /* Already processed in lex_raw_string.  */;
1165       else
1166         abort ();
1167     }
1168 }
1169
1170 namespace bidi {
1171   enum class kind {
1172     NONE, LRE, RLE, LRO, RLO, LRI, RLI, FSI, PDF, PDI, LTR, RTL
1173   };
1174
1175   /* All the UTF-8 encodings of bidi characters start with E2.  */
1176   constexpr uchar utf8_start = 0xe2;
1177
1178   struct context
1179   {
1180     context () {}
1181     context (location_t loc, kind k, bool pdf, bool ucn)
1182     : m_loc (loc), m_kind (k), m_pdf (pdf), m_ucn (ucn)
1183     {
1184     }
1185
1186     kind get_pop_kind () const
1187     {
1188       return m_pdf ? kind::PDF : kind::PDI;
1189     }
1190     bool ucn_p () const
1191     {
1192       return m_ucn;
1193     }
1194
1195     location_t m_loc;
1196     kind m_kind;
1197     unsigned m_pdf : 1;
1198     unsigned m_ucn : 1;
1199   };
1200
1201   /* A vector holding currently open bidi contexts.  We use a char for
1202      each context, its LSB is 1 if it represents a PDF context, 0 if it
1203      represents a PDI context.  The next bit is 1 if this context was open
1204      by a bidi character written as a UCN, and 0 when it was UTF-8.  */
1205   semi_embedded_vec <context, 16> vec;
1206
1207   /* Close the whole comment/identifier/string literal/character constant
1208      context.  */
1209   void on_close ()
1210   {
1211     vec.truncate (0);
1212   }
1213
1214   /* Pop the last element in the vector.  */
1215   void pop ()
1216   {
1217     unsigned int len = vec.count ();
1218     gcc_checking_assert (len > 0);
1219     vec.truncate (len - 1);
1220   }
1221
1222   /* Return the pop kind of the context of the Ith element.  */
1223   kind pop_kind_at (unsigned int i)
1224   {
1225     return vec[i].get_pop_kind ();
1226   }
1227
1228   /* Return the pop kind of the context that is currently opened.  */
1229   kind current_ctx ()
1230   {
1231     unsigned int len = vec.count ();
1232     if (len == 0)
1233       return kind::NONE;
1234     return vec[len - 1].get_pop_kind ();
1235   }
1236
1237   /* Return true if the current context comes from a UCN origin, that is,
1238      the bidi char which started this bidi context was written as a UCN.  */
1239   bool current_ctx_ucn_p ()
1240   {
1241     unsigned int len = vec.count ();
1242     gcc_checking_assert (len > 0);
1243     return vec[len - 1].m_ucn;
1244   }
1245
1246   location_t current_ctx_loc ()
1247   {
1248     unsigned int len = vec.count ();
1249     gcc_checking_assert (len > 0);
1250     return vec[len - 1].m_loc;
1251   }
1252
1253   /* We've read a bidi char, update the current vector as necessary.
1254      LOC is only valid when K is not kind::NONE.  */
1255   void on_char (kind k, bool ucn_p, location_t loc)
1256   {
1257     switch (k)
1258       {
1259       case kind::LRE:
1260       case kind::RLE:
1261       case kind::LRO:
1262       case kind::RLO:
1263         vec.push (context (loc, k, true, ucn_p));
1264         break;
1265       case kind::LRI:
1266       case kind::RLI:
1267       case kind::FSI:
1268         vec.push (context (loc, k, false, ucn_p));
1269         break;
1270       /* PDF terminates the scope of the last LRE, RLE, LRO, or RLO
1271          whose scope has not yet been terminated.  */
1272       case kind::PDF:
1273         if (current_ctx () == kind::PDF)
1274           pop ();
1275         break;
1276       /* PDI terminates the scope of the last LRI, RLI, or FSI whose
1277          scope has not yet been terminated, as well as the scopes of
1278          any subsequent LREs, RLEs, LROs, or RLOs whose scopes have not
1279          yet been terminated.  */
1280       case kind::PDI:
1281         for (int i = vec.count () - 1; i >= 0; --i)
1282           if (pop_kind_at (i) == kind::PDI)
1283             {
1284               vec.truncate (i);
1285               break;
1286             }
1287         break;
1288       case kind::LTR:
1289       case kind::RTL:
1290         /* These aren't popped by a PDF/PDI.  */
1291         break;
1292       ATTR_LIKELY case kind::NONE:
1293         break;
1294       default:
1295         abort ();
1296       }
1297   }
1298
1299   /* Return a descriptive string for K.  */
1300   const char *to_str (kind k)
1301   {
1302     switch (k)
1303       {
1304       case kind::LRE:
1305         return "U+202A (LEFT-TO-RIGHT EMBEDDING)";
1306       case kind::RLE:
1307         return "U+202B (RIGHT-TO-LEFT EMBEDDING)";
1308       case kind::LRO:
1309         return "U+202D (LEFT-TO-RIGHT OVERRIDE)";
1310       case kind::RLO:
1311         return "U+202E (RIGHT-TO-LEFT OVERRIDE)";
1312       case kind::LRI:
1313         return "U+2066 (LEFT-TO-RIGHT ISOLATE)";
1314       case kind::RLI:
1315         return "U+2067 (RIGHT-TO-LEFT ISOLATE)";
1316       case kind::FSI:
1317         return "U+2068 (FIRST STRONG ISOLATE)";
1318       case kind::PDF:
1319         return "U+202C (POP DIRECTIONAL FORMATTING)";
1320       case kind::PDI:
1321         return "U+2069 (POP DIRECTIONAL ISOLATE)";
1322       case kind::LTR:
1323         return "U+200E (LEFT-TO-RIGHT MARK)";
1324       case kind::RTL:
1325         return "U+200F (RIGHT-TO-LEFT MARK)";
1326       default:
1327         abort ();
1328       }
1329   }
1330 }
1331
1332 /* Get location_t for the range of bytes [START, START + NUM_BYTES)
1333    within the current line in FILE, with the caret at START.  */
1334
1335 static location_t
1336 get_location_for_byte_range_in_cur_line (cpp_reader *pfile,
1337                                          const unsigned char *const start,
1338                                          size_t num_bytes)
1339 {
1340   gcc_checking_assert (num_bytes > 0);
1341
1342   /* CPP_BUF_COLUMN and linemap_position_for_column both refer
1343      to offsets in bytes, but CPP_BUF_COLUMN is 0-based,
1344      whereas linemap_position_for_column is 1-based.  */
1345
1346   /* Get 0-based offsets within the line.  */
1347   size_t start_offset = CPP_BUF_COLUMN (pfile->buffer, start);
1348   size_t end_offset = start_offset + num_bytes - 1;
1349
1350   /* Now convert to location_t, where "columns" are 1-based byte offsets.  */
1351   location_t start_loc = linemap_position_for_column (pfile->line_table,
1352                                                       start_offset + 1);
1353   location_t end_loc = linemap_position_for_column (pfile->line_table,
1354                                                      end_offset + 1);
1355
1356   if (start_loc == end_loc)
1357     return start_loc;
1358
1359   source_range src_range;
1360   src_range.m_start = start_loc;
1361   src_range.m_finish = end_loc;
1362   location_t combined_loc = COMBINE_LOCATION_DATA (pfile->line_table,
1363                                                    start_loc,
1364                                                    src_range,
1365                                                    NULL);
1366   return combined_loc;
1367 }
1368
1369 /* Parse a sequence of 3 bytes starting with P and return its bidi code.  */
1370
1371 static bidi::kind
1372 get_bidi_utf8_1 (const unsigned char *const p)
1373 {
1374   gcc_checking_assert (p[0] == bidi::utf8_start);
1375
1376   if (p[1] == 0x80)
1377     switch (p[2])
1378       {
1379       case 0xaa:
1380         return bidi::kind::LRE;
1381       case 0xab:
1382         return bidi::kind::RLE;
1383       case 0xac:
1384         return bidi::kind::PDF;
1385       case 0xad:
1386         return bidi::kind::LRO;
1387       case 0xae:
1388         return bidi::kind::RLO;
1389       case 0x8e:
1390         return bidi::kind::LTR;
1391       case 0x8f:
1392         return bidi::kind::RTL;
1393       default:
1394         break;
1395       }
1396   else if (p[1] == 0x81)
1397     switch (p[2])
1398       {
1399       case 0xa6:
1400         return bidi::kind::LRI;
1401       case 0xa7:
1402         return bidi::kind::RLI;
1403       case 0xa8:
1404         return bidi::kind::FSI;
1405       case 0xa9:
1406         return bidi::kind::PDI;
1407       default:
1408         break;
1409       }
1410
1411   return bidi::kind::NONE;
1412 }
1413
1414 /* Parse a sequence of 3 bytes starting with P and return its bidi code.
1415    If the kind is not NONE, write the location to *OUT.*/
1416
1417 static bidi::kind
1418 get_bidi_utf8 (cpp_reader *pfile, const unsigned char *const p, location_t *out)
1419 {
1420   bidi::kind result = get_bidi_utf8_1 (p);
1421   if (result != bidi::kind::NONE)
1422     {
1423       /* We have a sequence of 3 bytes starting at P.  */
1424       *out = get_location_for_byte_range_in_cur_line (pfile, p, 3);
1425     }
1426   return result;
1427 }
1428
1429 /* Parse a UCN where P points just past \u or \U and return its bidi code.  */
1430
1431 static bidi::kind
1432 get_bidi_ucn_1 (const unsigned char *p, bool is_U, const unsigned char **end)
1433 {
1434   /* 6.4.3 Universal Character Names
1435       \u hex-quad
1436       \U hex-quad hex-quad
1437       \u { simple-hexadecimal-digit-sequence }
1438      where \unnnn means \U0000nnnn.  */
1439
1440   *end = p + 4;
1441   if (is_U)
1442     {
1443       if (p[0] != '0' || p[1] != '0' || p[2] != '0' || p[3] != '0')
1444         return bidi::kind::NONE;
1445       /* Skip 4B so we can treat \u and \U the same below.  */
1446       p += 4;
1447       *end += 4;
1448     }
1449   else if (p[0] == '{')
1450     {
1451       p++;
1452       while (*p == '0')
1453         p++;
1454       if (p[0] != '2'
1455           || p[1] != '0'
1456           || !ISXDIGIT (p[2])
1457           || !ISXDIGIT (p[3])
1458           || p[4] != '}')
1459         return bidi::kind::NONE;
1460       *end = p + 5;
1461     }
1462
1463   /* All code points we are looking for start with 20xx.  */
1464   if (p[0] != '2' || p[1] != '0')
1465     return bidi::kind::NONE;
1466   else if (p[2] == '2')
1467     switch (p[3])
1468       {
1469       case 'a':
1470       case 'A':
1471         return bidi::kind::LRE;
1472       case 'b':
1473       case 'B':
1474         return bidi::kind::RLE;
1475       case 'c':
1476       case 'C':
1477         return bidi::kind::PDF;
1478       case 'd':
1479       case 'D':
1480         return bidi::kind::LRO;
1481       case 'e':
1482       case 'E':
1483         return bidi::kind::RLO;
1484       default:
1485         break;
1486       }
1487   else if (p[2] == '6')
1488     switch (p[3])
1489       {
1490       case '6':
1491         return bidi::kind::LRI;
1492       case '7':
1493         return bidi::kind::RLI;
1494       case '8':
1495         return bidi::kind::FSI;
1496       case '9':
1497         return bidi::kind::PDI;
1498       default:
1499         break;
1500       }
1501   else if (p[2] == '0')
1502     switch (p[3])
1503       {
1504       case 'e':
1505       case 'E':
1506         return bidi::kind::LTR;
1507       case 'f':
1508       case 'F':
1509         return bidi::kind::RTL;
1510       default:
1511         break;
1512       }
1513
1514   return bidi::kind::NONE;
1515 }
1516
1517 /* Parse a UCN where P points just past \u or \U and return its bidi code.
1518    If the kind is not NONE, write the location to *OUT.  */
1519
1520 static bidi::kind
1521 get_bidi_ucn (cpp_reader *pfile, const unsigned char *p, bool is_U,
1522               location_t *out)
1523 {
1524   const unsigned char *end;
1525   bidi::kind result = get_bidi_ucn_1 (p, is_U, &end);
1526   if (result != bidi::kind::NONE)
1527     {
1528       const unsigned char *start = p - 2;
1529       size_t num_bytes = end - start;
1530       *out = get_location_for_byte_range_in_cur_line (pfile, start, num_bytes);
1531     }
1532   return result;
1533 }
1534
1535 /* Parse a named universal character escape where P points just past \N and
1536    return its bidi code.  If the kind is not NONE, write the location to
1537    *OUT.  */
1538
1539 static bidi::kind
1540 get_bidi_named (cpp_reader *pfile, const unsigned char *p, location_t *out)
1541 {
1542   bidi::kind result = bidi::kind::NONE;
1543   if (*p != '{')
1544     return bidi::kind::NONE;
1545   if (strncmp ((const char *) (p + 1), "LEFT-TO-RIGHT ", 14) == 0)
1546     {
1547       if (strncmp ((const char *) (p + 15), "MARK}", 5) == 0)
1548         result = bidi::kind::LTR;
1549       else if (strncmp ((const char *) (p + 15), "EMBEDDING}", 10) == 0)
1550         result = bidi::kind::LRE;
1551       else if (strncmp ((const char *) (p + 15), "OVERRIDE}", 9) == 0)
1552         result = bidi::kind::LRO;
1553       else if (strncmp ((const char *) (p + 15), "ISOLATE}", 8) == 0)
1554         result = bidi::kind::LRI;
1555     }
1556   else if (strncmp ((const char *) (p + 1), "RIGHT-TO-LEFT ", 14) == 0)
1557     {
1558       if (strncmp ((const char *) (p + 15), "MARK}", 5) == 0)
1559         result = bidi::kind::RTL;
1560       else if (strncmp ((const char *) (p + 15), "EMBEDDING}", 10) == 0)
1561         result = bidi::kind::RLE;
1562       else if (strncmp ((const char *) (p + 15), "OVERRIDE}", 9) == 0)
1563         result = bidi::kind::RLO;
1564       else if (strncmp ((const char *) (p + 15), "ISOLATE}", 8) == 0)
1565         result = bidi::kind::RLI;
1566     }
1567   else if (strncmp ((const char *) (p + 1), "POP DIRECTIONAL ", 16) == 0)
1568     {
1569       if (strncmp ((const char *) (p + 16), "FORMATTING}", 11) == 0)
1570         result = bidi::kind::PDF;
1571       else if (strncmp ((const char *) (p + 16), "ISOLATE}", 8) == 0)
1572         result = bidi::kind::PDI;
1573     }
1574   else if (strncmp ((const char *) (p + 1), "FIRST STRONG ISOLATE}", 21) == 0)
1575     result = bidi::kind::FSI;
1576   if (result != bidi::kind::NONE)
1577     *out = get_location_for_byte_range_in_cur_line (pfile, p - 2,
1578                                                     (strchr ((const char *)
1579                                                              (p + 1), '}')
1580                                                      - (const char *) p)
1581                                                     + 3);
1582   return result;
1583 }
1584
1585 /* Subclass of rich_location for reporting on unpaired UTF-8
1586    bidirectional control character(s).
1587    Escape the source lines on output, and show all unclosed
1588    bidi context, labelling everything.  */
1589
1590 class unpaired_bidi_rich_location : public rich_location
1591 {
1592  public:
1593   class custom_range_label : public range_label
1594   {
1595    public:
1596      label_text get_text (unsigned range_idx) const final override
1597      {
1598        /* range 0 is the primary location; each subsequent range i + 1
1599           is for bidi::vec[i].  */
1600        if (range_idx > 0)
1601          {
1602            const bidi::context &ctxt (bidi::vec[range_idx - 1]);
1603            return label_text::borrow (bidi::to_str (ctxt.m_kind));
1604          }
1605        else
1606          return label_text::borrow (_("end of bidirectional context"));
1607      }
1608   };
1609
1610   unpaired_bidi_rich_location (cpp_reader *pfile, location_t loc)
1611   : rich_location (pfile->line_table, loc, &m_custom_label)
1612   {
1613     set_escape_on_output (true);
1614     for (unsigned i = 0; i < bidi::vec.count (); i++)
1615       add_range (bidi::vec[i].m_loc,
1616                  SHOW_RANGE_WITHOUT_CARET,
1617                  &m_custom_label);
1618   }
1619
1620  private:
1621    custom_range_label m_custom_label;
1622 };
1623
1624 /* We're closing a bidi context, that is, we've encountered a newline,
1625    are closing a C-style comment, or are at the end of a string literal,
1626    character constant, or identifier.  Warn if this context was not
1627    properly terminated by a PDI or PDF.  P points to the last character
1628    in this context.  */
1629
1630 static void
1631 maybe_warn_bidi_on_close (cpp_reader *pfile, const uchar *p)
1632 {
1633   const auto warn_bidi = CPP_OPTION (pfile, cpp_warn_bidirectional);
1634   if (bidi::vec.count () > 0
1635       && (warn_bidi & bidirectional_unpaired
1636           && (!bidi::current_ctx_ucn_p ()
1637               || (warn_bidi & bidirectional_ucn))))
1638     {
1639       const location_t loc
1640         = linemap_position_for_column (pfile->line_table,
1641                                        CPP_BUF_COLUMN (pfile->buffer, p));
1642       unpaired_bidi_rich_location rich_loc (pfile, loc);
1643       /* cpp_callbacks doesn't yet have a way to handle singular vs plural
1644          forms of a diagnostic, so fake it for now.  */
1645       if (bidi::vec.count () > 1)
1646         cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
1647                         "unpaired UTF-8 bidirectional control characters "
1648                         "detected");
1649       else
1650         cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
1651                         "unpaired UTF-8 bidirectional control character "
1652                         "detected");
1653     }
1654   /* We're done with this context.  */
1655   bidi::on_close ();
1656 }
1657
1658 /* We're at the beginning or in the middle of an identifier/comment/string
1659    literal/character constant.  Warn if we've encountered a bidi character.
1660    KIND says which bidi control character it was; UCN_P is true iff this bidi
1661    control character was written as a UCN.  LOC is the location of the
1662    character, but is only valid if KIND != bidi::kind::NONE.  */
1663
1664 static void
1665 maybe_warn_bidi_on_char (cpp_reader *pfile, bidi::kind kind,
1666                          bool ucn_p, location_t loc)
1667 {
1668   if (__builtin_expect (kind == bidi::kind::NONE, 1))
1669     return;
1670
1671   const auto warn_bidi = CPP_OPTION (pfile, cpp_warn_bidirectional);
1672
1673   if (warn_bidi & (bidirectional_unpaired|bidirectional_any))
1674     {
1675       rich_location rich_loc (pfile->line_table, loc);
1676       rich_loc.set_escape_on_output (true);
1677
1678       /* It seems excessive to warn about a PDI/PDF that is closing
1679          an opened context because we've already warned about the
1680          opening character.  Except warn when we have a UCN x UTF-8
1681          mismatch, if UCN checking is enabled.  */
1682       if (kind == bidi::current_ctx ())
1683         {
1684           if (warn_bidi == (bidirectional_unpaired|bidirectional_ucn)
1685               && bidi::current_ctx_ucn_p () != ucn_p)
1686             {
1687               rich_loc.add_range (bidi::current_ctx_loc ());
1688               cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
1689                               "UTF-8 vs UCN mismatch when closing "
1690                               "a context by \"%s\"", bidi::to_str (kind));
1691             }
1692         }
1693       else if (warn_bidi & bidirectional_any
1694                && (!ucn_p || (warn_bidi & bidirectional_ucn)))
1695         {
1696           if (kind == bidi::kind::PDF || kind == bidi::kind::PDI)
1697             cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
1698                             "\"%s\" is closing an unopened context",
1699                             bidi::to_str (kind));
1700           else
1701             cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
1702                             "found problematic Unicode character \"%s\"",
1703                             bidi::to_str (kind));
1704         }
1705     }
1706   /* We're done with this context.  */
1707   bidi::on_char (kind, ucn_p, loc);
1708 }
1709
/* Thresholds used when checking UTF-8 in this file: a byte B is accepted
   as a continuation byte when utf8_continuation <= B < utf8_signifier,
   and only bytes >= utf8_signifier are accepted as the first byte of a
   multibyte sequence.  */
1710 static const cppchar_t utf8_continuation = 0x80;
1711 static const cppchar_t utf8_signifier = 0xC0;
1712
1713 /* Emit -Winvalid-utf8 warning on invalid UTF-8 character starting
1714    at PFILE->buffer->cur.  Return a pointer after the diagnosed
1715    invalid character.  */
1716
1717 static const uchar *
1718 _cpp_warn_invalid_utf8 (cpp_reader *pfile)
1719 {
1720   cpp_buffer *buffer = pfile->buffer;
1721   const uchar *cur = buffer->cur;
1722   bool pedantic = (CPP_PEDANTIC (pfile)
1723                    && CPP_OPTION (pfile, cpp_warn_invalid_utf8) == 2);
1724
1725   if (cur[0] < utf8_signifier
1726       || cur[1] < utf8_continuation || cur[1] >= utf8_signifier)
1727     {
1728       if (pedantic)
1729         cpp_error_with_line (pfile, CPP_DL_PEDWARN,
1730                              pfile->line_table->highest_line,
1731                              CPP_BUF_COL (buffer),
1732                              "invalid UTF-8 character <%x>",
1733                              cur[0]);
1734       else
1735         cpp_warning_with_line (pfile, CPP_W_INVALID_UTF8,
1736                                pfile->line_table->highest_line,
1737                                CPP_BUF_COL (buffer),
1738                                "invalid UTF-8 character <%x>",
1739                                cur[0]);
1740       return cur + 1;
1741     }
1742   else if (cur[2] < utf8_continuation || cur[2] >= utf8_signifier)
1743     {
1744       if (pedantic)
1745         cpp_error_with_line (pfile, CPP_DL_PEDWARN,
1746                              pfile->line_table->highest_line,
1747                              CPP_BUF_COL (buffer),
1748                              "invalid UTF-8 character <%x><%x>",
1749                              cur[0], cur[1]);
1750       else
1751         cpp_warning_with_line (pfile, CPP_W_INVALID_UTF8,
1752                                pfile->line_table->highest_line,
1753                                CPP_BUF_COL (buffer),
1754                                "invalid UTF-8 character <%x><%x>",
1755                                cur[0], cur[1]);
1756       return cur + 2;
1757     }
1758   else if (cur[3] < utf8_continuation || cur[3] >= utf8_signifier)
1759     {
1760       if (pedantic)
1761         cpp_error_with_line (pfile, CPP_DL_PEDWARN,
1762                              pfile->line_table->highest_line,
1763                              CPP_BUF_COL (buffer),
1764                              "invalid UTF-8 character <%x><%x><%x>",
1765                              cur[0], cur[1], cur[2]);
1766       else
1767         cpp_warning_with_line (pfile, CPP_W_INVALID_UTF8,
1768                                pfile->line_table->highest_line,
1769                                CPP_BUF_COL (buffer),
1770                                "invalid UTF-8 character <%x><%x><%x>",
1771                                cur[0], cur[1], cur[2]);
1772       return cur + 3;
1773     }
1774   else
1775     {
1776       if (pedantic)
1777         cpp_error_with_line (pfile, CPP_DL_PEDWARN,
1778                              pfile->line_table->highest_line,
1779                              CPP_BUF_COL (buffer),
1780                              "invalid UTF-8 character <%x><%x><%x><%x>",
1781                              cur[0], cur[1], cur[2], cur[3]);
1782       else
1783         cpp_warning_with_line (pfile, CPP_W_INVALID_UTF8,
1784                                pfile->line_table->highest_line,
1785                                CPP_BUF_COL (buffer),
1786                                "invalid UTF-8 character <%x><%x><%x><%x>",
1787                                cur[0], cur[1], cur[2], cur[3]);
1788       return cur + 4;
1789     }
1790 }
1791
1792 /* Helper function of *skip_*_comment and lex*_string.  For C,
1793    character at CUR[-1] with MSB set handle -Wbidi-chars* and
1794    -Winvalid-utf8 diagnostics and return pointer to first character
1795    that should be processed next.  */
1796
1797 static inline const uchar *
1798 _cpp_handle_multibyte_utf8 (cpp_reader *pfile, uchar c,
1799                             const uchar *cur, bool warn_bidi_p,
1800                             bool warn_invalid_utf8_p)
1801 {
1802   /* If this is a beginning of a UTF-8 encoding, it might be
1803      a bidirectional control character.  */
1804   if (c == bidi::utf8_start && warn_bidi_p)
1805     {
1806       location_t loc;
1807       bidi::kind kind = get_bidi_utf8 (pfile, cur - 1, &loc);
1808       maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
1809     }
1810   if (!warn_invalid_utf8_p)
1811     return cur;
1812   if (c >= utf8_signifier)
1813     {
1814       cppchar_t s;
1815       const uchar *pstr = cur - 1;
1816       if (_cpp_valid_utf8 (pfile, &pstr, pfile->buffer->rlimit, 0, NULL, &s)
1817           && s <= UCS_LIMIT)
1818         return pstr;
1819     }
1820   pfile->buffer->cur = cur - 1;
1821   return _cpp_warn_invalid_utf8 (pfile);
1822 }
1823
1824 /* Skip a C-style block comment.  We find the end of the comment by
1825    seeing if an asterisk is before every '/' we encounter.  Returns
1826    nonzero if comment terminated by EOF, zero otherwise.
1827
1828    Buffer->cur points to the initial asterisk of the comment.  */
1829 bool
1830 _cpp_skip_block_comment (cpp_reader *pfile)
1831 {
1832   cpp_buffer *buffer = pfile->buffer;
1833   const uchar *cur = buffer->cur;
1834   uchar c;
1835   const bool warn_bidi_p = pfile->warn_bidi_p ();
1836   const bool warn_invalid_utf8_p = CPP_OPTION (pfile, cpp_warn_invalid_utf8);
1837   const bool warn_bidi_or_invalid_utf8_p = warn_bidi_p | warn_invalid_utf8_p;
1838
1839   cur++;
1840   if (*cur == '/')
1841     cur++;
1842
1843   for (;;)
1844     {
1845       /* People like decorating comments with '*', so check for '/'
1846          instead for efficiency.  */
1847       c = *cur++;
1848
1849       if (c == '/')
1850         {
1851           if (cur[-2] == '*')
1852             {
1853               if (warn_bidi_p)
1854                 maybe_warn_bidi_on_close (pfile, cur);
1855               break;
1856             }
1857
1858           /* Warn about potential nested comments, but not if the '/'
1859              comes immediately before the true comment delimiter.
1860              Don't bother to get it right across escaped newlines.  */
1861           if (CPP_OPTION (pfile, warn_comments)
1862               && cur[0] == '*' && cur[1] != '/')
1863             {
1864               buffer->cur = cur;
1865               cpp_warning_with_line (pfile, CPP_W_COMMENTS,
1866                                      pfile->line_table->highest_line,
1867                                      CPP_BUF_COL (buffer),
1868                                      "\"/*\" within comment");
1869             }
1870         }
1871       else if (c == '\n')
1872         {
1873           unsigned int cols;
1874           buffer->cur = cur - 1;
1875           if (warn_bidi_p)
1876             maybe_warn_bidi_on_close (pfile, cur);
1877           _cpp_process_line_notes (pfile, true);
1878           if (buffer->next_line >= buffer->rlimit)
1879             return true;
1880           _cpp_clean_line (pfile);
1881
1882           cols = buffer->next_line - buffer->line_base;
1883           CPP_INCREMENT_LINE (pfile, cols);
1884
1885           cur = buffer->cur;
1886         }
1887       else if (__builtin_expect (c >= utf8_continuation, 0)
1888                && warn_bidi_or_invalid_utf8_p)
1889         cur = _cpp_handle_multibyte_utf8 (pfile, c, cur, warn_bidi_p,
1890                                           warn_invalid_utf8_p);
1891     }
1892
1893   buffer->cur = cur;
1894   _cpp_process_line_notes (pfile, true);
1895   return false;
1896 }
1897
1898 /* Skip a C++ line comment, leaving buffer->cur pointing to the
1899    terminating newline.  Handles escaped newlines.  Returns nonzero
1900    if a multiline comment.  */
1901 static int
1902 skip_line_comment (cpp_reader *pfile)
1903 {
1904   cpp_buffer *buffer = pfile->buffer;
1905   location_t orig_line = pfile->line_table->highest_line;
1906   const bool warn_bidi_p = pfile->warn_bidi_p ();
1907   const bool warn_invalid_utf8_p = CPP_OPTION (pfile, cpp_warn_invalid_utf8);
1908   const bool warn_bidi_or_invalid_utf8_p = warn_bidi_p | warn_invalid_utf8_p;
1909
1910   if (!warn_bidi_or_invalid_utf8_p)
1911     while (*buffer->cur != '\n')
1912       buffer->cur++;
1913   else if (!warn_invalid_utf8_p)
1914     {
1915       while (*buffer->cur != '\n'
1916              && *buffer->cur != bidi::utf8_start)
1917         buffer->cur++;
1918       if (__builtin_expect (*buffer->cur == bidi::utf8_start, 0))
1919         {
1920           while (*buffer->cur != '\n')
1921             {
1922               if (__builtin_expect (*buffer->cur == bidi::utf8_start, 0))
1923                 {
1924                   location_t loc;
1925                   bidi::kind kind = get_bidi_utf8 (pfile, buffer->cur, &loc);
1926                   maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
1927                 }
1928               buffer->cur++;
1929             }
1930           maybe_warn_bidi_on_close (pfile, buffer->cur);
1931         }
1932     }
1933   else
1934     {
1935       while (*buffer->cur != '\n')
1936         {
1937           if (*buffer->cur < utf8_continuation)
1938             {
1939               buffer->cur++;
1940               continue;
1941             }
1942           buffer->cur
1943             = _cpp_handle_multibyte_utf8 (pfile, *buffer->cur, buffer->cur + 1,
1944                                           warn_bidi_p, warn_invalid_utf8_p);
1945         }
1946       if (warn_bidi_p)
1947         maybe_warn_bidi_on_close (pfile, buffer->cur);
1948     }
1949
1950   _cpp_process_line_notes (pfile, true);
1951   return orig_line != pfile->line_table->highest_line;
1952 }
1953
1954 /* Skips whitespace, saving the next non-whitespace character.  */
1955 static void
1956 skip_whitespace (cpp_reader *pfile, cppchar_t c)
1957 {
1958   cpp_buffer *buffer = pfile->buffer;
1959   bool saw_NUL = false;
1960
1961   do
1962     {
1963       /* Horizontal space always OK.  */
1964       if (c == ' ' || c == '\t')
1965         ;
1966       /* Just \f \v or \0 left.  */
1967       else if (c == '\0')
1968         saw_NUL = true;
1969       else if (pfile->state.in_directive && CPP_PEDANTIC (pfile))
1970         cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line,
1971                              CPP_BUF_COL (buffer),
1972                              "%s in preprocessing directive",
1973                              c == '\f' ? "form feed" : "vertical tab");
1974
1975       c = *buffer->cur++;
1976     }
1977   /* We only want non-vertical space, i.e. ' ' \t \f \v \0.  */
1978   while (is_nvspace (c));
1979
1980   if (saw_NUL)
1981     {
1982       encoding_rich_location rich_loc (pfile);
1983       cpp_error_at (pfile, CPP_DL_WARNING, &rich_loc,
1984                     "null character(s) ignored");
1985     }
1986
1987   buffer->cur--;
1988 }
1989
1990 /* See if the characters of a number token are valid in a name (no
1991    '.', '+' or '-').  */
1992 static int
1993 name_p (cpp_reader *pfile, const cpp_string *string)
1994 {
1995   unsigned int i;
1996
1997   for (i = 0; i < string->len; i++)
1998     if (!is_idchar (string->text[i]))
1999       return 0;
2000
2001   return 1;
2002 }
2003
2004 /* After parsing an identifier or other sequence, produce a warning about
2005    sequences not in NFC/NFKC.  */
2006 static void
2007 warn_about_normalization (cpp_reader *pfile,
2008                           const cpp_token *token,
2009                           const struct normalize_state *s)
2010 {
2011   if (CPP_OPTION (pfile, warn_normalize) < NORMALIZE_STATE_RESULT (s)
2012       && !pfile->state.skipping)
2013     {
2014       location_t loc = token->src_loc;
2015
2016       /* If possible, create a location range for the token.  */
2017       if (loc >= RESERVED_LOCATION_COUNT
2018           && token->type != CPP_EOF
2019           /* There must be no line notes to process.  */
2020           && (!(pfile->buffer->cur
2021                 >= pfile->buffer->notes[pfile->buffer->cur_note].pos
2022                 && !pfile->overlaid_buffer)))
2023         {
2024           source_range tok_range;
2025           tok_range.m_start = loc;
2026           tok_range.m_finish
2027             = linemap_position_for_column (pfile->line_table,
2028                                            CPP_BUF_COLUMN (pfile->buffer,
2029                                                            pfile->buffer->cur));
2030           loc = COMBINE_LOCATION_DATA (pfile->line_table,
2031                                        loc, tok_range, NULL);
2032         }
2033
2034       encoding_rich_location rich_loc (pfile, loc);
2035
2036       /* Make sure that the token is printed using UCNs, even
2037          if we'd otherwise happily print UTF-8.  */
2038       unsigned char *buf = XNEWVEC (unsigned char, cpp_token_len (token));
2039       size_t sz;
2040
2041       sz = cpp_spell_token (pfile, token, buf, false) - buf;
2042       if (NORMALIZE_STATE_RESULT (s) == normalized_C)
2043         cpp_warning_at (pfile, CPP_W_NORMALIZE, &rich_loc,
2044                         "`%.*s' is not in NFKC", (int) sz, buf);
2045       else if (CPP_OPTION (pfile, cplusplus))
2046         cpp_pedwarning_at (pfile, CPP_W_NORMALIZE, &rich_loc,
2047                                   "`%.*s' is not in NFC", (int) sz, buf);
2048       else
2049         cpp_warning_at (pfile, CPP_W_NORMALIZE, &rich_loc,
2050                         "`%.*s' is not in NFC", (int) sz, buf);
2051       free (buf);
2052     }
2053 }
2054
2055 /* Returns TRUE if the sequence starting at buffer->cur is valid in
2056    an identifier.  FIRST is TRUE if this starts an identifier.  */
2057
2058 static bool
2059 forms_identifier_p (cpp_reader *pfile, int first,
2060                     struct normalize_state *state)
2061 {
2062   cpp_buffer *buffer = pfile->buffer;
2063   const bool warn_bidi_p = pfile->warn_bidi_p ();
2064
2065   if (*buffer->cur == '$')
2066     {
2067       if (!CPP_OPTION (pfile, dollars_in_ident))
2068         return false;
2069
2070       buffer->cur++;
2071       if (CPP_OPTION (pfile, warn_dollars) && !pfile->state.skipping)
2072         {
2073           CPP_OPTION (pfile, warn_dollars) = 0;
2074           cpp_error (pfile, CPP_DL_PEDWARN, "'$' in identifier or number");
2075         }
2076
2077       return true;
2078     }
2079
2080   /* Is this a syntactically valid UCN or a valid UTF-8 char?  */
2081   if (CPP_OPTION (pfile, extended_identifiers))
2082     {
2083       cppchar_t s;
2084       if (*buffer->cur >= utf8_signifier)
2085         {
2086           if (__builtin_expect (*buffer->cur == bidi::utf8_start, 0)
2087               && warn_bidi_p)
2088             {
2089               location_t loc;
2090               bidi::kind kind = get_bidi_utf8 (pfile, buffer->cur, &loc);
2091               maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
2092             }
2093           if (_cpp_valid_utf8 (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
2094                                state, &s))
2095             return true;
2096         }
2097       else if (*buffer->cur == '\\'
2098                && (buffer->cur[1] == 'u'
2099                    || buffer->cur[1] == 'U'
2100                    || buffer->cur[1] == 'N'))
2101         {
2102           buffer->cur += 2;
2103           if (warn_bidi_p)
2104             {
2105               location_t loc;
2106               bidi::kind kind;
2107               if (buffer->cur[-1] == 'N')
2108                 kind = get_bidi_named (pfile, buffer->cur, &loc);
2109               else
2110                 kind = get_bidi_ucn (pfile, buffer->cur,
2111                                      buffer->cur[-1] == 'U', &loc);
2112               maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/true, loc);
2113             }
2114           if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
2115                               state, &s, NULL, NULL))
2116             return true;
2117           buffer->cur -= 2;
2118         }
2119     }
2120
2121   return false;
2122 }
2123
2124 /* Helper function to issue error about improper __VA_OPT__ use.  */
2125 static void
2126 maybe_va_opt_error (cpp_reader *pfile)
2127 {
2128   if (CPP_PEDANTIC (pfile) && !CPP_OPTION (pfile, va_opt))
2129     {
2130       /* __VA_OPT__ should not be accepted at all, but allow it in
2131          system headers.  */
2132       if (!_cpp_in_system_header (pfile))
2133         cpp_error (pfile, CPP_DL_PEDWARN,
2134                    "__VA_OPT__ is not available until C++20");
2135     }
2136   else if (!pfile->state.va_args_ok)
2137     {
2138       /* __VA_OPT__ should only appear in the replacement list of a
2139          variadic macro.  */
2140       cpp_error (pfile, CPP_DL_PEDWARN,
2141                  "__VA_OPT__ can only appear in the expansion"
2142                  " of a C++20 variadic macro");
2143     }
2144 }
2145
2146 /* Helper function to get the cpp_hashnode of the identifier BASE.  */
2147 static cpp_hashnode *
2148 lex_identifier_intern (cpp_reader *pfile, const uchar *base)
2149 {
2150   cpp_hashnode *result;
2151   const uchar *cur;
2152   unsigned int len;
2153   unsigned int hash = HT_HASHSTEP (0, *base);
2154
2155   cur = base + 1;
2156   while (ISIDNUM (*cur))
2157     {
2158       hash = HT_HASHSTEP (hash, *cur);
2159       cur++;
2160     }
2161   len = cur - base;
2162   hash = HT_HASHFINISH (hash, len);
2163   result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
2164                                               base, len, hash, HT_ALLOC));
2165
2166   /* Rarely, identifiers require diagnostics when lexed.  */
2167   if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
2168                         && !pfile->state.skipping, 0))
2169     {
2170       /* It is allowed to poison the same identifier twice.  */
2171       if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
2172         cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
2173                    NODE_NAME (result));
2174
2175       /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
2176          replacement list of a variadic macro.  */
2177       if (result == pfile->spec_nodes.n__VA_ARGS__
2178           && !pfile->state.va_args_ok)
2179         {
2180           if (CPP_OPTION (pfile, cplusplus))
2181             cpp_error (pfile, CPP_DL_PEDWARN,
2182                        "__VA_ARGS__ can only appear in the expansion"
2183                        " of a C++11 variadic macro");
2184           else
2185             cpp_error (pfile, CPP_DL_PEDWARN,
2186                        "__VA_ARGS__ can only appear in the expansion"
2187                        " of a C99 variadic macro");
2188         }
2189
2190       if (result == pfile->spec_nodes.n__VA_OPT__)
2191         maybe_va_opt_error (pfile);
2192
2193       /* For -Wc++-compat, warn about use of C++ named operators.  */
2194       if (result->flags & NODE_WARN_OPERATOR)
2195         cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
2196                      "identifier \"%s\" is a special operator name in C++",
2197                      NODE_NAME (result));
2198     }
2199
2200   return result;
2201 }
2202
2203 /* Get the cpp_hashnode of an identifier specified by NAME in
2204    the current cpp_reader object.  If none is found, NULL is returned.  */
2205 cpp_hashnode *
2206 _cpp_lex_identifier (cpp_reader *pfile, const char *name)
2207 {
2208   cpp_hashnode *result;
2209   result = lex_identifier_intern (pfile, (uchar *) name);
2210   return result;
2211 }
2212
2213 /* Lex an identifier starting at BUFFER->CUR - 1.  */
2214 static cpp_hashnode *
2215 lex_identifier (cpp_reader *pfile, const uchar *base, bool starts_ucn,
2216                 struct normalize_state *nst, cpp_hashnode **spelling)
2217 {
2218   cpp_hashnode *result;
2219   const uchar *cur;
2220   unsigned int len;
2221   unsigned int hash = HT_HASHSTEP (0, *base);
2222   const bool warn_bidi_p = pfile->warn_bidi_p ();
2223
2224   cur = pfile->buffer->cur;
2225   if (! starts_ucn)
2226     {
2227       while (ISIDNUM (*cur))
2228         {
2229           hash = HT_HASHSTEP (hash, *cur);
2230           cur++;
2231         }
2232       NORMALIZE_STATE_UPDATE_IDNUM (nst, *(cur - 1));
2233     }
2234   pfile->buffer->cur = cur;
2235   if (starts_ucn || forms_identifier_p (pfile, false, nst))
2236     {
2237       /* Slower version for identifiers containing UCNs
2238          or extended chars (including $).  */
2239       do {
2240         while (ISIDNUM (*pfile->buffer->cur))
2241           {
2242             NORMALIZE_STATE_UPDATE_IDNUM (nst, *pfile->buffer->cur);
2243             pfile->buffer->cur++;
2244           }
2245       } while (forms_identifier_p (pfile, false, nst));
2246       if (warn_bidi_p)
2247         maybe_warn_bidi_on_close (pfile, pfile->buffer->cur);
2248       result = _cpp_interpret_identifier (pfile, base,
2249                                           pfile->buffer->cur - base);
2250       *spelling = cpp_lookup (pfile, base, pfile->buffer->cur - base);
2251     }
2252   else
2253     {
2254       len = cur - base;
2255       hash = HT_HASHFINISH (hash, len);
2256
2257       result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
2258                                                   base, len, hash, HT_ALLOC));
2259       *spelling = result;
2260     }
2261
2262   /* Rarely, identifiers require diagnostics when lexed.  */
2263   if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
2264                         && !pfile->state.skipping, 0))
2265     {
2266       /* It is allowed to poison the same identifier twice.  */
2267       if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
2268         cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
2269                    NODE_NAME (result));
2270
2271       /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
2272          replacement list of a variadic macro.  */
2273       if (result == pfile->spec_nodes.n__VA_ARGS__
2274           && !pfile->state.va_args_ok)
2275         {
2276           if (CPP_OPTION (pfile, cplusplus))
2277             cpp_error (pfile, CPP_DL_PEDWARN,
2278                        "__VA_ARGS__ can only appear in the expansion"
2279                        " of a C++11 variadic macro");
2280           else
2281             cpp_error (pfile, CPP_DL_PEDWARN,
2282                        "__VA_ARGS__ can only appear in the expansion"
2283                        " of a C99 variadic macro");
2284         }
2285
2286       /* __VA_OPT__ should only appear in the replacement list of a
2287          variadic macro.  */
2288       if (result == pfile->spec_nodes.n__VA_OPT__)
2289         maybe_va_opt_error (pfile);
2290
2291       /* For -Wc++-compat, warn about use of C++ named operators.  */
2292       if (result->flags & NODE_WARN_OPERATOR)
2293         cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
2294                      "identifier \"%s\" is a special operator name in C++",
2295                      NODE_NAME (result));
2296     }
2297
2298   return result;
2299 }
2300
2301 /* Lex a number to NUMBER starting at BUFFER->CUR - 1.  */
2302 static void
2303 lex_number (cpp_reader *pfile, cpp_string *number,
2304             struct normalize_state *nst)
2305 {
2306   const uchar *cur;
2307   const uchar *base;
2308   uchar *dest;
2309
2310   base = pfile->buffer->cur - 1;
2311   do
2312     {
2313       const uchar *adj_digit_sep = NULL;
2314       cur = pfile->buffer->cur;
2315
2316       /* N.B. ISIDNUM does not include $.  */
2317       while (ISIDNUM (*cur)
2318              || (*cur == '.' && !DIGIT_SEP (cur[-1]))
2319              || DIGIT_SEP (*cur)
2320              || (VALID_SIGN (*cur, cur[-1]) && !DIGIT_SEP (cur[-2])))
2321         {
2322           NORMALIZE_STATE_UPDATE_IDNUM (nst, *cur);
2323           /* Adjacent digit separators do not form part of the pp-number syntax.
2324              However, they can safely be diagnosed here as an error, since '' is
2325              not a valid preprocessing token.  */
2326           if (DIGIT_SEP (*cur) && DIGIT_SEP (cur[-1]) && !adj_digit_sep)
2327             adj_digit_sep = cur;
2328           cur++;
2329         }
2330       /* A number can't end with a digit separator.  */
2331       while (cur > pfile->buffer->cur && DIGIT_SEP (cur[-1]))
2332         --cur;
2333       if (adj_digit_sep && adj_digit_sep < cur)
2334         cpp_error (pfile, CPP_DL_ERROR, "adjacent digit separators");
2335
2336       pfile->buffer->cur = cur;
2337     }
2338   while (forms_identifier_p (pfile, false, nst));
2339
2340   number->len = cur - base;
2341   dest = _cpp_unaligned_alloc (pfile, number->len + 1);
2342   memcpy (dest, base, number->len);
2343   dest[number->len] = '\0';
2344   number->text = dest;
2345 }
2346
2347 /* Create a token of type TYPE with a literal spelling.  */
2348 static void
2349 create_literal (cpp_reader *pfile, cpp_token *token, const uchar *base,
2350                 unsigned int len, enum cpp_ttype type)
2351 {
2352   token->type = type;
2353   token->val.str.len = len;
2354   token->val.str.text = cpp_alloc_token_string (pfile, base, len);
2355 }
2356
2357 const uchar *
2358 cpp_alloc_token_string (cpp_reader *pfile,
2359                         const unsigned char *ptr, unsigned len)
2360 {
2361   uchar *dest = _cpp_unaligned_alloc (pfile, len + 1);
2362
2363   dest[len] = 0;
2364   memcpy (dest, ptr, len);
2365   return dest;
2366 }
2367
2368 /* A pair of raw buffer pointers.  The currently open one is [1], the
2369    first one is [0].  Used for string literal lexing.  */
2370 struct lit_accum {
2371   _cpp_buff *first;
2372   _cpp_buff *last;
2373   const uchar *rpos;
2374   size_t accum;
2375
2376   lit_accum ()
2377     : first (NULL), last (NULL), rpos (0), accum (0)
2378   {
2379   }
2380
2381   void append (cpp_reader *, const uchar *, size_t);
2382
2383   void read_begin (cpp_reader *);
2384   bool reading_p () const
2385   {
2386     return rpos != NULL;
2387   }
2388   char read_char ()
2389   {
2390     char c = *rpos++;
2391     if (rpos == BUFF_FRONT (last))
2392       rpos = NULL;
2393     return c;
2394   }
2395 };
2396
2397 /* Subroutine of lex_raw_string: Append LEN chars from BASE to the buffer
2398    sequence from *FIRST_BUFF_P to LAST_BUFF_P.  */
2399
2400 void
2401 lit_accum::append (cpp_reader *pfile, const uchar *base, size_t len)
2402 {
2403   if (!last)
2404     /* Starting.  */
2405     first = last = _cpp_get_buff (pfile, len);
2406   else if (len > BUFF_ROOM (last))
2407     {
2408       /* There is insufficient room in the buffer.  Copy what we can,
2409          and then either extend or create a new one.  */
2410       size_t room = BUFF_ROOM (last);
2411       memcpy (BUFF_FRONT (last), base, room);
2412       BUFF_FRONT (last) += room;
2413       base += room;
2414       len -= room;
2415       accum += room;
2416
2417       gcc_checking_assert (!rpos);
2418
2419       last = _cpp_append_extend_buff (pfile, last, len);
2420     }
2421
2422   memcpy (BUFF_FRONT (last), base, len);
2423   BUFF_FRONT (last) += len;
2424   accum += len;
2425 }
2426
2427 void
2428 lit_accum::read_begin (cpp_reader *pfile)
2429 {
2430   /* We never accumulate more than 4 chars to read.  */
2431   if (BUFF_ROOM (last) < 4)
2432
2433     last = _cpp_append_extend_buff (pfile, last, 4);
2434   rpos = BUFF_FRONT (last);
2435 }
2436
2437 /* Returns true if a macro has been defined.
2438    This might not work if compile with -save-temps,
2439    or preprocess separately from compilation.  */
2440
2441 static bool
2442 is_macro(cpp_reader *pfile, const uchar *base)
2443 {
2444   const uchar *cur = base;
2445   if (! ISIDST (*cur))
2446     return false;
2447   unsigned int hash = HT_HASHSTEP (0, *cur);
2448   ++cur;
2449   while (ISIDNUM (*cur))
2450     {
2451       hash = HT_HASHSTEP (hash, *cur);
2452       ++cur;
2453     }
2454   hash = HT_HASHFINISH (hash, cur - base);
2455
2456   cpp_hashnode *result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
2457                                         base, cur - base, hash, HT_NO_INSERT));
2458
2459   return result && cpp_macro_p (result);
2460 }
2461
2462 /* Returns true if a literal suffix does not have the expected form
2463    and is defined as a macro.  */
2464
2465 static bool
2466 is_macro_not_literal_suffix(cpp_reader *pfile, const uchar *base)
2467 {
2468   /* User-defined literals outside of namespace std must start with a single
2469      underscore, so assume anything of that form really is a UDL suffix.
2470      We don't need to worry about UDLs defined inside namespace std because
2471      their names are reserved, so cannot be used as macro names in valid
2472      programs.  */
2473   if (base[0] == '_' && base[1] != '_')
2474     return false;
2475   return is_macro (pfile, base);
2476 }
2477
2478 /* Lexes a raw string.  The stored string contains the spelling,
2479    including double quotes, delimiter string, '(' and ')', any leading
2480    'L', 'u', 'U' or 'u8' and 'R' modifier.  The created token contains
2481    the type of the literal, or CPP_OTHER if it was not properly
2482    terminated.
2483
2484    BASE is the start of the token.  Updates pfile->buffer->cur to just
2485    after the lexed string.
2486
2487    The spelling is NUL-terminated, but it is not guaranteed that this
2488    is the first NUL since embedded NULs are preserved.  */
2489
2490 static void
2491 lex_raw_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
2492 {
2493   const uchar *pos = base;
2494   const bool warn_bidi_p = pfile->warn_bidi_p ();
2495   const bool warn_invalid_utf8_p = CPP_OPTION (pfile, cpp_warn_invalid_utf8);
2496   const bool warn_bidi_or_invalid_utf8_p = warn_bidi_p | warn_invalid_utf8_p;
2497
2498   /* 'tis a pity this information isn't passed down from the lexer's
2499      initial categorization of the token.  */
2500   enum cpp_ttype type = CPP_STRING;
2501
2502   if (*pos == 'L')
2503     {
2504       type = CPP_WSTRING;
2505       pos++;
2506     }
2507   else if (*pos == 'U')
2508     {
2509       type = CPP_STRING32;
2510       pos++;
2511     }
2512   else if (*pos == 'u')
2513     {
2514       if (pos[1] == '8')
2515         {
2516           type = CPP_UTF8STRING;
2517           pos++;
2518         }
2519       else
2520         type = CPP_STRING16;
2521       pos++;
2522     }
2523
2524   gcc_checking_assert (pos[0] == 'R' && pos[1] == '"');
2525   pos += 2;
2526
2527   _cpp_line_note *note = &pfile->buffer->notes[pfile->buffer->cur_note];
2528
2529   /* Skip notes before the ".  */
2530   while (note->pos < pos)
2531     ++note;
2532
2533   lit_accum accum;
2534
2535   uchar prefix[17];
2536   unsigned prefix_len = 0;
2537   enum Phase
2538   {
2539    PHASE_PREFIX = -2,
2540    PHASE_NONE = -1,
2541    PHASE_SUFFIX = 0
2542   } phase = PHASE_PREFIX;
2543
2544   for (;;)
2545     {
2546       gcc_checking_assert (note->pos >= pos);
2547
2548       /* Undo any escaped newlines and trigraphs.  */
2549       if (!accum.reading_p () && note->pos == pos)
2550         switch (note->type)
2551           {
2552           case '\\':
2553           case ' ':
2554             /* Restore backslash followed by newline.  */
2555             accum.append (pfile, base, pos - base);
2556             base = pos;
2557             accum.read_begin (pfile);
2558             accum.append (pfile, UC"\\", 1);
2559
2560           after_backslash:
2561             if (note->type == ' ')
2562               /* GNU backslash whitespace newline extension.  FIXME
2563                  could be any sequence of non-vertical space.  When we
2564                  can properly restore any such sequence, we should
2565                  mark this note as handled so _cpp_process_line_notes
2566                  doesn't warn.  */
2567               accum.append (pfile, UC" ", 1);
2568
2569             accum.append (pfile, UC"\n", 1);
2570             note++;
2571             break;
2572
2573           case '\n':
2574             /* This can happen for ??/<NEWLINE> when trigraphs are not
2575                being interpretted.  */
2576             gcc_checking_assert (!CPP_OPTION (pfile, trigraphs));
2577             note->type = 0;
2578             note++;
2579             break;
2580
2581           default:
2582             gcc_checking_assert (_cpp_trigraph_map[note->type]);
2583
2584             /* Don't warn about this trigraph in
2585                _cpp_process_line_notes, since trigraphs show up as
2586                trigraphs in raw strings.  */
2587             uchar type = note->type;
2588             note->type = 0;
2589
2590             if (CPP_OPTION (pfile, trigraphs))
2591               {
2592                 accum.append (pfile, base, pos - base);
2593                 base = pos;
2594                 accum.read_begin (pfile);
2595                 accum.append (pfile, UC"??", 2);
2596                 accum.append (pfile, &type, 1);
2597
2598                 /* ??/ followed by newline gets two line notes, one for
2599                    the trigraph and one for the backslash/newline.  */
2600                 if (type == '/' && note[1].pos == pos)
2601                   {
2602                     note++;
2603                     gcc_assert (note->type == '\\' || note->type == ' ');
2604                     goto after_backslash;
2605                   }
2606                 /* Skip the replacement character.  */
2607                 base = ++pos;
2608               }
2609
2610             note++;
2611             break;
2612           }
2613
2614       /* Now get a char to process.  Either from an expanded note, or
2615          from the line buffer.  */
2616       bool read_note = accum.reading_p ();
2617       char c = read_note ? accum.read_char () : *pos++;
2618
2619       if (phase == PHASE_PREFIX)
2620         {
2621           if (c == '(')
2622             {
2623               /* Done.  */
2624               phase = PHASE_NONE;
2625               prefix[prefix_len++] = '"';
2626             }
2627           else if (prefix_len < 16
2628                    /* Prefix chars are any of the basic character set,
2629                       [lex.charset] except for '
2630                       ()\\\t\v\f\n'. Optimized for a contiguous
2631                       alphabet.  */
2632                    /* Unlike a switch, this collapses down to one or
2633                       two shift and bitmask operations on an ASCII
2634                       system, with an outlier or two.   */
2635                    && (('Z' - 'A' == 25
2636                         ? ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'))
2637                         : ISIDST (c))
2638                        || (c >= '0' && c <= '9')
2639                        || c == '_' || c == '{' || c == '}'
2640                        || c == '[' || c == ']' || c == '#'
2641                        || c == '<' || c == '>' || c == '%'
2642                        || c == ':' || c == ';' || c == '.' || c == '?'
2643                        || c == '*' || c == '+' || c == '-' || c == '/'
2644                        || c == '^' || c == '&' || c == '|' || c == '~'
2645                        || c == '!' || c == '=' || c == ','
2646                        || c == '"' || c == '\''))
2647             prefix[prefix_len++] = c;
2648           else
2649             {
2650               /* Something is wrong.  */
2651               int col = CPP_BUF_COLUMN (pfile->buffer, pos) + read_note;
2652               if (prefix_len == 16)
2653                 cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
2654                                      col, "raw string delimiter longer "
2655                                      "than 16 characters");
2656               else if (c == '\n')
2657                 cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
2658                                      col, "invalid new-line in raw "
2659                                      "string delimiter");
2660               else
2661                 cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
2662                                      col, "invalid character '%c' in "
2663                                      "raw string delimiter", c);
2664               type = CPP_OTHER;
2665               phase = PHASE_NONE;
2666               /* Continue until we get a close quote, that's probably
2667                  the best failure mode.  */
2668               prefix_len = 0;
2669             }
2670           if (c != '\n')
2671             continue;
2672         }
2673
2674       if (phase != PHASE_NONE)
2675         {
2676           if (prefix[phase] != c)
2677             phase = PHASE_NONE;
2678           else if (unsigned (phase + 1) == prefix_len)
2679             break;
2680           else
2681             {
2682               phase = Phase (phase + 1);
2683               continue;
2684             }
2685         }
2686
2687       if (!prefix_len && c == '"')
2688         /* Failure mode lexing.  */
2689         goto out;
2690       else if (prefix_len && c == ')')
2691         phase = PHASE_SUFFIX;
2692       else if (!read_note && c == '\n')
2693         {
2694           pos--;
2695           pfile->buffer->cur = pos;
2696           if (pfile->state.in_directive
2697               || (pfile->state.parsing_args
2698                   && pfile->buffer->next_line >= pfile->buffer->rlimit))
2699             {
2700               cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, 0,
2701                                    "unterminated raw string");
2702               type = CPP_OTHER;
2703               goto out;
2704             }
2705
2706           accum.append (pfile, base, pos - base + 1);
2707           _cpp_process_line_notes (pfile, false);
2708
2709           if (pfile->buffer->next_line < pfile->buffer->rlimit)
2710             CPP_INCREMENT_LINE (pfile, 0);
2711           pfile->buffer->need_line = true;
2712
2713           if (!_cpp_get_fresh_line (pfile))
2714             {
2715               /* We ran out of file and failed to get a line.  */
2716               location_t src_loc = token->src_loc;
2717               token->type = CPP_EOF;
2718               /* Tell the compiler the line number of the EOF token.  */
2719               token->src_loc = pfile->line_table->highest_line;
2720               token->flags = BOL;
2721               if (accum.first)
2722                 _cpp_release_buff (pfile, accum.first);
2723               cpp_error_with_line (pfile, CPP_DL_ERROR, src_loc, 0,
2724                                    "unterminated raw string");
2725               /* Now pop the buffer that _cpp_get_fresh_line did not.  */
2726               _cpp_pop_buffer (pfile);
2727               return;
2728             }
2729
2730           pos = base = pfile->buffer->cur;
2731           note = &pfile->buffer->notes[pfile->buffer->cur_note];
2732         }
2733       else if (__builtin_expect ((unsigned char) c >= utf8_continuation, 0)
2734                && warn_bidi_or_invalid_utf8_p)
2735         pos = _cpp_handle_multibyte_utf8 (pfile, c, pos, warn_bidi_p,
2736                                           warn_invalid_utf8_p);
2737     }
2738
2739   if (warn_bidi_p)
2740     maybe_warn_bidi_on_close (pfile, pos);
2741
2742   if (CPP_OPTION (pfile, user_literals))
2743     {
2744       /* If a string format macro, say from inttypes.h, is placed touching
2745          a string literal it could be parsed as a C++11 user-defined string
2746          literal thus breaking the program.  */
2747       if (is_macro_not_literal_suffix (pfile, pos))
2748         {
2749           /* Raise a warning, but do not consume subsequent tokens.  */
2750           if (CPP_OPTION (pfile, warn_literal_suffix) && !pfile->state.skipping)
2751             cpp_warning_with_line (pfile, CPP_W_LITERAL_SUFFIX,
2752                                    token->src_loc, 0,
2753                                    "invalid suffix on literal; C++11 requires "
2754                                    "a space between literal and string macro");
2755         }
2756       /* Grab user defined literal suffix.  */
2757       else if (ISIDST (*pos))
2758         {
2759           type = cpp_userdef_string_add_type (type);
2760           ++pos;
2761
2762           while (ISIDNUM (*pos))
2763             ++pos;
2764         }
2765     }
2766
2767  out:
2768   pfile->buffer->cur = pos;
2769   if (!accum.accum)
2770     create_literal (pfile, token, base, pos - base, type);
2771   else
2772     {
2773       size_t extra_len = pos - base;
2774       uchar *dest = _cpp_unaligned_alloc (pfile, accum.accum + extra_len + 1);
2775
2776       token->type = type;
2777       token->val.str.len = accum.accum + extra_len;
2778       token->val.str.text = dest;
2779       for (_cpp_buff *buf = accum.first; buf; buf = buf->next)
2780         {
2781           size_t len = BUFF_FRONT (buf) - buf->base;
2782           memcpy (dest, buf->base, len);
2783           dest += len;
2784         }
2785       _cpp_release_buff (pfile, accum.first);
2786       memcpy (dest, base, extra_len);
2787       dest[extra_len] = '\0';
2788     }
2789 }
2790
/* Lexes a string, character constant, or angle-bracketed header file
   name.  BASE points at the first character of the literal, which is
   either the opening delimiter itself or an encoding prefix ('L',
   'u', 'U' or 'u8') optionally followed by the 'R' raw-string
   modifier; raw strings are handed over to lex_raw_string.

   The result is delivered through TOKEN rather than as a return
   value.  The spelling handed to create_literal runs from BASE through
   the closing delimiter and any user-defined-literal suffix, so it
   includes the opening quote and any prefix.  The literal's type is
   the one implied by prefix and delimiter, or CPP_OTHER if it was not
   properly terminated.  For an unterminated header name only
   TOKEN->type is set (to CPP_LESS) and the buffer position is left
   alone, so that the text can be relexed as normal tokens.

   The spelling is NUL-terminated, but it is not guaranteed that this
   is the first NUL since embedded NULs are preserved.  */
static void
lex_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
{
  bool saw_NUL = false;
  const uchar *cur;
  cppchar_t terminator;
  enum cpp_ttype type;

  /* Step over any encoding prefix so that TERMINATOR ends up holding
     the opening delimiter (or 'R' for a raw string).  */
  cur = base;
  terminator = *cur++;
  if (terminator == 'L' || terminator == 'U')
    terminator = *cur++;
  else if (terminator == 'u')
    {
      terminator = *cur++;
      if (terminator == '8')
        terminator = *cur++;
    }
  if (terminator == 'R')
    {
      lex_raw_string (pfile, token, base);
      return;
    }
  /* Derive the token type from the delimiter and the prefix at BASE.
     Anything that is neither a double nor a single quote is treated as
     the start of a header name, which is closed by '>'.  */
  if (terminator == '"')
    type = (*base == 'L' ? CPP_WSTRING :
            *base == 'U' ? CPP_STRING32 :
            *base == 'u' ? (base[1] == '8' ? CPP_UTF8STRING : CPP_STRING16)
                         : CPP_STRING);
  else if (terminator == '\'')
    type = (*base == 'L' ? CPP_WCHAR :
            *base == 'U' ? CPP_CHAR32 :
            *base == 'u' ? (base[1] == '8' ? CPP_UTF8CHAR : CPP_CHAR16)
                         : CPP_CHAR);
  else
    terminator = '>', type = CPP_HEADER_NAME;

  const bool warn_bidi_p = pfile->warn_bidi_p ();
  const bool warn_invalid_utf8_p = CPP_OPTION (pfile, cpp_warn_invalid_utf8);
  const bool warn_bidi_or_invalid_utf8_p = warn_bidi_p | warn_invalid_utf8_p;
  /* Scan forward until the terminator or the end of the line.  */
  for (;;)
    {
      cppchar_t c = *cur++;

      /* In #include-style directives, terminators are not escapable.  */
      if (c == '\\' && !pfile->state.angled_headers && *cur != '\n')
        {
          /* \u, \U and \N escapes are inspected for bidirectional
             control characters before being skipped.  */
          if ((cur[0] == 'u' || cur[0] == 'U' || cur[0] == 'N') && warn_bidi_p)
            {
              location_t loc;
              bidi::kind kind;
              if (cur[0] == 'N')
                kind = get_bidi_named (pfile, cur + 1, &loc);
              else
                kind = get_bidi_ucn (pfile, cur + 1, cur[0] == 'U', &loc);
              maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/true, loc);
            }
          /* Skip the escaped character so that an escaped terminator
             does not end the literal.  */
          cur++;
        }
      else if (c == terminator)
        {
          if (warn_bidi_p)
            maybe_warn_bidi_on_close (pfile, cur - 1);
          break;
        }
      else if (c == '\n')
        {
          /* Leave the newline itself unconsumed.  */
          cur--;
          /* Unmatched quotes always yield undefined behavior, but
             greedy lexing means that what appears to be an unterminated
             header name may actually be a legitimate sequence of tokens.  */
          if (terminator == '>')
            {
              token->type = CPP_LESS;
              return;
            }
          type = CPP_OTHER;
          break;
        }
      else if (c == '\0')
        saw_NUL = true;
      else if (__builtin_expect (c >= utf8_continuation, 0)
               && warn_bidi_or_invalid_utf8_p)
        cur = _cpp_handle_multibyte_utf8 (pfile, c, cur, warn_bidi_p,
                                          warn_invalid_utf8_p);
    }

  if (saw_NUL && !pfile->state.skipping)
    cpp_error (pfile, CPP_DL_WARNING,
               "null character(s) preserved in literal");

  /* An unterminated literal is diagnosed except when the language is
     CLK_ASM.  */
  if (type == CPP_OTHER && CPP_OPTION (pfile, lang) != CLK_ASM)
    cpp_error (pfile, CPP_DL_PEDWARN, "missing terminating %c character",
               (int) terminator);

  if (CPP_OPTION (pfile, user_literals))
    {
      /* If a string format macro, say from inttypes.h, is placed touching
         a string literal it could be parsed as a C++11 user-defined string
         literal thus breaking the program.  */
      if (is_macro_not_literal_suffix (pfile, cur))
        {
          /* Raise a warning, but do not consume subsequent tokens.  */
          if (CPP_OPTION (pfile, warn_literal_suffix) && !pfile->state.skipping)
            cpp_warning_with_line (pfile, CPP_W_LITERAL_SUFFIX,
                                   token->src_loc, 0,
                                   "invalid suffix on literal; C++11 requires "
                                   "a space between literal and string macro");
        }
      /* Grab user defined literal suffix.  */
      else if (ISIDST (*cur))
        {
          /* Both conversions are applied in turn; presumably each one
             only changes the types it is responsible for -- confirm
             against their definitions.  */
          type = cpp_userdef_char_add_type (type);
          type = cpp_userdef_string_add_type (type);
          ++cur;

          while (ISIDNUM (*cur))
            ++cur;
        }
    }
  else if (CPP_OPTION (pfile, cpp_warn_cxx11_compat)
           && is_macro (pfile, cur)
           && !pfile->state.skipping)
    cpp_warning_with_line (pfile, CPP_W_CXX11_COMPAT,
                           token->src_loc, 0, "C++11 requires a space "
                           "between string literal and macro");

  /* Commit the new buffer position and build the literal token from
     the CUR - BASE characters starting at BASE.  */
  pfile->buffer->cur = cur;
  create_literal (pfile, token, base, cur - base, type);
}
2929
2930 /* Return the comment table. The client may not make any assumption
2931    about the ordering of the table.  */
2932 cpp_comment_table *
2933 cpp_get_comments (cpp_reader *pfile)
2934 {
2935   return &pfile->comments;
2936 }
2937
2938 /* Append a comment to the end of the comment table. */
2939 static void
2940 store_comment (cpp_reader *pfile, cpp_token *token)
2941 {
2942   int len;
2943
2944   if (pfile->comments.allocated == 0)
2945     {
2946       pfile->comments.allocated = 256;
2947       pfile->comments.entries = (cpp_comment *) xmalloc
2948         (pfile->comments.allocated * sizeof (cpp_comment));
2949     }
2950
2951   if (pfile->comments.count == pfile->comments.allocated)
2952     {
2953       pfile->comments.allocated *= 2;
2954       pfile->comments.entries = (cpp_comment *) xrealloc
2955         (pfile->comments.entries,
2956          pfile->comments.allocated * sizeof (cpp_comment));
2957     }
2958
2959   len = token->val.str.len;
2960
2961   /* Copy comment. Note, token may not be NULL terminated. */
2962   pfile->comments.entries[pfile->comments.count].comment =
2963     (char *) xmalloc (sizeof (char) * (len + 1));
2964   memcpy (pfile->comments.entries[pfile->comments.count].comment,
2965           token->val.str.text, len);
2966   pfile->comments.entries[pfile->comments.count].comment[len] = '\0';
2967
2968   /* Set source location. */
2969   pfile->comments.entries[pfile->comments.count].sloc = token->src_loc;
2970
2971   /* Increment the count of entries in the comment table. */
2972   pfile->comments.count++;
2973 }
2974
2975 /* The stored comment includes the comment start and any terminator.  */
2976 static void
2977 save_comment (cpp_reader *pfile, cpp_token *token, const unsigned char *from,
2978               cppchar_t type)
2979 {
2980   unsigned char *buffer;
2981   unsigned int len, clen, i;
2982
2983   len = pfile->buffer->cur - from + 1; /* + 1 for the initial '/'.  */
2984
2985   /* C++ comments probably (not definitely) have moved past a new
2986      line, which we don't want to save in the comment.  */
2987   if (is_vspace (pfile->buffer->cur[-1]))
2988     len--;
2989
2990   /* If we are currently in a directive or in argument parsing, then
2991      we need to store all C++ comments as C comments internally, and
2992      so we need to allocate a little extra space in that case.
2993
2994      Note that the only time we encounter a directive here is
2995      when we are saving comments in a "#define".  */
2996   clen = ((pfile->state.in_directive || pfile->state.parsing_args)
2997           && type == '/') ? len + 2 : len;
2998
2999   buffer = _cpp_unaligned_alloc (pfile, clen);
3000
3001   token->type = CPP_COMMENT;
3002   token->val.str.len = clen;
3003   token->val.str.text = buffer;
3004
3005   buffer[0] = '/';
3006   memcpy (buffer + 1, from, len - 1);
3007
3008   /* Finish conversion to a C comment, if necessary.  */
3009   if ((pfile->state.in_directive || pfile->state.parsing_args) && type == '/')
3010     {
3011       buffer[1] = '*';
3012       buffer[clen - 2] = '*';
3013       buffer[clen - 1] = '/';
3014       /* As there can be in a C++ comments illegal sequences for C comments
3015          we need to filter them out.  */
3016       for (i = 2; i < (clen - 2); i++)
3017         if (buffer[i] == '/' && (buffer[i - 1] == '*' || buffer[i + 1] == '*'))
3018           buffer[i] = '|';
3019     }
3020
3021   /* Finally store this comment for use by clients of libcpp. */
3022   store_comment (pfile, token);
3023 }
3024
/* Returns true if comment at COMMENT_START is a recognized FALLTHROUGH
   comment.  COMMENT_START points at the character after the comment's
   initial '/': a '*' there means a C block comment, anything else is
   handled as a C++ line comment.  Remaining lengths are measured up to
   pfile->buffer->cur (presumably the position just past the comment --
   confirm against the caller).  Which spellings are accepted depends
   on the cpp_warn_implicit_fallthrough option level.  */

static bool
fallthrough_comment_p (cpp_reader *pfile, const unsigned char *comment_start)
{
  /* First character of the comment's contents.  */
  const unsigned char *from = comment_start + 1;

  switch (CPP_OPTION (pfile, cpp_warn_implicit_fallthrough))
    {
      /* For both -Wimplicit-fallthrough=0 and -Wimplicit-fallthrough=5 we
         don't recognize any comments.  The latter only checks attributes,
         the former doesn't warn.  */
    case 0:
    default:
      return false;
      /* -Wimplicit-fallthrough=1 considers any comment, no matter what
         content it has.  */
    case 1:
      return true;
    case 2:
      /* -Wimplicit-fallthrough=2 looks for (case insensitive)
         .*falls?[ \t-]*thr(u|ough).* regex.  */
      /* Try every start position that still leaves room for the
         shortest accepted spelling.  */
      for (; (size_t) (pfile->buffer->cur - from) >= sizeof "fallthru" - 1;
           from++)
        {
          /* Is there anything like strpbrk with upper boundary, or
             memchr looking for 2 characters rather than just one?  */
          if (from[0] != 'f' && from[0] != 'F')
            continue;
          if (from[1] != 'a' && from[1] != 'A')
            continue;
          if (from[2] != 'l' && from[2] != 'L')
            continue;
          if (from[3] != 'l' && from[3] != 'L')
            continue;
          from += sizeof "fall" - 1;
          if (from[0] == 's' || from[0] == 'S')
            from++;
          while (*from == ' ' || *from == '\t' || *from == '-')
            from++;
          if (from[0] != 't' && from[0] != 'T')
            continue;
          if (from[1] != 'h' && from[1] != 'H')
            continue;
          if (from[2] != 'r' && from[2] != 'R')
            continue;
          if (from[3] == 'u' || from[3] == 'U')
            return true;
          if (from[3] != 'o' && from[3] != 'O')
            continue;
          if (from[4] != 'u' && from[4] != 'U')
            continue;
          if (from[5] != 'g' && from[5] != 'G')
            continue;
          if (from[6] != 'h' && from[6] != 'H')
            continue;
          return true;
        }
      return false;
      /* Levels 3 and 4 require the whole comment to match one of the
         forms below.  */
    case 3:
    case 4:
      break;
    }

  /* Each branch below advances FROM past the text it accepts; the
     checks at the end of the function then require the comment to
     finish right there.  */

  /* Whole comment contents:
     -fallthrough
     @fallthrough@
   */
  if (*from == '-' || *from == '@')
    {
      size_t len = sizeof "fallthrough" - 1;
      if ((size_t) (pfile->buffer->cur - from - 1) < len)
        return false;
      if (memcmp (from + 1, "fallthrough", len))
        return false;
      if (*from == '@')
        {
          /* The '@' form needs a closing '@' as well.  */
          if (from[len + 1] != '@')
            return false;
          len++;
        }
      from += 1 + len;
    }
  /* Whole comment contents (regex):
     lint -fallthrough[ \t]*
   */
  else if (*from == 'l')
    {
      size_t len = sizeof "int -fallthrough" - 1;
      if ((size_t) (pfile->buffer->cur - from - 1) < len)
        return false;
      if (memcmp (from + 1, "int -fallthrough", len))
        return false;
      from += 1 + len;
      while (*from == ' ' || *from == '\t')
        from++;
    }
  /* Whole comment contents (regex):
     [ \t]*FALLTHR(U|OUGH)[ \t]*
   */
  else if (CPP_OPTION (pfile, cpp_warn_implicit_fallthrough) == 4)
    {
      while (*from == ' ' || *from == '\t')
        from++;
      if ((size_t) (pfile->buffer->cur - from)  < sizeof "FALLTHRU" - 1)
        return false;
      if (memcmp (from, "FALLTHR", sizeof "FALLTHR" - 1))
        return false;
      from += sizeof "FALLTHR" - 1;
      if (*from == 'U')
        from++;
      else if ((size_t) (pfile->buffer->cur - from)  < sizeof "OUGH" - 1)
        return false;
      else if (memcmp (from, "OUGH", sizeof "OUGH" - 1))
        return false;
      else
        from += sizeof "OUGH" - 1;
      while (*from == ' ' || *from == '\t')
        from++;
    }
  /* Whole comment contents (regex):
     [ \t.!]*(ELSE,? |INTENTIONAL(LY)? )?FALL(S | |-)?THR(OUGH|U)[ \t.!]*(-[^\n\r]*)?
     [ \t.!]*(Else,? |Intentional(ly)? )?Fall((s | |-)[Tt]|t)hr(ough|u)[ \t.!]*(-[^\n\r]*)?
     [ \t.!]*([Ee]lse,? |[Ii]ntentional(ly)? )?fall(s | |-)?thr(ough|u)[ \t.!]*(-[^\n\r]*)?
   */
  else
    {
      while (*from == ' ' || *from == '\t' || *from == '.' || *from == '!')
        from++;
      /* F tracks the first letter of the word being examined, and
         ALL_UPPER records that an all-capitals word has been seen, so
         that the rest must be in capitals too.  */
      unsigned char f = *from;
      bool all_upper = false;
      /* Optional leading "else" word.  */
      if (f == 'E' || f == 'e')
        {
          if ((size_t) (pfile->buffer->cur - from)
              < sizeof "else fallthru" - 1)
            return false;
          if (f == 'E' && memcmp (from + 1, "LSE", sizeof "LSE" - 1) == 0)
            all_upper = true;
          else if (memcmp (from + 1, "lse", sizeof "lse" - 1))
            return false;
          from += sizeof "else" - 1;
          if (*from == ',')
            from++;
          if (*from != ' ')
            return false;
          from++;
          if (all_upper && *from == 'f')
            return false;
          if (f == 'e' && *from == 'F')
            return false;
          f = *from;
        }
      /* Optional leading "intentional" or "intentionally" word.  */
      else if (f == 'I' || f == 'i')
        {
          if ((size_t) (pfile->buffer->cur - from)
              < sizeof "intentional fallthru" - 1)
            return false;
          if (f == 'I' && memcmp (from + 1, "NTENTIONAL",
                                  sizeof "NTENTIONAL" - 1) == 0)
            all_upper = true;
          else if (memcmp (from + 1, "ntentional",
                           sizeof "ntentional" - 1))
            return false;
          from += sizeof "intentional" - 1;
          if (*from == ' ')
            {
              from++;
              if (all_upper && *from == 'f')
                return false;
            }
          else if (all_upper)
            {
              /* The comparison includes the following 'F', but only
                 "LY " is stepped over.  */
              if (memcmp (from, "LY F", sizeof "LY F" - 1))
                return false;
              from += sizeof "LY " - 1;
            }
          else
            {
              if (memcmp (from, "ly ", sizeof "ly " - 1))
                return false;
              from += sizeof "ly " - 1;
            }
          if (f == 'i' && *from == 'F')
            return false;
          f = *from;
        }
      /* The "fall" part.  */
      if (f != 'F' && f != 'f')
        return false;
      if ((size_t) (pfile->buffer->cur - from) < sizeof "fallthru" - 1)
        return false;
      if (f == 'F' && memcmp (from + 1, "ALL", sizeof "ALL" - 1) == 0)
        all_upper = true;
      else if (all_upper)
        return false;
      else if (memcmp (from + 1, "all", sizeof "all" - 1))
        return false;
      from += sizeof "fall" - 1;
      /* Optional "s ", " " or "-" between "fall" and "thr".  */
      if (*from == (all_upper ? 'S' : 's') && from[1] == ' ')
        from += 2;
      else if (*from == ' ' || *from == '-')
        from++;
      else if (*from != (all_upper ? 'T' : 't'))
        return false;
      /* A capital 'T' is accepted only when the word started with a
         capital 'F'; a lower-case 't' only when not all in capitals.  */
      if ((f == 'f' || *from != 'T') && (all_upper || *from != 't'))
        return false;
      if ((size_t) (pfile->buffer->cur - from) < sizeof "thru" - 1)
        return false;
      if (memcmp (from + 1, all_upper ? "HRU" : "hru", sizeof "hru" - 1))
        {
          if ((size_t) (pfile->buffer->cur - from) < sizeof "through" - 1)
            return false;
          if (memcmp (from + 1, all_upper ? "HROUGH" : "hrough",
                      sizeof "hrough" - 1))
            return false;
          from += sizeof "through" - 1;
        }
      else
        from += sizeof "thru" - 1;
      while (*from == ' ' || *from == '\t' || *from == '.' || *from == '!')
        from++;
      /* A '-' introduces trailing free text, which is skipped up to
         the end of the line or, in a block comment, up to the closing
         delimiter.  */
      if (*from == '-')
        {
          from++;
          if (*comment_start == '*')
            {
              do
                {
                  while (*from && *from != '*'
                         && *from != '\n' && *from != '\r')
                    from++;
                  /* Stop unless this is a '*' that does not begin the
                     closing delimiter.  */
                  if (*from != '*' || from[1] == '/')
                    break;
                  from++;
                }
              while (1);
            }
          else
            while (*from && *from != '\n' && *from != '\r')
              from++;
        }
    }
  /* The accepted text must be followed directly by the end of the
     comment.  */
  /* C block comment.  */
  if (*comment_start == '*')
    {
      if (*from != '*' || from[1] != '/')
        return false;
    }
  /* C++ line comment.  */
  else if (*from != '\n')
    return false;

  return true;
}
3279
3280 /* Allocate COUNT tokens for RUN.  */
3281 void
3282 _cpp_init_tokenrun (tokenrun *run, unsigned int count)
3283 {
3284   run->base = XNEWVEC (cpp_token, count);
3285   run->limit = run->base + count;
3286   run->next = NULL;
3287 }
3288
3289 /* Returns the next tokenrun, or creates one if there is none.  */
3290 static tokenrun *
3291 next_tokenrun (tokenrun *run)
3292 {
3293   if (run->next == NULL)
3294     {
3295       run->next = XNEW (tokenrun);
3296       run->next->prev = run;
3297       _cpp_init_tokenrun (run->next, 250);
3298     }
3299
3300   return run->next;
3301 }
3302
3303 /* Return the number of not yet processed token in a given
3304    context.  */
3305 int
3306 _cpp_remaining_tokens_num_in_context (cpp_context *context)
3307 {
3308   if (context->tokens_kind == TOKENS_KIND_DIRECT)
3309     return (LAST (context).token - FIRST (context).token);
3310   else if (context->tokens_kind == TOKENS_KIND_INDIRECT
3311            || context->tokens_kind == TOKENS_KIND_EXTENDED)
3312     return (LAST (context).ptoken - FIRST (context).ptoken);
3313   else
3314       abort ();
3315 }
3316
3317 /* Returns the token present at index INDEX in a given context.  If
3318    INDEX is zero, the next token to be processed is returned.  */
3319 static const cpp_token*
3320 _cpp_token_from_context_at (cpp_context *context, int index)
3321 {
3322   if (context->tokens_kind == TOKENS_KIND_DIRECT)
3323     return &(FIRST (context).token[index]);
3324   else if (context->tokens_kind == TOKENS_KIND_INDIRECT
3325            || context->tokens_kind == TOKENS_KIND_EXTENDED)
3326     return FIRST (context).ptoken[index];
3327  else
3328    abort ();
3329 }
3330
/* Look ahead in the input stream.  INDEX is the number of tokens to
   skip, zero meaning the next token to be processed.  Pending macro
   contexts are searched first; after that new tokens are lexed and
   then handed to _cpp_backup_tokens_direct (presumably so that they
   are seen again by normal lexing -- confirm there).  Returns the
   token at the requested position, or the CPP_EOF or CPP_PRAGMA token
   at which peeking had to stop early.  */
const cpp_token *
cpp_peek_token (cpp_reader *pfile, int index)
{
  cpp_context *context = pfile->context;
  const cpp_token *peektok;
  int count;

  /* First, scan through any pending cpp_context objects.  */
  while (context->prev)
    {
      ptrdiff_t sz = _cpp_remaining_tokens_num_in_context (context);

      if (index < (int) sz)
        return _cpp_token_from_context_at (context, index);
      /* Not in this context: skip its tokens and try the one below.  */
      index -= (int) sz;
      context = context->prev;
    }

  /* We will have to read some new tokens after all (and do so
     without invalidating preceding tokens).  */
  count = index;
  pfile->keep_tokens++;

  /* For peeked tokens temporarily disable line_change reporting,
     until the tokens are parsed for real.  */
  void (*line_change) (cpp_reader *, const cpp_token *, int)
    = pfile->cb.line_change;
  pfile->cb.line_change = NULL;

  /* Lex tokens until the requested one is reached.  INDEX is
     decremented once per token lexed, including on the early exits,
     so that COUNT - INDEX below equals the number of tokens lexed.  */
  do
    {
      peektok = _cpp_lex_token (pfile);
      if (peektok->type == CPP_EOF)
        {
          index--;
          break;
        }
      else if (peektok->type == CPP_PRAGMA)
        {
          /* Don't peek past a pragma.  */
          if (peektok == &pfile->directive_result)
            /* Save the pragma in the buffer.  */
            *pfile->cur_token++ = *peektok;
          index--;
          break;
        }
    }
  while (index--);

  /* Undo the temporary state changes made above.  */
  _cpp_backup_tokens_direct (pfile, count - index);
  pfile->keep_tokens--;
  pfile->cb.line_change = line_change;

  return peektok;
}
3387
/* Allocate a single token that is invalidated at the same time as the
   rest of the tokens on the line.  Has its line and col set to the
   same as the last lexed token, so that diagnostics appear in the
   right place.

   The new token takes the slot at pfile->cur_token.  When lookahead
   tokens are pending they are first moved up by one slot, spilling
   into the following token run if the current one is too short.  */
cpp_token *
_cpp_temp_token (cpp_reader *pfile)
{
  cpp_token *old, *result;
  /* Slots left in the current run, counting from cur_token.  */
  ptrdiff_t sz = pfile->cur_run->limit - pfile->cur_token;
  /* Number of pending lookahead tokens.  */
  ptrdiff_t la = (ptrdiff_t) pfile->lookaheads;

  /* The token just before the insertion point; the result takes its
     source location.  */
  old = pfile->cur_token - 1;
  /* Any pre-existing lookaheads must not be clobbered.  */
  if (la)
    {
      /* Moving LA tokens up by one does not fit in the SZ slots left,
         so the next run has to take the overflow.  (Presumably the
         lookaheads sit contiguously from cur_token on, continuing at
         the base of the next run -- confirm.)  */
      if (sz <= la)
        {
          tokenrun *next = next_tokenrun (pfile->cur_run);

          /* Lookaheads already in the next run move up by one...  */
          if (sz < la)
            memmove (next->base + 1, next->base,
                     (la - sz) * sizeof (cpp_token));

          /* ...making room for the last token of the current run.  */
          next->base[0] = pfile->cur_run->limit[-1];
        }

      /* Move up the lookaheads that stay within the current run.  */
      if (sz > 1)
        memmove (pfile->cur_token + 1, pfile->cur_token,
                 MIN (la, sz - 1) * sizeof (cpp_token));
    }

  /* No slot left in the current run at all (SZ is zero exactly when
     cur_token has reached the limit): continue in the next run.  */
  if (!sz && pfile->cur_token == pfile->cur_run->limit)
    {
      pfile->cur_run = next_tokenrun (pfile->cur_run);
      pfile->cur_token = pfile->cur_run->base;
    }

  result = pfile->cur_token++;
  result->src_loc = old->src_loc;
  return result;
}
3429
3430 /* We're at the beginning of a logical line (so not in
3431   directives-mode) and RESULT is a CPP_NAME with NODE_MODULE set.  See
3432   if we should enter deferred_pragma mode to tokenize the rest of the
3433   line as a module control-line.  */
3434
3435 static void
3436 cpp_maybe_module_directive (cpp_reader *pfile, cpp_token *result)
3437 {
3438   unsigned backup = 0; /* Tokens we peeked.  */
3439   cpp_hashnode *node = result->val.node.node;
3440   cpp_token *peek = result;
3441   cpp_token *keyword = peek;
3442   cpp_hashnode *(&n_modules)[spec_nodes::M_HWM][2] = pfile->spec_nodes.n_modules;
3443   int header_count = 0;
3444
3445   /* Make sure the incoming state is as we expect it.  This way we
3446      can restore it using constants.  */
3447   gcc_checking_assert (!pfile->state.in_deferred_pragma
3448                        && !pfile->state.skipping
3449                        && !pfile->state.parsing_args
3450                        && !pfile->state.angled_headers
3451                        && (pfile->state.save_comments
3452                            == !CPP_OPTION (pfile, discard_comments)));
3453
3454   /* Enter directives mode sufficiently for peeking.  We don't have
3455      to actually set in_directive.  */
3456   pfile->state.in_deferred_pragma = true;
3457
3458   /* These two fields are needed to process tokenization in deferred
3459      pragma mode.  They are not used outside deferred pragma mode or
3460      directives mode.  */
3461   pfile->state.pragma_allow_expansion = true;
3462   pfile->directive_line = result->src_loc;
3463
3464   /* Saving comments is incompatible with directives mode.   */
3465   pfile->state.save_comments = 0;
3466
3467   if (node == n_modules[spec_nodes::M_EXPORT][0])
3468     {
3469       peek = _cpp_lex_direct (pfile);
3470       keyword = peek;
3471       backup++;
3472       if (keyword->type != CPP_NAME)
3473         goto not_module;
3474       node = keyword->val.node.node;
3475       if (!(node->flags & NODE_MODULE))
3476         goto not_module;
3477     }
3478
3479   if (node == n_modules[spec_nodes::M__IMPORT][0])
3480     /* __import  */
3481     header_count = backup + 2 + 16;
3482   else if (node == n_modules[spec_nodes::M_IMPORT][0])
3483     /* import  */
3484     header_count = backup + 2 + (CPP_OPTION (pfile, preprocessed) ? 16 : 0);
3485   else if (node == n_modules[spec_nodes::M_MODULE][0])
3486     ; /* module  */
3487   else
3488     goto not_module;
3489
3490   /* We've seen [export] {module|import|__import}.  Check the next token.  */
3491   if (header_count)
3492     /* After '{,__}import' a header name may appear.  */
3493     pfile->state.angled_headers = true;
3494   peek = _cpp_lex_direct (pfile);
3495   backup++;
3496
3497   /* ... import followed by identifier, ':', '<' or
3498      header-name preprocessing tokens, or module
3499      followed by cpp-identifier, ':' or ';' preprocessing
3500      tokens.  C++ keywords are not yet relevant.  */
3501   if (peek->type == CPP_NAME
3502       || peek->type == CPP_COLON
3503       ||  (header_count
3504            ? (peek->type == CPP_LESS
3505               || (peek->type == CPP_STRING && peek->val.str.text[0] != 'R')
3506               || peek->type == CPP_HEADER_NAME)
3507            : peek->type == CPP_SEMICOLON))
3508     {
3509       pfile->state.pragma_allow_expansion = !CPP_OPTION (pfile, preprocessed);
3510       if (!pfile->state.pragma_allow_expansion)
3511         pfile->state.prevent_expansion++;
3512
3513       if (!header_count && linemap_included_from
3514           (LINEMAPS_LAST_ORDINARY_MAP (pfile->line_table)))
3515         cpp_error_with_line (pfile, CPP_DL_ERROR, keyword->src_loc, 0,
3516                              "module control-line cannot be in included file");
3517
3518       /* The first one or two tokens cannot be macro names.  */
3519       for (int ix = backup; ix--;)
3520         {
3521           cpp_token *tok = ix ? keyword : result;
3522           cpp_hashnode *node = tok->val.node.node;
3523
3524           /* Don't attempt to expand the token.  */
3525           tok->flags |= NO_EXPAND;
3526           if (_cpp_defined_macro_p (node)
3527               && _cpp_maybe_notify_macro_use (pfile, node, tok->src_loc)
3528               && !cpp_fun_like_macro_p (node))
3529             cpp_error_with_line (pfile, CPP_DL_ERROR, tok->src_loc, 0,
3530                                  "module control-line \"%s\" cannot be"
3531                                  " an object-like macro",
3532                                  NODE_NAME (node));
3533         }
3534
3535       /* Map to underbar variants.  */
3536       keyword->val.node.node = n_modules[header_count
3537                                          ? spec_nodes::M_IMPORT
3538                                          : spec_nodes::M_MODULE][1];
3539       if (backup != 1)
3540         result->val.node.node = n_modules[spec_nodes::M_EXPORT][1];
3541
3542       /* Maybe tell the tokenizer we expect a header-name down the
3543          road.  */
3544       pfile->state.directive_file_token = header_count;
3545     }
3546   else
3547     {
3548     not_module:
3549       /* Drop out of directive mode.  */
3550       /* We asserted save_comments had this value upon entry.  */
3551       pfile->state.save_comments
3552         = !CPP_OPTION (pfile, discard_comments);
3553       pfile->state.in_deferred_pragma = false;
3554       /* Do not let this remain on.  */
3555       pfile->state.angled_headers = false;
3556     }
3557
3558   /* In either case we want to backup the peeked tokens.  */
3559   if (backup)
3560     {
3561       /* If we saw EOL, we should drop it, because this isn't a module
3562          control-line after all.  */
3563       bool eol = peek->type == CPP_PRAGMA_EOL;
3564       if (!eol || backup > 1)
3565         {
3566           /* Put the peeked tokens back.  */
3567           _cpp_backup_tokens_direct (pfile, backup);
3568           /* But if the last one was an EOL, forget it.  */
3569           if (eol)
3570             pfile->lookaheads--;
3571         }
3572     }
3573 }
3574
3575 /* Lex a token into RESULT (external interface).  Takes care of issues
3576    like directive handling, token lookahead, multiple include
3577    optimization and skipping.

        The function loops until it has a token that should be handed to
        the caller.  A token is discarded and another one lexed when a
        directive produced only CPP_PADDING, or when pfile->state.skipping
        is set and the token is neither CPP_EOF nor part of a directive or
        deferred pragma.

        Returns the token.  This may be &pfile->directive_result rather
        than a slot in the current token run.  */
3578 const cpp_token *
3579 _cpp_lex_token (cpp_reader *pfile)
3580 {
3581   cpp_token *result;
3582
3583   for (;;)
3584     {
            /* The current token run is exhausted: continue in the next
               one.  */
3585       if (pfile->cur_token == pfile->cur_run->limit)
3586         {
3587           pfile->cur_run = next_tokenrun (pfile->cur_run);
3588           pfile->cur_token = pfile->cur_run->base;
3589         }
3590       /* We assume that the current token is somewhere in the current
3591          run.  */
3592       if (pfile->cur_token < pfile->cur_run->base
3593           || pfile->cur_token >= pfile->cur_run->limit)
3594         abort ();
3595
            /* Tokens counted in pfile->lookaheads are already stored in the
               run at cur_token; hand the next one out again instead of
               lexing afresh.  */
3596       if (pfile->lookaheads)
3597         {
3598           pfile->lookaheads--;
3599           result = pfile->cur_token++;
3600         }
3601       else
3602         result = _cpp_lex_direct (pfile);
3603
            /* Tokens flagged BOL are the only ones that can start a
               directive or a module control-line.  */
3604       if (result->flags & BOL)
3605         {
3606           /* Is this a directive.  If _cpp_handle_directive returns
3607              false, it is an assembler #.  */
3608           if (result->type == CPP_HASH
3609               /* 6.10.3 p 11: Directives in a list of macro arguments
3610                  gives undefined behavior.  This implementation
3611                  handles the directive as normal.  */
3612               && pfile->state.parsing_args != 1)
3613             {
3614               if (_cpp_handle_directive (pfile, result->flags & PREV_WHITE))
3615                 {
                        /* A CPP_PADDING result means the directive left
                           nothing to return: go round the loop for the next
                           token.  Otherwise return the directive's result
                           token in place of the '#'.  */
3616                   if (pfile->directive_result.type == CPP_PADDING)
3617                     continue;
3618                   result = &pfile->directive_result;
3619                 }
3620             }
3621           else if (pfile->state.in_deferred_pragma)
3622             result = &pfile->directive_result;
3623           else if (result->type == CPP_NAME
3624                    && (result->val.node.node->flags & NODE_MODULE)
3625                    && !pfile->state.skipping
3626                    /* Unlike regular directives, we do not deal with
3627                       tokenizing module directives as macro arguments.
3628                       That's not permitted.  */
3629                    && !pfile->state.parsing_args)
3630             {
3631               /* P1857.  Before macro expansion, At start of logical
3632                  line ... */
3633               /* We don't have to consider lookaheads at this point.  */
3634               gcc_checking_assert (!pfile->lookaheads);
3635
3636               cpp_maybe_module_directive (pfile, result);
3637             }
3638
                /* Tell the client a new line has started, unless the line is
                   being skipped.  */
3639           if (pfile->cb.line_change && !pfile->state.skipping)
3640             pfile->cb.line_change (pfile, result, pfile->state.parsing_args);
3641         }
3642
3643       /* We don't skip tokens in directives.  */
3644       if (pfile->state.in_directive || pfile->state.in_deferred_pragma)
3645         break;
3646
3647       /* Outside a directive, invalidate controlling macros.  At file
3648          EOF, _cpp_lex_direct takes care of popping the buffer, so we never
3649          get here and MI optimization works.  */
3650       pfile->mi_valid = false;
3651
            /* While skipping, only CPP_EOF is passed on to the caller; any
               other token is dropped and the loop lexes the next one.  */
3652       if (!pfile->state.skipping || result->type == CPP_EOF)
3653         break;
3654     }
3655
3656   return result;
3657 }
3658
3659 /* Returns true if a fresh line has been loaded.  */
3660 bool
3661 _cpp_get_fresh_line (cpp_reader *pfile)
3662 {
3663   /* We can't get a new line until we leave the current directive.  */
3664   if (pfile->state.in_directive)
3665     return false;
3666
3667   for (;;)
3668     {
3669       cpp_buffer *buffer = pfile->buffer;
3670
3671       if (!buffer->need_line)
3672         return true;
3673
3674       if (buffer->next_line < buffer->rlimit)
3675         {
3676           _cpp_clean_line (pfile);
3677           return true;
3678         }
3679
3680       /* First, get out of parsing arguments state.  */
3681       if (pfile->state.parsing_args)
3682         return false;
3683
3684       /* End of buffer.  Non-empty files should end in a newline.  */
3685       if (buffer->buf != buffer->rlimit
3686           && buffer->next_line > buffer->rlimit
3687           && !buffer->from_stage3)
3688         {
3689           /* Clip to buffer size.  */
3690           buffer->next_line = buffer->rlimit;
3691         }
3692
3693       if (buffer->prev && !buffer->return_at_eof)
3694         _cpp_pop_buffer (pfile);
3695       else
3696         {
3697           /* End of translation.  Do not pop the buffer yet. Increment
3698              line number so that the EOF token is on a line of its own
3699              (_cpp_lex_direct doesn't increment in that case, because
3700              it's hard for it to distinguish this special case). */
3701           CPP_INCREMENT_LINE (pfile, 0);
3702           return false;
3703         }
3704     }
3705 }
3706
/* Choose between a one- and a two-character token.  If the next unread
   character is CHAR, consume it and set the token type to THEN_TYPE;
   otherwise leave it unread and set the type to ELSE_TYPE.  Expects
   pointers named `buffer' and `result' to be in scope where the macro
   is used.  */
#define IF_NEXT_IS(CHAR, THEN_TYPE, ELSE_TYPE)          \
  do                                                    \
    {                                                   \
      if (*buffer->cur == CHAR)                         \
        {                                               \
          buffer->cur++;                                \
          result->type = THEN_TYPE;                     \
        }                                               \
      else                                              \
        result->type = ELSE_TYPE;                       \
    }                                                   \
  while (0)
3715
3716 /* Lex a token into pfile->cur_token, which is also incremented, to
3717    get diagnostics pointing to the correct location.
3718
3719    Does not handle issues such as token lookahead, multiple-include
3720    optimization, directives, skipping etc.  This function is only
3721    suitable for use by _cpp_lex_token, and in special cases like
3722    lex_expansion_token which doesn't care for any of these issues.
3723
3724    When meeting a newline, returns CPP_EOF if parsing a directive,
3725    otherwise returns to the start of the token buffer if permissible.
3726    Returns a pointer to the lexed token; its src_loc field holds the
        token's location.

        In deferred pragma mode (pfile->state.in_deferred_pragma) the end
        of the line is reported as a CPP_PRAGMA_EOL token and the mode is
        switched off again.

        Internal labels: `fresh_line' restarts lexing when a new logical
        line is needed; `update_tokens_line' and `skipped_white' resume
        scanning on the current line after a discarded comment and after
        whitespace respectively.  */
3727 cpp_token *
3728 _cpp_lex_direct (cpp_reader *pfile)
3729 {
3730   cppchar_t c;
3731   cpp_buffer *buffer;
3732   const unsigned char *comment_start;
3733   bool fallthrough_comment = false;
        /* Claim the next slot of the token run; cur_token now points past
           it.  */
3734   cpp_token *result = pfile->cur_token++;
3735
3736  fresh_line:
3737   result->flags = 0;
3738   buffer = pfile->buffer;
3739   if (buffer->need_line)
3740     {
3741       if (pfile->state.in_deferred_pragma)
3742         {
3743           /* This can happen in cases like:
3744              #define loop(x) whatever
3745              #pragma omp loop
3746              where when trying to expand loop we need to peek
3747              next token after loop, but aren't still in_deferred_pragma
3748              mode but are in in_directive mode, so buffer->need_line
3749              is set, a CPP_EOF is peeked.  */
3750           result->type = CPP_PRAGMA_EOL;
3751           pfile->state.in_deferred_pragma = false;
3752           if (!pfile->state.pragma_allow_expansion)
3753             pfile->state.prevent_expansion--;
3754           return result;
3755         }
3756       if (!_cpp_get_fresh_line (pfile))
3757         {
3758           result->type = CPP_EOF;
3759           /* Not a real EOF in a directive or arg parsing -- we refuse
3760              to advance to the next file now, and will once we're out
3761              of those modes.  */
3762           if (!pfile->state.in_directive && !pfile->state.parsing_args)
3763             {
3764               /* Tell the compiler the line number of the EOF token.  */
3765               result->src_loc = pfile->line_table->highest_line;
3766               result->flags = BOL;
3767               /* Now pop the buffer that _cpp_get_fresh_line did not.  */
3768               _cpp_pop_buffer (pfile);
3769             }
3770           return result;
3771         }
            /* _cpp_get_fresh_line may have popped to another buffer; a
               FALLTHROUGH comment seen in the old one no longer applies.  */
3772       if (buffer != pfile->buffer)
3773         fallthrough_comment = false;
            /* Unless tokens must be kept, start over at the base of the
               first token run so that its slots are reused.  */
3774       if (!pfile->keep_tokens)
3775         {
3776           pfile->cur_run = &pfile->base_run;
3777           result = pfile->base_run.base;
3778           pfile->cur_token = result + 1;
3779         }
3780       result->flags = BOL;
3781       if (pfile->state.parsing_args == 2)
3782         result->flags |= PREV_WHITE;
3783     }
3784   buffer = pfile->buffer;
3785  update_tokens_line:
3786   result->src_loc = pfile->line_table->highest_line;
3787
3788  skipped_white:
3789   if (buffer->cur >= buffer->notes[buffer->cur_note].pos
3790       && !pfile->overlaid_buffer)
3791     {
3792       _cpp_process_line_notes (pfile, false);
3793       result->src_loc = pfile->line_table->highest_line;
3794     }
        /* Fetch the character that decides the token kind.  From here on
           buffer->cur points just past it.  */
3795   c = *buffer->cur++;
3796
3797   if (pfile->forced_token_location)
3798     result->src_loc = pfile->forced_token_location;
3799   else
3800     result->src_loc = linemap_position_for_column (pfile->line_table,
3801                                           CPP_BUF_COLUMN (buffer, buffer->cur));
3802
3803   switch (c)
3804     {
3805     case ' ': case '\t': case '\f': case '\v': case '\0':
3806       result->flags |= PREV_WHITE;
3807       skip_whitespace (pfile, c);
3808       goto skipped_white;
3809
3810     case '\n':
3811       /* Increment the line, unless this is the last line ...  */
3812       if (buffer->cur < buffer->rlimit
3813           /* ... or this is a #include, (where _cpp_stack_file needs to
3814              unwind by one line) ...  */
3815           || (pfile->state.in_directive > 1
3816               /* ... except traditional-cpp increments this elsewhere.  */
3817               && !CPP_OPTION (pfile, traditional)))
3818         CPP_INCREMENT_LINE (pfile, 0);
3819       buffer->need_line = true;
3820       if (pfile->state.in_deferred_pragma)
3821         {
3822           /* Produce the PRAGMA_EOL on this line.  File reading
3823              ensures there is always a \n at end of the buffer, thus
3824              in a deferred pragma we always see CPP_PRAGMA_EOL before
3825              any CPP_EOF.  */
3826           result->type = CPP_PRAGMA_EOL;
3827           result->flags &= ~PREV_WHITE;
3828           pfile->state.in_deferred_pragma = false;
3829           if (!pfile->state.pragma_allow_expansion)
3830             pfile->state.prevent_expansion--;
3831           return result;
3832         }
3833       goto fresh_line;
3834
3835     case '0': case '1': case '2': case '3': case '4':
3836     case '5': case '6': case '7': case '8': case '9':
3837       {
3838         struct normalize_state nst = INITIAL_NORMALIZE_STATE;
3839         result->type = CPP_NUMBER;
3840         lex_number (pfile, &result->val.str, &nst);
3841         warn_about_normalization (pfile, result, &nst);
3842         break;
3843       }
3844
3845     case 'L':
3846     case 'u':
3847     case 'U':
3848     case 'R':
3849       /* 'L', 'u', 'U', 'u8' or 'R' may introduce wide characters,
3850          wide strings or raw strings.  */
3851       if (c == 'L' || CPP_OPTION (pfile, rliterals)
3852           || (c != 'R' && CPP_OPTION (pfile, uliterals)))
3853         {
                /* Accept, in order: prefix followed by ' (not for R),
                   prefix followed by ", prefix followed by R" (raw
                   literals enabled), and the u8 forms u8", u8' (when UTF-8
                   character literals are enabled) and u8R".  */
3854           if ((*buffer->cur == '\'' && c != 'R')
3855               || *buffer->cur == '"'
3856               || (*buffer->cur == 'R'
3857                   && c != 'R'
3858                   && buffer->cur[1] == '"'
3859                   && CPP_OPTION (pfile, rliterals))
3860               || (*buffer->cur == '8'
3861                   && c == 'u'
3862                   && ((buffer->cur[1] == '"' || (buffer->cur[1] == '\''
3863                                 && CPP_OPTION (pfile, utf8_char_literals)))
3864                       || (buffer->cur[1] == 'R' && buffer->cur[2] == '"'
3865                           && CPP_OPTION (pfile, rliterals)))))
3866             {
3867               lex_string (pfile, result, buffer->cur - 1);
3868               break;
3869             }
3870         }
            /* Not a literal prefix after all: lex it as an identifier.  This
               is why 'u', 'L', 'R' and 'U' are missing from the case labels
               below.  */
3871       /* Fall through.  */
3872
3873     case '_':
3874     case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
3875     case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
3876     case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
3877     case 's': case 't':           case 'v': case 'w': case 'x':
3878     case 'y': case 'z':
3879     case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
3880     case 'G': case 'H': case 'I': case 'J': case 'K':
3881     case 'M': case 'N': case 'O': case 'P': case 'Q':
3882     case 'S': case 'T':           case 'V': case 'W': case 'X':
3883     case 'Y': case 'Z':
3884       result->type = CPP_NAME;
3885       {
3886         struct normalize_state nst = INITIAL_NORMALIZE_STATE;
3887         result->val.node.node = lex_identifier (pfile, buffer->cur - 1, false,
3888                                                 &nst,
3889                                                 &result->val.node.spelling);
3890         warn_about_normalization (pfile, result, &nst);
3891       }
3892
3893       /* Convert named operators to their proper types.  */
3894       if (result->val.node.node->flags & NODE_OPERATOR)
3895         {
3896           result->flags |= NAMED_OP;
3897           result->type = (enum cpp_ttype) result->val.node.node->directive_index;
3898         }
3899
3900       /* Signal FALLTHROUGH comment followed by another token.  */
3901       if (fallthrough_comment)
3902         result->flags |= PREV_FALLTHROUGH;
3903       break;
3904
3905     case '\'':
3906     case '"':
3907       lex_string (pfile, result, buffer->cur - 1);
3908       break;
3909
3910     case '/':
3911       /* A potential block or line comment.  */
            /* comment_start points just past the leading '/'; C becomes the
               character after it.  */
3912       comment_start = buffer->cur;
3913       c = *buffer->cur;
3914
3915       if (c == '*')
3916         {
3917           if (_cpp_skip_block_comment (pfile))
3918             cpp_error (pfile, CPP_DL_ERROR, "unterminated comment");
3919         }
3920       else if (c == '/' && ! CPP_OPTION (pfile, traditional))
3921         {
3922           /* Don't warn for system headers.  */
3923           if (_cpp_in_system_header (pfile))
3924             ;
3925           /* Warn about comments if pedantically GNUC89, and not
3926              in system headers.  */
3927           else if (CPP_OPTION (pfile, lang) == CLK_GNUC89
3928                    && CPP_PEDANTIC (pfile)
3929                    && ! buffer->warned_cplusplus_comments)
3930             {
3931               if (cpp_error (pfile, CPP_DL_PEDWARN,
3932                              "C++ style comments are not allowed in ISO C90"))
3933                 cpp_error (pfile, CPP_DL_NOTE,
3934                            "(this will be reported only once per input file)");
3935               buffer->warned_cplusplus_comments = 1;
3936             }
3937           /* Or if specifically desired via -Wc90-c99-compat.  */
3938           else if (CPP_OPTION (pfile, cpp_warn_c90_c99_compat) > 0
3939                    && ! CPP_OPTION (pfile, cplusplus)
3940                    && ! buffer->warned_cplusplus_comments)
3941             {
3942               if (cpp_error (pfile, CPP_DL_WARNING,
3943                              "C++ style comments are incompatible with C90"))
3944                 cpp_error (pfile, CPP_DL_NOTE,
3945                            "(this will be reported only once per input file)");
3946               buffer->warned_cplusplus_comments = 1;
3947             }
3948           /* In C89/C94, C++ style comments are forbidden.  */
3949           else if ((CPP_OPTION (pfile, lang) == CLK_STDC89
3950                     || CPP_OPTION (pfile, lang) == CLK_STDC94))
3951             {
3952               /* But don't be confused about valid code such as
3953                  - // immediately followed by *,
3954                  - // in a preprocessing directive,
3955                  - // in an #if 0 block.  */
3956               if (buffer->cur[1] == '*'
3957                   || pfile->state.in_directive
3958                   || pfile->state.skipping)
3959                 {
                        /* Treat the first '/' as a division operator; the
                           second one is lexed as the next token.  */
3960                   result->type = CPP_DIV;
3961                   break;
3962                 }
3963               else if (! buffer->warned_cplusplus_comments)
3964                 {
3965                   if (cpp_error (pfile, CPP_DL_ERROR,
3966                                  "C++ style comments are not allowed in "
3967                                  "ISO C90"))
3968                     cpp_error (pfile, CPP_DL_NOTE,
3969                                "(this will be reported only once per input "
3970                                "file)");
3971                   buffer->warned_cplusplus_comments = 1;
3972                 }
3973             }
3974           if (skip_line_comment (pfile) && CPP_OPTION (pfile, warn_comments))
3975             cpp_warning (pfile, CPP_W_COMMENTS, "multi-line comment");
3976         }
3977       else if (c == '=')
3978         {
3979           buffer->cur++;
3980           result->type = CPP_DIV_EQ;
3981           break;
3982         }
3983       else
3984         {
3985           result->type = CPP_DIV;
3986           break;
3987         }
3988
            /* Only the two comment branches reach this point; the operator
               branches above have already left the switch.  */
3989       if (fallthrough_comment_p (pfile, comment_start))
3990         fallthrough_comment = true;
3991
3992       if (pfile->cb.comment)
3993         {
                /* Report the whole comment, including the leading '/' that
                   precedes comment_start.  */
3994           size_t len = pfile->buffer->cur - comment_start;
3995           pfile->cb.comment (pfile, result->src_loc, comment_start - 1,
3996                              len + 1);
3997         }
3998
            /* A discarded comment counts as whitespace before the next
               token, which is lexed into the same slot.  */
3999       if (!pfile->state.save_comments)
4000         {
4001           result->flags |= PREV_WHITE;
4002           goto update_tokens_line;
4003         }
4004
4005       if (fallthrough_comment)
4006         result->flags |= PREV_FALLTHROUGH;
4007
4008       /* Save the comment as a token in its own right.  */
4009       save_comment (pfile, result, comment_start, c);
4010       break;
4011
4012     case '<':
            /* Where a header name may appear, let lex_string try to lex
               <...> first.  NOTE(review): lex_string apparently signals
               "not a header name" by leaving the type as CPP_LESS, in which
               case ordinary operator lexing follows -- confirm against
               lex_string.  */
4013       if (pfile->state.angled_headers)
4014         {
4015           lex_string (pfile, result, buffer->cur - 1);
4016           if (result->type != CPP_LESS)
4017             break;
4018         }
4019
4020       result->type = CPP_LESS;
4021       if (*buffer->cur == '=')
4022         {
4023           buffer->cur++, result->type = CPP_LESS_EQ;
4024           if (*buffer->cur == '>'
4025               && CPP_OPTION (pfile, cplusplus)
4026               && CPP_OPTION (pfile, lang) >= CLK_GNUCXX20)
4027             buffer->cur++, result->type = CPP_SPACESHIP;
4028         }
4029       else if (*buffer->cur == '<')
4030         {
4031           buffer->cur++;
4032           IF_NEXT_IS ('=', CPP_LSHIFT_EQ, CPP_LSHIFT);
4033         }
4034       else if (CPP_OPTION (pfile, digraphs))
4035         {
4036           if (*buffer->cur == ':')
4037             {
4038               /* C++11 [2.5/3 lex.pptoken], "Otherwise, if the next
4039                  three characters are <:: and the subsequent character
4040                  is neither : nor >, the < is treated as a preprocessor
4041                  token by itself".  */
4042               if (CPP_OPTION (pfile, cplusplus)
4043                   && CPP_OPTION (pfile, lang) != CLK_CXX98
4044                   && CPP_OPTION (pfile, lang) != CLK_GNUCXX
4045                   && buffer->cur[1] == ':'
4046                   && buffer->cur[2] != ':' && buffer->cur[2] != '>')
4047                 break;
4048
4049               buffer->cur++;
4050               result->flags |= DIGRAPH;
4051               result->type = CPP_OPEN_SQUARE;
4052             }
4053           else if (*buffer->cur == '%')
4054             {
4055               buffer->cur++;
4056               result->flags |= DIGRAPH;
4057               result->type = CPP_OPEN_BRACE;
4058             }
4059         }
4060       break;
4061
4062     case '>':
4063       result->type = CPP_GREATER;
4064       if (*buffer->cur == '=')
4065         buffer->cur++, result->type = CPP_GREATER_EQ;
4066       else if (*buffer->cur == '>')
4067         {
4068           buffer->cur++;
4069           IF_NEXT_IS ('=', CPP_RSHIFT_EQ, CPP_RSHIFT);
4070         }
4071       break;
4072
4073     case '%':
4074       result->type = CPP_MOD;
4075       if (*buffer->cur == '=')
4076         buffer->cur++, result->type = CPP_MOD_EQ;
4077       else if (CPP_OPTION (pfile, digraphs))
4078         {
                /* The digraph %: spells '#', and %:%: spells '##'.  */
4079           if (*buffer->cur == ':')
4080             {
4081               buffer->cur++;
4082               result->flags |= DIGRAPH;
4083               result->type = CPP_HASH;
4084               if (*buffer->cur == '%' && buffer->cur[1] == ':')
4085                 buffer->cur += 2, result->type = CPP_PASTE, result->val.token_no = 0;
4086             }
4087           else if (*buffer->cur == '>')
4088             {
4089               buffer->cur++;
4090               result->flags |= DIGRAPH;
4091               result->type = CPP_CLOSE_BRACE;
4092             }
4093         }
4094       break;
4095
4096     case '.':
4097       result->type = CPP_DOT;
            /* A '.' directly followed by a digit starts a number.  */
4098       if (ISDIGIT (*buffer->cur))
4099         {
4100           struct normalize_state nst = INITIAL_NORMALIZE_STATE;
4101           result->type = CPP_NUMBER;
4102           lex_number (pfile, &result->val.str, &nst);
4103           warn_about_normalization (pfile, result, &nst);
4104         }
4105       else if (*buffer->cur == '.' && buffer->cur[1] == '.')
4106         buffer->cur += 2, result->type = CPP_ELLIPSIS;
4107       else if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
4108         buffer->cur++, result->type = CPP_DOT_STAR;
4109       break;
4110
4111     case '+':
4112       result->type = CPP_PLUS;
4113       if (*buffer->cur == '+')
4114         buffer->cur++, result->type = CPP_PLUS_PLUS;
4115       else if (*buffer->cur == '=')
4116         buffer->cur++, result->type = CPP_PLUS_EQ;
4117       break;
4118
4119     case '-':
4120       result->type = CPP_MINUS;
4121       if (*buffer->cur == '>')
4122         {
4123           buffer->cur++;
4124           result->type = CPP_DEREF;
4125           if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
4126             buffer->cur++, result->type = CPP_DEREF_STAR;
4127         }
4128       else if (*buffer->cur == '-')
4129         buffer->cur++, result->type = CPP_MINUS_MINUS;
4130       else if (*buffer->cur == '=')
4131         buffer->cur++, result->type = CPP_MINUS_EQ;
4132       break;
4133
4134     case '&':
4135       result->type = CPP_AND;
4136       if (*buffer->cur == '&')
4137         buffer->cur++, result->type = CPP_AND_AND;
4138       else if (*buffer->cur == '=')
4139         buffer->cur++, result->type = CPP_AND_EQ;
4140       break;
4141
4142     case '|':
4143       result->type = CPP_OR;
4144       if (*buffer->cur == '|')
4145         buffer->cur++, result->type = CPP_OR_OR;
4146       else if (*buffer->cur == '=')
4147         buffer->cur++, result->type = CPP_OR_EQ;
4148       break;
4149
4150     case ':':
4151       result->type = CPP_COLON;
4152       if (*buffer->cur == ':' && CPP_OPTION (pfile, scope))
4153         buffer->cur++, result->type = CPP_SCOPE;
4154       else if (*buffer->cur == '>' && CPP_OPTION (pfile, digraphs))
4155         {
4156           buffer->cur++;
4157           result->flags |= DIGRAPH;
4158           result->type = CPP_CLOSE_SQUARE;
4159         }
4160       break;
4161
4162     case '*': IF_NEXT_IS ('=', CPP_MULT_EQ, CPP_MULT); break;
4163     case '=': IF_NEXT_IS ('=', CPP_EQ_EQ, CPP_EQ); break;
4164     case '!': IF_NEXT_IS ('=', CPP_NOT_EQ, CPP_NOT); break;
4165     case '^': IF_NEXT_IS ('=', CPP_XOR_EQ, CPP_XOR); break;
4166     case '#': IF_NEXT_IS ('#', CPP_PASTE, CPP_HASH); result->val.token_no = 0; break;
4167
4168     case '?': result->type = CPP_QUERY; break;
4169     case '~': result->type = CPP_COMPL; break;
4170     case ',': result->type = CPP_COMMA; break;
4171     case '(': result->type = CPP_OPEN_PAREN; break;
4172     case ')': result->type = CPP_CLOSE_PAREN; break;
4173     case '[': result->type = CPP_OPEN_SQUARE; break;
4174     case ']': result->type = CPP_CLOSE_SQUARE; break;
4175     case '{': result->type = CPP_OPEN_BRACE; break;
4176     case '}': result->type = CPP_CLOSE_BRACE; break;
4177     case ';': result->type = CPP_SEMICOLON; break;
4178
4179       /* @ is a punctuator in Objective-C.  */
4180     case '@': result->type = CPP_ATSIGN; break;
4181
4182     default:
4183       {
              /* Step back so that BASE and buffer->cur both point at C.  */
4184         const uchar *base = --buffer->cur;
              /* Static, so it persists across calls.  It holds the number of
                 following bytes that the last invalid-UTF-8 warning already
                 covered, so that the continuation-byte branch below does not
                 warn about them a second time.  */
4185         static int no_warn_cnt;
4186
4187         /* Check for an extended identifier ($ or UCN or UTF-8).  */
4188         struct normalize_state nst = INITIAL_NORMALIZE_STATE;
4189         if (forms_identifier_p (pfile, true, &nst))
4190           {
4191             result->type = CPP_NAME;
4192             result->val.node.node = lex_identifier (pfile, base, true, &nst,
4193                                                     &result->val.node.spelling);
4194             warn_about_normalization (pfile, result, &nst);
4195             break;
4196           }
4197
4198         /* Otherwise this will form a CPP_OTHER token.  Parse valid UTF-8 as a
4199            single token.  */
4200         buffer->cur++;
4201         if (c >= utf8_signifier)
4202           {
4203             const uchar *pstr = base;
4204             cppchar_t s;
4205             if (_cpp_valid_utf8 (pfile, &pstr, buffer->rlimit, 0, NULL, &s))
4206               {
                      /* The sequence decoded, but a value above UCS_LIMIT is
                         still diagnosed when the warning is enabled.  Either
                         way the token covers the whole sequence.  */
4207                 if (s > UCS_LIMIT && CPP_OPTION (pfile, cpp_warn_invalid_utf8))
4208                   {
4209                     buffer->cur = base;
4210                     _cpp_warn_invalid_utf8 (pfile);
4211                   }
4212                 buffer->cur = pstr;
4213               }
4214             else if (CPP_OPTION (pfile, cpp_warn_invalid_utf8))
4215               {
                      /* Warn with buffer->cur at the start of the sequence,
                         then make the token just the first byte and remember
                         how many further bytes the warning covered.  */
4216                 buffer->cur = base;
4217                 const uchar *end = _cpp_warn_invalid_utf8 (pfile);
4218                 buffer->cur = base + 1;
4219                 no_warn_cnt = end - buffer->cur;
4220               }
4221           }
4222         else if (c >= utf8_continuation
4223                  && CPP_OPTION (pfile, cpp_warn_invalid_utf8))
4224           {
                  /* A continuation byte at the start of a token.  */
4225             if (no_warn_cnt)
4226               --no_warn_cnt;
4227             else
4228               {
4229                 buffer->cur = base;
4230                 _cpp_warn_invalid_utf8 (pfile);
4231                 buffer->cur = base + 1;
4232               }
4233           }
4234         create_literal (pfile, result, base, buffer->cur - base, CPP_OTHER);
4235         break;
4236       }
4237
4238     }
4239
4240   /* Potentially convert the location of the token to a range.  */
4241   if (result->src_loc >= RESERVED_LOCATION_COUNT
4242       && result->type != CPP_EOF)
4243     {
4244       /* Ensure that any line notes are processed, so that we have the
4245          correct physical line/column for the end-point of the token even
4246          when a logical line is split via one or more backslashes.  */
4247       if (buffer->cur >= buffer->notes[buffer->cur_note].pos
4248           && !pfile->overlaid_buffer)
4249         _cpp_process_line_notes (pfile, false);
4250
            /* The range starts at the location recorded when the token
               began and finishes at the current buffer position.  */
4251       source_range tok_range;
4252       tok_range.m_start = result->src_loc;
4253       tok_range.m_finish
4254         = linemap_position_for_column (pfile->line_table,
4255                                        CPP_BUF_COLUMN (buffer, buffer->cur));
4256
4257       result->src_loc = COMBINE_LOCATION_DATA (pfile->line_table,
4258                                                result->src_loc,
4259                                                tok_range, NULL);
4260     }
4261
4262   return result;
4263 }
4264
4265 /* An upper bound on the number of bytes needed to spell TOKEN.
4266    Does not include preceding whitespace.  */
4267 unsigned int
4268 cpp_token_len (const cpp_token *token)
4269 {
4270   unsigned int len;
4271
4272   switch (TOKEN_SPELL (token))
4273     {
4274     default:            len = 6;                                break;
4275     case SPELL_LITERAL: len = token->val.str.len;               break;
4276     case SPELL_IDENT:   len = NODE_LEN (token->val.node.node) * 10;     break;
4277     }
4278
4279   return len;
4280 }
4281
/* Decode the UTF-8 sequence starting at NAME and store the equivalent
   \U escape in BUFFER: a backslash, 'U' and eight lower-case hex
   digits, ten bytes in all and without a terminating NUL.  Returns the
   number of bytes of NAME that made up the sequence.  Aborts if a
   continuation byte is ill-formed.  */

static size_t
utf8_to_ucn (unsigned char *buffer, const unsigned char *name)
{
  static const char hex_digits[] = "0123456789abcdef";
  int seq_len = 0;
  unsigned lead = *name;
  unsigned long code_point;

  /* The number of leading one bits in the first byte is the length of
     the sequence.  */
  while (lead & 0x80)
    {
      seq_len++;
      lead <<= 1;
    }

  /* The remaining low bits of the first byte are the top bits of the
     code point.  */
  code_point = *name & (0x7F >> seq_len);

  /* Each continuation byte has the form 10xxxxxx and contributes six
     bits.  */
  for (int ix = 1; ix < seq_len; ix++)
    {
      const unsigned char cont = *++name;

      /* Ill-formed UTF-8.  */
      if ((cont & ~0x3F) != 0x80)
        abort ();

      code_point = (code_point << 6) | (cont & 0x3F);
    }

  buffer[0] = '\\';
  buffer[1] = 'U';
  for (int digit = 0; digit < 8; digit++)
    buffer[2 + digit] = hex_digits[(code_point >> (4 * (7 - digit))) & 0xF];

  return seq_len;
}
4315
4316 /* Given a token TYPE corresponding to a digraph, return a pointer to
4317    the spelling of the digraph.  */
4318 static const unsigned char *
4319 cpp_digraph2name (enum cpp_ttype type)
4320 {
4321   return digraph_spellings[(int) type - (int) CPP_FIRST_DIGRAPH];
4322 }
4323
4324 /* Write the spelling of an identifier IDENT, using UCNs, to BUFFER.
4325    The buffer must already contain the enough space to hold the
4326    token's spelling.  Returns a pointer to the character after the
4327    last character written.  */
4328 unsigned char *
4329 _cpp_spell_ident_ucns (unsigned char *buffer, cpp_hashnode *ident)
4330 {
4331   size_t i;
4332   const unsigned char *name = NODE_NAME (ident);
4333
4334   for (i = 0; i < NODE_LEN (ident); i++)
4335     if (name[i] & ~0x7F)
4336       {
4337         i += utf8_to_ucn (buffer, name + i) - 1;
4338         buffer += 10;
4339       }
4340     else
4341       *buffer++ = name[i];
4342
4343   return buffer;
4344 }
4345
4346 /* Write the spelling of a token TOKEN to BUFFER.  The buffer must
4347    already contain the enough space to hold the token's spelling.
4348    Returns a pointer to the character after the last character written.
4349    FORSTRING is true if this is to be the spelling after translation
4350    phase 1 (with the original spelling of extended identifiers), false
4351    if extended identifiers should always be written using UCNs (there is
4352    no option for always writing them in the internal UTF-8 form).
4353    FIXME: Would be nice if we didn't need the PFILE argument.  */
4354 unsigned char *
4355 cpp_spell_token (cpp_reader *pfile, const cpp_token *token,
4356                  unsigned char *buffer, bool forstring)
4357 {
4358   switch (TOKEN_SPELL (token))
4359     {
4360     case SPELL_OPERATOR:
4361       {
4362         const unsigned char *spelling;
4363         unsigned char c;
4364
4365         if (token->flags & DIGRAPH)
4366           spelling = cpp_digraph2name (token->type);
4367         else if (token->flags & NAMED_OP)
4368           goto spell_ident;
4369         else
4370           spelling = TOKEN_NAME (token);
4371
4372         while ((c = *spelling++) != '\0')
4373           *buffer++ = c;
4374       }
4375       break;
4376
4377     spell_ident:
4378     case SPELL_IDENT:
4379       if (forstring)
4380         {
4381           memcpy (buffer, NODE_NAME (token->val.node.spelling),
4382                   NODE_LEN (token->val.node.spelling));
4383           buffer += NODE_LEN (token->val.node.spelling);
4384         }
4385       else
4386         buffer = _cpp_spell_ident_ucns (buffer, token->val.node.node);
4387       break;
4388
4389     case SPELL_LITERAL:
4390       memcpy (buffer, token->val.str.text, token->val.str.len);
4391       buffer += token->val.str.len;
4392       break;
4393
4394     case SPELL_NONE:
4395       cpp_error (pfile, CPP_DL_ICE,
4396                  "unspellable token %s", TOKEN_NAME (token));
4397       break;
4398     }
4399
4400   return buffer;
4401 }
4402
4403 /* Returns TOKEN spelt as a null-terminated string.  The string is
4404    freed when the reader is destroyed.  Useful for diagnostics.  */
4405 unsigned char *
4406 cpp_token_as_text (cpp_reader *pfile, const cpp_token *token)
4407 {
4408   unsigned int len = cpp_token_len (token) + 1;
4409   unsigned char *start = _cpp_unaligned_alloc (pfile, len), *end;
4410
4411   end = cpp_spell_token (pfile, token, start, false);
4412   end[0] = '\0';
4413
4414   return start;
4415 }
4416
4417 /* Returns a pointer to a string which spells the token defined by
4418    TYPE and FLAGS.  Used by C front ends, which really should move to
4419    using cpp_token_as_text.  */
4420 const char *
4421 cpp_type2name (enum cpp_ttype type, unsigned char flags)
4422 {
4423   if (flags & DIGRAPH)
4424     return (const char *) cpp_digraph2name (type);
4425   else if (flags & NAMED_OP)
4426     return cpp_named_operator2name (type);
4427
4428   return (const char *) token_spellings[type].name;
4429 }
4430
4431 /* Writes the spelling of token to FP, without any preceding space.
4432    Separated from cpp_spell_token for efficiency - to avoid stdio
4433    double-buffering.  */
4434 void
4435 cpp_output_token (const cpp_token *token, FILE *fp)
4436 {
4437   switch (TOKEN_SPELL (token))
4438     {
4439     case SPELL_OPERATOR:
4440       {
4441         const unsigned char *spelling;
4442         int c;
4443
4444         if (token->flags & DIGRAPH)
4445           spelling = cpp_digraph2name (token->type);
4446         else if (token->flags & NAMED_OP)
4447           goto spell_ident;
4448         else
4449           spelling = TOKEN_NAME (token);
4450
4451         c = *spelling;
4452         do
4453           putc (c, fp);
4454         while ((c = *++spelling) != '\0');
4455       }
4456       break;
4457
4458     spell_ident:
4459     case SPELL_IDENT:
4460       {
4461         size_t i;
4462         const unsigned char * name = NODE_NAME (token->val.node.node);
4463
4464         for (i = 0; i < NODE_LEN (token->val.node.node); i++)
4465           if (name[i] & ~0x7F)
4466             {
4467               unsigned char buffer[10];
4468               i += utf8_to_ucn (buffer, name + i) - 1;
4469               fwrite (buffer, 1, 10, fp);
4470             }
4471           else
4472             fputc (NODE_NAME (token->val.node.node)[i], fp);
4473       }
4474       break;
4475
4476     case SPELL_LITERAL:
4477       if (token->type == CPP_HEADER_NAME)
4478         fputc ('"', fp);
4479       fwrite (token->val.str.text, 1, token->val.str.len, fp);
4480       if (token->type == CPP_HEADER_NAME)
4481         fputc ('"', fp);
4482       break;
4483
4484     case SPELL_NONE:
4485       /* An error, most probably.  */
4486       break;
4487     }
4488 }
4489
4490 /* Compare two tokens.  */
4491 int
4492 _cpp_equiv_tokens (const cpp_token *a, const cpp_token *b)
4493 {
4494   if (a->type == b->type && a->flags == b->flags)
4495     switch (TOKEN_SPELL (a))
4496       {
4497       default:                  /* Keep compiler happy.  */
4498       case SPELL_OPERATOR:
4499         /* token_no is used to track where multiple consecutive ##
4500            tokens were originally located.  */
4501         return (a->type != CPP_PASTE || a->val.token_no == b->val.token_no);
4502       case SPELL_NONE:
4503         return (a->type != CPP_MACRO_ARG
4504                 || (a->val.macro_arg.arg_no == b->val.macro_arg.arg_no
4505                     && a->val.macro_arg.spelling == b->val.macro_arg.spelling));
4506       case SPELL_IDENT:
4507         return (a->val.node.node == b->val.node.node
4508                 && a->val.node.spelling == b->val.node.spelling);
4509       case SPELL_LITERAL:
4510         return (a->val.str.len == b->val.str.len
4511                 && !memcmp (a->val.str.text, b->val.str.text,
4512                             a->val.str.len));
4513       }
4514
4515   return 0;
4516 }
4517
4518 /* Returns nonzero if a space should be inserted to avoid an
4519    accidental token paste for output.  For simplicity, it is
4520    conservative, and occasionally advises a space where one is not
4521    needed, e.g. "." and ".2".  */
4522 int
4523 cpp_avoid_paste (cpp_reader *pfile, const cpp_token *token1,
4524                  const cpp_token *token2)
4525 {
4526   enum cpp_ttype a = token1->type, b = token2->type;
4527   cppchar_t c;
4528
4529   if (token1->flags & NAMED_OP)
4530     a = CPP_NAME;
4531   if (token2->flags & NAMED_OP)
4532     b = CPP_NAME;
4533
4534   c = EOF;
4535   if (token2->flags & DIGRAPH)
4536     c = digraph_spellings[(int) b - (int) CPP_FIRST_DIGRAPH][0];
4537   else if (token_spellings[b].category == SPELL_OPERATOR)
4538     c = token_spellings[b].name[0];
4539
4540   /* Quickly get everything that can paste with an '='.  */
4541   if ((int) a <= (int) CPP_LAST_EQ && c == '=')
4542     return 1;
4543
4544   switch (a)
4545     {
4546     case CPP_GREATER:   return c == '>';
4547     case CPP_LESS:      return c == '<' || c == '%' || c == ':';
4548     case CPP_PLUS:      return c == '+';
4549     case CPP_MINUS:     return c == '-' || c == '>';
4550     case CPP_DIV:       return c == '/' || c == '*'; /* Comments.  */
4551     case CPP_MOD:       return c == ':' || c == '>';
4552     case CPP_AND:       return c == '&';
4553     case CPP_OR:        return c == '|';
4554     case CPP_COLON:     return c == ':' || c == '>';
4555     case CPP_DEREF:     return c == '*';
4556     case CPP_DOT:       return c == '.' || c == '%' || b == CPP_NUMBER;
4557     case CPP_HASH:      return c == '#' || c == '%'; /* Digraph form.  */
4558     case CPP_PRAGMA:
4559     case CPP_NAME:      return ((b == CPP_NUMBER
4560                                  && name_p (pfile, &token2->val.str))
4561                                 || b == CPP_NAME
4562                                 || b == CPP_CHAR || b == CPP_STRING); /* L */
4563     case CPP_NUMBER:    return (b == CPP_NUMBER || b == CPP_NAME
4564                                 || b == CPP_CHAR
4565                                 || c == '.' || c == '+' || c == '-');
4566                                       /* UCNs */
4567     case CPP_OTHER:     return ((token1->val.str.text[0] == '\\'
4568                                  && b == CPP_NAME)
4569                                 || (CPP_OPTION (pfile, objc)
4570                                     && token1->val.str.text[0] == '@'
4571                                     && (b == CPP_NAME || b == CPP_STRING)));
4572     case CPP_LESS_EQ:   return c == '>';
4573     case CPP_STRING:
4574     case CPP_WSTRING:
4575     case CPP_UTF8STRING:
4576     case CPP_STRING16:
4577     case CPP_STRING32:  return (CPP_OPTION (pfile, user_literals)
4578                                 && (b == CPP_NAME
4579                                     || (TOKEN_SPELL (token2) == SPELL_LITERAL
4580                                         && ISIDST (token2->val.str.text[0]))));
4581
4582     default:            break;
4583     }
4584
4585   return 0;
4586 }
4587
4588 /* Output all the remaining tokens on the current line, and a newline
4589    character, to FP.  Leading whitespace is removed.  If there are
4590    macros, special token padding is not performed.  */
4591 void
4592 cpp_output_line (cpp_reader *pfile, FILE *fp)
4593 {
4594   const cpp_token *token;
4595
4596   token = cpp_get_token (pfile);
4597   while (token->type != CPP_EOF)
4598     {
4599       cpp_output_token (token, fp);
4600       token = cpp_get_token (pfile);
4601       if (token->flags & PREV_WHITE)
4602         putc (' ', fp);
4603     }
4604
4605   putc ('\n', fp);
4606 }
4607
4608 /* Return a string representation of all the remaining tokens on the
4609    current line.  The result is allocated using xmalloc and must be
4610    freed by the caller.  */
4611 unsigned char *
4612 cpp_output_line_to_string (cpp_reader *pfile, const unsigned char *dir_name)
4613 {
4614   const cpp_token *token;
4615   unsigned int out = dir_name ? ustrlen (dir_name) : 0;
4616   unsigned int alloced = 120 + out;
4617   unsigned char *result = (unsigned char *) xmalloc (alloced);
4618
4619   /* If DIR_NAME is empty, there are no initial contents.  */
4620   if (dir_name)
4621     {
4622       sprintf ((char *) result, "#%s ", dir_name);
4623       out += 2;
4624     }
4625
4626   token = cpp_get_token (pfile);
4627   while (token->type != CPP_EOF)
4628     {
4629       unsigned char *last;
4630       /* Include room for a possible space and the terminating nul.  */
4631       unsigned int len = cpp_token_len (token) + 2;
4632
4633       if (out + len > alloced)
4634         {
4635           alloced *= 2;
4636           if (out + len > alloced)
4637             alloced = out + len;
4638           result = (unsigned char *) xrealloc (result, alloced);
4639         }
4640
4641       last = cpp_spell_token (pfile, token, &result[out], 0);
4642       out = last - result;
4643
4644       token = cpp_get_token (pfile);
4645       if (token->flags & PREV_WHITE)
4646         result[out++] = ' ';
4647     }
4648
4649   result[out] = '\0';
4650   return result;
4651 }
4652
/* Memory buffers.  Changing these sizing parameters can have a
   dramatic effect on performance.  The values here are reasonable
   defaults, but might be tuned.  If you adjust them, be sure to test
   across a range of uses of cpplib, including heavy nested
   function-like macro expansion.  Also check the change in peak memory
   usage (NJAMD is a good tool for this).

   MIN_BUFF_SIZE is the smallest buffer new_buff will create.
   BUFF_SIZE_UPPER_BOUND (MIN_SIZE) is the largest free buffer that
   _cpp_get_buff will hand out for a request of MIN_SIZE bytes.
   EXTENDED_BUFF_SIZE (BUFF, MIN_EXTRA) is the size requested when
   extending BUFF: MIN_EXTRA plus twice the room left in BUFF.  Every
   macro argument is parenthesized so that an expression argument
   expands as a unit.  */
#define MIN_BUFF_SIZE 8000
#define BUFF_SIZE_UPPER_BOUND(MIN_SIZE) (MIN_BUFF_SIZE + (MIN_SIZE) * 3 / 2)
#define EXTENDED_BUFF_SIZE(BUFF, MIN_EXTRA) \
        ((MIN_EXTRA) + ((BUFF)->limit - (BUFF)->cur) * 2)

#if MIN_BUFF_SIZE > BUFF_SIZE_UPPER_BOUND (0)
  #error BUFF_SIZE_UPPER_BOUND must be at least as large as MIN_BUFF_SIZE!
#endif
4667
4668 /* Create a new allocation buffer.  Place the control block at the end
4669    of the buffer, so that buffer overflows will cause immediate chaos.  */
4670 static _cpp_buff *
4671 new_buff (size_t len)
4672 {
4673   _cpp_buff *result;
4674   unsigned char *base;
4675
4676   if (len < MIN_BUFF_SIZE)
4677     len = MIN_BUFF_SIZE;
4678   len = CPP_ALIGN (len);
4679
4680 #ifdef ENABLE_VALGRIND_ANNOTATIONS
4681   /* Valgrind warns about uses of interior pointers, so put _cpp_buff
4682      struct first.  */
4683   size_t slen = CPP_ALIGN2 (sizeof (_cpp_buff), 2 * DEFAULT_ALIGNMENT);
4684   base = XNEWVEC (unsigned char, len + slen);
4685   result = (_cpp_buff *) base;
4686   base += slen;
4687 #else
4688   base = XNEWVEC (unsigned char, len + sizeof (_cpp_buff));
4689   result = (_cpp_buff *) (base + len);
4690 #endif
4691   result->base = base;
4692   result->cur = base;
4693   result->limit = base + len;
4694   result->next = NULL;
4695   return result;
4696 }
4697
4698 /* Place a chain of unwanted allocation buffers on the free list.  */
4699 void
4700 _cpp_release_buff (cpp_reader *pfile, _cpp_buff *buff)
4701 {
4702   _cpp_buff *end = buff;
4703
4704   while (end->next)
4705     end = end->next;
4706   end->next = pfile->free_buffs;
4707   pfile->free_buffs = buff;
4708 }
4709
4710 /* Return a free buffer of size at least MIN_SIZE.  */
4711 _cpp_buff *
4712 _cpp_get_buff (cpp_reader *pfile, size_t min_size)
4713 {
4714   _cpp_buff *result, **p;
4715
4716   for (p = &pfile->free_buffs;; p = &(*p)->next)
4717     {
4718       size_t size;
4719
4720       if (*p == NULL)
4721         return new_buff (min_size);
4722       result = *p;
4723       size = result->limit - result->base;
4724       /* Return a buffer that's big enough, but don't waste one that's
4725          way too big.  */
4726       if (size >= min_size && size <= BUFF_SIZE_UPPER_BOUND (min_size))
4727         break;
4728     }
4729
4730   *p = result->next;
4731   result->next = NULL;
4732   result->cur = result->base;
4733   return result;
4734 }
4735
4736 /* Creates a new buffer with enough space to hold the uncommitted
4737    remaining bytes of BUFF, and at least MIN_EXTRA more bytes.  Copies
4738    the excess bytes to the new buffer.  Chains the new buffer after
4739    BUFF, and returns the new buffer.  */
4740 _cpp_buff *
4741 _cpp_append_extend_buff (cpp_reader *pfile, _cpp_buff *buff, size_t min_extra)
4742 {
4743   size_t size = EXTENDED_BUFF_SIZE (buff, min_extra);
4744   _cpp_buff *new_buff = _cpp_get_buff (pfile, size);
4745
4746   buff->next = new_buff;
4747   memcpy (new_buff->base, buff->cur, BUFF_ROOM (buff));
4748   return new_buff;
4749 }
4750
4751 /* Creates a new buffer with enough space to hold the uncommitted
4752    remaining bytes of the buffer pointed to by BUFF, and at least
4753    MIN_EXTRA more bytes.  Copies the excess bytes to the new buffer.
4754    Chains the new buffer before the buffer pointed to by BUFF, and
4755    updates the pointer to point to the new buffer.  */
4756 void
4757 _cpp_extend_buff (cpp_reader *pfile, _cpp_buff **pbuff, size_t min_extra)
4758 {
4759   _cpp_buff *new_buff, *old_buff = *pbuff;
4760   size_t size = EXTENDED_BUFF_SIZE (old_buff, min_extra);
4761
4762   new_buff = _cpp_get_buff (pfile, size);
4763   memcpy (new_buff->base, old_buff->cur, BUFF_ROOM (old_buff));
4764   new_buff->next = old_buff;
4765   *pbuff = new_buff;
4766 }
4767
4768 /* Free a chain of buffers starting at BUFF.  */
4769 void
4770 _cpp_free_buff (_cpp_buff *buff)
4771 {
4772   _cpp_buff *next;
4773
4774   for (; buff; buff = next)
4775     {
4776       next = buff->next;
4777 #ifdef ENABLE_VALGRIND_ANNOTATIONS
4778       free (buff);
4779 #else
4780       free (buff->base);
4781 #endif
4782     }
4783 }
4784
4785 /* Allocate permanent, unaligned storage of length LEN.  */
4786 unsigned char *
4787 _cpp_unaligned_alloc (cpp_reader *pfile, size_t len)
4788 {
4789   _cpp_buff *buff = pfile->u_buff;
4790   unsigned char *result = buff->cur;
4791
4792   if (len > (size_t) (buff->limit - result))
4793     {
4794       buff = _cpp_get_buff (pfile, len);
4795       buff->next = pfile->u_buff;
4796       pfile->u_buff = buff;
4797       result = buff->cur;
4798     }
4799
4800   buff->cur = result + len;
4801   return result;
4802 }
4803
4804 /* Allocate permanent, unaligned storage of length LEN from a_buff.
4805    That buffer is used for growing allocations when saving macro
4806    replacement lists in a #define, and when parsing an answer to an
4807    assertion in #assert, #unassert or #if (and therefore possibly
4808    whilst expanding macros).  It therefore must not be used by any
4809    code that they might call: specifically the lexer and the guts of
4810    the macro expander.
4811
4812    All existing other uses clearly fit this restriction: storing
4813    registered pragmas during initialization.  */
4814 unsigned char *
4815 _cpp_aligned_alloc (cpp_reader *pfile, size_t len)
4816 {
4817   _cpp_buff *buff = pfile->a_buff;
4818   unsigned char *result = buff->cur;
4819
4820   if (len > (size_t) (buff->limit - result))
4821     {
4822       buff = _cpp_get_buff (pfile, len);
4823       buff->next = pfile->a_buff;
4824       pfile->a_buff = buff;
4825       result = buff->cur;
4826     }
4827
4828   buff->cur = result + len;
4829   return result;
4830 }
4831
4832 /* Commit or allocate storage from a buffer.  */
4833
4834 void *
4835 _cpp_commit_buff (cpp_reader *pfile, size_t size)
4836 {
4837   void *ptr = BUFF_FRONT (pfile->a_buff);
4838
4839   if (pfile->hash_table->alloc_subobject)
4840     {
4841       void *copy = pfile->hash_table->alloc_subobject (size);
4842       memcpy (copy, ptr, size);
4843       ptr = copy;
4844     }
4845   else
4846     BUFF_FRONT (pfile->a_buff) += size;
4847
4848   return ptr;
4849 }
4850
4851 /* Say which field of TOK is in use.  */
4852
4853 enum cpp_token_fld_kind
4854 cpp_token_val_index (const cpp_token *tok)
4855 {
4856   switch (TOKEN_SPELL (tok))
4857     {
4858     case SPELL_IDENT:
4859       return CPP_TOKEN_FLD_NODE;
4860     case SPELL_LITERAL:
4861       return CPP_TOKEN_FLD_STR;
4862     case SPELL_OPERATOR:
4863       /* Operands which were originally spelled as ident keep around
4864          the node for the exact spelling.  */
4865       if (tok->flags & NAMED_OP)
4866         return CPP_TOKEN_FLD_NODE;
4867       else if (tok->type == CPP_PASTE)
4868         return CPP_TOKEN_FLD_TOKEN_NO;
4869       else
4870         return CPP_TOKEN_FLD_NONE;
4871     case SPELL_NONE:
4872       if (tok->type == CPP_MACRO_ARG)
4873         return CPP_TOKEN_FLD_ARG_NO;
4874       else if (tok->type == CPP_PADDING)
4875         return CPP_TOKEN_FLD_SOURCE;
4876       else if (tok->type == CPP_PRAGMA)
4877         return CPP_TOKEN_FLD_PRAGMA;
4878       /* fall through */
4879     default:
4880       return CPP_TOKEN_FLD_NONE;
4881     }
4882 }
4883
4884 /* All tokens lexed in R after calling this function will be forced to
4885    have their location_t to be P, until
4886    cpp_stop_forcing_token_locations is called for R.  */
4887
4888 void
4889 cpp_force_token_locations (cpp_reader *r, location_t loc)
4890 {
4891   r->forced_token_location = loc;
4892 }
4893
4894 /* Go back to assigning locations naturally for lexed tokens.  */
4895
4896 void
4897 cpp_stop_forcing_token_locations (cpp_reader *r)
4898 {
4899   r->forced_token_location = 0;
4900 }
4901
/* PEEK points at a backslash.  If it escapes an end of line ("\n",
   "\r" or "\r\n"), return a pointer to the first character after the
   line ending, repeating for as long as that character is itself a
   backslash escaping an end of line.  If the character after the line
   ending would be at or beyond LIMIT, do not advance.  A backslash
   that escapes nothing is returned as is.  */

static const unsigned char *
do_peek_backslash (const unsigned char *peek, const unsigned char *limit)
{
  for (;;)
    {
      const unsigned char *after;

      if (__builtin_expect (peek[1] == '\n', true))
        after = peek + 2;
      else if (__builtin_expect (peek[1] == '\r', false))
        /* A CR may be followed by an LF belonging to the same line
           ending.  */
        after = peek + (peek[2] == '\n' ? 3 : 2);
      else
        return peek;

      if (!__builtin_expect (after < limit, true))
        return peek;

      peek = after;

      /* The user might be perverse and continue the line again.  */
      if (*peek != '\\')
        return peek;
    }
}
4931
/* Return PEEK, moved past any escaped end of line that starts at it.
   If PEEK does not point at a backslash it is returned unchanged.  */

static const unsigned char *
do_peek_next (const unsigned char *peek, const unsigned char *limit)
{
  return (__builtin_expect (*peek == '\\', false)
          ? do_peek_backslash (peek, limit)
          : peek);
}
4939
/* Step back one character from PEEK without going below BOUND.
   Returns NULL if PEEK is already at BOUND.  Otherwise returns a
   pointer to the preceding character, except that when that character
   ends a line ("\n", "\r" or the "\n" of "\r\n") and the line ending
   is itself preceded by a backslash, the escaped line ending is
   skipped and the search continues backwards from the backslash.  A
   line ending that sits at BOUND, or whose "\r" sits at BOUND, is
   returned without looking further back.  */

static const unsigned char *
do_peek_prev (const unsigned char *peek, const unsigned char *bound)
{
  if (peek == bound)
    return NULL;

  unsigned char c = *--peek;
  /* Test for a carriage return, '\r', not the letter 'r': a backslash
     followed by the letter r is ordinary text, not a continuation.  */
  if (__builtin_expect (c == '\n', false)
      || __builtin_expect (c == '\r', false))
    {
      if (peek == bound)
        return peek;

      /* IX is the offset of the character before the line ending.  */
      int ix = -1;
      if (c == '\n' && peek[ix] == '\r')
        {
          if (peek + ix == bound)
            return peek;
          ix--;
        }

      if (peek[ix] == '\\')
        return do_peek_prev (peek + ix, bound);

      return peek;
    }
  else
    return peek;
}
4968
/* PEEK[-1] is the first character of identifier MATCH.  If the text
   at PEEK spells the rest of MATCH (escaped ends of line are allowed
   between characters) and is not followed by a further identifier
   character, scan past it and any trailing spaces, tabs and escaped
   ends of line, and return the resulting position.  Otherwise return
   NULL.  */

static const unsigned char *
do_peek_ident (const char *match, const unsigned char *peek,
               const unsigned char *limit)
{
  const char *want;

  /* The first character of MATCH has already been seen.  */
  for (want = match + 1; *want; want++)
    {
      if (*peek != *want)
        {
          /* Retry after stepping over an escaped end of line.  */
          peek = do_peek_next (peek, limit);
          if (*peek != *want)
            return NULL;
        }
      peek++;
    }

  /* Must now not be looking at an identifier char.  */
  peek = do_peek_next (peek, limit);
  if (ISIDNUM (*peek))
    return NULL;

  /* Skip control-line whitespace.  */
  for (;;)
    {
      while (*peek == ' ' || *peek == '\t')
        peek++;

      if (!__builtin_expect (*peek == '\\', false))
        break;

      peek = do_peek_backslash (peek, limit);
      /* Still on a backslash: it did not escape an end of line.  */
      if (*peek == '\\')
        break;
    }

  return peek;
}
5002
5003 /* Are we looking at a module control line starting as PEEK - 1?  */
5004
5005 static bool
5006 do_peek_module (cpp_reader *pfile, unsigned char c,
5007                 const unsigned char *peek, const unsigned char *limit)
5008 {
5009   bool import = false;
5010
5011   if (__builtin_expect (c == 'e', false))
5012     {
5013       if (!((peek[0] == 'x' || peek[0] == '\\')
5014             && (peek = do_peek_ident ("export", peek, limit))))
5015         return false;
5016
5017       /* export, peek for import or module.  No need to peek __import
5018          here.  */
5019       if (peek[0] == 'i')
5020         {
5021           if (!((peek[1] == 'm' || peek[1] == '\\')
5022                 && (peek = do_peek_ident ("import", peek + 1, limit))))
5023             return false;
5024           import = true;
5025         }
5026       else if (peek[0] == 'm')
5027         {
5028           if (!((peek[1] == 'o' || peek[1] == '\\')
5029                 && (peek = do_peek_ident ("module", peek + 1, limit))))
5030             return false;
5031         }
5032       else
5033         return false;
5034     }
5035   else if (__builtin_expect (c == 'i', false))
5036     {
5037       if (!((peek[0] == 'm' || peek[0] == '\\')
5038             && (peek = do_peek_ident ("import", peek, limit))))
5039         return false;
5040       import = true;
5041     }
5042   else if (__builtin_expect (c == '_', false))
5043     {
5044       /* Needed for translated includes.   */
5045       if (!((peek[0] == '_' || peek[0] == '\\')
5046             && (peek = do_peek_ident ("__import", peek, limit))))
5047         return false;
5048       import = true;
5049     }
5050   else if (__builtin_expect (c == 'm', false))
5051     {
5052       if (!((peek[0] == 'o' || peek[0] == '\\')
5053             && (peek = do_peek_ident ("module", peek, limit))))
5054         return false;
5055     }
5056   else
5057     return false;
5058
5059   /* Peek the next character to see if it's good enough.  We'll be at
5060      the first non-whitespace char, including skipping an escaped
5061      newline.  */
5062   /* ... import followed by identifier, ':', '<' or header-name
5063      preprocessing tokens, or module followed by identifier, ':' or
5064      ';' preprocessing tokens.  */
5065   unsigned char p = *peek++;
5066
5067   /* A character literal is ... single quotes, ... optionally preceded
5068      by u8, u, U, or L */
5069   /* A string-literal is a ... double quotes, optionally prefixed by
5070      R, u8, u8R, u, uR, U, UR, L, or LR */
5071   if (p == 'u')
5072     {
5073       peek = do_peek_next (peek, limit);
5074       if (*peek == '8')
5075         {
5076           peek++;
5077           goto peek_u8;
5078         }
5079       goto peek_u;
5080     }
5081   else if (p == 'U' || p == 'L')
5082     {
5083     peek_u8:
5084       peek = do_peek_next (peek, limit);
5085     peek_u:
5086       if (*peek == '\"' || *peek == '\'')
5087         return false;
5088
5089       if (*peek == 'R')
5090         goto peek_R;
5091       /* Identifier. Ok.  */
5092     }
5093   else if (p == 'R')
5094     {
5095     peek_R:
5096       if (CPP_OPTION (pfile, rliterals))
5097         {
5098           peek = do_peek_next (peek, limit);
5099           if (*peek == '\"')
5100             return false;
5101         }
5102       /* Identifier. Ok.  */
5103     }
5104   else if ('Z' - 'A' == 25
5105            ? ((p >= 'A' && p <= 'Z') || (p >= 'a' && p <= 'z') || p == '_')
5106            : ISIDST (p))
5107     {
5108       /* Identifier.  Ok. */
5109     }
5110   else if (p == '<')
5111     {
5112       /* Maybe angle header, ok for import.  Reject
5113          '<=', '<<' digraph:'<:'.  */
5114       if (!import)
5115         return false;
5116       peek = do_peek_next (peek, limit);
5117       if (*peek == '=' || *peek == '<'
5118           || (*peek == ':' && CPP_OPTION (pfile, digraphs)))
5119         return false;
5120     }
5121   else if (p == ';')
5122     {
5123       /* SEMICOLON, ok for module.  */
5124       if (import)
5125         return false;
5126     }
5127   else if (p == '"')
5128     {
5129       /* STRING, ok for import.  */
5130       if (!import)
5131         return false;
5132     }
5133   else if (p == ':')
5134     {
5135       /* Maybe COLON, ok.  Reject '::', digraph:':>'.  */
5136       peek = do_peek_next (peek, limit);
5137       if (*peek == ':' || (*peek == '>' && CPP_OPTION (pfile, digraphs)))
5138         return false;
5139     }
5140   else
5141     /* FIXME: Detect a unicode character, excluding those not
5142        permitted as the initial character. [lex.name]/1.  I presume
5143        we need to check the \[uU] spellings, and directly using
5144        Unicode in say UTF8 form?  Or perhaps we do the phase-1
5145        conversion of UTF8 to universal-character-names?  */
5146     return false;
5147
5148   return true;
5149 }
5150
5151 /* Directives-only scanning.  Somewhat more relaxed than correct
5152    parsing -- some ill-formed programs will not be rejected.  */
5153
5154 void
5155 cpp_directive_only_process (cpp_reader *pfile,
5156                             void *data,
5157                             void (*cb) (cpp_reader *, CPP_DO_task, void *, ...))
5158 {
5159   bool module_p = CPP_OPTION (pfile, module_directives);
5160
5161   do
5162     {
5163     restart:
5164       /* Buffer initialization, but no line cleaning. */
5165       cpp_buffer *buffer = pfile->buffer;
5166       buffer->cur_note = buffer->notes_used = 0;
5167       buffer->cur = buffer->line_base = buffer->next_line;
5168       buffer->need_line = false;
5169       /* Files always end in a newline or carriage return.  We rely on this for
5170          character peeking safety.  */
5171       gcc_assert (buffer->rlimit[0] == '\n' || buffer->rlimit[0] == '\r');
5172
5173       const unsigned char *base = buffer->cur;
5174       unsigned line_count = 0;
5175       const unsigned char *line_start = base;
5176
5177       bool bol = true;
5178       bool raw = false;
5179
5180       const unsigned char *lwm = base;
5181       for (const unsigned char *pos = base, *limit = buffer->rlimit;
5182            pos < limit;)
5183         {
5184           unsigned char c = *pos++;
5185           /* This matches the switch in _cpp_lex_direct.  */
5186           switch (c)
5187             {
5188             case ' ': case '\t': case '\f': case '\v':
5189               /* Whitespace, do nothing.  */
5190               break;
5191
5192             case '\r': /* MAC line ending, or Windows \r\n  */
5193               if (*pos == '\n')
5194                 pos++;
5195               /* FALLTHROUGH */
5196
5197             case '\n':
5198               bol = true;
5199
5200             next_line:
5201               CPP_INCREMENT_LINE (pfile, 0);
5202               line_count++;
5203               line_start = pos;
5204               break;
5205
5206             case '\\':
5207               /* <backslash><newline> is removed, and doesn't undo any
5208                  preceeding escape or whatnot.  */
5209               if (*pos == '\n')
5210                 {
5211                   pos++;
5212                   goto next_line;
5213                 }
5214               else if (*pos == '\r')
5215                 {
5216                   if (pos[1] == '\n')
5217                     pos++;
5218                   pos++;
5219                   goto next_line;
5220                 }
5221               goto dflt;
5222
5223             case '#':
5224               if (bol)
5225                 {
5226                   /* Line directive.  */
5227                   if (pos - 1 > base && !pfile->state.skipping)
5228                     cb (pfile, CPP_DO_print, data,
5229                         line_count, base, pos - 1 - base);
5230
5231                   /* Prep things for directive handling. */
5232                   buffer->next_line = pos;
5233                   buffer->need_line = true;
5234                   bool ok = _cpp_get_fresh_line (pfile);
5235                   gcc_checking_assert (ok);
5236
5237                   /* Ensure proper column numbering for generated
5238                      error messages. */
5239                   buffer->line_base -= pos - line_start;
5240
5241                   _cpp_handle_directive (pfile, line_start + 1 != pos);
5242
5243                   /* Sanitize the line settings.  Duplicate #include's can
5244                      mess things up. */
5245                   // FIXME: Necessary?
5246                   pfile->line_table->highest_location
5247                     = pfile->line_table->highest_line;
5248
5249                   if (!pfile->state.skipping
5250                       && pfile->buffer->next_line < pfile->buffer->rlimit)
5251                     cb (pfile, CPP_DO_location, data,
5252                         pfile->line_table->highest_line);
5253
5254                   goto restart;
5255                 }
5256               goto dflt;
5257
5258             case '/':
5259               {
5260                 const unsigned char *peek = do_peek_next (pos, limit);
5261                 if (!(*peek == '/' || *peek == '*'))
5262                   goto dflt;
5263
5264                 /* Line or block comment  */
5265                 bool is_block = *peek == '*';
5266                 bool star = false;
5267                 bool esc = false;
5268                 location_t sloc
5269                   = linemap_position_for_column (pfile->line_table,
5270                                                  pos - line_start);
5271
5272                 while (pos < limit)
5273                   {
5274                     char c = *pos++;
5275                     switch (c)
5276                       {
5277                       case '\\':
5278                         esc = true;
5279                         break;
5280
5281                       case '\r':
5282                         if (*pos == '\n')
5283                           pos++;
5284                         /* FALLTHROUGH  */
5285
5286                       case '\n':
5287                         {
5288                           CPP_INCREMENT_LINE (pfile, 0);
5289                           line_count++;
5290                           line_start = pos;
5291                           if (!esc && !is_block)
5292                             {
5293                               bol = true;
5294                               goto done_comment;
5295                             }
5296                         }
5297                         if (!esc)
5298                           star = false;
5299                         esc = false;
5300                         break;
5301
5302                       case '*':
5303                         if (pos > peek)
5304                           star = is_block;
5305                         esc = false;
5306                         break;
5307
5308                       case '/':
5309                         if (star)
5310                           goto done_comment;
5311                         /* FALLTHROUGH  */
5312
5313                       default:
5314                         star = false;
5315                         esc = false;
5316                         break;
5317                       }
5318                   }
5319                 if (pos < limit || is_block)
5320                   cpp_error_with_line (pfile, CPP_DL_ERROR, sloc, 0,
5321                                        "unterminated comment");
5322               done_comment:
5323                 lwm = pos;
5324                 break;
5325               }
5326
5327             case '\'':
5328               if (!CPP_OPTION (pfile, digit_separators))
5329                 goto delimited_string;
5330
5331               /* Possibly a number punctuator.  */
5332               if (!ISIDNUM (*do_peek_next (pos, limit)))
5333                 goto delimited_string;
5334
5335               goto quote_peek;
5336
5337             case '\"':
5338               if (!CPP_OPTION (pfile, rliterals))
5339                 goto delimited_string;
5340
5341             quote_peek:
5342               {
5343                 /* For ' see if it's a number punctuator
5344                    \.?<digit>(<digit>|<identifier-nondigit>
5345                    |'<digit>|'<nondigit>|[eEpP]<sign>|\.)* */
5346                 /* For " see if it's a raw string
5347                    {U,L,u,u8}R.  This includes CPP_NUMBER detection,
5348                    because that could be 0e+R.  */
5349                 const unsigned char *peek = pos - 1;
5350                 bool quote_first = c == '"';
5351                 bool quote_eight = false;
5352                 bool maybe_number_start = false;
5353                 bool want_number = false;
5354
5355                 while ((peek = do_peek_prev (peek, lwm)))
5356                   {
5357                     unsigned char p = *peek;
5358                     if (quote_first)
5359                       {
5360                         if (!raw)
5361                           {
5362                             if (p != 'R')
5363                               break;
5364                             raw = true;
5365                             continue;
5366                           }
5367
5368                         quote_first = false;
5369                         if (p == 'L' || p == 'U' || p == 'u')
5370                           ;
5371                         else if (p == '8')
5372                           quote_eight = true;
5373                         else
5374                           goto second_raw;
5375                       }
5376                     else if (quote_eight)
5377                       {
5378                         if (p != 'u')
5379                           {
5380                             raw = false;
5381                             break;
5382                           }
5383                         quote_eight = false;
5384                       }
5385                     else if (c == '"')
5386                       {
5387                       second_raw:;
5388                         if (!want_number && ISIDNUM (p))
5389                           {
5390                             raw = false;
5391                             break;
5392                           }
5393                       }
5394
5395                     if (ISDIGIT (p))
5396                       maybe_number_start = true;
5397                     else if (p == '.')
5398                       want_number = true;
5399                     else if (ISIDNUM (p))
5400                       maybe_number_start = false;
5401                     else if (p == '+' || p == '-')
5402                       {
5403                         if (const unsigned char *peek_prev
5404                             = do_peek_prev (peek, lwm))
5405                           {
5406                             p = *peek_prev;
5407                             if (p == 'e' || p == 'E'
5408                                 || p == 'p' || p == 'P')
5409                               {
5410                                 want_number = true;
5411                                 maybe_number_start = false;
5412                               }
5413                             else
5414                               break;
5415                           }
5416                         else
5417                           break;
5418                       }
5419                     else if (p == '\'' || p == '\"')
5420                       {
5421                         /* If this is lwm, this must be the end of a
5422                            previous string.  So this is a trailing
5423                            literal type, (a) if those are allowed,
5424                              and (b) maybe_number_start is false.  Otherwise
5425                              this must be a CPP_NUMBER because we've
5426                              met another ', and we'd have checked that
5427                              in its own right.  */
5428                         if (peek == lwm && CPP_OPTION (pfile, uliterals))
5429                           {
5430                             if  (!maybe_number_start && !want_number)
5431                               /* Must be a literal type.  */
5432                               raw = false;
5433                           }
5434                         else if (p == '\''
5435                                  && CPP_OPTION (pfile, digit_separators))
5436                           maybe_number_start = true;
5437                         break;
5438                       }
5439                     else if (c == '\'')
5440                       break;
5441                     else if (!quote_first && !quote_eight)
5442                       break;
5443                   }
5444
5445                 if (maybe_number_start)
5446                   {
5447                     if (c == '\'')
5448                       /* A CPP NUMBER.  */
5449                       goto dflt;
5450                     raw = false;
5451                   }
5452
5453                 goto delimited_string;
5454               }
5455
5456             delimited_string:
5457               {
5458                 /* (Possibly raw) string or char literal.  */
5459                 unsigned char end = c;
5460                 int delim_len = -1;
5461                 const unsigned char *delim = NULL;
5462                 location_t sloc = linemap_position_for_column (pfile->line_table,
5463                                                                pos - line_start);
5464                 int esc = 0;
5465
5466                 if (raw)
5467                   {
5468                     /* There can be no line breaks in the delimiter.  */
5469                     delim = pos;
5470                     for (delim_len = 0; (c = *pos++) != '('; delim_len++)
5471                       {
5472                         if (delim_len == 16)
5473                           {
5474                             cpp_error_with_line (pfile, CPP_DL_ERROR,
5475                                                  sloc, 0,
5476                                                  "raw string delimiter"
5477                                                  " longer than %d"
5478                                                  " characters",
5479                                                  delim_len);
5480                             raw = false;
5481                             pos = delim;
5482                             break;
5483                           }
5484                         if (strchr (") \\\t\v\f\n", c))
5485                           {
5486                             cpp_error_with_line (pfile, CPP_DL_ERROR,
5487                                                  sloc, 0,
5488                                                  "invalid character '%c'"
5489                                                  " in raw string"
5490                                                  " delimiter", c);
5491                             raw = false;
5492                             pos = delim;
5493                             break;
5494                           }
5495                         if (pos >= limit)
5496                           goto bad_string;
5497                       }
5498                   }
5499
5500                 while (pos < limit)
5501                   {
5502                     char c = *pos++;
5503                     switch (c)
5504                       {
5505                       case '\\':
5506                         if (!raw)
5507                           esc++;
5508                         break;
5509
5510                       case '\r':
5511                         if (*pos == '\n')
5512                           pos++;
5513                         /* FALLTHROUGH  */
5514
5515                       case '\n':
5516                         {
5517                           CPP_INCREMENT_LINE (pfile, 0);
5518                           line_count++;
5519                           line_start = pos;
5520                         }
5521                         if (esc)
5522                           esc--;
5523                         break;
5524
5525                       case ')':
5526                         if (raw
5527                             && pos + delim_len + 1 < limit
5528                             && pos[delim_len] == end
5529                             && !memcmp (delim, pos, delim_len))
5530                           {
5531                             pos += delim_len + 1;
5532                             raw = false;
5533                             goto done_string;
5534                           }
5535                         break;
5536
5537                       default:
5538                         if (!raw && !(esc & 1) && c == end)
5539                           goto done_string;
5540                         esc = 0;
5541                         break;
5542                       }
5543                   }
5544               bad_string:
5545                 cpp_error_with_line (pfile, CPP_DL_ERROR, sloc, 0,
5546                                      "unterminated literal");
5547
5548               done_string:
5549                 raw = false;
5550                 lwm = pos - 1;
5551               }
5552               goto dflt;
5553
5554             case '_':
5555             case 'e':
5556             case 'i':
5557             case 'm':
5558               if (bol && module_p && !pfile->state.skipping
5559                   && do_peek_module (pfile, c, pos, limit))
5560                 {
5561                   /* We've seen the start of a module control line.
5562                      Start up the tokenizer.  */
5563                   pos--; /* Backup over the first character.  */
5564
5565                   /* Backup over whitespace to start of line.  */
5566                   while (pos > line_start
5567                          && (pos[-1] == ' ' || pos[-1] == '\t'))
5568                     pos--;
5569
5570                   if (pos > base)
5571                     cb (pfile, CPP_DO_print, data, line_count, base, pos - base);
5572
5573                   /* Prep things for directive handling. */
5574                   buffer->next_line = pos;
5575                   buffer->need_line = true;
5576
5577                   /* Now get tokens until the PRAGMA_EOL.  */
5578                   do
5579                     {
5580                       location_t spelling;
5581                       const cpp_token *tok
5582                         = cpp_get_token_with_location (pfile, &spelling);
5583
5584                       gcc_assert (pfile->state.in_deferred_pragma
5585                                   || tok->type == CPP_PRAGMA_EOL);
5586                       cb (pfile, CPP_DO_token, data, tok, spelling);
5587                     }
5588                   while (pfile->state.in_deferred_pragma);
5589
5590                   if (pfile->buffer->next_line < pfile->buffer->rlimit)
5591                     cb (pfile, CPP_DO_location, data,
5592                         pfile->line_table->highest_line);
5593
5594                   pfile->mi_valid = false;
5595                   goto restart;
5596                 }
5597               goto dflt;
5598
5599             default:
5600             dflt:
5601               bol = false;
5602               pfile->mi_valid = false;
5603               break;
5604             }
5605         }
5606
5607       if (buffer->rlimit > base && !pfile->state.skipping)
5608         {
5609           const unsigned char *limit = buffer->rlimit;
5610           /* If the file was not newline terminated, add rlimit, which is
5611              guaranteed to point to a newline, to the end of our range.  */
5612           if (limit[-1] != '\n')
5613             {
5614               limit++;
5615               CPP_INCREMENT_LINE (pfile, 0);
5616               line_count++;
5617             }
5618           cb (pfile, CPP_DO_print, data, line_count, base, limit - base);
5619         }
5620
5621       _cpp_pop_buffer (pfile);
5622     }
5623   while (pfile->buffer);
5624 }