libcpp/lex.cc

   1 /* CPP Library - lexical analysis.
   2    Copyright (C) 2000-2022 Free Software Foundation, Inc.
   3    Contributed by Per Bothner, 1994-95.
   4    Based on CCCP program by Paul Rubin, June 1986
   5    Adapted to ANSI C, Richard Stallman, Jan 1987
   6    Broken out to separate file, Zack Weinberg, Mar 2000
   7
   8 This program is free software; you can redistribute it and/or modify it
   9 under the terms of the GNU General Public License as published by the
  10 Free Software Foundation; either version 3, or (at your option) any
  11 later version.
  12
  13 This program is distributed in the hope that it will be useful,
  14 but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16 GNU General Public License for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with this program; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "cpplib.h"
  25 #include "internal.h"
  26
  27 enum spell_type   /* How a token type's spelling is recorded in token_spellings.  */
  28 {
  29   SPELL_OPERATOR = 0,   /* Used by OP() entries, which store the operator's text.  */
  30   SPELL_IDENT,          /* These three are selected by name through the  */
  31   SPELL_LITERAL,        /* SPELL_ ## s paste in TK(); such entries store  */
  32   SPELL_NONE            /* the stringified enumerator name.  */
  33 };
  34
  35 struct token_spelling   /* One row of the token_spellings table.  */
  36 {
  37   enum spell_type category;   /* Spelling class; read through TOKEN_SPELL.  */
  38   const unsigned char *name;  /* Text stored by OP()/TK(); read through TOKEN_NAME.  */
  39 };
  40
  41 static const unsigned char *const digraph_spellings[] =
  42 { UC"%:", UC"%:%:", UC"<:", UC":>", UC"<%", UC"%>" };  /* Digraph forms.  */
  43
     /* Build token_spellings by expanding TTYPE_TABLE under temporary
        definitions of OP and TK.  OP(e, s) records the string S with
        category SPELL_OPERATOR; TK(e, s) records category SPELL_<s>
        together with the stringified enumerator name E.  Both macros are
        undefined again right after the table.  */
  44 #define OP(e, s) { SPELL_OPERATOR, UC s  },
  45 #define TK(e, s) { SPELL_ ## s,    UC #e },
  46 static const struct token_spelling token_spellings[N_TTYPES] = { TTYPE_TABLE };
  47 #undef OP
  48 #undef TK
  49
     /* Look up the spelling category, respectively the stored name, that
        token_spellings holds for TOKEN's type.  TOKEN is a pointer.  */
  50 #define TOKEN_SPELL(token) (token_spellings[(token)->type].category)
  51 #define TOKEN_NAME(token) (token_spellings[(token)->type].name)
  52
  53 /* ISO 10646 defines the UCS codespace as the range 0-0x10FFFF inclusive.  */
  54 #define UCS_LIMIT 0x10FFFF
  55
  /* Forward declarations of this file's static helpers.  */
  56 static void add_line_note (cpp_buffer *, const uchar *, unsigned int);
  57 static int skip_line_comment (cpp_reader *);
  58 static void skip_whitespace (cpp_reader *, cppchar_t);
  59 static void lex_string (cpp_reader *, cpp_token *, const uchar *);
  60 static void save_comment (cpp_reader *, cpp_token *, const uchar *, cppchar_t);
  61 static void store_comment (cpp_reader *, cpp_token *);
  62 static void create_literal (cpp_reader *, cpp_token *, const uchar *,
  63                             unsigned int, enum cpp_ttype);
  64 static bool warn_in_comment (cpp_reader *, _cpp_line_note *);
  65 static int name_p (cpp_reader *, const cpp_string *);
  66 static tokenrun *next_tokenrun (tokenrun *);
  67
  68 static _cpp_buff *new_buff (size_t);
  69
  70
  71 /* Utility routine:
  72
  73    Compares, the token TOKEN to the NUL-terminated string STRING.
  74    TOKEN must be a CPP_NAME.  Returns 1 for equal, 0 for unequal.  */
  75 int
  76 cpp_ideq (const cpp_token *token, const char *string)
  77 {
  78   if (token->type != CPP_NAME)
  79     return 0;
  80
  81   return !ustrcmp (NODE_NAME (token->val.node.node), (const uchar *) string);
  82 }
  83
  84 /* Record a note TYPE at byte POS into the current cleaned logical
  85    line.  */
  86 static void
  87 add_line_note (cpp_buffer *buffer, const uchar *pos, unsigned int type)
  88 {
  89   if (buffer->notes_used == buffer->notes_cap)
  90     {
  91       buffer->notes_cap = buffer->notes_cap * 2 + 200;
  92       buffer->notes = XRESIZEVEC (_cpp_line_note, buffer->notes,
  93                                   buffer->notes_cap);
  94     }
  95
  96   buffer->notes[buffer->notes_used].pos = pos;
  97   buffer->notes[buffer->notes_used].type = type;
  98   buffer->notes_used++;
  99 }
 100
 101 \f
 102 /* Fast path to find line special characters using optimized character
 103    scanning algorithms.  Anything complicated falls back to the slow
 104    path below.  Since this loop is very hot it's worth doing these kinds
 105    of optimizations.
 106
 107    One of the paths through the ifdefs should provide
 108
 109      const uchar *search_line_fast (const uchar *s, const uchar *end);
 110
 111    Between S and END, search for \n, \r, \\, ?.  Return a pointer to
 112    the found character.
 113
 114    Note that the last character of the buffer is *always* a newline,
 115    as forced by _cpp_convert_input.  This fact can be used to avoid
 116    explicitly looking for the end of the buffer.  */
 117
 118 /* Configure gives us an ifdef test.  Give the macro the value 0 when
        it is not defined so that the code below can test it in ordinary
        `if' conditions.  */
 119 #ifndef WORDS_BIGENDIAN
 120 #define WORDS_BIGENDIAN 0
 121 #endif
 122
 123 /* We'd like the largest integer that fits into a register.  There's nothing
 124    in <stdint.h> that gives us that.  For most hosts this is unsigned long,
 125    but MS decided on an LLP64 model.  Thankfully when building with GCC we
 126    can get the "real" word size.  */
 127 #ifdef __GNUC__
 128 typedef unsigned int word_type __attribute__((__mode__(__word__)));
 129 #else
 130 typedef unsigned long word_type;
 131 #endif
 132
 133 /* The code below is only expecting sizes 4 or 8.
 134    Die at compile-time if this expectation is violated.  The array
        length below evaluates to 1 when the size is 4 or 8 and to -1
        otherwise, and a negative array length does not compile.  */
 135 typedef char check_word_type_size
 136   [(sizeof(word_type) == 8 || sizeof(word_type) == 4) * 2 - 1];
 137
 138 /* Return X with the first N bytes forced to values that won't match one
 139    of the interesting characters.  Note that NUL is not interesting.  */
 140
 141 static inline word_type
 142 acc_char_mask_misalign (word_type val, unsigned int n)
 143 {
 144   word_type mask = -1;
 145   if (WORDS_BIGENDIAN)
 146     mask >>= n * 8;
 147   else
 148     mask <<= n * 8;
 149   return val & mask;
 150 }
 151
 152 /* Return X replicated to all byte positions within WORD_TYPE.  */
 153
 154 static inline word_type
 155 acc_char_replicate (uchar x)
 156 {
 157   word_type ret;
 158
 159   ret = (x << 24) | (x << 16) | (x << 8) | x;
 160   if (sizeof(word_type) == 8)
 161     ret = (ret << 16 << 16) | ret;
 162   return ret;
 163 }
 164
 165 /* Return non-zero if some byte of VAL is (probably) C.  */
 166
 167 static inline word_type
 168 acc_char_cmp (word_type val, word_type c)
 169 {
 170 #if defined(__GNUC__) && defined(__alpha__)
 171   /* We can get exact results using a compare-bytes instruction.
 172      Get (val == c) via (0 >= (val ^ c)).  */
 173   return __builtin_alpha_cmpbge (0, val ^ c);
 174 #else
 175   word_type magic = 0x7efefefeU;
 176   if (sizeof(word_type) == 8)
 177     magic = (magic << 16 << 16) | 0xfefefefeU;
 178   magic |= 1;
 179
 180   val ^= c;
 181   return ((val + magic) ^ ~val) & ~magic;
 182 #endif
 183 }
 184
 185 /* Given the result of acc_char_cmp is non-zero, return the index of
 186    the found character.  If this was a false positive, return -1.  */
 187
 188 static inline int
 189 acc_char_index (word_type cmp ATTRIBUTE_UNUSED,
 190                 word_type val ATTRIBUTE_UNUSED)
 191 {
 192 #if defined(__GNUC__) && defined(__alpha__) && !WORDS_BIGENDIAN
 193   /* The cmpbge instruction sets *bits* of the result corresponding to
 194      matches in the bytes with no false positives.  */
 195   return __builtin_ctzl (cmp);
 196 #else
 197   unsigned int i;
 198
 199   /* ??? It would be nice to force unrolling here,
 200      and have all of these constants folded.  */
 201   for (i = 0; i < sizeof(word_type); ++i)
 202     {
 203       uchar c;
 204       if (WORDS_BIGENDIAN)
 205         c = (val >> (sizeof(word_type) - i - 1) * 8) & 0xff;
 206       else
 207         c = (val >> i * 8) & 0xff;
 208
 209       if (c == '\n' || c == '\r' || c == '\\' || c == '?')
 210         return i;
 211     }
 212
 213   return -1;
 214 #endif
 215 }
 216
 217 /* A version of the fast scanner using bit fiddling techniques.
 218
 219    For 32-bit words, one would normally perform 16 comparisons and
 220    16 branches.  With this algorithm one performs 24 arithmetic
 221    operations and one branch.  Whether this is faster with a 32-bit
 222    word size is going to be somewhat system dependent.
 223
 224    For 64-bit words, we eliminate twice the number of comparisons
 225    and branches without increasing the number of arithmetic operations.
 226    It's almost certainly going to be a win with 64-bit word size.  */
 227
 228 static const uchar * search_line_acc_char (const uchar *, const uchar *)
 229   ATTRIBUTE_UNUSED;
 230
 231 static const uchar *
 232 search_line_acc_char (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 233 {
 234   const word_type repl_nl = acc_char_replicate ('\n');
 235   const word_type repl_cr = acc_char_replicate ('\r');
 236   const word_type repl_bs = acc_char_replicate ('\\');
 237   const word_type repl_qm = acc_char_replicate ('?');
 238
 239   unsigned int misalign;
 240   const word_type *p;
 241   word_type val, t;
 242
 243   /* Align the buffer.  Mask out any bytes from before the beginning.  */
 244   p = (word_type *)((uintptr_t)s & -sizeof(word_type));
 245   val = *p;
 246   misalign = (uintptr_t)s & (sizeof(word_type) - 1);
 247   if (misalign)
 248     val = acc_char_mask_misalign (val, misalign);
 249
 250   /* Main loop.  */
 251   while (1)
 252     {
 253       t  = acc_char_cmp (val, repl_nl);
 254       t |= acc_char_cmp (val, repl_cr);
 255       t |= acc_char_cmp (val, repl_bs);
 256       t |= acc_char_cmp (val, repl_qm);
 257
 258       if (__builtin_expect (t != 0, 0))
 259         {
 260           int i = acc_char_index (t, val);
 261           if (i >= 0)
 262             return (const uchar *)p + i;
 263         }
 264
 265       val = *++p;
 266     }
 267 }
 268
 269 /* Disable on Solaris 2/x86 until the following problem can be properly
 270    autoconfed:
 271
 272    The Solaris 10+ assembler tags objects with the instruction set
 273    extensions used, so SSE4.2 executables cannot run on machines that
 274    don't support that extension.  */
 275
 276 #if (GCC_VERSION >= 4005) && (__GNUC__ >= 5 || !defined(__PIC__)) && (defined(__i386__) || defined(__x86_64__)) && !(defined(__sun__) && defined(__svr4__))
 277
 278 /* Replicated character data to be shared between implementations.
 279    Recall that outside of a context with vector support we can't
 280    define compatible vector types, therefore these are all defined
 281    in terms of raw characters.

        Row 0 holds '\n', row 1 '\r', row 2 '\\' and row 3 '?', each
        repeated 16 times.  The array is 16-byte aligned and the vector
        implementations below read each row as one vector.  */
 282 static const char repl_chars[4][16] __attribute__((aligned(16))) = {
 283   { '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
 284     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n' },
 285   { '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
 286     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r' },
 287   { '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
 288     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\' },
 289   { '?', '?', '?', '?', '?', '?', '?', '?',
 290     '?', '?', '?', '?', '?', '?', '?', '?' },
 291 };
 292
 293 /* A version of the fast scanner using MMX vectorized byte compare insns.
 294
 295    This uses the PMOVMSKB instruction which was introduced with "MMX2",
 296    which was packaged into SSE1; it is also present in the AMD MMX
 297    extension.  Mark the function as using "sse" so that we emit a real
 298    "emms" instruction, rather than the 3dNOW "femms" instruction.

        Returns a pointer to the first '\n', '\r', '\\' or '?' at or after
        S.  END is unused; the loop has no bounds check and stops only
        when one of those characters is matched.  */
 299
 300 static const uchar *
 301 #ifndef __SSE__
 302 __attribute__((__target__("sse")))
 303 #endif
 304 search_line_mmx (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 305 {
 306   typedef char v8qi __attribute__ ((__vector_size__ (8)));
 307   typedef int __m64 __attribute__ ((__vector_size__ (8), __may_alias__));
 308
       /* Only the first 8 bytes of each 16-byte repl_chars row are used.  */
 309   const v8qi repl_nl = *(const v8qi *)repl_chars[0];
 310   const v8qi repl_cr = *(const v8qi *)repl_chars[1];
 311   const v8qi repl_bs = *(const v8qi *)repl_chars[2];
 312   const v8qi repl_qm = *(const v8qi *)repl_chars[3];
 313
 314   unsigned int misalign, found, mask;
 315   const v8qi *p;
 316   v8qi data, t, c;
 317
 318   /* Align the source pointer.  While MMX doesn't generate unaligned data
 319      faults, this allows us to safely scan to the end of the buffer without
 320      reading beyond the end of the last page.  */
 321   misalign = (uintptr_t)s & 7;
 322   p = (const v8qi *)((uintptr_t)s & -8);
 323   data = *p;
 324
 325   /* Create a mask for the bytes that are valid within the first
 326      8-byte block.  The Idea here is that the AND with the mask
 327      within the loop is "free", since we need some AND or TEST
 328      insn in order to set the flags for the branch anyway.  */
 329   mask = -1u << misalign;
 330
 331   /* Main loop processing 8 bytes at a time.  The loop is entered at
          START so that the first, masked block is examined before any
          further load is made.  */
 332   goto start;
 333   do
 334     {
 335       data = *++p;
           /* Every byte of a later block is valid.  */
 336       mask = -1;
 337
 338     start:
 339       t = __builtin_ia32_pcmpeqb(data, repl_nl);
 340       c = __builtin_ia32_pcmpeqb(data, repl_cr);
 341       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
 342       c = __builtin_ia32_pcmpeqb(data, repl_bs);
 343       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
 344       c = __builtin_ia32_pcmpeqb(data, repl_qm);
 345       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
 346       found = __builtin_ia32_pmovmskb (t);
 347       found &= mask;
 348     }
 349   while (!found);
 350
 351   __builtin_ia32_emms ();
 352
 353   /* FOUND contains 1 in bits for which we matched a relevant
 354      character.  Conversion to the byte index is trivial.  */
 355   found = __builtin_ctz(found);
 356   return (const uchar *)p + found;
 357 }
 358
 359 /* A version of the fast scanner using SSE2 vectorized byte compare insns.

        Returns a pointer to the first '\n', '\r', '\\' or '?' at or after
        S.  END is unused; the loop has no bounds check and stops only
        when one of those characters is matched.  */
 360
 361 static const uchar *
 362 #ifndef __SSE2__
 363 __attribute__((__target__("sse2")))
 364 #endif
 365 search_line_sse2 (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 366 {
 367   typedef char v16qi __attribute__ ((__vector_size__ (16)));
 368
 369   const v16qi repl_nl = *(const v16qi *)repl_chars[0];
 370   const v16qi repl_cr = *(const v16qi *)repl_chars[1];
 371   const v16qi repl_bs = *(const v16qi *)repl_chars[2];
 372   const v16qi repl_qm = *(const v16qi *)repl_chars[3];
 373
 374   unsigned int misalign, found, mask;
 375   const v16qi *p;
 376   v16qi data, t;
 377
 378   /* Align the source pointer.  The first load therefore starts at or
          before S and may include bytes that precede it.  */
 379   misalign = (uintptr_t)s & 15;
 380   p = (const v16qi *)((uintptr_t)s & -16);
 381   data = *p;
 382
 383   /* Create a mask for the bytes that are valid within the first
 384      16-byte block.  The Idea here is that the AND with the mask
 385      within the loop is "free", since we need some AND or TEST
 386      insn in order to set the flags for the branch anyway.  */
 387   mask = -1u << misalign;
 388
 389   /* Main loop processing 16 bytes at a time.  The loop is entered at
          START so that the first, masked block is examined before any
          further load is made.  */
 390   goto start;
 391   do
 392     {
 393       data = *++p;
           /* Every byte of a later block is valid.  */
 394       mask = -1;
 395
 396     start:
 397       t  = data == repl_nl;
 398       t |= data == repl_cr;
 399       t |= data == repl_bs;
 400       t |= data == repl_qm;
 401       found = __builtin_ia32_pmovmskb128 (t);
 402       found &= mask;
 403     }
 404   while (!found);
 405
 406   /* FOUND contains 1 in bits for which we matched a relevant
 407      character.  Conversion to the byte index is trivial.  */
 408   found = __builtin_ctz(found);
 409   return (const uchar *)p + found;
 410 }
 411
 412 #ifdef HAVE_SSE4
 413 /* A version of the fast scanner using SSE 4.2 vectorized string insns.

        Returns a pointer to the first '\n', '\r', '?' or '\\' at or after
        S.  END is consulted only to decide whether a 16-byte load from an
        unaligned S is safe; the main loop has no bounds check.  */
 414
 415 static const uchar *
 416 #ifndef __SSE4_2__
 417 __attribute__((__target__("sse4.2")))
 418 #endif
 419 search_line_sse42 (const uchar *s, const uchar *end)
 420 {
 421   typedef char v16qi __attribute__ ((__vector_size__ (16)));
       /* The four characters searched for; the remaining elements are
          zero-initialized and the length operand 4 is passed alongside.  */
 422   static const v16qi search = { '\n', '\r', '?', '\\' };
 423
 424   uintptr_t si = (uintptr_t)s;
 425   uintptr_t index;
 426
 427   /* Check for unaligned input.  */
 428   if (si & 15)
 429     {
 430       v16qi sv;
 431
 432       if (__builtin_expect (end - s < 16, 0)
 433           && __builtin_expect ((si & 0xfff) > 0xff0, 0))
 434         {
 435           /* There are less than 16 bytes left in the buffer, and less
 436              than 16 bytes left on the page.  Reading 16 bytes at this
 437              point might generate a spurious page fault.  Defer to the
 438              SSE2 implementation, which already handles alignment.  */
 439           return search_line_sse2 (s, end);
 440         }
 441
 442       /* ??? The builtin doesn't understand that the PCMPESTRI read from
 443          memory need not be aligned.  */
 444       sv = __builtin_ia32_loaddqu ((const char *) s);
 445       index = __builtin_ia32_pcmpestri128 (search, 4, sv, 16, 0);
 446
           /* An index below 16 is treated as a match inside this block.  */
 447       if (__builtin_expect (index < 16, 0))
 448         goto found;
 449
 450       /* Advance the pointer to an aligned address.  We will re-scan a
 451          few bytes, but we no longer need care for reading past the
 452          end of a page, since we're guaranteed a match.  */
 453       s = (const uchar *)((si + 15) & -16);
 454     }
 455
 456   /* Main loop, processing 16 bytes at a time.  */
 457 #ifdef __GCC_ASM_FLAG_OUTPUTS__
 458   while (1)
 459     {
 460       char f;
 461
 462       /* By using inline assembly instead of the builtin,
 463          we can use the result, as well as the flags set.  */
 464       __asm ("%vpcmpestri\t$0, %2, %3"
 465              : "=c"(index), "=@ccc"(f)
 466              : "m"(*s), "x"(search), "a"(4), "d"(16));
           /* F receives the carry flag; the loop ends once it is set.  */
 467       if (f)
 468         break;
 469
 470       s += 16;
 471     }
 472 #else
       /* The asm loop below adds 16 to S before each comparison, so
          compensate for the first iteration.  */
 473   s -= 16;
 474   /* By doing the whole loop in inline assembly,
 475      we can make proper use of the flags set.  */
 476   __asm (      ".balign 16\n"
 477         "0:     add $16, %1\n"
 478         "       %vpcmpestri\t$0, (%1), %2\n"
 479         "       jnc 0b"
 480         : "=&c"(index), "+r"(s)
 481         : "x"(search), "a"(4), "d"(16));
 482 #endif
 483
 484  found:
       /* INDEX is the offset of the match from S.  */
 485   return s + index;
 486 }
 487
 488 #else
 489 /* Work around out-dated assemblers without sse4 support.  */
 490 #define search_line_sse42 search_line_sse2
 491 #endif
 492
 493 /* Check the CPU capabilities.  */
 494
 495 #include "../gcc/config/i386/cpuid.h"
 496
     /* Signature shared by the search_line_* implementations above, and
        the pointer through which the selected one is called.  The
        pointer is assigned by init_vectorized_lexer.  */
 497 typedef const uchar * (*search_line_fast_type) (const uchar *, const uchar *);
 498 static search_line_fast_type search_line_fast;
 499
     /* Tells _cpp_init_lexer that init_vectorized_lexer is available.  */
 500 #define HAVE_init_vectorized_lexer 1
 501 static inline void
 502 init_vectorized_lexer (void)
 503 {
 504   unsigned dummy, ecx = 0, edx = 0;
 505   search_line_fast_type impl = search_line_acc_char;
 506   int minimum = 0;
 507
 508 #if defined(__SSE4_2__)
 509   minimum = 3;
 510 #elif defined(__SSE2__)
 511   minimum = 2;
 512 #elif defined(__SSE__)
 513   minimum = 1;
 514 #endif
 515
 516   if (minimum == 3)
 517     impl = search_line_sse42;
 518   else if (__get_cpuid (1, &dummy, &dummy, &ecx, &edx) || minimum == 2)
 519     {
 520       if (minimum == 3 || (ecx & bit_SSE4_2))
 521         impl = search_line_sse42;
 522       else if (minimum == 2 || (edx & bit_SSE2))
 523         impl = search_line_sse2;
 524       else if (minimum == 1 || (edx & bit_SSE))
 525         impl = search_line_mmx;
 526     }
 527   else if (__get_cpuid (0x80000001, &dummy, &dummy, &dummy, &edx))
 528     {
 529       if (minimum == 1
 530           || (edx & (bit_MMXEXT | bit_CMOV)) == (bit_MMXEXT | bit_CMOV))
 531         impl = search_line_mmx;
 532     }
 533
 534   search_line_fast = impl;
 535 }
 536
 537 #elif (GCC_VERSION >= 4005) && defined(_ARCH_PWR8) && defined(__ALTIVEC__)
 538
 539 /* A version of the fast scanner using AltiVec vectorized byte compares
 540    and VSX unaligned loads (when VSX is available).  This is otherwise
 541    the same as the AltiVec version.

        Returns a pointer to the first '\n', '\r', '\\' or '?' at or after
        S.  END is unused; the loop has no bounds check and stops only
        when one of those characters is matched.  Every load starts at S
        itself, so no bytes before S are ever examined.  */
 542
 543 ATTRIBUTE_NO_SANITIZE_UNDEFINED
 544 static const uchar *
 545 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 546 {
 547   typedef __attribute__((altivec(vector))) unsigned char vc;
 548
 549   const vc repl_nl = {
 550     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
 551     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
 552   };
 553   const vc repl_cr = {
 554     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
 555     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
 556   };
 557   const vc repl_bs = {
 558     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
 559     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
 560   };
 561   const vc repl_qm = {
 562     '?', '?', '?', '?', '?', '?', '?', '?',
 563     '?', '?', '?', '?', '?', '?', '?', '?',
 564   };
 565   const vc zero = { 0 };
 566
 567   vc data, t;
 568
 569   /* Main loop processing 16 bytes at a time.  */
 570   do
 571     {
 572       vc m_nl, m_cr, m_bs, m_qm;
 573
 574       data = __builtin_vec_vsx_ld (0, s);
 575       s += 16;
 576
 577       m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
 578       m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
 579       m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
 580       m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
 581       t = (m_nl | m_cr) | (m_bs | m_qm);
 582
 583       /* T now contains 0xff in bytes for which we matched one of the relevant
 584          characters.  We want to exit the loop if any byte in T is non-zero.
 585          Below is the expansion of vec_any_ne(t, zero).  */
 586     }
 587   while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));
 588
 589   /* Restore s to point to the 16 bytes we just processed.  */
 590   s -= 16;
 591
 592   {
       /* Number of unsigned longs that make up one vector.  */
 593 #define N  (sizeof(vc) / sizeof(long))
 594
 595     union {
 596       vc v;
 597       /* Statically assert that N is 2 or 4.  */
 598       unsigned long l[(N == 2 || N == 4) ? N : -1];
 599     } u;
 600     unsigned long l, i = 0;
 601
 602     u.v = t;
 603
 604     /* Find the first word of T that is non-zero.  S is advanced past
            every all-zero word that is skipped.  */
 605     switch (N)
 606       {
 607       case 4:
 608         l = u.l[i++];
 609         if (l != 0)
 610           break;
 611         s += sizeof(unsigned long);
 612         l = u.l[i++];
 613         if (l != 0)
 614           break;
 615         s += sizeof(unsigned long);
 616         /* FALLTHRU */
 617       case 2:
 618         l = u.l[i++];
 619         if (l != 0)
 620           break;
 621         s += sizeof(unsigned long);
             /* The loop exited on a match, so the last word is non-zero.  */
 622         l = u.l[i];
 623       }
 624
 625     /* L now contains 0xff in bytes for which we matched one of the
 626        relevant characters.  We can find the byte index by finding
 627        its bit index and dividing by 8.  */
 628 #ifdef __BIG_ENDIAN__
 629     l = __builtin_clzl(l) >> 3;
 630 #else
 631     l = __builtin_ctzl(l) >> 3;
 632 #endif
 633     return s + l;
 634
 635 #undef N
 636   }
 637 }
 638
 639 #elif (GCC_VERSION >= 4005) && defined(__ALTIVEC__) && defined (__BIG_ENDIAN__)
 640
 641 /* A version of the fast scanner using AltiVec vectorized byte compares.
 642    This cannot be used for little endian because vec_lvsl/lvsr are
 643    deprecated for little endian and the code won't work properly.

        Returns a pointer to the first '\n', '\r', '\\' or '?' at or after
        S.  END is unused; the loop has no bounds check and stops only
        when one of those characters is matched.  */
 644 /* ??? Unfortunately, attribute(target("altivec")) is not yet supported,
 645    so we can't compile this function without -maltivec on the command line
 646    (or implied by some other switch).  */
 647
 648 static const uchar *
 649 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 650 {
 651   typedef __attribute__((altivec(vector))) unsigned char vc;
 652
 653   const vc repl_nl = {
 654     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
 655     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
 656   };
 657   const vc repl_cr = {
 658     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
 659     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
 660   };
 661   const vc repl_bs = {
 662     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
 663     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
 664   };
 665   const vc repl_qm = {
 666     '?', '?', '?', '?', '?', '?', '?', '?',
 667     '?', '?', '?', '?', '?', '?', '?', '?',
 668   };
 669   const vc ones = {
 670     -1, -1, -1, -1, -1, -1, -1, -1,
 671     -1, -1, -1, -1, -1, -1, -1, -1,
 672   };
 673   const vc zero = { 0 };
 674
 675   vc data, mask, t;
 676
 677   /* Altivec loads automatically mask addresses with -16.  This lets us
 678      issue the first load as early as possible.  */
 679   data = __builtin_vec_ld(0, (const vc *)s);
 680
 681   /* Discard bytes before the beginning of the buffer.  Do this by
 682      beginning with all ones and shifting in zeros according to the
 683      mis-alignment.  The LVSR instruction pulls the exact shift we
 684      want from the address.  */
 685   mask = __builtin_vec_lvsr(0, s);
 686   mask = __builtin_vec_perm(zero, ones, mask);
 687   data &= mask;
 688
 689   /* While altivec loads mask addresses, we still need to align S so
 690      that the offset we compute at the end is correct.  */
 691   s = (const uchar *)((uintptr_t)s & -16);
 692
 693   /* Main loop processing 16 bytes at a time.  The loop is entered at
          START so that the first, masked block is examined before any
          further load is made.  */
 694   goto start;
 695   do
 696     {
 697       vc m_nl, m_cr, m_bs, m_qm;
 698
 699       s += 16;
 700       data = __builtin_vec_ld(0, (const vc *)s);
 701
 702     start:
 703       m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
 704       m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
 705       m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
 706       m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
 707       t = (m_nl | m_cr) | (m_bs | m_qm);
 708
 709       /* T now contains 0xff in bytes for which we matched one of the relevant
 710          characters.  We want to exit the loop if any byte in T is non-zero.
 711          Below is the expansion of vec_any_ne(t, zero).  */
 712     }
 713   while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));
 714
 715   {
       /* Number of unsigned longs that make up one vector.  */
 716 #define N  (sizeof(vc) / sizeof(long))
 717
 718     union {
 719       vc v;
 720       /* Statically assert that N is 2 or 4.  */
 721       unsigned long l[(N == 2 || N == 4) ? N : -1];
 722     } u;
 723     unsigned long l, i = 0;
 724
 725     u.v = t;
 726
 727     /* Find the first word of T that is non-zero.  S is advanced past
            every all-zero word that is skipped.  */
 728     switch (N)
 729       {
 730       case 4:
 731         l = u.l[i++];
 732         if (l != 0)
 733           break;
 734         s += sizeof(unsigned long);
 735         l = u.l[i++];
 736         if (l != 0)
 737           break;
 738         s += sizeof(unsigned long);
 739         /* FALLTHROUGH */
 740       case 2:
 741         l = u.l[i++];
 742         if (l != 0)
 743           break;
 744         s += sizeof(unsigned long);
             /* The loop exited on a match, so the last word is non-zero.  */
 745         l = u.l[i];
 746       }
 747
 748     /* L now contains 0xff in bytes for which we matched one of the
 749        relevant characters.  We can find the byte index by finding
 750        its bit index and dividing by 8.  This variant is built only for
            big-endian targets, hence the count of leading zeros.  */
 751     l = __builtin_clzl(l) >> 3;
 752     return s + l;
 753
 754 #undef N
 755   }
 756 }
 757
 758 #elif defined (__ARM_NEON) && defined (__ARM_64BIT_STATE)
 759 #include "arm_neon.h"
 760
 761 /* This doesn't have to be the exact page size, but no system may use
 762    a size smaller than this.  ARMv8 requires a minimum page size of
 763    4k.  The impact of being conservative here is a small number of
 764    cases will take the slightly slower entry path into the main
 765    loop.  */
 766
 767 #define AARCH64_MIN_PAGE_SIZE 4096
 768
     /* A version of the fast scanner using NEON vector compares, built when
        both __ARM_NEON and __ARM_64BIT_STATE are defined.

        Returns a pointer to the first '\n', '\r', '\\' or '?' at or after
        S.  END is unused; the main loop has no bounds check and stops only
        when one of those characters is matched.  */
 769 static const uchar *
 770 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 771 {
 772   const uint8x16_t repl_nl = vdupq_n_u8 ('\n');
 773   const uint8x16_t repl_cr = vdupq_n_u8 ('\r');
 774   const uint8x16_t repl_bs = vdupq_n_u8 ('\\');
 775   const uint8x16_t repl_qm = vdupq_n_u8 ('?');
       /* One distinct bit per byte within each 8-byte half; used to turn
          the 0xff/0x00 compare result into a 16-bit mask.  */
 776   const uint8x16_t xmask = (uint8x16_t) vdupq_n_u64 (0x8040201008040201ULL);
 777
 778 #ifdef __ARM_BIG_ENDIAN
 779   const int16x8_t shift = {8, 8, 8, 8, 0, 0, 0, 0};
 780 #else
 781   const int16x8_t shift = {0, 0, 0, 0, 8, 8, 8, 8};
 782 #endif
 783
 784   unsigned int found;
 785   const uint8_t *p;
 786   uint8x16_t data;
 787   uint8x16_t t;
 788   uint16x8_t m;
 789   uint8x16_t u, v, w;
 790
 791   /* Align the source pointer.  */
 792   p = (const uint8_t *)((uintptr_t)s & -16);
 793
 794   /* Assuming random string start positions, with a 4k page size we'll take
 795      the slow path about 0.37% of the time.  */
 796   if (__builtin_expect ((AARCH64_MIN_PAGE_SIZE
 797                          - (((uintptr_t) s) & (AARCH64_MIN_PAGE_SIZE - 1)))
 798                         < 16, 0))
 799     {
 800       /* Slow path: the string starts near a possible page boundary.
              Load the aligned block at P instead and mask off the result
              bits that belong to bytes before S.  */
 801       uint32_t misalign, mask;
 802
 803       misalign = (uintptr_t)s & 15;
 804       mask = (-1u << misalign) & 0xffff;
 805       data = vld1q_u8 (p);
 806       t = vceqq_u8 (data, repl_nl);
 807       u = vceqq_u8 (data, repl_cr);
 808       v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
 809       w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
 810       t = vorrq_u8 (v, w);
 811       t = vandq_u8 (t, xmask);
 812       m = vpaddlq_u8 (t);
 813       m = vshlq_u16 (m, shift);
 814       found = vaddvq_u16 (m);
 815       found &= mask;
 816       if (found)
 817         return (const uchar*)p + __builtin_ctz (found);
 818     }
 819   else
 820     {
           /* Fast path: load 16 bytes starting at S itself, so nothing
              before S is examined.  */
 821       data = vld1q_u8 ((const uint8_t *) s);
 822       t = vceqq_u8 (data, repl_nl);
 823       u = vceqq_u8 (data, repl_cr);
 824       v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
 825       w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
 826       t = vorrq_u8 (v, w);
 827       if (__builtin_expect (vpaddd_u64 ((uint64x2_t)t) != 0, 0))
 828         goto done;
 829     }
 830
       /* Main loop: aligned 16-byte blocks following P.  After the fast
          path the first of these may overlap bytes already checked.  */
 831   do
 832     {
 833       p += 16;
 834       data = vld1q_u8 (p);
 835       t = vceqq_u8 (data, repl_nl);
 836       u = vceqq_u8 (data, repl_cr);
 837       v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
 838       w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
 839       t = vorrq_u8 (v, w);
 840     } while (!vpaddd_u64 ((uint64x2_t)t));
 841
 842 done:
 843   /* Now that we've found the terminating substring, work out precisely where
 844      we need to stop.  When we arrive here straight from the fast path, P
          is still at or below S and the match offset is relative to S;
          otherwise it is relative to P.  */
 845   t = vandq_u8 (t, xmask);
 846   m = vpaddlq_u8 (t);
 847   m = vshlq_u16 (m, shift);
 848   found = vaddvq_u16 (m);
 849   return (((((uintptr_t) p) < (uintptr_t) s) ? s : (const uchar *)p)
 850           + __builtin_ctz (found));
 851 }
 852
 853 #elif defined (__ARM_NEON)
 854 #include "arm_neon.h"
 855
 /* A version of the fast scanner using NEON vector compares, built when
    __ARM_NEON is defined without __ARM_64BIT_STATE.

    Returns a pointer to the first '\n', '\r', '\\' or '?' at or after
    S.  END is unused; the loop has no bounds check and stops only when
    one of those characters is matched.  */
 856 static const uchar *
 857 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 858 {
 859   const uint8x16_t repl_nl = vdupq_n_u8 ('\n');
 860   const uint8x16_t repl_cr = vdupq_n_u8 ('\r');
 861   const uint8x16_t repl_bs = vdupq_n_u8 ('\\');
 862   const uint8x16_t repl_qm = vdupq_n_u8 ('?');
       /* One distinct bit per byte within each 8-byte half; used to turn
          the 0xff/0x00 compare result into a bit mask.  */
 863   const uint8x16_t xmask = (uint8x16_t) vdupq_n_u64 (0x8040201008040201ULL);
 864
 865   unsigned int misalign, found, mask;
 866   const uint8_t *p;
 867   uint8x16_t data;
 868
 869   /* Align the source pointer.  */
 870   misalign = (uintptr_t)s & 15;
 871   p = (const uint8_t *)((uintptr_t)s & -16);
 872   data = vld1q_u8 (p);
 873
 874   /* Create a mask for the bytes that are valid within the first
 875      16-byte block.  The Idea here is that the AND with the mask
 876      within the loop is "free", since we need some AND or TEST
 877      insn in order to set the flags for the branch anyway.  */
 878   mask = (-1u << misalign) & 0xffff;
 879
 880   /* Main loop, processing 16 bytes at a time.  The loop is entered at
          START so that the first, masked block is examined before any
          further load is made.  */
 881   goto start;
 882
 883   do
 884     {
 885       uint8x8_t l;
 886       uint16x4_t m;
 887       uint32x2_t n;
 888       uint8x16_t t, u, v, w;
 889
 890       p += 16;
 891       data = vld1q_u8 (p);
           /* Every byte of a later block is valid.  */
 892       mask = 0xffff;
 893
 894     start:
 895       t = vceqq_u8 (data, repl_nl);
 896       u = vceqq_u8 (data, repl_cr);
 897       v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
 898       w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
 899       t = vandq_u8 (vorrq_u8 (v, w), xmask);
 900       l = vpadd_u8 (vget_low_u8 (t), vget_high_u8 (t));
 901       m = vpaddl_u8 (l);
 902       n = vpaddl_u16 (m);
 903
 904       found = vget_lane_u32 ((uint32x2_t) vorr_u64 ((uint64x1_t) n,
 905               vshr_n_u64 ((uint64x1_t) n, 24)), 0);
 906       found &= mask;
 907     }
 908   while (!found);
 909
 910   /* FOUND contains 1 in bits for which we matched a relevant
 911      character.  Conversion to the byte index is trivial.  */
 912   found = __builtin_ctz (found);
 913   return (const uchar *)p + found;
 914 }
 915
 916 #else
 917
/* None of the vectorized variants selected by the preceding
   conditionals is available, so there is only one accelerated
   alternative.  Make search_line_fast a plain alias for it: a direct
   call, rather than a call through a pointer, encourages inlining.  */

#define search_line_fast  search_line_acc_char
 922
 923 #endif
 924
/* Initialize the lexer if needed.  The only work done here is the call
   to init_vectorized_lexer, and only in configurations that define
   HAVE_init_vectorized_lexer; otherwise this function does nothing.  */

void
_cpp_init_lexer (void)
{
#ifdef HAVE_init_vectorized_lexer
  init_vectorized_lexer ();
#endif
}
 934
/* Returns with a logical line that contains no escaped newlines or
   trigraphs.  This is a time-critical inner loop.

   The line starts at PFILE->buffer->next_line.  On return, the buffer's
   cur and line_base point at the start of the line, the cleaned line is
   terminated by a '\n' stored in place, a sentinel note has been added
   just past that terminator, and next_line points just past the line
   ending that was consumed.  Trigraphs and escaped newlines found along
   the way are recorded with add_line_note.  The buffer is modified in
   place: S is the read cursor and D the write cursor, and nothing is
   written until the first escaped newline or converted trigraph forces
   the slow path.  */
void
_cpp_clean_line (cpp_reader *pfile)
{
  cpp_buffer *buffer;
  const uchar *s;
  uchar c, *d, *p;

  buffer = pfile->buffer;
  /* Start a fresh set of line notes for this line.  */
  buffer->cur_note = buffer->notes_used = 0;
  buffer->cur = buffer->line_base = buffer->next_line;
  buffer->need_line = false;
  s = buffer->next_line;

  if (!buffer->from_stage3)
    {
      /* Most recent backslash seen on the fast path, if any.  */
      const uchar *pbackslash = NULL;

      /* Fast path.  This is the common case of an un-escaped line with
         no trigraphs.  The primary win here is by not writing any
         data back to memory until we have to.  */
      while (1)
        {
          /* Perform an optimized search for \n, \r, \\, ?.  */
          s = search_line_fast (s, buffer->rlimit);

          c = *s;
          if (c == '\\')
            {
              /* Record the location of the backslash and continue.  */
              pbackslash = s++;
            }
          else if (__builtin_expect (c == '?', 0))
            {
              if (__builtin_expect (s[1] == '?', false)
                   && _cpp_trigraph_map[s[2]])
                {
                  /* Have a trigraph.  We may or may not have to convert
                     it.  Add a line note regardless, for -Wtrigraphs.  */
                  add_line_note (buffer, s, s[2]);
                  if (CPP_OPTION (pfile, trigraphs))
                    {
                      /* We do, and that means we have to switch to the
                         slow path.  Overwrite the first '?' with the
                         replacement character and leave S on the last
                         byte of the trigraph, so that the pre-increment
                         in the slow path resumes just after it.  */
                      d = (uchar *) s;
                      *d = _cpp_trigraph_map[s[2]];
                      s += 2;
                      goto slow_path;
                    }
                }
              /* Not a trigraph.  Continue on fast-path.  */
              s++;
            }
          else
            break;
        }

      /* This must be \r or \n.  We're either done, or we'll be forced
         to write back to the buffer and continue on the slow path.  */
      d = (uchar *) s;

      if (__builtin_expect (s == buffer->rlimit, false))
        goto done;

      /* DOS line ending? */
      if (__builtin_expect (c == '\r', false) && s[1] == '\n')
        {
          s++;
          if (s == buffer->rlimit)
            goto done;
        }

      /* Without a backslash on the line there cannot be an escaped
         newline.  */
      if (__builtin_expect (pbackslash == NULL, true))
        goto done;

      /* Check for escaped newline: walk back from the line ending over
         any horizontal whitespace and see whether the character before
         it is the last backslash we recorded.  */
      p = d;
      while (is_nvspace (p[-1]))
        p--;
      if (p - 1 != pbackslash)
        goto done;

      /* Have an escaped newline; process it and proceed to
         the slow path.  The note type records whether whitespace
         separated the backslash from the newline.  D is set to the byte
         before the backslash so that the next store in the slow path
         overwrites the backslash itself.  next_line is borrowed as a
         lower bound for the backward scan in the slow path; it gets its
         final value at DONE.  */
      add_line_note (buffer, p - 1, p != d ? ' ' : '\\');
      d = p - 2;
      buffer->next_line = p - 1;

    slow_path:
      /* Copy bytes from S down to D, dropping escaped newlines and
         replacing trigraphs when they are enabled.  */
      while (1)
        {
          c = *++s;
          *++d = c;

          if (c == '\n' || c == '\r')
            {
              /* Handle DOS line endings.  */
              if (c == '\r' && s != buffer->rlimit && s[1] == '\n')
                s++;
              if (s == buffer->rlimit)
                break;

              /* Escaped?  Do not look back past the previous escaped
                 newline, whose position is held in next_line.  */
              p = d;
              while (p != buffer->next_line && is_nvspace (p[-1]))
                p--;
              if (p == buffer->next_line || p[-1] != '\\')
                break;

              add_line_note (buffer, p - 1, p != d ? ' ': '\\');
              d = p - 2;
              buffer->next_line = p - 1;
            }
          else if (c == '?' && s[1] == '?' && _cpp_trigraph_map[s[2]])
            {
              /* Add a note regardless, for the benefit of -Wtrigraphs.  */
              add_line_note (buffer, d, s[2]);
              if (CPP_OPTION (pfile, trigraphs))
                {
                  /* Replace the '?' just copied and skip the remaining
                     two bytes of the trigraph.  */
                  *d = _cpp_trigraph_map[s[2]];
                  s += 2;
                }
            }
        }
    }
  else
    {
      /* from_stage3 buffers get no trigraph or escaped-newline
         processing; just locate the end of the line.  */
      while (*s != '\n' && *s != '\r')
        s++;
      d = (uchar *) s;

      /* Handle DOS line endings.  */
      if (*s == '\r' && s + 1 != buffer->rlimit && s[1] == '\n')
        s++;
    }

 done:
  /* Terminate the cleaned line; D may be behind S if bytes were
     removed.  */
  *d = '\n';
  /* A sentinel note that should never be processed.  */
  add_line_note (buffer, d + 1, '\n');
  buffer->next_line = s + 1;
}
1078
1079 /* Return true if the trigraph indicated by NOTE should be warned
1080    about in a comment.  */
1081 static bool
1082 warn_in_comment (cpp_reader *pfile, _cpp_line_note *note)
1083 {
1084   const uchar *p;
1085
1086   /* Within comments we don't warn about trigraphs, unless the
1087      trigraph forms an escaped newline, as that may change
1088      behavior.  */
1089   if (note->type != '/')
1090     return false;
1091
1092   /* If -trigraphs, then this was an escaped newline iff the next note
1093      is coincident.  */
1094   if (CPP_OPTION (pfile, trigraphs))
1095     return note[1].pos == note->pos;
1096
1097   /* Otherwise, see if this forms an escaped newline.  */
1098   p = note->pos + 3;
1099   while (is_nvspace (*p))
1100     p++;
1101
1102   /* There might have been escaped newlines between the trigraph and the
1103      newline we found.  Hence the position test.  */
1104   return (*p == '\n' && p < note[1].pos);
1105 }
1106
1107 /* Process the notes created by add_line_note as far as the current
1108    location.  */
1109 void
1110 _cpp_process_line_notes (cpp_reader *pfile, int in_comment)
1111 {
1112   cpp_buffer *buffer = pfile->buffer;
1113
1114   for (;;)
1115     {
1116       _cpp_line_note *note = &buffer->notes[buffer->cur_note];
1117       unsigned int col;
1118
1119       if (note->pos > buffer->cur)
1120         break;
1121
1122       buffer->cur_note++;
1123       col = CPP_BUF_COLUMN (buffer, note->pos + 1);
1124
1125       if (note->type == '\\' || note->type == ' ')
1126         {
1127           if (note->type == ' ' && !in_comment)
1128             cpp_error_with_line (pfile, CPP_DL_WARNING, pfile->line_table->highest_line, col,
1129                                  "backslash and newline separated by space");
1130
1131           if (buffer->next_line > buffer->rlimit)
1132             {
1133               cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line, col,
1134                                    "backslash-newline at end of file");
1135               /* Prevent "no newline at end of file" warning.  */
1136               buffer->next_line = buffer->rlimit;
1137             }
1138
1139           buffer->line_base = note->pos;
1140           CPP_INCREMENT_LINE (pfile, 0);
1141         }
1142       else if (_cpp_trigraph_map[note->type])
1143         {
1144           if (CPP_OPTION (pfile, warn_trigraphs)
1145               && (!in_comment || warn_in_comment (pfile, note)))
1146             {
1147               if (CPP_OPTION (pfile, trigraphs))
1148                 cpp_warning_with_line (pfile, CPP_W_TRIGRAPHS,
1149                                        pfile->line_table->highest_line, col,
1150                                        "trigraph ??%c converted to %c",
1151                                        note->type,
1152                                        (int) _cpp_trigraph_map[note->type]);
1153               else
1154                 {
1155                   cpp_warning_with_line
1156                     (pfile, CPP_W_TRIGRAPHS,
1157                      pfile->line_table->highest_line, col,
1158                      "trigraph ??%c ignored, use -trigraphs to enable",
1159                      note->type);
1160                 }
1161             }
1162         }
1163       else if (note->type == 0)
1164         /* Already processed in lex_raw_string.  */;
1165       else
1166         abort ();
1167     }
1168 }
1169
1170 namespace bidi {
1171   enum class kind {
1172     NONE, LRE, RLE, LRO, RLO, LRI, RLI, FSI, PDF, PDI, LTR, RTL
1173   };
1174
1175   /* All the UTF-8 encodings of bidi characters start with E2.  */
1176   constexpr uchar utf8_start = 0xe2;
1177
1178   struct context
1179   {
1180     context () {}
1181     context (location_t loc, kind k, bool pdf, bool ucn)
1182     : m_loc (loc), m_kind (k), m_pdf (pdf), m_ucn (ucn)
1183     {
1184     }
1185
1186     kind get_pop_kind () const
1187     {
1188       return m_pdf ? kind::PDF : kind::PDI;
1189     }
1190     bool ucn_p () const
1191     {
1192       return m_ucn;
1193     }
1194
1195     location_t m_loc;
1196     kind m_kind;
1197     unsigned m_pdf : 1;
1198     unsigned m_ucn : 1;
1199   };
1200
1201   /* A vector holding currently open bidi contexts.  We use a char for
1202      each context, its LSB is 1 if it represents a PDF context, 0 if it
1203      represents a PDI context.  The next bit is 1 if this context was open
1204      by a bidi character written as a UCN, and 0 when it was UTF-8.  */
1205   semi_embedded_vec <context, 16> vec;
1206
1207   /* Close the whole comment/identifier/string literal/character constant
1208      context.  */
1209   void on_close ()
1210   {
1211     vec.truncate (0);
1212   }
1213
1214   /* Pop the last element in the vector.  */
1215   void pop ()
1216   {
1217     unsigned int len = vec.count ();
1218     gcc_checking_assert (len > 0);
1219     vec.truncate (len - 1);
1220   }
1221
1222   /* Return the pop kind of the context of the Ith element.  */
1223   kind pop_kind_at (unsigned int i)
1224   {
1225     return vec[i].get_pop_kind ();
1226   }
1227
1228   /* Return the pop kind of the context that is currently opened.  */
1229   kind current_ctx ()
1230   {
1231     unsigned int len = vec.count ();
1232     if (len == 0)
1233       return kind::NONE;
1234     return vec[len - 1].get_pop_kind ();
1235   }
1236
1237   /* Return true if the current context comes from a UCN origin, that is,
1238      the bidi char which started this bidi context was written as a UCN.  */
1239   bool current_ctx_ucn_p ()
1240   {
1241     unsigned int len = vec.count ();
1242     gcc_checking_assert (len > 0);
1243     return vec[len - 1].m_ucn;
1244   }
1245
1246   location_t current_ctx_loc ()
1247   {
1248     unsigned int len = vec.count ();
1249     gcc_checking_assert (len > 0);
1250     return vec[len - 1].m_loc;
1251   }
1252
1253   /* We've read a bidi char, update the current vector as necessary.
1254      LOC is only valid when K is not kind::NONE.  */
1255   void on_char (kind k, bool ucn_p, location_t loc)
1256   {
1257     switch (k)
1258       {
1259       case kind::LRE:
1260       case kind::RLE:
1261       case kind::LRO:
1262       case kind::RLO:
1263         vec.push (context (loc, k, true, ucn_p));
1264         break;
1265       case kind::LRI:
1266       case kind::RLI:
1267       case kind::FSI:
1268         vec.push (context (loc, k, false, ucn_p));
1269         break;
1270       /* PDF terminates the scope of the last LRE, RLE, LRO, or RLO
1271          whose scope has not yet been terminated.  */
1272       case kind::PDF:
1273         if (current_ctx () == kind::PDF)
1274           pop ();
1275         break;
1276       /* PDI terminates the scope of the last LRI, RLI, or FSI whose
1277          scope has not yet been terminated, as well as the scopes of
1278          any subsequent LREs, RLEs, LROs, or RLOs whose scopes have not
1279          yet been terminated.  */
1280       case kind::PDI:
1281         for (int i = vec.count () - 1; i >= 0; --i)
1282           if (pop_kind_at (i) == kind::PDI)
1283             {
1284               vec.truncate (i);
1285               break;
1286             }
1287         break;
1288       case kind::LTR:
1289       case kind::RTL:
1290         /* These aren't popped by a PDF/PDI.  */
1291         break;
1292       ATTR_LIKELY case kind::NONE:
1293         break;
1294       default:
1295         abort ();
1296       }
1297   }
1298
1299   /* Return a descriptive string for K.  */
1300   const char *to_str (kind k)
1301   {
1302     switch (k)
1303       {
1304       case kind::LRE:
1305         return "U+202A (LEFT-TO-RIGHT EMBEDDING)";
1306       case kind::RLE:
1307         return "U+202B (RIGHT-TO-LEFT EMBEDDING)";
1308       case kind::LRO:
1309         return "U+202D (LEFT-TO-RIGHT OVERRIDE)";
1310       case kind::RLO:
1311         return "U+202E (RIGHT-TO-LEFT OVERRIDE)";
1312       case kind::LRI:
1313         return "U+2066 (LEFT-TO-RIGHT ISOLATE)";
1314       case kind::RLI:
1315         return "U+2067 (RIGHT-TO-LEFT ISOLATE)";
1316       case kind::FSI:
1317         return "U+2068 (FIRST STRONG ISOLATE)";
1318       case kind::PDF:
1319         return "U+202C (POP DIRECTIONAL FORMATTING)";
1320       case kind::PDI:
1321         return "U+2069 (POP DIRECTIONAL ISOLATE)";
1322       case kind::LTR:
1323         return "U+200E (LEFT-TO-RIGHT MARK)";
1324       case kind::RTL:
1325         return "U+200F (RIGHT-TO-LEFT MARK)";
1326       default:
1327         abort ();
1328       }
1329   }
1330 }
1331
1332 /* Get location_t for the range of bytes [START, START + NUM_BYTES)
1333    within the current line in FILE, with the caret at START.  */
1334
1335 static location_t
1336 get_location_for_byte_range_in_cur_line (cpp_reader *pfile,
1337                                          const unsigned char *const start,
1338                                          size_t num_bytes)
1339 {
1340   gcc_checking_assert (num_bytes > 0);
1341
1342   /* CPP_BUF_COLUMN and linemap_position_for_column both refer
1343      to offsets in bytes, but CPP_BUF_COLUMN is 0-based,
1344      whereas linemap_position_for_column is 1-based.  */
1345
1346   /* Get 0-based offsets within the line.  */
1347   size_t start_offset = CPP_BUF_COLUMN (pfile->buffer, start);
1348   size_t end_offset = start_offset + num_bytes - 1;
1349
1350   /* Now convert to location_t, where "columns" are 1-based byte offsets.  */
1351   location_t start_loc = linemap_position_for_column (pfile->line_table,
1352                                                       start_offset + 1);
1353   location_t end_loc = linemap_position_for_column (pfile->line_table,
1354                                                      end_offset + 1);
1355
1356   if (start_loc == end_loc)
1357     return start_loc;
1358
1359   source_range src_range;
1360   src_range.m_start = start_loc;
1361   src_range.m_finish = end_loc;
1362   location_t combined_loc = COMBINE_LOCATION_DATA (pfile->line_table,
1363                                                    start_loc,
1364                                                    src_range,
1365                                                    NULL,
1366                                                    0);
1367   return combined_loc;
1368 }
1369
1370 /* Parse a sequence of 3 bytes starting with P and return its bidi code.  */
1371
1372 static bidi::kind
1373 get_bidi_utf8_1 (const unsigned char *const p)
1374 {
1375   gcc_checking_assert (p[0] == bidi::utf8_start);
1376
1377   if (p[1] == 0x80)
1378     switch (p[2])
1379       {
1380       case 0xaa:
1381         return bidi::kind::LRE;
1382       case 0xab:
1383         return bidi::kind::RLE;
1384       case 0xac:
1385         return bidi::kind::PDF;
1386       case 0xad:
1387         return bidi::kind::LRO;
1388       case 0xae:
1389         return bidi::kind::RLO;
1390       case 0x8e:
1391         return bidi::kind::LTR;
1392       case 0x8f:
1393         return bidi::kind::RTL;
1394       default:
1395         break;
1396       }
1397   else if (p[1] == 0x81)
1398     switch (p[2])
1399       {
1400       case 0xa6:
1401         return bidi::kind::LRI;
1402       case 0xa7:
1403         return bidi::kind::RLI;
1404       case 0xa8:
1405         return bidi::kind::FSI;
1406       case 0xa9:
1407         return bidi::kind::PDI;
1408       default:
1409         break;
1410       }
1411
1412   return bidi::kind::NONE;
1413 }
1414
1415 /* Parse a sequence of 3 bytes starting with P and return its bidi code.
1416    If the kind is not NONE, write the location to *OUT.*/
1417
1418 static bidi::kind
1419 get_bidi_utf8 (cpp_reader *pfile, const unsigned char *const p, location_t *out)
1420 {
1421   bidi::kind result = get_bidi_utf8_1 (p);
1422   if (result != bidi::kind::NONE)
1423     {
1424       /* We have a sequence of 3 bytes starting at P.  */
1425       *out = get_location_for_byte_range_in_cur_line (pfile, p, 3);
1426     }
1427   return result;
1428 }
1429
1430 /* Parse a UCN where P points just past \u or \U and return its bidi code.  */
1431
1432 static bidi::kind
1433 get_bidi_ucn_1 (const unsigned char *p, bool is_U, const unsigned char **end)
1434 {
1435   /* 6.4.3 Universal Character Names
1436       \u hex-quad
1437       \U hex-quad hex-quad
1438       \u { simple-hexadecimal-digit-sequence }
1439      where \unnnn means \U0000nnnn.  */
1440
1441   *end = p + 4;
1442   if (is_U)
1443     {
1444       if (p[0] != '0' || p[1] != '0' || p[2] != '0' || p[3] != '0')
1445         return bidi::kind::NONE;
1446       /* Skip 4B so we can treat \u and \U the same below.  */
1447       p += 4;
1448       *end += 4;
1449     }
1450   else if (p[0] == '{')
1451     {
1452       p++;
1453       while (*p == '0')
1454         p++;
1455       if (p[0] != '2'
1456           || p[1] != '0'
1457           || !ISXDIGIT (p[2])
1458           || !ISXDIGIT (p[3])
1459           || p[4] != '}')
1460         return bidi::kind::NONE;
1461       *end = p + 5;
1462     }
1463
1464   /* All code points we are looking for start with 20xx.  */
1465   if (p[0] != '2' || p[1] != '0')
1466     return bidi::kind::NONE;
1467   else if (p[2] == '2')
1468     switch (p[3])
1469       {
1470       case 'a':
1471       case 'A':
1472         return bidi::kind::LRE;
1473       case 'b':
1474       case 'B':
1475         return bidi::kind::RLE;
1476       case 'c':
1477       case 'C':
1478         return bidi::kind::PDF;
1479       case 'd':
1480       case 'D':
1481         return bidi::kind::LRO;
1482       case 'e':
1483       case 'E':
1484         return bidi::kind::RLO;
1485       default:
1486         break;
1487       }
1488   else if (p[2] == '6')
1489     switch (p[3])
1490       {
1491       case '6':
1492         return bidi::kind::LRI;
1493       case '7':
1494         return bidi::kind::RLI;
1495       case '8':
1496         return bidi::kind::FSI;
1497       case '9':
1498         return bidi::kind::PDI;
1499       default:
1500         break;
1501       }
1502   else if (p[2] == '0')
1503     switch (p[3])
1504       {
1505       case 'e':
1506       case 'E':
1507         return bidi::kind::LTR;
1508       case 'f':
1509       case 'F':
1510         return bidi::kind::RTL;
1511       default:
1512         break;
1513       }
1514
1515   return bidi::kind::NONE;
1516 }
1517
1518 /* Parse a UCN where P points just past \u or \U and return its bidi code.
1519    If the kind is not NONE, write the location to *OUT.  */
1520
1521 static bidi::kind
1522 get_bidi_ucn (cpp_reader *pfile, const unsigned char *p, bool is_U,
1523               location_t *out)
1524 {
1525   const unsigned char *end;
1526   bidi::kind result = get_bidi_ucn_1 (p, is_U, &end);
1527   if (result != bidi::kind::NONE)
1528     {
1529       const unsigned char *start = p - 2;
1530       size_t num_bytes = end - start;
1531       *out = get_location_for_byte_range_in_cur_line (pfile, start, num_bytes);
1532     }
1533   return result;
1534 }
1535
1536 /* Parse a named universal character escape where P points just past \N and
1537    return its bidi code.  If the kind is not NONE, write the location to
1538    *OUT.  */
1539
1540 static bidi::kind
1541 get_bidi_named (cpp_reader *pfile, const unsigned char *p, location_t *out)
1542 {
1543   bidi::kind result = bidi::kind::NONE;
1544   if (*p != '{')
1545     return bidi::kind::NONE;
1546   if (strncmp ((const char *) (p + 1), "LEFT-TO-RIGHT ", 14) == 0)
1547     {
1548       if (strncmp ((const char *) (p + 15), "MARK}", 5) == 0)
1549         result = bidi::kind::LTR;
1550       else if (strncmp ((const char *) (p + 15), "EMBEDDING}", 10) == 0)
1551         result = bidi::kind::LRE;
1552       else if (strncmp ((const char *) (p + 15), "OVERRIDE}", 9) == 0)
1553         result = bidi::kind::LRO;
1554       else if (strncmp ((const char *) (p + 15), "ISOLATE}", 8) == 0)
1555         result = bidi::kind::LRI;
1556     }
1557   else if (strncmp ((const char *) (p + 1), "RIGHT-TO-LEFT ", 14) == 0)
1558     {
1559       if (strncmp ((const char *) (p + 15), "MARK}", 5) == 0)
1560         result = bidi::kind::RTL;
1561       else if (strncmp ((const char *) (p + 15), "EMBEDDING}", 10) == 0)
1562         result = bidi::kind::RLE;
1563       else if (strncmp ((const char *) (p + 15), "OVERRIDE}", 9) == 0)
1564         result = bidi::kind::RLO;
1565       else if (strncmp ((const char *) (p + 15), "ISOLATE}", 8) == 0)
1566         result = bidi::kind::RLI;
1567     }
1568   else if (strncmp ((const char *) (p + 1), "POP DIRECTIONAL ", 16) == 0)
1569     {
1570       if (strncmp ((const char *) (p + 16), "FORMATTING}", 11) == 0)
1571         result = bidi::kind::PDF;
1572       else if (strncmp ((const char *) (p + 16), "ISOLATE}", 8) == 0)
1573         result = bidi::kind::PDI;
1574     }
1575   else if (strncmp ((const char *) (p + 1), "FIRST STRONG ISOLATE}", 21) == 0)
1576     result = bidi::kind::FSI;
1577   if (result != bidi::kind::NONE)
1578     *out = get_location_for_byte_range_in_cur_line (pfile, p - 2,
1579                                                     (strchr ((const char *)
1580                                                              (p + 1), '}')
1581                                                      - (const char *) p)
1582                                                     + 3);
1583   return result;
1584 }
1585
1586 /* Subclass of rich_location for reporting on unpaired UTF-8
1587    bidirectional control character(s).
1588    Escape the source lines on output, and show all unclosed
1589    bidi context, labelling everything.  */
1590
1591 class unpaired_bidi_rich_location : public rich_location
1592 {
1593  public:
1594   class custom_range_label : public range_label
1595   {
1596    public:
1597      label_text get_text (unsigned range_idx) const final override
1598      {
1599        /* range 0 is the primary location; each subsequent range i + 1
1600           is for bidi::vec[i].  */
1601        if (range_idx > 0)
1602          {
1603            const bidi::context &ctxt (bidi::vec[range_idx - 1]);
1604            return label_text::borrow (bidi::to_str (ctxt.m_kind));
1605          }
1606        else
1607          return label_text::borrow (_("end of bidirectional context"));
1608      }
1609   };
1610
1611   unpaired_bidi_rich_location (cpp_reader *pfile, location_t loc)
1612   : rich_location (pfile->line_table, loc, &m_custom_label)
1613   {
1614     set_escape_on_output (true);
1615     for (unsigned i = 0; i < bidi::vec.count (); i++)
1616       add_range (bidi::vec[i].m_loc,
1617                  SHOW_RANGE_WITHOUT_CARET,
1618                  &m_custom_label);
1619   }
1620
1621  private:
1622    custom_range_label m_custom_label;
1623 };
1624
1625 /* We're closing a bidi context, that is, we've encountered a newline,
1626    are closing a C-style comment, or are at the end of a string literal,
1627    character constant, or identifier.  Warn if this context was not
1628    properly terminated by a PDI or PDF.  P points to the last character
1629    in this context.  */
1630
1631 static void
1632 maybe_warn_bidi_on_close (cpp_reader *pfile, const uchar *p)
1633 {
1634   const auto warn_bidi = CPP_OPTION (pfile, cpp_warn_bidirectional);
1635   if (bidi::vec.count () > 0
1636       && (warn_bidi & bidirectional_unpaired
1637           && (!bidi::current_ctx_ucn_p ()
1638               || (warn_bidi & bidirectional_ucn))))
1639     {
1640       const location_t loc
1641         = linemap_position_for_column (pfile->line_table,
1642                                        CPP_BUF_COLUMN (pfile->buffer, p));
1643       unpaired_bidi_rich_location rich_loc (pfile, loc);
1644       /* cpp_callbacks doesn't yet have a way to handle singular vs plural
1645          forms of a diagnostic, so fake it for now.  */
1646       if (bidi::vec.count () > 1)
1647         cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
1648                         "unpaired UTF-8 bidirectional control characters "
1649                         "detected");
1650       else
1651         cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
1652                         "unpaired UTF-8 bidirectional control character "
1653                         "detected");
1654     }
1655   /* We're done with this context.  */
1656   bidi::on_close ();
1657 }
1658
1659 /* We're at the beginning or in the middle of an identifier/comment/string
1660    literal/character constant.  Warn if we've encountered a bidi character.
1661    KIND says which bidi control character it was; UCN_P is true iff this bidi
1662    control character was written as a UCN.  LOC is the location of the
1663    character, but is only valid if KIND != bidi::kind::NONE.  */
1664
1665 static void
1666 maybe_warn_bidi_on_char (cpp_reader *pfile, bidi::kind kind,
1667                          bool ucn_p, location_t loc)
1668 {
1669   if (__builtin_expect (kind == bidi::kind::NONE, 1))
1670     return;
1671
1672   const auto warn_bidi = CPP_OPTION (pfile, cpp_warn_bidirectional);
1673
1674   if (warn_bidi & (bidirectional_unpaired|bidirectional_any))
1675     {
1676       rich_location rich_loc (pfile->line_table, loc);
1677       rich_loc.set_escape_on_output (true);
1678
1679       /* It seems excessive to warn about a PDI/PDF that is closing
1680          an opened context because we've already warned about the
1681          opening character.  Except warn when we have a UCN x UTF-8
1682          mismatch, if UCN checking is enabled.  */
1683       if (kind == bidi::current_ctx ())
1684         {
1685           if (warn_bidi == (bidirectional_unpaired|bidirectional_ucn)
1686               && bidi::current_ctx_ucn_p () != ucn_p)
1687             {
1688               rich_loc.add_range (bidi::current_ctx_loc ());
1689               cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
1690                               "UTF-8 vs UCN mismatch when closing "
1691                               "a context by \"%s\"", bidi::to_str (kind));
1692             }
1693         }
1694       else if (warn_bidi & bidirectional_any
1695                && (!ucn_p || (warn_bidi & bidirectional_ucn)))
1696         {
1697           if (kind == bidi::kind::PDF || kind == bidi::kind::PDI)
1698             cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
1699                             "\"%s\" is closing an unopened context",
1700                             bidi::to_str (kind));
1701           else
1702             cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
1703                             "found problematic Unicode character \"%s\"",
1704                             bidi::to_str (kind));
1705         }
1706     }
1707   /* We're done with this context.  */
1708   bidi::on_char (kind, ucn_p, loc);
1709 }
1710
/* Byte-value thresholds used when checking UTF-8 below: a byte B is
   accepted as a continuation byte when
   utf8_continuation <= B < utf8_signifier, and a byte can only begin a
   multibyte sequence when it is >= utf8_signifier.  */
static const cppchar_t utf8_continuation = 0x80;
static const cppchar_t utf8_signifier = 0xC0;
1713
1714 /* Emit -Winvalid-utf8 warning on invalid UTF-8 character starting
1715    at PFILE->buffer->cur.  Return a pointer after the diagnosed
1716    invalid character.  */
1717
1718 static const uchar *
1719 _cpp_warn_invalid_utf8 (cpp_reader *pfile)
1720 {
1721   cpp_buffer *buffer = pfile->buffer;
1722   const uchar *cur = buffer->cur;
1723   bool pedantic = (CPP_PEDANTIC (pfile)
1724                    && CPP_OPTION (pfile, cpp_warn_invalid_utf8) == 2);
1725
1726   if (cur[0] < utf8_signifier
1727       || cur[1] < utf8_continuation || cur[1] >= utf8_signifier)
1728     {
1729       if (pedantic)
1730         cpp_error_with_line (pfile, CPP_DL_PEDWARN,
1731                              pfile->line_table->highest_line,
1732                              CPP_BUF_COL (buffer),
1733                              "invalid UTF-8 character <%x>",
1734                              cur[0]);
1735       else
1736         cpp_warning_with_line (pfile, CPP_W_INVALID_UTF8,
1737                                pfile->line_table->highest_line,
1738                                CPP_BUF_COL (buffer),
1739                                "invalid UTF-8 character <%x>",
1740                                cur[0]);
1741       return cur + 1;
1742     }
1743   else if (cur[2] < utf8_continuation || cur[2] >= utf8_signifier)
1744     {
1745       if (pedantic)
1746         cpp_error_with_line (pfile, CPP_DL_PEDWARN,
1747                              pfile->line_table->highest_line,
1748                              CPP_BUF_COL (buffer),
1749                              "invalid UTF-8 character <%x><%x>",
1750                              cur[0], cur[1]);
1751       else
1752         cpp_warning_with_line (pfile, CPP_W_INVALID_UTF8,
1753                                pfile->line_table->highest_line,
1754                                CPP_BUF_COL (buffer),
1755                                "invalid UTF-8 character <%x><%x>",
1756                                cur[0], cur[1]);
1757       return cur + 2;
1758     }
1759   else if (cur[3] < utf8_continuation || cur[3] >= utf8_signifier)
1760     {
1761       if (pedantic)
1762         cpp_error_with_line (pfile, CPP_DL_PEDWARN,
1763                              pfile->line_table->highest_line,
1764                              CPP_BUF_COL (buffer),
1765                              "invalid UTF-8 character <%x><%x><%x>",
1766                              cur[0], cur[1], cur[2]);
1767       else
1768         cpp_warning_with_line (pfile, CPP_W_INVALID_UTF8,
1769                                pfile->line_table->highest_line,
1770                                CPP_BUF_COL (buffer),
1771                                "invalid UTF-8 character <%x><%x><%x>",
1772                                cur[0], cur[1], cur[2]);
1773       return cur + 3;
1774     }
1775   else
1776     {
1777       if (pedantic)
1778         cpp_error_with_line (pfile, CPP_DL_PEDWARN,
1779                              pfile->line_table->highest_line,
1780                              CPP_BUF_COL (buffer),
1781                              "invalid UTF-8 character <%x><%x><%x><%x>",
1782                              cur[0], cur[1], cur[2], cur[3]);
1783       else
1784         cpp_warning_with_line (pfile, CPP_W_INVALID_UTF8,
1785                                pfile->line_table->highest_line,
1786                                CPP_BUF_COL (buffer),
1787                                "invalid UTF-8 character <%x><%x><%x><%x>",
1788                                cur[0], cur[1], cur[2], cur[3]);
1789       return cur + 4;
1790     }
1791 }
1792
1793 /* Helper function of *skip_*_comment and lex*_string.  For C,
1794    character at CUR[-1] with MSB set handle -Wbidi-chars* and
1795    -Winvalid-utf8 diagnostics and return pointer to first character
1796    that should be processed next.  */
1797
1798 static inline const uchar *
1799 _cpp_handle_multibyte_utf8 (cpp_reader *pfile, uchar c,
1800                             const uchar *cur, bool warn_bidi_p,
1801                             bool warn_invalid_utf8_p)
1802 {
1803   /* If this is a beginning of a UTF-8 encoding, it might be
1804      a bidirectional control character.  */
1805   if (c == bidi::utf8_start && warn_bidi_p)
1806     {
1807       location_t loc;
1808       bidi::kind kind = get_bidi_utf8 (pfile, cur - 1, &loc);
1809       maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
1810     }
1811   if (!warn_invalid_utf8_p)
1812     return cur;
1813   if (c >= utf8_signifier)
1814     {
1815       cppchar_t s;
1816       const uchar *pstr = cur - 1;
1817       if (_cpp_valid_utf8 (pfile, &pstr, pfile->buffer->rlimit, 0, NULL, &s)
1818           && s <= UCS_LIMIT)
1819         return pstr;
1820     }
1821   pfile->buffer->cur = cur - 1;
1822   return _cpp_warn_invalid_utf8 (pfile);
1823 }
1824
1825 /* Skip a C-style block comment.  We find the end of the comment by
1826    seeing if an asterisk is before every '/' we encounter.  Returns
1827    nonzero if comment terminated by EOF, zero otherwise.
1828
1829    Buffer->cur points to the initial asterisk of the comment.  */
1830 bool
1831 _cpp_skip_block_comment (cpp_reader *pfile)
1832 {
1833   cpp_buffer *buffer = pfile->buffer;
1834   const uchar *cur = buffer->cur;
1835   uchar c;
1836   const bool warn_bidi_p = pfile->warn_bidi_p ();
1837   const bool warn_invalid_utf8_p = CPP_OPTION (pfile, cpp_warn_invalid_utf8);
1838   const bool warn_bidi_or_invalid_utf8_p = warn_bidi_p | warn_invalid_utf8_p;
1839
1840   cur++;
1841   if (*cur == '/')
1842     cur++;
1843
1844   for (;;)
1845     {
1846       /* People like decorating comments with '*', so check for '/'
1847          instead for efficiency.  */
1848       c = *cur++;
1849
1850       if (c == '/')
1851         {
1852           if (cur[-2] == '*')
1853             {
1854               if (warn_bidi_p)
1855                 maybe_warn_bidi_on_close (pfile, cur);
1856               break;
1857             }
1858
1859           /* Warn about potential nested comments, but not if the '/'
1860              comes immediately before the true comment delimiter.
1861              Don't bother to get it right across escaped newlines.  */
1862           if (CPP_OPTION (pfile, warn_comments)
1863               && cur[0] == '*' && cur[1] != '/')
1864             {
1865               buffer->cur = cur;
1866               cpp_warning_with_line (pfile, CPP_W_COMMENTS,
1867                                      pfile->line_table->highest_line,
1868                                      CPP_BUF_COL (buffer),
1869                                      "\"/*\" within comment");
1870             }
1871         }
1872       else if (c == '\n')
1873         {
1874           unsigned int cols;
1875           buffer->cur = cur - 1;
1876           if (warn_bidi_p)
1877             maybe_warn_bidi_on_close (pfile, cur);
1878           _cpp_process_line_notes (pfile, true);
1879           if (buffer->next_line >= buffer->rlimit)
1880             return true;
1881           _cpp_clean_line (pfile);
1882
1883           cols = buffer->next_line - buffer->line_base;
1884           CPP_INCREMENT_LINE (pfile, cols);
1885
1886           cur = buffer->cur;
1887         }
1888       else if (__builtin_expect (c >= utf8_continuation, 0)
1889                && warn_bidi_or_invalid_utf8_p)
1890         cur = _cpp_handle_multibyte_utf8 (pfile, c, cur, warn_bidi_p,
1891                                           warn_invalid_utf8_p);
1892     }
1893
1894   buffer->cur = cur;
1895   _cpp_process_line_notes (pfile, true);
1896   return false;
1897 }
1898
1899 /* Skip a C++ line comment, leaving buffer->cur pointing to the
1900    terminating newline.  Handles escaped newlines.  Returns nonzero
1901    if a multiline comment.  */
1902 static int
1903 skip_line_comment (cpp_reader *pfile)
1904 {
1905   cpp_buffer *buffer = pfile->buffer;
1906   location_t orig_line = pfile->line_table->highest_line;
1907   const bool warn_bidi_p = pfile->warn_bidi_p ();
1908   const bool warn_invalid_utf8_p = CPP_OPTION (pfile, cpp_warn_invalid_utf8);
1909   const bool warn_bidi_or_invalid_utf8_p = warn_bidi_p | warn_invalid_utf8_p;
1910
1911   if (!warn_bidi_or_invalid_utf8_p)
1912     while (*buffer->cur != '\n')
1913       buffer->cur++;
1914   else if (!warn_invalid_utf8_p)
1915     {
1916       while (*buffer->cur != '\n'
1917              && *buffer->cur != bidi::utf8_start)
1918         buffer->cur++;
1919       if (__builtin_expect (*buffer->cur == bidi::utf8_start, 0))
1920         {
1921           while (*buffer->cur != '\n')
1922             {
1923               if (__builtin_expect (*buffer->cur == bidi::utf8_start, 0))
1924                 {
1925                   location_t loc;
1926                   bidi::kind kind = get_bidi_utf8 (pfile, buffer->cur, &loc);
1927                   maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
1928                 }
1929               buffer->cur++;
1930             }
1931           maybe_warn_bidi_on_close (pfile, buffer->cur);
1932         }
1933     }
1934   else
1935     {
1936       while (*buffer->cur != '\n')
1937         {
1938           if (*buffer->cur < utf8_continuation)
1939             {
1940               buffer->cur++;
1941               continue;
1942             }
1943           buffer->cur
1944             = _cpp_handle_multibyte_utf8 (pfile, *buffer->cur, buffer->cur + 1,
1945                                           warn_bidi_p, warn_invalid_utf8_p);
1946         }
1947       if (warn_bidi_p)
1948         maybe_warn_bidi_on_close (pfile, buffer->cur);
1949     }
1950
1951   _cpp_process_line_notes (pfile, true);
1952   return orig_line != pfile->line_table->highest_line;
1953 }
1954
1955 /* Skips whitespace, saving the next non-whitespace character.  */
1956 static void
1957 skip_whitespace (cpp_reader *pfile, cppchar_t c)
1958 {
1959   cpp_buffer *buffer = pfile->buffer;
1960   bool saw_NUL = false;
1961
1962   do
1963     {
1964       /* Horizontal space always OK.  */
1965       if (c == ' ' || c == '\t')
1966         ;
1967       /* Just \f \v or \0 left.  */
1968       else if (c == '\0')
1969         saw_NUL = true;
1970       else if (pfile->state.in_directive && CPP_PEDANTIC (pfile))
1971         cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line,
1972                              CPP_BUF_COL (buffer),
1973                              "%s in preprocessing directive",
1974                              c == '\f' ? "form feed" : "vertical tab");
1975
1976       c = *buffer->cur++;
1977     }
1978   /* We only want non-vertical space, i.e. ' ' \t \f \v \0.  */
1979   while (is_nvspace (c));
1980
1981   if (saw_NUL)
1982     {
1983       encoding_rich_location rich_loc (pfile);
1984       cpp_error_at (pfile, CPP_DL_WARNING, &rich_loc,
1985                     "null character(s) ignored");
1986     }
1987
1988   buffer->cur--;
1989 }
1990
1991 /* See if the characters of a number token are valid in a name (no
1992    '.', '+' or '-').  */
1993 static int
1994 name_p (cpp_reader *pfile, const cpp_string *string)
1995 {
1996   unsigned int i;
1997
1998   for (i = 0; i < string->len; i++)
1999     if (!is_idchar (string->text[i]))
2000       return 0;
2001
2002   return 1;
2003 }
2004
2005 /* After parsing an identifier or other sequence, produce a warning about
2006    sequences not in NFC/NFKC.  */
2007 static void
2008 warn_about_normalization (cpp_reader *pfile,
2009                           const cpp_token *token,
2010                           const struct normalize_state *s)
2011 {
2012   if (CPP_OPTION (pfile, warn_normalize) < NORMALIZE_STATE_RESULT (s)
2013       && !pfile->state.skipping)
2014     {
2015       location_t loc = token->src_loc;
2016
2017       /* If possible, create a location range for the token.  */
2018       if (loc >= RESERVED_LOCATION_COUNT
2019           && token->type != CPP_EOF
2020           /* There must be no line notes to process.  */
2021           && (!(pfile->buffer->cur
2022                 >= pfile->buffer->notes[pfile->buffer->cur_note].pos
2023                 && !pfile->overlaid_buffer)))
2024         {
2025           source_range tok_range;
2026           tok_range.m_start = loc;
2027           tok_range.m_finish
2028             = linemap_position_for_column (pfile->line_table,
2029                                            CPP_BUF_COLUMN (pfile->buffer,
2030                                                            pfile->buffer->cur));
2031           loc = COMBINE_LOCATION_DATA (pfile->line_table,
2032                                        loc, tok_range, NULL, 0);
2033         }
2034
2035       encoding_rich_location rich_loc (pfile, loc);
2036
2037       /* Make sure that the token is printed using UCNs, even
2038          if we'd otherwise happily print UTF-8.  */
2039       unsigned char *buf = XNEWVEC (unsigned char, cpp_token_len (token));
2040       size_t sz;
2041
2042       sz = cpp_spell_token (pfile, token, buf, false) - buf;
2043       if (NORMALIZE_STATE_RESULT (s) == normalized_C)
2044         cpp_warning_at (pfile, CPP_W_NORMALIZE, &rich_loc,
2045                         "`%.*s' is not in NFKC", (int) sz, buf);
2046       else if (CPP_OPTION (pfile, cplusplus))
2047         cpp_pedwarning_at (pfile, CPP_W_NORMALIZE, &rich_loc,
2048                                   "`%.*s' is not in NFC", (int) sz, buf);
2049       else
2050         cpp_warning_at (pfile, CPP_W_NORMALIZE, &rich_loc,
2051                         "`%.*s' is not in NFC", (int) sz, buf);
2052       free (buf);
2053     }
2054 }
2055
2056 /* Returns TRUE if the sequence starting at buffer->cur is valid in
2057    an identifier.  FIRST is TRUE if this starts an identifier.  */
2058
2059 static bool
2060 forms_identifier_p (cpp_reader *pfile, int first,
2061                     struct normalize_state *state)
2062 {
2063   cpp_buffer *buffer = pfile->buffer;
2064   const bool warn_bidi_p = pfile->warn_bidi_p ();
2065
2066   if (*buffer->cur == '$')
2067     {
2068       if (!CPP_OPTION (pfile, dollars_in_ident))
2069         return false;
2070
2071       buffer->cur++;
2072       if (CPP_OPTION (pfile, warn_dollars) && !pfile->state.skipping)
2073         {
2074           CPP_OPTION (pfile, warn_dollars) = 0;
2075           cpp_error (pfile, CPP_DL_PEDWARN, "'$' in identifier or number");
2076         }
2077
2078       return true;
2079     }
2080
2081   /* Is this a syntactically valid UCN or a valid UTF-8 char?  */
2082   if (CPP_OPTION (pfile, extended_identifiers))
2083     {
2084       cppchar_t s;
2085       if (*buffer->cur >= utf8_signifier)
2086         {
2087           if (__builtin_expect (*buffer->cur == bidi::utf8_start, 0)
2088               && warn_bidi_p)
2089             {
2090               location_t loc;
2091               bidi::kind kind = get_bidi_utf8 (pfile, buffer->cur, &loc);
2092               maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
2093             }
2094           if (_cpp_valid_utf8 (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
2095                                state, &s))
2096             return true;
2097         }
2098       else if (*buffer->cur == '\\'
2099                && (buffer->cur[1] == 'u'
2100                    || buffer->cur[1] == 'U'
2101                    || buffer->cur[1] == 'N'))
2102         {
2103           buffer->cur += 2;
2104           if (warn_bidi_p)
2105             {
2106               location_t loc;
2107               bidi::kind kind;
2108               if (buffer->cur[-1] == 'N')
2109                 kind = get_bidi_named (pfile, buffer->cur, &loc);
2110               else
2111                 kind = get_bidi_ucn (pfile, buffer->cur,
2112                                      buffer->cur[-1] == 'U', &loc);
2113               maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/true, loc);
2114             }
2115           if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
2116                               state, &s, NULL, NULL))
2117             return true;
2118           buffer->cur -= 2;
2119         }
2120     }
2121
2122   return false;
2123 }
2124
2125 /* Helper function to issue error about improper __VA_OPT__ use.  */
2126 static void
2127 maybe_va_opt_error (cpp_reader *pfile)
2128 {
2129   if (CPP_PEDANTIC (pfile) && !CPP_OPTION (pfile, va_opt))
2130     {
2131       /* __VA_OPT__ should not be accepted at all, but allow it in
2132          system headers.  */
2133       if (!_cpp_in_system_header (pfile))
2134         cpp_error (pfile, CPP_DL_PEDWARN,
2135                    "__VA_OPT__ is not available until C++20");
2136     }
2137   else if (!pfile->state.va_args_ok)
2138     {
2139       /* __VA_OPT__ should only appear in the replacement list of a
2140          variadic macro.  */
2141       cpp_error (pfile, CPP_DL_PEDWARN,
2142                  "__VA_OPT__ can only appear in the expansion"
2143                  " of a C++20 variadic macro");
2144     }
2145 }
2146
2147 /* Helper function to get the cpp_hashnode of the identifier BASE.  */
2148 static cpp_hashnode *
2149 lex_identifier_intern (cpp_reader *pfile, const uchar *base)
2150 {
2151   cpp_hashnode *result;
2152   const uchar *cur;
2153   unsigned int len;
2154   unsigned int hash = HT_HASHSTEP (0, *base);
2155
2156   cur = base + 1;
2157   while (ISIDNUM (*cur))
2158     {
2159       hash = HT_HASHSTEP (hash, *cur);
2160       cur++;
2161     }
2162   len = cur - base;
2163   hash = HT_HASHFINISH (hash, len);
2164   result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
2165                                               base, len, hash, HT_ALLOC));
2166
2167   /* Rarely, identifiers require diagnostics when lexed.  */
2168   if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
2169                         && !pfile->state.skipping, 0))
2170     {
2171       /* It is allowed to poison the same identifier twice.  */
2172       if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
2173         cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
2174                    NODE_NAME (result));
2175
2176       /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
2177          replacement list of a variadic macro.  */
2178       if (result == pfile->spec_nodes.n__VA_ARGS__
2179           && !pfile->state.va_args_ok)
2180         {
2181           if (CPP_OPTION (pfile, cplusplus))
2182             cpp_error (pfile, CPP_DL_PEDWARN,
2183                        "__VA_ARGS__ can only appear in the expansion"
2184                        " of a C++11 variadic macro");
2185           else
2186             cpp_error (pfile, CPP_DL_PEDWARN,
2187                        "__VA_ARGS__ can only appear in the expansion"
2188                        " of a C99 variadic macro");
2189         }
2190
2191       if (result == pfile->spec_nodes.n__VA_OPT__)
2192         maybe_va_opt_error (pfile);
2193
2194       /* For -Wc++-compat, warn about use of C++ named operators.  */
2195       if (result->flags & NODE_WARN_OPERATOR)
2196         cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
2197                      "identifier \"%s\" is a special operator name in C++",
2198                      NODE_NAME (result));
2199     }
2200
2201   return result;
2202 }
2203
2204 /* Get the cpp_hashnode of an identifier specified by NAME in
2205    the current cpp_reader object.  If none is found, NULL is returned.  */
2206 cpp_hashnode *
2207 _cpp_lex_identifier (cpp_reader *pfile, const char *name)
2208 {
2209   cpp_hashnode *result;
2210   result = lex_identifier_intern (pfile, (uchar *) name);
2211   return result;
2212 }
2213
2214 /* Lex an identifier starting at BUFFER->CUR - 1.  */
2215 static cpp_hashnode *
2216 lex_identifier (cpp_reader *pfile, const uchar *base, bool starts_ucn,
2217                 struct normalize_state *nst, cpp_hashnode **spelling)
2218 {
2219   cpp_hashnode *result;
2220   const uchar *cur;
2221   unsigned int len;
2222   unsigned int hash = HT_HASHSTEP (0, *base);
2223   const bool warn_bidi_p = pfile->warn_bidi_p ();
2224
2225   cur = pfile->buffer->cur;
2226   if (! starts_ucn)
2227     {
2228       while (ISIDNUM (*cur))
2229         {
2230           hash = HT_HASHSTEP (hash, *cur);
2231           cur++;
2232         }
2233       NORMALIZE_STATE_UPDATE_IDNUM (nst, *(cur - 1));
2234     }
2235   pfile->buffer->cur = cur;
2236   if (starts_ucn || forms_identifier_p (pfile, false, nst))
2237     {
2238       /* Slower version for identifiers containing UCNs
2239          or extended chars (including $).  */
2240       do {
2241         while (ISIDNUM (*pfile->buffer->cur))
2242           {
2243             NORMALIZE_STATE_UPDATE_IDNUM (nst, *pfile->buffer->cur);
2244             pfile->buffer->cur++;
2245           }
2246       } while (forms_identifier_p (pfile, false, nst));
2247       if (warn_bidi_p)
2248         maybe_warn_bidi_on_close (pfile, pfile->buffer->cur);
2249       result = _cpp_interpret_identifier (pfile, base,
2250                                           pfile->buffer->cur - base);
2251       *spelling = cpp_lookup (pfile, base, pfile->buffer->cur - base);
2252     }
2253   else
2254     {
2255       len = cur - base;
2256       hash = HT_HASHFINISH (hash, len);
2257
2258       result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
2259                                                   base, len, hash, HT_ALLOC));
2260       *spelling = result;
2261     }
2262
2263   /* Rarely, identifiers require diagnostics when lexed.  */
2264   if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
2265                         && !pfile->state.skipping, 0))
2266     {
2267       /* It is allowed to poison the same identifier twice.  */
2268       if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
2269         cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
2270                    NODE_NAME (result));
2271
2272       /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
2273          replacement list of a variadic macro.  */
2274       if (result == pfile->spec_nodes.n__VA_ARGS__
2275           && !pfile->state.va_args_ok)
2276         {
2277           if (CPP_OPTION (pfile, cplusplus))
2278             cpp_error (pfile, CPP_DL_PEDWARN,
2279                        "__VA_ARGS__ can only appear in the expansion"
2280                        " of a C++11 variadic macro");
2281           else
2282             cpp_error (pfile, CPP_DL_PEDWARN,
2283                        "__VA_ARGS__ can only appear in the expansion"
2284                        " of a C99 variadic macro");
2285         }
2286
2287       /* __VA_OPT__ should only appear in the replacement list of a
2288          variadic macro.  */
2289       if (result == pfile->spec_nodes.n__VA_OPT__)
2290         maybe_va_opt_error (pfile);
2291
2292       /* For -Wc++-compat, warn about use of C++ named operators.  */
2293       if (result->flags & NODE_WARN_OPERATOR)
2294         cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
2295                      "identifier \"%s\" is a special operator name in C++",
2296                      NODE_NAME (result));
2297     }
2298
2299   return result;
2300 }
2301
2302 /* Lex a number to NUMBER starting at BUFFER->CUR - 1.  */
2303 static void
2304 lex_number (cpp_reader *pfile, cpp_string *number,
2305             struct normalize_state *nst)
2306 {
2307   const uchar *cur;
2308   const uchar *base;
2309   uchar *dest;
2310
2311   base = pfile->buffer->cur - 1;
2312   do
2313     {
2314       const uchar *adj_digit_sep = NULL;
2315       cur = pfile->buffer->cur;
2316
2317       /* N.B. ISIDNUM does not include $.  */
2318       while (ISIDNUM (*cur)
2319              || (*cur == '.' && !DIGIT_SEP (cur[-1]))
2320              || DIGIT_SEP (*cur)
2321              || (VALID_SIGN (*cur, cur[-1]) && !DIGIT_SEP (cur[-2])))
2322         {
2323           NORMALIZE_STATE_UPDATE_IDNUM (nst, *cur);
2324           /* Adjacent digit separators do not form part of the pp-number syntax.
2325              However, they can safely be diagnosed here as an error, since '' is
2326              not a valid preprocessing token.  */
2327           if (DIGIT_SEP (*cur) && DIGIT_SEP (cur[-1]) && !adj_digit_sep)
2328             adj_digit_sep = cur;
2329           cur++;
2330         }
2331       /* A number can't end with a digit separator.  */
2332       while (cur > pfile->buffer->cur && DIGIT_SEP (cur[-1]))
2333         --cur;
2334       if (adj_digit_sep && adj_digit_sep < cur)
2335         cpp_error (pfile, CPP_DL_ERROR, "adjacent digit separators");
2336
2337       pfile->buffer->cur = cur;
2338     }
2339   while (forms_identifier_p (pfile, false, nst));
2340
2341   number->len = cur - base;
2342   dest = _cpp_unaligned_alloc (pfile, number->len + 1);
2343   memcpy (dest, base, number->len);
2344   dest[number->len] = '\0';
2345   number->text = dest;
2346 }
2347
2348 /* Create a token of type TYPE with a literal spelling.  */
2349 static void
2350 create_literal (cpp_reader *pfile, cpp_token *token, const uchar *base,
2351                 unsigned int len, enum cpp_ttype type)
2352 {
2353   token->type = type;
2354   token->val.str.len = len;
2355   token->val.str.text = cpp_alloc_token_string (pfile, base, len);
2356 }
2357
2358 const uchar *
2359 cpp_alloc_token_string (cpp_reader *pfile,
2360                         const unsigned char *ptr, unsigned len)
2361 {
2362   uchar *dest = _cpp_unaligned_alloc (pfile, len + 1);
2363
2364   dest[len] = 0;
2365   memcpy (dest, ptr, len);
2366   return dest;
2367 }
2368
2369 /* A pair of raw buffer pointers.  The currently open one is [1], the
2370    first one is [0].  Used for string literal lexing.  */
2371 struct lit_accum {
2372   _cpp_buff *first;
2373   _cpp_buff *last;
2374   const uchar *rpos;
2375   size_t accum;
2376
2377   lit_accum ()
2378     : first (NULL), last (NULL), rpos (0), accum (0)
2379   {
2380   }
2381
2382   void append (cpp_reader *, const uchar *, size_t);
2383
2384   void read_begin (cpp_reader *);
2385   bool reading_p () const
2386   {
2387     return rpos != NULL;
2388   }
2389   char read_char ()
2390   {
2391     char c = *rpos++;
2392     if (rpos == BUFF_FRONT (last))
2393       rpos = NULL;
2394     return c;
2395   }
2396 };
2397
2398 /* Subroutine of lex_raw_string: Append LEN chars from BASE to the buffer
2399    sequence from *FIRST_BUFF_P to LAST_BUFF_P.  */
2400
2401 void
2402 lit_accum::append (cpp_reader *pfile, const uchar *base, size_t len)
2403 {
2404   if (!last)
2405     /* Starting.  */
2406     first = last = _cpp_get_buff (pfile, len);
2407   else if (len > BUFF_ROOM (last))
2408     {
2409       /* There is insufficient room in the buffer.  Copy what we can,
2410          and then either extend or create a new one.  */
2411       size_t room = BUFF_ROOM (last);
2412       memcpy (BUFF_FRONT (last), base, room);
2413       BUFF_FRONT (last) += room;
2414       base += room;
2415       len -= room;
2416       accum += room;
2417
2418       gcc_checking_assert (!rpos);
2419
2420       last = _cpp_append_extend_buff (pfile, last, len);
2421     }
2422
2423   memcpy (BUFF_FRONT (last), base, len);
2424   BUFF_FRONT (last) += len;
2425   accum += len;
2426 }
2427
2428 void
2429 lit_accum::read_begin (cpp_reader *pfile)
2430 {
2431   /* We never accumulate more than 4 chars to read.  */
2432   if (BUFF_ROOM (last) < 4)
2433
2434     last = _cpp_append_extend_buff (pfile, last, 4);
2435   rpos = BUFF_FRONT (last);
2436 }
2437
2438 /* Returns true if a macro has been defined.
2439    This might not work if compile with -save-temps,
2440    or preprocess separately from compilation.  */
2441
2442 static bool
2443 is_macro(cpp_reader *pfile, const uchar *base)
2444 {
2445   const uchar *cur = base;
2446   if (! ISIDST (*cur))
2447     return false;
2448   unsigned int hash = HT_HASHSTEP (0, *cur);
2449   ++cur;
2450   while (ISIDNUM (*cur))
2451     {
2452       hash = HT_HASHSTEP (hash, *cur);
2453       ++cur;
2454     }
2455   hash = HT_HASHFINISH (hash, cur - base);
2456
2457   cpp_hashnode *result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
2458                                         base, cur - base, hash, HT_NO_INSERT));
2459
2460   return result && cpp_macro_p (result);
2461 }
2462
2463 /* Returns true if a literal suffix does not have the expected form
2464    and is defined as a macro.  */
2465
2466 static bool
2467 is_macro_not_literal_suffix(cpp_reader *pfile, const uchar *base)
2468 {
2469   /* User-defined literals outside of namespace std must start with a single
2470      underscore, so assume anything of that form really is a UDL suffix.
2471      We don't need to worry about UDLs defined inside namespace std because
2472      their names are reserved, so cannot be used as macro names in valid
2473      programs.  */
2474   if (base[0] == '_' && base[1] != '_')
2475     return false;
2476   return is_macro (pfile, base);
2477 }
2478
2479 /* Lexes a raw string.  The stored string contains the spelling,
2480    including double quotes, delimiter string, '(' and ')', any leading
2481    'L', 'u', 'U' or 'u8' and 'R' modifier.  The created token contains
2482    the type of the literal, or CPP_OTHER if it was not properly
2483    terminated.
2484
2485    BASE is the start of the token.  Updates pfile->buffer->cur to just
2486    after the lexed string.
2487
2488    The spelling is NUL-terminated, but it is not guaranteed that this
2489    is the first NUL since embedded NULs are preserved.  */
2490
2491 static void
2492 lex_raw_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
2493 {
2494   const uchar *pos = base;
2495   const bool warn_bidi_p = pfile->warn_bidi_p ();
2496   const bool warn_invalid_utf8_p = CPP_OPTION (pfile, cpp_warn_invalid_utf8);
2497   const bool warn_bidi_or_invalid_utf8_p = warn_bidi_p | warn_invalid_utf8_p;
2498
2499   /* 'tis a pity this information isn't passed down from the lexer's
2500      initial categorization of the token.  */
2501   enum cpp_ttype type = CPP_STRING;
2502
2503   if (*pos == 'L')
2504     {
2505       type = CPP_WSTRING;
2506       pos++;
2507     }
2508   else if (*pos == 'U')
2509     {
2510       type = CPP_STRING32;
2511       pos++;
2512     }
2513   else if (*pos == 'u')
2514     {
2515       if (pos[1] == '8')
2516         {
2517           type = CPP_UTF8STRING;
2518           pos++;
2519         }
2520       else
2521         type = CPP_STRING16;
2522       pos++;
2523     }
2524
2525   gcc_checking_assert (pos[0] == 'R' && pos[1] == '"');
2526   pos += 2;
2527
2528   _cpp_line_note *note = &pfile->buffer->notes[pfile->buffer->cur_note];
2529
2530   /* Skip notes before the ".  */
2531   while (note->pos < pos)
2532     ++note;
2533
2534   lit_accum accum;
2535
2536   uchar prefix[17];
2537   unsigned prefix_len = 0;
2538   enum Phase
2539   {
2540    PHASE_PREFIX = -2,
2541    PHASE_NONE = -1,
2542    PHASE_SUFFIX = 0
2543   } phase = PHASE_PREFIX;
2544
2545   for (;;)
2546     {
2547       gcc_checking_assert (note->pos >= pos);
2548
2549       /* Undo any escaped newlines and trigraphs.  */
2550       if (!accum.reading_p () && note->pos == pos)
2551         switch (note->type)
2552           {
2553           case '\\':
2554           case ' ':
2555             /* Restore backslash followed by newline.  */
2556             accum.append (pfile, base, pos - base);
2557             base = pos;
2558             accum.read_begin (pfile);
2559             accum.append (pfile, UC"\\", 1);
2560
2561           after_backslash:
2562             if (note->type == ' ')
2563               /* GNU backslash whitespace newline extension.  FIXME
2564                  could be any sequence of non-vertical space.  When we
2565                  can properly restore any such sequence, we should
2566                  mark this note as handled so _cpp_process_line_notes
2567                  doesn't warn.  */
2568               accum.append (pfile, UC" ", 1);
2569
2570             accum.append (pfile, UC"\n", 1);
2571             note++;
2572             break;
2573
2574           case '\n':
2575             /* This can happen for ??/<NEWLINE> when trigraphs are not
2576                being interpretted.  */
2577             gcc_checking_assert (!CPP_OPTION (pfile, trigraphs));
2578             note->type = 0;
2579             note++;
2580             break;
2581
2582           default:
2583             gcc_checking_assert (_cpp_trigraph_map[note->type]);
2584
2585             /* Don't warn about this trigraph in
2586                _cpp_process_line_notes, since trigraphs show up as
2587                trigraphs in raw strings.  */
2588             uchar type = note->type;
2589             note->type = 0;
2590
2591             if (CPP_OPTION (pfile, trigraphs))
2592               {
2593                 accum.append (pfile, base, pos - base);
2594                 base = pos;
2595                 accum.read_begin (pfile);
2596                 accum.append (pfile, UC"??", 2);
2597                 accum.append (pfile, &type, 1);
2598
2599                 /* ??/ followed by newline gets two line notes, one for
2600                    the trigraph and one for the backslash/newline.  */
2601                 if (type == '/' && note[1].pos == pos)
2602                   {
2603                     note++;
2604                     gcc_assert (note->type == '\\' || note->type == ' ');
2605                     goto after_backslash;
2606                   }
2607                 /* Skip the replacement character.  */
2608                 base = ++pos;
2609               }
2610
2611             note++;
2612             break;
2613           }
2614
2615       /* Now get a char to process.  Either from an expanded note, or
2616          from the line buffer.  */
2617       bool read_note = accum.reading_p ();
2618       char c = read_note ? accum.read_char () : *pos++;
2619
2620       if (phase == PHASE_PREFIX)
2621         {
2622           if (c == '(')
2623             {
2624               /* Done.  */
2625               phase = PHASE_NONE;
2626               prefix[prefix_len++] = '"';
2627             }
2628           else if (prefix_len < 16
2629                    /* Prefix chars are any of the basic character set,
2630                       [lex.charset] except for '
2631                       ()\\\t\v\f\n'. Optimized for a contiguous
2632                       alphabet.  */
2633                    /* Unlike a switch, this collapses down to one or
2634                       two shift and bitmask operations on an ASCII
2635                       system, with an outlier or two.   */
2636                    && (('Z' - 'A' == 25
2637                         ? ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'))
2638                         : ISIDST (c))
2639                        || (c >= '0' && c <= '9')
2640                        || c == '_' || c == '{' || c == '}'
2641                        || c == '[' || c == ']' || c == '#'
2642                        || c == '<' || c == '>' || c == '%'
2643                        || c == ':' || c == ';' || c == '.' || c == '?'
2644                        || c == '*' || c == '+' || c == '-' || c == '/'
2645                        || c == '^' || c == '&' || c == '|' || c == '~'
2646                        || c == '!' || c == '=' || c == ','
2647                        || c == '"' || c == '\''))
2648             prefix[prefix_len++] = c;
2649           else
2650             {
2651               /* Something is wrong.  */
2652               int col = CPP_BUF_COLUMN (pfile->buffer, pos) + read_note;
2653               if (prefix_len == 16)
2654                 cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
2655                                      col, "raw string delimiter longer "
2656                                      "than 16 characters");
2657               else if (c == '\n')
2658                 cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
2659                                      col, "invalid new-line in raw "
2660                                      "string delimiter");
2661               else
2662                 cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
2663                                      col, "invalid character '%c' in "
2664                                      "raw string delimiter", c);
2665               type = CPP_OTHER;
2666               phase = PHASE_NONE;
2667               /* Continue until we get a close quote, that's probably
2668                  the best failure mode.  */
2669               prefix_len = 0;
2670             }
2671           if (c != '\n')
2672             continue;
2673         }
2674
2675       if (phase != PHASE_NONE)
2676         {
2677           if (prefix[phase] != c)
2678             phase = PHASE_NONE;
2679           else if (unsigned (phase + 1) == prefix_len)
2680             break;
2681           else
2682             {
2683               phase = Phase (phase + 1);
2684               continue;
2685             }
2686         }
2687
2688       if (!prefix_len && c == '"')
2689         /* Failure mode lexing.  */
2690         goto out;
2691       else if (prefix_len && c == ')')
2692         phase = PHASE_SUFFIX;
2693       else if (!read_note && c == '\n')
2694         {
2695           pos--;
2696           pfile->buffer->cur = pos;
2697           if (pfile->state.in_directive
2698               || (pfile->state.parsing_args
2699                   && pfile->buffer->next_line >= pfile->buffer->rlimit))
2700             {
2701               cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, 0,
2702                                    "unterminated raw string");
2703               type = CPP_OTHER;
2704               goto out;
2705             }
2706
2707           accum.append (pfile, base, pos - base + 1);
2708           _cpp_process_line_notes (pfile, false);
2709
2710           if (pfile->buffer->next_line < pfile->buffer->rlimit)
2711             CPP_INCREMENT_LINE (pfile, 0);
2712           pfile->buffer->need_line = true;
2713
2714           if (!_cpp_get_fresh_line (pfile))
2715             {
2716               /* We ran out of file and failed to get a line.  */
2717               location_t src_loc = token->src_loc;
2718               token->type = CPP_EOF;
2719               /* Tell the compiler the line number of the EOF token.  */
2720               token->src_loc = pfile->line_table->highest_line;
2721               token->flags = BOL;
2722               if (accum.first)
2723                 _cpp_release_buff (pfile, accum.first);
2724               cpp_error_with_line (pfile, CPP_DL_ERROR, src_loc, 0,
2725                                    "unterminated raw string");
2726               /* Now pop the buffer that _cpp_get_fresh_line did not.  */
2727               _cpp_pop_buffer (pfile);
2728               return;
2729             }
2730
2731           pos = base = pfile->buffer->cur;
2732           note = &pfile->buffer->notes[pfile->buffer->cur_note];
2733         }
2734       else if (__builtin_expect ((unsigned char) c >= utf8_continuation, 0)
2735                && warn_bidi_or_invalid_utf8_p)
2736         pos = _cpp_handle_multibyte_utf8 (pfile, c, pos, warn_bidi_p,
2737                                           warn_invalid_utf8_p);
2738     }
2739
2740   if (warn_bidi_p)
2741     maybe_warn_bidi_on_close (pfile, pos);
2742
2743   if (CPP_OPTION (pfile, user_literals))
2744     {
2745       /* If a string format macro, say from inttypes.h, is placed touching
2746          a string literal it could be parsed as a C++11 user-defined string
2747          literal thus breaking the program.  */
2748       if (is_macro_not_literal_suffix (pfile, pos))
2749         {
2750           /* Raise a warning, but do not consume subsequent tokens.  */
2751           if (CPP_OPTION (pfile, warn_literal_suffix) && !pfile->state.skipping)
2752             cpp_warning_with_line (pfile, CPP_W_LITERAL_SUFFIX,
2753                                    token->src_loc, 0,
2754                                    "invalid suffix on literal; C++11 requires "
2755                                    "a space between literal and string macro");
2756         }
2757       /* Grab user defined literal suffix.  */
2758       else if (ISIDST (*pos))
2759         {
2760           type = cpp_userdef_string_add_type (type);
2761           ++pos;
2762
2763           while (ISIDNUM (*pos))
2764             ++pos;
2765         }
2766     }
2767
2768  out:
2769   pfile->buffer->cur = pos;
2770   if (!accum.accum)
2771     create_literal (pfile, token, base, pos - base, type);
2772   else
2773     {
2774       size_t extra_len = pos - base;
2775       uchar *dest = _cpp_unaligned_alloc (pfile, accum.accum + extra_len + 1);
2776
2777       token->type = type;
2778       token->val.str.len = accum.accum + extra_len;
2779       token->val.str.text = dest;
2780       for (_cpp_buff *buf = accum.first; buf; buf = buf->next)
2781         {
2782           size_t len = BUFF_FRONT (buf) - buf->base;
2783           memcpy (dest, buf->base, len);
2784           dest += len;
2785         }
2786       _cpp_release_buff (pfile, accum.first);
2787       memcpy (dest, base, extra_len);
2788       dest[extra_len] = '\0';
2789     }
2790 }
2791
2792 /* Lexes a string, character constant, or angle-bracketed header file
2793    name.  The stored string contains the spelling, including opening
2794    quote and any leading 'L', 'u', 'U' or 'u8' and optional
2795    'R' modifier.  It returns the type of the literal, or CPP_OTHER
2796    if it was not properly terminated, or CPP_LESS for an unterminated
2797    header name which must be relexed as normal tokens.
2798
2799    The spelling is NUL-terminated, but it is not guaranteed that this
2800    is the first NUL since embedded NULs are preserved.  */
2801 static void
2802 lex_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
2803 {
2804   bool saw_NUL = false;
2805   const uchar *cur;
2806   cppchar_t terminator;
2807   enum cpp_ttype type;
2808
2809   cur = base;
2810   terminator = *cur++;
2811   if (terminator == 'L' || terminator == 'U')
2812     terminator = *cur++;
2813   else if (terminator == 'u')
2814     {
2815       terminator = *cur++;
2816       if (terminator == '8')
2817         terminator = *cur++;
2818     }
2819   if (terminator == 'R')
2820     {
2821       lex_raw_string (pfile, token, base);
2822       return;
2823     }
2824   if (terminator == '"')
2825     type = (*base == 'L' ? CPP_WSTRING :
2826             *base == 'U' ? CPP_STRING32 :
2827             *base == 'u' ? (base[1] == '8' ? CPP_UTF8STRING : CPP_STRING16)
2828                          : CPP_STRING);
2829   else if (terminator == '\'')
2830     type = (*base == 'L' ? CPP_WCHAR :
2831             *base == 'U' ? CPP_CHAR32 :
2832             *base == 'u' ? (base[1] == '8' ? CPP_UTF8CHAR : CPP_CHAR16)
2833                          : CPP_CHAR);
2834   else
2835     terminator = '>', type = CPP_HEADER_NAME;
2836
2837   const bool warn_bidi_p = pfile->warn_bidi_p ();
2838   const bool warn_invalid_utf8_p = CPP_OPTION (pfile, cpp_warn_invalid_utf8);
2839   const bool warn_bidi_or_invalid_utf8_p = warn_bidi_p | warn_invalid_utf8_p;
2840   for (;;)
2841     {
2842       cppchar_t c = *cur++;
2843
2844       /* In #include-style directives, terminators are not escapable.  */
2845       if (c == '\\' && !pfile->state.angled_headers && *cur != '\n')
2846         {
2847           if ((cur[0] == 'u' || cur[0] == 'U' || cur[0] == 'N') && warn_bidi_p)
2848             {
2849               location_t loc;
2850               bidi::kind kind;
2851               if (cur[0] == 'N')
2852                 kind = get_bidi_named (pfile, cur + 1, &loc);
2853               else
2854                 kind = get_bidi_ucn (pfile, cur + 1, cur[0] == 'U', &loc);
2855               maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/true, loc);
2856             }
2857           cur++;
2858         }
2859       else if (c == terminator)
2860         {
2861           if (warn_bidi_p)
2862             maybe_warn_bidi_on_close (pfile, cur - 1);
2863           break;
2864         }
2865       else if (c == '\n')
2866         {
2867           cur--;
2868           /* Unmatched quotes always yield undefined behavior, but
2869              greedy lexing means that what appears to be an unterminated
2870              header name may actually be a legitimate sequence of tokens.  */
2871           if (terminator == '>')
2872             {
2873               token->type = CPP_LESS;
2874               return;
2875             }
2876           type = CPP_OTHER;
2877           break;
2878         }
2879       else if (c == '\0')
2880         saw_NUL = true;
2881       else if (__builtin_expect (c >= utf8_continuation, 0)
2882                && warn_bidi_or_invalid_utf8_p)
2883         cur = _cpp_handle_multibyte_utf8 (pfile, c, cur, warn_bidi_p,
2884                                           warn_invalid_utf8_p);
2885     }
2886
2887   if (saw_NUL && !pfile->state.skipping)
2888     cpp_error (pfile, CPP_DL_WARNING,
2889                "null character(s) preserved in literal");
2890
2891   if (type == CPP_OTHER && CPP_OPTION (pfile, lang) != CLK_ASM)
2892     cpp_error (pfile, CPP_DL_PEDWARN, "missing terminating %c character",
2893                (int) terminator);
2894
2895   if (CPP_OPTION (pfile, user_literals))
2896     {
2897       /* If a string format macro, say from inttypes.h, is placed touching
2898          a string literal it could be parsed as a C++11 user-defined string
2899          literal thus breaking the program.  */
2900       if (is_macro_not_literal_suffix (pfile, cur))
2901         {
2902           /* Raise a warning, but do not consume subsequent tokens.  */
2903           if (CPP_OPTION (pfile, warn_literal_suffix) && !pfile->state.skipping)
2904             cpp_warning_with_line (pfile, CPP_W_LITERAL_SUFFIX,
2905                                    token->src_loc, 0,
2906                                    "invalid suffix on literal; C++11 requires "
2907                                    "a space between literal and string macro");
2908         }
2909       /* Grab user defined literal suffix.  */
2910       else if (ISIDST (*cur))
2911         {
2912           type = cpp_userdef_char_add_type (type);
2913           type = cpp_userdef_string_add_type (type);
2914           ++cur;
2915
2916           while (ISIDNUM (*cur))
2917             ++cur;
2918         }
2919     }
2920   else if (CPP_OPTION (pfile, cpp_warn_cxx11_compat)
2921            && is_macro (pfile, cur)
2922            && !pfile->state.skipping)
2923     cpp_warning_with_line (pfile, CPP_W_CXX11_COMPAT,
2924                            token->src_loc, 0, "C++11 requires a space "
2925                            "between string literal and macro");
2926
2927   pfile->buffer->cur = cur;
2928   create_literal (pfile, token, base, cur - base, type);
2929 }
2930
2931 /* Return the comment table. The client may not make any assumption
2932    about the ordering of the table.  */
2933 cpp_comment_table *
2934 cpp_get_comments (cpp_reader *pfile)
2935 {
2936   return &pfile->comments;
2937 }
2938
2939 /* Append a comment to the end of the comment table. */
2940 static void
2941 store_comment (cpp_reader *pfile, cpp_token *token)
2942 {
2943   int len;
2944
2945   if (pfile->comments.allocated == 0)
2946     {
2947       pfile->comments.allocated = 256;
2948       pfile->comments.entries = (cpp_comment *) xmalloc
2949         (pfile->comments.allocated * sizeof (cpp_comment));
2950     }
2951
2952   if (pfile->comments.count == pfile->comments.allocated)
2953     {
2954       pfile->comments.allocated *= 2;
2955       pfile->comments.entries = (cpp_comment *) xrealloc
2956         (pfile->comments.entries,
2957          pfile->comments.allocated * sizeof (cpp_comment));
2958     }
2959
2960   len = token->val.str.len;
2961
2962   /* Copy comment. Note, token may not be NULL terminated. */
2963   pfile->comments.entries[pfile->comments.count].comment =
2964     (char *) xmalloc (sizeof (char) * (len + 1));
2965   memcpy (pfile->comments.entries[pfile->comments.count].comment,
2966           token->val.str.text, len);
2967   pfile->comments.entries[pfile->comments.count].comment[len] = '\0';
2968
2969   /* Set source location. */
2970   pfile->comments.entries[pfile->comments.count].sloc = token->src_loc;
2971
2972   /* Increment the count of entries in the comment table. */
2973   pfile->comments.count++;
2974 }
2975
2976 /* The stored comment includes the comment start and any terminator.  */
2977 static void
2978 save_comment (cpp_reader *pfile, cpp_token *token, const unsigned char *from,
2979               cppchar_t type)
2980 {
2981   unsigned char *buffer;
2982   unsigned int len, clen, i;
2983
2984   len = pfile->buffer->cur - from + 1; /* + 1 for the initial '/'.  */
2985
2986   /* C++ comments probably (not definitely) have moved past a new
2987      line, which we don't want to save in the comment.  */
2988   if (is_vspace (pfile->buffer->cur[-1]))
2989     len--;
2990
2991   /* If we are currently in a directive or in argument parsing, then
2992      we need to store all C++ comments as C comments internally, and
2993      so we need to allocate a little extra space in that case.
2994
2995      Note that the only time we encounter a directive here is
2996      when we are saving comments in a "#define".  */
2997   clen = ((pfile->state.in_directive || pfile->state.parsing_args)
2998           && type == '/') ? len + 2 : len;
2999
3000   buffer = _cpp_unaligned_alloc (pfile, clen);
3001
3002   token->type = CPP_COMMENT;
3003   token->val.str.len = clen;
3004   token->val.str.text = buffer;
3005
3006   buffer[0] = '/';
3007   memcpy (buffer + 1, from, len - 1);
3008
3009   /* Finish conversion to a C comment, if necessary.  */
3010   if ((pfile->state.in_directive || pfile->state.parsing_args) && type == '/')
3011     {
3012       buffer[1] = '*';
3013       buffer[clen - 2] = '*';
3014       buffer[clen - 1] = '/';
3015       /* As there can be in a C++ comments illegal sequences for C comments
3016          we need to filter them out.  */
3017       for (i = 2; i < (clen - 2); i++)
3018         if (buffer[i] == '/' && (buffer[i - 1] == '*' || buffer[i + 1] == '*'))
3019           buffer[i] = '|';
3020     }
3021
3022   /* Finally store this comment for use by clients of libcpp. */
3023   store_comment (pfile, token);
3024 }
3025
3026 /* Returns true if comment at COMMENT_START is a recognized FALLTHROUGH
3027    comment.  */
3028
3029 static bool
3030 fallthrough_comment_p (cpp_reader *pfile, const unsigned char *comment_start)
3031 {
3032   const unsigned char *from = comment_start + 1;
3033
3034   switch (CPP_OPTION (pfile, cpp_warn_implicit_fallthrough))
3035     {
3036       /* For both -Wimplicit-fallthrough=0 and -Wimplicit-fallthrough=5 we
3037          don't recognize any comments.  The latter only checks attributes,
3038          the former doesn't warn.  */
3039     case 0:
3040     default:
3041       return false;
3042       /* -Wimplicit-fallthrough=1 considers any comment, no matter what
3043          content it has.  */
3044     case 1:
3045       return true;
3046     case 2:
3047       /* -Wimplicit-fallthrough=2 looks for (case insensitive)
3048          .*falls?[ \t-]*thr(u|ough).* regex.  */
3049       for (; (size_t) (pfile->buffer->cur - from) >= sizeof "fallthru" - 1;
3050            from++)
3051         {
3052           /* Is there anything like strpbrk with upper boundary, or
3053              memchr looking for 2 characters rather than just one?  */
3054           if (from[0] != 'f' && from[0] != 'F')
3055             continue;
3056           if (from[1] != 'a' && from[1] != 'A')
3057             continue;
3058           if (from[2] != 'l' && from[2] != 'L')
3059             continue;
3060           if (from[3] != 'l' && from[3] != 'L')
3061             continue;
3062           from += sizeof "fall" - 1;
3063           if (from[0] == 's' || from[0] == 'S')
3064             from++;
3065           while (*from == ' ' || *from == '\t' || *from == '-')
3066             from++;
3067           if (from[0] != 't' && from[0] != 'T')
3068             continue;
3069           if (from[1] != 'h' && from[1] != 'H')
3070             continue;
3071           if (from[2] != 'r' && from[2] != 'R')
3072             continue;
3073           if (from[3] == 'u' || from[3] == 'U')
3074             return true;
3075           if (from[3] != 'o' && from[3] != 'O')
3076             continue;
3077           if (from[4] != 'u' && from[4] != 'U')
3078             continue;
3079           if (from[5] != 'g' && from[5] != 'G')
3080             continue;
3081           if (from[6] != 'h' && from[6] != 'H')
3082             continue;
3083           return true;
3084         }
3085       return false;
3086     case 3:
3087     case 4:
3088       break;
3089     }
3090
3091   /* Whole comment contents:
3092      -fallthrough
3093      @fallthrough@
3094    */
3095   if (*from == '-' || *from == '@')
3096     {
3097       size_t len = sizeof "fallthrough" - 1;
3098       if ((size_t) (pfile->buffer->cur - from - 1) < len)
3099         return false;
3100       if (memcmp (from + 1, "fallthrough", len))
3101         return false;
3102       if (*from == '@')
3103         {
3104           if (from[len + 1] != '@')
3105             return false;
3106           len++;
3107         }
3108       from += 1 + len;
3109     }
3110   /* Whole comment contents (regex):
3111      lint -fallthrough[ \t]*
3112    */
3113   else if (*from == 'l')
3114     {
3115       size_t len = sizeof "int -fallthrough" - 1;
3116       if ((size_t) (pfile->buffer->cur - from - 1) < len)
3117         return false;
3118       if (memcmp (from + 1, "int -fallthrough", len))
3119         return false;
3120       from += 1 + len;
3121       while (*from == ' ' || *from == '\t')
3122         from++;
3123     }
3124   /* Whole comment contents (regex):
3125      [ \t]*FALLTHR(U|OUGH)[ \t]*
3126    */
3127   else if (CPP_OPTION (pfile, cpp_warn_implicit_fallthrough) == 4)
3128     {
3129       while (*from == ' ' || *from == '\t')
3130         from++;
3131       if ((size_t) (pfile->buffer->cur - from)  < sizeof "FALLTHRU" - 1)
3132         return false;
3133       if (memcmp (from, "FALLTHR", sizeof "FALLTHR" - 1))
3134         return false;
3135       from += sizeof "FALLTHR" - 1;
3136       if (*from == 'U')
3137         from++;
3138       else if ((size_t) (pfile->buffer->cur - from)  < sizeof "OUGH" - 1)
3139         return false;
3140       else if (memcmp (from, "OUGH", sizeof "OUGH" - 1))
3141         return false;
3142       else
3143         from += sizeof "OUGH" - 1;
3144       while (*from == ' ' || *from == '\t')
3145         from++;
3146     }
3147   /* Whole comment contents (regex):
3148      [ \t.!]*(ELSE,? |INTENTIONAL(LY)? )?FALL(S | |-)?THR(OUGH|U)[ \t.!]*(-[^\n\r]*)?
3149      [ \t.!]*(Else,? |Intentional(ly)? )?Fall((s | |-)[Tt]|t)hr(ough|u)[ \t.!]*(-[^\n\r]*)?
3150      [ \t.!]*([Ee]lse,? |[Ii]ntentional(ly)? )?fall(s | |-)?thr(ough|u)[ \t.!]*(-[^\n\r]*)?
3151    */
3152   else
3153     {
3154       while (*from == ' ' || *from == '\t' || *from == '.' || *from == '!')
3155         from++;
3156       unsigned char f = *from;
3157       bool all_upper = false;
3158       if (f == 'E' || f == 'e')
3159         {
3160           if ((size_t) (pfile->buffer->cur - from)
3161               < sizeof "else fallthru" - 1)
3162             return false;
3163           if (f == 'E' && memcmp (from + 1, "LSE", sizeof "LSE" - 1) == 0)
3164             all_upper = true;
3165           else if (memcmp (from + 1, "lse", sizeof "lse" - 1))
3166             return false;
3167           from += sizeof "else" - 1;
3168           if (*from == ',')
3169             from++;
3170           if (*from != ' ')
3171             return false;
3172           from++;
3173           if (all_upper && *from == 'f')
3174             return false;
3175           if (f == 'e' && *from == 'F')
3176             return false;
3177           f = *from;
3178         }
3179       else if (f == 'I' || f == 'i')
3180         {
3181           if ((size_t) (pfile->buffer->cur - from)
3182               < sizeof "intentional fallthru" - 1)
3183             return false;
3184           if (f == 'I' && memcmp (from + 1, "NTENTIONAL",
3185                                   sizeof "NTENTIONAL" - 1) == 0)
3186             all_upper = true;
3187           else if (memcmp (from + 1, "ntentional",
3188                            sizeof "ntentional" - 1))
3189             return false;
3190           from += sizeof "intentional" - 1;
3191           if (*from == ' ')
3192             {
3193               from++;
3194               if (all_upper && *from == 'f')
3195                 return false;
3196             }
3197           else if (all_upper)
3198             {
3199               if (memcmp (from, "LY F", sizeof "LY F" - 1))
3200                 return false;
3201               from += sizeof "LY " - 1;
3202             }
3203           else
3204             {
3205               if (memcmp (from, "ly ", sizeof "ly " - 1))
3206                 return false;
3207               from += sizeof "ly " - 1;
3208             }
3209           if (f == 'i' && *from == 'F')
3210             return false;
3211           f = *from;
3212         }
3213       if (f != 'F' && f != 'f')
3214         return false;
3215       if ((size_t) (pfile->buffer->cur - from) < sizeof "fallthru" - 1)
3216         return false;
3217       if (f == 'F' && memcmp (from + 1, "ALL", sizeof "ALL" - 1) == 0)
3218         all_upper = true;
3219       else if (all_upper)
3220         return false;
3221       else if (memcmp (from + 1, "all", sizeof "all" - 1))
3222         return false;
3223       from += sizeof "fall" - 1;
3224       if (*from == (all_upper ? 'S' : 's') && from[1] == ' ')
3225         from += 2;
3226       else if (*from == ' ' || *from == '-')
3227         from++;
3228       else if (*from != (all_upper ? 'T' : 't'))
3229         return false;
3230       if ((f == 'f' || *from != 'T') && (all_upper || *from != 't'))
3231         return false;
3232       if ((size_t) (pfile->buffer->cur - from) < sizeof "thru" - 1)
3233         return false;
3234       if (memcmp (from + 1, all_upper ? "HRU" : "hru", sizeof "hru" - 1))
3235         {
3236           if ((size_t) (pfile->buffer->cur - from) < sizeof "through" - 1)
3237             return false;
3238           if (memcmp (from + 1, all_upper ? "HROUGH" : "hrough",
3239                       sizeof "hrough" - 1))
3240             return false;
3241           from += sizeof "through" - 1;
3242         }
3243       else
3244         from += sizeof "thru" - 1;
3245       while (*from == ' ' || *from == '\t' || *from == '.' || *from == '!')
3246         from++;
3247       if (*from == '-')
3248         {
3249           from++;
3250           if (*comment_start == '*')
3251             {
3252               do
3253                 {
3254                   while (*from && *from != '*'
3255                          && *from != '\n' && *from != '\r')
3256                     from++;
3257                   if (*from != '*' || from[1] == '/')
3258                     break;
3259                   from++;
3260                 }
3261               while (1);
3262             }
3263           else
3264             while (*from && *from != '\n' && *from != '\r')
3265               from++;
3266         }
3267     }
3268   /* C block comment.  */
3269   if (*comment_start == '*')
3270     {
3271       if (*from != '*' || from[1] != '/')
3272         return false;
3273     }
3274   /* C++ line comment.  */
3275   else if (*from != '\n')
3276     return false;
3277
3278   return true;
3279 }
3280
3281 /* Allocate COUNT tokens for RUN.  */
3282 void
3283 _cpp_init_tokenrun (tokenrun *run, unsigned int count)
3284 {
3285   run->base = XNEWVEC (cpp_token, count);
3286   run->limit = run->base + count;
3287   run->next = NULL;
3288 }
3289
3290 /* Returns the next tokenrun, or creates one if there is none.  */
3291 static tokenrun *
3292 next_tokenrun (tokenrun *run)
3293 {
3294   if (run->next == NULL)
3295     {
3296       run->next = XNEW (tokenrun);
3297       run->next->prev = run;
3298       _cpp_init_tokenrun (run->next, 250);
3299     }
3300
3301   return run->next;
3302 }
3303
3304 /* Return the number of not yet processed token in a given
3305    context.  */
3306 int
3307 _cpp_remaining_tokens_num_in_context (cpp_context *context)
3308 {
3309   if (context->tokens_kind == TOKENS_KIND_DIRECT)
3310     return (LAST (context).token - FIRST (context).token);
3311   else if (context->tokens_kind == TOKENS_KIND_INDIRECT
3312            || context->tokens_kind == TOKENS_KIND_EXTENDED)
3313     return (LAST (context).ptoken - FIRST (context).ptoken);
3314   else
3315       abort ();
3316 }
3317
3318 /* Returns the token present at index INDEX in a given context.  If
3319    INDEX is zero, the next token to be processed is returned.  */
3320 static const cpp_token*
3321 _cpp_token_from_context_at (cpp_context *context, int index)
3322 {
3323   if (context->tokens_kind == TOKENS_KIND_DIRECT)
3324     return &(FIRST (context).token[index]);
3325   else if (context->tokens_kind == TOKENS_KIND_INDIRECT
3326            || context->tokens_kind == TOKENS_KIND_EXTENDED)
3327     return FIRST (context).ptoken[index];
3328  else
3329    abort ();
3330 }
3331
3332 /* Look ahead in the input stream.  */
3333 const cpp_token *
3334 cpp_peek_token (cpp_reader *pfile, int index)
3335 {
3336   cpp_context *context = pfile->context;
3337   const cpp_token *peektok;
3338   int count;
3339
3340   /* First, scan through any pending cpp_context objects.  */
3341   while (context->prev)
3342     {
3343       ptrdiff_t sz = _cpp_remaining_tokens_num_in_context (context);
3344
3345       if (index < (int) sz)
3346         return _cpp_token_from_context_at (context, index);
3347       index -= (int) sz;
3348       context = context->prev;
3349     }
3350
3351   /* We will have to read some new tokens after all (and do so
3352      without invalidating preceding tokens).  */
3353   count = index;
3354   pfile->keep_tokens++;
3355
3356   /* For peeked tokens temporarily disable line_change reporting,
3357      until the tokens are parsed for real.  */
3358   void (*line_change) (cpp_reader *, const cpp_token *, int)
3359     = pfile->cb.line_change;
3360   pfile->cb.line_change = NULL;
3361
3362   do
3363     {
3364       peektok = _cpp_lex_token (pfile);
3365       if (peektok->type == CPP_EOF)
3366         {
3367           index--;
3368           break;
3369         }
3370       else if (peektok->type == CPP_PRAGMA)
3371         {
3372           /* Don't peek past a pragma.  */
3373           if (peektok == &pfile->directive_result)
3374             /* Save the pragma in the buffer.  */
3375             *pfile->cur_token++ = *peektok;
3376           index--;
3377           break;
3378         }
3379     }
3380   while (index--);
3381
3382   _cpp_backup_tokens_direct (pfile, count - index);
3383   pfile->keep_tokens--;
3384   pfile->cb.line_change = line_change;
3385
3386   return peektok;
3387 }
3388
3389 /* Allocate a single token that is invalidated at the same time as the
3390    rest of the tokens on the line.  Has its line and col set to the
3391    same as the last lexed token, so that diagnostics appear in the
3392    right place.  */
3393 cpp_token *
3394 _cpp_temp_token (cpp_reader *pfile)
3395 {
3396   cpp_token *old, *result;
3397   ptrdiff_t sz = pfile->cur_run->limit - pfile->cur_token;
3398   ptrdiff_t la = (ptrdiff_t) pfile->lookaheads;
3399
3400   old = pfile->cur_token - 1;
3401   /* Any pre-existing lookaheads must not be clobbered.  */
3402   if (la)
3403     {
3404       if (sz <= la)
3405         {
3406           tokenrun *next = next_tokenrun (pfile->cur_run);
3407
3408           if (sz < la)
3409             memmove (next->base + 1, next->base,
3410                      (la - sz) * sizeof (cpp_token));
3411
3412           next->base[0] = pfile->cur_run->limit[-1];
3413         }
3414
3415       if (sz > 1)
3416         memmove (pfile->cur_token + 1, pfile->cur_token,
3417                  MIN (la, sz - 1) * sizeof (cpp_token));
3418     }
3419
3420   if (!sz && pfile->cur_token == pfile->cur_run->limit)
3421     {
3422       pfile->cur_run = next_tokenrun (pfile->cur_run);
3423       pfile->cur_token = pfile->cur_run->base;
3424     }
3425
3426   result = pfile->cur_token++;
3427   result->src_loc = old->src_loc;
3428   return result;
3429 }
3430
3431 /* We're at the beginning of a logical line (so not in
3432   directives-mode) and RESULT is a CPP_NAME with NODE_MODULE set.  See
3433   if we should enter deferred_pragma mode to tokenize the rest of the
3434   line as a module control-line.  */
3435
3436 static void
3437 cpp_maybe_module_directive (cpp_reader *pfile, cpp_token *result)
3438 {
3439   unsigned backup = 0; /* Tokens we peeked.  */
3440   cpp_hashnode *node = result->val.node.node;
3441   cpp_token *peek = result;
3442   cpp_token *keyword = peek;
3443   cpp_hashnode *(&n_modules)[spec_nodes::M_HWM][2] = pfile->spec_nodes.n_modules;
3444   int header_count = 0;
3445
3446   /* Make sure the incoming state is as we expect it.  This way we
3447      can restore it using constants.  */
3448   gcc_checking_assert (!pfile->state.in_deferred_pragma
3449                        && !pfile->state.skipping
3450                        && !pfile->state.parsing_args
3451                        && !pfile->state.angled_headers
3452                        && (pfile->state.save_comments
3453                            == !CPP_OPTION (pfile, discard_comments)));
3454
3455   /* Enter directives mode sufficiently for peeking.  We don't have
3456      to actually set in_directive.  */
3457   pfile->state.in_deferred_pragma = true;
3458
3459   /* These two fields are needed to process tokenization in deferred
3460      pragma mode.  They are not used outside deferred pragma mode or
3461      directives mode.  */
3462   pfile->state.pragma_allow_expansion = true;
3463   pfile->directive_line = result->src_loc;
3464
3465   /* Saving comments is incompatible with directives mode.   */
3466   pfile->state.save_comments = 0;
3467
3468   if (node == n_modules[spec_nodes::M_EXPORT][0])
3469     {
3470       peek = _cpp_lex_direct (pfile);
3471       keyword = peek;
3472       backup++;
3473       if (keyword->type != CPP_NAME)
3474         goto not_module;
3475       node = keyword->val.node.node;
3476       if (!(node->flags & NODE_MODULE))
3477         goto not_module;
3478     }
3479
3480   if (node == n_modules[spec_nodes::M__IMPORT][0])
3481     /* __import  */
3482     header_count = backup + 2 + 16;
3483   else if (node == n_modules[spec_nodes::M_IMPORT][0])
3484     /* import  */
3485     header_count = backup + 2 + (CPP_OPTION (pfile, preprocessed) ? 16 : 0);
3486   else if (node == n_modules[spec_nodes::M_MODULE][0])
3487     ; /* module  */
3488   else
3489     goto not_module;
3490
3491   /* We've seen [export] {module|import|__import}.  Check the next token.  */
3492   if (header_count)
3493     /* After '{,__}import' a header name may appear.  */
3494     pfile->state.angled_headers = true;
3495   peek = _cpp_lex_direct (pfile);
3496   backup++;
3497
3498   /* ... import followed by identifier, ':', '<' or
3499      header-name preprocessing tokens, or module
3500      followed by cpp-identifier, ':' or ';' preprocessing
3501      tokens.  C++ keywords are not yet relevant.  */
3502   if (peek->type == CPP_NAME
3503       || peek->type == CPP_COLON
3504       ||  (header_count
3505            ? (peek->type == CPP_LESS
3506               || (peek->type == CPP_STRING && peek->val.str.text[0] != 'R')
3507               || peek->type == CPP_HEADER_NAME)
3508            : peek->type == CPP_SEMICOLON))
3509     {
3510       pfile->state.pragma_allow_expansion = !CPP_OPTION (pfile, preprocessed);
3511       if (!pfile->state.pragma_allow_expansion)
3512         pfile->state.prevent_expansion++;
3513
3514       if (!header_count && linemap_included_from
3515           (LINEMAPS_LAST_ORDINARY_MAP (pfile->line_table)))
3516         cpp_error_with_line (pfile, CPP_DL_ERROR, keyword->src_loc, 0,
3517                              "module control-line cannot be in included file");
3518
3519       /* The first one or two tokens cannot be macro names.  */
3520       for (int ix = backup; ix--;)
3521         {
3522           cpp_token *tok = ix ? keyword : result;
3523           cpp_hashnode *node = tok->val.node.node;
3524
3525           /* Don't attempt to expand the token.  */
3526           tok->flags |= NO_EXPAND;
3527           if (_cpp_defined_macro_p (node)
3528               && _cpp_maybe_notify_macro_use (pfile, node, tok->src_loc)
3529               && !cpp_fun_like_macro_p (node))
3530             cpp_error_with_line (pfile, CPP_DL_ERROR, tok->src_loc, 0,
3531                                  "module control-line \"%s\" cannot be"
3532                                  " an object-like macro",
3533                                  NODE_NAME (node));
3534         }
3535
3536       /* Map to underbar variants.  */
3537       keyword->val.node.node = n_modules[header_count
3538                                          ? spec_nodes::M_IMPORT
3539                                          : spec_nodes::M_MODULE][1];
3540       if (backup != 1)
3541         result->val.node.node = n_modules[spec_nodes::M_EXPORT][1];
3542
3543       /* Maybe tell the tokenizer we expect a header-name down the
3544          road.  */
3545       pfile->state.directive_file_token = header_count;
3546     }
3547   else
3548     {
3549     not_module:
3550       /* Drop out of directive mode.  */
3551       /* We asserted save_comments had this value upon entry.  */
3552       pfile->state.save_comments
3553         = !CPP_OPTION (pfile, discard_comments);
3554       pfile->state.in_deferred_pragma = false;
3555       /* Do not let this remain on.  */
3556       pfile->state.angled_headers = false;
3557     }
3558
3559   /* In either case we want to backup the peeked tokens.  */
3560   if (backup)
3561     {
3562       /* If we saw EOL, we should drop it, because this isn't a module
3563          control-line after all.  */
3564       bool eol = peek->type == CPP_PRAGMA_EOL;
3565       if (!eol || backup > 1)
3566         {
3567           /* Put the peeked tokens back.  */
3568           _cpp_backup_tokens_direct (pfile, backup);
3569           /* But if the last one was an EOL, forget it.  */
3570           if (eol)
3571             pfile->lookaheads--;
3572         }
3573     }
3574 }
3575
3576 /* Lex a token into RESULT (external interface).  Takes care of issues
3577    like directive handling, token lookahead, multiple include
3578    optimization and skipping.  */
3579 const cpp_token *
3580 _cpp_lex_token (cpp_reader *pfile)
3581 {
3582   cpp_token *result;
3583
3584   for (;;)
3585     {
3586       if (pfile->cur_token == pfile->cur_run->limit)
3587         {
3588           pfile->cur_run = next_tokenrun (pfile->cur_run);
3589           pfile->cur_token = pfile->cur_run->base;
3590         }
3591       /* We assume that the current token is somewhere in the current
3592          run.  */
3593       if (pfile->cur_token < pfile->cur_run->base
3594           || pfile->cur_token >= pfile->cur_run->limit)
3595         abort ();
3596
3597       if (pfile->lookaheads)
3598         {
3599           pfile->lookaheads--;
3600           result = pfile->cur_token++;
3601         }
3602       else
3603         result = _cpp_lex_direct (pfile);
3604
3605       if (result->flags & BOL)
3606         {
3607           /* Is this a directive.  If _cpp_handle_directive returns
3608              false, it is an assembler #.  */
3609           if (result->type == CPP_HASH
3610               /* 6.10.3 p 11: Directives in a list of macro arguments
3611                  gives undefined behavior.  This implementation
3612                  handles the directive as normal.  */
3613               && pfile->state.parsing_args != 1)
3614             {
3615               if (_cpp_handle_directive (pfile, result->flags & PREV_WHITE))
3616                 {
3617                   if (pfile->directive_result.type == CPP_PADDING)
3618                     continue;
3619                   result = &pfile->directive_result;
3620                 }
3621             }
3622           else if (pfile->state.in_deferred_pragma)
3623             result = &pfile->directive_result;
3624           else if (result->type == CPP_NAME
3625                    && (result->val.node.node->flags & NODE_MODULE)
3626                    && !pfile->state.skipping
3627                    /* Unlike regular directives, we do not deal with
3628                       tokenizing module directives as macro arguments.
3629                       That's not permitted.  */
3630                    && !pfile->state.parsing_args)
3631             {
3632               /* P1857.  Before macro expansion, At start of logical
3633                  line ... */
3634               /* We don't have to consider lookaheads at this point.  */
3635               gcc_checking_assert (!pfile->lookaheads);
3636
3637               cpp_maybe_module_directive (pfile, result);
3638             }
3639
3640           if (pfile->cb.line_change && !pfile->state.skipping)
3641             pfile->cb.line_change (pfile, result, pfile->state.parsing_args);
3642         }
3643
3644       /* We don't skip tokens in directives.  */
3645       if (pfile->state.in_directive || pfile->state.in_deferred_pragma)
3646         break;
3647
3648       /* Outside a directive, invalidate controlling macros.  At file
3649          EOF, _cpp_lex_direct takes care of popping the buffer, so we never
3650          get here and MI optimization works.  */
3651       pfile->mi_valid = false;
3652
3653       if (!pfile->state.skipping || result->type == CPP_EOF)
3654         break;
3655     }
3656
3657   return result;
3658 }
3659
3660 /* Returns true if a fresh line has been loaded.  */
3661 bool
3662 _cpp_get_fresh_line (cpp_reader *pfile)
3663 {
3664   /* We can't get a new line until we leave the current directive.  */
3665   if (pfile->state.in_directive)
3666     return false;
3667
3668   for (;;)
3669     {
3670       cpp_buffer *buffer = pfile->buffer;
3671
3672       if (!buffer->need_line)
3673         return true;
3674
3675       if (buffer->next_line < buffer->rlimit)
3676         {
3677           _cpp_clean_line (pfile);
3678           return true;
3679         }
3680
3681       /* First, get out of parsing arguments state.  */
3682       if (pfile->state.parsing_args)
3683         return false;
3684
3685       /* End of buffer.  Non-empty files should end in a newline.  */
3686       if (buffer->buf != buffer->rlimit
3687           && buffer->next_line > buffer->rlimit
3688           && !buffer->from_stage3)
3689         {
3690           /* Clip to buffer size.  */
3691           buffer->next_line = buffer->rlimit;
3692         }
3693
3694       if (buffer->prev && !buffer->return_at_eof)
3695         _cpp_pop_buffer (pfile);
3696       else
3697         {
3698           /* End of translation.  Do not pop the buffer yet. Increment
3699              line number so that the EOF token is on a line of its own
3700              (_cpp_lex_direct doesn't increment in that case, because
3701              it's hard for it to distinguish this special case). */
3702           CPP_INCREMENT_LINE (pfile, 0);
3703           return false;
3704         }
3705     }
3706 }
3707
/* Classify a one- or two-character operator.  If the next unread
   character, *buffer->cur, equals CHAR, consume it and set
   result->type to THEN_TYPE; otherwise leave the input untouched and
   set result->type to ELSE_TYPE.  BUFFER and RESULT must be in scope
   where the macro is used.  Every argument is parenthesized so that
   an expression argument cannot re-associate with the surrounding
   operators.  */
#define IF_NEXT_IS(CHAR, THEN_TYPE, ELSE_TYPE)          \
  do                                                    \
    {                                                   \
      if (*buffer->cur == (CHAR))                       \
        {                                               \
          buffer->cur++;                                \
          result->type = (THEN_TYPE);                   \
        }                                               \
      else                                              \
        result->type = (ELSE_TYPE);                     \
    }                                                   \
  while (0)
3716
3717 /* Lex a token into pfile->cur_token, which is also incremented, to
3718    get diagnostics pointing to the correct location.
3719
3720    Does not handle issues such as token lookahead, multiple-include
3721    optimization, directives, skipping etc.  This function is only
3722    suitable for use by _cpp_lex_token, and in special cases like
3723    lex_expansion_token which doesn't care for any of these issues.
3724
3725    When meeting a newline, returns CPP_PRAGMA_EOL in a deferred pragma or
3726    CPP_EOF if parsing a directive, otherwise returns to the start of the
3727    token buffer if permissible.  Returns a pointer to the lexed token.  */
3728 cpp_token *
3729 _cpp_lex_direct (cpp_reader *pfile)
3730 {
3731   cppchar_t c;
3732   cpp_buffer *buffer;
3733   const unsigned char *comment_start;
3734   bool fallthrough_comment = false;  /* Set when fallthrough_comment_p accepts a skipped comment.  */
3735   cpp_token *result = pfile->cur_token++;  /* Take the slot at cur_token and advance past it.  */
3736
3737  fresh_line:  /* Also re-entered after each newline outside a deferred pragma.  */
3738   result->flags = 0;
3739   buffer = pfile->buffer;
3740   if (buffer->need_line)
3741     {
3742       if (pfile->state.in_deferred_pragma)
3743         {
3744           /* This can happen in cases like:
3745              #define loop(x) whatever
3746              #pragma omp loop
3747              where when trying to expand loop we need to peek
3748              next token after loop, but aren't still in_deferred_pragma
3749              mode but are in in_directive mode, so buffer->need_line
3750              is set, a CPP_EOF is peeked.  */
3751           result->type = CPP_PRAGMA_EOL;
3752           pfile->state.in_deferred_pragma = false;
3753           if (!pfile->state.pragma_allow_expansion)
3754             pfile->state.prevent_expansion--;
3755           return result;
3756         }
3757       if (!_cpp_get_fresh_line (pfile))
3758         {
3759           result->type = CPP_EOF;
3760           /* Not a real EOF in a directive or arg parsing -- we refuse
3761              to advance to the next file now, and will once we're out
3762              of those modes.  */
3763           if (!pfile->state.in_directive && !pfile->state.parsing_args)
3764             {
3765               /* Tell the compiler the line number of the EOF token.  */
3766               result->src_loc = pfile->line_table->highest_line;
3767               result->flags = BOL;
3768               /* Now pop the buffer that _cpp_get_fresh_line did not.  */
3769               _cpp_pop_buffer (pfile);
3770             }
3771           return result;
3772         }
3773       if (buffer != pfile->buffer)
3774         fallthrough_comment = false;
3775       if (!pfile->keep_tokens)  /* Restart at the first slot of base_run.  */
3776         {
3777           pfile->cur_run = &pfile->base_run;
3778           result = pfile->base_run.base;
3779           pfile->cur_token = result + 1;
3780         }
3781       result->flags = BOL;
3782       if (pfile->state.parsing_args == 2)
3783         result->flags |= PREV_WHITE;
3784     }
3785   buffer = pfile->buffer;
3786  update_tokens_line:  /* Re-entered after a comment that is not saved as a token.  */
3787   result->src_loc = pfile->line_table->highest_line;
3788
3789  skipped_white:  /* Re-entered after a run of whitespace.  */
3790   if (buffer->cur >= buffer->notes[buffer->cur_note].pos
3791       && !pfile->overlaid_buffer)
3792     {
3793       _cpp_process_line_notes (pfile, false);
3794       result->src_loc = pfile->line_table->highest_line;
3795     }
3796   c = *buffer->cur++;  /* Fetch the character to dispatch on and step past it.  */
3797
3798   if (pfile->forced_token_location)
3799     result->src_loc = pfile->forced_token_location;
3800   else
3801     result->src_loc = linemap_position_for_column (pfile->line_table,
3802                                           CPP_BUF_COLUMN (buffer, buffer->cur));
3803
3804   switch (c)
3805     {
3806     case ' ': case '\t': case '\f': case '\v': case '\0':
3807       result->flags |= PREV_WHITE;
3808       skip_whitespace (pfile, c);
3809       goto skipped_white;
3810
3811     case '\n':
3812       /* Increment the line, unless this is the last line ...  */
3813       if (buffer->cur < buffer->rlimit
3814           /* ... or this is a #include, (where _cpp_stack_file needs to
3815              unwind by one line) ...  */
3816           || (pfile->state.in_directive > 1
3817               /* ... except traditional-cpp increments this elsewhere.  */
3818               && !CPP_OPTION (pfile, traditional)))
3819         CPP_INCREMENT_LINE (pfile, 0);
3820       buffer->need_line = true;
3821       if (pfile->state.in_deferred_pragma)
3822         {
3823           /* Produce the PRAGMA_EOL on this line.  File reading
3824              ensures there is always a \n at end of the buffer, thus
3825              in a deferred pragma we always see CPP_PRAGMA_EOL before
3826              any CPP_EOF.  */
3827           result->type = CPP_PRAGMA_EOL;
3828           result->flags &= ~PREV_WHITE;
3829           pfile->state.in_deferred_pragma = false;
3830           if (!pfile->state.pragma_allow_expansion)
3831             pfile->state.prevent_expansion--;
3832           return result;
3833         }
3834       goto fresh_line;
3835
3836     case '0': case '1': case '2': case '3': case '4':
3837     case '5': case '6': case '7': case '8': case '9':
3838       {
3839         struct normalize_state nst = INITIAL_NORMALIZE_STATE;
3840         result->type = CPP_NUMBER;
3841         lex_number (pfile, &result->val.str, &nst);
3842         warn_about_normalization (pfile, result, &nst);
3843         break;
3844       }
3845
3846     case 'L':
3847     case 'u':
3848     case 'U':
3849     case 'R':
3850       /* 'L', 'u', 'U', 'u8' or 'R' may introduce wide characters,
3851          wide strings or raw strings.  */
3852       if (c == 'L' || CPP_OPTION (pfile, rliterals)
3853           || (c != 'R' && CPP_OPTION (pfile, uliterals)))
3854         {
3855           if ((*buffer->cur == '\'' && c != 'R')
3856               || *buffer->cur == '"'
3857               || (*buffer->cur == 'R'
3858                   && c != 'R'
3859                   && buffer->cur[1] == '"'
3860                   && CPP_OPTION (pfile, rliterals))
3861               || (*buffer->cur == '8'
3862                   && c == 'u'
3863                   && ((buffer->cur[1] == '"' || (buffer->cur[1] == '\''
3864                                 && CPP_OPTION (pfile, utf8_char_literals)))
3865                       || (buffer->cur[1] == 'R' && buffer->cur[2] == '"'
3866                           && CPP_OPTION (pfile, rliterals)))))
3867             {
3868               lex_string (pfile, result, buffer->cur - 1);
3869               break;
3870             }
3871         }
3872       /* Fall through.  */
3873
3874     case '_':
3875     case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
3876     case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
3877     case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
3878     case 's': case 't':           case 'v': case 'w': case 'x':  /* 'u' is handled above.  */
3879     case 'y': case 'z':
3880     case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
3881     case 'G': case 'H': case 'I': case 'J': case 'K':  /* 'L' is handled above.  */
3882     case 'M': case 'N': case 'O': case 'P': case 'Q':  /* 'R' is handled above.  */
3883     case 'S': case 'T':           case 'V': case 'W': case 'X':  /* 'U' is handled above.  */
3884     case 'Y': case 'Z':
3885       result->type = CPP_NAME;
3886       {
3887         struct normalize_state nst = INITIAL_NORMALIZE_STATE;
3888         result->val.node.node = lex_identifier (pfile, buffer->cur - 1, false,
3889                                                 &nst,
3890                                                 &result->val.node.spelling);
3891         warn_about_normalization (pfile, result, &nst);
3892       }
3893
3894       /* Convert named operators to their proper types.  */
3895       if (result->val.node.node->flags & NODE_OPERATOR)
3896         {
3897           result->flags |= NAMED_OP;
3898           result->type = (enum cpp_ttype) result->val.node.node->directive_index;
3899         }
3900
3901       /* Signal FALLTHROUGH comment followed by another token.  */
3902       if (fallthrough_comment)
3903         result->flags |= PREV_FALLTHROUGH;
3904       break;
3905
3906     case '\'':
3907     case '"':
3908       lex_string (pfile, result, buffer->cur - 1);
3909       break;
3910
3911     case '/':
3912       /* A potential block or line comment.  */
3913       comment_start = buffer->cur;
3914       c = *buffer->cur;
3915
3916       if (c == '*')
3917         {
3918           if (_cpp_skip_block_comment (pfile))
3919             cpp_error (pfile, CPP_DL_ERROR, "unterminated comment");
3920         }
3921       else if (c == '/' && ! CPP_OPTION (pfile, traditional))
3922         {
3923           /* Don't warn for system headers.  */
3924           if (_cpp_in_system_header (pfile))
3925             ;
3926           /* Warn about comments if pedantically GNUC89, and not
3927              in system headers.  */
3928           else if (CPP_OPTION (pfile, lang) == CLK_GNUC89
3929                    && CPP_PEDANTIC (pfile)
3930                    && ! buffer->warned_cplusplus_comments)
3931             {
3932               if (cpp_error (pfile, CPP_DL_PEDWARN,
3933                              "C++ style comments are not allowed in ISO C90"))
3934                 cpp_error (pfile, CPP_DL_NOTE,
3935                            "(this will be reported only once per input file)");
3936               buffer->warned_cplusplus_comments = 1;
3937             }
3938           /* Or if specifically desired via -Wc90-c99-compat.  */
3939           else if (CPP_OPTION (pfile, cpp_warn_c90_c99_compat) > 0
3940                    && ! CPP_OPTION (pfile, cplusplus)
3941                    && ! buffer->warned_cplusplus_comments)
3942             {
3943               if (cpp_error (pfile, CPP_DL_WARNING,
3944                              "C++ style comments are incompatible with C90"))
3945                 cpp_error (pfile, CPP_DL_NOTE,
3946                            "(this will be reported only once per input file)");
3947               buffer->warned_cplusplus_comments = 1;
3948             }
3949           /* In C89/C94, C++ style comments are forbidden.  */
3950           else if ((CPP_OPTION (pfile, lang) == CLK_STDC89
3951                     || CPP_OPTION (pfile, lang) == CLK_STDC94))
3952             {
3953               /* But don't be confused about valid code such as
3954                  - // immediately followed by *,
3955                  - // in a preprocessing directive,
3956                  - // in an #if 0 block.  */
3957               if (buffer->cur[1] == '*'
3958                   || pfile->state.in_directive
3959                   || pfile->state.skipping)
3960                 {
3961                   result->type = CPP_DIV;
3962                   break;
3963                 }
3964               else if (! buffer->warned_cplusplus_comments)
3965                 {
3966                   if (cpp_error (pfile, CPP_DL_ERROR,
3967                                  "C++ style comments are not allowed in "
3968                                  "ISO C90"))
3969                     cpp_error (pfile, CPP_DL_NOTE,
3970                                "(this will be reported only once per input "
3971                                "file)");
3972                   buffer->warned_cplusplus_comments = 1;
3973                 }
3974             }
3975           if (skip_line_comment (pfile) && CPP_OPTION (pfile, warn_comments))
3976             cpp_warning (pfile, CPP_W_COMMENTS, "multi-line comment");
3977         }
3978       else if (c == '=')
3979         {
3980           buffer->cur++;
3981           result->type = CPP_DIV_EQ;
3982           break;
3983         }
3984       else
3985         {
3986           result->type = CPP_DIV;
3987           break;
3988         }
3989
3990       if (fallthrough_comment_p (pfile, comment_start))
3991         fallthrough_comment = true;
3992
3993       if (pfile->cb.comment)
3994         {
3995           size_t len = pfile->buffer->cur - comment_start;
3996           pfile->cb.comment (pfile, result->src_loc, comment_start - 1,
3997                              len + 1);
3998         }
3999
4000       if (!pfile->state.save_comments)
4001         {
4002           result->flags |= PREV_WHITE;
4003           goto update_tokens_line;
4004         }
4005
4006       if (fallthrough_comment)
4007         result->flags |= PREV_FALLTHROUGH;
4008
4009       /* Save the comment as a token in its own right.  */
4010       save_comment (pfile, result, comment_start, c);
4011       break;
4012
4013     case '<':
4014       if (pfile->state.angled_headers)
4015         {
4016           lex_string (pfile, result, buffer->cur - 1);
4017           if (result->type != CPP_LESS)
4018             break;
4019         }
4020
4021       result->type = CPP_LESS;
4022       if (*buffer->cur == '=')
4023         {
4024           buffer->cur++, result->type = CPP_LESS_EQ;
4025           if (*buffer->cur == '>'
4026               && CPP_OPTION (pfile, cplusplus)
4027               && CPP_OPTION (pfile, lang) >= CLK_GNUCXX20)
4028             buffer->cur++, result->type = CPP_SPACESHIP;
4029         }
4030       else if (*buffer->cur == '<')
4031         {
4032           buffer->cur++;
4033           IF_NEXT_IS ('=', CPP_LSHIFT_EQ, CPP_LSHIFT);
4034         }
4035       else if (CPP_OPTION (pfile, digraphs))
4036         {
4037           if (*buffer->cur == ':')
4038             {
4039               /* C++11 [2.5/3 lex.pptoken], "Otherwise, if the next
4040                  three characters are <:: and the subsequent character
4041                  is neither : nor >, the < is treated as a preprocessor
4042                  token by itself".  */
4043               if (CPP_OPTION (pfile, cplusplus)
4044                   && CPP_OPTION (pfile, lang) != CLK_CXX98
4045                   && CPP_OPTION (pfile, lang) != CLK_GNUCXX
4046                   && buffer->cur[1] == ':'
4047                   && buffer->cur[2] != ':' && buffer->cur[2] != '>')
4048                 break;
4049
4050               buffer->cur++;
4051               result->flags |= DIGRAPH;
4052               result->type = CPP_OPEN_SQUARE;
4053             }
4054           else if (*buffer->cur == '%')
4055             {
4056               buffer->cur++;
4057               result->flags |= DIGRAPH;
4058               result->type = CPP_OPEN_BRACE;
4059             }
4060         }
4061       break;
4062
4063     case '>':
4064       result->type = CPP_GREATER;
4065       if (*buffer->cur == '=')
4066         buffer->cur++, result->type = CPP_GREATER_EQ;
4067       else if (*buffer->cur == '>')
4068         {
4069           buffer->cur++;
4070           IF_NEXT_IS ('=', CPP_RSHIFT_EQ, CPP_RSHIFT);
4071         }
4072       break;
4073
4074     case '%':
4075       result->type = CPP_MOD;
4076       if (*buffer->cur == '=')
4077         buffer->cur++, result->type = CPP_MOD_EQ;
4078       else if (CPP_OPTION (pfile, digraphs))
4079         {
4080           if (*buffer->cur == ':')
4081             {
4082               buffer->cur++;
4083               result->flags |= DIGRAPH;
4084               result->type = CPP_HASH;
4085               if (*buffer->cur == '%' && buffer->cur[1] == ':')
4086                 buffer->cur += 2, result->type = CPP_PASTE, result->val.token_no = 0;
4087             }
4088           else if (*buffer->cur == '>')
4089             {
4090               buffer->cur++;
4091               result->flags |= DIGRAPH;
4092               result->type = CPP_CLOSE_BRACE;
4093             }
4094         }
4095       break;
4096
4097     case '.':
4098       result->type = CPP_DOT;
4099       if (ISDIGIT (*buffer->cur))
4100         {
4101           struct normalize_state nst = INITIAL_NORMALIZE_STATE;
4102           result->type = CPP_NUMBER;
4103           lex_number (pfile, &result->val.str, &nst);
4104           warn_about_normalization (pfile, result, &nst);
4105         }
4106       else if (*buffer->cur == '.' && buffer->cur[1] == '.')
4107         buffer->cur += 2, result->type = CPP_ELLIPSIS;
4108       else if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
4109         buffer->cur++, result->type = CPP_DOT_STAR;
4110       break;
4111
4112     case '+':
4113       result->type = CPP_PLUS;
4114       if (*buffer->cur == '+')
4115         buffer->cur++, result->type = CPP_PLUS_PLUS;
4116       else if (*buffer->cur == '=')
4117         buffer->cur++, result->type = CPP_PLUS_EQ;
4118       break;
4119
4120     case '-':
4121       result->type = CPP_MINUS;
4122       if (*buffer->cur == '>')
4123         {
4124           buffer->cur++;
4125           result->type = CPP_DEREF;
4126           if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
4127             buffer->cur++, result->type = CPP_DEREF_STAR;
4128         }
4129       else if (*buffer->cur == '-')
4130         buffer->cur++, result->type = CPP_MINUS_MINUS;
4131       else if (*buffer->cur == '=')
4132         buffer->cur++, result->type = CPP_MINUS_EQ;
4133       break;
4134
4135     case '&':
4136       result->type = CPP_AND;
4137       if (*buffer->cur == '&')
4138         buffer->cur++, result->type = CPP_AND_AND;
4139       else if (*buffer->cur == '=')
4140         buffer->cur++, result->type = CPP_AND_EQ;
4141       break;
4142
4143     case '|':
4144       result->type = CPP_OR;
4145       if (*buffer->cur == '|')
4146         buffer->cur++, result->type = CPP_OR_OR;
4147       else if (*buffer->cur == '=')
4148         buffer->cur++, result->type = CPP_OR_EQ;
4149       break;
4150
4151     case ':':
4152       result->type = CPP_COLON;
4153       if (*buffer->cur == ':' && CPP_OPTION (pfile, scope))
4154         buffer->cur++, result->type = CPP_SCOPE;
4155       else if (*buffer->cur == '>' && CPP_OPTION (pfile, digraphs))
4156         {
4157           buffer->cur++;
4158           result->flags |= DIGRAPH;
4159           result->type = CPP_CLOSE_SQUARE;
4160         }
4161       break;
4162
4163     case '*': IF_NEXT_IS ('=', CPP_MULT_EQ, CPP_MULT); break;
4164     case '=': IF_NEXT_IS ('=', CPP_EQ_EQ, CPP_EQ); break;
4165     case '!': IF_NEXT_IS ('=', CPP_NOT_EQ, CPP_NOT); break;
4166     case '^': IF_NEXT_IS ('=', CPP_XOR_EQ, CPP_XOR); break;
4167     case '#': IF_NEXT_IS ('#', CPP_PASTE, CPP_HASH); result->val.token_no = 0; break;
4168
4169     case '?': result->type = CPP_QUERY; break;
4170     case '~': result->type = CPP_COMPL; break;
4171     case ',': result->type = CPP_COMMA; break;
4172     case '(': result->type = CPP_OPEN_PAREN; break;
4173     case ')': result->type = CPP_CLOSE_PAREN; break;
4174     case '[': result->type = CPP_OPEN_SQUARE; break;
4175     case ']': result->type = CPP_CLOSE_SQUARE; break;
4176     case '{': result->type = CPP_OPEN_BRACE; break;
4177     case '}': result->type = CPP_CLOSE_BRACE; break;
4178     case ';': result->type = CPP_SEMICOLON; break;
4179
4180       /* @ is a punctuator in Objective-C.  */
4181     case '@': result->type = CPP_ATSIGN; break;
4182
4183     default:
4184       {
4185         const uchar *base = --buffer->cur;  /* Back up to the character just read.  */
4186         static int no_warn_cnt;  /* Static: invalid-UTF-8 warnings still to be suppressed, kept across calls.  */
4187
4188         /* Check for an extended identifier ($ or UCN or UTF-8).  */
4189         struct normalize_state nst = INITIAL_NORMALIZE_STATE;
4190         if (forms_identifier_p (pfile, true, &nst))
4191           {
4192             result->type = CPP_NAME;
4193             result->val.node.node = lex_identifier (pfile, base, true, &nst,
4194                                                     &result->val.node.spelling);
4195             warn_about_normalization (pfile, result, &nst);
4196             break;
4197           }
4198
4199         /* Otherwise this will form a CPP_OTHER token.  Parse valid UTF-8 as a
4200            single token.  */
4201         buffer->cur++;
4202         if (c >= utf8_signifier)
4203           {
4204             const uchar *pstr = base;
4205             cppchar_t s;
4206             if (_cpp_valid_utf8 (pfile, &pstr, buffer->rlimit, 0, NULL, &s))
4207               {
4208                 if (s > UCS_LIMIT && CPP_OPTION (pfile, cpp_warn_invalid_utf8))
4209                   {
4210                     buffer->cur = base;
4211                     _cpp_warn_invalid_utf8 (pfile);
4212                   }
4213                 buffer->cur = pstr;
4214               }
4215             else if (CPP_OPTION (pfile, cpp_warn_invalid_utf8))
4216               {
4217                 buffer->cur = base;
4218                 const uchar *end = _cpp_warn_invalid_utf8 (pfile);
4219                 buffer->cur = base + 1;
4220                 no_warn_cnt = end - buffer->cur;
4221               }
4222           }
4223         else if (c >= utf8_continuation
4224                  && CPP_OPTION (pfile, cpp_warn_invalid_utf8))
4225           {
4226             if (no_warn_cnt)
4227               --no_warn_cnt;
4228             else
4229               {
4230                 buffer->cur = base;
4231                 _cpp_warn_invalid_utf8 (pfile);
4232                 buffer->cur = base + 1;
4233               }
4234           }
4235         create_literal (pfile, result, base, buffer->cur - base, CPP_OTHER);
4236         break;
4237       }
4238
4239     }
4240
4241   /* Potentially convert the location of the token to a range.  */
4242   if (result->src_loc >= RESERVED_LOCATION_COUNT
4243       && result->type != CPP_EOF)
4244     {
4245       /* Ensure that any line notes are processed, so that we have the
4246          correct physical line/column for the end-point of the token even
4247          when a logical line is split via one or more backslashes.  */
4248       if (buffer->cur >= buffer->notes[buffer->cur_note].pos
4249           && !pfile->overlaid_buffer)
4250         _cpp_process_line_notes (pfile, false);
4251
4252       source_range tok_range;
4253       tok_range.m_start = result->src_loc;
4254       tok_range.m_finish
4255         = linemap_position_for_column (pfile->line_table,
4256                                        CPP_BUF_COLUMN (buffer, buffer->cur));
4257
4258       result->src_loc = COMBINE_LOCATION_DATA (pfile->line_table,
4259                                                result->src_loc,
4260                                                tok_range, NULL, 0);
4261     }
4262
4263   return result;
4264 }
4265
4266 /* An upper bound on the number of bytes needed to spell TOKEN.
4267    Does not include preceding whitespace.  */
4268 unsigned int
4269 cpp_token_len (const cpp_token *token)
4270 {
4271   unsigned int len;
4272
4273   switch (TOKEN_SPELL (token))
4274     {
4275     default:            len = 6;                                break;
4276     case SPELL_LITERAL: len = token->val.str.len;               break;
4277     case SPELL_IDENT:   len = NODE_LEN (token->val.node.node) * 10;     break;
4278     }
4279
4280   return len;
4281 }
4282
/* Decode the UTF-8 sequence starting at NAME and store it in BUFFER
   as a \U escape: a backslash, 'U' and eight lowercase hexadecimal
   digits, ten bytes in all and with no terminating NUL.  Aborts if a
   byte after the first is not of the form 10xxxxxx.  Returns the
   number of bytes of NAME that were consumed.  */

static size_t
utf8_to_ucn (unsigned char *buffer, const unsigned char *name)
{
  static const char hex_digits[] = "0123456789abcdef";
  const unsigned char lead = *name;
  unsigned long code_point;
  int seq_len = 0;

  /* The number of leading one bits in the first byte is the length
     of the whole sequence.  */
  for (unsigned probe = lead; probe & 0x80; probe <<= 1)
    seq_len++;

  /* Payload bits of the first byte, then six more from each
     following byte.  */
  code_point = lead & (0x7F >> seq_len);
  for (int k = 1; k < seq_len; k++)
    {
      const unsigned char trail = name[k];

      /* Ill-formed UTF-8.  */
      if ((trail & 0xC0) != 0x80)
        abort ();

      code_point = (code_point << 6) | (trail & 0x3F);
    }

  buffer[0] = '\\';
  buffer[1] = 'U';

  /* Fill the eight digits from the least significant one backwards.  */
  for (int pos = 9; pos >= 2; pos--)
    {
      buffer[pos] = hex_digits[code_point & 0xF];
      code_point >>= 4;
    }

  return seq_len;
}
4316
4317 /* Given a token TYPE corresponding to a digraph, return a pointer to
4318    the spelling of the digraph.  */
4319 static const unsigned char *
4320 cpp_digraph2name (enum cpp_ttype type)
4321 {
4322   return digraph_spellings[(int) type - (int) CPP_FIRST_DIGRAPH];
4323 }
4324
4325 /* Write the spelling of an identifier IDENT, using UCNs, to BUFFER.
4326    The buffer must already contain the enough space to hold the
4327    token's spelling.  Returns a pointer to the character after the
4328    last character written.  */
4329 unsigned char *
4330 _cpp_spell_ident_ucns (unsigned char *buffer, cpp_hashnode *ident)
4331 {
4332   size_t i;
4333   const unsigned char *name = NODE_NAME (ident);
4334
4335   for (i = 0; i < NODE_LEN (ident); i++)
4336     if (name[i] & ~0x7F)
4337       {
4338         i += utf8_to_ucn (buffer, name + i) - 1;
4339         buffer += 10;
4340       }
4341     else
4342       *buffer++ = name[i];
4343
4344   return buffer;
4345 }
4346
4347 /* Write the spelling of a token TOKEN to BUFFER.  The buffer must
4348    already contain the enough space to hold the token's spelling.
4349    Returns a pointer to the character after the last character written.
4350    FORSTRING is true if this is to be the spelling after translation
4351    phase 1 (with the original spelling of extended identifiers), false
4352    if extended identifiers should always be written using UCNs (there is
4353    no option for always writing them in the internal UTF-8 form).
4354    FIXME: Would be nice if we didn't need the PFILE argument.  */
4355 unsigned char *
4356 cpp_spell_token (cpp_reader *pfile, const cpp_token *token,
4357                  unsigned char *buffer, bool forstring)
4358 {
4359   switch (TOKEN_SPELL (token))
4360     {
4361     case SPELL_OPERATOR:
4362       {
4363         const unsigned char *spelling;
4364         unsigned char c;
4365
4366         if (token->flags & DIGRAPH)
4367           spelling = cpp_digraph2name (token->type);
4368         else if (token->flags & NAMED_OP)
4369           goto spell_ident;
4370         else
4371           spelling = TOKEN_NAME (token);
4372
4373         while ((c = *spelling++) != '\0')
4374           *buffer++ = c;
4375       }
4376       break;
4377
4378     spell_ident:
4379     case SPELL_IDENT:
4380       if (forstring)
4381         {
4382           memcpy (buffer, NODE_NAME (token->val.node.spelling),
4383                   NODE_LEN (token->val.node.spelling));
4384           buffer += NODE_LEN (token->val.node.spelling);
4385         }
4386       else
4387         buffer = _cpp_spell_ident_ucns (buffer, token->val.node.node);
4388       break;
4389
4390     case SPELL_LITERAL:
4391       memcpy (buffer, token->val.str.text, token->val.str.len);
4392       buffer += token->val.str.len;
4393       break;
4394
4395     case SPELL_NONE:
4396       cpp_error (pfile, CPP_DL_ICE,
4397                  "unspellable token %s", TOKEN_NAME (token));
4398       break;
4399     }
4400
4401   return buffer;
4402 }
4403
4404 /* Returns TOKEN spelt as a null-terminated string.  The string is
4405    freed when the reader is destroyed.  Useful for diagnostics.  */
4406 unsigned char *
4407 cpp_token_as_text (cpp_reader *pfile, const cpp_token *token)
4408 {
4409   unsigned int len = cpp_token_len (token) + 1;
4410   unsigned char *start = _cpp_unaligned_alloc (pfile, len), *end;
4411
4412   end = cpp_spell_token (pfile, token, start, false);
4413   end[0] = '\0';
4414
4415   return start;
4416 }
4417
4418 /* Returns a pointer to a string which spells the token defined by
4419    TYPE and FLAGS.  Used by C front ends, which really should move to
4420    using cpp_token_as_text.  */
4421 const char *
4422 cpp_type2name (enum cpp_ttype type, unsigned char flags)
4423 {
4424   if (flags & DIGRAPH)
4425     return (const char *) cpp_digraph2name (type);
4426   else if (flags & NAMED_OP)
4427     return cpp_named_operator2name (type);
4428
4429   return (const char *) token_spellings[type].name;
4430 }
4431
4432 /* Writes the spelling of token to FP, without any preceding space.
4433    Separated from cpp_spell_token for efficiency - to avoid stdio
4434    double-buffering.  */
4435 void
4436 cpp_output_token (const cpp_token *token, FILE *fp)
4437 {
4438   switch (TOKEN_SPELL (token))
4439     {
4440     case SPELL_OPERATOR:
4441       {
4442         const unsigned char *spelling;
4443         int c;
4444
4445         if (token->flags & DIGRAPH)
4446           spelling = cpp_digraph2name (token->type);
4447         else if (token->flags & NAMED_OP)
4448           goto spell_ident;
4449         else
4450           spelling = TOKEN_NAME (token);
4451
4452         c = *spelling;
4453         do
4454           putc (c, fp);
4455         while ((c = *++spelling) != '\0');
4456       }
4457       break;
4458
4459     spell_ident:
4460     case SPELL_IDENT:
4461       {
4462         size_t i;
4463         const unsigned char * name = NODE_NAME (token->val.node.node);
4464
4465         for (i = 0; i < NODE_LEN (token->val.node.node); i++)
4466           if (name[i] & ~0x7F)
4467             {
4468               unsigned char buffer[10];
4469               i += utf8_to_ucn (buffer, name + i) - 1;
4470               fwrite (buffer, 1, 10, fp);
4471             }
4472           else
4473             fputc (NODE_NAME (token->val.node.node)[i], fp);
4474       }
4475       break;
4476
4477     case SPELL_LITERAL:
4478       if (token->type == CPP_HEADER_NAME)
4479         fputc ('"', fp);
4480       fwrite (token->val.str.text, 1, token->val.str.len, fp);
4481       if (token->type == CPP_HEADER_NAME)
4482         fputc ('"', fp);
4483       break;
4484
4485     case SPELL_NONE:
4486       /* An error, most probably.  */
4487       break;
4488     }
4489 }
4490
4491 /* Compare two tokens.  */
4492 int
4493 _cpp_equiv_tokens (const cpp_token *a, const cpp_token *b)
4494 {
4495   if (a->type == b->type && a->flags == b->flags)
4496     switch (TOKEN_SPELL (a))
4497       {
4498       default:                  /* Keep compiler happy.  */
4499       case SPELL_OPERATOR:
4500         /* token_no is used to track where multiple consecutive ##
4501            tokens were originally located.  */
4502         return (a->type != CPP_PASTE || a->val.token_no == b->val.token_no);
4503       case SPELL_NONE:
4504         return (a->type != CPP_MACRO_ARG
4505                 || (a->val.macro_arg.arg_no == b->val.macro_arg.arg_no
4506                     && a->val.macro_arg.spelling == b->val.macro_arg.spelling));
4507       case SPELL_IDENT:
4508         return (a->val.node.node == b->val.node.node
4509                 && a->val.node.spelling == b->val.node.spelling);
4510       case SPELL_LITERAL:
4511         return (a->val.str.len == b->val.str.len
4512                 && !memcmp (a->val.str.text, b->val.str.text,
4513                             a->val.str.len));
4514       }
4515
4516   return 0;
4517 }
4518
4519 /* Returns nonzero if a space should be inserted to avoid an
4520    accidental token paste for output.  For simplicity, it is
4521    conservative, and occasionally advises a space where one is not
4522    needed, e.g. "." and ".2".  */
4523 int
4524 cpp_avoid_paste (cpp_reader *pfile, const cpp_token *token1,
4525                  const cpp_token *token2)
4526 {
4527   enum cpp_ttype a = token1->type, b = token2->type;
4528   cppchar_t c;
4529
4530   if (token1->flags & NAMED_OP)
4531     a = CPP_NAME;
4532   if (token2->flags & NAMED_OP)
4533     b = CPP_NAME;
4534
4535   c = EOF;
4536   if (token2->flags & DIGRAPH)
4537     c = digraph_spellings[(int) b - (int) CPP_FIRST_DIGRAPH][0];
4538   else if (token_spellings[b].category == SPELL_OPERATOR)
4539     c = token_spellings[b].name[0];
4540
4541   /* Quickly get everything that can paste with an '='.  */
4542   if ((int) a <= (int) CPP_LAST_EQ && c == '=')
4543     return 1;
4544
4545   switch (a)
4546     {
4547     case CPP_GREATER:   return c == '>';
4548     case CPP_LESS:      return c == '<' || c == '%' || c == ':';
4549     case CPP_PLUS:      return c == '+';
4550     case CPP_MINUS:     return c == '-' || c == '>';
4551     case CPP_DIV:       return c == '/' || c == '*'; /* Comments.  */
4552     case CPP_MOD:       return c == ':' || c == '>';
4553     case CPP_AND:       return c == '&';
4554     case CPP_OR:        return c == '|';
4555     case CPP_COLON:     return c == ':' || c == '>';
4556     case CPP_DEREF:     return c == '*';
4557     case CPP_DOT:       return c == '.' || c == '%' || b == CPP_NUMBER;
4558     case CPP_HASH:      return c == '#' || c == '%'; /* Digraph form.  */
4559     case CPP_PRAGMA:
4560     case CPP_NAME:      return ((b == CPP_NUMBER
4561                                  && name_p (pfile, &token2->val.str))
4562                                 || b == CPP_NAME
4563                                 || b == CPP_CHAR || b == CPP_STRING); /* L */
4564     case CPP_NUMBER:    return (b == CPP_NUMBER || b == CPP_NAME
4565                                 || b == CPP_CHAR
4566                                 || c == '.' || c == '+' || c == '-');
4567                                       /* UCNs */
4568     case CPP_OTHER:     return ((token1->val.str.text[0] == '\\'
4569                                  && b == CPP_NAME)
4570                                 || (CPP_OPTION (pfile, objc)
4571                                     && token1->val.str.text[0] == '@'
4572                                     && (b == CPP_NAME || b == CPP_STRING)));
4573     case CPP_LESS_EQ:   return c == '>';
4574     case CPP_STRING:
4575     case CPP_WSTRING:
4576     case CPP_UTF8STRING:
4577     case CPP_STRING16:
4578     case CPP_STRING32:  return (CPP_OPTION (pfile, user_literals)
4579                                 && (b == CPP_NAME
4580                                     || (TOKEN_SPELL (token2) == SPELL_LITERAL
4581                                         && ISIDST (token2->val.str.text[0]))));
4582
4583     default:            break;
4584     }
4585
4586   return 0;
4587 }
4588
4589 /* Output all the remaining tokens on the current line, and a newline
4590    character, to FP.  Leading whitespace is removed.  If there are
4591    macros, special token padding is not performed.  */
4592 void
4593 cpp_output_line (cpp_reader *pfile, FILE *fp)
4594 {
4595   const cpp_token *token;
4596
4597   token = cpp_get_token (pfile);
4598   while (token->type != CPP_EOF)
4599     {
4600       cpp_output_token (token, fp);
4601       token = cpp_get_token (pfile);
4602       if (token->flags & PREV_WHITE)
4603         putc (' ', fp);
4604     }
4605
4606   putc ('\n', fp);
4607 }
4608
4609 /* Return a string representation of all the remaining tokens on the
4610    current line.  The result is allocated using xmalloc and must be
4611    freed by the caller.  */
4612 unsigned char *
4613 cpp_output_line_to_string (cpp_reader *pfile, const unsigned char *dir_name)
4614 {
4615   const cpp_token *token;
4616   unsigned int out = dir_name ? ustrlen (dir_name) : 0;
4617   unsigned int alloced = 120 + out;
4618   unsigned char *result = (unsigned char *) xmalloc (alloced);
4619
4620   /* If DIR_NAME is empty, there are no initial contents.  */
4621   if (dir_name)
4622     {
4623       sprintf ((char *) result, "#%s ", dir_name);
4624       out += 2;
4625     }
4626
4627   token = cpp_get_token (pfile);
4628   while (token->type != CPP_EOF)
4629     {
4630       unsigned char *last;
4631       /* Include room for a possible space and the terminating nul.  */
4632       unsigned int len = cpp_token_len (token) + 2;
4633
4634       if (out + len > alloced)
4635         {
4636           alloced *= 2;
4637           if (out + len > alloced)
4638             alloced = out + len;
4639           result = (unsigned char *) xrealloc (result, alloced);
4640         }
4641
4642       last = cpp_spell_token (pfile, token, &result[out], 0);
4643       out = last - result;
4644
4645       token = cpp_get_token (pfile);
4646       if (token->flags & PREV_WHITE)
4647         result[out++] = ' ';
4648     }
4649
4650   result[out] = '\0';
4651   return result;
4652 }
4653
/* Memory buffers.  Changing these three constants can have a dramatic
   effect on performance.  The values here are reasonable defaults,
   but might be tuned.  If you adjust them, be sure to test across a
   range of uses of cpplib, including heavy nested function-like macro
   expansion.  Also check the change in peak memory usage (NJAMD is a
   good tool for this).

   MIN_BUFF_SIZE is the smallest buffer new_buff will create.
   BUFF_SIZE_UPPER_BOUND (MIN_SIZE) is the largest free buffer that
   _cpp_get_buff will hand out for a request of MIN_SIZE bytes.
   EXTENDED_BUFF_SIZE (BUFF, MIN_EXTRA) is the size requested when BUFF
   is extended: MIN_EXTRA plus twice the room left in BUFF.  Every
   macro argument is parenthesized so that an argument which is itself
   an expression is evaluated as a unit.  */
#define MIN_BUFF_SIZE 8000
#define BUFF_SIZE_UPPER_BOUND(MIN_SIZE) (MIN_BUFF_SIZE + (MIN_SIZE) * 3 / 2)
#define EXTENDED_BUFF_SIZE(BUFF, MIN_EXTRA) \
        ((MIN_EXTRA) + ((BUFF)->limit - (BUFF)->cur) * 2)

#if MIN_BUFF_SIZE > BUFF_SIZE_UPPER_BOUND (0)
  #error BUFF_SIZE_UPPER_BOUND must be at least as large as MIN_BUFF_SIZE!
#endif
4668
4669 /* Create a new allocation buffer.  Place the control block at the end
4670    of the buffer, so that buffer overflows will cause immediate chaos.  */
4671 static _cpp_buff *
4672 new_buff (size_t len)
4673 {
4674   _cpp_buff *result;
4675   unsigned char *base;
4676
4677   if (len < MIN_BUFF_SIZE)
4678     len = MIN_BUFF_SIZE;
4679   len = CPP_ALIGN (len);
4680
4681 #ifdef ENABLE_VALGRIND_ANNOTATIONS
4682   /* Valgrind warns about uses of interior pointers, so put _cpp_buff
4683      struct first.  */
4684   size_t slen = CPP_ALIGN2 (sizeof (_cpp_buff), 2 * DEFAULT_ALIGNMENT);
4685   base = XNEWVEC (unsigned char, len + slen);
4686   result = (_cpp_buff *) base;
4687   base += slen;
4688 #else
4689   base = XNEWVEC (unsigned char, len + sizeof (_cpp_buff));
4690   result = (_cpp_buff *) (base + len);
4691 #endif
4692   result->base = base;
4693   result->cur = base;
4694   result->limit = base + len;
4695   result->next = NULL;
4696   return result;
4697 }
4698
4699 /* Place a chain of unwanted allocation buffers on the free list.  */
4700 void
4701 _cpp_release_buff (cpp_reader *pfile, _cpp_buff *buff)
4702 {
4703   _cpp_buff *end = buff;
4704
4705   while (end->next)
4706     end = end->next;
4707   end->next = pfile->free_buffs;
4708   pfile->free_buffs = buff;
4709 }
4710
4711 /* Return a free buffer of size at least MIN_SIZE.  */
4712 _cpp_buff *
4713 _cpp_get_buff (cpp_reader *pfile, size_t min_size)
4714 {
4715   _cpp_buff *result, **p;
4716
4717   for (p = &pfile->free_buffs;; p = &(*p)->next)
4718     {
4719       size_t size;
4720
4721       if (*p == NULL)
4722         return new_buff (min_size);
4723       result = *p;
4724       size = result->limit - result->base;
4725       /* Return a buffer that's big enough, but don't waste one that's
4726          way too big.  */
4727       if (size >= min_size && size <= BUFF_SIZE_UPPER_BOUND (min_size))
4728         break;
4729     }
4730
4731   *p = result->next;
4732   result->next = NULL;
4733   result->cur = result->base;
4734   return result;
4735 }
4736
4737 /* Creates a new buffer with enough space to hold the uncommitted
4738    remaining bytes of BUFF, and at least MIN_EXTRA more bytes.  Copies
4739    the excess bytes to the new buffer.  Chains the new buffer after
4740    BUFF, and returns the new buffer.  */
4741 _cpp_buff *
4742 _cpp_append_extend_buff (cpp_reader *pfile, _cpp_buff *buff, size_t min_extra)
4743 {
4744   size_t size = EXTENDED_BUFF_SIZE (buff, min_extra);
4745   _cpp_buff *new_buff = _cpp_get_buff (pfile, size);
4746
4747   buff->next = new_buff;
4748   memcpy (new_buff->base, buff->cur, BUFF_ROOM (buff));
4749   return new_buff;
4750 }
4751
4752 /* Creates a new buffer with enough space to hold the uncommitted
4753    remaining bytes of the buffer pointed to by BUFF, and at least
4754    MIN_EXTRA more bytes.  Copies the excess bytes to the new buffer.
4755    Chains the new buffer before the buffer pointed to by BUFF, and
4756    updates the pointer to point to the new buffer.  */
4757 void
4758 _cpp_extend_buff (cpp_reader *pfile, _cpp_buff **pbuff, size_t min_extra)
4759 {
4760   _cpp_buff *new_buff, *old_buff = *pbuff;
4761   size_t size = EXTENDED_BUFF_SIZE (old_buff, min_extra);
4762
4763   new_buff = _cpp_get_buff (pfile, size);
4764   memcpy (new_buff->base, old_buff->cur, BUFF_ROOM (old_buff));
4765   new_buff->next = old_buff;
4766   *pbuff = new_buff;
4767 }
4768
4769 /* Free a chain of buffers starting at BUFF.  */
4770 void
4771 _cpp_free_buff (_cpp_buff *buff)
4772 {
4773   _cpp_buff *next;
4774
4775   for (; buff; buff = next)
4776     {
4777       next = buff->next;
4778 #ifdef ENABLE_VALGRIND_ANNOTATIONS
4779       free (buff);
4780 #else
4781       free (buff->base);
4782 #endif
4783     }
4784 }
4785
4786 /* Allocate permanent, unaligned storage of length LEN.  */
4787 unsigned char *
4788 _cpp_unaligned_alloc (cpp_reader *pfile, size_t len)
4789 {
4790   _cpp_buff *buff = pfile->u_buff;
4791   unsigned char *result = buff->cur;
4792
4793   if (len > (size_t) (buff->limit - result))
4794     {
4795       buff = _cpp_get_buff (pfile, len);
4796       buff->next = pfile->u_buff;
4797       pfile->u_buff = buff;
4798       result = buff->cur;
4799     }
4800
4801   buff->cur = result + len;
4802   return result;
4803 }
4804
4805 /* Allocate permanent, unaligned storage of length LEN from a_buff.
4806    That buffer is used for growing allocations when saving macro
4807    replacement lists in a #define, and when parsing an answer to an
4808    assertion in #assert, #unassert or #if (and therefore possibly
4809    whilst expanding macros).  It therefore must not be used by any
4810    code that they might call: specifically the lexer and the guts of
4811    the macro expander.
4812
4813    All existing other uses clearly fit this restriction: storing
4814    registered pragmas during initialization.  */
4815 unsigned char *
4816 _cpp_aligned_alloc (cpp_reader *pfile, size_t len)
4817 {
4818   _cpp_buff *buff = pfile->a_buff;
4819   unsigned char *result = buff->cur;
4820
4821   if (len > (size_t) (buff->limit - result))
4822     {
4823       buff = _cpp_get_buff (pfile, len);
4824       buff->next = pfile->a_buff;
4825       pfile->a_buff = buff;
4826       result = buff->cur;
4827     }
4828
4829   buff->cur = result + len;
4830   return result;
4831 }
4832
4833 /* Commit or allocate storage from a buffer.  */
4834
4835 void *
4836 _cpp_commit_buff (cpp_reader *pfile, size_t size)
4837 {
4838   void *ptr = BUFF_FRONT (pfile->a_buff);
4839
4840   if (pfile->hash_table->alloc_subobject)
4841     {
4842       void *copy = pfile->hash_table->alloc_subobject (size);
4843       memcpy (copy, ptr, size);
4844       ptr = copy;
4845     }
4846   else
4847     BUFF_FRONT (pfile->a_buff) += size;
4848
4849   return ptr;
4850 }
4851
4852 /* Say which field of TOK is in use.  */
4853
4854 enum cpp_token_fld_kind
4855 cpp_token_val_index (const cpp_token *tok)
4856 {
4857   switch (TOKEN_SPELL (tok))
4858     {
4859     case SPELL_IDENT:
4860       return CPP_TOKEN_FLD_NODE;
4861     case SPELL_LITERAL:
4862       return CPP_TOKEN_FLD_STR;
4863     case SPELL_OPERATOR:
4864       /* Operands which were originally spelled as ident keep around
4865          the node for the exact spelling.  */
4866       if (tok->flags & NAMED_OP)
4867         return CPP_TOKEN_FLD_NODE;
4868       else if (tok->type == CPP_PASTE)
4869         return CPP_TOKEN_FLD_TOKEN_NO;
4870       else
4871         return CPP_TOKEN_FLD_NONE;
4872     case SPELL_NONE:
4873       if (tok->type == CPP_MACRO_ARG)
4874         return CPP_TOKEN_FLD_ARG_NO;
4875       else if (tok->type == CPP_PADDING)
4876         return CPP_TOKEN_FLD_SOURCE;
4877       else if (tok->type == CPP_PRAGMA)
4878         return CPP_TOKEN_FLD_PRAGMA;
4879       /* fall through */
4880     default:
4881       return CPP_TOKEN_FLD_NONE;
4882     }
4883 }
4884
4885 /* All tokens lexed in R after calling this function will be forced to
4886    have their location_t to be P, until
4887    cpp_stop_forcing_token_locations is called for R.  */
4888
4889 void
4890 cpp_force_token_locations (cpp_reader *r, location_t loc)
4891 {
4892   r->forced_token_location = loc;
4893 }
4894
4895 /* Go back to assigning locations naturally for lexed tokens.  */
4896
4897 void
4898 cpp_stop_forcing_token_locations (cpp_reader *r)
4899 {
4900   r->forced_token_location = 0;
4901 }
4902
/* PEEK points at a backslash.  If it escapes an end of line ("\n",
   "\r" or "\r\n"), look past it, and keep going while the character
   found there is another backslash escaping an end of line.  Returns
   the resulting position.  If the position after an escaped end of
   line would not be before LIMIT, don't advance past it.  */

static const unsigned char *
do_peek_backslash (const unsigned char *peek, const unsigned char *limit)
{
  for (;;)
    {
      const unsigned char *after;

      if (__builtin_expect (peek[1] == '\n', true))
        after = peek + 2;
      else if (__builtin_expect (peek[1] == '\r', false))
        /* A "\r\n" pair counts as one end of line.  */
        after = peek + (peek[2] == '\n' ? 3 : 2);
      else
        /* The backslash does not escape an end of line.  */
        return peek;

      if (!__builtin_expect (after < limit, true))
        return peek;

      peek = after;

      /* The user might be perverse and escape several in a row.  */
      if (*peek != '\\')
        return peek;
    }
}
4932
4933 static const unsigned char *
4934 do_peek_next (const unsigned char *peek, const unsigned char *limit)
4935 {
4936   if (__builtin_expect (*peek == '\\', false))
4937     peek = do_peek_backslash (peek, limit);
4938   return peek;
4939 }
4940
/* Step backwards from PEEK, never moving before BOUND.  Returns NULL
   if PEEK is already at BOUND.  Otherwise returns a pointer to the
   preceding character, except that an end of line ("\n", "\r" or
   "\r\n") immediately preceded by a backslash is treated as an escaped
   newline: both are skipped and the search continues backwards from
   the backslash.  An end of line that is not escaped is returned like
   any other character.  */

static const unsigned char *
do_peek_prev (const unsigned char *peek, const unsigned char *bound)
{
  if (peek == bound)
    return NULL;

  unsigned char c = *--peek;

  /* Only an end-of-line character can be part of an escaped newline.
     Note the test is for carriage return, not the letter 'r'.  */
  if (!(__builtin_expect (c == '\n', false)
        || __builtin_expect (c == '\r', false)))
    return peek;

  /* Nothing before the end of line to inspect.  */
  if (peek == bound)
    return peek;

  /* IX is the offset from PEEK of the character expected to be the
     escaping backslash; it moves one further back over the '\r' of a
     "\r\n" pair.  */
  int ix = -1;
  if (c == '\n' && peek[ix] == '\r')
    {
      if (peek + ix == bound)
        return peek;
      ix--;
    }

  if (peek[ix] == '\\')
    return do_peek_prev (peek + ix, bound);

  return peek;
}
4969
4970 /* If PEEK[-1] is identifier MATCH, scan past it and trailing white
4971    space.  Otherwise return NULL.  */
4972
4973 static const unsigned char *
4974 do_peek_ident (const char *match, const unsigned char *peek,
4975                const unsigned char *limit)
4976 {
4977   for (; *++match; peek++)
4978     if (*peek != *match)
4979       {
4980         peek = do_peek_next (peek, limit);
4981         if (*peek != *match)
4982           return NULL;
4983       }
4984
4985   /* Must now not be looking at an identifier char.  */
4986   peek = do_peek_next (peek, limit);
4987   if (ISIDNUM (*peek))
4988     return NULL;
4989
4990   /* Skip control-line whitespace.  */
4991  ws:
4992   while (*peek == ' ' || *peek == '\t')
4993     peek++;
4994   if (__builtin_expect (*peek == '\\', false))
4995     {
4996       peek = do_peek_backslash (peek, limit);
4997       if (*peek != '\\')
4998         goto ws;
4999     }
5000
5001   return peek;
5002 }
5003
5004 /* Are we looking at a module control line starting as PEEK - 1?  */
5005
5006 static bool
5007 do_peek_module (cpp_reader *pfile, unsigned char c,
5008                 const unsigned char *peek, const unsigned char *limit)
5009 {
5010   bool import = false;
5011
5012   if (__builtin_expect (c == 'e', false))
5013     {
5014       if (!((peek[0] == 'x' || peek[0] == '\\')
5015             && (peek = do_peek_ident ("export", peek, limit))))
5016         return false;
5017
5018       /* export, peek for import or module.  No need to peek __import
5019          here.  */
5020       if (peek[0] == 'i')
5021         {
5022           if (!((peek[1] == 'm' || peek[1] == '\\')
5023                 && (peek = do_peek_ident ("import", peek + 1, limit))))
5024             return false;
5025           import = true;
5026         }
5027       else if (peek[0] == 'm')
5028         {
5029           if (!((peek[1] == 'o' || peek[1] == '\\')
5030                 && (peek = do_peek_ident ("module", peek + 1, limit))))
5031             return false;
5032         }
5033       else
5034         return false;
5035     }
5036   else if (__builtin_expect (c == 'i', false))
5037     {
5038       if (!((peek[0] == 'm' || peek[0] == '\\')
5039             && (peek = do_peek_ident ("import", peek, limit))))
5040         return false;
5041       import = true;
5042     }
5043   else if (__builtin_expect (c == '_', false))
5044     {
5045       /* Needed for translated includes.   */
5046       if (!((peek[0] == '_' || peek[0] == '\\')
5047             && (peek = do_peek_ident ("__import", peek, limit))))
5048         return false;
5049       import = true;
5050     }
5051   else if (__builtin_expect (c == 'm', false))
5052     {
5053       if (!((peek[0] == 'o' || peek[0] == '\\')
5054             && (peek = do_peek_ident ("module", peek, limit))))
5055         return false;
5056     }
5057   else
5058     return false;
5059
5060   /* Peek the next character to see if it's good enough.  We'll be at
5061      the first non-whitespace char, including skipping an escaped
5062      newline.  */
5063   /* ... import followed by identifier, ':', '<' or header-name
5064      preprocessing tokens, or module followed by identifier, ':' or
5065      ';' preprocessing tokens.  */
5066   unsigned char p = *peek++;
5067
5068   /* A character literal is ... single quotes, ... optionally preceded
5069      by u8, u, U, or L */
5070   /* A string-literal is a ... double quotes, optionally prefixed by
5071      R, u8, u8R, u, uR, U, UR, L, or LR */
5072   if (p == 'u')
5073     {
5074       peek = do_peek_next (peek, limit);
5075       if (*peek == '8')
5076         {
5077           peek++;
5078           goto peek_u8;
5079         }
5080       goto peek_u;
5081     }
5082   else if (p == 'U' || p == 'L')
5083     {
5084     peek_u8:
5085       peek = do_peek_next (peek, limit);
5086     peek_u:
5087       if (*peek == '\"' || *peek == '\'')
5088         return false;
5089
5090       if (*peek == 'R')
5091         goto peek_R;
5092       /* Identifier. Ok.  */
5093     }
5094   else if (p == 'R')
5095     {
5096     peek_R:
5097       if (CPP_OPTION (pfile, rliterals))
5098         {
5099           peek = do_peek_next (peek, limit);
5100           if (*peek == '\"')
5101             return false;
5102         }
5103       /* Identifier. Ok.  */
5104     }
5105   else if ('Z' - 'A' == 25
5106            ? ((p >= 'A' && p <= 'Z') || (p >= 'a' && p <= 'z') || p == '_')
5107            : ISIDST (p))
5108     {
5109       /* Identifier.  Ok. */
5110     }
5111   else if (p == '<')
5112     {
5113       /* Maybe angle header, ok for import.  Reject
5114          '<=', '<<' digraph:'<:'.  */
5115       if (!import)
5116         return false;
5117       peek = do_peek_next (peek, limit);
5118       if (*peek == '=' || *peek == '<'
5119           || (*peek == ':' && CPP_OPTION (pfile, digraphs)))
5120         return false;
5121     }
5122   else if (p == ';')
5123     {
5124       /* SEMICOLON, ok for module.  */
5125       if (import)
5126         return false;
5127     }
5128   else if (p == '"')
5129     {
5130       /* STRING, ok for import.  */
5131       if (!import)
5132         return false;
5133     }
5134   else if (p == ':')
5135     {
5136       /* Maybe COLON, ok.  Reject '::', digraph:':>'.  */
5137       peek = do_peek_next (peek, limit);
5138       if (*peek == ':' || (*peek == '>' && CPP_OPTION (pfile, digraphs)))
5139         return false;
5140     }
5141   else
5142     /* FIXME: Detect a unicode character, excluding those not
5143        permitted as the initial character. [lex.name]/1.  I presume
5144        we need to check the \[uU] spellings, and directly using
5145        Unicode in say UTF8 form?  Or perhaps we do the phase-1
5146        conversion of UTF8 to universal-character-names?  */
5147     return false;
5148
5149   return true;
5150 }
5151
5152 /* Directives-only scanning.  Somewhat more relaxed than correct
5153    parsing -- some ill-formed programs will not be rejected.  */
5154
5155 void
5156 cpp_directive_only_process (cpp_reader *pfile,
5157                             void *data,
5158                             void (*cb) (cpp_reader *, CPP_DO_task, void *, ...))
5159 {
5160   bool module_p = CPP_OPTION (pfile, module_directives);
5161
5162   do
5163     {
5164     restart:
5165       /* Buffer initialization, but no line cleaning. */
5166       cpp_buffer *buffer = pfile->buffer;
5167       buffer->cur_note = buffer->notes_used = 0;
5168       buffer->cur = buffer->line_base = buffer->next_line;
5169       buffer->need_line = false;
5170       /* Files always end in a newline or carriage return.  We rely on this for
5171          character peeking safety.  */
5172       gcc_assert (buffer->rlimit[0] == '\n' || buffer->rlimit[0] == '\r');
5173
5174       const unsigned char *base = buffer->cur;
5175       unsigned line_count = 0;
5176       const unsigned char *line_start = base;
5177
5178       bool bol = true;
5179       bool raw = false;
5180
5181       const unsigned char *lwm = base;
5182       for (const unsigned char *pos = base, *limit = buffer->rlimit;
5183            pos < limit;)
5184         {
5185           unsigned char c = *pos++;
5186           /* This matches the switch in _cpp_lex_direct.  */
5187           switch (c)
5188             {
5189             case ' ': case '\t': case '\f': case '\v':
5190               /* Whitespace, do nothing.  */
5191               break;
5192
5193             case '\r': /* MAC line ending, or Windows \r\n  */
5194               if (*pos == '\n')
5195                 pos++;
5196               /* FALLTHROUGH */
5197
5198             case '\n':
5199               bol = true;
5200
5201             next_line:
5202               CPP_INCREMENT_LINE (pfile, 0);
5203               line_count++;
5204               line_start = pos;
5205               break;
5206
5207             case '\\':
5208               /* <backslash><newline> is removed, and doesn't undo any
5209                  preceeding escape or whatnot.  */
5210               if (*pos == '\n')
5211                 {
5212                   pos++;
5213                   goto next_line;
5214                 }
5215               else if (*pos == '\r')
5216                 {
5217                   if (pos[1] == '\n')
5218                     pos++;
5219                   pos++;
5220                   goto next_line;
5221                 }
5222               goto dflt;
5223
5224             case '#':
5225               if (bol)
5226                 {
5227                   /* Line directive.  */
5228                   if (pos - 1 > base && !pfile->state.skipping)
5229                     cb (pfile, CPP_DO_print, data,
5230                         line_count, base, pos - 1 - base);
5231
5232                   /* Prep things for directive handling. */
5233                   buffer->next_line = pos;
5234                   buffer->need_line = true;
5235                   bool ok = _cpp_get_fresh_line (pfile);
5236                   gcc_checking_assert (ok);
5237
5238                   /* Ensure proper column numbering for generated
5239                      error messages. */
5240                   buffer->line_base -= pos - line_start;
5241
5242                   _cpp_handle_directive (pfile, line_start + 1 != pos);
5243
5244                   /* Sanitize the line settings.  Duplicate #include's can
5245                      mess things up. */
5246                   // FIXME: Necessary?
5247                   pfile->line_table->highest_location
5248                     = pfile->line_table->highest_line;
5249
5250                   if (!pfile->state.skipping
5251                       && pfile->buffer->next_line < pfile->buffer->rlimit)
5252                     cb (pfile, CPP_DO_location, data,
5253                         pfile->line_table->highest_line);
5254
5255                   goto restart;
5256                 }
5257               goto dflt;
5258
5259             case '/':
5260               {
5261                 const unsigned char *peek = do_peek_next (pos, limit);
5262                 if (!(*peek == '/' || *peek == '*'))
5263                   goto dflt;
5264
5265                 /* Line or block comment  */
5266                 bool is_block = *peek == '*';
5267                 bool star = false;
5268                 bool esc = false;
5269                 location_t sloc
5270                   = linemap_position_for_column (pfile->line_table,
5271                                                  pos - line_start);
5272
5273                 while (pos < limit)
5274                   {
5275                     char c = *pos++;
5276                     switch (c)
5277                       {
5278                       case '\\':
5279                         esc = true;
5280                         break;
5281
5282                       case '\r':
5283                         if (*pos == '\n')
5284                           pos++;
5285                         /* FALLTHROUGH  */
5286
5287                       case '\n':
5288                         {
5289                           CPP_INCREMENT_LINE (pfile, 0);
5290                           line_count++;
5291                           line_start = pos;
5292                           if (!esc && !is_block)
5293                             {
5294                               bol = true;
5295                               goto done_comment;
5296                             }
5297                         }
5298                         if (!esc)
5299                           star = false;
5300                         esc = false;
5301                         break;
5302
5303                       case '*':
5304                         if (pos > peek)
5305                           star = is_block;
5306                         esc = false;
5307                         break;
5308
5309                       case '/':
5310                         if (star)
5311                           goto done_comment;
5312                         /* FALLTHROUGH  */
5313
5314                       default:
5315                         star = false;
5316                         esc = false;
5317                         break;
5318                       }
5319                   }
5320                 if (pos < limit || is_block)
5321                   cpp_error_with_line (pfile, CPP_DL_ERROR, sloc, 0,
5322                                        "unterminated comment");
5323               done_comment:
5324                 lwm = pos;
5325                 break;
5326               }
5327
5328             case '\'':
5329               if (!CPP_OPTION (pfile, digit_separators))
5330                 goto delimited_string;
5331
5332               /* Possibly a number punctuator.  */
5333               if (!ISIDNUM (*do_peek_next (pos, limit)))
5334                 goto delimited_string;
5335
5336               goto quote_peek;
5337
5338             case '\"':
5339               if (!CPP_OPTION (pfile, rliterals))
5340                 goto delimited_string;
5341
5342             quote_peek:
5343               {
5344                 /* For ' see if it's a number punctuator
5345                    \.?<digit>(<digit>|<identifier-nondigit>
5346                    |'<digit>|'<nondigit>|[eEpP]<sign>|\.)* */
5347                 /* For " see if it's a raw string
5348                    {U,L,u,u8}R.  This includes CPP_NUMBER detection,
5349                    because that could be 0e+R.  */
5350                 const unsigned char *peek = pos - 1;
5351                 bool quote_first = c == '"';
5352                 bool quote_eight = false;
5353                 bool maybe_number_start = false;
5354                 bool want_number = false;
5355
5356                 while ((peek = do_peek_prev (peek, lwm)))
5357                   {
5358                     unsigned char p = *peek;
5359                     if (quote_first)
5360                       {
5361                         if (!raw)
5362                           {
5363                             if (p != 'R')
5364                               break;
5365                             raw = true;
5366                             continue;
5367                           }
5368
5369                         quote_first = false;
5370                         if (p == 'L' || p == 'U' || p == 'u')
5371                           ;
5372                         else if (p == '8')
5373                           quote_eight = true;
5374                         else
5375                           goto second_raw;
5376                       }
5377                     else if (quote_eight)
5378                       {
5379                         if (p != 'u')
5380                           {
5381                             raw = false;
5382                             break;
5383                           }
5384                         quote_eight = false;
5385                       }
5386                     else if (c == '"')
5387                       {
5388                       second_raw:;
5389                         if (!want_number && ISIDNUM (p))
5390                           {
5391                             raw = false;
5392                             break;
5393                           }
5394                       }
5395
5396                     if (ISDIGIT (p))
5397                       maybe_number_start = true;
5398                     else if (p == '.')
5399                       want_number = true;
5400                     else if (ISIDNUM (p))
5401                       maybe_number_start = false;
5402                     else if (p == '+' || p == '-')
5403                       {
5404                         if (const unsigned char *peek_prev
5405                             = do_peek_prev (peek, lwm))
5406                           {
5407                             p = *peek_prev;
5408                             if (p == 'e' || p == 'E'
5409                                 || p == 'p' || p == 'P')
5410                               {
5411                                 want_number = true;
5412                                 maybe_number_start = false;
5413                               }
5414                             else
5415                               break;
5416                           }
5417                         else
5418                           break;
5419                       }
5420                     else if (p == '\'' || p == '\"')
5421                       {
5422                         /* If this is lwm, this must be the end of a
5423                            previous string.  So this is a trailing
5424                            literal type, (a) if those are allowed,
5425                            and (b) maybe_number_start and want_number
5426                            are both false.  Otherwise this must be a
5427                            CPP_NUMBER because we've met another ', and
5428                            we'd have checked that in its own right.  */
5429                         if (peek == lwm && CPP_OPTION (pfile, uliterals))
5430                           {
5431                             if  (!maybe_number_start && !want_number)
5432                               /* Must be a literal type.  */
5433                               raw = false;
5434                           }
5435                         else if (p == '\''
5436                                  && CPP_OPTION (pfile, digit_separators))
5437                           maybe_number_start = true;
5438                         break;
5439                       }
5440                     else if (c == '\'')
5441                       break;
5442                     else if (!quote_first && !quote_eight)
5443                       break;
5444                   }
5445
5446                 if (maybe_number_start)
5447                   {
5448                     if (c == '\'')
5449                       /* A CPP NUMBER.  */
5450                       goto dflt;
5451                     raw = false;
5452                   }
5453
5454                 goto delimited_string;
5455               }
5456
5457             delimited_string:
5458               {
5459                 /* (Possibly raw) string or char literal.  */
5460                 unsigned char end = c;
5461                 int delim_len = -1;
5462                 const unsigned char *delim = NULL;
5463                 location_t sloc = linemap_position_for_column (pfile->line_table,
5464                                                                pos - line_start);
5465                 int esc = 0;
5466
5467                 if (raw)
5468                   {
5469                     /* There can be no line breaks in the delimiter.  */
5470                     delim = pos;
5471                     for (delim_len = 0; (c = *pos++) != '('; delim_len++)
5472                       {
5473                         if (delim_len == 16)
5474                           {
5475                             cpp_error_with_line (pfile, CPP_DL_ERROR,
5476                                                  sloc, 0,
5477                                                  "raw string delimiter"
5478                                                  " longer than %d"
5479                                                  " characters",
5480                                                  delim_len);
5481                             raw = false;
5482                             pos = delim;
5483                             break;
5484                           }
5485                         if (strchr (") \\\t\v\f\n", c))
5486                           {
5487                             cpp_error_with_line (pfile, CPP_DL_ERROR,
5488                                                  sloc, 0,
5489                                                  "invalid character '%c'"
5490                                                  " in raw string"
5491                                                  " delimiter", c);
5492                             raw = false;
5493                             pos = delim;
5494                             break;
5495                           }
5496                         if (pos >= limit)
5497                           goto bad_string;
5498                       }
5499                   }
5500
5501                 while (pos < limit)
5502                   {
5503                     char c = *pos++;
5504                     switch (c)
5505                       {
5506                       case '\\':
5507                         if (!raw)
5508                           esc++;
5509                         break;
5510
5511                       case '\r':
5512                         if (*pos == '\n')
5513                           pos++;
5514                         /* FALLTHROUGH  */
5515
5516                       case '\n':
5517                         {
5518                           CPP_INCREMENT_LINE (pfile, 0);
5519                           line_count++;
5520                           line_start = pos;
5521                         }
5522                         if (esc)
5523                           esc--;
5524                         break;
5525
5526                       case ')':
5527                         if (raw
5528                             && pos + delim_len + 1 < limit
5529                             && pos[delim_len] == end
5530                             && !memcmp (delim, pos, delim_len))
5531                           {
5532                             pos += delim_len + 1;
5533                             raw = false;
5534                             goto done_string;
5535                           }
5536                         break;
5537
5538                       default:
5539                         if (!raw && !(esc & 1) && c == end)
5540                           goto done_string;
5541                         esc = 0;
5542                         break;
5543                       }
5544                   }
5545               bad_string:
5546                 cpp_error_with_line (pfile, CPP_DL_ERROR, sloc, 0,
5547                                      "unterminated literal");
5548
5549               done_string:
5550                 raw = false;
5551                 lwm = pos - 1;
5552               }
5553               goto dflt;
5554
5555             case '_':
5556             case 'e':
5557             case 'i':
5558             case 'm':
5559               if (bol && module_p && !pfile->state.skipping
5560                   && do_peek_module (pfile, c, pos, limit))
5561                 {
5562                   /* We've seen the start of a module control line.
5563                      Start up the tokenizer.  */
5564                   pos--; /* Backup over the first character.  */
5565
5566                   /* Backup over whitespace to start of line.  */
5567                   while (pos > line_start
5568                          && (pos[-1] == ' ' || pos[-1] == '\t'))
5569                     pos--;
5570
5571                   if (pos > base)
5572                     cb (pfile, CPP_DO_print, data, line_count, base, pos - base);
5573
5574                   /* Prep things for directive handling. */
5575                   buffer->next_line = pos;
5576                   buffer->need_line = true;
5577
5578                   /* Now get tokens until the PRAGMA_EOL.  */
5579                   do
5580                     {
5581                       location_t spelling;
5582                       const cpp_token *tok
5583                         = cpp_get_token_with_location (pfile, &spelling);
5584
5585                       gcc_assert (pfile->state.in_deferred_pragma
5586                                   || tok->type == CPP_PRAGMA_EOL);
5587                       cb (pfile, CPP_DO_token, data, tok, spelling);
5588                     }
5589                   while (pfile->state.in_deferred_pragma);
5590
5591                   if (pfile->buffer->next_line < pfile->buffer->rlimit)
5592                     cb (pfile, CPP_DO_location, data,
5593                         pfile->line_table->highest_line);
5594
5595                   pfile->mi_valid = false;
5596                   goto restart;
5597                 }
5598               goto dflt;
5599
5600             default:
5601             dflt:
5602               bol = false;
5603               pfile->mi_valid = false;
5604               break;
5605             }
5606         }
5607
5608       if (buffer->rlimit > base && !pfile->state.skipping)
5609         {
5610           const unsigned char *limit = buffer->rlimit;
5611           /* If the file was not newline terminated, add rlimit, which is
5612              guaranteed to point to a newline, to the end of our range.  */
5613           if (limit[-1] != '\n')
5614             {
5615               limit++;
5616               CPP_INCREMENT_LINE (pfile, 0);
5617               line_count++;
5618             }
5619           cb (pfile, CPP_DO_print, data, line_count, base, limit - base);
5620         }
5621
5622       _cpp_pop_buffer (pfile);
5623     }
5624   while (pfile->buffer);
5625 }