libcpp/lex.cc

   1 /* CPP Library - lexical analysis.
   2    Copyright (C) 2000-2023 Free Software Foundation, Inc.
   3    Contributed by Per Bothner, 1994-95.
   4    Based on CCCP program by Paul Rubin, June 1986
   5    Adapted to ANSI C, Richard Stallman, Jan 1987
   6    Broken out to separate file, Zack Weinberg, Mar 2000
   7
   8 This program is free software; you can redistribute it and/or modify it
   9 under the terms of the GNU General Public License as published by the
  10 Free Software Foundation; either version 3, or (at your option) any
  11 later version.
  12
  13 This program is distributed in the hope that it will be useful,
  14 but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16 GNU General Public License for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with this program; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "cpplib.h"
  25 #include "internal.h"
  26
/* Spelling category recorded for each token type in the token_spellings
   table.  Entries generated by the OP macro are always SPELL_OPERATOR;
   entries generated by the TK macro name their category explicitly.  */
enum spell_type
{
  SPELL_OPERATOR = 0,
  SPELL_IDENT,
  SPELL_LITERAL,
  SPELL_NONE
};
  34
/* One row of the token_spellings table.  */
struct token_spelling
{
  /* Spelling category of the token type.  */
  enum spell_type category;
  /* Text stored for the token type: the operator string for OP entries,
     the stringified enumerator name for TK entries.  */
  const unsigned char *name;
};
  40
/* Spellings of the digraph tokens.  (In ISO C these are the alternative
   spellings of #, ##, [, ], { and }.)  */
static const unsigned char *const digraph_spellings[] =
{ UC"%:", UC"%:%:", UC"<:", UC":>", UC"<%", UC"%>" };

/* Expand TTYPE_TABLE into one token_spelling initializer per token type.
   OP (e, s) stores the operator text S; TK (e, s) stores category
   SPELL_<s> together with the enumerator name E as a string.  Both
   macros are only needed for this expansion and are undefined again
   right after it.  */
#define OP(e, s) { SPELL_OPERATOR, UC s  },
#define TK(e, s) { SPELL_ ## s,    UC #e },
static const struct token_spelling token_spellings[N_TTYPES] = { TTYPE_TABLE };
#undef OP
#undef TK
  49
/* Look up the spelling category, respectively the stored name, of TOKEN
   (a pointer to a token) in token_spellings, indexed by the token's
   type.  */
#define TOKEN_SPELL(token) (token_spellings[(token)->type].category)
#define TOKEN_NAME(token) (token_spellings[(token)->type].name)

/* ISO 10646 defines the UCS codespace as the range 0-0x10FFFF inclusive.  */
#define UCS_LIMIT 0x10FFFF
  55
/* Forward declarations of the file-local helpers defined later in this
   file.  */
static void add_line_note (cpp_buffer *, const uchar *, unsigned int);
static int skip_line_comment (cpp_reader *);
static void skip_whitespace (cpp_reader *, cppchar_t);
static void lex_string (cpp_reader *, cpp_token *, const uchar *);
static void save_comment (cpp_reader *, cpp_token *, const uchar *, cppchar_t);
static void store_comment (cpp_reader *, cpp_token *);
static void create_literal (cpp_reader *, cpp_token *, const uchar *,
                            unsigned int, enum cpp_ttype);
static bool warn_in_comment (cpp_reader *, _cpp_line_note *);
static int name_p (cpp_reader *, const cpp_string *);
static tokenrun *next_tokenrun (tokenrun *);

static _cpp_buff *new_buff (size_t);
  69
  70
  71 /* Utility routine:
  72
  73    Compares, the token TOKEN to the NUL-terminated string STRING.
  74    TOKEN must be a CPP_NAME.  Returns 1 for equal, 0 for unequal.  */
  75 int
  76 cpp_ideq (const cpp_token *token, const char *string)
  77 {
  78   if (token->type != CPP_NAME)
  79     return 0;
  80
  81   return !ustrcmp (NODE_NAME (token->val.node.node), (const uchar *) string);
  82 }
  83
  84 /* Record a note TYPE at byte POS into the current cleaned logical
  85    line.  */
  86 static void
  87 add_line_note (cpp_buffer *buffer, const uchar *pos, unsigned int type)
  88 {
  89   if (buffer->notes_used == buffer->notes_cap)
  90     {
  91       buffer->notes_cap = buffer->notes_cap * 2 + 200;
  92       buffer->notes = XRESIZEVEC (_cpp_line_note, buffer->notes,
  93                                   buffer->notes_cap);
  94     }
  95
  96   buffer->notes[buffer->notes_used].pos = pos;
  97   buffer->notes[buffer->notes_used].type = type;
  98   buffer->notes_used++;
  99 }
 100
 101 \f
 102 /* Fast path to find line special characters using optimized character
 103    scanning algorithms.  Anything complicated falls back to the slow
 104    path below.  Since this loop is very hot it's worth doing these kinds
 105    of optimizations.
 106
 107    One of the paths through the ifdefs should provide
 108
 109      const uchar *search_line_fast (const uchar *s, const uchar *end);
 110
 111    Between S and END, search for \n, \r, \\, ?.  Return a pointer to
 112    the found character.
 113
 114    Note that the last character of the buffer is *always* a newline,
 115    as forced by _cpp_convert_input.  This fact can be used to avoid
 116    explicitly looking for the end of the buffer.  */
 117
/* Configure gives us an ifdef test.  Turn it into a 0/1 value so that it
   can be tested both in #if and in ordinary conditions below.  */
#ifndef WORDS_BIGENDIAN
#define WORDS_BIGENDIAN 0
#endif

/* We'd like the largest integer that fits into a register.  There's nothing
   in <stdint.h> that gives us that.  For most hosts this is unsigned long,
   but MS decided on an LLP64 model.  Thankfully when building with GCC we
   can get the "real" word size.  */
#ifdef __GNUC__
typedef unsigned int word_type __attribute__((__mode__(__word__)));
#else
typedef unsigned long word_type;
#endif

/* The code below is only expecting sizes 4 or 8.
   Die at compile-time if this expectation is violated: the array bound
   evaluates to 1 when the size is 4 or 8, and to the invalid bound -1
   otherwise.  */
typedef char check_word_type_size
  [(sizeof(word_type) == 8 || sizeof(word_type) == 4) * 2 - 1];
 137
 138 /* Return X with the first N bytes forced to values that won't match one
 139    of the interesting characters.  Note that NUL is not interesting.  */
 140
 141 static inline word_type
 142 acc_char_mask_misalign (word_type val, unsigned int n)
 143 {
 144   word_type mask = -1;
 145   if (WORDS_BIGENDIAN)
 146     mask >>= n * 8;
 147   else
 148     mask <<= n * 8;
 149   return val & mask;
 150 }
 151
 152 /* Return X replicated to all byte positions within WORD_TYPE.  */
 153
 154 static inline word_type
 155 acc_char_replicate (uchar x)
 156 {
 157   word_type ret;
 158
 159   ret = (x << 24) | (x << 16) | (x << 8) | x;
 160   if (sizeof(word_type) == 8)
 161     ret = (ret << 16 << 16) | ret;
 162   return ret;
 163 }
 164
 165 /* Return non-zero if some byte of VAL is (probably) C.  */
 166
 167 static inline word_type
 168 acc_char_cmp (word_type val, word_type c)
 169 {
 170 #if defined(__GNUC__) && defined(__alpha__)
 171   /* We can get exact results using a compare-bytes instruction.
 172      Get (val == c) via (0 >= (val ^ c)).  */
 173   return __builtin_alpha_cmpbge (0, val ^ c);
 174 #else
 175   word_type magic = 0x7efefefeU;
 176   if (sizeof(word_type) == 8)
 177     magic = (magic << 16 << 16) | 0xfefefefeU;
 178   magic |= 1;
 179
 180   val ^= c;
 181   return ((val + magic) ^ ~val) & ~magic;
 182 #endif
 183 }
 184
 185 /* Given the result of acc_char_cmp is non-zero, return the index of
 186    the found character.  If this was a false positive, return -1.  */
 187
 188 static inline int
 189 acc_char_index (word_type cmp ATTRIBUTE_UNUSED,
 190                 word_type val ATTRIBUTE_UNUSED)
 191 {
 192 #if defined(__GNUC__) && defined(__alpha__) && !WORDS_BIGENDIAN
 193   /* The cmpbge instruction sets *bits* of the result corresponding to
 194      matches in the bytes with no false positives.  */
 195   return __builtin_ctzl (cmp);
 196 #else
 197   unsigned int i;
 198
 199   /* ??? It would be nice to force unrolling here,
 200      and have all of these constants folded.  */
 201   for (i = 0; i < sizeof(word_type); ++i)
 202     {
 203       uchar c;
 204       if (WORDS_BIGENDIAN)
 205         c = (val >> (sizeof(word_type) - i - 1) * 8) & 0xff;
 206       else
 207         c = (val >> i * 8) & 0xff;
 208
 209       if (c == '\n' || c == '\r' || c == '\\' || c == '?')
 210         return i;
 211     }
 212
 213   return -1;
 214 #endif
 215 }
 216
 217 /* A version of the fast scanner using bit fiddling techniques.
 218
 219    For 32-bit words, one would normally perform 16 comparisons and
 220    16 branches.  With this algorithm one performs 24 arithmetic
 221    operations and one branch.  Whether this is faster with a 32-bit
 222    word size is going to be somewhat system dependent.
 223
 224    For 64-bit words, we eliminate twice the number of comparisons
 225    and branches without increasing the number of arithmetic operations.
 226    It's almost certainly going to be a win with 64-bit word size.  */
 227
 228 static const uchar * search_line_acc_char (const uchar *, const uchar *)
 229   ATTRIBUTE_UNUSED;
 230
 231 static const uchar *
 232 search_line_acc_char (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 233 {
 234   const word_type repl_nl = acc_char_replicate ('\n');
 235   const word_type repl_cr = acc_char_replicate ('\r');
 236   const word_type repl_bs = acc_char_replicate ('\\');
 237   const word_type repl_qm = acc_char_replicate ('?');
 238
 239   unsigned int misalign;
 240   const word_type *p;
 241   word_type val, t;
 242
 243   /* Align the buffer.  Mask out any bytes from before the beginning.  */
 244   p = (word_type *)((uintptr_t)s & -sizeof(word_type));
 245   val = *p;
 246   misalign = (uintptr_t)s & (sizeof(word_type) - 1);
 247   if (misalign)
 248     val = acc_char_mask_misalign (val, misalign);
 249
 250   /* Main loop.  */
 251   while (1)
 252     {
 253       t  = acc_char_cmp (val, repl_nl);
 254       t |= acc_char_cmp (val, repl_cr);
 255       t |= acc_char_cmp (val, repl_bs);
 256       t |= acc_char_cmp (val, repl_qm);
 257
 258       if (__builtin_expect (t != 0, 0))
 259         {
 260           int i = acc_char_index (t, val);
 261           if (i >= 0)
 262             return (const uchar *)p + i;
 263         }
 264
 265       val = *++p;
 266     }
 267 }
 268
 269 /* Disable on Solaris 2/x86 until the following problem can be properly
 270    autoconfed:
 271
 272    The Solaris 10+ assembler tags objects with the instruction set
 273    extensions used, so SSE4.2 executables cannot run on machines that
 274    don't support that extension.  */
 275
 276 #if (GCC_VERSION >= 4005) && (__GNUC__ >= 5 || !defined(__PIC__)) && (defined(__i386__) || defined(__x86_64__)) && !(defined(__sun__) && defined(__svr4__))
 277
/* Replicated character data to be shared between implementations.
   Recall that outside of a context with vector support we can't
   define compatible vector types, therefore these are all defined
   in terms of raw characters.

   Row 0 holds '\n', row 1 '\r', row 2 '\\' and row 3 '?'.  The rows are
   16-byte aligned so that they can be loaded directly as 8-byte or
   16-byte vectors.  */
static const char repl_chars[4][16] __attribute__((aligned(16))) = {
  { '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
    '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n' },
  { '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
    '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r' },
  { '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
    '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\' },
  { '?', '?', '?', '?', '?', '?', '?', '?',
    '?', '?', '?', '?', '?', '?', '?', '?' },
};
 292
 293 /* A version of the fast scanner using MMX vectorized byte compare insns.
 294
 295    This uses the PMOVMSKB instruction which was introduced with "MMX2",
 296    which was packaged into SSE1; it is also present in the AMD MMX
 297    extension.  Mark the function as using "sse" so that we emit a real
 298    "emms" instruction, rather than the 3dNOW "femms" instruction.  */
 299
 300 static const uchar *
 301 #ifndef __SSE__
 302 __attribute__((__target__("sse")))
 303 #endif
 304 search_line_mmx (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 305 {
 306   typedef char v8qi __attribute__ ((__vector_size__ (8)));
 307   typedef int __m64 __attribute__ ((__vector_size__ (8), __may_alias__));
 308
 309   const v8qi repl_nl = *(const v8qi *)repl_chars[0];
 310   const v8qi repl_cr = *(const v8qi *)repl_chars[1];
 311   const v8qi repl_bs = *(const v8qi *)repl_chars[2];
 312   const v8qi repl_qm = *(const v8qi *)repl_chars[3];
 313
 314   unsigned int misalign, found, mask;
 315   const v8qi *p;
 316   v8qi data, t, c;
 317
 318   /* Align the source pointer.  While MMX doesn't generate unaligned data
 319      faults, this allows us to safely scan to the end of the buffer without
 320      reading beyond the end of the last page.  */
 321   misalign = (uintptr_t)s & 7;
 322   p = (const v8qi *)((uintptr_t)s & -8);
 323   data = *p;
 324
 325   /* Create a mask for the bytes that are valid within the first
 326      16-byte block.  The Idea here is that the AND with the mask
 327      within the loop is "free", since we need some AND or TEST
 328      insn in order to set the flags for the branch anyway.  */
 329   mask = -1u << misalign;
 330
 331   /* Main loop processing 8 bytes at a time.  */
 332   goto start;
 333   do
 334     {
 335       data = *++p;
 336       mask = -1;
 337
 338     start:
 339       t = __builtin_ia32_pcmpeqb(data, repl_nl);
 340       c = __builtin_ia32_pcmpeqb(data, repl_cr);
 341       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
 342       c = __builtin_ia32_pcmpeqb(data, repl_bs);
 343       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
 344       c = __builtin_ia32_pcmpeqb(data, repl_qm);
 345       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
 346       found = __builtin_ia32_pmovmskb (t);
 347       found &= mask;
 348     }
 349   while (!found);
 350
 351   __builtin_ia32_emms ();
 352
 353   /* FOUND contains 1 in bits for which we matched a relevant
 354      character.  Conversion to the byte index is trivial.  */
 355   found = __builtin_ctz(found);
 356   return (const uchar *)p + found;
 357 }
 358
 359 /* A version of the fast scanner using SSE2 vectorized byte compare insns.  */
 360
 361 static const uchar *
 362 #ifndef __SSE2__
 363 __attribute__((__target__("sse2")))
 364 #endif
 365 search_line_sse2 (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 366 {
 367   typedef char v16qi __attribute__ ((__vector_size__ (16)));
 368
 369   const v16qi repl_nl = *(const v16qi *)repl_chars[0];
 370   const v16qi repl_cr = *(const v16qi *)repl_chars[1];
 371   const v16qi repl_bs = *(const v16qi *)repl_chars[2];
 372   const v16qi repl_qm = *(const v16qi *)repl_chars[3];
 373
 374   unsigned int misalign, found, mask;
 375   const v16qi *p;
 376   v16qi data, t;
 377
 378   /* Align the source pointer.  */
 379   misalign = (uintptr_t)s & 15;
 380   p = (const v16qi *)((uintptr_t)s & -16);
 381   data = *p;
 382
 383   /* Create a mask for the bytes that are valid within the first
 384      16-byte block.  The Idea here is that the AND with the mask
 385      within the loop is "free", since we need some AND or TEST
 386      insn in order to set the flags for the branch anyway.  */
 387   mask = -1u << misalign;
 388
 389   /* Main loop processing 16 bytes at a time.  */
 390   goto start;
 391   do
 392     {
 393       data = *++p;
 394       mask = -1;
 395
 396     start:
 397       t  = data == repl_nl;
 398       t |= data == repl_cr;
 399       t |= data == repl_bs;
 400       t |= data == repl_qm;
 401       found = __builtin_ia32_pmovmskb128 (t);
 402       found &= mask;
 403     }
 404   while (!found);
 405
 406   /* FOUND contains 1 in bits for which we matched a relevant
 407      character.  Conversion to the byte index is trivial.  */
 408   found = __builtin_ctz(found);
 409   return (const uchar *)p + found;
 410 }
 411
 412 #ifdef HAVE_SSE4
/* A version of the fast scanner using SSE 4.2 vectorized string insns.

   Returns a pointer to the first \n, \r, ? or \\ at or after S.  END is
   only used to decide whether an unaligned 16-byte read at S is safe.  */

static const uchar *
#ifndef __SSE4_2__
__attribute__((__target__("sse4.2")))
#endif
search_line_sse42 (const uchar *s, const uchar *end)
{
  typedef char v16qi __attribute__ ((__vector_size__ (16)));
  /* The set of characters to look for; only the first four elements
     are used (the explicit length passed to PCMPESTRI below is 4).  */
  static const v16qi search = { '\n', '\r', '?', '\\' };

  uintptr_t si = (uintptr_t)s;
  uintptr_t index;

  /* Check for unaligned input.  */
  if (si & 15)
    {
      v16qi sv;

      if (__builtin_expect (end - s < 16, 0)
          && __builtin_expect ((si & 0xfff) > 0xff0, 0))
        {
          /* There are less than 16 bytes left in the buffer, and less
             than 16 bytes left on the page.  Reading 16 bytes at this
             point might generate a spurious page fault.  Defer to the
             SSE2 implementation, which already handles alignment.  */
          return search_line_sse2 (s, end);
        }

      /* ??? The builtin doesn't understand that the PCMPESTRI read from
         memory need not be aligned.  */
      sv = __builtin_ia32_loaddqu ((const char *) s);
      index = __builtin_ia32_pcmpestri128 (search, 4, sv, 16, 0);

      /* An index below 16 is the offset of a match within SV.  */
      if (__builtin_expect (index < 16, 0))
        goto found;

      /* Advance the pointer to an aligned address.  We will re-scan a
         few bytes, but we no longer need care for reading past the
         end of a page, since we're guaranteed a match.  */
      s = (const uchar *)((si + 15) & -16);
    }

  /* Main loop, processing 16 bytes at a time.  */
#ifdef __GCC_ASM_FLAG_OUTPUTS__
  while (1)
    {
      char f;

      /* By using inline assembly instead of the builtin,
         we can use the result, as well as the flags set.
         F receives the carry flag; the loop ends once it is set.  */
      __asm ("%vpcmpestri\t$0, %2, %3"
             : "=c"(index), "=@ccc"(f)
             : "m"(*s), "x"(search), "a"(4), "d"(16));
      if (f)
        break;

      s += 16;
    }
#else
  /* The asm loop below adds 16 to S before each compare, so step back
     first.  */
  s -= 16;
  /* By doing the whole loop in inline assembly,
     we can make proper use of the flags set.  */
  __asm (      ".balign 16\n"
        "0:     add $16, %1\n"
        "       %vpcmpestri\t$0, (%1), %2\n"
        "       jnc 0b"
        : "=&c"(index), "+r"(s)
        : "x"(search), "a"(4), "d"(16));
#endif

 found:
  /* INDEX is the offset of the match within the 16 bytes at S.  */
  return s + index;
}
 487
 488 #else
/* Work around out-dated assemblers without sse4 support: when HAVE_SSE4
   is not defined, make the name search_line_sse42 refer to the SSE2
   scanner so that the dispatcher below can still mention it.  */
#define search_line_sse42 search_line_sse2
 491 #endif
 492
 493 /* Check the CPU capabilities.  */
 494
 495 #include "../gcc/config/i386/cpuid.h"
 496
/* Signature shared by all the line scanner implementations above.  */
typedef const uchar * (*search_line_fast_type) (const uchar *, const uchar *);
/* The scanner in use; assigned by init_vectorized_lexer.  */
static search_line_fast_type search_line_fast;

/* Tells _cpp_init_lexer that init_vectorized_lexer exists and has to be
   called.  */
#define HAVE_init_vectorized_lexer 1
 501 static inline void
 502 init_vectorized_lexer (void)
 503 {
 504   unsigned dummy, ecx = 0, edx = 0;
 505   search_line_fast_type impl = search_line_acc_char;
 506   int minimum = 0;
 507
 508 #if defined(__SSE4_2__)
 509   minimum = 3;
 510 #elif defined(__SSE2__)
 511   minimum = 2;
 512 #elif defined(__SSE__)
 513   minimum = 1;
 514 #endif
 515
 516   if (minimum == 3)
 517     impl = search_line_sse42;
 518   else if (__get_cpuid (1, &dummy, &dummy, &ecx, &edx) || minimum == 2)
 519     {
 520       if (minimum == 3 || (ecx & bit_SSE4_2))
 521         impl = search_line_sse42;
 522       else if (minimum == 2 || (edx & bit_SSE2))
 523         impl = search_line_sse2;
 524       else if (minimum == 1 || (edx & bit_SSE))
 525         impl = search_line_mmx;
 526     }
 527   else if (__get_cpuid (0x80000001, &dummy, &dummy, &dummy, &edx))
 528     {
 529       if (minimum == 1
 530           || (edx & (bit_MMXEXT | bit_CMOV)) == (bit_MMXEXT | bit_CMOV))
 531         impl = search_line_mmx;
 532     }
 533
 534   search_line_fast = impl;
 535 }
 536
 537 #elif (GCC_VERSION >= 4005) && defined(_ARCH_PWR8) && defined(__ALTIVEC__)
 538
/* A version of the fast scanner using AltiVec vectorized byte compares
   and VSX unaligned loads (when VSX is available).  This is otherwise
   the same as the AltiVec version.

   Returns a pointer to the first \n, \r, \\ or ? at or after S.  END is
   not consulted.  The loads start at S itself, so no leading bytes need
   to be masked out.  */

ATTRIBUTE_NO_SANITIZE_UNDEFINED
static const uchar *
search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
{
  typedef __attribute__((altivec(vector))) unsigned char vc;

  const vc repl_nl = {
    '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
    '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
  };
  const vc repl_cr = {
    '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
    '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
  };
  const vc repl_bs = {
    '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
    '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
  };
  const vc repl_qm = {
    '?', '?', '?', '?', '?', '?', '?', '?',
    '?', '?', '?', '?', '?', '?', '?', '?',
  };
  const vc zero = { 0 };

  vc data, t;

  /* Main loop processing 16 bytes at a time.  */
  do
    {
      vc m_nl, m_cr, m_bs, m_qm;

      data = __builtin_vec_vsx_ld (0, s);
      s += 16;

      m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
      m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
      m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
      m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
      t = (m_nl | m_cr) | (m_bs | m_qm);

      /* T now contains 0xff in bytes for which we matched one of the relevant
         characters.  We want to exit the loop if any byte in T is non-zero.
         Below is the expansion of vec_any_ne(t, zero).  */
    }
  while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));

  /* Restore s to point to the 16 bytes we just processed.  */
  s -= 16;

  {
    /* Number of longs that make up one vector.  */
#define N  (sizeof(vc) / sizeof(long))

    union {
      vc v;
      /* Statically assert that N is 2 or 4.  */
      unsigned long l[(N == 2 || N == 4) ? N : -1];
    } u;
    unsigned long l, i = 0;

    u.v = t;

    /* Find the first word of T that is non-zero.  S is advanced past
       every all-zero word that gets skipped.  With N == 4 up to three
       words are skipped; with N == 2 only one.  */
    switch (N)
      {
      case 4:
        l = u.l[i++];
        if (l != 0)
          break;
        s += sizeof(unsigned long);
        l = u.l[i++];
        if (l != 0)
          break;
        s += sizeof(unsigned long);
        /* FALLTHRU */
      case 2:
        l = u.l[i++];
        if (l != 0)
          break;
        s += sizeof(unsigned long);
        l = u.l[i];
      }

    /* L now contains 0xff in bytes for which we matched one of the
       relevant characters.  We can find the byte index by finding
       its bit index and dividing by 8.  The first byte in memory is the
       most significant one on big-endian hosts (count leading zeros) and
       the least significant one otherwise (count trailing zeros).  */
#ifdef __BIG_ENDIAN__
    l = __builtin_clzl(l) >> 3;
#else
    l = __builtin_ctzl(l) >> 3;
#endif
    return s + l;

#undef N
  }
}
 638
 639 #elif (GCC_VERSION >= 4005) && defined(__ALTIVEC__) && defined (__BIG_ENDIAN__)
 640
/* A version of the fast scanner using AltiVec vectorized byte compares.
   This cannot be used for little endian because vec_lvsl/lvsr are
   deprecated for little endian and the code won't work properly.  */
/* ??? Unfortunately, attribute(target("altivec")) is not yet supported,
   so we can't compile this function without -maltivec on the command line
   (or implied by some other switch).  */
/* Returns a pointer to the first \n, \r, \\ or ? at or after S.  END is
   not consulted.  */

static const uchar *
search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
{
  typedef __attribute__((altivec(vector))) unsigned char vc;

  const vc repl_nl = {
    '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
    '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
  };
  const vc repl_cr = {
    '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
    '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
  };
  const vc repl_bs = {
    '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
    '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
  };
  const vc repl_qm = {
    '?', '?', '?', '?', '?', '?', '?', '?',
    '?', '?', '?', '?', '?', '?', '?', '?',
  };
  const vc ones = {
    -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1,
  };
  const vc zero = { 0 };

  vc data, mask, t;

  /* Altivec loads automatically mask addresses with -16.  This lets us
     issue the first load as early as possible.  */
  data = __builtin_vec_ld(0, (const vc *)s);

  /* Discard bytes before the beginning of the buffer.  Do this by
     beginning with all ones and shifting in zeros according to the
     mis-alignment.  The LVSR instruction pulls the exact shift we
     want from the address.  */
  mask = __builtin_vec_lvsr(0, s);
  mask = __builtin_vec_perm(zero, ones, mask);
  data &= mask;

  /* While altivec loads mask addresses, we still need to align S so
     that the offset we compute at the end is correct.  */
  s = (const uchar *)((uintptr_t)s & -16);

  /* Main loop processing 16 bytes at a time.  The first block has
     already been loaded and masked, so enter the loop at the compare
     step.  */
  goto start;
  do
    {
      vc m_nl, m_cr, m_bs, m_qm;

      s += 16;
      data = __builtin_vec_ld(0, (const vc *)s);

    start:
      m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
      m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
      m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
      m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
      t = (m_nl | m_cr) | (m_bs | m_qm);

      /* T now contains 0xff in bytes for which we matched one of the relevant
         characters.  We want to exit the loop if any byte in T is non-zero.
         Below is the expansion of vec_any_ne(t, zero).  */
    }
  while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));

  {
    /* Number of longs that make up one vector.  */
#define N  (sizeof(vc) / sizeof(long))

    union {
      vc v;
      /* Statically assert that N is 2 or 4.  */
      unsigned long l[(N == 2 || N == 4) ? N : -1];
    } u;
    unsigned long l, i = 0;

    u.v = t;

    /* Find the first word of T that is non-zero.  S is advanced past
       every all-zero word that gets skipped.  With N == 4 up to three
       words are skipped; with N == 2 only one.  */
    switch (N)
      {
      case 4:
        l = u.l[i++];
        if (l != 0)
          break;
        s += sizeof(unsigned long);
        l = u.l[i++];
        if (l != 0)
          break;
        s += sizeof(unsigned long);
        /* FALLTHROUGH */
      case 2:
        l = u.l[i++];
        if (l != 0)
          break;
        s += sizeof(unsigned long);
        l = u.l[i];
      }

    /* L now contains 0xff in bytes for which we matched one of the
       relevant characters.  We can find the byte index by finding
       its bit index and dividing by 8.  This variant is big-endian
       only, so the first byte in memory is the most significant one
       and leading zeros are counted.  */
    l = __builtin_clzl(l) >> 3;
    return s + l;

#undef N
  }
}
 757
 758 #elif defined (__ARM_NEON) && defined (__ARM_64BIT_STATE)
 759 #include "arm_neon.h"
 760
/* This doesn't have to be the exact page size, but no system may use
   a size smaller than this.  ARMv8 requires a minimum page size of
   4k.  The impact of being conservative here is a small number of
   cases will take the slightly slower entry path into the main
   loop.  The value must be a power of two, because it is also used
   as a bit mask (size - 1) on addresses.  */

#define AARCH64_MIN_PAGE_SIZE 4096
 768
/* A version of the fast scanner using AArch64 Advanced SIMD byte
   compares.  Returns a pointer to the first \n, \r, \\ or ? at or after
   S.  END is not consulted.  */
static const uchar *
search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
{
  const uint8x16_t repl_nl = vdupq_n_u8 ('\n');
  const uint8x16_t repl_cr = vdupq_n_u8 ('\r');
  const uint8x16_t repl_bs = vdupq_n_u8 ('\\');
  const uint8x16_t repl_qm = vdupq_n_u8 ('?');
  /* Gives byte I of each 8-byte half the single bit 1 << I, so that the
     0xff/0x00 compare results can be folded into a bit mask.  */
  const uint8x16_t xmask = (uint8x16_t) vdupq_n_u64 (0x8040201008040201ULL);

  /* Per-lane shift that moves the partial sums of the second 8 bytes
     into bits 8-15 of the final mask.  */
#ifdef __ARM_BIG_ENDIAN
  const int16x8_t shift = {8, 8, 8, 8, 0, 0, 0, 0};
#else
  const int16x8_t shift = {0, 0, 0, 0, 8, 8, 8, 8};
#endif

  unsigned int found;
  const uint8_t *p;
  uint8x16_t data;
  uint8x16_t t;
  uint16x8_t m;
  uint8x16_t u, v, w;

  /* Align the source pointer.  */
  p = (const uint8_t *)((uintptr_t)s & -16);

  /* Assuming random string start positions, with a 4k page size we'll take
     the slow path about 0.37% of the time.  */
  if (__builtin_expect ((AARCH64_MIN_PAGE_SIZE
                         - (((uintptr_t) s) & (AARCH64_MIN_PAGE_SIZE - 1)))
                        < 16, 0))
    {
      /* Slow path: the string starts near a possible page boundary.
         Load the aligned block containing S and ignore the matches
         that lie before S.  */
      uint32_t misalign, mask;

      misalign = (uintptr_t)s & 15;
      mask = (-1u << misalign) & 0xffff;
      data = vld1q_u8 (p);
      t = vceqq_u8 (data, repl_nl);
      u = vceqq_u8 (data, repl_cr);
      v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
      w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
      t = vorrq_u8 (v, w);
      t = vandq_u8 (t, xmask);
      m = vpaddlq_u8 (t);
      m = vshlq_u16 (m, shift);
      found = vaddvq_u16 (m);
      found &= mask;
      if (found)
        return (const uchar*)p + __builtin_ctz (found);
    }
  else
    {
      /* Fast path: an unaligned 16-byte load starting at S itself.  */
      data = vld1q_u8 ((const uint8_t *) s);
      t = vceqq_u8 (data, repl_nl);
      u = vceqq_u8 (data, repl_cr);
      v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
      w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
      t = vorrq_u8 (v, w);
      if (__builtin_expect (vpaddd_u64 ((uint64x2_t)t) != 0, 0))
        goto done;
    }

  /* Main loop: aligned loads, 16 bytes at a time, starting with the
     first aligned block after P.  */
  do
    {
      p += 16;
      data = vld1q_u8 (p);
      t = vceqq_u8 (data, repl_nl);
      u = vceqq_u8 (data, repl_cr);
      v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
      w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
      t = vorrq_u8 (v, w);
    } while (!vpaddd_u64 ((uint64x2_t)t));

done:
  /* Now that we've found the terminating substring, work out precisely where
     we need to stop.  */
  t = vandq_u8 (t, xmask);
  m = vpaddlq_u8 (t);
  m = vshlq_u16 (m, shift);
  found = vaddvq_u16 (m);
  /* When the match came from the unaligned load on the fast path, P is
     still at or before S and the offset is relative to S; otherwise it
     is relative to P.  */
  return (((((uintptr_t) p) < (uintptr_t) s) ? s : (const uchar *)p)
          + __builtin_ctz (found));
}
 852
 853 #elif defined (__ARM_NEON)
 854 #include "arm_neon.h"
 855
/* A version of the fast scanner using ARM NEON byte compares.  Returns a
   pointer to the first \n, \r, \\ or ? at or after S.  END is not
   consulted.  */
static const uchar *
search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
{
  const uint8x16_t repl_nl = vdupq_n_u8 ('\n');
  const uint8x16_t repl_cr = vdupq_n_u8 ('\r');
  const uint8x16_t repl_bs = vdupq_n_u8 ('\\');
  const uint8x16_t repl_qm = vdupq_n_u8 ('?');
  /* Gives byte I of each 8-byte half the single bit 1 << I, so that the
     0xff/0x00 compare results can be folded into a bit mask.  */
  const uint8x16_t xmask = (uint8x16_t) vdupq_n_u64 (0x8040201008040201ULL);

  unsigned int misalign, found, mask;
  const uint8_t *p;
  uint8x16_t data;

  /* Align the source pointer.  */
  misalign = (uintptr_t)s & 15;
  p = (const uint8_t *)((uintptr_t)s & -16);
  data = vld1q_u8 (p);

  /* Create a mask for the bytes that are valid within the first
     16-byte block.  The Idea here is that the AND with the mask
     within the loop is "free", since we need some AND or TEST
     insn in order to set the flags for the branch anyway.  */
  mask = (-1u << misalign) & 0xffff;

  /* Main loop, processing 16 bytes at a time.  The first block has
     already been loaded, so enter the loop at the compare step.  */
  goto start;

  do
    {
      uint8x8_t l;
      uint16x4_t m;
      uint32x2_t n;
      uint8x16_t t, u, v, w;

      p += 16;
      data = vld1q_u8 (p);
      /* Every block after the first one is valid in full.  */
      mask = 0xffff;

    start:
      t = vceqq_u8 (data, repl_nl);
      u = vceqq_u8 (data, repl_cr);
      v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
      w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
      t = vandq_u8 (vorrq_u8 (v, w), xmask);
      /* Pairwise additions fold the per-byte bits into two 8-bit sums,
         one per 8-byte half, held in the two 32-bit lanes of N.  */
      l = vpadd_u8 (vget_low_u8 (t), vget_high_u8 (t));
      m = vpaddl_u8 (l);
      n = vpaddl_u16 (m);

      /* Combine both sums into a 16-bit mask: shifting the 64-bit view
         of N right by 24 places the second sum in bits 8-15 of lane 0.  */
      found = vget_lane_u32 ((uint32x2_t) vorr_u64 ((uint64x1_t) n,
              vshr_n_u64 ((uint64x1_t) n, 24)), 0);
      found &= mask;
    }
  while (!found);

  /* FOUND contains 1 in bits for which we matched a relevant
     character.  Conversion to the byte index is trivial.  */
  found = __builtin_ctz (found);
  return (const uchar *)p + found;
}
 915
 916 #else
 917
 918 /* We only have one accelerated alternative.  Use a direct call so that
 919    we encourage inlining.  */
 920
 921 #define search_line_fast  search_line_acc_char
 922
 923 #endif
 924
/* Initialize the lexer if needed.  The only work done here is the call
   to init_vectorized_lexer, and that call is compiled in only when
   HAVE_init_vectorized_lexer is defined; otherwise this function does
   nothing.  */

void
_cpp_init_lexer (void)
{
#ifdef HAVE_init_vectorized_lexer
  init_vectorized_lexer ();
#endif
}
 934
/* Returns with a logical line that contains no escaped newlines or
   trigraphs.  This is a time-critical inner loop.

   The line starts at PFILE->buffer->next_line.  On return buffer->cur
   and buffer->line_base point at that start, the byte at the end of
   the cleaned line has been overwritten with '\n', a sentinel line
   note has been appended, and buffer->next_line points just past the
   last source byte consumed.  A line note is also recorded for every
   escaped newline and every trigraph encountered.  When
   buffer->from_stage3 is set, only the end of the line is located; no
   trigraph or escaped-newline processing is done.  */
void
_cpp_clean_line (cpp_reader *pfile)
{
  cpp_buffer *buffer;
  /* S is the read position; D is the write position once the line has
     to be compacted in place; P is scratch for scanning backwards.  */
  const uchar *s;
  uchar c, *d, *p;

  buffer = pfile->buffer;
  /* Discard the notes of the previous line.  */
  buffer->cur_note = buffer->notes_used = 0;
  buffer->cur = buffer->line_base = buffer->next_line;
  buffer->need_line = false;
  s = buffer->next_line;

  if (!buffer->from_stage3)
    {
      /* Most recent backslash seen on the fast path, if any.  */
      const uchar *pbackslash = NULL;

      /* Fast path.  This is the common case of an un-escaped line with
         no trigraphs.  The primary win here is by not writing any
         data back to memory until we have to.  */
      while (1)
        {
          /* Perform an optimized search for \n, \r, \\, ?.  */
          s = search_line_fast (s, buffer->rlimit);

          c = *s;
          if (c == '\\')
            {
              /* Record the location of the backslash and continue.  */
              pbackslash = s++;
            }
          else if (__builtin_expect (c == '?', 0))
            {
              if (__builtin_expect (s[1] == '?', false)
                   && _cpp_trigraph_map[s[2]])
                {
                  /* Have a trigraph.  We may or may not have to convert
                     it.  Add a line note regardless, for -Wtrigraphs.  */
                  add_line_note (buffer, s, s[2]);
                  if (CPP_OPTION (pfile, trigraphs))
                    {
                      /* We do, and that means we have to switch to the
                         slow path.  The replacement character is stored
                         over the first '?', and S is left on the last
                         byte of the trigraph so that the slow path's
                         pre-increment resumes right after it.  */
                      d = (uchar *) s;
                      *d = _cpp_trigraph_map[s[2]];
                      s += 2;
                      goto slow_path;
                    }
                }
              /* Not a trigraph.  Continue on fast-path.  */
              s++;
            }
          else
            break;
        }

      /* This must be \r or \n.  We're either done, or we'll be forced
         to write back to the buffer and continue on the slow path.  */
      d = (uchar *) s;

      if (__builtin_expect (s == buffer->rlimit, false))
        goto done;

      /* DOS line ending? */
      if (__builtin_expect (c == '\r', false) && s[1] == '\n')
        {
          s++;
          if (s == buffer->rlimit)
            goto done;
        }

      /* Without a backslash on the line the newline cannot be
         escaped.  */
      if (__builtin_expect (pbackslash == NULL, true))
        goto done;

      /* Check for escaped newline.  Step back over any horizontal
         whitespace before the line end; the newline is escaped only if
         the character before that is the last backslash we saw.  */
      p = d;
      while (is_nvspace (p[-1]))
        p--;
      if (p - 1 != pbackslash)
        goto done;

      /* Have an escaped newline; process it and proceed to
         the slow path.  The note type is ' ' when whitespace separates
         the backslash from the newline and '\\' otherwise.  D is set
         so that the slow path's first store lands on the backslash.  */
      add_line_note (buffer, p - 1, p != d ? ' ' : '\\');
      d = p - 2;
      buffer->next_line = p - 1;

    slow_path:
      /* Copy bytes from S down to D, dropping escaped newlines and
         substituting trigraphs as we go.  */
      while (1)
        {
          c = *++s;
          *++d = c;

          if (c == '\n' || c == '\r')
            {
              /* Handle DOS line endings.  */
              if (c == '\r' && s != buffer->rlimit && s[1] == '\n')
                s++;
              if (s == buffer->rlimit)
                break;

              /* Escaped?  buffer->next_line bounds the backwards scan
                 at the position of the previous splice.  */
              p = d;
              while (p != buffer->next_line && is_nvspace (p[-1]))
                p--;
              if (p == buffer->next_line || p[-1] != '\\')
                break;

              add_line_note (buffer, p - 1, p != d ? ' ': '\\');
              d = p - 2;
              buffer->next_line = p - 1;
            }
          else if (c == '?' && s[1] == '?' && _cpp_trigraph_map[s[2]])
            {
              /* Add a note regardless, for the benefit of -Wtrigraphs.  */
              add_line_note (buffer, d, s[2]);
              if (CPP_OPTION (pfile, trigraphs))
                {
                  /* Replace the '?' just copied and skip the rest of
                     the trigraph in the input.  */
                  *d = _cpp_trigraph_map[s[2]];
                  s += 2;
                }
            }
        }
    }
  else
    {
      /* Only the end of the line needs to be found.  */
      while (*s != '\n' && *s != '\r')
        s++;
      d = (uchar *) s;

      /* Handle DOS line endings.  */
      if (*s == '\r' && s + 1 != buffer->rlimit && s[1] == '\n')
        s++;
    }

 done:
  /* Terminate the cleaned line; D is where it ends after compaction and
     S is the last source byte consumed.  */
  *d = '\n';
  /* A sentinel note that should never be processed.  */
  add_line_note (buffer, d + 1, '\n');
  buffer->next_line = s + 1;
}
1078
1079 template <bool lexing_raw_string>
1080 static bool get_fresh_line_impl (cpp_reader *pfile);
1081
1082 /* Return true if the trigraph indicated by NOTE should be warned
1083    about in a comment.  */
1084 static bool
1085 warn_in_comment (cpp_reader *pfile, _cpp_line_note *note)
1086 {
1087   const uchar *p;
1088
1089   /* Within comments we don't warn about trigraphs, unless the
1090      trigraph forms an escaped newline, as that may change
1091      behavior.  */
1092   if (note->type != '/')
1093     return false;
1094
1095   /* If -trigraphs, then this was an escaped newline iff the next note
1096      is coincident.  */
1097   if (CPP_OPTION (pfile, trigraphs))
1098     return note[1].pos == note->pos;
1099
1100   /* Otherwise, see if this forms an escaped newline.  */
1101   p = note->pos + 3;
1102   while (is_nvspace (*p))
1103     p++;
1104
1105   /* There might have been escaped newlines between the trigraph and the
1106      newline we found.  Hence the position test.  */
1107   return (*p == '\n' && p < note[1].pos);
1108 }
1109
1110 /* Process the notes created by add_line_note as far as the current
1111    location.  */
1112 void
1113 _cpp_process_line_notes (cpp_reader *pfile, int in_comment)
1114 {
1115   cpp_buffer *buffer = pfile->buffer;
1116
1117   for (;;)
1118     {
1119       _cpp_line_note *note = &buffer->notes[buffer->cur_note];
1120       unsigned int col;
1121
1122       if (note->pos > buffer->cur)
1123         break;
1124
1125       buffer->cur_note++;
1126       col = CPP_BUF_COLUMN (buffer, note->pos + 1);
1127
1128       if (note->type == '\\' || note->type == ' ')
1129         {
1130           if (note->type == ' ' && !in_comment)
1131             cpp_error_with_line (pfile, CPP_DL_WARNING, pfile->line_table->highest_line, col,
1132                                  "backslash and newline separated by space");
1133
1134           if (buffer->next_line > buffer->rlimit)
1135             {
1136               cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line, col,
1137                                    "backslash-newline at end of file");
1138               /* Prevent "no newline at end of file" warning.  */
1139               buffer->next_line = buffer->rlimit;
1140             }
1141
1142           buffer->line_base = note->pos;
1143           CPP_INCREMENT_LINE (pfile, 0);
1144         }
1145       else if (_cpp_trigraph_map[note->type])
1146         {
1147           if (CPP_OPTION (pfile, warn_trigraphs)
1148               && (!in_comment || warn_in_comment (pfile, note)))
1149             {
1150               if (CPP_OPTION (pfile, trigraphs))
1151                 cpp_warning_with_line (pfile, CPP_W_TRIGRAPHS,
1152                                        pfile->line_table->highest_line, col,
1153                                        "trigraph ??%c converted to %c",
1154                                        note->type,
1155                                        (int) _cpp_trigraph_map[note->type]);
1156               else
1157                 {
1158                   cpp_warning_with_line
1159                     (pfile, CPP_W_TRIGRAPHS,
1160                      pfile->line_table->highest_line, col,
1161                      "trigraph ??%c ignored, use -trigraphs to enable",
1162                      note->type);
1163                 }
1164             }
1165         }
1166       else if (note->type == 0)
1167         /* Already processed in lex_raw_string.  */;
1168       else
1169         abort ();
1170     }
1171 }
1172
1173 namespace bidi {
1174   enum class kind {
1175     NONE, LRE, RLE, LRO, RLO, LRI, RLI, FSI, PDF, PDI, LTR, RTL
1176   };
1177
1178   /* All the UTF-8 encodings of bidi characters start with E2.  */
1179   constexpr uchar utf8_start = 0xe2;
1180
1181   struct context
1182   {
1183     context () {}
1184     context (location_t loc, kind k, bool pdf, bool ucn)
1185     : m_loc (loc), m_kind (k), m_pdf (pdf), m_ucn (ucn)
1186     {
1187     }
1188
1189     kind get_pop_kind () const
1190     {
1191       return m_pdf ? kind::PDF : kind::PDI;
1192     }
1193     bool ucn_p () const
1194     {
1195       return m_ucn;
1196     }
1197
1198     location_t m_loc;
1199     kind m_kind;
1200     unsigned m_pdf : 1;
1201     unsigned m_ucn : 1;
1202   };
1203
1204   /* A vector holding currently open bidi contexts.  We use a char for
1205      each context, its LSB is 1 if it represents a PDF context, 0 if it
1206      represents a PDI context.  The next bit is 1 if this context was open
1207      by a bidi character written as a UCN, and 0 when it was UTF-8.  */
1208   semi_embedded_vec <context, 16> vec;
1209
1210   /* Close the whole comment/identifier/string literal/character constant
1211      context.  */
1212   void on_close ()
1213   {
1214     vec.truncate (0);
1215   }
1216
1217   /* Pop the last element in the vector.  */
1218   void pop ()
1219   {
1220     unsigned int len = vec.count ();
1221     gcc_checking_assert (len > 0);
1222     vec.truncate (len - 1);
1223   }
1224
1225   /* Return the pop kind of the context of the Ith element.  */
1226   kind pop_kind_at (unsigned int i)
1227   {
1228     return vec[i].get_pop_kind ();
1229   }
1230
1231   /* Return the pop kind of the context that is currently opened.  */
1232   kind current_ctx ()
1233   {
1234     unsigned int len = vec.count ();
1235     if (len == 0)
1236       return kind::NONE;
1237     return vec[len - 1].get_pop_kind ();
1238   }
1239
1240   /* Return true if the current context comes from a UCN origin, that is,
1241      the bidi char which started this bidi context was written as a UCN.  */
1242   bool current_ctx_ucn_p ()
1243   {
1244     unsigned int len = vec.count ();
1245     gcc_checking_assert (len > 0);
1246     return vec[len - 1].m_ucn;
1247   }
1248
1249   location_t current_ctx_loc ()
1250   {
1251     unsigned int len = vec.count ();
1252     gcc_checking_assert (len > 0);
1253     return vec[len - 1].m_loc;
1254   }
1255
1256   /* We've read a bidi char, update the current vector as necessary.
1257      LOC is only valid when K is not kind::NONE.  */
1258   void on_char (kind k, bool ucn_p, location_t loc)
1259   {
1260     switch (k)
1261       {
1262       case kind::LRE:
1263       case kind::RLE:
1264       case kind::LRO:
1265       case kind::RLO:
1266         vec.push (context (loc, k, true, ucn_p));
1267         break;
1268       case kind::LRI:
1269       case kind::RLI:
1270       case kind::FSI:
1271         vec.push (context (loc, k, false, ucn_p));
1272         break;
1273       /* PDF terminates the scope of the last LRE, RLE, LRO, or RLO
1274          whose scope has not yet been terminated.  */
1275       case kind::PDF:
1276         if (current_ctx () == kind::PDF)
1277           pop ();
1278         break;
1279       /* PDI terminates the scope of the last LRI, RLI, or FSI whose
1280          scope has not yet been terminated, as well as the scopes of
1281          any subsequent LREs, RLEs, LROs, or RLOs whose scopes have not
1282          yet been terminated.  */
1283       case kind::PDI:
1284         for (int i = vec.count () - 1; i >= 0; --i)
1285           if (pop_kind_at (i) == kind::PDI)
1286             {
1287               vec.truncate (i);
1288               break;
1289             }
1290         break;
1291       case kind::LTR:
1292       case kind::RTL:
1293         /* These aren't popped by a PDF/PDI.  */
1294         break;
1295       ATTR_LIKELY case kind::NONE:
1296         break;
1297       default:
1298         abort ();
1299       }
1300   }
1301
1302   /* Return a descriptive string for K.  */
1303   const char *to_str (kind k)
1304   {
1305     switch (k)
1306       {
1307       case kind::LRE:
1308         return "U+202A (LEFT-TO-RIGHT EMBEDDING)";
1309       case kind::RLE:
1310         return "U+202B (RIGHT-TO-LEFT EMBEDDING)";
1311       case kind::LRO:
1312         return "U+202D (LEFT-TO-RIGHT OVERRIDE)";
1313       case kind::RLO:
1314         return "U+202E (RIGHT-TO-LEFT OVERRIDE)";
1315       case kind::LRI:
1316         return "U+2066 (LEFT-TO-RIGHT ISOLATE)";
1317       case kind::RLI:
1318         return "U+2067 (RIGHT-TO-LEFT ISOLATE)";
1319       case kind::FSI:
1320         return "U+2068 (FIRST STRONG ISOLATE)";
1321       case kind::PDF:
1322         return "U+202C (POP DIRECTIONAL FORMATTING)";
1323       case kind::PDI:
1324         return "U+2069 (POP DIRECTIONAL ISOLATE)";
1325       case kind::LTR:
1326         return "U+200E (LEFT-TO-RIGHT MARK)";
1327       case kind::RTL:
1328         return "U+200F (RIGHT-TO-LEFT MARK)";
1329       default:
1330         abort ();
1331       }
1332   }
1333 }
1334
1335 /* Get location_t for the range of bytes [START, START + NUM_BYTES)
1336    within the current line in FILE, with the caret at START.  */
1337
1338 static location_t
1339 get_location_for_byte_range_in_cur_line (cpp_reader *pfile,
1340                                          const unsigned char *const start,
1341                                          size_t num_bytes)
1342 {
1343   gcc_checking_assert (num_bytes > 0);
1344
1345   /* CPP_BUF_COLUMN and linemap_position_for_column both refer
1346      to offsets in bytes, but CPP_BUF_COLUMN is 0-based,
1347      whereas linemap_position_for_column is 1-based.  */
1348
1349   /* Get 0-based offsets within the line.  */
1350   size_t start_offset = CPP_BUF_COLUMN (pfile->buffer, start);
1351   size_t end_offset = start_offset + num_bytes - 1;
1352
1353   /* Now convert to location_t, where "columns" are 1-based byte offsets.  */
1354   location_t start_loc = linemap_position_for_column (pfile->line_table,
1355                                                       start_offset + 1);
1356   location_t end_loc = linemap_position_for_column (pfile->line_table,
1357                                                      end_offset + 1);
1358
1359   if (start_loc == end_loc)
1360     return start_loc;
1361
1362   source_range src_range;
1363   src_range.m_start = start_loc;
1364   src_range.m_finish = end_loc;
1365   location_t combined_loc = COMBINE_LOCATION_DATA (pfile->line_table,
1366                                                    start_loc,
1367                                                    src_range,
1368                                                    NULL,
1369                                                    0);
1370   return combined_loc;
1371 }
1372
1373 /* Parse a sequence of 3 bytes starting with P and return its bidi code.  */
1374
1375 static bidi::kind
1376 get_bidi_utf8_1 (const unsigned char *const p)
1377 {
1378   gcc_checking_assert (p[0] == bidi::utf8_start);
1379
1380   if (p[1] == 0x80)
1381     switch (p[2])
1382       {
1383       case 0xaa:
1384         return bidi::kind::LRE;
1385       case 0xab:
1386         return bidi::kind::RLE;
1387       case 0xac:
1388         return bidi::kind::PDF;
1389       case 0xad:
1390         return bidi::kind::LRO;
1391       case 0xae:
1392         return bidi::kind::RLO;
1393       case 0x8e:
1394         return bidi::kind::LTR;
1395       case 0x8f:
1396         return bidi::kind::RTL;
1397       default:
1398         break;
1399       }
1400   else if (p[1] == 0x81)
1401     switch (p[2])
1402       {
1403       case 0xa6:
1404         return bidi::kind::LRI;
1405       case 0xa7:
1406         return bidi::kind::RLI;
1407       case 0xa8:
1408         return bidi::kind::FSI;
1409       case 0xa9:
1410         return bidi::kind::PDI;
1411       default:
1412         break;
1413       }
1414
1415   return bidi::kind::NONE;
1416 }
1417
1418 /* Parse a sequence of 3 bytes starting with P and return its bidi code.
1419    If the kind is not NONE, write the location to *OUT.*/
1420
1421 static bidi::kind
1422 get_bidi_utf8 (cpp_reader *pfile, const unsigned char *const p, location_t *out)
1423 {
1424   bidi::kind result = get_bidi_utf8_1 (p);
1425   if (result != bidi::kind::NONE)
1426     {
1427       /* We have a sequence of 3 bytes starting at P.  */
1428       *out = get_location_for_byte_range_in_cur_line (pfile, p, 3);
1429     }
1430   return result;
1431 }
1432
1433 /* Parse a UCN where P points just past \u or \U and return its bidi code.  */
1434
1435 static bidi::kind
1436 get_bidi_ucn_1 (const unsigned char *p, bool is_U, const unsigned char **end)
1437 {
1438   /* 6.4.3 Universal Character Names
1439       \u hex-quad
1440       \U hex-quad hex-quad
1441       \u { simple-hexadecimal-digit-sequence }
1442      where \unnnn means \U0000nnnn.  */
1443
1444   *end = p + 4;
1445   if (is_U)
1446     {
1447       if (p[0] != '0' || p[1] != '0' || p[2] != '0' || p[3] != '0')
1448         return bidi::kind::NONE;
1449       /* Skip 4B so we can treat \u and \U the same below.  */
1450       p += 4;
1451       *end += 4;
1452     }
1453   else if (p[0] == '{')
1454     {
1455       p++;
1456       while (*p == '0')
1457         p++;
1458       if (p[0] != '2'
1459           || p[1] != '0'
1460           || !ISXDIGIT (p[2])
1461           || !ISXDIGIT (p[3])
1462           || p[4] != '}')
1463         return bidi::kind::NONE;
1464       *end = p + 5;
1465     }
1466
1467   /* All code points we are looking for start with 20xx.  */
1468   if (p[0] != '2' || p[1] != '0')
1469     return bidi::kind::NONE;
1470   else if (p[2] == '2')
1471     switch (p[3])
1472       {
1473       case 'a':
1474       case 'A':
1475         return bidi::kind::LRE;
1476       case 'b':
1477       case 'B':
1478         return bidi::kind::RLE;
1479       case 'c':
1480       case 'C':
1481         return bidi::kind::PDF;
1482       case 'd':
1483       case 'D':
1484         return bidi::kind::LRO;
1485       case 'e':
1486       case 'E':
1487         return bidi::kind::RLO;
1488       default:
1489         break;
1490       }
1491   else if (p[2] == '6')
1492     switch (p[3])
1493       {
1494       case '6':
1495         return bidi::kind::LRI;
1496       case '7':
1497         return bidi::kind::RLI;
1498       case '8':
1499         return bidi::kind::FSI;
1500       case '9':
1501         return bidi::kind::PDI;
1502       default:
1503         break;
1504       }
1505   else if (p[2] == '0')
1506     switch (p[3])
1507       {
1508       case 'e':
1509       case 'E':
1510         return bidi::kind::LTR;
1511       case 'f':
1512       case 'F':
1513         return bidi::kind::RTL;
1514       default:
1515         break;
1516       }
1517
1518   return bidi::kind::NONE;
1519 }
1520
1521 /* Parse a UCN where P points just past \u or \U and return its bidi code.
1522    If the kind is not NONE, write the location to *OUT.  */
1523
1524 static bidi::kind
1525 get_bidi_ucn (cpp_reader *pfile, const unsigned char *p, bool is_U,
1526               location_t *out)
1527 {
1528   const unsigned char *end;
1529   bidi::kind result = get_bidi_ucn_1 (p, is_U, &end);
1530   if (result != bidi::kind::NONE)
1531     {
1532       const unsigned char *start = p - 2;
1533       size_t num_bytes = end - start;
1534       *out = get_location_for_byte_range_in_cur_line (pfile, start, num_bytes);
1535     }
1536   return result;
1537 }
1538
1539 /* Parse a named universal character escape where P points just past \N and
1540    return its bidi code.  If the kind is not NONE, write the location to
1541    *OUT.  */
1542
1543 static bidi::kind
1544 get_bidi_named (cpp_reader *pfile, const unsigned char *p, location_t *out)
1545 {
1546   bidi::kind result = bidi::kind::NONE;
1547   if (*p != '{')
1548     return bidi::kind::NONE;
1549   if (strncmp ((const char *) (p + 1), "LEFT-TO-RIGHT ", 14) == 0)
1550     {
1551       if (strncmp ((const char *) (p + 15), "MARK}", 5) == 0)
1552         result = bidi::kind::LTR;
1553       else if (strncmp ((const char *) (p + 15), "EMBEDDING}", 10) == 0)
1554         result = bidi::kind::LRE;
1555       else if (strncmp ((const char *) (p + 15), "OVERRIDE}", 9) == 0)
1556         result = bidi::kind::LRO;
1557       else if (strncmp ((const char *) (p + 15), "ISOLATE}", 8) == 0)
1558         result = bidi::kind::LRI;
1559     }
1560   else if (strncmp ((const char *) (p + 1), "RIGHT-TO-LEFT ", 14) == 0)
1561     {
1562       if (strncmp ((const char *) (p + 15), "MARK}", 5) == 0)
1563         result = bidi::kind::RTL;
1564       else if (strncmp ((const char *) (p + 15), "EMBEDDING}", 10) == 0)
1565         result = bidi::kind::RLE;
1566       else if (strncmp ((const char *) (p + 15), "OVERRIDE}", 9) == 0)
1567         result = bidi::kind::RLO;
1568       else if (strncmp ((const char *) (p + 15), "ISOLATE}", 8) == 0)
1569         result = bidi::kind::RLI;
1570     }
1571   else if (strncmp ((const char *) (p + 1), "POP DIRECTIONAL ", 16) == 0)
1572     {
1573       if (strncmp ((const char *) (p + 16), "FORMATTING}", 11) == 0)
1574         result = bidi::kind::PDF;
1575       else if (strncmp ((const char *) (p + 16), "ISOLATE}", 8) == 0)
1576         result = bidi::kind::PDI;
1577     }
1578   else if (strncmp ((const char *) (p + 1), "FIRST STRONG ISOLATE}", 21) == 0)
1579     result = bidi::kind::FSI;
1580   if (result != bidi::kind::NONE)
1581     *out = get_location_for_byte_range_in_cur_line (pfile, p - 2,
1582                                                     (strchr ((const char *)
1583                                                              (p + 1), '}')
1584                                                      - (const char *) p)
1585                                                     + 3);
1586   return result;
1587 }
1588
1589 /* Subclass of rich_location for reporting on unpaired UTF-8
1590    bidirectional control character(s).
1591    Escape the source lines on output, and show all unclosed
1592    bidi context, labelling everything.  */
1593
1594 class unpaired_bidi_rich_location : public rich_location
1595 {
1596  public:
1597   class custom_range_label : public range_label
1598   {
1599    public:
1600      label_text get_text (unsigned range_idx) const final override
1601      {
1602        /* range 0 is the primary location; each subsequent range i + 1
1603           is for bidi::vec[i].  */
1604        if (range_idx > 0)
1605          {
1606            const bidi::context &ctxt (bidi::vec[range_idx - 1]);
1607            return label_text::borrow (bidi::to_str (ctxt.m_kind));
1608          }
1609        else
1610          return label_text::borrow (_("end of bidirectional context"));
1611      }
1612   };
1613
1614   unpaired_bidi_rich_location (cpp_reader *pfile, location_t loc)
1615   : rich_location (pfile->line_table, loc, &m_custom_label)
1616   {
1617     set_escape_on_output (true);
1618     for (unsigned i = 0; i < bidi::vec.count (); i++)
1619       add_range (bidi::vec[i].m_loc,
1620                  SHOW_RANGE_WITHOUT_CARET,
1621                  &m_custom_label);
1622   }
1623
1624  private:
1625    custom_range_label m_custom_label;
1626 };
1627
1628 /* We're closing a bidi context, that is, we've encountered a newline,
1629    are closing a C-style comment, or are at the end of a string literal,
1630    character constant, or identifier.  Warn if this context was not
1631    properly terminated by a PDI or PDF.  P points to the last character
1632    in this context.  */
1633
1634 static void
1635 maybe_warn_bidi_on_close (cpp_reader *pfile, const uchar *p)
1636 {
1637   const auto warn_bidi = CPP_OPTION (pfile, cpp_warn_bidirectional);
1638   if (bidi::vec.count () > 0
1639       && (warn_bidi & bidirectional_unpaired
1640           && (!bidi::current_ctx_ucn_p ()
1641               || (warn_bidi & bidirectional_ucn))))
1642     {
1643       const location_t loc
1644         = linemap_position_for_column (pfile->line_table,
1645                                        CPP_BUF_COLUMN (pfile->buffer, p));
1646       unpaired_bidi_rich_location rich_loc (pfile, loc);
1647       /* cpp_callbacks doesn't yet have a way to handle singular vs plural
1648          forms of a diagnostic, so fake it for now.  */
1649       if (bidi::vec.count () > 1)
1650         cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
1651                         "unpaired UTF-8 bidirectional control characters "
1652                         "detected");
1653       else
1654         cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
1655                         "unpaired UTF-8 bidirectional control character "
1656                         "detected");
1657     }
1658   /* We're done with this context.  */
1659   bidi::on_close ();
1660 }
1661
1662 /* We're at the beginning or in the middle of an identifier/comment/string
1663    literal/character constant.  Warn if we've encountered a bidi character.
1664    KIND says which bidi control character it was; UCN_P is true iff this bidi
1665    control character was written as a UCN.  LOC is the location of the
1666    character, but is only valid if KIND != bidi::kind::NONE.  */
1667
1668 static void
1669 maybe_warn_bidi_on_char (cpp_reader *pfile, bidi::kind kind,
1670                          bool ucn_p, location_t loc)
1671 {
1672   if (__builtin_expect (kind == bidi::kind::NONE, 1))
1673     return;
1674
1675   const auto warn_bidi = CPP_OPTION (pfile, cpp_warn_bidirectional);
1676
1677   if (warn_bidi & (bidirectional_unpaired|bidirectional_any))
1678     {
1679       rich_location rich_loc (pfile->line_table, loc);
1680       rich_loc.set_escape_on_output (true);
1681
1682       /* It seems excessive to warn about a PDI/PDF that is closing
1683          an opened context because we've already warned about the
1684          opening character.  Except warn when we have a UCN x UTF-8
1685          mismatch, if UCN checking is enabled.  */
1686       if (kind == bidi::current_ctx ())
1687         {
1688           if (warn_bidi == (bidirectional_unpaired|bidirectional_ucn)
1689               && bidi::current_ctx_ucn_p () != ucn_p)
1690             {
1691               rich_loc.add_range (bidi::current_ctx_loc ());
1692               cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
1693                               "UTF-8 vs UCN mismatch when closing "
1694                               "a context by \"%s\"", bidi::to_str (kind));
1695             }
1696         }
1697       else if (warn_bidi & bidirectional_any
1698                && (!ucn_p || (warn_bidi & bidirectional_ucn)))
1699         {
1700           if (kind == bidi::kind::PDF || kind == bidi::kind::PDI)
1701             cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
1702                             "\"%s\" is closing an unopened context",
1703                             bidi::to_str (kind));
1704           else
1705             cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
1706                             "found problematic Unicode character \"%s\"",
1707                             bidi::to_str (kind));
1708         }
1709     }
1710   /* We're done with this context.  */
1711   bidi::on_char (kind, ucn_p, loc);
1712 }
1713
/* Byte-value thresholds used when diagnosing invalid UTF-8: a byte in
   the range [utf8_continuation, utf8_signifier) is accepted as a
   continuation byte, and a byte >= utf8_signifier is accepted as the
   first byte of a multi-byte sequence.  */
static const cppchar_t utf8_continuation = 0x80;
static const cppchar_t utf8_signifier = 0xC0;
1716
1717 /* Emit -Winvalid-utf8 warning on invalid UTF-8 character starting
1718    at PFILE->buffer->cur.  Return a pointer after the diagnosed
1719    invalid character.  */
1720
1721 static const uchar *
1722 _cpp_warn_invalid_utf8 (cpp_reader *pfile)
1723 {
1724   cpp_buffer *buffer = pfile->buffer;
1725   const uchar *cur = buffer->cur;
1726   bool pedantic = (CPP_PEDANTIC (pfile)
1727                    && CPP_OPTION (pfile, cpp_warn_invalid_utf8) == 2);
1728
1729   if (cur[0] < utf8_signifier
1730       || cur[1] < utf8_continuation || cur[1] >= utf8_signifier)
1731     {
1732       if (pedantic)
1733         cpp_error_with_line (pfile, CPP_DL_PEDWARN,
1734                              pfile->line_table->highest_line,
1735                              CPP_BUF_COL (buffer),
1736                              "invalid UTF-8 character <%x>",
1737                              cur[0]);
1738       else
1739         cpp_warning_with_line (pfile, CPP_W_INVALID_UTF8,
1740                                pfile->line_table->highest_line,
1741                                CPP_BUF_COL (buffer),
1742                                "invalid UTF-8 character <%x>",
1743                                cur[0]);
1744       return cur + 1;
1745     }
1746   else if (cur[2] < utf8_continuation || cur[2] >= utf8_signifier)
1747     {
1748       if (pedantic)
1749         cpp_error_with_line (pfile, CPP_DL_PEDWARN,
1750                              pfile->line_table->highest_line,
1751                              CPP_BUF_COL (buffer),
1752                              "invalid UTF-8 character <%x><%x>",
1753                              cur[0], cur[1]);
1754       else
1755         cpp_warning_with_line (pfile, CPP_W_INVALID_UTF8,
1756                                pfile->line_table->highest_line,
1757                                CPP_BUF_COL (buffer),
1758                                "invalid UTF-8 character <%x><%x>",
1759                                cur[0], cur[1]);
1760       return cur + 2;
1761     }
1762   else if (cur[3] < utf8_continuation || cur[3] >= utf8_signifier)
1763     {
1764       if (pedantic)
1765         cpp_error_with_line (pfile, CPP_DL_PEDWARN,
1766                              pfile->line_table->highest_line,
1767                              CPP_BUF_COL (buffer),
1768                              "invalid UTF-8 character <%x><%x><%x>",
1769                              cur[0], cur[1], cur[2]);
1770       else
1771         cpp_warning_with_line (pfile, CPP_W_INVALID_UTF8,
1772                                pfile->line_table->highest_line,
1773                                CPP_BUF_COL (buffer),
1774                                "invalid UTF-8 character <%x><%x><%x>",
1775                                cur[0], cur[1], cur[2]);
1776       return cur + 3;
1777     }
1778   else
1779     {
1780       if (pedantic)
1781         cpp_error_with_line (pfile, CPP_DL_PEDWARN,
1782                              pfile->line_table->highest_line,
1783                              CPP_BUF_COL (buffer),
1784                              "invalid UTF-8 character <%x><%x><%x><%x>",
1785                              cur[0], cur[1], cur[2], cur[3]);
1786       else
1787         cpp_warning_with_line (pfile, CPP_W_INVALID_UTF8,
1788                                pfile->line_table->highest_line,
1789                                CPP_BUF_COL (buffer),
1790                                "invalid UTF-8 character <%x><%x><%x><%x>",
1791                                cur[0], cur[1], cur[2], cur[3]);
1792       return cur + 4;
1793     }
1794 }
1795
1796 /* Helper function of *skip_*_comment and lex*_string.  For C,
1797    character at CUR[-1] with MSB set handle -Wbidi-chars* and
1798    -Winvalid-utf8 diagnostics and return pointer to first character
1799    that should be processed next.  */
1800
1801 static inline const uchar *
1802 _cpp_handle_multibyte_utf8 (cpp_reader *pfile, uchar c,
1803                             const uchar *cur, bool warn_bidi_p,
1804                             bool warn_invalid_utf8_p)
1805 {
1806   /* If this is a beginning of a UTF-8 encoding, it might be
1807      a bidirectional control character.  */
1808   if (c == bidi::utf8_start && warn_bidi_p)
1809     {
1810       location_t loc;
1811       bidi::kind kind = get_bidi_utf8 (pfile, cur - 1, &loc);
1812       maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
1813     }
1814   if (!warn_invalid_utf8_p)
1815     return cur;
1816   if (c >= utf8_signifier)
1817     {
1818       cppchar_t s;
1819       const uchar *pstr = cur - 1;
1820       if (_cpp_valid_utf8 (pfile, &pstr, pfile->buffer->rlimit, 0, NULL, &s)
1821           && s <= UCS_LIMIT)
1822         return pstr;
1823     }
1824   pfile->buffer->cur = cur - 1;
1825   return _cpp_warn_invalid_utf8 (pfile);
1826 }
1827
1828 /* Skip a C-style block comment.  We find the end of the comment by
1829    seeing if an asterisk is before every '/' we encounter.  Returns
1830    nonzero if comment terminated by EOF, zero otherwise.
1831
1832    Buffer->cur points to the initial asterisk of the comment.  */
1833 bool
1834 _cpp_skip_block_comment (cpp_reader *pfile)
1835 {
1836   cpp_buffer *buffer = pfile->buffer;
1837   const uchar *cur = buffer->cur;
1838   uchar c;
1839   const bool warn_bidi_p = pfile->warn_bidi_p ();
1840   const bool warn_invalid_utf8_p = CPP_OPTION (pfile, cpp_warn_invalid_utf8);
1841   const bool warn_bidi_or_invalid_utf8_p = warn_bidi_p | warn_invalid_utf8_p;
1842
1843   cur++;
1844   if (*cur == '/')
1845     cur++;
1846
1847   for (;;)
1848     {
1849       /* People like decorating comments with '*', so check for '/'
1850          instead for efficiency.  */
1851       c = *cur++;
1852
1853       if (c == '/')
1854         {
1855           if (cur[-2] == '*')
1856             {
1857               if (warn_bidi_p)
1858                 maybe_warn_bidi_on_close (pfile, cur);
1859               break;
1860             }
1861
1862           /* Warn about potential nested comments, but not if the '/'
1863              comes immediately before the true comment delimiter.
1864              Don't bother to get it right across escaped newlines.  */
1865           if (CPP_OPTION (pfile, warn_comments)
1866               && cur[0] == '*' && cur[1] != '/')
1867             {
1868               buffer->cur = cur;
1869               cpp_warning_with_line (pfile, CPP_W_COMMENTS,
1870                                      pfile->line_table->highest_line,
1871                                      CPP_BUF_COL (buffer),
1872                                      "\"/*\" within comment");
1873             }
1874         }
1875       else if (c == '\n')
1876         {
1877           unsigned int cols;
1878           buffer->cur = cur - 1;
1879           if (warn_bidi_p)
1880             maybe_warn_bidi_on_close (pfile, cur);
1881           _cpp_process_line_notes (pfile, true);
1882           if (buffer->next_line >= buffer->rlimit)
1883             return true;
1884           _cpp_clean_line (pfile);
1885
1886           cols = buffer->next_line - buffer->line_base;
1887           CPP_INCREMENT_LINE (pfile, cols);
1888
1889           cur = buffer->cur;
1890         }
1891       else if (__builtin_expect (c >= utf8_continuation, 0)
1892                && warn_bidi_or_invalid_utf8_p)
1893         cur = _cpp_handle_multibyte_utf8 (pfile, c, cur, warn_bidi_p,
1894                                           warn_invalid_utf8_p);
1895     }
1896
1897   buffer->cur = cur;
1898   _cpp_process_line_notes (pfile, true);
1899   return false;
1900 }
1901
1902 /* Skip a C++ line comment, leaving buffer->cur pointing to the
1903    terminating newline.  Handles escaped newlines.  Returns nonzero
1904    if a multiline comment.  */
1905 static int
1906 skip_line_comment (cpp_reader *pfile)
1907 {
1908   cpp_buffer *buffer = pfile->buffer;
1909   location_t orig_line = pfile->line_table->highest_line;
1910   const bool warn_bidi_p = pfile->warn_bidi_p ();
1911   const bool warn_invalid_utf8_p = CPP_OPTION (pfile, cpp_warn_invalid_utf8);
1912   const bool warn_bidi_or_invalid_utf8_p = warn_bidi_p | warn_invalid_utf8_p;
1913
1914   if (!warn_bidi_or_invalid_utf8_p)
1915     while (*buffer->cur != '\n')
1916       buffer->cur++;
1917   else if (!warn_invalid_utf8_p)
1918     {
1919       while (*buffer->cur != '\n'
1920              && *buffer->cur != bidi::utf8_start)
1921         buffer->cur++;
1922       if (__builtin_expect (*buffer->cur == bidi::utf8_start, 0))
1923         {
1924           while (*buffer->cur != '\n')
1925             {
1926               if (__builtin_expect (*buffer->cur == bidi::utf8_start, 0))
1927                 {
1928                   location_t loc;
1929                   bidi::kind kind = get_bidi_utf8 (pfile, buffer->cur, &loc);
1930                   maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
1931                 }
1932               buffer->cur++;
1933             }
1934           maybe_warn_bidi_on_close (pfile, buffer->cur);
1935         }
1936     }
1937   else
1938     {
1939       while (*buffer->cur != '\n')
1940         {
1941           if (*buffer->cur < utf8_continuation)
1942             {
1943               buffer->cur++;
1944               continue;
1945             }
1946           buffer->cur
1947             = _cpp_handle_multibyte_utf8 (pfile, *buffer->cur, buffer->cur + 1,
1948                                           warn_bidi_p, warn_invalid_utf8_p);
1949         }
1950       if (warn_bidi_p)
1951         maybe_warn_bidi_on_close (pfile, buffer->cur);
1952     }
1953
1954   _cpp_process_line_notes (pfile, true);
1955   return orig_line != pfile->line_table->highest_line;
1956 }
1957
1958 /* Skips whitespace, saving the next non-whitespace character.  */
1959 static void
1960 skip_whitespace (cpp_reader *pfile, cppchar_t c)
1961 {
1962   cpp_buffer *buffer = pfile->buffer;
1963   bool saw_NUL = false;
1964
1965   do
1966     {
1967       /* Horizontal space always OK.  */
1968       if (c == ' ' || c == '\t')
1969         ;
1970       /* Just \f \v or \0 left.  */
1971       else if (c == '\0')
1972         saw_NUL = true;
1973       else if (pfile->state.in_directive && CPP_PEDANTIC (pfile))
1974         cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line,
1975                              CPP_BUF_COL (buffer),
1976                              "%s in preprocessing directive",
1977                              c == '\f' ? "form feed" : "vertical tab");
1978
1979       c = *buffer->cur++;
1980     }
1981   /* We only want non-vertical space, i.e. ' ' \t \f \v \0.  */
1982   while (is_nvspace (c));
1983
1984   if (saw_NUL)
1985     {
1986       encoding_rich_location rich_loc (pfile);
1987       cpp_error_at (pfile, CPP_DL_WARNING, &rich_loc,
1988                     "null character(s) ignored");
1989     }
1990
1991   buffer->cur--;
1992 }
1993
1994 /* See if the characters of a number token are valid in a name (no
1995    '.', '+' or '-').  */
1996 static int
1997 name_p (cpp_reader *pfile, const cpp_string *string)
1998 {
1999   unsigned int i;
2000
2001   for (i = 0; i < string->len; i++)
2002     if (!is_idchar (string->text[i]))
2003       return 0;
2004
2005   return 1;
2006 }
2007
2008 /* After parsing an identifier or other sequence, produce a warning about
2009    sequences not in NFC/NFKC.  */
2010 static void
2011 warn_about_normalization (cpp_reader *pfile,
2012                           const cpp_token *token,
2013                           const struct normalize_state *s,
2014                           bool identifier)
2015 {
2016   if (CPP_OPTION (pfile, warn_normalize) < NORMALIZE_STATE_RESULT (s)
2017       && !pfile->state.skipping)
2018     {
2019       location_t loc = token->src_loc;
2020
2021       /* If possible, create a location range for the token.  */
2022       if (loc >= RESERVED_LOCATION_COUNT
2023           && token->type != CPP_EOF
2024           /* There must be no line notes to process.  */
2025           && (!(pfile->buffer->cur
2026                 >= pfile->buffer->notes[pfile->buffer->cur_note].pos
2027                 && !pfile->overlaid_buffer)))
2028         {
2029           source_range tok_range;
2030           tok_range.m_start = loc;
2031           tok_range.m_finish
2032             = linemap_position_for_column (pfile->line_table,
2033                                            CPP_BUF_COLUMN (pfile->buffer,
2034                                                            pfile->buffer->cur));
2035           loc = COMBINE_LOCATION_DATA (pfile->line_table,
2036                                        loc, tok_range, NULL, 0);
2037         }
2038
2039       encoding_rich_location rich_loc (pfile, loc);
2040
2041       /* Make sure that the token is printed using UCNs, even
2042          if we'd otherwise happily print UTF-8.  */
2043       unsigned char *buf = XNEWVEC (unsigned char, cpp_token_len (token));
2044       size_t sz;
2045
2046       sz = cpp_spell_token (pfile, token, buf, false) - buf;
2047       if (NORMALIZE_STATE_RESULT (s) == normalized_C)
2048         cpp_warning_at (pfile, CPP_W_NORMALIZE, &rich_loc,
2049                         "`%.*s' is not in NFKC", (int) sz, buf);
2050       else if (identifier && CPP_OPTION (pfile, xid_identifiers))
2051         cpp_pedwarning_at (pfile, CPP_W_NORMALIZE, &rich_loc,
2052                                   "`%.*s' is not in NFC", (int) sz, buf);
2053       else
2054         cpp_warning_at (pfile, CPP_W_NORMALIZE, &rich_loc,
2055                         "`%.*s' is not in NFC", (int) sz, buf);
2056       free (buf);
2057     }
2058 }
2059
2060 /* Returns TRUE if the sequence starting at buffer->cur is valid in
2061    an identifier.  FIRST is TRUE if this starts an identifier.  */
2062
2063 static bool
2064 forms_identifier_p (cpp_reader *pfile, int first,
2065                     struct normalize_state *state)
2066 {
2067   cpp_buffer *buffer = pfile->buffer;
2068   const bool warn_bidi_p = pfile->warn_bidi_p ();
2069
2070   if (*buffer->cur == '$')
2071     {
2072       if (!CPP_OPTION (pfile, dollars_in_ident))
2073         return false;
2074
2075       buffer->cur++;
2076       if (CPP_OPTION (pfile, warn_dollars) && !pfile->state.skipping)
2077         {
2078           CPP_OPTION (pfile, warn_dollars) = 0;
2079           cpp_error (pfile, CPP_DL_PEDWARN, "'$' in identifier or number");
2080         }
2081
2082       return true;
2083     }
2084
2085   /* Is this a syntactically valid UCN or a valid UTF-8 char?  */
2086   if (CPP_OPTION (pfile, extended_identifiers))
2087     {
2088       cppchar_t s;
2089       if (*buffer->cur >= utf8_signifier)
2090         {
2091           if (__builtin_expect (*buffer->cur == bidi::utf8_start, 0)
2092               && warn_bidi_p)
2093             {
2094               location_t loc;
2095               bidi::kind kind = get_bidi_utf8 (pfile, buffer->cur, &loc);
2096               maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
2097             }
2098           if (_cpp_valid_utf8 (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
2099                                state, &s))
2100             return true;
2101         }
2102       else if (*buffer->cur == '\\'
2103                && (buffer->cur[1] == 'u'
2104                    || buffer->cur[1] == 'U'
2105                    || buffer->cur[1] == 'N'))
2106         {
2107           buffer->cur += 2;
2108           if (warn_bidi_p)
2109             {
2110               location_t loc;
2111               bidi::kind kind;
2112               if (buffer->cur[-1] == 'N')
2113                 kind = get_bidi_named (pfile, buffer->cur, &loc);
2114               else
2115                 kind = get_bidi_ucn (pfile, buffer->cur,
2116                                      buffer->cur[-1] == 'U', &loc);
2117               maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/true, loc);
2118             }
2119           if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
2120                               state, &s, NULL, NULL))
2121             return true;
2122           buffer->cur -= 2;
2123         }
2124     }
2125
2126   return false;
2127 }
2128
2129 /* Helper function to issue error about improper __VA_OPT__ use.  */
2130 static void
2131 maybe_va_opt_error (cpp_reader *pfile)
2132 {
2133   if (CPP_PEDANTIC (pfile) && !CPP_OPTION (pfile, va_opt))
2134     {
2135       /* __VA_OPT__ should not be accepted at all, but allow it in
2136          system headers.  */
2137       if (!_cpp_in_system_header (pfile))
2138         {
2139           if (CPP_OPTION (pfile, cplusplus))
2140             cpp_error (pfile, CPP_DL_PEDWARN,
2141                        "__VA_OPT__ is not available until C++20");
2142           else
2143             cpp_error (pfile, CPP_DL_PEDWARN,
2144                        "__VA_OPT__ is not available until C2X");
2145         }
2146     }
2147   else if (!pfile->state.va_args_ok)
2148     {
2149       /* __VA_OPT__ should only appear in the replacement list of a
2150          variadic macro.  */
2151       cpp_error (pfile, CPP_DL_PEDWARN,
2152                  "__VA_OPT__ can only appear in the expansion"
2153                  " of a C++20 variadic macro");
2154     }
2155 }
2156
2157 /* Helper function to get the cpp_hashnode of the identifier BASE.  */
2158 static cpp_hashnode *
2159 lex_identifier_intern (cpp_reader *pfile, const uchar *base)
2160 {
2161   cpp_hashnode *result;
2162   const uchar *cur;
2163   unsigned int len;
2164   unsigned int hash = HT_HASHSTEP (0, *base);
2165
2166   cur = base + 1;
2167   while (ISIDNUM (*cur))
2168     {
2169       hash = HT_HASHSTEP (hash, *cur);
2170       cur++;
2171     }
2172   len = cur - base;
2173   hash = HT_HASHFINISH (hash, len);
2174   result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
2175                                               base, len, hash, HT_ALLOC));
2176
2177   /* Rarely, identifiers require diagnostics when lexed.  */
2178   if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
2179                         && !pfile->state.skipping, 0))
2180     {
2181       /* It is allowed to poison the same identifier twice.  */
2182       if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
2183         cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
2184                    NODE_NAME (result));
2185
2186       /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
2187          replacement list of a variadic macro.  */
2188       if (result == pfile->spec_nodes.n__VA_ARGS__
2189           && !pfile->state.va_args_ok)
2190         {
2191           if (CPP_OPTION (pfile, cplusplus))
2192             cpp_error (pfile, CPP_DL_PEDWARN,
2193                        "__VA_ARGS__ can only appear in the expansion"
2194                        " of a C++11 variadic macro");
2195           else
2196             cpp_error (pfile, CPP_DL_PEDWARN,
2197                        "__VA_ARGS__ can only appear in the expansion"
2198                        " of a C99 variadic macro");
2199         }
2200
2201       if (result == pfile->spec_nodes.n__VA_OPT__)
2202         maybe_va_opt_error (pfile);
2203
2204       /* For -Wc++-compat, warn about use of C++ named operators.  */
2205       if (result->flags & NODE_WARN_OPERATOR)
2206         cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
2207                      "identifier \"%s\" is a special operator name in C++",
2208                      NODE_NAME (result));
2209     }
2210
2211   return result;
2212 }
2213
2214 /* Get the cpp_hashnode of an identifier specified by NAME in
2215    the current cpp_reader object.  If none is found, NULL is returned.  */
2216 cpp_hashnode *
2217 _cpp_lex_identifier (cpp_reader *pfile, const char *name)
2218 {
2219   cpp_hashnode *result;
2220   result = lex_identifier_intern (pfile, (uchar *) name);
2221   return result;
2222 }
2223
2224 /* Lex an identifier starting at BUFFER->CUR - 1.  */
2225 static cpp_hashnode *
2226 lex_identifier (cpp_reader *pfile, const uchar *base, bool starts_ucn,
2227                 struct normalize_state *nst, cpp_hashnode **spelling)
2228 {
2229   cpp_hashnode *result;
2230   const uchar *cur;
2231   unsigned int len;
2232   unsigned int hash = HT_HASHSTEP (0, *base);
2233   const bool warn_bidi_p = pfile->warn_bidi_p ();
2234
2235   cur = pfile->buffer->cur;
2236   if (! starts_ucn)
2237     {
2238       while (ISIDNUM (*cur))
2239         {
2240           hash = HT_HASHSTEP (hash, *cur);
2241           cur++;
2242         }
2243       NORMALIZE_STATE_UPDATE_IDNUM (nst, *(cur - 1));
2244     }
2245   pfile->buffer->cur = cur;
2246   if (starts_ucn || forms_identifier_p (pfile, false, nst))
2247     {
2248       /* Slower version for identifiers containing UCNs
2249          or extended chars (including $).  */
2250       do {
2251         while (ISIDNUM (*pfile->buffer->cur))
2252           {
2253             NORMALIZE_STATE_UPDATE_IDNUM (nst, *pfile->buffer->cur);
2254             pfile->buffer->cur++;
2255           }
2256       } while (forms_identifier_p (pfile, false, nst));
2257       if (warn_bidi_p)
2258         maybe_warn_bidi_on_close (pfile, pfile->buffer->cur);
2259       result = _cpp_interpret_identifier (pfile, base,
2260                                           pfile->buffer->cur - base);
2261       *spelling = cpp_lookup (pfile, base, pfile->buffer->cur - base);
2262     }
2263   else
2264     {
2265       len = cur - base;
2266       hash = HT_HASHFINISH (hash, len);
2267
2268       result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
2269                                                   base, len, hash, HT_ALLOC));
2270       *spelling = result;
2271     }
2272
2273   /* Rarely, identifiers require diagnostics when lexed.  */
2274   if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
2275                         && !pfile->state.skipping, 0))
2276     {
2277       /* It is allowed to poison the same identifier twice.  */
2278       if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
2279         cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
2280                    NODE_NAME (result));
2281
2282       /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
2283          replacement list of a variadic macro.  */
2284       if (result == pfile->spec_nodes.n__VA_ARGS__
2285           && !pfile->state.va_args_ok)
2286         {
2287           if (CPP_OPTION (pfile, cplusplus))
2288             cpp_error (pfile, CPP_DL_PEDWARN,
2289                        "__VA_ARGS__ can only appear in the expansion"
2290                        " of a C++11 variadic macro");
2291           else
2292             cpp_error (pfile, CPP_DL_PEDWARN,
2293                        "__VA_ARGS__ can only appear in the expansion"
2294                        " of a C99 variadic macro");
2295         }
2296
2297       /* __VA_OPT__ should only appear in the replacement list of a
2298          variadic macro.  */
2299       if (result == pfile->spec_nodes.n__VA_OPT__)
2300         maybe_va_opt_error (pfile);
2301
2302       /* For -Wc++-compat, warn about use of C++ named operators.  */
2303       if (result->flags & NODE_WARN_OPERATOR)
2304         cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
2305                      "identifier \"%s\" is a special operator name in C++",
2306                      NODE_NAME (result));
2307     }
2308
2309   return result;
2310 }
2311
2312 /* Lex a number to NUMBER starting at BUFFER->CUR - 1.  */
2313 static void
2314 lex_number (cpp_reader *pfile, cpp_string *number,
2315             struct normalize_state *nst)
2316 {
2317   const uchar *cur;
2318   const uchar *base;
2319   uchar *dest;
2320
2321   base = pfile->buffer->cur - 1;
2322   do
2323     {
2324       const uchar *adj_digit_sep = NULL;
2325       cur = pfile->buffer->cur;
2326
2327       /* N.B. ISIDNUM does not include $.  */
2328       while (ISIDNUM (*cur)
2329              || (*cur == '.' && !DIGIT_SEP (cur[-1]))
2330              || DIGIT_SEP (*cur)
2331              || (VALID_SIGN (*cur, cur[-1]) && !DIGIT_SEP (cur[-2])))
2332         {
2333           NORMALIZE_STATE_UPDATE_IDNUM (nst, *cur);
2334           /* Adjacent digit separators do not form part of the pp-number syntax.
2335              However, they can safely be diagnosed here as an error, since '' is
2336              not a valid preprocessing token.  */
2337           if (DIGIT_SEP (*cur) && DIGIT_SEP (cur[-1]) && !adj_digit_sep)
2338             adj_digit_sep = cur;
2339           cur++;
2340         }
2341       /* A number can't end with a digit separator.  */
2342       while (cur > pfile->buffer->cur && DIGIT_SEP (cur[-1]))
2343         --cur;
2344       if (adj_digit_sep && adj_digit_sep < cur)
2345         cpp_error (pfile, CPP_DL_ERROR, "adjacent digit separators");
2346
2347       pfile->buffer->cur = cur;
2348     }
2349   while (forms_identifier_p (pfile, false, nst));
2350
2351   number->len = cur - base;
2352   dest = _cpp_unaligned_alloc (pfile, number->len + 1);
2353   memcpy (dest, base, number->len);
2354   dest[number->len] = '\0';
2355   number->text = dest;
2356 }
2357
2358 /* Create a token of type TYPE with a literal spelling.  */
2359 static void
2360 create_literal (cpp_reader *pfile, cpp_token *token, const uchar *base,
2361                 unsigned int len, enum cpp_ttype type)
2362 {
2363   token->type = type;
2364   token->val.str.len = len;
2365   token->val.str.text = cpp_alloc_token_string (pfile, base, len);
2366 }
2367
2368 const uchar *
2369 cpp_alloc_token_string (cpp_reader *pfile,
2370                         const unsigned char *ptr, unsigned len)
2371 {
2372   uchar *dest = _cpp_unaligned_alloc (pfile, len + 1);
2373
2374   dest[len] = 0;
2375   memcpy (dest, ptr, len);
2376   return dest;
2377 }
2378
2379 /* A pair of raw buffer pointers.  The currently open one is [1], the
2380    first one is [0].  Used for string literal lexing.  */
2381 struct lit_accum {
2382   _cpp_buff *first;
2383   _cpp_buff *last;
2384   const uchar *rpos;
2385   size_t accum;
2386
2387   lit_accum ()
2388     : first (NULL), last (NULL), rpos (0), accum (0)
2389   {
2390   }
2391
2392   void append (cpp_reader *, const uchar *, size_t);
2393
2394   void read_begin (cpp_reader *);
2395   bool reading_p () const
2396   {
2397     return rpos != NULL;
2398   }
2399   char read_char ()
2400   {
2401     char c = *rpos++;
2402     if (rpos == BUFF_FRONT (last))
2403       rpos = NULL;
2404     return c;
2405   }
2406 };
2407
2408 /* Subroutine of lex_raw_string: Append LEN chars from BASE to the buffer
2409    sequence from *FIRST_BUFF_P to LAST_BUFF_P.  */
2410
2411 void
2412 lit_accum::append (cpp_reader *pfile, const uchar *base, size_t len)
2413 {
2414   if (!last)
2415     /* Starting.  */
2416     first = last = _cpp_get_buff (pfile, len);
2417   else if (len > BUFF_ROOM (last))
2418     {
2419       /* There is insufficient room in the buffer.  Copy what we can,
2420          and then either extend or create a new one.  */
2421       size_t room = BUFF_ROOM (last);
2422       memcpy (BUFF_FRONT (last), base, room);
2423       BUFF_FRONT (last) += room;
2424       base += room;
2425       len -= room;
2426       accum += room;
2427
2428       gcc_checking_assert (!rpos);
2429
2430       last = _cpp_append_extend_buff (pfile, last, len);
2431     }
2432
2433   memcpy (BUFF_FRONT (last), base, len);
2434   BUFF_FRONT (last) += len;
2435   accum += len;
2436 }
2437
2438 void
2439 lit_accum::read_begin (cpp_reader *pfile)
2440 {
2441   /* We never accumulate more than 4 chars to read.  */
2442   if (BUFF_ROOM (last) < 4)
2443
2444     last = _cpp_append_extend_buff (pfile, last, 4);
2445   rpos = BUFF_FRONT (last);
2446 }
2447
2448 /* Returns true if a macro has been defined.
2449    This might not work if compile with -save-temps,
2450    or preprocess separately from compilation.  */
2451
2452 static bool
2453 is_macro(cpp_reader *pfile, const uchar *base)
2454 {
2455   const uchar *cur = base;
2456   if (! ISIDST (*cur))
2457     return false;
2458   unsigned int hash = HT_HASHSTEP (0, *cur);
2459   ++cur;
2460   while (ISIDNUM (*cur))
2461     {
2462       hash = HT_HASHSTEP (hash, *cur);
2463       ++cur;
2464     }
2465   hash = HT_HASHFINISH (hash, cur - base);
2466
2467   cpp_hashnode *result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
2468                                         base, cur - base, hash, HT_NO_INSERT));
2469
2470   return result && cpp_macro_p (result);
2471 }
2472
2473 /* Returns true if a literal suffix does not have the expected form
2474    and is defined as a macro.  */
2475
2476 static bool
2477 is_macro_not_literal_suffix(cpp_reader *pfile, const uchar *base)
2478 {
2479   /* User-defined literals outside of namespace std must start with a single
2480      underscore, so assume anything of that form really is a UDL suffix.
2481      We don't need to worry about UDLs defined inside namespace std because
2482      their names are reserved, so cannot be used as macro names in valid
2483      programs.  */
2484   if (base[0] == '_' && base[1] != '_')
2485     return false;
2486   return is_macro (pfile, base);
2487 }
2488
2489 /* Lexes a raw string.  The stored string contains the spelling,
2490    including double quotes, delimiter string, '(' and ')', any leading
2491    'L', 'u', 'U' or 'u8' and 'R' modifier.  The created token contains
2492    the type of the literal, or CPP_OTHER if it was not properly
2493    terminated.
2494
2495    BASE is the start of the token.  Updates pfile->buffer->cur to just
2496    after the lexed string.
2497
2498    The spelling is NUL-terminated, but it is not guaranteed that this
2499    is the first NUL since embedded NULs are preserved.  */
2500
2501 static void
2502 lex_raw_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
2503 {
2504   const uchar *pos = base;
2505   const bool warn_bidi_p = pfile->warn_bidi_p ();
2506   const bool warn_invalid_utf8_p = CPP_OPTION (pfile, cpp_warn_invalid_utf8);
2507   const bool warn_bidi_or_invalid_utf8_p = warn_bidi_p | warn_invalid_utf8_p;
2508
2509   /* 'tis a pity this information isn't passed down from the lexer's
2510      initial categorization of the token.  */
2511   enum cpp_ttype type = CPP_STRING;
2512
2513   if (*pos == 'L')
2514     {
2515       type = CPP_WSTRING;
2516       pos++;
2517     }
2518   else if (*pos == 'U')
2519     {
2520       type = CPP_STRING32;
2521       pos++;
2522     }
2523   else if (*pos == 'u')
2524     {
2525       if (pos[1] == '8')
2526         {
2527           type = CPP_UTF8STRING;
2528           pos++;
2529         }
2530       else
2531         type = CPP_STRING16;
2532       pos++;
2533     }
2534
2535   gcc_checking_assert (pos[0] == 'R' && pos[1] == '"');
2536   pos += 2;
2537
2538   _cpp_line_note *note = &pfile->buffer->notes[pfile->buffer->cur_note];
2539
2540   /* Skip notes before the ".  */
2541   while (note->pos < pos)
2542     ++note;
2543
2544   lit_accum accum;
2545
2546   uchar prefix[17];
2547   unsigned prefix_len = 0;
2548   enum Phase
2549   {
2550    PHASE_PREFIX = -2,
2551    PHASE_NONE = -1,
2552    PHASE_SUFFIX = 0
2553   } phase = PHASE_PREFIX;
2554
2555   for (;;)
2556     {
2557       gcc_checking_assert (note->pos >= pos);
2558
2559       /* Undo any escaped newlines and trigraphs.  */
2560       if (!accum.reading_p () && note->pos == pos)
2561         switch (note->type)
2562           {
2563           case '\\':
2564           case ' ':
2565             /* Restore backslash followed by newline.  */
2566             accum.append (pfile, base, pos - base);
2567             base = pos;
2568             accum.read_begin (pfile);
2569             accum.append (pfile, UC"\\", 1);
2570
2571           after_backslash:
2572             if (note->type == ' ')
2573               /* GNU backslash whitespace newline extension.  FIXME
2574                  could be any sequence of non-vertical space.  When we
2575                  can properly restore any such sequence, we should
2576                  mark this note as handled so _cpp_process_line_notes
2577                  doesn't warn.  */
2578               accum.append (pfile, UC" ", 1);
2579
2580             accum.append (pfile, UC"\n", 1);
2581             note++;
2582             break;
2583
2584           case '\n':
2585             /* This can happen for ??/<NEWLINE> when trigraphs are not
2586                being interpretted.  */
2587             gcc_checking_assert (!CPP_OPTION (pfile, trigraphs));
2588             note->type = 0;
2589             note++;
2590             break;
2591
2592           default:
2593             gcc_checking_assert (_cpp_trigraph_map[note->type]);
2594
2595             /* Don't warn about this trigraph in
2596                _cpp_process_line_notes, since trigraphs show up as
2597                trigraphs in raw strings.  */
2598             uchar type = note->type;
2599             note->type = 0;
2600
2601             if (CPP_OPTION (pfile, trigraphs))
2602               {
2603                 accum.append (pfile, base, pos - base);
2604                 base = pos;
2605                 accum.read_begin (pfile);
2606                 accum.append (pfile, UC"??", 2);
2607                 accum.append (pfile, &type, 1);
2608
2609                 /* ??/ followed by newline gets two line notes, one for
2610                    the trigraph and one for the backslash/newline.  */
2611                 if (type == '/' && note[1].pos == pos)
2612                   {
2613                     note++;
2614                     gcc_assert (note->type == '\\' || note->type == ' ');
2615                     goto after_backslash;
2616                   }
2617                 /* Skip the replacement character.  */
2618                 base = ++pos;
2619               }
2620
2621             note++;
2622             break;
2623           }
2624
2625       /* Now get a char to process.  Either from an expanded note, or
2626          from the line buffer.  */
2627       bool read_note = accum.reading_p ();
2628       char c = read_note ? accum.read_char () : *pos++;
2629
2630       if (phase == PHASE_PREFIX)
2631         {
2632           if (c == '(')
2633             {
2634               /* Done.  */
2635               phase = PHASE_NONE;
2636               prefix[prefix_len++] = '"';
2637             }
2638           else if (prefix_len < 16
2639                    /* Prefix chars are any of the basic character set,
2640                       [lex.charset] except for '
2641                       ()\\\t\v\f\n'. Optimized for a contiguous
2642                       alphabet.  */
2643                    /* Unlike a switch, this collapses down to one or
2644                       two shift and bitmask operations on an ASCII
2645                       system, with an outlier or two.   */
2646                    && (('Z' - 'A' == 25
2647                         ? ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'))
2648                         : ISIDST (c))
2649                        || (c >= '0' && c <= '9')
2650                        || c == '_' || c == '{' || c == '}'
2651                        || c == '[' || c == ']' || c == '#'
2652                        || c == '<' || c == '>' || c == '%'
2653                        || c == ':' || c == ';' || c == '.' || c == '?'
2654                        || c == '*' || c == '+' || c == '-' || c == '/'
2655                        || c == '^' || c == '&' || c == '|' || c == '~'
2656                        || c == '!' || c == '=' || c == ','
2657                        || c == '"' || c == '\''))
2658             prefix[prefix_len++] = c;
2659           else
2660             {
2661               /* Something is wrong.  */
2662               int col = CPP_BUF_COLUMN (pfile->buffer, pos) + read_note;
2663               if (prefix_len == 16)
2664                 cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
2665                                      col, "raw string delimiter longer "
2666                                      "than 16 characters");
2667               else if (c == '\n')
2668                 cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
2669                                      col, "invalid new-line in raw "
2670                                      "string delimiter");
2671               else
2672                 cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
2673                                      col, "invalid character '%c' in "
2674                                      "raw string delimiter", c);
2675               type = CPP_OTHER;
2676               phase = PHASE_NONE;
2677               /* Continue until we get a close quote, that's probably
2678                  the best failure mode.  */
2679               prefix_len = 0;
2680             }
2681           if (c != '\n')
2682             continue;
2683         }
2684
2685       if (phase != PHASE_NONE)
2686         {
2687           if (prefix[phase] != c)
2688             phase = PHASE_NONE;
2689           else if (unsigned (phase + 1) == prefix_len)
2690             break;
2691           else
2692             {
2693               phase = Phase (phase + 1);
2694               continue;
2695             }
2696         }
2697
2698       if (!prefix_len && c == '"')
2699         /* Failure mode lexing.  */
2700         goto out;
2701       else if (prefix_len && c == ')')
2702         phase = PHASE_SUFFIX;
2703       else if (!read_note && c == '\n')
2704         {
2705           pos--;
2706           pfile->buffer->cur = pos;
2707           if ((pfile->state.in_directive || pfile->state.parsing_args)
2708               && pfile->buffer->next_line >= pfile->buffer->rlimit)
2709             {
2710               cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, 0,
2711                                    "unterminated raw string");
2712               type = CPP_OTHER;
2713               goto out;
2714             }
2715
2716           accum.append (pfile, base, pos - base + 1);
2717           _cpp_process_line_notes (pfile, false);
2718
2719           if (pfile->buffer->next_line < pfile->buffer->rlimit)
2720             CPP_INCREMENT_LINE (pfile, 0);
2721           pfile->buffer->need_line = true;
2722
2723           if (!get_fresh_line_impl<true> (pfile))
2724             {
2725               /* We ran out of file and failed to get a line.  */
2726               location_t src_loc = token->src_loc;
2727               token->type = CPP_EOF;
2728               /* Tell the compiler the line number of the EOF token.  */
2729               token->src_loc = pfile->line_table->highest_line;
2730               token->flags = BOL;
2731               if (accum.first)
2732                 _cpp_release_buff (pfile, accum.first);
2733               cpp_error_with_line (pfile, CPP_DL_ERROR, src_loc, 0,
2734                                    "unterminated raw string");
2735
2736               /* Now pop the buffer that get_fresh_line_impl() did not.  Popping
2737                  is not safe if processing a directive, however this cannot
2738                  happen as we already checked above that a line would be
2739                  available, and get_fresh_line_impl() can't fail in this
2740                  case.  */
2741               gcc_assert (!pfile->state.in_directive);
2742               _cpp_pop_buffer (pfile);
2743
2744               return;
2745             }
2746
2747           pos = base = pfile->buffer->cur;
2748           note = &pfile->buffer->notes[pfile->buffer->cur_note];
2749         }
2750       else if (__builtin_expect ((unsigned char) c >= utf8_continuation, 0)
2751                && warn_bidi_or_invalid_utf8_p)
2752         pos = _cpp_handle_multibyte_utf8 (pfile, c, pos, warn_bidi_p,
2753                                           warn_invalid_utf8_p);
2754     }
2755
2756   if (warn_bidi_p)
2757     maybe_warn_bidi_on_close (pfile, pos);
2758
2759   if (CPP_OPTION (pfile, user_literals))
2760     {
2761       /* If a string format macro, say from inttypes.h, is placed touching
2762          a string literal it could be parsed as a C++11 user-defined string
2763          literal thus breaking the program.  */
2764       if (is_macro_not_literal_suffix (pfile, pos))
2765         {
2766           /* Raise a warning, but do not consume subsequent tokens.  */
2767           if (CPP_OPTION (pfile, warn_literal_suffix) && !pfile->state.skipping)
2768             cpp_warning_with_line (pfile, CPP_W_LITERAL_SUFFIX,
2769                                    token->src_loc, 0,
2770                                    "invalid suffix on literal; C++11 requires "
2771                                    "a space between literal and string macro");
2772         }
2773       /* Grab user defined literal suffix.  */
2774       else if (ISIDST (*pos))
2775         {
2776           type = cpp_userdef_string_add_type (type);
2777           ++pos;
2778
2779           while (ISIDNUM (*pos))
2780             ++pos;
2781         }
2782     }
2783
2784  out:
2785   pfile->buffer->cur = pos;
2786   if (!accum.accum)
2787     create_literal (pfile, token, base, pos - base, type);
2788   else
2789     {
2790       size_t extra_len = pos - base;
2791       uchar *dest = _cpp_unaligned_alloc (pfile, accum.accum + extra_len + 1);
2792
2793       token->type = type;
2794       token->val.str.len = accum.accum + extra_len;
2795       token->val.str.text = dest;
2796       for (_cpp_buff *buf = accum.first; buf; buf = buf->next)
2797         {
2798           size_t len = BUFF_FRONT (buf) - buf->base;
2799           memcpy (dest, buf->base, len);
2800           dest += len;
2801         }
2802       _cpp_release_buff (pfile, accum.first);
2803       memcpy (dest, base, extra_len);
2804       dest[extra_len] = '\0';
2805     }
2806 }
2807
2808 /* Lexes a string, character constant, or angle-bracketed header file
2809    name.  The stored string contains the spelling, including opening
2810    quote and any leading 'L', 'u', 'U' or 'u8' and optional
2811    'R' modifier.  It returns the type of the literal, or CPP_OTHER
2812    if it was not properly terminated, or CPP_LESS for an unterminated
2813    header name which must be relexed as normal tokens.
2814
2815    The spelling is NUL-terminated, but it is not guaranteed that this
2816    is the first NUL since embedded NULs are preserved.  */
2817 static void
2818 lex_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
2819 {
2820   bool saw_NUL = false;
2821   const uchar *cur;
2822   cppchar_t terminator;
2823   enum cpp_ttype type;
2824
2825   cur = base;
2826   terminator = *cur++;
2827   if (terminator == 'L' || terminator == 'U')
2828     terminator = *cur++;
2829   else if (terminator == 'u')
2830     {
2831       terminator = *cur++;
2832       if (terminator == '8')
2833         terminator = *cur++;
2834     }
2835   if (terminator == 'R')
2836     {
2837       lex_raw_string (pfile, token, base);
2838       return;
2839     }
2840   if (terminator == '"')
2841     type = (*base == 'L' ? CPP_WSTRING :
2842             *base == 'U' ? CPP_STRING32 :
2843             *base == 'u' ? (base[1] == '8' ? CPP_UTF8STRING : CPP_STRING16)
2844                          : CPP_STRING);
2845   else if (terminator == '\'')
2846     type = (*base == 'L' ? CPP_WCHAR :
2847             *base == 'U' ? CPP_CHAR32 :
2848             *base == 'u' ? (base[1] == '8' ? CPP_UTF8CHAR : CPP_CHAR16)
2849                          : CPP_CHAR);
2850   else
2851     terminator = '>', type = CPP_HEADER_NAME;
2852
2853   const bool warn_bidi_p = pfile->warn_bidi_p ();
2854   const bool warn_invalid_utf8_p = CPP_OPTION (pfile, cpp_warn_invalid_utf8);
2855   const bool warn_bidi_or_invalid_utf8_p = warn_bidi_p | warn_invalid_utf8_p;
2856   for (;;)
2857     {
2858       cppchar_t c = *cur++;
2859
2860       /* In #include-style directives, terminators are not escapable.  */
2861       if (c == '\\' && !pfile->state.angled_headers && *cur != '\n')
2862         {
2863           if ((cur[0] == 'u' || cur[0] == 'U' || cur[0] == 'N') && warn_bidi_p)
2864             {
2865               location_t loc;
2866               bidi::kind kind;
2867               if (cur[0] == 'N')
2868                 kind = get_bidi_named (pfile, cur + 1, &loc);
2869               else
2870                 kind = get_bidi_ucn (pfile, cur + 1, cur[0] == 'U', &loc);
2871               maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/true, loc);
2872             }
2873           cur++;
2874         }
2875       else if (c == terminator)
2876         {
2877           if (warn_bidi_p)
2878             maybe_warn_bidi_on_close (pfile, cur - 1);
2879           break;
2880         }
2881       else if (c == '\n')
2882         {
2883           cur--;
2884           /* Unmatched quotes always yield undefined behavior, but
2885              greedy lexing means that what appears to be an unterminated
2886              header name may actually be a legitimate sequence of tokens.  */
2887           if (terminator == '>')
2888             {
2889               token->type = CPP_LESS;
2890               return;
2891             }
2892           type = CPP_OTHER;
2893           break;
2894         }
2895       else if (c == '\0')
2896         saw_NUL = true;
2897       else if (__builtin_expect (c >= utf8_continuation, 0)
2898                && warn_bidi_or_invalid_utf8_p)
2899         cur = _cpp_handle_multibyte_utf8 (pfile, c, cur, warn_bidi_p,
2900                                           warn_invalid_utf8_p);
2901     }
2902
2903   if (saw_NUL && !pfile->state.skipping)
2904     cpp_error (pfile, CPP_DL_WARNING,
2905                "null character(s) preserved in literal");
2906
2907   if (type == CPP_OTHER && CPP_OPTION (pfile, lang) != CLK_ASM)
2908     cpp_error (pfile, CPP_DL_PEDWARN, "missing terminating %c character",
2909                (int) terminator);
2910
2911   if (CPP_OPTION (pfile, user_literals))
2912     {
2913       /* If a string format macro, say from inttypes.h, is placed touching
2914          a string literal it could be parsed as a C++11 user-defined string
2915          literal thus breaking the program.  */
2916       if (is_macro_not_literal_suffix (pfile, cur))
2917         {
2918           /* Raise a warning, but do not consume subsequent tokens.  */
2919           if (CPP_OPTION (pfile, warn_literal_suffix) && !pfile->state.skipping)
2920             cpp_warning_with_line (pfile, CPP_W_LITERAL_SUFFIX,
2921                                    token->src_loc, 0,
2922                                    "invalid suffix on literal; C++11 requires "
2923                                    "a space between literal and string macro");
2924         }
2925       /* Grab user defined literal suffix.  */
2926       else if (ISIDST (*cur))
2927         {
2928           type = cpp_userdef_char_add_type (type);
2929           type = cpp_userdef_string_add_type (type);
2930           ++cur;
2931
2932           while (ISIDNUM (*cur))
2933             ++cur;
2934         }
2935     }
2936   else if (CPP_OPTION (pfile, cpp_warn_cxx11_compat)
2937            && is_macro (pfile, cur)
2938            && !pfile->state.skipping)
2939     cpp_warning_with_line (pfile, CPP_W_CXX11_COMPAT,
2940                            token->src_loc, 0, "C++11 requires a space "
2941                            "between string literal and macro");
2942
2943   pfile->buffer->cur = cur;
2944   create_literal (pfile, token, base, cur - base, type);
2945 }
2946
2947 /* Return the comment table. The client may not make any assumption
2948    about the ordering of the table.  */
2949 cpp_comment_table *
2950 cpp_get_comments (cpp_reader *pfile)
2951 {
2952   return &pfile->comments;
2953 }
2954
2955 /* Append a comment to the end of the comment table. */
2956 static void
2957 store_comment (cpp_reader *pfile, cpp_token *token)
2958 {
2959   int len;
2960
2961   if (pfile->comments.allocated == 0)
2962     {
2963       pfile->comments.allocated = 256;
2964       pfile->comments.entries = (cpp_comment *) xmalloc
2965         (pfile->comments.allocated * sizeof (cpp_comment));
2966     }
2967
2968   if (pfile->comments.count == pfile->comments.allocated)
2969     {
2970       pfile->comments.allocated *= 2;
2971       pfile->comments.entries = (cpp_comment *) xrealloc
2972         (pfile->comments.entries,
2973          pfile->comments.allocated * sizeof (cpp_comment));
2974     }
2975
2976   len = token->val.str.len;
2977
2978   /* Copy comment. Note, token may not be NULL terminated. */
2979   pfile->comments.entries[pfile->comments.count].comment =
2980     (char *) xmalloc (sizeof (char) * (len + 1));
2981   memcpy (pfile->comments.entries[pfile->comments.count].comment,
2982           token->val.str.text, len);
2983   pfile->comments.entries[pfile->comments.count].comment[len] = '\0';
2984
2985   /* Set source location. */
2986   pfile->comments.entries[pfile->comments.count].sloc = token->src_loc;
2987
2988   /* Increment the count of entries in the comment table. */
2989   pfile->comments.count++;
2990 }
2991
2992 /* The stored comment includes the comment start and any terminator.  */
2993 static void
2994 save_comment (cpp_reader *pfile, cpp_token *token, const unsigned char *from,
2995               cppchar_t type)
2996 {
2997   unsigned char *buffer;
2998   unsigned int len, clen, i;
2999
3000   len = pfile->buffer->cur - from + 1; /* + 1 for the initial '/'.  */
3001
3002   /* C++ comments probably (not definitely) have moved past a new
3003      line, which we don't want to save in the comment.  */
3004   if (is_vspace (pfile->buffer->cur[-1]))
3005     len--;
3006
3007   /* If we are currently in a directive or in argument parsing, then
3008      we need to store all C++ comments as C comments internally, and
3009      so we need to allocate a little extra space in that case.
3010
3011      Note that the only time we encounter a directive here is
3012      when we are saving comments in a "#define".  */
3013   clen = ((pfile->state.in_directive || pfile->state.parsing_args)
3014           && type == '/') ? len + 2 : len;
3015
3016   buffer = _cpp_unaligned_alloc (pfile, clen);
3017
3018   token->type = CPP_COMMENT;
3019   token->val.str.len = clen;
3020   token->val.str.text = buffer;
3021
3022   buffer[0] = '/';
3023   memcpy (buffer + 1, from, len - 1);
3024
3025   /* Finish conversion to a C comment, if necessary.  */
3026   if ((pfile->state.in_directive || pfile->state.parsing_args) && type == '/')
3027     {
3028       buffer[1] = '*';
3029       buffer[clen - 2] = '*';
3030       buffer[clen - 1] = '/';
3031       /* As there can be in a C++ comments illegal sequences for C comments
3032          we need to filter them out.  */
3033       for (i = 2; i < (clen - 2); i++)
3034         if (buffer[i] == '/' && (buffer[i - 1] == '*' || buffer[i + 1] == '*'))
3035           buffer[i] = '|';
3036     }
3037
3038   /* Finally store this comment for use by clients of libcpp. */
3039   store_comment (pfile, token);
3040 }
3041
/* Returns true if comment at COMMENT_START is a recognized FALLTHROUGH
   comment.

   *COMMENT_START is compared against '*' to tell a C block comment
   from a C++ line comment, and the comment contents are taken to
   begin at COMMENT_START + 1.  pfile->buffer->cur is the upper bound
   used by every length check below.

   What is accepted depends on the cpp_warn_implicit_fallthrough
   option:
     0 and any value not listed here: no comment is recognized;
     1: every comment is recognized;
     2: any comment containing a case-insensitive match of
        falls?[ \t-]*thr(u|ough);
     3 and 4: the whole comment must match one of the forms documented
        in front of the branches below; level 4 only takes the
        upper-case FALLTHRU/FALLTHROUGH spelling where level 3 takes
        the looser three-regex family.  */

static bool
fallthrough_comment_p (cpp_reader *pfile, const unsigned char *comment_start)
{
  /* FROM is the scanning cursor; it starts on the first character of
     the comment contents.  */
  const unsigned char *from = comment_start + 1;

  switch (CPP_OPTION (pfile, cpp_warn_implicit_fallthrough))
    {
      /* For both -Wimplicit-fallthrough=0 and -Wimplicit-fallthrough=5 we
         don't recognize any comments.  The latter only checks attributes,
         the former doesn't warn.  */
    case 0:
    default:
      return false;
      /* -Wimplicit-fallthrough=1 considers any comment, no matter what
         content it has.  */
    case 1:
      return true;
    case 2:
      /* -Wimplicit-fallthrough=2 looks for (case insensitive)
         .*falls?[ \t-]*thr(u|ough).* regex.  Only start positions
         with at least sizeof "fallthru" - 1 characters left before
         pfile->buffer->cur are tried.  */
      for (; (size_t) (pfile->buffer->cur - from) >= sizeof "fallthru" - 1;
           from++)
        {
          /* Is there anything like strpbrk with upper boundary, or
             memchr looking for 2 characters rather than just one?  */
          if (from[0] != 'f' && from[0] != 'F')
            continue;
          if (from[1] != 'a' && from[1] != 'A')
            continue;
          if (from[2] != 'l' && from[2] != 'L')
            continue;
          if (from[3] != 'l' && from[3] != 'L')
            continue;
          /* "fall" matched.  FROM is advanced in place, so if the rest
             fails to match, the scan resumes after the characters
             consumed here rather than at the next start position.  */
          from += sizeof "fall" - 1;
          if (from[0] == 's' || from[0] == 'S')
            from++;
          while (*from == ' ' || *from == '\t' || *from == '-')
            from++;
          if (from[0] != 't' && from[0] != 'T')
            continue;
          if (from[1] != 'h' && from[1] != 'H')
            continue;
          if (from[2] != 'r' && from[2] != 'R')
            continue;
          /* "thr" followed by 'u' completes the short spelling.  */
          if (from[3] == 'u' || from[3] == 'U')
            return true;
          /* Otherwise the long spelling "through" is required.  */
          if (from[3] != 'o' && from[3] != 'O')
            continue;
          if (from[4] != 'u' && from[4] != 'U')
            continue;
          if (from[5] != 'g' && from[5] != 'G')
            continue;
          if (from[6] != 'h' && from[6] != 'H')
            continue;
          return true;
        }
      return false;
      /* Levels 3 and 4 leave the switch and use the whole-comment
         matching below.  */
    case 3:
    case 4:
      break;
    }

  /* Whole comment contents:
     -fallthrough
     @fallthrough@
   */
  if (*from == '-' || *from == '@')
    {
      size_t len = sizeof "fallthrough" - 1;
      if ((size_t) (pfile->buffer->cur - from - 1) < len)
        return false;
      if (memcmp (from + 1, "fallthrough", len))
        return false;
      if (*from == '@')
        {
          if (from[len + 1] != '@')
            return false;
          /* Make LEN cover the closing '@' too.  */
          len++;
        }
      /* Skip the leading marker plus everything matched.  */
      from += 1 + len;
    }
  /* Whole comment contents (regex):
     lint -fallthrough[ \t]*
   */
  else if (*from == 'l')
    {
      size_t len = sizeof "int -fallthrough" - 1;
      if ((size_t) (pfile->buffer->cur - from - 1) < len)
        return false;
      if (memcmp (from + 1, "int -fallthrough", len))
        return false;
      from += 1 + len;
      while (*from == ' ' || *from == '\t')
        from++;
    }
  /* Whole comment contents (regex):
     [ \t]*FALLTHR(U|OUGH)[ \t]*
   */
  else if (CPP_OPTION (pfile, cpp_warn_implicit_fallthrough) == 4)
    {
      while (*from == ' ' || *from == '\t')
        from++;
      if ((size_t) (pfile->buffer->cur - from)  < sizeof "FALLTHRU" - 1)
        return false;
      if (memcmp (from, "FALLTHR", sizeof "FALLTHR" - 1))
        return false;
      from += sizeof "FALLTHR" - 1;
      if (*from == 'U')
        from++;
      else if ((size_t) (pfile->buffer->cur - from)  < sizeof "OUGH" - 1)
        return false;
      else if (memcmp (from, "OUGH", sizeof "OUGH" - 1))
        return false;
      else
        from += sizeof "OUGH" - 1;
      while (*from == ' ' || *from == '\t')
        from++;
    }
  /* Whole comment contents (regex):
     [ \t.!]*(ELSE,? |INTENTIONAL(LY)? )?FALL(S | |-)?THR(OUGH|U)[ \t.!]*(-[^\n\r]*)?
     [ \t.!]*(Else,? |Intentional(ly)? )?Fall((s | |-)[Tt]|t)hr(ough|u)[ \t.!]*(-[^\n\r]*)?
     [ \t.!]*([Ee]lse,? |[Ii]ntentional(ly)? )?fall(s | |-)?thr(ough|u)[ \t.!]*(-[^\n\r]*)?
   */
  else
    {
      while (*from == ' ' || *from == '\t' || *from == '.' || *from == '!')
        from++;
      /* F holds the first letter of the word being matched.  ALL_UPPER
         is set once a fully upper-case word has been seen; after that
         the remaining words must be upper-case as well.  */
      unsigned char f = *from;
      bool all_upper = false;
      /* Optional leading "else" with an optional comma.  */
      if (f == 'E' || f == 'e')
        {
          if ((size_t) (pfile->buffer->cur - from)
              < sizeof "else fallthru" - 1)
            return false;
          if (f == 'E' && memcmp (from + 1, "LSE", sizeof "LSE" - 1) == 0)
            all_upper = true;
          else if (memcmp (from + 1, "lse", sizeof "lse" - 1))
            return false;
          from += sizeof "else" - 1;
          if (*from == ',')
            from++;
          if (*from != ' ')
            return false;
          from++;
          /* Reject case mixes the regexes above do not allow:
             "ELSE fall..." and "else Fall...".  */
          if (all_upper && *from == 'f')
            return false;
          if (f == 'e' && *from == 'F')
            return false;
          f = *from;
        }
      /* Optional leading "intentional" or "intentionally".  */
      else if (f == 'I' || f == 'i')
        {
          if ((size_t) (pfile->buffer->cur - from)
              < sizeof "intentional fallthru" - 1)
            return false;
          if (f == 'I' && memcmp (from + 1, "NTENTIONAL",
                                  sizeof "NTENTIONAL" - 1) == 0)
            all_upper = true;
          else if (memcmp (from + 1, "ntentional",
                           sizeof "ntentional" - 1))
            return false;
          from += sizeof "intentional" - 1;
          if (*from == ' ')
            {
              from++;
              if (all_upper && *from == 'f')
                return false;
            }
          else if (all_upper)
            {
              /* "INTENTIONALLY " must be followed by an upper-case
                 'F'; only "LY " is consumed, so FROM ends up on that
                 'F'.  */
              if (memcmp (from, "LY F", sizeof "LY F" - 1))
                return false;
              from += sizeof "LY " - 1;
            }
          else
            {
              if (memcmp (from, "ly ", sizeof "ly " - 1))
                return false;
              from += sizeof "ly " - 1;
            }
          if (f == 'i' && *from == 'F')
            return false;
          f = *from;
        }
      /* Mandatory "fall", with its case constrained by what came
         before.  */
      if (f != 'F' && f != 'f')
        return false;
      if ((size_t) (pfile->buffer->cur - from) < sizeof "fallthru" - 1)
        return false;
      if (f == 'F' && memcmp (from + 1, "ALL", sizeof "ALL" - 1) == 0)
        all_upper = true;
      else if (all_upper)
        return false;
      else if (memcmp (from + 1, "all", sizeof "all" - 1))
        return false;
      from += sizeof "fall" - 1;
      /* Optional "s ", " " or "-" between "fall" and "thr...".  */
      if (*from == (all_upper ? 'S' : 's') && from[1] == ' ')
        from += 2;
      else if (*from == ' ' || *from == '-')
        from++;
      else if (*from != (all_upper ? 'T' : 't'))
        return false;
      /* *FROM must now be the first letter of "thr...".  A lower-case
         't' is accepted unless the phrase is all upper-case; an
         upper-case 'T' is accepted only when "fall" began with a
         capital 'F'.  */
      if ((f == 'f' || *from != 'T') && (all_upper || *from != 't'))
        return false;
      if ((size_t) (pfile->buffer->cur - from) < sizeof "thru" - 1)
        return false;
      /* Try the short "thru" first and fall back to "through".  */
      if (memcmp (from + 1, all_upper ? "HRU" : "hru", sizeof "hru" - 1))
        {
          if ((size_t) (pfile->buffer->cur - from) < sizeof "through" - 1)
            return false;
          if (memcmp (from + 1, all_upper ? "HROUGH" : "hrough",
                      sizeof "hrough" - 1))
            return false;
          from += sizeof "through" - 1;
        }
      else
        from += sizeof "thru" - 1;
      while (*from == ' ' || *from == '\t' || *from == '.' || *from == '!')
        from++;
      /* Optional trailing remark introduced by '-'.  */
      if (*from == '-')
        {
          from++;
          if (*comment_start == '*')
            {
              /* Block comment: stop at NUL, '\n' or '\r', or at a '*'
                 that is followed by '/'; any other '*' is skipped.  */
              do
                {
                  while (*from && *from != '*'
                         && *from != '\n' && *from != '\r')
                    from++;
                  if (*from != '*' || from[1] == '/')
                    break;
                  from++;
                }
              while (1);
            }
          else
            /* Line comment: stop at NUL, '\n' or '\r'.  */
            while (*from && *from != '\n' && *from != '\r')
              from++;
        }
    }
  /* The match only counts if it reached the end of the comment, that
     is, FROM now sits on the comment terminator.  */
  /* C block comment.  */
  if (*comment_start == '*')
    {
      if (*from != '*' || from[1] != '/')
        return false;
    }
  /* C++ line comment.  */
  else if (*from != '\n')
    return false;

  return true;
}
3296
3297 /* Allocate COUNT tokens for RUN.  */
3298 void
3299 _cpp_init_tokenrun (tokenrun *run, unsigned int count)
3300 {
3301   run->base = XNEWVEC (cpp_token, count);
3302   run->limit = run->base + count;
3303   run->next = NULL;
3304 }
3305
3306 /* Returns the next tokenrun, or creates one if there is none.  */
3307 static tokenrun *
3308 next_tokenrun (tokenrun *run)
3309 {
3310   if (run->next == NULL)
3311     {
3312       run->next = XNEW (tokenrun);
3313       run->next->prev = run;
3314       _cpp_init_tokenrun (run->next, 250);
3315     }
3316
3317   return run->next;
3318 }
3319
3320 /* Return the number of not yet processed token in a given
3321    context.  */
3322 int
3323 _cpp_remaining_tokens_num_in_context (cpp_context *context)
3324 {
3325   if (context->tokens_kind == TOKENS_KIND_DIRECT)
3326     return (LAST (context).token - FIRST (context).token);
3327   else if (context->tokens_kind == TOKENS_KIND_INDIRECT
3328            || context->tokens_kind == TOKENS_KIND_EXTENDED)
3329     return (LAST (context).ptoken - FIRST (context).ptoken);
3330   else
3331       abort ();
3332 }
3333
3334 /* Returns the token present at index INDEX in a given context.  If
3335    INDEX is zero, the next token to be processed is returned.  */
3336 static const cpp_token*
3337 _cpp_token_from_context_at (cpp_context *context, int index)
3338 {
3339   if (context->tokens_kind == TOKENS_KIND_DIRECT)
3340     return &(FIRST (context).token[index]);
3341   else if (context->tokens_kind == TOKENS_KIND_INDIRECT
3342            || context->tokens_kind == TOKENS_KIND_EXTENDED)
3343     return FIRST (context).ptoken[index];
3344  else
3345    abort ();
3346 }
3347
3348 /* Look ahead in the input stream.  */
3349 const cpp_token *
3350 cpp_peek_token (cpp_reader *pfile, int index)
3351 {
3352   cpp_context *context = pfile->context;
3353   const cpp_token *peektok;
3354   int count;
3355
3356   /* First, scan through any pending cpp_context objects.  */
3357   while (context->prev)
3358     {
3359       ptrdiff_t sz = _cpp_remaining_tokens_num_in_context (context);
3360
3361       if (index < (int) sz)
3362         return _cpp_token_from_context_at (context, index);
3363       index -= (int) sz;
3364       context = context->prev;
3365     }
3366
3367   /* We will have to read some new tokens after all (and do so
3368      without invalidating preceding tokens).  */
3369   count = index;
3370   pfile->keep_tokens++;
3371
3372   /* For peeked tokens temporarily disable line_change reporting,
3373      until the tokens are parsed for real.  */
3374   void (*line_change) (cpp_reader *, const cpp_token *, int)
3375     = pfile->cb.line_change;
3376   pfile->cb.line_change = NULL;
3377
3378   do
3379     {
3380       peektok = _cpp_lex_token (pfile);
3381       if (peektok->type == CPP_EOF)
3382         {
3383           index--;
3384           break;
3385         }
3386       else if (peektok->type == CPP_PRAGMA)
3387         {
3388           /* Don't peek past a pragma.  */
3389           if (peektok == &pfile->directive_result)
3390             /* Save the pragma in the buffer.  */
3391             *pfile->cur_token++ = *peektok;
3392           index--;
3393           break;
3394         }
3395     }
3396   while (index--);
3397
3398   _cpp_backup_tokens_direct (pfile, count - index);
3399   pfile->keep_tokens--;
3400   pfile->cb.line_change = line_change;
3401
3402   return peektok;
3403 }
3404
3405 /* Allocate a single token that is invalidated at the same time as the
3406    rest of the tokens on the line.  Has its line and col set to the
3407    same as the last lexed token, so that diagnostics appear in the
3408    right place.  */
3409 cpp_token *
3410 _cpp_temp_token (cpp_reader *pfile)
3411 {
3412   cpp_token *old, *result;
3413   ptrdiff_t sz = pfile->cur_run->limit - pfile->cur_token;
3414   ptrdiff_t la = (ptrdiff_t) pfile->lookaheads;
3415
3416   old = pfile->cur_token - 1;
3417   /* Any pre-existing lookaheads must not be clobbered.  */
3418   if (la)
3419     {
3420       if (sz <= la)
3421         {
3422           tokenrun *next = next_tokenrun (pfile->cur_run);
3423
3424           if (sz < la)
3425             memmove (next->base + 1, next->base,
3426                      (la - sz) * sizeof (cpp_token));
3427
3428           next->base[0] = pfile->cur_run->limit[-1];
3429         }
3430
3431       if (sz > 1)
3432         memmove (pfile->cur_token + 1, pfile->cur_token,
3433                  MIN (la, sz - 1) * sizeof (cpp_token));
3434     }
3435
3436   if (!sz && pfile->cur_token == pfile->cur_run->limit)
3437     {
3438       pfile->cur_run = next_tokenrun (pfile->cur_run);
3439       pfile->cur_token = pfile->cur_run->base;
3440     }
3441
3442   result = pfile->cur_token++;
3443   result->src_loc = old->src_loc;
3444   return result;
3445 }
3446
3447 /* We're at the beginning of a logical line (so not in
3448   directives-mode) and RESULT is a CPP_NAME with NODE_MODULE set.  See
3449   if we should enter deferred_pragma mode to tokenize the rest of the
3450   line as a module control-line.  Up to two more tokens are peeked via
3451   _cpp_lex_direct and pushed back on exit (a final CPP_PRAGMA_EOL is dropped).  */
3452 static void
3453 cpp_maybe_module_directive (cpp_reader *pfile, cpp_token *result)
3454 {
3455   unsigned backup = 0; /* Tokens we peeked.  */
3456   cpp_hashnode *node = result->val.node.node;
3457   cpp_token *peek = result;
3458   cpp_token *keyword = peek;
3459   cpp_hashnode *(&n_modules)[spec_nodes::M_HWM][2] = pfile->spec_nodes.n_modules;
3460   int header_count = 0; /* Nonzero only after import or __import.  */
3461
3462   /* Make sure the incoming state is as we expect it.  This way we
3463      can restore it using constants.  */
3464   gcc_checking_assert (!pfile->state.in_deferred_pragma
3465                        && !pfile->state.skipping
3466                        && !pfile->state.parsing_args
3467                        && !pfile->state.angled_headers
3468                        && (pfile->state.save_comments
3469                            == !CPP_OPTION (pfile, discard_comments)));
3470
3471   /* Enter directives mode sufficiently for peeking.  We don't have
3472      to actually set in_directive.  */
3473   pfile->state.in_deferred_pragma = true;
3474
3475   /* These two fields are needed to process tokenization in deferred
3476      pragma mode.  They are not used outside deferred pragma mode or
3477      directives mode.  */
3478   pfile->state.pragma_allow_expansion = true;
3479   pfile->directive_line = result->src_loc;
3480
3481   /* Saving comments is incompatible with directives mode.   */
3482   pfile->state.save_comments = 0;
3483
3484   if (node == n_modules[spec_nodes::M_EXPORT][0])
3485     {
3486       peek = _cpp_lex_direct (pfile);
3487       keyword = peek;
3488       backup++;
3489       if (keyword->type != CPP_NAME)
3490         goto not_module;
3491       node = keyword->val.node.node;
3492       if (!(node->flags & NODE_MODULE))
3493         goto not_module;
3494     }
3495
3496   if (node == n_modules[spec_nodes::M__IMPORT][0])
3497     /* __import.  NOTE(review): the meaning of the 2 and 16 terms is set by
3498        whatever consumes directive_file_token -- confirm there.  */
3499     header_count = backup + 2 + 16;
3500   else if (node == n_modules[spec_nodes::M_IMPORT][0])
3501     /* import  */
3502     header_count = backup + 2 + (CPP_OPTION (pfile, preprocessed) ? 16 : 0);
3503   else if (node == n_modules[spec_nodes::M_MODULE][0])
3504     ; /* module  */
3505   else
3506     goto not_module;
3507
3508   /* We've seen [export] {module|import|__import}.  Check the next token.  */
3509   if (header_count)
3510     /* After '{,__}import' a header name may appear.  */
3511     pfile->state.angled_headers = true;
3512   peek = _cpp_lex_direct (pfile);
3513   backup++;
3514
3515   /* ... import followed by identifier, ':', '<' or
3516      header-name preprocessing tokens, or module
3517      followed by cpp-identifier, ':' or ';' preprocessing
3518      tokens.  C++ keywords are not yet relevant.  */
3519   if (peek->type == CPP_NAME
3520       || peek->type == CPP_COLON
3521       ||  (header_count
3522            ? (peek->type == CPP_LESS
3523               || (peek->type == CPP_STRING && peek->val.str.text[0] != 'R')
3524               || peek->type == CPP_HEADER_NAME)
3525            : peek->type == CPP_SEMICOLON))
3526     {
3527       pfile->state.pragma_allow_expansion = !CPP_OPTION (pfile, preprocessed);
3528       if (!pfile->state.pragma_allow_expansion)
3529         pfile->state.prevent_expansion++;
3530
3531       if (!header_count && linemap_included_from
3532           (LINEMAPS_LAST_ORDINARY_MAP (pfile->line_table)))
3533         cpp_error_with_line (pfile, CPP_DL_ERROR, keyword->src_loc, 0,
3534                              "module control-line cannot be in included file");
3535
3536       /* The first one or two tokens cannot be macro names.  */
3537       for (int ix = backup; ix--;)
3538         {
3539           cpp_token *tok = ix ? keyword : result;
3540           cpp_hashnode *node = tok->val.node.node;
3541
3542           /* Don't attempt to expand the token.  */
3543           tok->flags |= NO_EXPAND;
3544           if (_cpp_defined_macro_p (node)
3545               && _cpp_maybe_notify_macro_use (pfile, node, tok->src_loc)
3546               && !cpp_fun_like_macro_p (node))
3547             cpp_error_with_line (pfile, CPP_DL_ERROR, tok->src_loc, 0,
3548                                  "module control-line \"%s\" cannot be"
3549                                  " an object-like macro",
3550                                  NODE_NAME (node));
3551         }
3552
3553       /* Map to underbar variants.  */
3554       keyword->val.node.node = n_modules[header_count
3555                                          ? spec_nodes::M_IMPORT
3556                                          : spec_nodes::M_MODULE][1];
3557       if (backup != 1)
3558         result->val.node.node = n_modules[spec_nodes::M_EXPORT][1];
3559
3560       /* Maybe tell the tokenizer we expect a header-name down the
3561          road.  */
3562       pfile->state.directive_file_token = header_count;
3563     }
3564   else
3565     {
3566     not_module:
3567       /* Drop out of directive mode.  */
3568       /* We asserted save_comments had this value upon entry.  */
3569       pfile->state.save_comments
3570         = !CPP_OPTION (pfile, discard_comments);
3571       pfile->state.in_deferred_pragma = false;
3572       /* Do not let this remain on.  */
3573       pfile->state.angled_headers = false;
3574     }
3575
3576   /* In either case we want to backup the peeked tokens.  */
3577   if (backup)
3578     {
3579       /* If we saw EOL, we should drop it, because this isn't a module
3580          control-line after all.  */
3581       bool eol = peek->type == CPP_PRAGMA_EOL;
3582       if (!eol || backup > 1)
3583         {
3584           /* Put the peeked tokens back.  */
3585           _cpp_backup_tokens_direct (pfile, backup);
3586           /* But if the last one was an EOL, forget it.  */
3587           if (eol)
3588             pfile->lookaheads--;
3589         }
3590     }
3591 }
3591
3592 /* Lex a token and return a pointer to it (external interface).  Takes
3593    care of issues like directive handling, token lookahead, multiple include
3594    optimization and skipping (while skipping, only CPP_EOF is returned).  */
3595 const cpp_token *
3596 _cpp_lex_token (cpp_reader *pfile)
3597 {
3598   cpp_token *result;
3599
3600   for (;;)
3601     {
3602       if (pfile->cur_token == pfile->cur_run->limit)
3603         {
3604           pfile->cur_run = next_tokenrun (pfile->cur_run);
3605           pfile->cur_token = pfile->cur_run->base;
3606         }
3607       /* We assume that the current token is somewhere in the current
3608          run.  */
3609       if (pfile->cur_token < pfile->cur_run->base
3610           || pfile->cur_token >= pfile->cur_run->limit)
3611         abort ();
3612
3613       if (pfile->lookaheads)
3614         {
3615           /* Re-use a token that was lexed earlier and backed up.  */
3616           pfile->lookaheads--;
3617           result = pfile->cur_token++;
3618         }
3619       else
3620         result = _cpp_lex_direct (pfile);
3621
3622       if (result->flags & BOL)
3623         {
3624           /* Is this a directive?  If _cpp_handle_directive returns
3625              false, it is an assembler #.  */
3626           if (result->type == CPP_HASH
3627               /* 6.10.3 p 11: Directives in a list of macro arguments
3628                  gives undefined behavior.  This implementation
3629                  handles the directive as normal.  */
3630               && pfile->state.parsing_args != 1)
3631             {
3632               if (_cpp_handle_directive (pfile, result->flags & PREV_WHITE))
3633                 {
3634                   /* A CPP_PADDING result means there is nothing to return;
3635                      lex the next token instead.  */
3636                   if (pfile->directive_result.type == CPP_PADDING)
3637                     continue;
3638                   result = &pfile->directive_result;
3639                 }
3640             }
3641           else if (pfile->state.in_deferred_pragma)
3642             result = &pfile->directive_result;
3643           else if (result->type == CPP_NAME
3644                    && (result->val.node.node->flags & NODE_MODULE)
3645                    && !pfile->state.skipping
3646                    /* Unlike regular directives, we do not deal with
3647                       tokenizing module directives as macro arguments.
3648                       That's not permitted.  */
3649                    && !pfile->state.parsing_args)
3650             {
3651               /* P1857.  Before macro expansion, At start of logical
3652                  line ... */
3653               /* We don't have to consider lookaheads at this point.  */
3654               gcc_checking_assert (!pfile->lookaheads);
3655
3656               cpp_maybe_module_directive (pfile, result);
3657             }
3658
3659           if (pfile->cb.line_change && !pfile->state.skipping)
3660             pfile->cb.line_change (pfile, result, pfile->state.parsing_args);
3661         }
3662
3663       /* We don't skip tokens in directives.  */
3664       if (pfile->state.in_directive || pfile->state.in_deferred_pragma)
3665         break;
3666
3667       /* Outside a directive, invalidate controlling macros.  At file
3668          EOF, _cpp_lex_direct takes care of popping the buffer, so we never
3669          get here and MI optimization works.  */
3670       pfile->mi_valid = false;
3671
3672       if (!pfile->state.skipping || result->type == CPP_EOF)
3673         break;
3674     }
3675
3676   return result;
3677 }
3675
3676 /* Returns true if a fresh line has been loaded, or if the current buffer did not need one.  */
3677 template <bool lexing_raw_string>
3678 static bool
3679 get_fresh_line_impl (cpp_reader *pfile)
3680 {
3681   /* We can't get a new line until we leave the current directive, unless we
3682      are lexing a raw string, in which case it will be OK as long as we don't
3683      pop the current buffer.  */
3684   if (!lexing_raw_string && pfile->state.in_directive)
3685     return false;
3686
3687   for (;;)
3688     {
3689       cpp_buffer *buffer = pfile->buffer;
3690
3691       if (!buffer->need_line)
3692         return true;
3693
3694       if (buffer->next_line < buffer->rlimit)
3695         {
3696           _cpp_clean_line (pfile);
3697           return true;
3698         }
3699
3700       /* We can't change buffers until we leave the current directive.  */
3701       if (lexing_raw_string && pfile->state.in_directive)
3702         return false;
3703
3704       /* First, get out of parsing arguments state.  */
3705       if (pfile->state.parsing_args)
3706         return false;
3707
3708       /* End of buffer.  Non-empty files should end in a newline.  */
3709       if (buffer->buf != buffer->rlimit
3710           && buffer->next_line > buffer->rlimit
3711           && !buffer->from_stage3)
3712         {
3713           /* Clip to buffer size.  */
3714           buffer->next_line = buffer->rlimit;
3715         }
3716       /* Pop a finished buffer and retry with whatever pfile->buffer is then.  */
3717       if (buffer->prev && !buffer->return_at_eof)
3718         _cpp_pop_buffer (pfile);
3719       else
3720         {
3721           /* End of translation.  Do not pop the buffer yet. Increment
3722              line number so that the EOF token is on a line of its own
3723              (_cpp_lex_direct doesn't increment in that case, because
3724              it's hard for it to distinguish this special case). */
3725           CPP_INCREMENT_LINE (pfile, 0);
3726           return false;
3727         }
3728     }
3729 }
3730 /* As get_fresh_line_impl, instantiated for the case of not lexing a raw string.  */
3731 bool
3732 _cpp_get_fresh_line (cpp_reader *pfile)
3733 {
3734   return get_fresh_line_impl<false> (pfile);
3735 }
3736
3737 /* Set result->type to ELSE_TYPE, or to THEN_TYPE (consuming CHAR) if *buffer->cur is CHAR.  */
3738 #define IF_NEXT_IS(CHAR, THEN_TYPE, ELSE_TYPE)          \
3739   do                                                    \
3740     {                                                   \
3741       result->type = ELSE_TYPE;                         \
3742       if (*buffer->cur == CHAR)                         \
3743         buffer->cur++, result->type = THEN_TYPE;        \
3744     }                                                   \
3745   while (0)
3746
3747 /* Lex a token into pfile->cur_token, which is also incremented, to
3748    get diagnostics pointing to the correct location.
3749
3750    Does not handle issues such as token lookahead, multiple-include
3751    optimization, directives, skipping etc.  This function is only
3752    suitable for use by _cpp_lex_token, and in special cases like
3753    lex_expansion_token which doesn't care for any of these issues.
3754
3755    When meeting a newline, returns CPP_EOF if parsing a directive,
3756    otherwise returns to the start of the token buffer if permissible.
3757    Returns a pointer to the lexed token.  */
3758 cpp_token *
3759 _cpp_lex_direct (cpp_reader *pfile)
3760 {
3761   cppchar_t c;
3762   cpp_buffer *buffer;
3763   const unsigned char *comment_start;
3764   bool fallthrough_comment = false;
3765   cpp_token *result = pfile->cur_token++;
3766
3767  fresh_line: /* Entered initially and again after each consumed newline.  */
3768   result->flags = 0;
3769   buffer = pfile->buffer;
3770   if (buffer->need_line)
3771     {
3772       if (pfile->state.in_deferred_pragma)
3773         {
3774           /* This can happen in cases like:
3775              #define loop(x) whatever
3776              #pragma omp loop
3777              where when trying to expand loop we need to peek
3778              next token after loop, but aren't still in_deferred_pragma
3779              mode but are in in_directive mode, so buffer->need_line
3780              is set, a CPP_EOF is peeked.  */
3781           result->type = CPP_PRAGMA_EOL;
3782           pfile->state.in_deferred_pragma = false;
3783           if (!pfile->state.pragma_allow_expansion)
3784             pfile->state.prevent_expansion--;
3785           return result;
3786         }
3787       if (!_cpp_get_fresh_line (pfile))
3788         {
3789           result->type = CPP_EOF;
3790           /* Not a real EOF in a directive or arg parsing -- we refuse
3791              to advance to the next file now, and will once we're out
3792              of those modes.  */
3793           if (!pfile->state.in_directive && !pfile->state.parsing_args)
3794             {
3795               /* Tell the compiler the line number of the EOF token.  */
3796               result->src_loc = pfile->line_table->highest_line;
3797               result->flags = BOL;
3798               /* Now pop the buffer that _cpp_get_fresh_line did not.  */
3799               _cpp_pop_buffer (pfile);
3800             }
3801           return result;
3802         }
3803       if (buffer != pfile->buffer)
3804         fallthrough_comment = false;
3805       if (!pfile->keep_tokens)
3806         {
3807           pfile->cur_run = &pfile->base_run;
3808           result = pfile->base_run.base;
3809           pfile->cur_token = result + 1;
3810         }
3811       result->flags = BOL;
3812       if (pfile->state.parsing_args == 2)
3813         result->flags |= PREV_WHITE;
3814     }
3815   buffer = pfile->buffer;
3816  update_tokens_line: /* Also re-entered after a comment that is not saved.  */
3817   result->src_loc = pfile->line_table->highest_line;
3818
3819  skipped_white: /* Also re-entered after a run of whitespace.  */
3820   if (buffer->cur >= buffer->notes[buffer->cur_note].pos
3821       && !pfile->overlaid_buffer)
3822     {
3823       _cpp_process_line_notes (pfile, false);
3824       result->src_loc = pfile->line_table->highest_line;
3825     }
3826   c = *buffer->cur++;
3827
3828   if (pfile->forced_token_location)
3829     result->src_loc = pfile->forced_token_location;
3830   else
3831     result->src_loc = linemap_position_for_column (pfile->line_table,
3832                                           CPP_BUF_COLUMN (buffer, buffer->cur));
3833
3834   switch (c)
3835     {
3836     case ' ': case '\t': case '\f': case '\v': case '\0':
3837       result->flags |= PREV_WHITE;
3838       skip_whitespace (pfile, c);
3839       goto skipped_white;
3840
3841     case '\n':
3842       /* Increment the line, unless this is the last line ...  */
3843       if (buffer->cur < buffer->rlimit
3844           /* ... or this is a #include, (where _cpp_stack_file needs to
3845              unwind by one line) ...  */
3846           || (pfile->state.in_directive > 1
3847               /* ... except traditional-cpp increments this elsewhere.  */
3848               && !CPP_OPTION (pfile, traditional)))
3849         CPP_INCREMENT_LINE (pfile, 0);
3850       buffer->need_line = true;
3851       if (pfile->state.in_deferred_pragma)
3852         {
3853           /* Produce the PRAGMA_EOL on this line.  File reading
3854              ensures there is always a \n at end of the buffer, thus
3855              in a deferred pragma we always see CPP_PRAGMA_EOL before
3856              any CPP_EOF.  */
3857           result->type = CPP_PRAGMA_EOL;
3858           result->flags &= ~PREV_WHITE;
3859           pfile->state.in_deferred_pragma = false;
3860           if (!pfile->state.pragma_allow_expansion)
3861             pfile->state.prevent_expansion--;
3862           return result;
3863         }
3864       goto fresh_line;
3865
3866     case '0': case '1': case '2': case '3': case '4':
3867     case '5': case '6': case '7': case '8': case '9':
3868       {
3869         struct normalize_state nst = INITIAL_NORMALIZE_STATE;
3870         result->type = CPP_NUMBER;
3871         lex_number (pfile, &result->val.str, &nst);
3872         warn_about_normalization (pfile, result, &nst, false);
3873         break;
3874       }
3875
3876     case 'L':
3877     case 'u':
3878     case 'U':
3879     case 'R':
3880       /* 'L', 'u', 'U', 'u8' or 'R' may introduce wide characters,
3881          wide strings or raw strings.  */
3882       if (c == 'L' || CPP_OPTION (pfile, rliterals)
3883           || (c != 'R' && CPP_OPTION (pfile, uliterals)))
3884         {
3885           if ((*buffer->cur == '\'' && c != 'R')
3886               || *buffer->cur == '"'
3887               || (*buffer->cur == 'R'
3888                   && c != 'R'
3889                   && buffer->cur[1] == '"'
3890                   && CPP_OPTION (pfile, rliterals))
3891               || (*buffer->cur == '8'
3892                   && c == 'u'
3893                   && ((buffer->cur[1] == '"' || (buffer->cur[1] == '\''
3894                                 && CPP_OPTION (pfile, utf8_char_literals)))
3895                       || (buffer->cur[1] == 'R' && buffer->cur[2] == '"'
3896                           && CPP_OPTION (pfile, rliterals)))))
3897             {
3898               lex_string (pfile, result, buffer->cur - 1);
3899               break;
3900             }
3901         }
3902       /* Fall through.  */
3903
3904     case '_':
3905     case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
3906     case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
3907     case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
3908     case 's': case 't':           case 'v': case 'w': case 'x':
3909     case 'y': case 'z':
3910     case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
3911     case 'G': case 'H': case 'I': case 'J': case 'K':
3912     case 'M': case 'N': case 'O': case 'P': case 'Q':
3913     case 'S': case 'T':           case 'V': case 'W': case 'X':
3914     case 'Y': case 'Z':
3915       result->type = CPP_NAME;
3916       {
3917         struct normalize_state nst = INITIAL_NORMALIZE_STATE;
3918         result->val.node.node = lex_identifier (pfile, buffer->cur - 1, false,
3919                                                 &nst,
3920                                                 &result->val.node.spelling);
3921         warn_about_normalization (pfile, result, &nst, true);
3922       }
3923
3924       /* Convert named operators to their proper types.  */
3925       if (result->val.node.node->flags & NODE_OPERATOR)
3926         {
3927           result->flags |= NAMED_OP;
3928           result->type = (enum cpp_ttype) result->val.node.node->directive_index;
3929         }
3930
3931       /* Signal FALLTHROUGH comment followed by another token.  */
3932       if (fallthrough_comment)
3933         result->flags |= PREV_FALLTHROUGH;
3934       break;
3935
3936     case '\'':
3937     case '"':
3938       lex_string (pfile, result, buffer->cur - 1);
3939       break;
3940
3941     case '/':
3942       /* A potential block or line comment.  */
3943       comment_start = buffer->cur;
3944       c = *buffer->cur;
3945
3946       if (c == '*')
3947         {
3948           if (_cpp_skip_block_comment (pfile))
3949             cpp_error (pfile, CPP_DL_ERROR, "unterminated comment");
3950         }
3951       else if (c == '/' && ! CPP_OPTION (pfile, traditional))
3952         {
3953           /* Don't warn for system headers.  */
3954           if (_cpp_in_system_header (pfile))
3955             ;
3956           /* Warn about comments if pedantically GNUC89, and not
3957              in system headers.  */
3958           else if (CPP_OPTION (pfile, lang) == CLK_GNUC89
3959                    && CPP_PEDANTIC (pfile)
3960                    && ! buffer->warned_cplusplus_comments)
3961             {
3962               if (cpp_error (pfile, CPP_DL_PEDWARN,
3963                              "C++ style comments are not allowed in ISO C90"))
3964                 cpp_error (pfile, CPP_DL_NOTE,
3965                            "(this will be reported only once per input file)");
3966               buffer->warned_cplusplus_comments = 1;
3967             }
3968           /* Or if specifically desired via -Wc90-c99-compat.  */
3969           else if (CPP_OPTION (pfile, cpp_warn_c90_c99_compat) > 0
3970                    && ! CPP_OPTION (pfile, cplusplus)
3971                    && ! buffer->warned_cplusplus_comments)
3972             {
3973               if (cpp_error (pfile, CPP_DL_WARNING,
3974                              "C++ style comments are incompatible with C90"))
3975                 cpp_error (pfile, CPP_DL_NOTE,
3976                            "(this will be reported only once per input file)");
3977               buffer->warned_cplusplus_comments = 1;
3978             }
3979           /* In C89/C94, C++ style comments are forbidden.  */
3980           else if ((CPP_OPTION (pfile, lang) == CLK_STDC89
3981                     || CPP_OPTION (pfile, lang) == CLK_STDC94))
3982             {
3983               /* But don't be confused about valid code such as
3984                  - // immediately followed by *,
3985                  - // in a preprocessing directive,
3986                  - // in an #if 0 block.  */
3987               if (buffer->cur[1] == '*'
3988                   || pfile->state.in_directive
3989                   || pfile->state.skipping)
3990                 {
3991                   result->type = CPP_DIV;
3992                   break;
3993                 }
3994               else if (! buffer->warned_cplusplus_comments)
3995                 {
3996                   if (cpp_error (pfile, CPP_DL_ERROR,
3997                                  "C++ style comments are not allowed in "
3998                                  "ISO C90"))
3999                     cpp_error (pfile, CPP_DL_NOTE,
4000                                "(this will be reported only once per input "
4001                                "file)");
4002                   buffer->warned_cplusplus_comments = 1;
4003                 }
4004             }
4005           if (skip_line_comment (pfile) && CPP_OPTION (pfile, warn_comments))
4006             cpp_warning (pfile, CPP_W_COMMENTS, "multi-line comment");
4007         }
4008       else if (c == '=')
4009         {
4010           buffer->cur++;
4011           result->type = CPP_DIV_EQ;
4012           break;
4013         }
4014       else
4015         {
4016           result->type = CPP_DIV;
4017           break;
4018         }
4019       /* Only reached when a block or line comment was skipped above.  */
4020       if (fallthrough_comment_p (pfile, comment_start))
4021         fallthrough_comment = true;
4022
4023       if (pfile->cb.comment)
4024         {
4025           size_t len = pfile->buffer->cur - comment_start;
4026           pfile->cb.comment (pfile, result->src_loc, comment_start - 1,
4027                              len + 1);
4028         }
4029
4030       if (!pfile->state.save_comments)
4031         {
4032           result->flags |= PREV_WHITE;
4033           goto update_tokens_line;
4034         }
4035
4036       if (fallthrough_comment)
4037         result->flags |= PREV_FALLTHROUGH;
4038
4039       /* Save the comment as a token in its own right.  */
4040       save_comment (pfile, result, comment_start, c);
4041       break;
4042
4043     case '<':
4044       if (pfile->state.angled_headers)
4045         {
4046           lex_string (pfile, result, buffer->cur - 1);
4047           if (result->type != CPP_LESS)
4048             break;
4049         }
4050
4051       result->type = CPP_LESS;
4052       if (*buffer->cur == '=')
4053         {
4054           buffer->cur++, result->type = CPP_LESS_EQ;
4055           if (*buffer->cur == '>'
4056               && CPP_OPTION (pfile, cplusplus)
4057               && CPP_OPTION (pfile, lang) >= CLK_GNUCXX20)
4058             buffer->cur++, result->type = CPP_SPACESHIP;
4059         }
4060       else if (*buffer->cur == '<')
4061         {
4062           buffer->cur++;
4063           IF_NEXT_IS ('=', CPP_LSHIFT_EQ, CPP_LSHIFT);
4064         }
4065       else if (CPP_OPTION (pfile, digraphs))
4066         {
4067           if (*buffer->cur == ':')
4068             {
4069               /* C++11 [2.5/3 lex.pptoken], "Otherwise, if the next
4070                  three characters are <:: and the subsequent character
4071                  is neither : nor >, the < is treated as a preprocessor
4072                  token by itself".  */
4073               if (CPP_OPTION (pfile, cplusplus)
4074                   && CPP_OPTION (pfile, lang) != CLK_CXX98
4075                   && CPP_OPTION (pfile, lang) != CLK_GNUCXX
4076                   && buffer->cur[1] == ':'
4077                   && buffer->cur[2] != ':' && buffer->cur[2] != '>')
4078                 break;
4079
4080               buffer->cur++;
4081               result->flags |= DIGRAPH;
4082               result->type = CPP_OPEN_SQUARE;
4083             }
4084           else if (*buffer->cur == '%')
4085             {
4086               buffer->cur++;
4087               result->flags |= DIGRAPH;
4088               result->type = CPP_OPEN_BRACE;
4089             }
4090         }
4091       break;
4092
4093     case '>':
4094       result->type = CPP_GREATER;
4095       if (*buffer->cur == '=')
4096         buffer->cur++, result->type = CPP_GREATER_EQ;
4097       else if (*buffer->cur == '>')
4098         {
4099           buffer->cur++;
4100           IF_NEXT_IS ('=', CPP_RSHIFT_EQ, CPP_RSHIFT);
4101         }
4102       break;
4103
4104     case '%':
4105       result->type = CPP_MOD;
4106       if (*buffer->cur == '=')
4107         buffer->cur++, result->type = CPP_MOD_EQ;
4108       else if (CPP_OPTION (pfile, digraphs))
4109         {
4110           if (*buffer->cur == ':')
4111             {
4112               buffer->cur++;
4113               result->flags |= DIGRAPH;
4114               result->type = CPP_HASH;
4115               if (*buffer->cur == '%' && buffer->cur[1] == ':')
4116                 buffer->cur += 2, result->type = CPP_PASTE, result->val.token_no = 0;
4117             }
4118           else if (*buffer->cur == '>')
4119             {
4120               buffer->cur++;
4121               result->flags |= DIGRAPH;
4122               result->type = CPP_CLOSE_BRACE;
4123             }
4124         }
4125       break;
4126
4127     case '.':
4128       result->type = CPP_DOT;
4129       if (ISDIGIT (*buffer->cur))
4130         {
4131           struct normalize_state nst = INITIAL_NORMALIZE_STATE;
4132           result->type = CPP_NUMBER;
4133           lex_number (pfile, &result->val.str, &nst);
4134           warn_about_normalization (pfile, result, &nst, false);
4135         }
4136       else if (*buffer->cur == '.' && buffer->cur[1] == '.')
4137         buffer->cur += 2, result->type = CPP_ELLIPSIS;
4138       else if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
4139         buffer->cur++, result->type = CPP_DOT_STAR;
4140       break;
4141
4142     case '+':
4143       result->type = CPP_PLUS;
4144       if (*buffer->cur == '+')
4145         buffer->cur++, result->type = CPP_PLUS_PLUS;
4146       else if (*buffer->cur == '=')
4147         buffer->cur++, result->type = CPP_PLUS_EQ;
4148       break;
4149
4150     case '-':
4151       result->type = CPP_MINUS;
4152       if (*buffer->cur == '>')
4153         {
4154           buffer->cur++;
4155           result->type = CPP_DEREF;
4156           if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
4157             buffer->cur++, result->type = CPP_DEREF_STAR;
4158         }
4159       else if (*buffer->cur == '-')
4160         buffer->cur++, result->type = CPP_MINUS_MINUS;
4161       else if (*buffer->cur == '=')
4162         buffer->cur++, result->type = CPP_MINUS_EQ;
4163       break;
4164
4165     case '&':
4166       result->type = CPP_AND;
4167       if (*buffer->cur == '&')
4168         buffer->cur++, result->type = CPP_AND_AND;
4169       else if (*buffer->cur == '=')
4170         buffer->cur++, result->type = CPP_AND_EQ;
4171       break;
4172
4173     case '|':
4174       result->type = CPP_OR;
4175       if (*buffer->cur == '|')
4176         buffer->cur++, result->type = CPP_OR_OR;
4177       else if (*buffer->cur == '=')
4178         buffer->cur++, result->type = CPP_OR_EQ;
4179       break;
4180
4181     case ':':
4182       result->type = CPP_COLON;
4183       if (*buffer->cur == ':' && CPP_OPTION (pfile, scope))
4184         buffer->cur++, result->type = CPP_SCOPE;
4185       else if (*buffer->cur == '>' && CPP_OPTION (pfile, digraphs))
4186         {
4187           buffer->cur++;
4188           result->flags |= DIGRAPH;
4189           result->type = CPP_CLOSE_SQUARE;
4190         }
4191       break;
4192
4193     case '*': IF_NEXT_IS ('=', CPP_MULT_EQ, CPP_MULT); break;
4194     case '=': IF_NEXT_IS ('=', CPP_EQ_EQ, CPP_EQ); break;
4195     case '!': IF_NEXT_IS ('=', CPP_NOT_EQ, CPP_NOT); break;
4196     case '^': IF_NEXT_IS ('=', CPP_XOR_EQ, CPP_XOR); break;
4197     case '#': IF_NEXT_IS ('#', CPP_PASTE, CPP_HASH); result->val.token_no = 0; break;
4198
4199     case '?': result->type = CPP_QUERY; break;
4200     case '~': result->type = CPP_COMPL; break;
4201     case ',': result->type = CPP_COMMA; break;
4202     case '(': result->type = CPP_OPEN_PAREN; break;
4203     case ')': result->type = CPP_CLOSE_PAREN; break;
4204     case '[': result->type = CPP_OPEN_SQUARE; break;
4205     case ']': result->type = CPP_CLOSE_SQUARE; break;
4206     case '{': result->type = CPP_OPEN_BRACE; break;
4207     case '}': result->type = CPP_CLOSE_BRACE; break;
4208     case ';': result->type = CPP_SEMICOLON; break;
4209
4210       /* @ is a punctuator in Objective-C.  */
4211     case '@': result->type = CPP_ATSIGN; break;
4212
4213     default:
4214       {
4215         const uchar *base = --buffer->cur;
4216         static int no_warn_cnt; /* Continuation bytes to pass without warning; kept across calls.  */
4217
4218         /* Check for an extended identifier ($ or UCN or UTF-8).  */
4219         struct normalize_state nst = INITIAL_NORMALIZE_STATE;
4220         if (forms_identifier_p (pfile, true, &nst))
4221           {
4222             result->type = CPP_NAME;
4223             result->val.node.node = lex_identifier (pfile, base, true, &nst,
4224                                                     &result->val.node.spelling);
4225             warn_about_normalization (pfile, result, &nst, true);
4226             break;
4227           }
4228
4229         /* Otherwise this will form a CPP_OTHER token.  Parse valid UTF-8 as a
4230            single token.  */
4231         buffer->cur++;
4232         if (c >= utf8_signifier)
4233           {
4234             const uchar *pstr = base;
4235             cppchar_t s;
4236             if (_cpp_valid_utf8 (pfile, &pstr, buffer->rlimit, 0, NULL, &s))
4237               {
4238                 if (s > UCS_LIMIT && CPP_OPTION (pfile, cpp_warn_invalid_utf8))
4239                   {
4240                     buffer->cur = base;
4241                     _cpp_warn_invalid_utf8 (pfile);
4242                   }
4243                 buffer->cur = pstr;
4244               }
4245             else if (CPP_OPTION (pfile, cpp_warn_invalid_utf8))
4246               {
4247                 buffer->cur = base;
4248                 const uchar *end = _cpp_warn_invalid_utf8 (pfile);
4249                 buffer->cur = base + 1;
4250                 no_warn_cnt = end - buffer->cur;
4251               }
4252           }
4253         else if (c >= utf8_continuation
4254                  && CPP_OPTION (pfile, cpp_warn_invalid_utf8))
4255           {
4256             if (no_warn_cnt)
4257               --no_warn_cnt;
4258             else
4259               {
4260                 buffer->cur = base;
4261                 _cpp_warn_invalid_utf8 (pfile);
4262                 buffer->cur = base + 1;
4263               }
4264           }
4265         create_literal (pfile, result, base, buffer->cur - base, CPP_OTHER);
4266         break;
4267       }
4268
4269     }
4270
4271   /* Potentially convert the location of the token to a range.  */
4272   if (result->src_loc >= RESERVED_LOCATION_COUNT
4273       && result->type != CPP_EOF)
4274     {
4275       /* Ensure that any line notes are processed, so that we have the
4276          correct physical line/column for the end-point of the token even
4277          when a logical line is split via one or more backslashes.  */
4278       if (buffer->cur >= buffer->notes[buffer->cur_note].pos
4279           && !pfile->overlaid_buffer)
4280         _cpp_process_line_notes (pfile, false);
4281
4282       source_range tok_range;
4283       tok_range.m_start = result->src_loc;
4284       tok_range.m_finish
4285         = linemap_position_for_column (pfile->line_table,
4286                                        CPP_BUF_COLUMN (buffer, buffer->cur));
4287
4288       result->src_loc = COMBINE_LOCATION_DATA (pfile->line_table,
4289                                                result->src_loc,
4290                                                tok_range, NULL, 0);
4291     }
4292
4293   return result;
4294 }
4295
4296 /* An upper bound on the number of bytes needed to spell TOKEN.
4297    Does not include preceding whitespace.  */
4298 unsigned int
4299 cpp_token_len (const cpp_token *token)
4300 {
4301   unsigned int len;
4302
4303   switch (TOKEN_SPELL (token))
4304     {
4305     default:            len = 6;                                break;
4306     case SPELL_LITERAL: len = token->val.str.len;               break;
4307     case SPELL_IDENT:   len = NODE_LEN (token->val.node.node) * 10;     break;
4308     }
4309
4310   return len;
4311 }
4312
/* Decode the single UTF-8 sequence that starts at NAME and store the
   equivalent \UXXXXXXXX escape (lower-case hex digits) in BUFFER.
   Exactly 10 bytes are written to BUFFER and no terminator is added.
   Returns the number of bytes of NAME that made up the sequence.
   Aborts if a trailing byte is not of the form 10xxxxxx.  */

static size_t
utf8_to_ucn (unsigned char *buffer, const unsigned char *name)
{
  static const char hex_digits[] = "0123456789abcdef";
  int seq_len = 0;
  int k;
  unsigned lead;
  unsigned long code_point;

  /* The count of leading one bits in the first byte is the length
     of the whole sequence.  */
  for (lead = *name; (lead & 0x80) != 0; lead <<= 1)
    seq_len++;

  /* Payload bits of the lead byte, then six more bits from each
     trailing byte.  */
  code_point = *name & (0x7F >> seq_len);
  for (k = 1; k < seq_len; k++)
    {
      const unsigned char trail = *++name;

      /* Ill-formed UTF-8.  */
      if ((trail & ~0x3F) != 0x80)
        abort ();

      code_point = (code_point << 6) | (trail & 0x3F);
    }

  buffer[0] = '\\';
  buffer[1] = 'U';
  /* Eight hex digits, most significant nibble first.  */
  for (k = 0; k < 8; k++)
    buffer[2 + k] = hex_digits[(code_point >> (28 - 4 * k)) & 0xF];

  return seq_len;
}
4346
4347 /* Given a token TYPE corresponding to a digraph, return a pointer to
4348    the spelling of the digraph.  */
4349 static const unsigned char *
4350 cpp_digraph2name (enum cpp_ttype type)
4351 {
4352   return digraph_spellings[(int) type - (int) CPP_FIRST_DIGRAPH];
4353 }
4354
4355 /* Write the spelling of an identifier IDENT, using UCNs, to BUFFER.
4356    The buffer must already contain the enough space to hold the
4357    token's spelling.  Returns a pointer to the character after the
4358    last character written.  */
4359 unsigned char *
4360 _cpp_spell_ident_ucns (unsigned char *buffer, cpp_hashnode *ident)
4361 {
4362   size_t i;
4363   const unsigned char *name = NODE_NAME (ident);
4364
4365   for (i = 0; i < NODE_LEN (ident); i++)
4366     if (name[i] & ~0x7F)
4367       {
4368         i += utf8_to_ucn (buffer, name + i) - 1;
4369         buffer += 10;
4370       }
4371     else
4372       *buffer++ = name[i];
4373
4374   return buffer;
4375 }
4376
4377 /* Write the spelling of a token TOKEN to BUFFER.  The buffer must
4378    already contain the enough space to hold the token's spelling.
4379    Returns a pointer to the character after the last character written.
4380    FORSTRING is true if this is to be the spelling after translation
4381    phase 1 (with the original spelling of extended identifiers), false
4382    if extended identifiers should always be written using UCNs (there is
4383    no option for always writing them in the internal UTF-8 form).
4384    FIXME: Would be nice if we didn't need the PFILE argument.  */
4385 unsigned char *
4386 cpp_spell_token (cpp_reader *pfile, const cpp_token *token,
4387                  unsigned char *buffer, bool forstring)
4388 {
4389   switch (TOKEN_SPELL (token))
4390     {
4391     case SPELL_OPERATOR:
4392       {
4393         const unsigned char *spelling;
4394         unsigned char c;
4395
4396         if (token->flags & DIGRAPH)
4397           spelling = cpp_digraph2name (token->type);
4398         else if (token->flags & NAMED_OP)
4399           goto spell_ident;
4400         else
4401           spelling = TOKEN_NAME (token);
4402
4403         while ((c = *spelling++) != '\0')
4404           *buffer++ = c;
4405       }
4406       break;
4407
4408     spell_ident:
4409     case SPELL_IDENT:
4410       if (forstring)
4411         {
4412           memcpy (buffer, NODE_NAME (token->val.node.spelling),
4413                   NODE_LEN (token->val.node.spelling));
4414           buffer += NODE_LEN (token->val.node.spelling);
4415         }
4416       else
4417         buffer = _cpp_spell_ident_ucns (buffer, token->val.node.node);
4418       break;
4419
4420     case SPELL_LITERAL:
4421       memcpy (buffer, token->val.str.text, token->val.str.len);
4422       buffer += token->val.str.len;
4423       break;
4424
4425     case SPELL_NONE:
4426       cpp_error (pfile, CPP_DL_ICE,
4427                  "unspellable token %s", TOKEN_NAME (token));
4428       break;
4429     }
4430
4431   return buffer;
4432 }
4433
4434 /* Returns TOKEN spelt as a null-terminated string.  The string is
4435    freed when the reader is destroyed.  Useful for diagnostics.  */
4436 unsigned char *
4437 cpp_token_as_text (cpp_reader *pfile, const cpp_token *token)
4438 {
4439   unsigned int len = cpp_token_len (token) + 1;
4440   unsigned char *start = _cpp_unaligned_alloc (pfile, len), *end;
4441
4442   end = cpp_spell_token (pfile, token, start, false);
4443   end[0] = '\0';
4444
4445   return start;
4446 }
4447
4448 /* Returns a pointer to a string which spells the token defined by
4449    TYPE and FLAGS.  Used by C front ends, which really should move to
4450    using cpp_token_as_text.  */
4451 const char *
4452 cpp_type2name (enum cpp_ttype type, unsigned char flags)
4453 {
4454   if (flags & DIGRAPH)
4455     return (const char *) cpp_digraph2name (type);
4456   else if (flags & NAMED_OP)
4457     return cpp_named_operator2name (type);
4458
4459   return (const char *) token_spellings[type].name;
4460 }
4461
4462 /* Writes the spelling of token to FP, without any preceding space.
4463    Separated from cpp_spell_token for efficiency - to avoid stdio
4464    double-buffering.  */
4465 void
4466 cpp_output_token (const cpp_token *token, FILE *fp)
4467 {
4468   switch (TOKEN_SPELL (token))
4469     {
4470     case SPELL_OPERATOR:
4471       {
4472         const unsigned char *spelling;
4473         int c;
4474
4475         if (token->flags & DIGRAPH)
4476           spelling = cpp_digraph2name (token->type);
4477         else if (token->flags & NAMED_OP)
4478           goto spell_ident;
4479         else
4480           spelling = TOKEN_NAME (token);
4481
4482         c = *spelling;
4483         do
4484           putc (c, fp);
4485         while ((c = *++spelling) != '\0');
4486       }
4487       break;
4488
4489     spell_ident:
4490     case SPELL_IDENT:
4491       {
4492         size_t i;
4493         const unsigned char * name = NODE_NAME (token->val.node.node);
4494
4495         for (i = 0; i < NODE_LEN (token->val.node.node); i++)
4496           if (name[i] & ~0x7F)
4497             {
4498               unsigned char buffer[10];
4499               i += utf8_to_ucn (buffer, name + i) - 1;
4500               fwrite (buffer, 1, 10, fp);
4501             }
4502           else
4503             fputc (NODE_NAME (token->val.node.node)[i], fp);
4504       }
4505       break;
4506
4507     case SPELL_LITERAL:
4508       if (token->type == CPP_HEADER_NAME)
4509         fputc ('"', fp);
4510       fwrite (token->val.str.text, 1, token->val.str.len, fp);
4511       if (token->type == CPP_HEADER_NAME)
4512         fputc ('"', fp);
4513       break;
4514
4515     case SPELL_NONE:
4516       /* An error, most probably.  */
4517       break;
4518     }
4519 }
4520
4521 /* Compare two tokens.  */
4522 int
4523 _cpp_equiv_tokens (const cpp_token *a, const cpp_token *b)
4524 {
4525   if (a->type == b->type && a->flags == b->flags)
4526     switch (TOKEN_SPELL (a))
4527       {
4528       default:                  /* Keep compiler happy.  */
4529       case SPELL_OPERATOR:
4530         /* token_no is used to track where multiple consecutive ##
4531            tokens were originally located.  */
4532         return (a->type != CPP_PASTE || a->val.token_no == b->val.token_no);
4533       case SPELL_NONE:
4534         return (a->type != CPP_MACRO_ARG
4535                 || (a->val.macro_arg.arg_no == b->val.macro_arg.arg_no
4536                     && a->val.macro_arg.spelling == b->val.macro_arg.spelling));
4537       case SPELL_IDENT:
4538         return (a->val.node.node == b->val.node.node
4539                 && a->val.node.spelling == b->val.node.spelling);
4540       case SPELL_LITERAL:
4541         return (a->val.str.len == b->val.str.len
4542                 && !memcmp (a->val.str.text, b->val.str.text,
4543                             a->val.str.len));
4544       }
4545
4546   return 0;
4547 }
4548
4549 /* Returns nonzero if a space should be inserted to avoid an
4550    accidental token paste for output.  For simplicity, it is
4551    conservative, and occasionally advises a space where one is not
4552    needed, e.g. "." and ".2".  */
4553 int
4554 cpp_avoid_paste (cpp_reader *pfile, const cpp_token *token1,
4555                  const cpp_token *token2)
4556 {
4557   enum cpp_ttype a = token1->type, b = token2->type;
4558   cppchar_t c;
4559
4560   if (token1->flags & NAMED_OP)
4561     a = CPP_NAME;
4562   if (token2->flags & NAMED_OP)
4563     b = CPP_NAME;
4564
4565   c = EOF;
4566   if (token2->flags & DIGRAPH)
4567     c = digraph_spellings[(int) b - (int) CPP_FIRST_DIGRAPH][0];
4568   else if (token_spellings[b].category == SPELL_OPERATOR)
4569     c = token_spellings[b].name[0];
4570
4571   /* Quickly get everything that can paste with an '='.  */
4572   if ((int) a <= (int) CPP_LAST_EQ && c == '=')
4573     return 1;
4574
4575   switch (a)
4576     {
4577     case CPP_GREATER:   return c == '>';
4578     case CPP_LESS:      return c == '<' || c == '%' || c == ':';
4579     case CPP_PLUS:      return c == '+';
4580     case CPP_MINUS:     return c == '-' || c == '>';
4581     case CPP_DIV:       return c == '/' || c == '*'; /* Comments.  */
4582     case CPP_MOD:       return c == ':' || c == '>';
4583     case CPP_AND:       return c == '&';
4584     case CPP_OR:        return c == '|';
4585     case CPP_COLON:     return c == ':' || c == '>';
4586     case CPP_DEREF:     return c == '*';
4587     case CPP_DOT:       return c == '.' || c == '%' || b == CPP_NUMBER;
4588     case CPP_HASH:      return c == '#' || c == '%'; /* Digraph form.  */
4589     case CPP_PRAGMA:
4590     case CPP_NAME:      return ((b == CPP_NUMBER
4591                                  && name_p (pfile, &token2->val.str))
4592                                 || b == CPP_NAME
4593                                 || b == CPP_CHAR || b == CPP_STRING); /* L */
4594     case CPP_NUMBER:    return (b == CPP_NUMBER || b == CPP_NAME
4595                                 || b == CPP_CHAR
4596                                 || c == '.' || c == '+' || c == '-');
4597                                       /* UCNs */
4598     case CPP_OTHER:     return ((token1->val.str.text[0] == '\\'
4599                                  && b == CPP_NAME)
4600                                 || (CPP_OPTION (pfile, objc)
4601                                     && token1->val.str.text[0] == '@'
4602                                     && (b == CPP_NAME || b == CPP_STRING)));
4603     case CPP_LESS_EQ:   return c == '>';
4604     case CPP_STRING:
4605     case CPP_WSTRING:
4606     case CPP_UTF8STRING:
4607     case CPP_STRING16:
4608     case CPP_STRING32:  return (CPP_OPTION (pfile, user_literals)
4609                                 && (b == CPP_NAME
4610                                     || (TOKEN_SPELL (token2) == SPELL_LITERAL
4611                                         && ISIDST (token2->val.str.text[0]))));
4612
4613     default:            break;
4614     }
4615
4616   return 0;
4617 }
4618
4619 /* Output all the remaining tokens on the current line, and a newline
4620    character, to FP.  Leading whitespace is removed.  If there are
4621    macros, special token padding is not performed.  */
4622 void
4623 cpp_output_line (cpp_reader *pfile, FILE *fp)
4624 {
4625   const cpp_token *token;
4626
4627   token = cpp_get_token (pfile);
4628   while (token->type != CPP_EOF)
4629     {
4630       cpp_output_token (token, fp);
4631       token = cpp_get_token (pfile);
4632       if (token->flags & PREV_WHITE)
4633         putc (' ', fp);
4634     }
4635
4636   putc ('\n', fp);
4637 }
4638
4639 /* Return a string representation of all the remaining tokens on the
4640    current line.  The result is allocated using xmalloc and must be
4641    freed by the caller.  */
4642 unsigned char *
4643 cpp_output_line_to_string (cpp_reader *pfile, const unsigned char *dir_name)
4644 {
4645   const cpp_token *token;
4646   unsigned int out = dir_name ? ustrlen (dir_name) : 0;
4647   unsigned int alloced = 120 + out;
4648   unsigned char *result = (unsigned char *) xmalloc (alloced);
4649
4650   /* If DIR_NAME is empty, there are no initial contents.  */
4651   if (dir_name)
4652     {
4653       sprintf ((char *) result, "#%s ", dir_name);
4654       out += 2;
4655     }
4656
4657   token = cpp_get_token (pfile);
4658   while (token->type != CPP_EOF)
4659     {
4660       unsigned char *last;
4661       /* Include room for a possible space and the terminating nul.  */
4662       unsigned int len = cpp_token_len (token) + 2;
4663
4664       if (out + len > alloced)
4665         {
4666           alloced *= 2;
4667           if (out + len > alloced)
4668             alloced = out + len;
4669           result = (unsigned char *) xrealloc (result, alloced);
4670         }
4671
4672       last = cpp_spell_token (pfile, token, &result[out], 0);
4673       out = last - result;
4674
4675       token = cpp_get_token (pfile);
4676       if (token->flags & PREV_WHITE)
4677         result[out++] = ' ';
4678     }
4679
4680   result[out] = '\0';
4681   return result;
4682 }
4683
/* Memory buffers.  Changing these three definitions can have a
   dramatic effect on performance.  The values here are reasonable
   defaults, but might be tuned.  If you adjust them, be sure to test
   across a range of uses of cpplib, including heavy nested
   function-like macro expansion.  Also check the change in peak
   memory usage (NJAMD is a good tool for this).

   MIN_BUFF_SIZE is the smallest buffer ever allocated.
   BUFF_SIZE_UPPER_BOUND (MIN_SIZE) is the largest free buffer that
   will be reused to satisfy a request for MIN_SIZE bytes.
   EXTENDED_BUFF_SIZE (BUFF, MIN_EXTRA) is the size requested when
   extending BUFF: MIN_EXTRA plus twice the space between BUFF's cur
   and limit pointers.  Every macro argument is parenthesized so that
   an argument expression cannot regroup with the expansion.  */
#define MIN_BUFF_SIZE 8000
#define BUFF_SIZE_UPPER_BOUND(MIN_SIZE) (MIN_BUFF_SIZE + (MIN_SIZE) * 3 / 2)
#define EXTENDED_BUFF_SIZE(BUFF, MIN_EXTRA) \
        ((MIN_EXTRA) + ((BUFF)->limit - (BUFF)->cur) * 2)

#if MIN_BUFF_SIZE > BUFF_SIZE_UPPER_BOUND (0)
  #error BUFF_SIZE_UPPER_BOUND must be at least as large as MIN_BUFF_SIZE!
#endif
4698
4699 /* Create a new allocation buffer.  Place the control block at the end
4700    of the buffer, so that buffer overflows will cause immediate chaos.  */
4701 static _cpp_buff *
4702 new_buff (size_t len)
4703 {
4704   _cpp_buff *result;
4705   unsigned char *base;
4706
4707   if (len < MIN_BUFF_SIZE)
4708     len = MIN_BUFF_SIZE;
4709   len = CPP_ALIGN (len);
4710
4711 #ifdef ENABLE_VALGRIND_ANNOTATIONS
4712   /* Valgrind warns about uses of interior pointers, so put _cpp_buff
4713      struct first.  */
4714   size_t slen = CPP_ALIGN2 (sizeof (_cpp_buff), 2 * DEFAULT_ALIGNMENT);
4715   base = XNEWVEC (unsigned char, len + slen);
4716   result = (_cpp_buff *) base;
4717   base += slen;
4718 #else
4719   base = XNEWVEC (unsigned char, len + sizeof (_cpp_buff));
4720   result = (_cpp_buff *) (base + len);
4721 #endif
4722   result->base = base;
4723   result->cur = base;
4724   result->limit = base + len;
4725   result->next = NULL;
4726   return result;
4727 }
4728
4729 /* Place a chain of unwanted allocation buffers on the free list.  */
4730 void
4731 _cpp_release_buff (cpp_reader *pfile, _cpp_buff *buff)
4732 {
4733   _cpp_buff *end = buff;
4734
4735   while (end->next)
4736     end = end->next;
4737   end->next = pfile->free_buffs;
4738   pfile->free_buffs = buff;
4739 }
4740
4741 /* Return a free buffer of size at least MIN_SIZE.  */
4742 _cpp_buff *
4743 _cpp_get_buff (cpp_reader *pfile, size_t min_size)
4744 {
4745   _cpp_buff *result, **p;
4746
4747   for (p = &pfile->free_buffs;; p = &(*p)->next)
4748     {
4749       size_t size;
4750
4751       if (*p == NULL)
4752         return new_buff (min_size);
4753       result = *p;
4754       size = result->limit - result->base;
4755       /* Return a buffer that's big enough, but don't waste one that's
4756          way too big.  */
4757       if (size >= min_size && size <= BUFF_SIZE_UPPER_BOUND (min_size))
4758         break;
4759     }
4760
4761   *p = result->next;
4762   result->next = NULL;
4763   result->cur = result->base;
4764   return result;
4765 }
4766
4767 /* Creates a new buffer with enough space to hold the uncommitted
4768    remaining bytes of BUFF, and at least MIN_EXTRA more bytes.  Copies
4769    the excess bytes to the new buffer.  Chains the new buffer after
4770    BUFF, and returns the new buffer.  */
4771 _cpp_buff *
4772 _cpp_append_extend_buff (cpp_reader *pfile, _cpp_buff *buff, size_t min_extra)
4773 {
4774   size_t size = EXTENDED_BUFF_SIZE (buff, min_extra);
4775   _cpp_buff *new_buff = _cpp_get_buff (pfile, size);
4776
4777   buff->next = new_buff;
4778   memcpy (new_buff->base, buff->cur, BUFF_ROOM (buff));
4779   return new_buff;
4780 }
4781
4782 /* Creates a new buffer with enough space to hold the uncommitted
4783    remaining bytes of the buffer pointed to by BUFF, and at least
4784    MIN_EXTRA more bytes.  Copies the excess bytes to the new buffer.
4785    Chains the new buffer before the buffer pointed to by BUFF, and
4786    updates the pointer to point to the new buffer.  */
4787 void
4788 _cpp_extend_buff (cpp_reader *pfile, _cpp_buff **pbuff, size_t min_extra)
4789 {
4790   _cpp_buff *new_buff, *old_buff = *pbuff;
4791   size_t size = EXTENDED_BUFF_SIZE (old_buff, min_extra);
4792
4793   new_buff = _cpp_get_buff (pfile, size);
4794   memcpy (new_buff->base, old_buff->cur, BUFF_ROOM (old_buff));
4795   new_buff->next = old_buff;
4796   *pbuff = new_buff;
4797 }
4798
4799 /* Free a chain of buffers starting at BUFF.  */
4800 void
4801 _cpp_free_buff (_cpp_buff *buff)
4802 {
4803   _cpp_buff *next;
4804
4805   for (; buff; buff = next)
4806     {
4807       next = buff->next;
4808 #ifdef ENABLE_VALGRIND_ANNOTATIONS
4809       free (buff);
4810 #else
4811       free (buff->base);
4812 #endif
4813     }
4814 }
4815
4816 /* Allocate permanent, unaligned storage of length LEN.  */
4817 unsigned char *
4818 _cpp_unaligned_alloc (cpp_reader *pfile, size_t len)
4819 {
4820   _cpp_buff *buff = pfile->u_buff;
4821   unsigned char *result = buff->cur;
4822
4823   if (len > (size_t) (buff->limit - result))
4824     {
4825       buff = _cpp_get_buff (pfile, len);
4826       buff->next = pfile->u_buff;
4827       pfile->u_buff = buff;
4828       result = buff->cur;
4829     }
4830
4831   buff->cur = result + len;
4832   return result;
4833 }
4834
4835 /* Allocate permanent, unaligned storage of length LEN from a_buff.
4836    That buffer is used for growing allocations when saving macro
4837    replacement lists in a #define, and when parsing an answer to an
4838    assertion in #assert, #unassert or #if (and therefore possibly
4839    whilst expanding macros).  It therefore must not be used by any
4840    code that they might call: specifically the lexer and the guts of
4841    the macro expander.
4842
4843    All existing other uses clearly fit this restriction: storing
4844    registered pragmas during initialization.  */
4845 unsigned char *
4846 _cpp_aligned_alloc (cpp_reader *pfile, size_t len)
4847 {
4848   _cpp_buff *buff = pfile->a_buff;
4849   unsigned char *result = buff->cur;
4850
4851   if (len > (size_t) (buff->limit - result))
4852     {
4853       buff = _cpp_get_buff (pfile, len);
4854       buff->next = pfile->a_buff;
4855       pfile->a_buff = buff;
4856       result = buff->cur;
4857     }
4858
4859   buff->cur = result + len;
4860   return result;
4861 }
4862
4863 /* Commit or allocate storage from a buffer.  */
4864
4865 void *
4866 _cpp_commit_buff (cpp_reader *pfile, size_t size)
4867 {
4868   void *ptr = BUFF_FRONT (pfile->a_buff);
4869
4870   if (pfile->hash_table->alloc_subobject)
4871     {
4872       void *copy = pfile->hash_table->alloc_subobject (size);
4873       memcpy (copy, ptr, size);
4874       ptr = copy;
4875     }
4876   else
4877     BUFF_FRONT (pfile->a_buff) += size;
4878
4879   return ptr;
4880 }
4881
4882 /* Say which field of TOK is in use.  */
4883
4884 enum cpp_token_fld_kind
4885 cpp_token_val_index (const cpp_token *tok)
4886 {
4887   switch (TOKEN_SPELL (tok))
4888     {
4889     case SPELL_IDENT:
4890       return CPP_TOKEN_FLD_NODE;
4891     case SPELL_LITERAL:
4892       return CPP_TOKEN_FLD_STR;
4893     case SPELL_OPERATOR:
4894       /* Operands which were originally spelled as ident keep around
4895          the node for the exact spelling.  */
4896       if (tok->flags & NAMED_OP)
4897         return CPP_TOKEN_FLD_NODE;
4898       else if (tok->type == CPP_PASTE)
4899         return CPP_TOKEN_FLD_TOKEN_NO;
4900       else
4901         return CPP_TOKEN_FLD_NONE;
4902     case SPELL_NONE:
4903       if (tok->type == CPP_MACRO_ARG)
4904         return CPP_TOKEN_FLD_ARG_NO;
4905       else if (tok->type == CPP_PADDING)
4906         return CPP_TOKEN_FLD_SOURCE;
4907       else if (tok->type == CPP_PRAGMA)
4908         return CPP_TOKEN_FLD_PRAGMA;
4909       /* fall through */
4910     default:
4911       return CPP_TOKEN_FLD_NONE;
4912     }
4913 }
4914
4915 /* All tokens lexed in R after calling this function will be forced to
4916    have their location_t to be P, until
4917    cpp_stop_forcing_token_locations is called for R.  */
4918
4919 void
4920 cpp_force_token_locations (cpp_reader *r, location_t loc)
4921 {
4922   r->forced_token_location = loc;
4923 }
4924
4925 /* Go back to assigning locations naturally for lexed tokens.  */
4926
4927 void
4928 cpp_stop_forcing_token_locations (cpp_reader *r)
4929 {
4930   r->forced_token_location = 0;
4931 }
4932
/* PEEK points at a backslash.  If the backslash escapes an end of
   line (\n, \r or \r\n), step past the escaped line ending, and keep
   going for as long as that lands on another escaping backslash.
   Never advance to LIMIT or beyond: if the character after the line
   ending would not be below LIMIT, stay put.  Returns the resulting
   position, which is PEEK itself when nothing was skipped.  */

static const unsigned char *
do_peek_backslash (const unsigned char *peek, const unsigned char *limit)
{
  for (;;)
    {
      unsigned int skip;

      if (__builtin_expect (peek[1] == '\n', true))
        skip = 2;
      else if (__builtin_expect (peek[1] == '\r', false))
        /* A \r may be followed by a \n belonging to the same line
           ending.  */
        skip = peek[2] == '\n' ? 3 : 2;
      else
        return peek;

      const unsigned char *after = peek + skip;
      if (!__builtin_expect (after < limit, true))
        return peek;

      peek = after;
      /* The user might be perverse and continue the line again.  */
      if (*peek != '\\')
        return peek;
    }
}
4962
4963 static const unsigned char *
4964 do_peek_next (const unsigned char *peek, const unsigned char *limit)
4965 {
4966   if (__builtin_expect (*peek == '\\', false))
4967     peek = do_peek_backslash (peek, limit);
4968   return peek;
4969 }
4970
/* Step back one character from PEEK, never going below BOUND, and
   return a pointer to that character.  Returns NULL if PEEK is
   already at BOUND.  If the character stepped onto is a line ending
   (\n, \r, or the \n of a \r\n pair) that is immediately preceded by
   a backslash, the escaped line ending is skipped as well and the
   search continues in front of the backslash.  */
static const unsigned char *
do_peek_prev (const unsigned char *peek, const unsigned char *bound)
{
  if (peek == bound)
    return NULL;

  unsigned char c = *--peek;
  /* Test for a carriage return, not the letter 'r'.  */
  if (__builtin_expect (c == '\n', false)
      || __builtin_expect (c == '\r', false))
    {
      if (peek == bound)
        return peek;

      /* IX is the offset of the character in front of the line
         ending; a \r\n pair is treated as a single line ending.  */
      int ix = -1;
      if (c == '\n' && peek[ix] == '\r')
        {
          if (peek + ix == bound)
            return peek;
          ix--;
        }

      /* An escaped line ending is invisible: resume in front of the
         backslash.  */
      if (peek[ix] == '\\')
        return do_peek_prev (peek + ix, bound);

      return peek;
    }
  else
    return peek;
}
4999
5000 /* If PEEK[-1] is identifier MATCH, scan past it and trailing white
5001    space.  Otherwise return NULL.  */
5002
5003 static const unsigned char *
5004 do_peek_ident (const char *match, const unsigned char *peek,
5005                const unsigned char *limit)
5006 {
5007   for (; *++match; peek++)
5008     if (*peek != *match)
5009       {
5010         peek = do_peek_next (peek, limit);
5011         if (*peek != *match)
5012           return NULL;
5013       }
5014
5015   /* Must now not be looking at an identifier char.  */
5016   peek = do_peek_next (peek, limit);
5017   if (ISIDNUM (*peek))
5018     return NULL;
5019
5020   /* Skip control-line whitespace.  */
5021  ws:
5022   while (*peek == ' ' || *peek == '\t')
5023     peek++;
5024   if (__builtin_expect (*peek == '\\', false))
5025     {
5026       peek = do_peek_backslash (peek, limit);
5027       if (*peek != '\\')
5028         goto ws;
5029     }
5030
5031   return peek;
5032 }
5033
5034 /* Are we looking at a module control line starting as PEEK - 1?  */
5035
5036 static bool
5037 do_peek_module (cpp_reader *pfile, unsigned char c,
5038                 const unsigned char *peek, const unsigned char *limit)
5039 {
5040   bool import = false;
5041
5042   if (__builtin_expect (c == 'e', false))
5043     {
5044       if (!((peek[0] == 'x' || peek[0] == '\\')
5045             && (peek = do_peek_ident ("export", peek, limit))))
5046         return false;
5047
5048       /* export, peek for import or module.  No need to peek __import
5049          here.  */
5050       if (peek[0] == 'i')
5051         {
5052           if (!((peek[1] == 'm' || peek[1] == '\\')
5053                 && (peek = do_peek_ident ("import", peek + 1, limit))))
5054             return false;
5055           import = true;
5056         }
5057       else if (peek[0] == 'm')
5058         {
5059           if (!((peek[1] == 'o' || peek[1] == '\\')
5060                 && (peek = do_peek_ident ("module", peek + 1, limit))))
5061             return false;
5062         }
5063       else
5064         return false;
5065     }
5066   else if (__builtin_expect (c == 'i', false))
5067     {
5068       if (!((peek[0] == 'm' || peek[0] == '\\')
5069             && (peek = do_peek_ident ("import", peek, limit))))
5070         return false;
5071       import = true;
5072     }
5073   else if (__builtin_expect (c == '_', false))
5074     {
5075       /* Needed for translated includes.   */
5076       if (!((peek[0] == '_' || peek[0] == '\\')
5077             && (peek = do_peek_ident ("__import", peek, limit))))
5078         return false;
5079       import = true;
5080     }
5081   else if (__builtin_expect (c == 'm', false))
5082     {
5083       if (!((peek[0] == 'o' || peek[0] == '\\')
5084             && (peek = do_peek_ident ("module", peek, limit))))
5085         return false;
5086     }
5087   else
5088     return false;
5089
5090   /* Peek the next character to see if it's good enough.  We'll be at
5091      the first non-whitespace char, including skipping an escaped
5092      newline.  */
5093   /* ... import followed by identifier, ':', '<' or header-name
5094      preprocessing tokens, or module followed by identifier, ':' or
5095      ';' preprocessing tokens.  */
5096   unsigned char p = *peek++;
5097
5098   /* A character literal is ... single quotes, ... optionally preceded
5099      by u8, u, U, or L */
5100   /* A string-literal is a ... double quotes, optionally prefixed by
5101      R, u8, u8R, u, uR, U, UR, L, or LR */
5102   if (p == 'u')
5103     {
5104       peek = do_peek_next (peek, limit);
5105       if (*peek == '8')
5106         {
5107           peek++;
5108           goto peek_u8;
5109         }
5110       goto peek_u;
5111     }
5112   else if (p == 'U' || p == 'L')
5113     {
5114     peek_u8:
5115       peek = do_peek_next (peek, limit);
5116     peek_u:
5117       if (*peek == '\"' || *peek == '\'')
5118         return false;
5119
5120       if (*peek == 'R')
5121         goto peek_R;
5122       /* Identifier. Ok.  */
5123     }
5124   else if (p == 'R')
5125     {
5126     peek_R:
5127       if (CPP_OPTION (pfile, rliterals))
5128         {
5129           peek = do_peek_next (peek, limit);
5130           if (*peek == '\"')
5131             return false;
5132         }
5133       /* Identifier. Ok.  */
5134     }
5135   else if ('Z' - 'A' == 25
5136            ? ((p >= 'A' && p <= 'Z') || (p >= 'a' && p <= 'z') || p == '_')
5137            : ISIDST (p))
5138     {
5139       /* Identifier.  Ok. */
5140     }
5141   else if (p == '<')
5142     {
5143       /* Maybe angle header, ok for import.  Reject
5144          '<=', '<<' digraph:'<:'.  */
5145       if (!import)
5146         return false;
5147       peek = do_peek_next (peek, limit);
5148       if (*peek == '=' || *peek == '<'
5149           || (*peek == ':' && CPP_OPTION (pfile, digraphs)))
5150         return false;
5151     }
5152   else if (p == ';')
5153     {
5154       /* SEMICOLON, ok for module.  */
5155       if (import)
5156         return false;
5157     }
5158   else if (p == '"')
5159     {
5160       /* STRING, ok for import.  */
5161       if (!import)
5162         return false;
5163     }
5164   else if (p == ':')
5165     {
5166       /* Maybe COLON, ok.  Reject '::', digraph:':>'.  */
5167       peek = do_peek_next (peek, limit);
5168       if (*peek == ':' || (*peek == '>' && CPP_OPTION (pfile, digraphs)))
5169         return false;
5170     }
5171   else
5172     /* FIXME: Detect a unicode character, excluding those not
5173        permitted as the initial character. [lex.name]/1.  I presume
5174        we need to check the \[uU] spellings, and directly using
5175        Unicode in say UTF8 form?  Or perhaps we do the phase-1
5176        conversion of UTF8 to universal-character-names?  */
5177     return false;
5178
5179   return true;
5180 }
5181
5182 /* Directives-only scanning.  Somewhat more relaxed than correct
5183    parsing -- some ill-formed programs will not be rejected.

        Scans PFILE's current buffer and, after popping it, whatever
        buffer is then current, until none remains.  The text is only
        examined closely enough to find directives (a '#' that starts a
        line) and, when the module_directives option is set, module
        control lines.  Comments and (possibly raw) string and character
        literals are stepped over so that their contents are not mistaken
        for either.

        CB is called with PFILE, a task code and DATA, followed by
        arguments that depend on the task:

          CPP_DO_print     the number of line ends counted since the start
                           of the scanned range, a pointer to the start of
                           the range, and its length in bytes.
          CPP_DO_location  the line table's highest_line, issued after a
                           directive or module control line when text
                           remains in the buffer.
          CPP_DO_token     a token and its spelling location, issued for
                           each token of a module control line.

        Text is not reported with CPP_DO_print while
        PFILE->state.skipping is set.  */
5184
5185 void
5186 cpp_directive_only_process (cpp_reader *pfile,
5187                             void *data,
5188                             void (*cb) (cpp_reader *, CPP_DO_task, void *, ...))
5189 {
5190   bool module_p = CPP_OPTION (pfile, module_directives);
5191
5192   do
5193     {
5194     restart:
5195       /* Buffer initialization, but no line cleaning. */
5196       cpp_buffer *buffer = pfile->buffer;
5197       buffer->cur_note = buffer->notes_used = 0;
5198       buffer->cur = buffer->line_base = buffer->next_line;
5199       buffer->need_line = false;
5200       /* Files always end in a newline or carriage return.  We rely on this for
5201          character peeking safety.  */
5202       gcc_assert (buffer->rlimit[0] == '\n' || buffer->rlimit[0] == '\r');
5203
           /* BASE is the start of the range that will next be handed to
              CB with CPP_DO_print; LINE_COUNT is the number of line ends
              seen since BASE, and LINE_START the start of the current
              physical line.  */
5204       const unsigned char *base = buffer->cur;
5205       unsigned line_count = 0;
5206       const unsigned char *line_start = base;
5207
           /* BOL is true while nothing but whitespace and comments has
              been seen on the current line.  RAW is set once a raw string
              prefix has been recognized in front of a '"'.  */
5208       bool bol = true;
5209       bool raw = false;
5210
           /* Low-water mark: the lower bound given to do_peek_prev when
              looking backwards from a quote.  It is moved up past each
              comment and literal once that has been scanned.  */
5211       const unsigned char *lwm = base;
5212       for (const unsigned char *pos = base, *limit = buffer->rlimit;
5213            pos < limit;)
5214         {
5215           unsigned char c = *pos++;
5216           /* This matches the switch in _cpp_lex_direct.  */
5217           switch (c)
5218             {
5219             case ' ': case '\t': case '\f': case '\v':
5220               /* Whitespace, do nothing.  */
5221               break;
5222
5223             case '\r': /* MAC line ending, or Windows \r\n  */
5224               if (*pos == '\n')
5225                 pos++;
5226               /* FALLTHROUGH */
5227
5228             case '\n':
5229               bol = true;
5230
                   /* Also the target for an escaped newline, which counts
                      a line but leaves BOL as it was.  */
5231             next_line:
5232               CPP_INCREMENT_LINE (pfile, 0);
5233               line_count++;
5234               line_start = pos;
5235               break;
5236
5237             case '\\':
5238               /* <backslash><newline> is removed, and doesn't undo any
5239                  preceding escape or whatnot.  */
5240               if (*pos == '\n')
5241                 {
5242                   pos++;
5243                   goto next_line;
5244                 }
5245               else if (*pos == '\r')
5246                 {
5247                   if (pos[1] == '\n')
5248                     pos++;
5249                   pos++;
5250                   goto next_line;
5251                 }
5252               goto dflt;
5253
5254             case '#':
5255               if (bol)
5256                 {
5257                   /* Line directive.  */
                       /* Report the text scanned before the '#', if any.  */
5258                   if (pos - 1 > base && !pfile->state.skipping)
5259                     cb (pfile, CPP_DO_print, data,
5260                         line_count, base, pos - 1 - base);
5261
5262                   /* Prep things for directive handling. */
5263                   buffer->next_line = pos;
5264                   buffer->need_line = true;
5265                   bool ok = _cpp_get_fresh_line (pfile);
5266                   gcc_checking_assert (ok);
5267
5268                   /* Ensure proper column numbering for generated
5269                      error messages. */
5270                   buffer->line_base -= pos - line_start;
5271
                       /* POS is one past the '#', so the second argument is
                          nonzero when the '#' was not the first character
                          after LINE_START.  */
5272                   _cpp_handle_directive (pfile, line_start + 1 != pos);
5273
5274                   /* Sanitize the line settings.  Duplicate #include's can
5275                      mess things up. */
5276                   // FIXME: Necessary?
5277                   pfile->line_table->highest_location
5278                     = pfile->line_table->highest_line;
5279
5280                   if (!pfile->state.skipping
5281                       && pfile->buffer->next_line < pfile->buffer->rlimit)
5282                     cb (pfile, CPP_DO_location, data,
5283                         pfile->line_table->highest_line);
5284
                       /* PFILE->buffer is re-read at restart, so scanning
                          resumes in whichever buffer is current now.  */
5285                   goto restart;
5286                 }
5287               goto dflt;
5288
5289             case '/':
5290               {
5291                 const unsigned char *peek = do_peek_next (pos, limit);
5292                 if (!(*peek == '/' || *peek == '*'))
5293                   goto dflt;
5294
5295                 /* Line or block comment  */
                     /* STAR is true when the previous character of a block
                        comment was a '*' that may begin its terminator.
                        ESC is true when the previous character was a
                        backslash.  */
5296                 bool is_block = *peek == '*';
5297                 bool star = false;
5298                 bool esc = false;
5299                 location_t sloc
5300                   = linemap_position_for_column (pfile->line_table,
5301                                                  pos - line_start);
5302
5303                 while (pos < limit)
5304                   {
5305                     char c = *pos++;
5306                     switch (c)
5307                       {
5308                       case '\\':
5309                         esc = true;
5310                         break;
5311
5312                       case '\r':
5313                         if (*pos == '\n')
5314                           pos++;
5315                         /* FALLTHROUGH  */
5316
5317                       case '\n':
5318                         {
5319                           CPP_INCREMENT_LINE (pfile, 0);
5320                           line_count++;
5321                           line_start = pos;
                               /* An unescaped newline ends a line comment.  */
5322                           if (!esc && !is_block)
5323                             {
5324                               bol = true;
5325                               goto done_comment;
5326                             }
5327                         }
                             /* An escaped newline leaves STAR alone, so a
                                terminator may be split across lines.  */
5328                         if (!esc)
5329                           star = false;
5330                         esc = false;
5331                         break;
5332
5333                       case '*':
                             /* The '*' that opened the comment is at PEEK.
                                It must not also serve as the star of a
                                closing delimiter, or slash-star-slash
                                would be taken as a complete comment.  */
5334                         if (pos - 1 > peek)
5335                           star = is_block;
5336                         esc = false;
5337                         break;
5338
5339                       case '/':
5340                         if (star)
5341                           goto done_comment;
5342                         /* FALLTHROUGH  */
5343
5344                       default:
5345                         star = false;
5346                         esc = false;
5347                         break;
5348                       }
5349                   }
                     /* Reached only when the buffer ran out.  A line comment
                        may end that way; a block comment may not.  */
5350                 if (pos < limit || is_block)
5351                   cpp_error_with_line (pfile, CPP_DL_ERROR, sloc, 0,
5352                                        "unterminated comment");
5353               done_comment:
5354                 lwm = pos;
5355                 break;
5356               }
5357
5358             case '\'':
5359               if (!CPP_OPTION (pfile, digit_separators))
5360                 goto delimited_string;
5361
5362               /* Possibly a number punctuator.  */
5363               if (!ISIDNUM (*do_peek_next (pos, limit)))
5364                 goto delimited_string;
5365
5366               goto quote_peek;
5367
5368             case '\"':
5369               if (!CPP_OPTION (pfile, rliterals))
5370                 goto delimited_string;
5371
5372             quote_peek:
5373               {
5374                 /* For ' see if it's a number punctuator
5375                    \.?<digit>(<digit>|<identifier-nondigit>
5376                    |'<digit>|'<nondigit>|[eEpP]<sign>|\.)* */
5377                 /* For " see if it's a raw string
5378                    {U,L,u,u8}R.  This includes CPP_NUMBER detection,
5379                    because that could be 0e+R.  */
                     /* The characters before the quote are examined from
                        right to left, stopping at LWM.  */
5380                 const unsigned char *peek = pos - 1;
5381                 bool quote_first = c == '"';
5382                 bool quote_eight = false;
5383                 bool maybe_number_start = false;
5384                 bool want_number = false;
5385
5386                 while ((peek = do_peek_prev (peek, lwm)))
5387                   {
5388                     unsigned char p = *peek;
5389                     if (quote_first)
5390                       {
5391                         if (!raw)
5392                           {
5393                             if (p != 'R')
5394                               break;
5395                             raw = true;
5396                             continue;
5397                           }
5398
5399                         quote_first = false;
5400                         if (p == 'L' || p == 'U' || p == 'u')
5401                           ;
5402                         else if (p == '8')
5403                           quote_eight = true;
5404                         else
5405                           goto second_raw;
5406                       }
5407                     else if (quote_eight)
5408                       {
5409                         if (p != 'u')
5410                           {
5411                             raw = false;
5412                             break;
5413                           }
5414                         quote_eight = false;
5415                       }
5416                     else if (c == '"')
5417                       {
5418                       second_raw:;
5419                         if (!want_number && ISIDNUM (p))
5420                           {
5421                             raw = false;
5422                             break;
5423                           }
5424                       }
5425
5426                     if (ISDIGIT (p))
5427                       maybe_number_start = true;
5428                     else if (p == '.')
5429                       want_number = true;
5430                     else if (ISIDNUM (p))
5431                       maybe_number_start = false;
5432                     else if (p == '+' || p == '-')
5433                       {
5434                         if (const unsigned char *peek_prev
5435                             = do_peek_prev (peek, lwm))
5436                           {
5437                             p = *peek_prev;
5438                             if (p == 'e' || p == 'E'
5439                                 || p == 'p' || p == 'P')
5440                               {
5441                                 want_number = true;
5442                                 maybe_number_start = false;
5443                               }
5444                             else
5445                               break;
5446                           }
5447                         else
5448                           break;
5449                       }
5450                     else if (p == '\'' || p == '\"')
5451                       {
5452                         /* If this is lwm, this must be the end of a
5453                            previous string.  So this is a trailing
5454                            literal type, (a) if those are allowed,
5455                              and (b) maybe_start is false.  Otherwise
5456                              this must be a CPP_NUMBER because we've
5457                              met another ', and we'd have checked that
5458                              in its own right.  */
5459                         if (peek == lwm && CPP_OPTION (pfile, uliterals))
5460                           {
5461                             if  (!maybe_number_start && !want_number)
5462                               /* Must be a literal type.  */
5463                               raw = false;
5464                           }
5465                         else if (p == '\''
5466                                  && CPP_OPTION (pfile, digit_separators))
5467                           maybe_number_start = true;
5468                         break;
5469                       }
5470                     else if (c == '\'')
5471                       break;
5472                     else if (!quote_first && !quote_eight)
5473                       break;
5474                   }
5475
5476                 if (maybe_number_start)
5477                   {
5478                     if (c == '\'')
5479                       /* A CPP NUMBER.  */
5480                       goto dflt;
5481                     raw = false;
5482                   }
5483
5484                 goto delimited_string;
5485               }
5486
5487             delimited_string:
5488               {
5489                 /* (Possibly raw) string or char literal.  */
                     /* END is the quote that closes the literal.  DELIM and
                        DELIM_LEN describe a raw string's delimiter.  ESC
                        counts the backslashes immediately before the
                        current character.  */
5490                 unsigned char end = c;
5491                 int delim_len = -1;
5492                 const unsigned char *delim = NULL;
5493                 location_t sloc = linemap_position_for_column (pfile->line_table,
5494                                                                pos - line_start);
5495                 int esc = 0;
5496
5497                 if (raw)
5498                   {
5499                     /* There can be no line breaks in the delimiter.  */
                         /* On a bad delimiter the literal is rescanned from
                            DELIM as an ordinary string.  */
5500                     delim = pos;
5501                     for (delim_len = 0; (c = *pos++) != '('; delim_len++)
5502                       {
5503                         if (delim_len == 16)
5504                           {
5505                             cpp_error_with_line (pfile, CPP_DL_ERROR,
5506                                                  sloc, 0,
5507                                                  "raw string delimiter"
5508                                                  " longer than %d"
5509                                                  " characters",
5510                                                  delim_len);
5511                             raw = false;
5512                             pos = delim;
5513                             break;
5514                           }
5515                         if (strchr (") \\\t\v\f\n", c))
5516                           {
5517                             cpp_error_with_line (pfile, CPP_DL_ERROR,
5518                                                  sloc, 0,
5519                                                  "invalid character '%c'"
5520                                                  " in raw string"
5521                                                  " delimiter", c);
5522                             raw = false;
5523                             pos = delim;
5524                             break;
5525                           }
5526                         if (pos >= limit)
5527                           goto bad_string;
5528                       }
5529                   }
5530
5531                 while (pos < limit)
5532                   {
5533                     char c = *pos++;
5534                     switch (c)
5535                       {
5536                       case '\\':
5537                         if (!raw)
5538                           esc++;
5539                         break;
5540
5541                       case '\r':
5542                         if (*pos == '\n')
5543                           pos++;
5544                         /* FALLTHROUGH  */
5545
5546                       case '\n':
5547                         {
5548                           CPP_INCREMENT_LINE (pfile, 0);
5549                           line_count++;
5550                           line_start = pos;
5551                         }
5552                         if (esc)
5553                           esc--;
5554                         break;
5555
5556                       case ')':
                             /* A raw string ends at ')', the delimiter and
                                the quote.  The quote may be the very last
                                character before LIMIT when the file does
                                not end in a newline, hence <=.  */
5557                         if (raw
5558                             && pos + delim_len + 1 <= limit
5559                             && pos[delim_len] == end
5560                             && !memcmp (delim, pos, delim_len))
5561                           {
5562                             pos += delim_len + 1;
5563                             raw = false;
5564                             goto done_string;
5565                           }
5566                         break;
5567
5568                       default:
                             /* An ordinary literal ends at its quote unless
                                an odd number of backslashes precedes it.  */
5569                         if (!raw && !(esc & 1) && c == end)
5570                           goto done_string;
5571                         esc = 0;
5572                         break;
5573                       }
5574                   }
5575               bad_string:
5576                 cpp_error_with_line (pfile, CPP_DL_ERROR, sloc, 0,
5577                                      "unterminated literal");
5578
5579               done_string:
5580                 raw = false;
5581                 lwm = pos - 1;
5582               }
5583               goto dflt;
5584
5585             case '_':
5586             case 'e':
5587             case 'i':
5588             case 'm':
5589               if (bol && module_p && !pfile->state.skipping
5590                   && do_peek_module (pfile, c, pos, limit))
5591                 {
5592                   /* We've seen the start of a module control line.
5593                      Start up the tokenizer.  */
5594                   pos--; /* Backup over the first character.  */
5595
5596                   /* Backup over whitespace to start of line.  */
5597                   while (pos > line_start
5598                          && (pos[-1] == ' ' || pos[-1] == '\t'))
5599                     pos--;
5600
                       /* Report the text scanned before this line, if any.  */
5601                   if (pos > base)
5602                     cb (pfile, CPP_DO_print, data, line_count, base, pos - base);
5603
5604                   /* Prep things for directive handling. */
5605                   buffer->next_line = pos;
5606                   buffer->need_line = true;
5607
5608                   /* Now get tokens until the PRAGMA_EOL.  */
5609                   do
5610                     {
5611                       location_t spelling;
5612                       const cpp_token *tok
5613                         = cpp_get_token_with_location (pfile, &spelling);
5614
5615                       gcc_assert (pfile->state.in_deferred_pragma
5616                                   || tok->type == CPP_PRAGMA_EOL);
5617                       cb (pfile, CPP_DO_token, data, tok, spelling);
5618                     }
5619                   while (pfile->state.in_deferred_pragma);
5620
5621                   if (pfile->buffer->next_line < pfile->buffer->rlimit)
5622                     cb (pfile, CPP_DO_location, data,
5623                         pfile->line_table->highest_line);
5624
5625                   pfile->mi_valid = false;
5626                   goto restart;
5627                 }
5628               goto dflt;
5629
5630             default:
5631             dflt:
                   /* Anything else: the line is no longer blank.  */
5632               bol = false;
5633               pfile->mi_valid = false;
5634               break;
5635             }
5636         }
5637
           /* The buffer is exhausted: report whatever text remains since
              BASE, then move on to the buffer underneath.  */
5638       if (buffer->rlimit > base && !pfile->state.skipping)
5639         {
5640           const unsigned char *limit = buffer->rlimit;
5641           /* If the file was not newline terminated, add rlimit, which is
5642              guaranteed to point to a newline, to the end of our range.  */
5643           if (limit[-1] != '\n')
5644             {
5645               limit++;
5646               CPP_INCREMENT_LINE (pfile, 0);
5647               line_count++;
5648             }
5649           cb (pfile, CPP_DO_print, data, line_count, base, limit - base);
5650         }
5651
5652       _cpp_pop_buffer (pfile);
5653     }
5654   while (pfile->buffer);
5655 }