libcpp/lex.cc

   1 /* CPP Library - lexical analysis.
   2    Copyright (C) 2000-2022 Free Software Foundation, Inc.
   3    Contributed by Per Bothner, 1994-95.
   4    Based on CCCP program by Paul Rubin, June 1986
   5    Adapted to ANSI C, Richard Stallman, Jan 1987
   6    Broken out to separate file, Zack Weinberg, Mar 2000
   7
   8 This program is free software; you can redistribute it and/or modify it
   9 under the terms of the GNU General Public License as published by the
  10 Free Software Foundation; either version 3, or (at your option) any
  11 later version.
  12
  13 This program is distributed in the hope that it will be useful,
  14 but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16 GNU General Public License for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with this program; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "cpplib.h"
  25 #include "internal.h"
  26
     /* How the spelling of a token type is described by its entry in the
        token_spellings table.  OP () entries always use SPELL_OPERATOR;
        TK () entries select a category through their second argument.  */
  27 enum spell_type
  28 {
  29   SPELL_OPERATOR = 0,
  30   SPELL_IDENT,
  31   SPELL_LITERAL,
  32   SPELL_NONE
  33 };
  34
     /* One row of the token_spellings table.  */
  35 struct token_spelling
  36 {
  37   enum spell_type category;     /* Spelling category of the token type.  */
  38   const unsigned char *name;    /* OP (): the operator's spelling; TK ():
                                       the stringified enumerator name.  */
  39 };
  40
     /* Spellings of the digraph tokens.
        NOTE(review): the entries are presumably indexed relative to an
        enumeration defined outside this view -- confirm before
        reordering them.  */
  41 static const unsigned char *const digraph_spellings[] =
  42 { UC"%:", UC"%:%:", UC"<:", UC":>", UC"<%", UC"%>" };
  43
     /* Build token_spellings by expanding TTYPE_TABLE with local OP and TK
        definitions: OP (e, s) yields a SPELL_OPERATOR entry spelled S, and
        TK (e, s) yields an entry of category SPELL_<s> whose name is the
        stringified enumerator E.  Both macros are undefined again as soon
        as the table has been built.  */
  44 #define OP(e, s) { SPELL_OPERATOR, UC s  },
  45 #define TK(e, s) { SPELL_ ## s,    UC #e },
  46 static const struct token_spelling token_spellings[N_TTYPES] = { TTYPE_TABLE };
  47 #undef OP
  48 #undef TK
  49
     /* Look up, in token_spellings, the spelling category and the name
        recorded for the type of TOKEN (a pointer to a token).  */
  50 #define TOKEN_SPELL(token) (token_spellings[(token)->type].category)
  51 #define TOKEN_NAME(token) (token_spellings[(token)->type].name)
  52
  53 /* ISO 10646 defines the UCS codespace as the range 0-0x10FFFF inclusive.
        UCS_LIMIT is therefore the largest valid code point.  */
  54 #define UCS_LIMIT 0x10FFFF
  55
     /* Prototypes for helpers that are static to this file.  */
  56 static void add_line_note (cpp_buffer *, const uchar *, unsigned int);
  57 static int skip_line_comment (cpp_reader *);
  58 static void skip_whitespace (cpp_reader *, cppchar_t);
  59 static void lex_string (cpp_reader *, cpp_token *, const uchar *);
  60 static void save_comment (cpp_reader *, cpp_token *, const uchar *, cppchar_t);
  61 static void store_comment (cpp_reader *, cpp_token *);
  62 static void create_literal (cpp_reader *, cpp_token *, const uchar *,
  63                             unsigned int, enum cpp_ttype);
  64 static bool warn_in_comment (cpp_reader *, _cpp_line_note *);
  65 static int name_p (cpp_reader *, const cpp_string *);
  66 static tokenrun *next_tokenrun (tokenrun *);
  67
  68 static _cpp_buff *new_buff (size_t);
  69
  70
  71 /* Utility routine: report whether TOKEN is the identifier spelled by
  72    the NUL-terminated string STRING.  Yields 1 when TOKEN is a CPP_NAME
  73    whose node name equals STRING; yields 0 otherwise, which includes
  74    every token that is not a CPP_NAME.  */
  75 int
  76 cpp_ideq (const cpp_token *token, const char *string)
  77 {
  78   const uchar *wanted = (const uchar *) string;
  79
  80   return (token->type == CPP_NAME
  81           && ustrcmp (NODE_NAME (token->val.node.node), wanted) == 0);
  82 }
  83
  84 /* Append a line note of kind TYPE, located at byte POS, to BUFFER's
  85    note array.  A full array is first grown to 2 * capacity + 200.  */
  86 static void
  87 add_line_note (cpp_buffer *buffer, const uchar *pos, unsigned int type)
  88 {
  89   if (buffer->notes_cap == buffer->notes_used)
  90     {
  91       buffer->notes_cap = 2 * buffer->notes_cap + 200;
  92       buffer->notes = XRESIZEVEC (_cpp_line_note, buffer->notes,
  93                                   buffer->notes_cap);
  94     }
  95
  96   _cpp_line_note *slot = &buffer->notes[buffer->notes_used++];
  97   slot->pos = pos;
  98   slot->type = type;
  99 }
 100
 101 \f
 102 /* Fast path to find line special characters using optimized character
 103    scanning algorithms.  Anything complicated falls back to the slow
 104    path below.  Since this loop is very hot it's worth doing these kinds
 105    of optimizations.
 106
 107    One of the paths through the ifdefs should provide
 108
 109      const uchar *search_line_fast (const uchar *s, const uchar *end);
 110
 111    Between S and END, search for \n, \r, \\, ?.  Return a pointer to
 112    the found character.
 113
 114    Note that the last character of the buffer is *always* a newline,
 115    as forced by _cpp_convert_input.  This fact can be used to avoid
 116    explicitly looking for the end of the buffer.  */
 117
 118 /* Configure gives us an ifdef test.  Default the macro to 0 when it is
        not defined so that it can also be used in ordinary conditions such
        as "if (WORDS_BIGENDIAN)", as the helpers below do.  */
 119 #ifndef WORDS_BIGENDIAN
 120 #define WORDS_BIGENDIAN 0
 121 #endif
 122
 123 /* We'd like the largest integer that fits into a register.  There's nothing
 124    in <stdint.h> that gives us that.  For most hosts this is unsigned long,
 125    but MS decided on an LLP64 model.  Thankfully when building with GCC we
 126    can get the "real" word size.  */
 127 #ifdef __GNUC__
 128 typedef unsigned int word_type __attribute__((__mode__(__word__)));
 129 #else
 130 typedef unsigned long word_type;
 131 #endif
 132
 133 /* The code below is only expecting sizes 4 or 8.
 134    Die at compile-time if this expectation is violated.  The array bound
        below evaluates to 1 for an accepted size and to -1, which is not a
        valid array bound, for any other size.  */
 135 typedef char check_word_type_size
 136   [(sizeof(word_type) == 8 || sizeof(word_type) == 4) * 2 - 1];
 137
 138 /* Clear the first N bytes (in memory order) of VAL and return the result.
 139    The cleared bytes become NUL, which is not one of the characters the
 140    scanner searches for, so they can never produce a match.  */
 141 static inline word_type
 142 acc_char_mask_misalign (word_type val, unsigned int n)
 143 {
 144   const word_type all_ones = -1;
 145   const unsigned int nbits = n * 8;
 146
 147   /* The first byte in memory is the most significant one on big-endian
 148      hosts and the least significant one on little-endian hosts.  */
 149   return val & (WORDS_BIGENDIAN ? all_ones >> nbits : all_ones << nbits);
 150 }
 151
 152 /* Return X replicated to all byte positions within WORD_TYPE.  Shifting is
 153    done in WORD_TYPE, not int, so X >= 0x80 cannot reach a sign bit.  */
 154 static inline word_type
 155 acc_char_replicate (uchar x)
 156 {
 157   word_type ret = x;
     /* Double the number of copies at each step: 1 -> 2 -> 4 (-> 8).  The
        last step is written as two 16-bit shifts so that the expression
        stays valid when WORD_TYPE is only 32 bits wide.  */
 158   ret |= ret << 8;
 159   ret |= ret << 16;
 160   if (sizeof(word_type) == 8)
 161     ret |= ret << 16 << 16;
 162   return ret;
 163 }
 164
 165 /* Test whether any byte of VAL equals the matching byte of C.  The
 166    result is non-zero on a (probable) hit: outside the Alpha path it can
 167    be a false positive, so callers confirm it with acc_char_index.  */
 168 static inline word_type
 169 acc_char_cmp (word_type val, word_type c)
 170 {
 171   const word_type diff = val ^ c;
 172 #if defined(__GNUC__) && defined(__alpha__)
 173   /* The compare-bytes instruction gives exact results: it computes
 174      (val == c) per byte as (0 >= diff).  */
 175   return __builtin_alpha_cmpbge (0, diff);
 176 #else
 177   word_type holes = 0x7efefefeU;
 178   if (sizeof(word_type) == 8)
 179     holes = (holes << 16 << 16) | 0xfefefefeU;
 180   holes |= 1;
 181   return ((diff + holes) ^ ~diff) & ~holes;
 182 #endif
 183 }
 184
 185 /* CMP is a non-zero acc_char_cmp result computed from VAL.  Return the
 186    index, in memory order, of the first byte of VAL that is '\n', '\r',
 187    '\\' or '?', or -1 when CMP turned out to be a false positive.  */
 188 static inline int
 189 acc_char_index (word_type cmp ATTRIBUTE_UNUSED,
 190                 word_type val ATTRIBUTE_UNUSED)
 191 {
 192 #if defined(__GNUC__) && defined(__alpha__) && !WORDS_BIGENDIAN
 193   /* cmpbge reports matches as exact per-byte *bits*, so the lowest set
 194      bit of CMP is the answer and no re-check is needed.  */
 195   return __builtin_ctzl (cmp);
 196 #else
 197   /* ??? It would be nice to force unrolling here,
 198      and have all of these constants folded.  */
 199   unsigned int idx = 0;
 200   while (idx < sizeof(word_type))
 201     {
 202       /* Position of byte IDX in VAL, counted from the top on big-endian.  */
 203       const unsigned int byte_no
 204         = WORDS_BIGENDIAN ? sizeof(word_type) - idx - 1 : idx;
 205
 206       switch ((uchar) ((val >> byte_no * 8) & 0xff))
 207         {
 208         case '\n': case '\r': case '\\': case '?':
 209           return idx;
 210         }
 211       ++idx;
 212     }
 213   return -1;
 214 #endif
 215 }
 216
 217 /* Word-at-a-time variant of the fast scanner, built on bit tricks.
 218
 219    Starting at S, return a pointer to the first '\n', '\r', '\\' or '?'.
 220    END is unused: the buffer is known to end in a newline, so a match
 221    always exists.
 222
 223    With 32-bit words the plain approach costs 16 comparisons and 16
 224    branches, against 24 arithmetic operations and one branch here, so the
 225    gain is system dependent.  With 64-bit words twice as many comparisons
 226    and branches are saved for the same arithmetic: almost surely a win.  */
 227
 228 static const uchar * search_line_acc_char (const uchar *, const uchar *)
 229   ATTRIBUTE_UNUSED;
 230
 231 static const uchar *
 232 search_line_acc_char (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 233 {
 234   const word_type nl_word = acc_char_replicate ('\n');
 235   const word_type cr_word = acc_char_replicate ('\r');
 236   const word_type bs_word = acc_char_replicate ('\\');
 237   const word_type qm_word = acc_char_replicate ('?');
 238
 239   /* Round S down to a word boundary, remembering how many bytes of that
 240      first word lie in front of S.  The whole first word is read, which
 241      may include bytes from before the start of the scan.  */
 242   const uintptr_t addr = (uintptr_t)s;
 243   const unsigned int lead = addr & (sizeof(word_type) - 1);
 244   const word_type *wp = (word_type *)(addr & -sizeof(word_type));
 245   word_type chunk = *wp;
 246
 247   /* Bytes in front of S must not be able to match.  */
 248   if (lead)
 249     chunk = acc_char_mask_misalign (chunk, lead);
 250
 251   /* Main loop: examine one word per iteration.  */
 252   for (;; chunk = *++wp)
 253     {
 254       const word_type hits = (acc_char_cmp (chunk, nl_word)
 255                               | acc_char_cmp (chunk, cr_word)
 256                               | acc_char_cmp (chunk, bs_word)
 257                               | acc_char_cmp (chunk, qm_word));
 258
 259       if (__builtin_expect (hits != 0, 0))
 260         {
 261           /* HITS can be a false positive; only a real index counts.  */
 262           const int found = acc_char_index (hits, chunk);
 263           if (found >= 0)
 264             return (const uchar *)wp + found;
 265         }
 266     }
 267 }
 268
 269 /* Disable on Solaris 2/x86 until the following problem can be properly
 270    autoconfed:
 271
 272    The Solaris 10+ assembler tags objects with the instruction set
 273    extensions used, so SSE4.2 executables cannot run on machines that
 274    don't support that extension.  */
 275
 276 #if (GCC_VERSION >= 4005) && (__GNUC__ >= 5 || !defined(__PIC__)) && (defined(__i386__) || defined(__x86_64__)) && !(defined(__sun__) && defined(__svr4__))
 277
 278 /* Replicated character data to be shared between implementations.
 279    Recall that outside of a context with vector support we can't
 280    define compatible vector types, therefore these are all defined
 281    in terms of raw characters.

        Row 0 holds '\n', row 1 '\r', row 2 '\\' and row 3 '?'.  The MMX
        scanner reads the first 8 bytes of a row and the SSE2 scanner all
        16, both through vector-typed pointers, hence the 16-byte
        alignment.  */
 282 static const char repl_chars[4][16] __attribute__((aligned(16))) = {
 283   { '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
 284     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n' },
 285   { '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
 286     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r' },
 287   { '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
 288     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\' },
 289   { '?', '?', '?', '?', '?', '?', '?', '?',
 290     '?', '?', '?', '?', '?', '?', '?', '?' },
 291 };
 292
 293 /* A version of the fast scanner using MMX vectorized byte compare insns.
 294
 295    This uses the PMOVMSKB instruction which was introduced with "MMX2",
 296    which was packaged into SSE1; it is also present in the AMD MMX
 297    extension.  Mark the function as using "sse" so that we emit a real
 298    "emms" instruction, rather than the 3dNOW "femms" instruction.

        Starting at S, returns a pointer to the first '\n', '\r', '\\' or
        '?'.  END is unused; the loop only stops at a match.  */
 299
 300 static const uchar *
 301 #ifndef __SSE__
 302 __attribute__((__target__("sse")))
 303 #endif
 304 search_line_mmx (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 305 {
 306   typedef char v8qi __attribute__ ((__vector_size__ (8)));
 307   typedef int __m64 __attribute__ ((__vector_size__ (8), __may_alias__));
 308
       /* Only the first 8 bytes of each 16-byte repl_chars row are used.  */
 309   const v8qi repl_nl = *(const v8qi *)repl_chars[0];
 310   const v8qi repl_cr = *(const v8qi *)repl_chars[1];
 311   const v8qi repl_bs = *(const v8qi *)repl_chars[2];
 312   const v8qi repl_qm = *(const v8qi *)repl_chars[3];
 313
 314   unsigned int misalign, found, mask;
 315   const v8qi *p;
 316   v8qi data, t, c;
 317
 318   /* Align the source pointer.  While MMX doesn't generate unaligned data
 319      faults, this allows us to safely scan to the end of the buffer without
 320      reading beyond the end of the last page.  */
 321   misalign = (uintptr_t)s & 7;
 322   p = (const v8qi *)((uintptr_t)s & -8);
 323   data = *p;
 324
 325   /* Create a mask for the bytes that are valid within the first
 326      8-byte block.  The idea here is that the AND with the mask
 327      within the loop is "free", since we need some AND or TEST
 328      insn in order to set the flags for the branch anyway.  */
 329   mask = -1u << misalign;
 330
 331   /* Main loop processing 8 bytes at a time.  The first block is already
          loaded, so enter at START and skip the load at the loop head.  */
 332   goto start;
 333   do
 334     {
 335       data = *++p;
 336       mask = -1;                /* Every byte of a later block is valid.  */
 337
 338     start:
           /* OR together the per-byte equality masks for the four
              characters, then collapse them into one bit per byte.  */
 339       t = __builtin_ia32_pcmpeqb(data, repl_nl);
 340       c = __builtin_ia32_pcmpeqb(data, repl_cr);
 341       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
 342       c = __builtin_ia32_pcmpeqb(data, repl_bs);
 343       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
 344       c = __builtin_ia32_pcmpeqb(data, repl_qm);
 345       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
 346       found = __builtin_ia32_pmovmskb (t);
 347       found &= mask;
 348     }
 349   while (!found);
 350
 351   __builtin_ia32_emms ();
 352
 353   /* FOUND contains 1 in bits for which we matched a relevant
 354      character.  Conversion to the byte index is trivial.  */
 355   found = __builtin_ctz(found);
 356   return (const uchar *)p + found;
 357 }
 358
 359 /* A version of the fast scanner using SSE2 vectorized byte compare insns.
        Starting at S, returns a pointer to the first '\n', '\r', '\\' or
        '?'.  END is unused; the loop only stops at a match.  */
 360
 361 static const uchar *
 362 #ifndef __SSE2__
 363 __attribute__((__target__("sse2")))
 364 #endif
 365 search_line_sse2 (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 366 {
 367   typedef char v16qi __attribute__ ((__vector_size__ (16)));
 368
 369   const v16qi repl_nl = *(const v16qi *)repl_chars[0];
 370   const v16qi repl_cr = *(const v16qi *)repl_chars[1];
 371   const v16qi repl_bs = *(const v16qi *)repl_chars[2];
 372   const v16qi repl_qm = *(const v16qi *)repl_chars[3];
 373
 374   unsigned int misalign, found, mask;
 375   const v16qi *p;
 376   v16qi data, t;
 377
 378   /* Align the source pointer.  The first load therefore starts at or
          before S and may include bytes that precede it.  */
 379   misalign = (uintptr_t)s & 15;
 380   p = (const v16qi *)((uintptr_t)s & -16);
 381   data = *p;
 382
 383   /* Create a mask for the bytes that are valid within the first
 384      16-byte block.  The idea here is that the AND with the mask
 385      within the loop is "free", since we need some AND or TEST
 386      insn in order to set the flags for the branch anyway.  */
 387   mask = -1u << misalign;
 388
 389   /* Main loop processing 16 bytes at a time.  The first block is already
          loaded, so enter at START and skip the load at the loop head.  */
 390   goto start;
 391   do
 392     {
 393       data = *++p;
 394       mask = -1;                /* Every byte of a later block is valid.  */
 395
 396     start:
           /* Vector comparisons give a per-byte mask; PMOVMSKB collapses
              the combined mask into one bit per byte.  */
 397       t  = data == repl_nl;
 398       t |= data == repl_cr;
 399       t |= data == repl_bs;
 400       t |= data == repl_qm;
 401       found = __builtin_ia32_pmovmskb128 (t);
 402       found &= mask;
 403     }
 404   while (!found);
 405
 406   /* FOUND contains 1 in bits for which we matched a relevant
 407      character.  Conversion to the byte index is trivial.  */
 408   found = __builtin_ctz(found);
 409   return (const uchar *)p + found;
 410 }
 411
 412 #ifdef HAVE_SSE4
 413 /* A version of the fast scanner using SSE 4.2 vectorized string insns.
        Starting at S, returns a pointer to the first '\n', '\r', '\\' or
        '?'.  END is consulted only to tell whether fewer than 16 bytes
        remain when S is not 16-byte aligned.  */
 414
 415 static const uchar *
 416 #ifndef __SSE4_2__
 417 __attribute__((__target__("sse4.2")))
 418 #endif
 419 search_line_sse42 (const uchar *s, const uchar *end)
 420 {
 421   typedef char v16qi __attribute__ ((__vector_size__ (16)));
       /* The four characters searched for; the other lanes are zero and are
          excluded by the explicit length 4 passed to PCMPESTRI.  */
 422   static const v16qi search = { '\n', '\r', '?', '\\' };
 423
 424   uintptr_t si = (uintptr_t)s;
 425   uintptr_t index;
 426
 427   /* Check for unaligned input.  */
 428   if (si & 15)
 429     {
 430       v16qi sv;
 431
 432       if (__builtin_expect (end - s < 16, 0)
 433           && __builtin_expect ((si & 0xfff) > 0xff0, 0))
 434         {
 435           /* There are less than 16 bytes left in the buffer, and less
 436              than 16 bytes left on the page.  Reading 16 bytes at this
 437              point might generate a spurious page fault.  Defer to the
 438              SSE2 implementation, which already handles alignment.  */
 439           return search_line_sse2 (s, end);
 440         }
 441
 442       /* ??? The builtin doesn't understand that the PCMPESTRI read from
 443          memory need not be aligned.  */
 444       sv = __builtin_ia32_loaddqu ((const char *) s);
 445       index = __builtin_ia32_pcmpestri128 (search, 4, sv, 16, 0);
 446
           /* An index below 16 is the offset of a match within SV.  */
 447       if (__builtin_expect (index < 16, 0))
 448         goto found;
 449
 450       /* Advance the pointer to an aligned address.  We will re-scan a
 451          few bytes, but we no longer need care for reading past the
 452          end of a page, since we're guaranteed a match.  */
 453       s = (const uchar *)((si + 15) & -16);
 454     }
 455
 456   /* Main loop, processing 16 bytes at a time.  */
 457 #ifdef __GCC_ASM_FLAG_OUTPUTS__
 458   while (1)
 459     {
 460       char f;
 461
 462       /* By using inline assembly instead of the builtin,
 463          we can use the result, as well as the flags set.  F receives
              the carry flag, which is set once a match has been found.  */
 464       __asm ("%vpcmpestri\t$0, %2, %3"
 465              : "=c"(index), "=@ccc"(f)
 466              : "m"(*s), "x"(search), "a"(4), "d"(16));
 467       if (f)
 468         break;
 469
 470       s += 16;
 471     }
 472 #else
       /* The assembly loop adds 16 before each compare, so start one block
          early.  */
 473   s -= 16;
 474   /* By doing the whole loop in inline assembly,
 475      we can make proper use of the flags set.  */
 476   __asm (      ".balign 16\n"
 477         "0:     add $16, %1\n"
 478         "       %vpcmpestri\t$0, (%1), %2\n"
 479         "       jnc 0b"
 480         : "=&c"(index), "+r"(s)
 481         : "x"(search), "a"(4), "d"(16));
 482 #endif
 483
 484  found:
       /* INDEX is the offset of the match within the block starting at S.  */
 485   return s + index;
 486 }
 487
 488 #else
 489 /* Work around out-dated assemblers without sse4 support.  Without
        HAVE_SSE4 every use of search_line_sse42 resolves to the SSE2
        scanner.  */
 490 #define search_line_sse42 search_line_sse2
 491 #endif
 492
 493 /* Check the CPU capabilities.  */
 494
 495 #include "../gcc/config/i386/cpuid.h"
 496
     /* Common signature of the scanners above, and the pointer through
        which the selected one is called.  The pointer is assigned by
        init_vectorized_lexer.  */
 497 typedef const uchar * (*search_line_fast_type) (const uchar *, const uchar *);
 498 static search_line_fast_type search_line_fast;
 499
 500 #define HAVE_init_vectorized_lexer 1
     /* Choose the scanner that search_line_fast will call, combining what
        the compiler flags guarantee with what CPUID reports at run time.
        search_line_acc_char is kept when no vector extension is usable.  */
 501 static inline void
 502 init_vectorized_lexer (void)
 503 {
 504   unsigned dummy, ecx = 0, edx = 0;
 505   search_line_fast_type impl = search_line_acc_char;
       /* Level guaranteed at compile time: 3 SSE4.2, 2 SSE2, 1 SSE, 0 none.  */
 506   int minimum = 0;
 507
 508 #if defined(__SSE4_2__)
 509   minimum = 3;
 510 #elif defined(__SSE2__)
 511   minimum = 2;
 512 #elif defined(__SSE__)
 513   minimum = 1;
 514 #endif
 515
 516   if (minimum == 3)
 517     impl = search_line_sse42;
 518   else if (__get_cpuid (1, &dummy, &dummy, &ecx, &edx) || minimum == 2)
 519     {
           /* MINIMUM is below 3 here, so SSE4.2 depends on CPUID alone.  If
              leaf 1 was unavailable, ECX and EDX are still 0 and the
              compile-time guarantee of SSE2 decides.  */
 520       if (ecx & bit_SSE4_2)
 521         impl = search_line_sse42;
 522       else if (minimum == 2 || (edx & bit_SSE2))
 523         impl = search_line_sse2;
 524       else if (minimum == 1 || (edx & bit_SSE))
 525         impl = search_line_mmx;
 526     }
       /* Reached only when leaf 1 is unavailable and SSE2 is not guaranteed:
          consult the extended leaf, requiring both feature bits.  */
 527   else if (__get_cpuid (0x80000001, &dummy, &dummy, &dummy, &edx))
 528     {
 529       if (minimum == 1
 530           || (edx & (bit_MMXEXT | bit_CMOV)) == (bit_MMXEXT | bit_CMOV))
 531         impl = search_line_mmx;
 532     }
 533
 534   search_line_fast = impl;
 535 }
 536
 537 #elif (GCC_VERSION >= 4005) && defined(_ARCH_PWR8) && defined(__ALTIVEC__)
 538
 539 /* A version of the fast scanner using AltiVec vectorized byte compares
 540    and VSX unaligned loads (when VSX is available).  This is otherwise
 541    the same as the AltiVec version.

        Starting at S, returns a pointer to the first '\n', '\r', '\\' or
        '?'.  END is unused; the loop only stops at a match.  */
 542
 543 ATTRIBUTE_NO_SANITIZE_UNDEFINED
 544 static const uchar *
 545 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 546 {
 547   typedef __attribute__((altivec(vector))) unsigned char vc;
 548
 549   const vc repl_nl = {
 550     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
 551     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
 552   };
 553   const vc repl_cr = {
 554     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
 555     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
 556   };
 557   const vc repl_bs = {
 558     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
 559     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
 560   };
 561   const vc repl_qm = {
 562     '?', '?', '?', '?', '?', '?', '?', '?',
 563     '?', '?', '?', '?', '?', '?', '?', '?',
 564   };
 565   const vc zero = { 0 };
 566
 567   vc data, t;
 568
 569   /* Main loop processing 16 bytes at a time.  */
 570   do
 571     {
 572       vc m_nl, m_cr, m_bs, m_qm;
 573
           /* Load directly from S, without aligning it first.  */
 574       data = __builtin_vec_vsx_ld (0, s);
 575       s += 16;
 576
 577       m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
 578       m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
 579       m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
 580       m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
 581       t = (m_nl | m_cr) | (m_bs | m_qm);
 582
 583       /* T now contains 0xff in bytes for which we matched one of the relevant
 584          characters.  We want to exit the loop if any byte in T is non-zero.
 585          Below is the expansion of vec_any_ne(t, zero).  */
 586     }
 587   while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));
 588
 589   /* Restore s to point to the 16 bytes we just processed.  */
 590   s -= 16;
 591
 592   {
 593 #define N  (sizeof(vc) / sizeof(long))
 594
 595     union {
 596       vc v;
 597       /* Statically assert that N is 2 or 4.  */
 598       unsigned long l[(N == 2 || N == 4) ? N : -1];
 599     } u;
 600     unsigned long l, i = 0;
 601
 602     u.v = t;
 603
 604     /* Find the first word of T that is non-zero.  S is advanced past each
            all-zero word so that it ends up pointing at the chosen word.  */
 605     switch (N)
 606       {
 607       case 4:
 608         l = u.l[i++];
 609         if (l != 0)
 610           break;
 611         s += sizeof(unsigned long);
 612         l = u.l[i++];
 613         if (l != 0)
 614           break;
 615         s += sizeof(unsigned long);
 616         /* FALLTHRU */
 617       case 2:
 618         l = u.l[i++];
 619         if (l != 0)
 620           break;
 621         s += sizeof(unsigned long);
 622         l = u.l[i];
 623       }
 624
 625     /* L now contains 0xff in bytes for which we matched one of the
 626        relevant characters.  We can find the byte index by finding
 627        its bit index and dividing by 8.  The first byte in memory is the
            most significant one on big-endian and the least significant one
            on little-endian, hence the choice between clz and ctz.  */
 628 #ifdef __BIG_ENDIAN__
 629     l = __builtin_clzl(l) >> 3;
 630 #else
 631     l = __builtin_ctzl(l) >> 3;
 632 #endif
 633     return s + l;
 634
 635 #undef N
 636   }
 637 }
 638
 639 #elif (GCC_VERSION >= 4005) && defined(__ALTIVEC__) && defined (__BIG_ENDIAN__)
 640
 641 /* A version of the fast scanner using AltiVec vectorized byte compares.
 642    This cannot be used for little endian because vec_lvsl/lvsr are
 643    deprecated for little endian and the code won't work properly.

        Starting at S, returns a pointer to the first '\n', '\r', '\\' or
        '?'.  END is unused; the loop only stops at a match.  */
 644 /* ??? Unfortunately, attribute(target("altivec")) is not yet supported,
 645    so we can't compile this function without -maltivec on the command line
 646    (or implied by some other switch).  */
 647
 648 static const uchar *
 649 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 650 {
 651   typedef __attribute__((altivec(vector))) unsigned char vc;
 652
 653   const vc repl_nl = {
 654     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
 655     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
 656   };
 657   const vc repl_cr = {
 658     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
 659     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
 660   };
 661   const vc repl_bs = {
 662     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
 663     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
 664   };
 665   const vc repl_qm = {
 666     '?', '?', '?', '?', '?', '?', '?', '?',
 667     '?', '?', '?', '?', '?', '?', '?', '?',
 668   };
 669   const vc ones = {
 670     -1, -1, -1, -1, -1, -1, -1, -1,
 671     -1, -1, -1, -1, -1, -1, -1, -1,
 672   };
 673   const vc zero = { 0 };
 674
 675   vc data, mask, t;
 676
 677   /* Altivec loads automatically mask addresses with -16.  This lets us
 678      issue the first load as early as possible.  */
 679   data = __builtin_vec_ld(0, (const vc *)s);
 680
 681   /* Discard bytes before the beginning of the buffer.  Do this by
 682      beginning with all ones and shifting in zeros according to the
 683      mis-alignment.  The LVSR instruction pulls the exact shift we
 684      want from the address.  */
 685   mask = __builtin_vec_lvsr(0, s);
 686   mask = __builtin_vec_perm(zero, ones, mask);
 687   data &= mask;
 688
 689   /* While altivec loads mask addresses, we still need to align S so
 690      that the offset we compute at the end is correct.  */
 691   s = (const uchar *)((uintptr_t)s & -16);
 692
 693   /* Main loop processing 16 bytes at a time.  The first block is already
          loaded and masked, so enter at START and skip the load.  */
 694   goto start;
 695   do
 696     {
 697       vc m_nl, m_cr, m_bs, m_qm;
 698
 699       s += 16;
 700       data = __builtin_vec_ld(0, (const vc *)s);
 701
 702     start:
 703       m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
 704       m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
 705       m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
 706       m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
 707       t = (m_nl | m_cr) | (m_bs | m_qm);
 708
 709       /* T now contains 0xff in bytes for which we matched one of the relevant
 710          characters.  We want to exit the loop if any byte in T is non-zero.
 711          Below is the expansion of vec_any_ne(t, zero).  */
 712     }
 713   while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));
 714
 715   {
 716 #define N  (sizeof(vc) / sizeof(long))
 717
 718     union {
 719       vc v;
 720       /* Statically assert that N is 2 or 4.  */
 721       unsigned long l[(N == 2 || N == 4) ? N : -1];
 722     } u;
 723     unsigned long l, i = 0;
 724
 725     u.v = t;
 726
 727     /* Find the first word of T that is non-zero.  S is advanced past each
            all-zero word so that it ends up pointing at the chosen word.  */
 728     switch (N)
 729       {
 730       case 4:
 731         l = u.l[i++];
 732         if (l != 0)
 733           break;
 734         s += sizeof(unsigned long);
 735         l = u.l[i++];
 736         if (l != 0)
 737           break;
 738         s += sizeof(unsigned long);
 739         /* FALLTHROUGH */
 740       case 2:
 741         l = u.l[i++];
 742         if (l != 0)
 743           break;
 744         s += sizeof(unsigned long);
 745         l = u.l[i];
 746       }
 747
 748     /* L now contains 0xff in bytes for which we matched one of the
 749        relevant characters.  We can find the byte index by finding
 750        its bit index and dividing by 8.  This variant is big-endian
            only, so the first byte in memory is the most significant and
            clz is used.  */
 751     l = __builtin_clzl(l) >> 3;
 752     return s + l;
 753
 754 #undef N
 755   }
 756 }
 757
 758 #elif defined (__ARM_NEON) && defined (__ARM_64BIT_STATE)
 759 #include "arm_neon.h"
 760
 761 /* This doesn't have to be the exact page size, but no system may use
 762    a size smaller than this.  ARMv8 requires a minimum page size of
 763    4k.  The impact of being conservative here is a small number of
 764    cases will take the slightly slower entry path into the main
 765    loop.  */
 766
 767 #define AARCH64_MIN_PAGE_SIZE 4096
 768
     /* AArch64 Advanced SIMD version of the fast scanner.  Starting at S,
        returns a pointer to the first '\n', '\r', '\\' or '?'.  END is
        unused; the loop only stops at a match.  */
 769 static const uchar *
 770 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 771 {
 772   const uint8x16_t repl_nl = vdupq_n_u8 ('\n');
 773   const uint8x16_t repl_cr = vdupq_n_u8 ('\r');
 774   const uint8x16_t repl_bs = vdupq_n_u8 ('\\');
 775   const uint8x16_t repl_qm = vdupq_n_u8 ('?');
       /* Gives each of the 8 bytes in a 64-bit half its own bit.  */
 776   const uint8x16_t xmask = (uint8x16_t) vdupq_n_u64 (0x8040201008040201ULL);
 777
       /* Moves the bits of one 64-bit half up by 8 so that the two halves
          can be summed into a single 16-bit mask.  */
 778 #ifdef __ARM_BIG_ENDIAN
 779   const int16x8_t shift = {8, 8, 8, 8, 0, 0, 0, 0};
 780 #else
 781   const int16x8_t shift = {0, 0, 0, 0, 8, 8, 8, 8};
 782 #endif
 783
 784   unsigned int found;
 785   const uint8_t *p;
 786   uint8x16_t data;
 787   uint8x16_t t;
 788   uint16x8_t m;
 789   uint8x16_t u, v, w;
 790
 791   /* Align the source pointer.  */
 792   p = (const uint8_t *)((uintptr_t)s & -16);
 793
 794   /* Assuming random string start positions, with a 4k page size we'll take
 795      the slow path about 0.37% of the time.  */
 796   if (__builtin_expect ((AARCH64_MIN_PAGE_SIZE
 797                          - (((uintptr_t) s) & (AARCH64_MIN_PAGE_SIZE - 1)))
 798                         < 16, 0))
 799     {
 800       /* Slow path: the string starts near a possible page boundary.  Load
              the aligned block at P instead and mask off the bytes that
              precede S.  */
 801       uint32_t misalign, mask;
 802
 803       misalign = (uintptr_t)s & 15;
 804       mask = (-1u << misalign) & 0xffff;
 805       data = vld1q_u8 (p);
 806       t = vceqq_u8 (data, repl_nl);
 807       u = vceqq_u8 (data, repl_cr);
 808       v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
 809       w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
 810       t = vorrq_u8 (v, w);
 811       t = vandq_u8 (t, xmask);
 812       m = vpaddlq_u8 (t);
 813       m = vshlq_u16 (m, shift);
 814       found = vaddvq_u16 (m);
 815       found &= mask;
 816       if (found)
 817         return (const uchar*)p + __builtin_ctz (found);
 818     }
 819   else
 820     {
           /* Fast path: an unaligned 16-byte load straight from S.  */
 821       data = vld1q_u8 ((const uint8_t *) s);
 822       t = vceqq_u8 (data, repl_nl);
 823       u = vceqq_u8 (data, repl_cr);
 824       v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
 825       w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
 826       t = vorrq_u8 (v, w);
 827       if (__builtin_expect (vpaddd_u64 ((uint64x2_t)t) != 0, 0))
 828         goto done;
 829     }
 830
       /* Main loop: aligned 16-byte blocks, starting with the one after P.  */
 831   do
 832     {
 833       p += 16;
 834       data = vld1q_u8 (p);
 835       t = vceqq_u8 (data, repl_nl);
 836       u = vceqq_u8 (data, repl_cr);
 837       v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
 838       w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
 839       t = vorrq_u8 (v, w);
 840     } while (!vpaddd_u64 ((uint64x2_t)t));
 841
 842 done:
 843   /* Now that we've found the terminating substring, work out precisely where
 844      we need to stop.  T is collapsed into a 16-bit mask with one bit per
          byte.  When P is still below S the match came from the unaligned
          load at S, so the offset is relative to S rather than to P.  */
 845   t = vandq_u8 (t, xmask);
 846   m = vpaddlq_u8 (t);
 847   m = vshlq_u16 (m, shift);
 848   found = vaddvq_u16 (m);
 849   return (((((uintptr_t) p) < (uintptr_t) s) ? s : (const uchar *)p)
 850           + __builtin_ctz (found));
 851 }
 852
 853 #elif defined (__ARM_NEON)
 854 #include "arm_neon.h"
 855
 856 static const uchar *
 857 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 858 {
       /* NEON version of the fast scanner.  Starting at S, returns a
          pointer to the first '\n', '\r', '\\' or '?'.  END is unused; the
          loop only stops at a match.  */
 859   const uint8x16_t repl_nl = vdupq_n_u8 ('\n');
 860   const uint8x16_t repl_cr = vdupq_n_u8 ('\r');
 861   const uint8x16_t repl_bs = vdupq_n_u8 ('\\');
 862   const uint8x16_t repl_qm = vdupq_n_u8 ('?');
       /* Gives each of the 8 bytes in a 64-bit half its own bit.  */
 863   const uint8x16_t xmask = (uint8x16_t) vdupq_n_u64 (0x8040201008040201ULL);
 864
 865   unsigned int misalign, found, mask;
 866   const uint8_t *p;
 867   uint8x16_t data;
 868
 869   /* Align the source pointer.  */
 870   misalign = (uintptr_t)s & 15;
 871   p = (const uint8_t *)((uintptr_t)s & -16);
 872   data = vld1q_u8 (p);
 873
 874   /* Create a mask for the bytes that are valid within the first
 875      16-byte block.  The idea here is that the AND with the mask
 876      within the loop is "free", since we need some AND or TEST
 877      insn in order to set the flags for the branch anyway.  */
 878   mask = (-1u << misalign) & 0xffff;
 879
 880   /* Main loop, processing 16 bytes at a time.  The first block is already
          loaded, so enter at START and skip the load at the loop head.  */
 881   goto start;
 882
 883   do
 884     {
 885       uint8x8_t l;
 886       uint16x4_t m;
 887       uint32x2_t n;
 888       uint8x16_t t, u, v, w;
 889
 890       p += 16;
 891       data = vld1q_u8 (p);
 892       mask = 0xffff;            /* Every byte of a later block is valid.  */
 893
 894     start:
 895       t = vceqq_u8 (data, repl_nl);
 896       u = vceqq_u8 (data, repl_cr);
 897       v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
 898       w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
 899       t = vandq_u8 (vorrq_u8 (v, w), xmask);
           /* Reduce the per-byte bits with pairwise additions.  */
 900       l = vpadd_u8 (vget_low_u8 (t), vget_high_u8 (t));
 901       m = vpaddl_u8 (l);
 902       n = vpaddl_u16 (m);
 903
           /* Merge the two 32-bit lanes of N, the second shifted so that it
              lands 8 bits above the first, and keep the valid bits.  */
 904       found = vget_lane_u32 ((uint32x2_t) vorr_u64 ((uint64x1_t) n,
 905               vshr_n_u64 ((uint64x1_t) n, 24)), 0);
 906       found &= mask;
 907     }
 908   while (!found);
 909
 910   /* FOUND contains 1 in bits for which we matched a relevant
 911      character.  Conversion to the byte index is trivial.  */
 912   found = __builtin_ctz (found);
 913   return (const uchar *)p + found;
 914 }
 915
 916 #else
 917
/* No vectorized implementation was selected above, so
   search_line_acc_char is the only accelerated alternative.  Make
   search_line_fast a plain alias for it: calls then go directly to
   that function, which encourages inlining.  */

#define search_line_fast  search_line_acc_char
 922
 923 #endif
 924
 925 /* Initialize the lexer if needed.  */
 926
 927 void
 928 _cpp_init_lexer (void)
 929 {
 930 #ifdef HAVE_init_vectorized_lexer
 931   init_vectorized_lexer ();
 932 #endif
 933 }
 934
/* Returns with a logical line that contains no escaped newlines or
   trigraphs.  This is a time-critical inner loop.

   The line starting at PFILE->buffer->next_line is cleaned in place:
   escaped newlines are spliced out and, when the trigraphs option is
   enabled, trigraphs are replaced by the character they stand for.  A
   line note is recorded for every splice and every trigraph seen.  On
   return the cleaned line ends in '\n', a sentinel note follows the
   recorded ones, and buffer->next_line points just past the consumed
   input.  */
void
_cpp_clean_line (cpp_reader *pfile)
{
  cpp_buffer *buffer;
  const uchar *s;
  uchar c, *d, *p;

  /* S is the read position in the raw input; D (once set) is the write
     position in the cleaned line; P is a scratch pointer.  */
  buffer = pfile->buffer;
  /* Start a fresh set of line notes for this line.  */
  buffer->cur_note = buffer->notes_used = 0;
  buffer->cur = buffer->line_base = buffer->next_line;
  buffer->need_line = false;
  s = buffer->next_line;

  if (!buffer->from_stage3)
    {
      /* The most recent backslash seen on the fast path, if any.  */
      const uchar *pbackslash = NULL;

      /* Fast path.  This is the common case of an un-escaped line with
         no trigraphs.  The primary win here is by not writing any
         data back to memory until we have to.  */
      while (1)
        {
          /* Perform an optimized search for \n, \r, \\, ?.  */
          s = search_line_fast (s, buffer->rlimit);

          c = *s;
          if (c == '\\')
            {
              /* Record the location of the backslash and continue.  */
              pbackslash = s++;
            }
          else if (__builtin_expect (c == '?', 0))
            {
              if (__builtin_expect (s[1] == '?', false)
                   && _cpp_trigraph_map[s[2]])
                {
                  /* Have a trigraph.  We may or may not have to convert
                     it.  Add a line note regardless, for -Wtrigraphs.  */
                  add_line_note (buffer, s, s[2]);
                  if (CPP_OPTION (pfile, trigraphs))
                    {
                      /* We do, and that means we have to switch to the
                         slow path.  Overwrite the first '?' with the
                         replacement character and leave S on the last
                         byte of the trigraph, so that the slow path
                         resumes with the byte after it.  */
                      d = (uchar *) s;
                      *d = _cpp_trigraph_map[s[2]];
                      s += 2;
                      goto slow_path;
                    }
                }
              /* Not a trigraph.  Continue on fast-path.  */
              s++;
            }
          else
            break;
        }

      /* This must be \r or \n.  We're either done, or we'll be forced
         to write back to the buffer and continue on the slow path.  */
      d = (uchar *) s;

      if (__builtin_expect (s == buffer->rlimit, false))
        goto done;

      /* DOS line ending? */
      if (__builtin_expect (c == '\r', false) && s[1] == '\n')
        {
          s++;
          if (s == buffer->rlimit)
            goto done;
        }

      /* Without a backslash on the line there can be no escaped
         newline.  */
      if (__builtin_expect (pbackslash == NULL, true))
        goto done;

      /* Check for escaped newline.  Step back over any horizontal
         whitespace before the line ending; the newline is escaped only
         if the byte before that is the last backslash recorded.  */
      p = d;
      while (is_nvspace (p[-1]))
        p--;
      if (p - 1 != pbackslash)
        goto done;

      /* Have an escaped newline; process it and proceed to
         the slow path.  The note type is ' ' when whitespace separated
         the backslash from the newline, '\\' otherwise.  D is set so
         that the next write (which pre-increments D) lands on the
         backslash, and next_line marks where the spliced-in text
         begins.  */
      add_line_note (buffer, p - 1, p != d ? ' ' : '\\');
      d = p - 2;
      buffer->next_line = p - 1;

    slow_path:
      /* Slow path: copy the input byte by byte from S to D, which now
         trails S, splicing further escaped newlines and converting
         further trigraphs on the way.  */
      while (1)
        {
          c = *++s;
          *++d = c;

          if (c == '\n' || c == '\r')
            {
              /* Handle DOS line endings.  */
              if (c == '\r' && s != buffer->rlimit && s[1] == '\n')
                s++;
              if (s == buffer->rlimit)
                break;

              /* Escaped?  Look back over horizontal whitespace for a
                 backslash, but never further than the start of the
                 most recently spliced-in text.  */
              p = d;
              while (p != buffer->next_line && is_nvspace (p[-1]))
                p--;
              if (p == buffer->next_line || p[-1] != '\\')
                break;

              /* Another escaped newline: note it and rewind D so that
                 the following text overwrites the backslash.  */
              add_line_note (buffer, p - 1, p != d ? ' ': '\\');
              d = p - 2;
              buffer->next_line = p - 1;
            }
          else if (c == '?' && s[1] == '?' && _cpp_trigraph_map[s[2]])
            {
              /* Add a note regardless, for the benefit of -Wtrigraphs.  */
              add_line_note (buffer, d, s[2]);
              if (CPP_OPTION (pfile, trigraphs))
                {
                  /* Replace the '?' just copied with the character the
                     trigraph stands for and skip its other two
                     bytes.  */
                  *d = _cpp_trigraph_map[s[2]];
                  s += 2;
                }
            }
        }
    }
  else
    {
      /* For a from_stage3 buffer no trigraph or escaped-newline
         processing is done; just find the end of the line.  */
      while (*s != '\n' && *s != '\r')
        s++;
      d = (uchar *) s;

      /* Handle DOS line endings.  */
      if (*s == '\r' && s + 1 != buffer->rlimit && s[1] == '\n')
        s++;
    }

 done:
  /* Terminate the cleaned line with '\n', whatever the original line
     ending was.  */
  *d = '\n';
  /* A sentinel note that should never be processed.  */
  add_line_note (buffer, d + 1, '\n');
  buffer->next_line = s + 1;
}
1078
1079 template <bool lexing_raw_string>
1080 static bool get_fresh_line_impl (cpp_reader *pfile);
1081
1082 /* Return true if the trigraph indicated by NOTE should be warned
1083    about in a comment.  */
1084 static bool
1085 warn_in_comment (cpp_reader *pfile, _cpp_line_note *note)
1086 {
1087   const uchar *p;
1088
1089   /* Within comments we don't warn about trigraphs, unless the
1090      trigraph forms an escaped newline, as that may change
1091      behavior.  */
1092   if (note->type != '/')
1093     return false;
1094
1095   /* If -trigraphs, then this was an escaped newline iff the next note
1096      is coincident.  */
1097   if (CPP_OPTION (pfile, trigraphs))
1098     return note[1].pos == note->pos;
1099
1100   /* Otherwise, see if this forms an escaped newline.  */
1101   p = note->pos + 3;
1102   while (is_nvspace (*p))
1103     p++;
1104
1105   /* There might have been escaped newlines between the trigraph and the
1106      newline we found.  Hence the position test.  */
1107   return (*p == '\n' && p < note[1].pos);
1108 }
1109
1110 /* Process the notes created by add_line_note as far as the current
1111    location.  */
1112 void
1113 _cpp_process_line_notes (cpp_reader *pfile, int in_comment)
1114 {
1115   cpp_buffer *buffer = pfile->buffer;
1116
1117   for (;;)
1118     {
1119       _cpp_line_note *note = &buffer->notes[buffer->cur_note];
1120       unsigned int col;
1121
1122       if (note->pos > buffer->cur)
1123         break;
1124
1125       buffer->cur_note++;
1126       col = CPP_BUF_COLUMN (buffer, note->pos + 1);
1127
1128       if (note->type == '\\' || note->type == ' ')
1129         {
1130           if (note->type == ' ' && !in_comment)
1131             cpp_error_with_line (pfile, CPP_DL_WARNING, pfile->line_table->highest_line, col,
1132                                  "backslash and newline separated by space");
1133
1134           if (buffer->next_line > buffer->rlimit)
1135             {
1136               cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line, col,
1137                                    "backslash-newline at end of file");
1138               /* Prevent "no newline at end of file" warning.  */
1139               buffer->next_line = buffer->rlimit;
1140             }
1141
1142           buffer->line_base = note->pos;
1143           CPP_INCREMENT_LINE (pfile, 0);
1144         }
1145       else if (_cpp_trigraph_map[note->type])
1146         {
1147           if (CPP_OPTION (pfile, warn_trigraphs)
1148               && (!in_comment || warn_in_comment (pfile, note)))
1149             {
1150               if (CPP_OPTION (pfile, trigraphs))
1151                 cpp_warning_with_line (pfile, CPP_W_TRIGRAPHS,
1152                                        pfile->line_table->highest_line, col,
1153                                        "trigraph ??%c converted to %c",
1154                                        note->type,
1155                                        (int) _cpp_trigraph_map[note->type]);
1156               else
1157                 {
1158                   cpp_warning_with_line
1159                     (pfile, CPP_W_TRIGRAPHS,
1160                      pfile->line_table->highest_line, col,
1161                      "trigraph ??%c ignored, use -trigraphs to enable",
1162                      note->type);
1163                 }
1164             }
1165         }
1166       else if (note->type == 0)
1167         /* Already processed in lex_raw_string.  */;
1168       else
1169         abort ();
1170     }
1171 }
1172
1173 namespace bidi {
1174   enum class kind {
1175     NONE, LRE, RLE, LRO, RLO, LRI, RLI, FSI, PDF, PDI, LTR, RTL
1176   };
1177
1178   /* All the UTF-8 encodings of bidi characters start with E2.  */
1179   constexpr uchar utf8_start = 0xe2;
1180
1181   struct context
1182   {
1183     context () {}
1184     context (location_t loc, kind k, bool pdf, bool ucn)
1185     : m_loc (loc), m_kind (k), m_pdf (pdf), m_ucn (ucn)
1186     {
1187     }
1188
1189     kind get_pop_kind () const
1190     {
1191       return m_pdf ? kind::PDF : kind::PDI;
1192     }
1193     bool ucn_p () const
1194     {
1195       return m_ucn;
1196     }
1197
1198     location_t m_loc;
1199     kind m_kind;
1200     unsigned m_pdf : 1;
1201     unsigned m_ucn : 1;
1202   };
1203
1204   /* A vector holding currently open bidi contexts.  We use a char for
1205      each context, its LSB is 1 if it represents a PDF context, 0 if it
1206      represents a PDI context.  The next bit is 1 if this context was open
1207      by a bidi character written as a UCN, and 0 when it was UTF-8.  */
1208   semi_embedded_vec <context, 16> vec;
1209
1210   /* Close the whole comment/identifier/string literal/character constant
1211      context.  */
1212   void on_close ()
1213   {
1214     vec.truncate (0);
1215   }
1216
1217   /* Pop the last element in the vector.  */
1218   void pop ()
1219   {
1220     unsigned int len = vec.count ();
1221     gcc_checking_assert (len > 0);
1222     vec.truncate (len - 1);
1223   }
1224
1225   /* Return the pop kind of the context of the Ith element.  */
1226   kind pop_kind_at (unsigned int i)
1227   {
1228     return vec[i].get_pop_kind ();
1229   }
1230
1231   /* Return the pop kind of the context that is currently opened.  */
1232   kind current_ctx ()
1233   {
1234     unsigned int len = vec.count ();
1235     if (len == 0)
1236       return kind::NONE;
1237     return vec[len - 1].get_pop_kind ();
1238   }
1239
1240   /* Return true if the current context comes from a UCN origin, that is,
1241      the bidi char which started this bidi context was written as a UCN.  */
1242   bool current_ctx_ucn_p ()
1243   {
1244     unsigned int len = vec.count ();
1245     gcc_checking_assert (len > 0);
1246     return vec[len - 1].m_ucn;
1247   }
1248
1249   location_t current_ctx_loc ()
1250   {
1251     unsigned int len = vec.count ();
1252     gcc_checking_assert (len > 0);
1253     return vec[len - 1].m_loc;
1254   }
1255
1256   /* We've read a bidi char, update the current vector as necessary.
1257      LOC is only valid when K is not kind::NONE.  */
1258   void on_char (kind k, bool ucn_p, location_t loc)
1259   {
1260     switch (k)
1261       {
1262       case kind::LRE:
1263       case kind::RLE:
1264       case kind::LRO:
1265       case kind::RLO:
1266         vec.push (context (loc, k, true, ucn_p));
1267         break;
1268       case kind::LRI:
1269       case kind::RLI:
1270       case kind::FSI:
1271         vec.push (context (loc, k, false, ucn_p));
1272         break;
1273       /* PDF terminates the scope of the last LRE, RLE, LRO, or RLO
1274          whose scope has not yet been terminated.  */
1275       case kind::PDF:
1276         if (current_ctx () == kind::PDF)
1277           pop ();
1278         break;
1279       /* PDI terminates the scope of the last LRI, RLI, or FSI whose
1280          scope has not yet been terminated, as well as the scopes of
1281          any subsequent LREs, RLEs, LROs, or RLOs whose scopes have not
1282          yet been terminated.  */
1283       case kind::PDI:
1284         for (int i = vec.count () - 1; i >= 0; --i)
1285           if (pop_kind_at (i) == kind::PDI)
1286             {
1287               vec.truncate (i);
1288               break;
1289             }
1290         break;
1291       case kind::LTR:
1292       case kind::RTL:
1293         /* These aren't popped by a PDF/PDI.  */
1294         break;
1295       ATTR_LIKELY case kind::NONE:
1296         break;
1297       default:
1298         abort ();
1299       }
1300   }
1301
1302   /* Return a descriptive string for K.  */
1303   const char *to_str (kind k)
1304   {
1305     switch (k)
1306       {
1307       case kind::LRE:
1308         return "U+202A (LEFT-TO-RIGHT EMBEDDING)";
1309       case kind::RLE:
1310         return "U+202B (RIGHT-TO-LEFT EMBEDDING)";
1311       case kind::LRO:
1312         return "U+202D (LEFT-TO-RIGHT OVERRIDE)";
1313       case kind::RLO:
1314         return "U+202E (RIGHT-TO-LEFT OVERRIDE)";
1315       case kind::LRI:
1316         return "U+2066 (LEFT-TO-RIGHT ISOLATE)";
1317       case kind::RLI:
1318         return "U+2067 (RIGHT-TO-LEFT ISOLATE)";
1319       case kind::FSI:
1320         return "U+2068 (FIRST STRONG ISOLATE)";
1321       case kind::PDF:
1322         return "U+202C (POP DIRECTIONAL FORMATTING)";
1323       case kind::PDI:
1324         return "U+2069 (POP DIRECTIONAL ISOLATE)";
1325       case kind::LTR:
1326         return "U+200E (LEFT-TO-RIGHT MARK)";
1327       case kind::RTL:
1328         return "U+200F (RIGHT-TO-LEFT MARK)";
1329       default:
1330         abort ();
1331       }
1332   }
1333 }
1334
1335 /* Get location_t for the range of bytes [START, START + NUM_BYTES)
1336    within the current line in FILE, with the caret at START.  */
1337
1338 static location_t
1339 get_location_for_byte_range_in_cur_line (cpp_reader *pfile,
1340                                          const unsigned char *const start,
1341                                          size_t num_bytes)
1342 {
1343   gcc_checking_assert (num_bytes > 0);
1344
1345   /* CPP_BUF_COLUMN and linemap_position_for_column both refer
1346      to offsets in bytes, but CPP_BUF_COLUMN is 0-based,
1347      whereas linemap_position_for_column is 1-based.  */
1348
1349   /* Get 0-based offsets within the line.  */
1350   size_t start_offset = CPP_BUF_COLUMN (pfile->buffer, start);
1351   size_t end_offset = start_offset + num_bytes - 1;
1352
1353   /* Now convert to location_t, where "columns" are 1-based byte offsets.  */
1354   location_t start_loc = linemap_position_for_column (pfile->line_table,
1355                                                       start_offset + 1);
1356   location_t end_loc = linemap_position_for_column (pfile->line_table,
1357                                                      end_offset + 1);
1358
1359   if (start_loc == end_loc)
1360     return start_loc;
1361
1362   source_range src_range;
1363   src_range.m_start = start_loc;
1364   src_range.m_finish = end_loc;
1365   location_t combined_loc = COMBINE_LOCATION_DATA (pfile->line_table,
1366                                                    start_loc,
1367                                                    src_range,
1368                                                    NULL,
1369                                                    0);
1370   return combined_loc;
1371 }
1372
1373 /* Parse a sequence of 3 bytes starting with P and return its bidi code.  */
1374
1375 static bidi::kind
1376 get_bidi_utf8_1 (const unsigned char *const p)
1377 {
1378   gcc_checking_assert (p[0] == bidi::utf8_start);
1379
1380   if (p[1] == 0x80)
1381     switch (p[2])
1382       {
1383       case 0xaa:
1384         return bidi::kind::LRE;
1385       case 0xab:
1386         return bidi::kind::RLE;
1387       case 0xac:
1388         return bidi::kind::PDF;
1389       case 0xad:
1390         return bidi::kind::LRO;
1391       case 0xae:
1392         return bidi::kind::RLO;
1393       case 0x8e:
1394         return bidi::kind::LTR;
1395       case 0x8f:
1396         return bidi::kind::RTL;
1397       default:
1398         break;
1399       }
1400   else if (p[1] == 0x81)
1401     switch (p[2])
1402       {
1403       case 0xa6:
1404         return bidi::kind::LRI;
1405       case 0xa7:
1406         return bidi::kind::RLI;
1407       case 0xa8:
1408         return bidi::kind::FSI;
1409       case 0xa9:
1410         return bidi::kind::PDI;
1411       default:
1412         break;
1413       }
1414
1415   return bidi::kind::NONE;
1416 }
1417
1418 /* Parse a sequence of 3 bytes starting with P and return its bidi code.
1419    If the kind is not NONE, write the location to *OUT.*/
1420
1421 static bidi::kind
1422 get_bidi_utf8 (cpp_reader *pfile, const unsigned char *const p, location_t *out)
1423 {
1424   bidi::kind result = get_bidi_utf8_1 (p);
1425   if (result != bidi::kind::NONE)
1426     {
1427       /* We have a sequence of 3 bytes starting at P.  */
1428       *out = get_location_for_byte_range_in_cur_line (pfile, p, 3);
1429     }
1430   return result;
1431 }
1432
1433 /* Parse a UCN where P points just past \u or \U and return its bidi code.  */
1434
1435 static bidi::kind
1436 get_bidi_ucn_1 (const unsigned char *p, bool is_U, const unsigned char **end)
1437 {
1438   /* 6.4.3 Universal Character Names
1439       \u hex-quad
1440       \U hex-quad hex-quad
1441       \u { simple-hexadecimal-digit-sequence }
1442      where \unnnn means \U0000nnnn.  */
1443
1444   *end = p + 4;
1445   if (is_U)
1446     {
1447       if (p[0] != '0' || p[1] != '0' || p[2] != '0' || p[3] != '0')
1448         return bidi::kind::NONE;
1449       /* Skip 4B so we can treat \u and \U the same below.  */
1450       p += 4;
1451       *end += 4;
1452     }
1453   else if (p[0] == '{')
1454     {
1455       p++;
1456       while (*p == '0')
1457         p++;
1458       if (p[0] != '2'
1459           || p[1] != '0'
1460           || !ISXDIGIT (p[2])
1461           || !ISXDIGIT (p[3])
1462           || p[4] != '}')
1463         return bidi::kind::NONE;
1464       *end = p + 5;
1465     }
1466
1467   /* All code points we are looking for start with 20xx.  */
1468   if (p[0] != '2' || p[1] != '0')
1469     return bidi::kind::NONE;
1470   else if (p[2] == '2')
1471     switch (p[3])
1472       {
1473       case 'a':
1474       case 'A':
1475         return bidi::kind::LRE;
1476       case 'b':
1477       case 'B':
1478         return bidi::kind::RLE;
1479       case 'c':
1480       case 'C':
1481         return bidi::kind::PDF;
1482       case 'd':
1483       case 'D':
1484         return bidi::kind::LRO;
1485       case 'e':
1486       case 'E':
1487         return bidi::kind::RLO;
1488       default:
1489         break;
1490       }
1491   else if (p[2] == '6')
1492     switch (p[3])
1493       {
1494       case '6':
1495         return bidi::kind::LRI;
1496       case '7':
1497         return bidi::kind::RLI;
1498       case '8':
1499         return bidi::kind::FSI;
1500       case '9':
1501         return bidi::kind::PDI;
1502       default:
1503         break;
1504       }
1505   else if (p[2] == '0')
1506     switch (p[3])
1507       {
1508       case 'e':
1509       case 'E':
1510         return bidi::kind::LTR;
1511       case 'f':
1512       case 'F':
1513         return bidi::kind::RTL;
1514       default:
1515         break;
1516       }
1517
1518   return bidi::kind::NONE;
1519 }
1520
1521 /* Parse a UCN where P points just past \u or \U and return its bidi code.
1522    If the kind is not NONE, write the location to *OUT.  */
1523
1524 static bidi::kind
1525 get_bidi_ucn (cpp_reader *pfile, const unsigned char *p, bool is_U,
1526               location_t *out)
1527 {
1528   const unsigned char *end;
1529   bidi::kind result = get_bidi_ucn_1 (p, is_U, &end);
1530   if (result != bidi::kind::NONE)
1531     {
1532       const unsigned char *start = p - 2;
1533       size_t num_bytes = end - start;
1534       *out = get_location_for_byte_range_in_cur_line (pfile, start, num_bytes);
1535     }
1536   return result;
1537 }
1538
1539 /* Parse a named universal character escape where P points just past \N and
1540    return its bidi code.  If the kind is not NONE, write the location to
1541    *OUT.  */
1542
1543 static bidi::kind
1544 get_bidi_named (cpp_reader *pfile, const unsigned char *p, location_t *out)
1545 {
1546   bidi::kind result = bidi::kind::NONE;
1547   if (*p != '{')
1548     return bidi::kind::NONE;
1549   if (strncmp ((const char *) (p + 1), "LEFT-TO-RIGHT ", 14) == 0)
1550     {
1551       if (strncmp ((const char *) (p + 15), "MARK}", 5) == 0)
1552         result = bidi::kind::LTR;
1553       else if (strncmp ((const char *) (p + 15), "EMBEDDING}", 10) == 0)
1554         result = bidi::kind::LRE;
1555       else if (strncmp ((const char *) (p + 15), "OVERRIDE}", 9) == 0)
1556         result = bidi::kind::LRO;
1557       else if (strncmp ((const char *) (p + 15), "ISOLATE}", 8) == 0)
1558         result = bidi::kind::LRI;
1559     }
1560   else if (strncmp ((const char *) (p + 1), "RIGHT-TO-LEFT ", 14) == 0)
1561     {
1562       if (strncmp ((const char *) (p + 15), "MARK}", 5) == 0)
1563         result = bidi::kind::RTL;
1564       else if (strncmp ((const char *) (p + 15), "EMBEDDING}", 10) == 0)
1565         result = bidi::kind::RLE;
1566       else if (strncmp ((const char *) (p + 15), "OVERRIDE}", 9) == 0)
1567         result = bidi::kind::RLO;
1568       else if (strncmp ((const char *) (p + 15), "ISOLATE}", 8) == 0)
1569         result = bidi::kind::RLI;
1570     }
1571   else if (strncmp ((const char *) (p + 1), "POP DIRECTIONAL ", 16) == 0)
1572     {
1573       if (strncmp ((const char *) (p + 16), "FORMATTING}", 11) == 0)
1574         result = bidi::kind::PDF;
1575       else if (strncmp ((const char *) (p + 16), "ISOLATE}", 8) == 0)
1576         result = bidi::kind::PDI;
1577     }
1578   else if (strncmp ((const char *) (p + 1), "FIRST STRONG ISOLATE}", 21) == 0)
1579     result = bidi::kind::FSI;
1580   if (result != bidi::kind::NONE)
1581     *out = get_location_for_byte_range_in_cur_line (pfile, p - 2,
1582                                                     (strchr ((const char *)
1583                                                              (p + 1), '}')
1584                                                      - (const char *) p)
1585                                                     + 3);
1586   return result;
1587 }
1588
1589 /* Subclass of rich_location for reporting on unpaired UTF-8
1590    bidirectional control character(s).
1591    Escape the source lines on output, and show all unclosed
1592    bidi context, labelling everything.  */
1593
1594 class unpaired_bidi_rich_location : public rich_location
1595 {
1596  public:
1597   class custom_range_label : public range_label
1598   {
1599    public:
1600      label_text get_text (unsigned range_idx) const final override
1601      {
1602        /* range 0 is the primary location; each subsequent range i + 1
1603           is for bidi::vec[i].  */
1604        if (range_idx > 0)
1605          {
1606            const bidi::context &ctxt (bidi::vec[range_idx - 1]);
1607            return label_text::borrow (bidi::to_str (ctxt.m_kind));
1608          }
1609        else
1610          return label_text::borrow (_("end of bidirectional context"));
1611      }
1612   };
1613
1614   unpaired_bidi_rich_location (cpp_reader *pfile, location_t loc)
1615   : rich_location (pfile->line_table, loc, &m_custom_label)
1616   {
1617     set_escape_on_output (true);
1618     for (unsigned i = 0; i < bidi::vec.count (); i++)
1619       add_range (bidi::vec[i].m_loc,
1620                  SHOW_RANGE_WITHOUT_CARET,
1621                  &m_custom_label);
1622   }
1623
1624  private:
1625    custom_range_label m_custom_label;
1626 };
1627
1628 /* We're closing a bidi context, that is, we've encountered a newline,
1629    are closing a C-style comment, or are at the end of a string literal,
1630    character constant, or identifier.  Warn if this context was not
1631    properly terminated by a PDI or PDF.  P points to the last character
1632    in this context.  */
1633
1634 static void
1635 maybe_warn_bidi_on_close (cpp_reader *pfile, const uchar *p)
1636 {
1637   const auto warn_bidi = CPP_OPTION (pfile, cpp_warn_bidirectional);
1638   if (bidi::vec.count () > 0
1639       && (warn_bidi & bidirectional_unpaired
1640           && (!bidi::current_ctx_ucn_p ()
1641               || (warn_bidi & bidirectional_ucn))))
1642     {
1643       const location_t loc
1644         = linemap_position_for_column (pfile->line_table,
1645                                        CPP_BUF_COLUMN (pfile->buffer, p));
1646       unpaired_bidi_rich_location rich_loc (pfile, loc);
1647       /* cpp_callbacks doesn't yet have a way to handle singular vs plural
1648          forms of a diagnostic, so fake it for now.  */
1649       if (bidi::vec.count () > 1)
1650         cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
1651                         "unpaired UTF-8 bidirectional control characters "
1652                         "detected");
1653       else
1654         cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
1655                         "unpaired UTF-8 bidirectional control character "
1656                         "detected");
1657     }
1658   /* We're done with this context.  */
1659   bidi::on_close ();
1660 }
1661
1662 /* We're at the beginning or in the middle of an identifier/comment/string
1663    literal/character constant.  Warn if we've encountered a bidi character.
1664    KIND says which bidi control character it was; UCN_P is true iff this bidi
1665    control character was written as a UCN.  LOC is the location of the
1666    character, but is only valid if KIND != bidi::kind::NONE.  */
1667
1668 static void
1669 maybe_warn_bidi_on_char (cpp_reader *pfile, bidi::kind kind,
1670                          bool ucn_p, location_t loc)
1671 {
1672   if (__builtin_expect (kind == bidi::kind::NONE, 1))
1673     return;
1674
1675   const auto warn_bidi = CPP_OPTION (pfile, cpp_warn_bidirectional);
1676
1677   if (warn_bidi & (bidirectional_unpaired|bidirectional_any))
1678     {
1679       rich_location rich_loc (pfile->line_table, loc);
1680       rich_loc.set_escape_on_output (true);
1681
1682       /* It seems excessive to warn about a PDI/PDF that is closing
1683          an opened context because we've already warned about the
1684          opening character.  Except warn when we have a UCN x UTF-8
1685          mismatch, if UCN checking is enabled.  */
1686       if (kind == bidi::current_ctx ())
1687         {
1688           if (warn_bidi == (bidirectional_unpaired|bidirectional_ucn)
1689               && bidi::current_ctx_ucn_p () != ucn_p)
1690             {
1691               rich_loc.add_range (bidi::current_ctx_loc ());
1692               cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
1693                               "UTF-8 vs UCN mismatch when closing "
1694                               "a context by \"%s\"", bidi::to_str (kind));
1695             }
1696         }
1697       else if (warn_bidi & bidirectional_any
1698                && (!ucn_p || (warn_bidi & bidirectional_ucn)))
1699         {
1700           if (kind == bidi::kind::PDF || kind == bidi::kind::PDI)
1701             cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
1702                             "\"%s\" is closing an unopened context",
1703                             bidi::to_str (kind));
1704           else
1705             cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
1706                             "found problematic Unicode character \"%s\"",
1707                             bidi::to_str (kind));
1708         }
1709     }
1710   /* We're done with this context.  */
1711   bidi::on_char (kind, ucn_p, loc);
1712 }
1713
/* Byte-value thresholds used when classifying UTF-8 input in this
   file: a byte in [utf8_continuation, utf8_signifier) is treated as a
   continuation byte, and a byte >= utf8_signifier as the lead byte of
   a multi-byte sequence.  */
static const cppchar_t utf8_continuation = 0x80;
static const cppchar_t utf8_signifier = 0xC0;
1716
1717 /* Emit -Winvalid-utf8 warning on invalid UTF-8 character starting
1718    at PFILE->buffer->cur.  Return a pointer after the diagnosed
1719    invalid character.  */
1720
1721 static const uchar *
1722 _cpp_warn_invalid_utf8 (cpp_reader *pfile)
1723 {
1724   cpp_buffer *buffer = pfile->buffer;
1725   const uchar *cur = buffer->cur;
1726   bool pedantic = (CPP_PEDANTIC (pfile)
1727                    && CPP_OPTION (pfile, cpp_warn_invalid_utf8) == 2);
1728
1729   if (cur[0] < utf8_signifier
1730       || cur[1] < utf8_continuation || cur[1] >= utf8_signifier)
1731     {
1732       if (pedantic)
1733         cpp_error_with_line (pfile, CPP_DL_PEDWARN,
1734                              pfile->line_table->highest_line,
1735                              CPP_BUF_COL (buffer),
1736                              "invalid UTF-8 character <%x>",
1737                              cur[0]);
1738       else
1739         cpp_warning_with_line (pfile, CPP_W_INVALID_UTF8,
1740                                pfile->line_table->highest_line,
1741                                CPP_BUF_COL (buffer),
1742                                "invalid UTF-8 character <%x>",
1743                                cur[0]);
1744       return cur + 1;
1745     }
1746   else if (cur[2] < utf8_continuation || cur[2] >= utf8_signifier)
1747     {
1748       if (pedantic)
1749         cpp_error_with_line (pfile, CPP_DL_PEDWARN,
1750                              pfile->line_table->highest_line,
1751                              CPP_BUF_COL (buffer),
1752                              "invalid UTF-8 character <%x><%x>",
1753                              cur[0], cur[1]);
1754       else
1755         cpp_warning_with_line (pfile, CPP_W_INVALID_UTF8,
1756                                pfile->line_table->highest_line,
1757                                CPP_BUF_COL (buffer),
1758                                "invalid UTF-8 character <%x><%x>",
1759                                cur[0], cur[1]);
1760       return cur + 2;
1761     }
1762   else if (cur[3] < utf8_continuation || cur[3] >= utf8_signifier)
1763     {
1764       if (pedantic)
1765         cpp_error_with_line (pfile, CPP_DL_PEDWARN,
1766                              pfile->line_table->highest_line,
1767                              CPP_BUF_COL (buffer),
1768                              "invalid UTF-8 character <%x><%x><%x>",
1769                              cur[0], cur[1], cur[2]);
1770       else
1771         cpp_warning_with_line (pfile, CPP_W_INVALID_UTF8,
1772                                pfile->line_table->highest_line,
1773                                CPP_BUF_COL (buffer),
1774                                "invalid UTF-8 character <%x><%x><%x>",
1775                                cur[0], cur[1], cur[2]);
1776       return cur + 3;
1777     }
1778   else
1779     {
1780       if (pedantic)
1781         cpp_error_with_line (pfile, CPP_DL_PEDWARN,
1782                              pfile->line_table->highest_line,
1783                              CPP_BUF_COL (buffer),
1784                              "invalid UTF-8 character <%x><%x><%x><%x>",
1785                              cur[0], cur[1], cur[2], cur[3]);
1786       else
1787         cpp_warning_with_line (pfile, CPP_W_INVALID_UTF8,
1788                                pfile->line_table->highest_line,
1789                                CPP_BUF_COL (buffer),
1790                                "invalid UTF-8 character <%x><%x><%x><%x>",
1791                                cur[0], cur[1], cur[2], cur[3]);
1792       return cur + 4;
1793     }
1794 }
1795
1796 /* Helper function of *skip_*_comment and lex*_string.  For C,
1797    character at CUR[-1] with MSB set handle -Wbidi-chars* and
1798    -Winvalid-utf8 diagnostics and return pointer to first character
1799    that should be processed next.  */
1800
1801 static inline const uchar *
1802 _cpp_handle_multibyte_utf8 (cpp_reader *pfile, uchar c,
1803                             const uchar *cur, bool warn_bidi_p,
1804                             bool warn_invalid_utf8_p)
1805 {
1806   /* If this is a beginning of a UTF-8 encoding, it might be
1807      a bidirectional control character.  */
1808   if (c == bidi::utf8_start && warn_bidi_p)
1809     {
1810       location_t loc;
1811       bidi::kind kind = get_bidi_utf8 (pfile, cur - 1, &loc);
1812       maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
1813     }
1814   if (!warn_invalid_utf8_p)
1815     return cur;
1816   if (c >= utf8_signifier)
1817     {
1818       cppchar_t s;
1819       const uchar *pstr = cur - 1;
1820       if (_cpp_valid_utf8 (pfile, &pstr, pfile->buffer->rlimit, 0, NULL, &s)
1821           && s <= UCS_LIMIT)
1822         return pstr;
1823     }
1824   pfile->buffer->cur = cur - 1;
1825   return _cpp_warn_invalid_utf8 (pfile);
1826 }
1827
1828 /* Skip a C-style block comment.  We find the end of the comment by
1829    seeing if an asterisk is before every '/' we encounter.  Returns
1830    nonzero if comment terminated by EOF, zero otherwise.
1831
1832    Buffer->cur points to the initial asterisk of the comment.  */
1833 bool
1834 _cpp_skip_block_comment (cpp_reader *pfile)
1835 {
1836   cpp_buffer *buffer = pfile->buffer;
1837   const uchar *cur = buffer->cur;
1838   uchar c;
1839   const bool warn_bidi_p = pfile->warn_bidi_p ();
1840   const bool warn_invalid_utf8_p = CPP_OPTION (pfile, cpp_warn_invalid_utf8);
1841   const bool warn_bidi_or_invalid_utf8_p = warn_bidi_p | warn_invalid_utf8_p;
1842
1843   cur++;
1844   if (*cur == '/')
1845     cur++;
1846
1847   for (;;)
1848     {
1849       /* People like decorating comments with '*', so check for '/'
1850          instead for efficiency.  */
1851       c = *cur++;
1852
1853       if (c == '/')
1854         {
1855           if (cur[-2] == '*')
1856             {
1857               if (warn_bidi_p)
1858                 maybe_warn_bidi_on_close (pfile, cur);
1859               break;
1860             }
1861
1862           /* Warn about potential nested comments, but not if the '/'
1863              comes immediately before the true comment delimiter.
1864              Don't bother to get it right across escaped newlines.  */
1865           if (CPP_OPTION (pfile, warn_comments)
1866               && cur[0] == '*' && cur[1] != '/')
1867             {
1868               buffer->cur = cur;
1869               cpp_warning_with_line (pfile, CPP_W_COMMENTS,
1870                                      pfile->line_table->highest_line,
1871                                      CPP_BUF_COL (buffer),
1872                                      "\"/*\" within comment");
1873             }
1874         }
1875       else if (c == '\n')
1876         {
1877           unsigned int cols;
1878           buffer->cur = cur - 1;
1879           if (warn_bidi_p)
1880             maybe_warn_bidi_on_close (pfile, cur);
1881           _cpp_process_line_notes (pfile, true);
1882           if (buffer->next_line >= buffer->rlimit)
1883             return true;
1884           _cpp_clean_line (pfile);
1885
1886           cols = buffer->next_line - buffer->line_base;
1887           CPP_INCREMENT_LINE (pfile, cols);
1888
1889           cur = buffer->cur;
1890         }
1891       else if (__builtin_expect (c >= utf8_continuation, 0)
1892                && warn_bidi_or_invalid_utf8_p)
1893         cur = _cpp_handle_multibyte_utf8 (pfile, c, cur, warn_bidi_p,
1894                                           warn_invalid_utf8_p);
1895     }
1896
1897   buffer->cur = cur;
1898   _cpp_process_line_notes (pfile, true);
1899   return false;
1900 }
1901
1902 /* Skip a C++ line comment, leaving buffer->cur pointing to the
1903    terminating newline.  Handles escaped newlines.  Returns nonzero
1904    if a multiline comment.  */
1905 static int
1906 skip_line_comment (cpp_reader *pfile)
1907 {
1908   cpp_buffer *buffer = pfile->buffer;
1909   location_t orig_line = pfile->line_table->highest_line;
1910   const bool warn_bidi_p = pfile->warn_bidi_p ();
1911   const bool warn_invalid_utf8_p = CPP_OPTION (pfile, cpp_warn_invalid_utf8);
1912   const bool warn_bidi_or_invalid_utf8_p = warn_bidi_p | warn_invalid_utf8_p;
1913
1914   if (!warn_bidi_or_invalid_utf8_p)
1915     while (*buffer->cur != '\n')
1916       buffer->cur++;
1917   else if (!warn_invalid_utf8_p)
1918     {
1919       while (*buffer->cur != '\n'
1920              && *buffer->cur != bidi::utf8_start)
1921         buffer->cur++;
1922       if (__builtin_expect (*buffer->cur == bidi::utf8_start, 0))
1923         {
1924           while (*buffer->cur != '\n')
1925             {
1926               if (__builtin_expect (*buffer->cur == bidi::utf8_start, 0))
1927                 {
1928                   location_t loc;
1929                   bidi::kind kind = get_bidi_utf8 (pfile, buffer->cur, &loc);
1930                   maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
1931                 }
1932               buffer->cur++;
1933             }
1934           maybe_warn_bidi_on_close (pfile, buffer->cur);
1935         }
1936     }
1937   else
1938     {
1939       while (*buffer->cur != '\n')
1940         {
1941           if (*buffer->cur < utf8_continuation)
1942             {
1943               buffer->cur++;
1944               continue;
1945             }
1946           buffer->cur
1947             = _cpp_handle_multibyte_utf8 (pfile, *buffer->cur, buffer->cur + 1,
1948                                           warn_bidi_p, warn_invalid_utf8_p);
1949         }
1950       if (warn_bidi_p)
1951         maybe_warn_bidi_on_close (pfile, buffer->cur);
1952     }
1953
1954   _cpp_process_line_notes (pfile, true);
1955   return orig_line != pfile->line_table->highest_line;
1956 }
1957
1958 /* Skips whitespace, saving the next non-whitespace character.  */
1959 static void
1960 skip_whitespace (cpp_reader *pfile, cppchar_t c)
1961 {
1962   cpp_buffer *buffer = pfile->buffer;
1963   bool saw_NUL = false;
1964
1965   do
1966     {
1967       /* Horizontal space always OK.  */
1968       if (c == ' ' || c == '\t')
1969         ;
1970       /* Just \f \v or \0 left.  */
1971       else if (c == '\0')
1972         saw_NUL = true;
1973       else if (pfile->state.in_directive && CPP_PEDANTIC (pfile))
1974         cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line,
1975                              CPP_BUF_COL (buffer),
1976                              "%s in preprocessing directive",
1977                              c == '\f' ? "form feed" : "vertical tab");
1978
1979       c = *buffer->cur++;
1980     }
1981   /* We only want non-vertical space, i.e. ' ' \t \f \v \0.  */
1982   while (is_nvspace (c));
1983
1984   if (saw_NUL)
1985     {
1986       encoding_rich_location rich_loc (pfile);
1987       cpp_error_at (pfile, CPP_DL_WARNING, &rich_loc,
1988                     "null character(s) ignored");
1989     }
1990
1991   buffer->cur--;
1992 }
1993
1994 /* See if the characters of a number token are valid in a name (no
1995    '.', '+' or '-').  */
1996 static int
1997 name_p (cpp_reader *pfile, const cpp_string *string)
1998 {
1999   unsigned int i;
2000
2001   for (i = 0; i < string->len; i++)
2002     if (!is_idchar (string->text[i]))
2003       return 0;
2004
2005   return 1;
2006 }
2007
2008 /* After parsing an identifier or other sequence, produce a warning about
2009    sequences not in NFC/NFKC.  */
2010 static void
2011 warn_about_normalization (cpp_reader *pfile,
2012                           const cpp_token *token,
2013                           const struct normalize_state *s,
2014                           bool identifier)
2015 {
2016   if (CPP_OPTION (pfile, warn_normalize) < NORMALIZE_STATE_RESULT (s)
2017       && !pfile->state.skipping)
2018     {
2019       location_t loc = token->src_loc;
2020
2021       /* If possible, create a location range for the token.  */
2022       if (loc >= RESERVED_LOCATION_COUNT
2023           && token->type != CPP_EOF
2024           /* There must be no line notes to process.  */
2025           && (!(pfile->buffer->cur
2026                 >= pfile->buffer->notes[pfile->buffer->cur_note].pos
2027                 && !pfile->overlaid_buffer)))
2028         {
2029           source_range tok_range;
2030           tok_range.m_start = loc;
2031           tok_range.m_finish
2032             = linemap_position_for_column (pfile->line_table,
2033                                            CPP_BUF_COLUMN (pfile->buffer,
2034                                                            pfile->buffer->cur));
2035           loc = COMBINE_LOCATION_DATA (pfile->line_table,
2036                                        loc, tok_range, NULL, 0);
2037         }
2038
2039       encoding_rich_location rich_loc (pfile, loc);
2040
2041       /* Make sure that the token is printed using UCNs, even
2042          if we'd otherwise happily print UTF-8.  */
2043       unsigned char *buf = XNEWVEC (unsigned char, cpp_token_len (token));
2044       size_t sz;
2045
2046       sz = cpp_spell_token (pfile, token, buf, false) - buf;
2047       if (NORMALIZE_STATE_RESULT (s) == normalized_C)
2048         cpp_warning_at (pfile, CPP_W_NORMALIZE, &rich_loc,
2049                         "`%.*s' is not in NFKC", (int) sz, buf);
2050       else if (identifier && CPP_OPTION (pfile, xid_identifiers))
2051         cpp_pedwarning_at (pfile, CPP_W_NORMALIZE, &rich_loc,
2052                                   "`%.*s' is not in NFC", (int) sz, buf);
2053       else
2054         cpp_warning_at (pfile, CPP_W_NORMALIZE, &rich_loc,
2055                         "`%.*s' is not in NFC", (int) sz, buf);
2056       free (buf);
2057     }
2058 }
2059
2060 /* Returns TRUE if the sequence starting at buffer->cur is valid in
2061    an identifier.  FIRST is TRUE if this starts an identifier.  */
2062
2063 static bool
2064 forms_identifier_p (cpp_reader *pfile, int first,
2065                     struct normalize_state *state)
2066 {
2067   cpp_buffer *buffer = pfile->buffer;
2068   const bool warn_bidi_p = pfile->warn_bidi_p ();
2069
2070   if (*buffer->cur == '$')
2071     {
2072       if (!CPP_OPTION (pfile, dollars_in_ident))
2073         return false;
2074
2075       buffer->cur++;
2076       if (CPP_OPTION (pfile, warn_dollars) && !pfile->state.skipping)
2077         {
2078           CPP_OPTION (pfile, warn_dollars) = 0;
2079           cpp_error (pfile, CPP_DL_PEDWARN, "'$' in identifier or number");
2080         }
2081
2082       return true;
2083     }
2084
2085   /* Is this a syntactically valid UCN or a valid UTF-8 char?  */
2086   if (CPP_OPTION (pfile, extended_identifiers))
2087     {
2088       cppchar_t s;
2089       if (*buffer->cur >= utf8_signifier)
2090         {
2091           if (__builtin_expect (*buffer->cur == bidi::utf8_start, 0)
2092               && warn_bidi_p)
2093             {
2094               location_t loc;
2095               bidi::kind kind = get_bidi_utf8 (pfile, buffer->cur, &loc);
2096               maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
2097             }
2098           if (_cpp_valid_utf8 (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
2099                                state, &s))
2100             return true;
2101         }
2102       else if (*buffer->cur == '\\'
2103                && (buffer->cur[1] == 'u'
2104                    || buffer->cur[1] == 'U'
2105                    || buffer->cur[1] == 'N'))
2106         {
2107           buffer->cur += 2;
2108           if (warn_bidi_p)
2109             {
2110               location_t loc;
2111               bidi::kind kind;
2112               if (buffer->cur[-1] == 'N')
2113                 kind = get_bidi_named (pfile, buffer->cur, &loc);
2114               else
2115                 kind = get_bidi_ucn (pfile, buffer->cur,
2116                                      buffer->cur[-1] == 'U', &loc);
2117               maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/true, loc);
2118             }
2119           if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
2120                               state, &s, NULL, NULL))
2121             return true;
2122           buffer->cur -= 2;
2123         }
2124     }
2125
2126   return false;
2127 }
2128
2129 /* Helper function to issue error about improper __VA_OPT__ use.  */
2130 static void
2131 maybe_va_opt_error (cpp_reader *pfile)
2132 {
2133   if (CPP_PEDANTIC (pfile) && !CPP_OPTION (pfile, va_opt))
2134     {
2135       /* __VA_OPT__ should not be accepted at all, but allow it in
2136          system headers.  */
2137       if (!_cpp_in_system_header (pfile))
2138         cpp_error (pfile, CPP_DL_PEDWARN,
2139                    "__VA_OPT__ is not available until C++20");
2140     }
2141   else if (!pfile->state.va_args_ok)
2142     {
2143       /* __VA_OPT__ should only appear in the replacement list of a
2144          variadic macro.  */
2145       cpp_error (pfile, CPP_DL_PEDWARN,
2146                  "__VA_OPT__ can only appear in the expansion"
2147                  " of a C++20 variadic macro");
2148     }
2149 }
2150
2151 /* Helper function to get the cpp_hashnode of the identifier BASE.  */
2152 static cpp_hashnode *
2153 lex_identifier_intern (cpp_reader *pfile, const uchar *base)
2154 {
2155   cpp_hashnode *result;
2156   const uchar *cur;
2157   unsigned int len;
2158   unsigned int hash = HT_HASHSTEP (0, *base);
2159
2160   cur = base + 1;
2161   while (ISIDNUM (*cur))
2162     {
2163       hash = HT_HASHSTEP (hash, *cur);
2164       cur++;
2165     }
2166   len = cur - base;
2167   hash = HT_HASHFINISH (hash, len);
2168   result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
2169                                               base, len, hash, HT_ALLOC));
2170
2171   /* Rarely, identifiers require diagnostics when lexed.  */
2172   if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
2173                         && !pfile->state.skipping, 0))
2174     {
2175       /* It is allowed to poison the same identifier twice.  */
2176       if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
2177         cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
2178                    NODE_NAME (result));
2179
2180       /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
2181          replacement list of a variadic macro.  */
2182       if (result == pfile->spec_nodes.n__VA_ARGS__
2183           && !pfile->state.va_args_ok)
2184         {
2185           if (CPP_OPTION (pfile, cplusplus))
2186             cpp_error (pfile, CPP_DL_PEDWARN,
2187                        "__VA_ARGS__ can only appear in the expansion"
2188                        " of a C++11 variadic macro");
2189           else
2190             cpp_error (pfile, CPP_DL_PEDWARN,
2191                        "__VA_ARGS__ can only appear in the expansion"
2192                        " of a C99 variadic macro");
2193         }
2194
2195       if (result == pfile->spec_nodes.n__VA_OPT__)
2196         maybe_va_opt_error (pfile);
2197
2198       /* For -Wc++-compat, warn about use of C++ named operators.  */
2199       if (result->flags & NODE_WARN_OPERATOR)
2200         cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
2201                      "identifier \"%s\" is a special operator name in C++",
2202                      NODE_NAME (result));
2203     }
2204
2205   return result;
2206 }
2207
2208 /* Get the cpp_hashnode of an identifier specified by NAME in
2209    the current cpp_reader object.  If none is found, NULL is returned.  */
2210 cpp_hashnode *
2211 _cpp_lex_identifier (cpp_reader *pfile, const char *name)
2212 {
2213   cpp_hashnode *result;
2214   result = lex_identifier_intern (pfile, (uchar *) name);
2215   return result;
2216 }
2217
2218 /* Lex an identifier starting at BUFFER->CUR - 1.  */
2219 static cpp_hashnode *
2220 lex_identifier (cpp_reader *pfile, const uchar *base, bool starts_ucn,
2221                 struct normalize_state *nst, cpp_hashnode **spelling)
2222 {
2223   cpp_hashnode *result;
2224   const uchar *cur;
2225   unsigned int len;
2226   unsigned int hash = HT_HASHSTEP (0, *base);
2227   const bool warn_bidi_p = pfile->warn_bidi_p ();
2228
2229   cur = pfile->buffer->cur;
2230   if (! starts_ucn)
2231     {
2232       while (ISIDNUM (*cur))
2233         {
2234           hash = HT_HASHSTEP (hash, *cur);
2235           cur++;
2236         }
2237       NORMALIZE_STATE_UPDATE_IDNUM (nst, *(cur - 1));
2238     }
2239   pfile->buffer->cur = cur;
2240   if (starts_ucn || forms_identifier_p (pfile, false, nst))
2241     {
2242       /* Slower version for identifiers containing UCNs
2243          or extended chars (including $).  */
2244       do {
2245         while (ISIDNUM (*pfile->buffer->cur))
2246           {
2247             NORMALIZE_STATE_UPDATE_IDNUM (nst, *pfile->buffer->cur);
2248             pfile->buffer->cur++;
2249           }
2250       } while (forms_identifier_p (pfile, false, nst));
2251       if (warn_bidi_p)
2252         maybe_warn_bidi_on_close (pfile, pfile->buffer->cur);
2253       result = _cpp_interpret_identifier (pfile, base,
2254                                           pfile->buffer->cur - base);
2255       *spelling = cpp_lookup (pfile, base, pfile->buffer->cur - base);
2256     }
2257   else
2258     {
2259       len = cur - base;
2260       hash = HT_HASHFINISH (hash, len);
2261
2262       result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
2263                                                   base, len, hash, HT_ALLOC));
2264       *spelling = result;
2265     }
2266
2267   /* Rarely, identifiers require diagnostics when lexed.  */
2268   if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
2269                         && !pfile->state.skipping, 0))
2270     {
2271       /* It is allowed to poison the same identifier twice.  */
2272       if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
2273         cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
2274                    NODE_NAME (result));
2275
2276       /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
2277          replacement list of a variadic macro.  */
2278       if (result == pfile->spec_nodes.n__VA_ARGS__
2279           && !pfile->state.va_args_ok)
2280         {
2281           if (CPP_OPTION (pfile, cplusplus))
2282             cpp_error (pfile, CPP_DL_PEDWARN,
2283                        "__VA_ARGS__ can only appear in the expansion"
2284                        " of a C++11 variadic macro");
2285           else
2286             cpp_error (pfile, CPP_DL_PEDWARN,
2287                        "__VA_ARGS__ can only appear in the expansion"
2288                        " of a C99 variadic macro");
2289         }
2290
2291       /* __VA_OPT__ should only appear in the replacement list of a
2292          variadic macro.  */
2293       if (result == pfile->spec_nodes.n__VA_OPT__)
2294         maybe_va_opt_error (pfile);
2295
2296       /* For -Wc++-compat, warn about use of C++ named operators.  */
2297       if (result->flags & NODE_WARN_OPERATOR)
2298         cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
2299                      "identifier \"%s\" is a special operator name in C++",
2300                      NODE_NAME (result));
2301     }
2302
2303   return result;
2304 }
2305
2306 /* Lex a number to NUMBER starting at BUFFER->CUR - 1.  */
2307 static void
2308 lex_number (cpp_reader *pfile, cpp_string *number,
2309             struct normalize_state *nst)
2310 {
2311   const uchar *cur;
2312   const uchar *base;
2313   uchar *dest;
2314
2315   base = pfile->buffer->cur - 1;
2316   do
2317     {
2318       const uchar *adj_digit_sep = NULL;
2319       cur = pfile->buffer->cur;
2320
2321       /* N.B. ISIDNUM does not include $.  */
2322       while (ISIDNUM (*cur)
2323              || (*cur == '.' && !DIGIT_SEP (cur[-1]))
2324              || DIGIT_SEP (*cur)
2325              || (VALID_SIGN (*cur, cur[-1]) && !DIGIT_SEP (cur[-2])))
2326         {
2327           NORMALIZE_STATE_UPDATE_IDNUM (nst, *cur);
2328           /* Adjacent digit separators do not form part of the pp-number syntax.
2329              However, they can safely be diagnosed here as an error, since '' is
2330              not a valid preprocessing token.  */
2331           if (DIGIT_SEP (*cur) && DIGIT_SEP (cur[-1]) && !adj_digit_sep)
2332             adj_digit_sep = cur;
2333           cur++;
2334         }
2335       /* A number can't end with a digit separator.  */
2336       while (cur > pfile->buffer->cur && DIGIT_SEP (cur[-1]))
2337         --cur;
2338       if (adj_digit_sep && adj_digit_sep < cur)
2339         cpp_error (pfile, CPP_DL_ERROR, "adjacent digit separators");
2340
2341       pfile->buffer->cur = cur;
2342     }
2343   while (forms_identifier_p (pfile, false, nst));
2344
2345   number->len = cur - base;
2346   dest = _cpp_unaligned_alloc (pfile, number->len + 1);
2347   memcpy (dest, base, number->len);
2348   dest[number->len] = '\0';
2349   number->text = dest;
2350 }
2351
2352 /* Create a token of type TYPE with a literal spelling.  */
2353 static void
2354 create_literal (cpp_reader *pfile, cpp_token *token, const uchar *base,
2355                 unsigned int len, enum cpp_ttype type)
2356 {
2357   token->type = type;
2358   token->val.str.len = len;
2359   token->val.str.text = cpp_alloc_token_string (pfile, base, len);
2360 }
2361
2362 const uchar *
2363 cpp_alloc_token_string (cpp_reader *pfile,
2364                         const unsigned char *ptr, unsigned len)
2365 {
2366   uchar *dest = _cpp_unaligned_alloc (pfile, len + 1);
2367
2368   dest[len] = 0;
2369   memcpy (dest, ptr, len);
2370   return dest;
2371 }
2372
2373 /* A pair of raw buffer pointers.  The currently open one is [1], the
2374    first one is [0].  Used for string literal lexing.  */
2375 struct lit_accum {
2376   _cpp_buff *first;
2377   _cpp_buff *last;
2378   const uchar *rpos;
2379   size_t accum;
2380
2381   lit_accum ()
2382     : first (NULL), last (NULL), rpos (0), accum (0)
2383   {
2384   }
2385
2386   void append (cpp_reader *, const uchar *, size_t);
2387
2388   void read_begin (cpp_reader *);
2389   bool reading_p () const
2390   {
2391     return rpos != NULL;
2392   }
2393   char read_char ()
2394   {
2395     char c = *rpos++;
2396     if (rpos == BUFF_FRONT (last))
2397       rpos = NULL;
2398     return c;
2399   }
2400 };
2401
2402 /* Subroutine of lex_raw_string: Append LEN chars from BASE to the buffer
2403    sequence from *FIRST_BUFF_P to LAST_BUFF_P.  */
2404
2405 void
2406 lit_accum::append (cpp_reader *pfile, const uchar *base, size_t len)
2407 {
2408   if (!last)
2409     /* Starting.  */
2410     first = last = _cpp_get_buff (pfile, len);
2411   else if (len > BUFF_ROOM (last))
2412     {
2413       /* There is insufficient room in the buffer.  Copy what we can,
2414          and then either extend or create a new one.  */
2415       size_t room = BUFF_ROOM (last);
2416       memcpy (BUFF_FRONT (last), base, room);
2417       BUFF_FRONT (last) += room;
2418       base += room;
2419       len -= room;
2420       accum += room;
2421
2422       gcc_checking_assert (!rpos);
2423
2424       last = _cpp_append_extend_buff (pfile, last, len);
2425     }
2426
2427   memcpy (BUFF_FRONT (last), base, len);
2428   BUFF_FRONT (last) += len;
2429   accum += len;
2430 }
2431
2432 void
2433 lit_accum::read_begin (cpp_reader *pfile)
2434 {
2435   /* We never accumulate more than 4 chars to read.  */
2436   if (BUFF_ROOM (last) < 4)
2437
2438     last = _cpp_append_extend_buff (pfile, last, 4);
2439   rpos = BUFF_FRONT (last);
2440 }
2441
2442 /* Returns true if a macro has been defined.
2443    This might not work if compile with -save-temps,
2444    or preprocess separately from compilation.  */
2445
2446 static bool
2447 is_macro(cpp_reader *pfile, const uchar *base)
2448 {
2449   const uchar *cur = base;
2450   if (! ISIDST (*cur))
2451     return false;
2452   unsigned int hash = HT_HASHSTEP (0, *cur);
2453   ++cur;
2454   while (ISIDNUM (*cur))
2455     {
2456       hash = HT_HASHSTEP (hash, *cur);
2457       ++cur;
2458     }
2459   hash = HT_HASHFINISH (hash, cur - base);
2460
2461   cpp_hashnode *result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
2462                                         base, cur - base, hash, HT_NO_INSERT));
2463
2464   return result && cpp_macro_p (result);
2465 }
2466
2467 /* Returns true if a literal suffix does not have the expected form
2468    and is defined as a macro.  */
2469
2470 static bool
2471 is_macro_not_literal_suffix(cpp_reader *pfile, const uchar *base)
2472 {
2473   /* User-defined literals outside of namespace std must start with a single
2474      underscore, so assume anything of that form really is a UDL suffix.
2475      We don't need to worry about UDLs defined inside namespace std because
2476      their names are reserved, so cannot be used as macro names in valid
2477      programs.  */
2478   if (base[0] == '_' && base[1] != '_')
2479     return false;
2480   return is_macro (pfile, base);
2481 }
2482
2483 /* Lexes a raw string.  The stored string contains the spelling,
2484    including double quotes, delimiter string, '(' and ')', any leading
2485    'L', 'u', 'U' or 'u8' and 'R' modifier.  The created token contains
2486    the type of the literal, or CPP_OTHER if it was not properly
2487    terminated.
2488
2489    BASE is the start of the token.  Updates pfile->buffer->cur to just
2490    after the lexed string.
2491
2492    The spelling is NUL-terminated, but it is not guaranteed that this
2493    is the first NUL since embedded NULs are preserved.  */
2494
2495 static void
2496 lex_raw_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
2497 {
2498   const uchar *pos = base;
2499   const bool warn_bidi_p = pfile->warn_bidi_p ();
2500   const bool warn_invalid_utf8_p = CPP_OPTION (pfile, cpp_warn_invalid_utf8);
2501   const bool warn_bidi_or_invalid_utf8_p = warn_bidi_p | warn_invalid_utf8_p;
2502
2503   /* 'tis a pity this information isn't passed down from the lexer's
2504      initial categorization of the token.  */
2505   enum cpp_ttype type = CPP_STRING;
2506
2507   if (*pos == 'L')
2508     {
2509       type = CPP_WSTRING;
2510       pos++;
2511     }
2512   else if (*pos == 'U')
2513     {
2514       type = CPP_STRING32;
2515       pos++;
2516     }
2517   else if (*pos == 'u')
2518     {
2519       if (pos[1] == '8')
2520         {
2521           type = CPP_UTF8STRING;
2522           pos++;
2523         }
2524       else
2525         type = CPP_STRING16;
2526       pos++;
2527     }
2528
2529   gcc_checking_assert (pos[0] == 'R' && pos[1] == '"');
2530   pos += 2;
2531
2532   _cpp_line_note *note = &pfile->buffer->notes[pfile->buffer->cur_note];
2533
2534   /* Skip notes before the ".  */
2535   while (note->pos < pos)
2536     ++note;
2537
2538   lit_accum accum;
2539
2540   uchar prefix[17];
2541   unsigned prefix_len = 0;
2542   enum Phase
2543   {
2544    PHASE_PREFIX = -2,
2545    PHASE_NONE = -1,
2546    PHASE_SUFFIX = 0
2547   } phase = PHASE_PREFIX;
2548
2549   for (;;)
2550     {
2551       gcc_checking_assert (note->pos >= pos);
2552
2553       /* Undo any escaped newlines and trigraphs.  */
2554       if (!accum.reading_p () && note->pos == pos)
2555         switch (note->type)
2556           {
2557           case '\\':
2558           case ' ':
2559             /* Restore backslash followed by newline.  */
2560             accum.append (pfile, base, pos - base);
2561             base = pos;
2562             accum.read_begin (pfile);
2563             accum.append (pfile, UC"\\", 1);
2564
2565           after_backslash:
2566             if (note->type == ' ')
2567               /* GNU backslash whitespace newline extension.  FIXME
2568                  could be any sequence of non-vertical space.  When we
2569                  can properly restore any such sequence, we should
2570                  mark this note as handled so _cpp_process_line_notes
2571                  doesn't warn.  */
2572               accum.append (pfile, UC" ", 1);
2573
2574             accum.append (pfile, UC"\n", 1);
2575             note++;
2576             break;
2577
2578           case '\n':
2579             /* This can happen for ??/<NEWLINE> when trigraphs are not
2580                being interpretted.  */
2581             gcc_checking_assert (!CPP_OPTION (pfile, trigraphs));
2582             note->type = 0;
2583             note++;
2584             break;
2585
2586           default:
2587             gcc_checking_assert (_cpp_trigraph_map[note->type]);
2588
2589             /* Don't warn about this trigraph in
2590                _cpp_process_line_notes, since trigraphs show up as
2591                trigraphs in raw strings.  */
2592             uchar type = note->type;
2593             note->type = 0;
2594
2595             if (CPP_OPTION (pfile, trigraphs))
2596               {
2597                 accum.append (pfile, base, pos - base);
2598                 base = pos;
2599                 accum.read_begin (pfile);
2600                 accum.append (pfile, UC"??", 2);
2601                 accum.append (pfile, &type, 1);
2602
2603                 /* ??/ followed by newline gets two line notes, one for
2604                    the trigraph and one for the backslash/newline.  */
2605                 if (type == '/' && note[1].pos == pos)
2606                   {
2607                     note++;
2608                     gcc_assert (note->type == '\\' || note->type == ' ');
2609                     goto after_backslash;
2610                   }
2611                 /* Skip the replacement character.  */
2612                 base = ++pos;
2613               }
2614
2615             note++;
2616             break;
2617           }
2618
2619       /* Now get a char to process.  Either from an expanded note, or
2620          from the line buffer.  */
2621       bool read_note = accum.reading_p ();
2622       char c = read_note ? accum.read_char () : *pos++;
2623
2624       if (phase == PHASE_PREFIX)
2625         {
2626           if (c == '(')
2627             {
2628               /* Done.  */
2629               phase = PHASE_NONE;
2630               prefix[prefix_len++] = '"';
2631             }
2632           else if (prefix_len < 16
2633                    /* Prefix chars are any of the basic character set,
2634                       [lex.charset] except for '
2635                       ()\\\t\v\f\n'. Optimized for a contiguous
2636                       alphabet.  */
2637                    /* Unlike a switch, this collapses down to one or
2638                       two shift and bitmask operations on an ASCII
2639                       system, with an outlier or two.   */
2640                    && (('Z' - 'A' == 25
2641                         ? ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'))
2642                         : ISIDST (c))
2643                        || (c >= '0' && c <= '9')
2644                        || c == '_' || c == '{' || c == '}'
2645                        || c == '[' || c == ']' || c == '#'
2646                        || c == '<' || c == '>' || c == '%'
2647                        || c == ':' || c == ';' || c == '.' || c == '?'
2648                        || c == '*' || c == '+' || c == '-' || c == '/'
2649                        || c == '^' || c == '&' || c == '|' || c == '~'
2650                        || c == '!' || c == '=' || c == ','
2651                        || c == '"' || c == '\''))
2652             prefix[prefix_len++] = c;
2653           else
2654             {
2655               /* Something is wrong.  */
2656               int col = CPP_BUF_COLUMN (pfile->buffer, pos) + read_note;
2657               if (prefix_len == 16)
2658                 cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
2659                                      col, "raw string delimiter longer "
2660                                      "than 16 characters");
2661               else if (c == '\n')
2662                 cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
2663                                      col, "invalid new-line in raw "
2664                                      "string delimiter");
2665               else
2666                 cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
2667                                      col, "invalid character '%c' in "
2668                                      "raw string delimiter", c);
2669               type = CPP_OTHER;
2670               phase = PHASE_NONE;
2671               /* Continue until we get a close quote, that's probably
2672                  the best failure mode.  */
2673               prefix_len = 0;
2674             }
2675           if (c != '\n')
2676             continue;
2677         }
2678
2679       if (phase != PHASE_NONE)
2680         {
2681           if (prefix[phase] != c)
2682             phase = PHASE_NONE;
2683           else if (unsigned (phase + 1) == prefix_len)
2684             break;
2685           else
2686             {
2687               phase = Phase (phase + 1);
2688               continue;
2689             }
2690         }
2691
2692       if (!prefix_len && c == '"')
2693         /* Failure mode lexing.  */
2694         goto out;
2695       else if (prefix_len && c == ')')
2696         phase = PHASE_SUFFIX;
2697       else if (!read_note && c == '\n')
2698         {
2699           pos--;
2700           pfile->buffer->cur = pos;
2701           if ((pfile->state.in_directive || pfile->state.parsing_args)
2702               && pfile->buffer->next_line >= pfile->buffer->rlimit)
2703             {
2704               cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, 0,
2705                                    "unterminated raw string");
2706               type = CPP_OTHER;
2707               goto out;
2708             }
2709
2710           accum.append (pfile, base, pos - base + 1);
2711           _cpp_process_line_notes (pfile, false);
2712
2713           if (pfile->buffer->next_line < pfile->buffer->rlimit)
2714             CPP_INCREMENT_LINE (pfile, 0);
2715           pfile->buffer->need_line = true;
2716
2717           if (!get_fresh_line_impl<true> (pfile))
2718             {
2719               /* We ran out of file and failed to get a line.  */
2720               location_t src_loc = token->src_loc;
2721               token->type = CPP_EOF;
2722               /* Tell the compiler the line number of the EOF token.  */
2723               token->src_loc = pfile->line_table->highest_line;
2724               token->flags = BOL;
2725               if (accum.first)
2726                 _cpp_release_buff (pfile, accum.first);
2727               cpp_error_with_line (pfile, CPP_DL_ERROR, src_loc, 0,
2728                                    "unterminated raw string");
2729
2730               /* Now pop the buffer that get_fresh_line_impl() did not.  Popping
2731                  is not safe if processing a directive, however this cannot
2732                  happen as we already checked above that a line would be
2733                  available, and get_fresh_line_impl() can't fail in this
2734                  case.  */
2735               gcc_assert (!pfile->state.in_directive);
2736               _cpp_pop_buffer (pfile);
2737
2738               return;
2739             }
2740
2741           pos = base = pfile->buffer->cur;
2742           note = &pfile->buffer->notes[pfile->buffer->cur_note];
2743         }
2744       else if (__builtin_expect ((unsigned char) c >= utf8_continuation, 0)
2745                && warn_bidi_or_invalid_utf8_p)
2746         pos = _cpp_handle_multibyte_utf8 (pfile, c, pos, warn_bidi_p,
2747                                           warn_invalid_utf8_p);
2748     }
2749
2750   if (warn_bidi_p)
2751     maybe_warn_bidi_on_close (pfile, pos);
2752
2753   if (CPP_OPTION (pfile, user_literals))
2754     {
2755       /* If a string format macro, say from inttypes.h, is placed touching
2756          a string literal it could be parsed as a C++11 user-defined string
2757          literal thus breaking the program.  */
2758       if (is_macro_not_literal_suffix (pfile, pos))
2759         {
2760           /* Raise a warning, but do not consume subsequent tokens.  */
2761           if (CPP_OPTION (pfile, warn_literal_suffix) && !pfile->state.skipping)
2762             cpp_warning_with_line (pfile, CPP_W_LITERAL_SUFFIX,
2763                                    token->src_loc, 0,
2764                                    "invalid suffix on literal; C++11 requires "
2765                                    "a space between literal and string macro");
2766         }
2767       /* Grab user defined literal suffix.  */
2768       else if (ISIDST (*pos))
2769         {
2770           type = cpp_userdef_string_add_type (type);
2771           ++pos;
2772
2773           while (ISIDNUM (*pos))
2774             ++pos;
2775         }
2776     }
2777
2778  out:
2779   pfile->buffer->cur = pos;
2780   if (!accum.accum)
2781     create_literal (pfile, token, base, pos - base, type);
2782   else
2783     {
2784       size_t extra_len = pos - base;
2785       uchar *dest = _cpp_unaligned_alloc (pfile, accum.accum + extra_len + 1);
2786
2787       token->type = type;
2788       token->val.str.len = accum.accum + extra_len;
2789       token->val.str.text = dest;
2790       for (_cpp_buff *buf = accum.first; buf; buf = buf->next)
2791         {
2792           size_t len = BUFF_FRONT (buf) - buf->base;
2793           memcpy (dest, buf->base, len);
2794           dest += len;
2795         }
2796       _cpp_release_buff (pfile, accum.first);
2797       memcpy (dest, base, extra_len);
2798       dest[extra_len] = '\0';
2799     }
2800 }
2801
2802 /* Lexes a string, character constant, or angle-bracketed header file
2803    name whose spelling starts at BASE.  The stored string contains the
2804    spelling, including opening quote and any leading 'L', 'u', 'U' or
2805    'u8'; a literal with an 'R' modifier is handed to lex_raw_string.
2806    The literal is created with its own type, or CPP_OTHER if it was not
2807    properly terminated.  An unterminated header name instead sets
2808    TOKEN's type to CPP_LESS and returns without moving the buffer
2809    position, so that it can be relexed as normal tokens.  The spelling
2810    is NUL-terminated, but embedded NULs are preserved before that.  */
2811 static void
2812 lex_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
2813 {
2814   bool saw_NUL = false;
2815   const uchar *cur;
2816   cppchar_t terminator;
2817   enum cpp_ttype type;
2818
       /* Step over any encoding prefix ('L', 'U', 'u' or 'u8') so that
          TERMINATOR holds the character that follows it.  */
2819   cur = base;
2820   terminator = *cur++;
2821   if (terminator == 'L' || terminator == 'U')
2822     terminator = *cur++;
2823   else if (terminator == 'u')
2824     {
2825       terminator = *cur++;
2826       if (terminator == '8')
2827         terminator = *cur++;
2828     }
2829   if (terminator == 'R')
2830     {
2831       lex_raw_string (pfile, token, base);
2832       return;
2833     }
       /* Choose the token type from the opening delimiter and the prefix
          at BASE.  Anything that is neither kind of quote is lexed as a
          header name closed by '>'.  */
2834   if (terminator == '"')
2835     type = (*base == 'L' ? CPP_WSTRING :
2836             *base == 'U' ? CPP_STRING32 :
2837             *base == 'u' ? (base[1] == '8' ? CPP_UTF8STRING : CPP_STRING16)
2838                          : CPP_STRING);
2839   else if (terminator == '\'')
2840     type = (*base == 'L' ? CPP_WCHAR :
2841             *base == 'U' ? CPP_CHAR32 :
2842             *base == 'u' ? (base[1] == '8' ? CPP_UTF8CHAR : CPP_CHAR16)
2843                          : CPP_CHAR);
2844   else
2845     terminator = '>', type = CPP_HEADER_NAME;
2846
2847   const bool warn_bidi_p = pfile->warn_bidi_p ();
2848   const bool warn_invalid_utf8_p = CPP_OPTION (pfile, cpp_warn_invalid_utf8);
2849   const bool warn_bidi_or_invalid_utf8_p = warn_bidi_p | warn_invalid_utf8_p;
       /* Scan forward until the closing delimiter or the end of the line.  */
2850   for (;;)
2851     {
2852       cppchar_t c = *cur++;
2853
2854       /* In #include-style directives, terminators are not escapable.  */
2855       if (c == '\\' && !pfile->state.angled_headers && *cur != '\n')
2856         {
2857           if ((cur[0] == 'u' || cur[0] == 'U' || cur[0] == 'N') && warn_bidi_p)
2858             {
2859               location_t loc;
2860               bidi::kind kind;
2861               if (cur[0] == 'N')
2862                 kind = get_bidi_named (pfile, cur + 1, &loc);
2863               else
2864                 kind = get_bidi_ucn (pfile, cur + 1, cur[0] == 'U', &loc);
2865               maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/true, loc);
2866             }
               /* Skip the escaped character so that it cannot end the
                  literal.  */
2867           cur++;
2868         }
2869       else if (c == terminator)
2870         {
2871           if (warn_bidi_p)
2872             maybe_warn_bidi_on_close (pfile, cur - 1);
2873           break;
2874         }
2875       else if (c == '\n')
2876         {
               /* Leave the newline itself unconsumed.  */
2877           cur--;
2878           /* Unmatched quotes always yield undefined behavior, but
2879              greedy lexing means that what appears to be an unterminated
2880              header name may actually be a legitimate sequence of tokens.  */
2881           if (terminator == '>')
2882             {
2883               token->type = CPP_LESS;
2884               return;
2885             }
2886           type = CPP_OTHER;
2887           break;
2888         }
2889       else if (c == '\0')
2890         saw_NUL = true;
2891       else if (__builtin_expect (c >= utf8_continuation, 0)
2892                && warn_bidi_or_invalid_utf8_p)
2893         cur = _cpp_handle_multibyte_utf8 (pfile, c, cur, warn_bidi_p,
2894                                           warn_invalid_utf8_p);
2895     }
2896
2897   if (saw_NUL && !pfile->state.skipping)
2898     cpp_error (pfile, CPP_DL_WARNING,
2899                "null character(s) preserved in literal");
2900
       /* An unterminated literal is diagnosed for every language except
          assembler.  */
2901   if (type == CPP_OTHER && CPP_OPTION (pfile, lang) != CLK_ASM)
2902     cpp_error (pfile, CPP_DL_PEDWARN, "missing terminating %c character",
2903                (int) terminator);
2904
2905   if (CPP_OPTION (pfile, user_literals))
2906     {
2907       /* If a string format macro, say from inttypes.h, is placed touching
2908          a string literal it could be parsed as a C++11 user-defined string
2909          literal thus breaking the program.  */
2910       if (is_macro_not_literal_suffix (pfile, cur))
2911         {
2912           /* Raise a warning, but do not consume subsequent tokens.  */
2913           if (CPP_OPTION (pfile, warn_literal_suffix) && !pfile->state.skipping)
2914             cpp_warning_with_line (pfile, CPP_W_LITERAL_SUFFIX,
2915                                    token->src_loc, 0,
2916                                    "invalid suffix on literal; C++11 requires "
2917                                    "a space between literal and string macro");
2918         }
2919       /* Grab user defined literal suffix.  */
2920       else if (ISIDST (*cur))
2921         {
               /* NOTE(review): both helpers are applied unconditionally;
                  presumably each one only alters the matching kind of
                  literal -- confirm against their definitions.  */
2922           type = cpp_userdef_char_add_type (type);
2923           type = cpp_userdef_string_add_type (type);
2924           ++cur;
2925
2926           while (ISIDNUM (*cur))
2927             ++cur;
2928         }
2929     }
2930   else if (CPP_OPTION (pfile, cpp_warn_cxx11_compat)
2931            && is_macro (pfile, cur)
2932            && !pfile->state.skipping)
2933     cpp_warning_with_line (pfile, CPP_W_CXX11_COMPAT,
2934                            token->src_loc, 0, "C++11 requires a space "
2935                            "between string literal and macro");
2936
       /* Commit the new buffer position and create the literal from the
          bytes between BASE and CUR.  */
2937   pfile->buffer->cur = cur;
2938   create_literal (pfile, token, base, cur - base, type);
2939 }
2940
2941 /* Return PFILE's comment table.  Callers must not assume any ordering.  */
2942 cpp_comment_table *
2943 cpp_get_comments (cpp_reader *pfile)
2944 {
2945   cpp_comment_table *table = &pfile->comments;
2946   return table;
2947 }
2948
2949 /* Record TOKEN's text and source location as a new entry at the end
2950    of PFILE's comment table, enlarging the table first if needed.  */
2951 static void
2952 store_comment (cpp_reader *pfile, cpp_token *token)
2953 {
2954   /* First use: start the table off with room for 256 entries.  */
2955   if (pfile->comments.allocated == 0)
2956     {
2957       pfile->comments.allocated = 256;
2958       pfile->comments.entries
2959         = (cpp_comment *) xmalloc (pfile->comments.allocated
2960                                    * sizeof (cpp_comment));
2961     }
2962
2963   /* Every slot is in use: double the capacity.  */
2964   if (pfile->comments.count == pfile->comments.allocated)
2965     {
2966       pfile->comments.allocated *= 2;
2967       pfile->comments.entries
2968         = (cpp_comment *) xrealloc (pfile->comments.entries,
2969                                     pfile->comments.allocated
2970                                     * sizeof (cpp_comment));
2971     }
2972
2973   cpp_comment *slot = &pfile->comments.entries[pfile->comments.count];
2974   int text_len = token->val.str.len;
2975
2976   /* Take a private copy of the spelling and terminate it ourselves,
2977      since the token text may not be NUL-terminated.  */
2978   slot->comment = (char *) xmalloc (sizeof (char) * (text_len + 1));
2979   memcpy (slot->comment, token->val.str.text, text_len);
2980   slot->comment[text_len] = '\0';
2981   slot->sloc = token->src_loc;
2982
2983   pfile->comments.count++;
2984 }
2985
2986 /* Make TOKEN a CPP_COMMENT holding the comment that ends at the current
2987    buffer position.  FROM points just past the comment's initial '/'; TYPE
2988    is '/' for a C++ comment.  The text keeps its start and any terminator.  */
2989 static void
2990 save_comment (cpp_reader *pfile, cpp_token *token, const unsigned char *from,
2991               cppchar_t type)
2992 {
2993   unsigned int len = pfile->buffer->cur - from + 1; /* + 1 for the '/'.  */
2994
2995   /* A C++ comment has probably (not definitely) been lexed past its
2996      new line, which does not belong in the saved text.  */
2997   if (is_vspace (pfile->buffer->cur[-1]))
2998     len--;
2999
3000   /* In a directive or during argument parsing, C++ comments are stored
3001      as C comments internally, which takes two extra characters.  The
3002      only directive seen here is a "#define" that is saving comments.  */
3003   const bool convert_p = ((pfile->state.in_directive
3004                            || pfile->state.parsing_args)
3005                           && type == '/');
3006   const unsigned int clen = convert_p ? len + 2 : len;
3007   unsigned char *const buffer = _cpp_unaligned_alloc (pfile, clen);
3008
3009   token->type = CPP_COMMENT;
3010   token->val.str.len = clen;
3011   token->val.str.text = buffer;
3012
3013   buffer[0] = '/';
3014   memcpy (buffer + 1, from, len - 1);
3015
3016   if (convert_p)
3017     {
3018       buffer[1] = '*';
3019       buffer[clen - 2] = '*';
3020       buffer[clen - 1] = '/';
3021       /* Filter out sequences that a C++ comment may contain but that
3022          are illegal inside a C comment.  */
3023       for (unsigned int i = 2; i < clen - 2; i++)
3024         {
3025           if (buffer[i] != '/')
3026             continue;
3027           if (buffer[i - 1] == '*' || buffer[i + 1] == '*')
3028             buffer[i] = '|';
3029         }
3030     }
3031
3032   /* Finally record the comment for use by clients of libcpp.  */
3033   store_comment (pfile, token);
3034 }
3035
3036 /* Returns true if comment at COMMENT_START is a recognized FALLTHROUGH
3037    comment for the current -Wimplicit-fallthrough level.  *COMMENT_START
        is '*' for a C block comment; anything else is treated as a C++ line
        comment.  Length checks treat PFILE->buffer->cur as the end of the
        comment.  */
3038
3039 static bool
3040 fallthrough_comment_p (cpp_reader *pfile, const unsigned char *comment_start)
3041 {
       /* FROM walks over the comment text, starting one character past
          COMMENT_START.  */
3042   const unsigned char *from = comment_start + 1;
3043
3044   switch (CPP_OPTION (pfile, cpp_warn_implicit_fallthrough))
3045     {
3046       /* For both -Wimplicit-fallthrough=0 and -Wimplicit-fallthrough=5 we
3047          don't recognize any comments.  The latter only checks attributes,
3048          the former doesn't warn.  */
3049     case 0:
3050     default:
3051       return false;
3052       /* -Wimplicit-fallthrough=1 considers any comment, no matter what
3053          content it has.  */
3054     case 1:
3055       return true;
3056     case 2:
3057       /* -Wimplicit-fallthrough=2 looks for (case insensitive)
3058          .*falls?[ \t-]*thr(u|ough).* regex.  */
3059       for (; (size_t) (pfile->buffer->cur - from) >= sizeof "fallthru" - 1;
3060            from++)
3061         {
3062           /* Is there anything like strpbrk with upper boundary, or
3063              memchr looking for 2 characters rather than just one?  */
3064           if (from[0] != 'f' && from[0] != 'F')
3065             continue;
3066           if (from[1] != 'a' && from[1] != 'A')
3067             continue;
3068           if (from[2] != 'l' && from[2] != 'L')
3069             continue;
3070           if (from[3] != 'l' && from[3] != 'L')
3071             continue;
               /* FROM is advanced while matching, so after a failed match
                  below the scan resumes beyond the text already examined.  */
3072           from += sizeof "fall" - 1;
3073           if (from[0] == 's' || from[0] == 'S')
3074             from++;
3075           while (*from == ' ' || *from == '\t' || *from == '-')
3076             from++;
3077           if (from[0] != 't' && from[0] != 'T')
3078             continue;
3079           if (from[1] != 'h' && from[1] != 'H')
3080             continue;
3081           if (from[2] != 'r' && from[2] != 'R')
3082             continue;
3083           if (from[3] == 'u' || from[3] == 'U')
3084             return true;
3085           if (from[3] != 'o' && from[3] != 'O')
3086             continue;
3087           if (from[4] != 'u' && from[4] != 'U')
3088             continue;
3089           if (from[5] != 'g' && from[5] != 'G')
3090             continue;
3091           if (from[6] != 'h' && from[6] != 'H')
3092             continue;
3093           return true;
3094         }
3095       return false;
           /* Levels 3 and 4 match the whole comment against the patterns
              that follow the switch.  */
3096     case 3:
3097     case 4:
3098       break;
3099     }
3100
3101   /* Whole comment contents:
3102      -fallthrough
3103      @fallthrough@
3104    */
3105   if (*from == '-' || *from == '@')
3106     {
3107       size_t len = sizeof "fallthrough" - 1;
3108       if ((size_t) (pfile->buffer->cur - from - 1) < len)
3109         return false;
3110       if (memcmp (from + 1, "fallthrough", len))
3111         return false;
3112       if (*from == '@')
3113         {
3114           if (from[len + 1] != '@')
3115             return false;
               /* Count the closing '@' as part of the match.  */
3116           len++;
3117         }
3118       from += 1 + len;
3119     }
3120   /* Whole comment contents (regex):
3121      lint -fallthrough[ \t]*
3122    */
3123   else if (*from == 'l')
3124     {
3125       size_t len = sizeof "int -fallthrough" - 1;
3126       if ((size_t) (pfile->buffer->cur - from - 1) < len)
3127         return false;
3128       if (memcmp (from + 1, "int -fallthrough", len))
3129         return false;
3130       from += 1 + len;
3131       while (*from == ' ' || *from == '\t')
3132         from++;
3133     }
3134   /* Whole comment contents (regex):
3135      [ \t]*FALLTHR(U|OUGH)[ \t]*
3136    */
3137   else if (CPP_OPTION (pfile, cpp_warn_implicit_fallthrough) == 4)
3138     {
3139       while (*from == ' ' || *from == '\t')
3140         from++;
3141       if ((size_t) (pfile->buffer->cur - from)  < sizeof "FALLTHRU" - 1)
3142         return false;
3143       if (memcmp (from, "FALLTHR", sizeof "FALLTHR" - 1))
3144         return false;
3145       from += sizeof "FALLTHR" - 1;
3146       if (*from == 'U')
3147         from++;
3148       else if ((size_t) (pfile->buffer->cur - from)  < sizeof "OUGH" - 1)
3149         return false;
3150       else if (memcmp (from, "OUGH", sizeof "OUGH" - 1))
3151         return false;
3152       else
3153         from += sizeof "OUGH" - 1;
3154       while (*from == ' ' || *from == '\t')
3155         from++;
3156     }
3157   /* Whole comment contents (regex):
3158      [ \t.!]*(ELSE,? |INTENTIONAL(LY)? )?FALL(S | |-)?THR(OUGH|U)[ \t.!]*(-[^\n\r]*)?
3159      [ \t.!]*(Else,? |Intentional(ly)? )?Fall((s | |-)[Tt]|t)hr(ough|u)[ \t.!]*(-[^\n\r]*)?
3160      [ \t.!]*([Ee]lse,? |[Ii]ntentional(ly)? )?fall(s | |-)?thr(ough|u)[ \t.!]*(-[^\n\r]*)?
3161    */
3162   else
3163     {
3164       while (*from == ' ' || *from == '\t' || *from == '.' || *from == '!')
3165         from++;
           /* F holds the first letter of the word being matched.  ALL_UPPER
              is set once a word has matched in capitals, after which the
              remaining words must be spelled in capitals as well.  */
3166       unsigned char f = *from;
3167       bool all_upper = false;
3168       if (f == 'E' || f == 'e')
3169         {
3170           if ((size_t) (pfile->buffer->cur - from)
3171               < sizeof "else fallthru" - 1)
3172             return false;
3173           if (f == 'E' && memcmp (from + 1, "LSE", sizeof "LSE" - 1) == 0)
3174             all_upper = true;
3175           else if (memcmp (from + 1, "lse", sizeof "lse" - 1))
3176             return false;
3177           from += sizeof "else" - 1;
3178           if (*from == ',')
3179             from++;
3180           if (*from != ' ')
3181             return false;
3182           from++;
3183           if (all_upper && *from == 'f')
3184             return false;
3185           if (f == 'e' && *from == 'F')
3186             return false;
3187           f = *from;
3188         }
3189       else if (f == 'I' || f == 'i')
3190         {
3191           if ((size_t) (pfile->buffer->cur - from)
3192               < sizeof "intentional fallthru" - 1)
3193             return false;
3194           if (f == 'I' && memcmp (from + 1, "NTENTIONAL",
3195                                   sizeof "NTENTIONAL" - 1) == 0)
3196             all_upper = true;
3197           else if (memcmp (from + 1, "ntentional",
3198                            sizeof "ntentional" - 1))
3199             return false;
3200           from += sizeof "intentional" - 1;
3201           if (*from == ' ')
3202             {
3203               from++;
3204               if (all_upper && *from == 'f')
3205                 return false;
3206             }
3207           else if (all_upper)
3208             {
3209               if (memcmp (from, "LY F", sizeof "LY F" - 1))
3210                 return false;
3211               from += sizeof "LY " - 1;
3212             }
3213           else
3214             {
3215               if (memcmp (from, "ly ", sizeof "ly " - 1))
3216                 return false;
3217               from += sizeof "ly " - 1;
3218             }
3219           if (f == 'i' && *from == 'F')
3220             return false;
3221           f = *from;
3222         }
3223       if (f != 'F' && f != 'f')
3224         return false;
3225       if ((size_t) (pfile->buffer->cur - from) < sizeof "fallthru" - 1)
3226         return false;
3227       if (f == 'F' && memcmp (from + 1, "ALL", sizeof "ALL" - 1) == 0)
3228         all_upper = true;
3229       else if (all_upper)
3230         return false;
3231       else if (memcmp (from + 1, "all", sizeof "all" - 1))
3232         return false;
3233       from += sizeof "fall" - 1;
3234       if (*from == (all_upper ? 'S' : 's') && from[1] == ' ')
3235         from += 2;
3236       else if (*from == ' ' || *from == '-')
3237         from++;
3238       else if (*from != (all_upper ? 'T' : 't'))
3239         return false;
3240       if ((f == 'f' || *from != 'T') && (all_upper || *from != 't'))
3241         return false;
3242       if ((size_t) (pfile->buffer->cur - from) < sizeof "thru" - 1)
3243         return false;
3244       if (memcmp (from + 1, all_upper ? "HRU" : "hru", sizeof "hru" - 1))
3245         {
3246           if ((size_t) (pfile->buffer->cur - from) < sizeof "through" - 1)
3247             return false;
3248           if (memcmp (from + 1, all_upper ? "HROUGH" : "hrough",
3249                       sizeof "hrough" - 1))
3250             return false;
3251           from += sizeof "through" - 1;
3252         }
3253       else
3254         from += sizeof "thru" - 1;
3255       while (*from == ' ' || *from == '\t' || *from == '.' || *from == '!')
3256         from++;
           /* Optional trailing "-..." text: skip ahead to the end of the
              comment or of the line, whichever comes first.  */
3257       if (*from == '-')
3258         {
3259           from++;
3260           if (*comment_start == '*')
3261             {
3262               do
3263                 {
3264                   while (*from && *from != '*'
3265                          && *from != '\n' && *from != '\r')
3266                     from++;
3267                   if (*from != '*' || from[1] == '/')
3268                     break;
3269                   from++;
3270                 }
3271               while (1);
3272             }
3273           else
3274             while (*from && *from != '\n' && *from != '\r')
3275               from++;
3276         }
3277     }
       /* Whatever matched must be followed directly by the end of the
          comment.  */
3278   /* C block comment.  */
3279   if (*comment_start == '*')
3280     {
3281       if (*from != '*' || from[1] != '/')
3282         return false;
3283     }
3284   /* C++ line comment.  */
3285   else if (*from != '\n')
3286     return false;
3287
3288   return true;
3289 }
3290
3291 /* Give RUN a freshly allocated array of COUNT tokens and no successor.  */
3292 void
3293 _cpp_init_tokenrun (tokenrun *run, unsigned int count)
3294 {
3295   run->next = NULL;
3296   run->base = XNEWVEC (cpp_token, count);
3297   run->limit = &run->base[count];
3298 }
3299
3300 /* Return the run after RUN, appending a new 250-token run if needed.  */
3301 static tokenrun *
3302 next_tokenrun (tokenrun *run)
3303 {
3304   if (run->next != NULL)
3305     return run->next;
3306
3307   tokenrun *fresh = XNEW (tokenrun);
3308   fresh->prev = run;
3309   _cpp_init_tokenrun (fresh, 250);
3310   run->next = fresh;
3311   return fresh;
3312 }
3313
3314 /* Count the tokens of CONTEXT that have not been processed yet.  */
3315 int
3316 _cpp_remaining_tokens_num_in_context (cpp_context *context)
3317 {
3318   if (context->tokens_kind == TOKENS_KIND_DIRECT)
3319     return LAST (context).token - FIRST (context).token;
3320
3321   /* The only other valid kinds are counted through ptoken.  */
3322   if (context->tokens_kind != TOKENS_KIND_INDIRECT
3323       && context->tokens_kind != TOKENS_KIND_EXTENDED)
3324     abort ();
3325   return LAST (context).ptoken - FIRST (context).ptoken;
3326 }
3327
3328 /* Fetch the token INDEX places ahead in CONTEXT; INDEX zero names the
3329    next token to be processed.  */
3330 static const cpp_token*
3331 _cpp_token_from_context_at (cpp_context *context, int index)
3332 {
3333   if (context->tokens_kind == TOKENS_KIND_DIRECT)
3334     return FIRST (context).token + index;
3335
3336   if (context->tokens_kind != TOKENS_KIND_INDIRECT
3337       && context->tokens_kind != TOKENS_KIND_EXTENDED)
3338     abort ();
3339   return FIRST (context).ptoken[index];
3340 }
3341
3342 /* Look ahead in the input stream without consuming anything.  INDEX
3343    zero names the next token to be processed.  If end of file or a
3344    pragma is lexed before INDEX is reached, that token is returned
3345    instead.  */
3346 const cpp_token *
3347 cpp_peek_token (cpp_reader *pfile, int index)
3348 {
3349   /* First, scan through any pending cpp_context objects; the outermost
3350      one (the one without a PREV) is never searched.  */
3351   for (cpp_context *ctx = pfile->context; ctx->prev; ctx = ctx->prev)
3352     {
3353       const int avail = _cpp_remaining_tokens_num_in_context (ctx);
3354       if (index < avail)
3355         return _cpp_token_from_context_at (ctx, index);
3356       index -= avail;
3357     }
3358
3359   /* Some new tokens have to be lexed after all, and without
3360      invalidating the tokens that precede them.  */
3361   const int wanted = index;
3362   pfile->keep_tokens++;
3363
3364   /* For peeked tokens temporarily disable line_change reporting,
3365      until the tokens are parsed for real.  */
3366   void (*saved_line_change) (cpp_reader *, const cpp_token *, int)
3367     = pfile->cb.line_change;
3368   pfile->cb.line_change = NULL;
3369
3370   const cpp_token *peektok;
3371   for (;;)
3372     {
3373       peektok = _cpp_lex_token (pfile);
3374
3375       /* Don't peek past end of file or a pragma.  */
3376       if (peektok->type == CPP_EOF || peektok->type == CPP_PRAGMA)
3377         {
3378           /* Save the pragma in the buffer.  */
3379           if (peektok->type == CPP_PRAGMA
3380               && peektok == &pfile->directive_result)
3381             *pfile->cur_token++ = *peektok;
3382           index--;
3383           break;
3384         }
3385
3386       /* Stop once INDEX further tokens have been skipped.  */
3387       if (index-- == 0)
3388         break;
3389     }
3390
3391   /* Back up over the WANTED - INDEX tokens that were lexed above.  */
3392   _cpp_backup_tokens_direct (pfile, wanted - index);
3393   pfile->keep_tokens--;
3394   pfile->cb.line_change = saved_line_change;
3395
3396   return peektok;
3397 }
3398
3399 /* Allocate a single token that is invalidated at the same time as the
3400    rest of the tokens on the line.  Has its line and col set to the
3401    same as the last lexed token, so that diagnostics appear in the
3402    right place.  Returns the new token; only its src_loc is set here.  */
3403 cpp_token *
3404 _cpp_temp_token (cpp_reader *pfile)
3405 {
3406   cpp_token *old, *result;
       /* SZ is the number of slots from cur_token to the end of the
          current run; LA is the number of lookahead tokens.  */
3407   ptrdiff_t sz = pfile->cur_run->limit - pfile->cur_token;
3408   ptrdiff_t la = (ptrdiff_t) pfile->lookaheads;
3409
       /* OLD is the token just before cur_token; its location is copied
          into the result below.  */
3410   old = pfile->cur_token - 1;
3411   /* Any pre-existing lookaheads must not be clobbered.  */
3412   if (la)
3413     {
           /* The lookaheads reach (or run past) the end of the current
              run, so moving them up by one slot spills into the next
              run.  */
3414       if (sz <= la)
3415         {
3416           tokenrun *next = next_tokenrun (pfile->cur_run);
3417
               /* LA - SZ tokens at the start of the next run move up by
                  one to free its first slot.  */
3418           if (sz < la)
3419             memmove (next->base + 1, next->base,
3420                      (la - sz) * sizeof (cpp_token));
3421
               /* The last slot of the current run is copied into the first
                  slot of the next run.  */
3422           next->base[0] = pfile->cur_run->limit[-1];
3423         }
3424
           /* Move up the tokens that stay within the current run.  */
3425       if (sz > 1)
3426         memmove (pfile->cur_token + 1, pfile->cur_token,
3427                  MIN (la, sz - 1) * sizeof (cpp_token));
3428     }
3429
       /* No room is left in the current run: the new token becomes the
          first slot of the next run.  */
3430   if (!sz && pfile->cur_token == pfile->cur_run->limit)
3431     {
3432       pfile->cur_run = next_tokenrun (pfile->cur_run);
3433       pfile->cur_token = pfile->cur_run->base;
3434     }
3435
       /* Claim the slot at cur_token and give it OLD's location.  */
3436   result = pfile->cur_token++;
3437   result->src_loc = old->src_loc;
3438   return result;
3439 }
3440
3441 /* We're at the beginning of a logical line (so not in directives-mode)
3442   and RESULT is a CPP_NAME with NODE_MODULE set.  Peek one or two tokens
3443   ahead: if the line starts [export] {module|import|__import} plus a valid
3444   follower, stay in deferred_pragma mode; otherwise restore the state.  */
3445
3446 static void
3447 cpp_maybe_module_directive (cpp_reader *pfile, cpp_token *result)
3448 {
3449   unsigned backup = 0; /* Tokens we peeked.  */
3450   cpp_hashnode *node = result->val.node.node; /* Node of the candidate keyword.  */
3451   cpp_token *peek = result; /* Most recently examined token.  */
3452   cpp_token *keyword = peek; /* RESULT, or the token after a leading 'export'.  */
3453   cpp_hashnode *(&n_modules)[spec_nodes::M_HWM][2] = pfile->spec_nodes.n_modules;
3454   int header_count = 0; /* Nonzero only for import/__import.  */
3455
3456   /* Make sure the incoming state is as we expect it.  This way we
3457      can restore it using constants.  */
3458   gcc_checking_assert (!pfile->state.in_deferred_pragma
3459                        && !pfile->state.skipping
3460                        && !pfile->state.parsing_args
3461                        && !pfile->state.angled_headers
3462                        && (pfile->state.save_comments
3463                            == !CPP_OPTION (pfile, discard_comments)));
3464
3465   /* Enter directives mode sufficiently for peeking.  We don't have
3466      to actually set in_directive.  */
3467   pfile->state.in_deferred_pragma = true;
3468
3469   /* These two fields are needed to process tokenization in deferred
3470      pragma mode.  They are not used outside deferred pragma mode or
3471      directives mode.  */
3472   pfile->state.pragma_allow_expansion = true;
3473   pfile->directive_line = result->src_loc;
3474
3475   /* Saving comments is incompatible with directives mode.   */
3476   pfile->state.save_comments = 0;
3477
3478   if (node == n_modules[spec_nodes::M_EXPORT][0])
3479     {
3480       peek = _cpp_lex_direct (pfile); /* 'export' must be followed by a module keyword.  */
3481       keyword = peek;
3482       backup++;
3483       if (keyword->type != CPP_NAME)
3484         goto not_module;
3485       node = keyword->val.node.node;
3486       if (!(node->flags & NODE_MODULE))
3487         goto not_module;
3488     }
3489
3490   if (node == n_modules[spec_nodes::M__IMPORT][0])
3491     /* __import.  NOTE(review): the +16 looks like a flag riding on the token count -- confirm against the reader of directive_file_token.  */
3492     header_count = backup + 2 + 16;
3493   else if (node == n_modules[spec_nodes::M_IMPORT][0])
3494     /* import: the same +16 is added only when the input is already preprocessed.  */
3495     header_count = backup + 2 + (CPP_OPTION (pfile, preprocessed) ? 16 : 0);
3496   else if (node == n_modules[spec_nodes::M_MODULE][0])
3497     ; /* module: header_count stays zero.  */
3498   else
3499     goto not_module;
3500
3501   /* We've seen [export] {module|import|__import}.  Check the next token.  */
3502   if (header_count)
3503     /* After '{,__}import' a header name may appear.  */
3504     pfile->state.angled_headers = true;
3505   peek = _cpp_lex_direct (pfile);
3506   backup++;
3507
3508   /* ... import followed by identifier, ':', '<' or
3509      header-name preprocessing tokens, or module
3510      followed by cpp-identifier, ':' or ';' preprocessing
3511      tokens.  C++ keywords are not yet relevant.  A CPP_STRING whose
3512      spelling begins with 'R' is not accepted after import.  */
3512   if (peek->type == CPP_NAME
3513       || peek->type == CPP_COLON
3514       ||  (header_count
3515            ? (peek->type == CPP_LESS
3516               || (peek->type == CPP_STRING && peek->val.str.text[0] != 'R')
3517               || peek->type == CPP_HEADER_NAME)
3518            : peek->type == CPP_SEMICOLON))
3519     {
3520       pfile->state.pragma_allow_expansion = !CPP_OPTION (pfile, preprocessed);
3521       if (!pfile->state.pragma_allow_expansion)
3522         pfile->state.prevent_expansion++;
3523
3524       if (!header_count && linemap_included_from
3525           (LINEMAPS_LAST_ORDINARY_MAP (pfile->line_table)))
3526         cpp_error_with_line (pfile, CPP_DL_ERROR, keyword->src_loc, 0,
3527                              "module control-line cannot be in included file");
3528
3529       /* The first one or two tokens cannot be macro names: visit KEYWORD (when 'export' preceded it), then RESULT.  */
3530       for (int ix = backup; ix--;)
3531         {
3532           cpp_token *tok = ix ? keyword : result;
3533           cpp_hashnode *node = tok->val.node.node; /* Shadows the outer NODE.  */
3534
3535           /* Don't attempt to expand the token.  */
3536           tok->flags |= NO_EXPAND;
3537           if (_cpp_defined_macro_p (node)
3538               && _cpp_maybe_notify_macro_use (pfile, node, tok->src_loc)
3539               && !cpp_fun_like_macro_p (node))
3540             cpp_error_with_line (pfile, CPP_DL_ERROR, tok->src_loc, 0,
3541                                  "module control-line \"%s\" cannot be"
3542                                  " an object-like macro",
3543                                  NODE_NAME (node));
3544         }
3545
3546       /* Map to underbar variants (the [1] column of n_modules).  */
3547       keyword->val.node.node = n_modules[header_count
3548                                          ? spec_nodes::M_IMPORT
3549                                          : spec_nodes::M_MODULE][1];
3550       if (backup != 1) /* Two tokens were peeked, so RESULT is 'export'.  */
3551         result->val.node.node = n_modules[spec_nodes::M_EXPORT][1];
3552
3553       /* Maybe tell the tokenizer we expect a header-name down the
3554          road.  */
3555       pfile->state.directive_file_token = header_count;
3556     }
3557   else
3558     {
3559     not_module:
3560       /* Drop out of directive mode.  */
3561       /* We asserted save_comments had this value upon entry.  */
3562       pfile->state.save_comments
3563         = !CPP_OPTION (pfile, discard_comments);
3564       pfile->state.in_deferred_pragma = false;
3565       /* Do not let this remain on.  */
3566       pfile->state.angled_headers = false;
3567     }
3568
3569   /* In either case we want to backup the peeked tokens.  */
3570   if (backup)
3571     {
3572       /* If we saw EOL, we should drop it, because this isn't a module
3573          control-line after all.  */
3574       bool eol = peek->type == CPP_PRAGMA_EOL;
3575       if (!eol || backup > 1)
3576         {
3577           /* Put the peeked tokens back.  */
3578           _cpp_backup_tokens_direct (pfile, backup);
3579           /* But if the last one was an EOL, forget it.  */
3580           if (eol)
3581             pfile->lookaheads--;
3582         }
3583     }
3584 }
3585
3586 /* Lex and return the next token (external interface), re-delivering a
3587    backed-up lookahead token if one is pending.  Takes care of directive
3588    handling, module control-lines, multiple include optimization and skipping.  */
3589 const cpp_token *
3590 _cpp_lex_token (cpp_reader *pfile)
3591 {
3592   cpp_token *result;
3593
3594   for (;;)
3595     {
3596       if (pfile->cur_token == pfile->cur_run->limit) /* Current run is full; move to the next one.  */
3597         {
3598           pfile->cur_run = next_tokenrun (pfile->cur_run);
3599           pfile->cur_token = pfile->cur_run->base;
3600         }
3601       /* We assume that the current token is somewhere in the current
3602          run.  */
3603       if (pfile->cur_token < pfile->cur_run->base
3604           || pfile->cur_token >= pfile->cur_run->limit)
3605         abort ();
3606
3607       if (pfile->lookaheads)
3608         {
3609           pfile->lookaheads--; /* Reuse a token that was lexed earlier and backed up.  */
3610           result = pfile->cur_token++;
3611         }
3612       else
3613         result = _cpp_lex_direct (pfile);
3614
3615       if (result->flags & BOL)
3616         {
3617           /* Is this a directive.  If _cpp_handle_directive returns
3618              false, it is an assembler #.  */
3619           if (result->type == CPP_HASH
3620               /* 6.10.3 p 11: Directives in a list of macro arguments
3621                  gives undefined behavior.  This implementation
3622                  handles the directive as normal.  */
3623               && pfile->state.parsing_args != 1)
3624             {
3625               if (_cpp_handle_directive (pfile, result->flags & PREV_WHITE))
3626                 {
3627                   if (pfile->directive_result.type == CPP_PADDING)
3628                     continue; /* The directive produced no token; lex again.  */
3629                   result = &pfile->directive_result;
3630                 }
3631             }
3632           else if (pfile->state.in_deferred_pragma)
3633             result = &pfile->directive_result;
3634           else if (result->type == CPP_NAME
3635                    && (result->val.node.node->flags & NODE_MODULE)
3636                    && !pfile->state.skipping
3637                    /* Unlike regular directives, we do not deal with
3638                       tokenizing module directives as macro arguments.
3639                       That's not permitted.  */
3640                    && !pfile->state.parsing_args)
3641             {
3642               /* P1857.  Before macro expansion, At start of logical
3643                  line ... */
3644               /* We don't have to consider lookaheads at this point.  */
3645               gcc_checking_assert (!pfile->lookaheads);
3646
3647               cpp_maybe_module_directive (pfile, result);
3648             }
3649
3650           if (pfile->cb.line_change && !pfile->state.skipping)
3651             pfile->cb.line_change (pfile, result, pfile->state.parsing_args);
3652         }
3653
3654       /* We don't skip tokens in directives.  */
3655       if (pfile->state.in_directive || pfile->state.in_deferred_pragma)
3656         break;
3657
3658       /* Outside a directive, invalidate controlling macros.  At file
3659          EOF, _cpp_lex_direct takes care of popping the buffer, so we never
3660          get here and MI optimization works.  */
3661       pfile->mi_valid = false;
3662
3663       if (!pfile->state.skipping || result->type == CPP_EOF)
3664         break; /* Otherwise the token is being skipped: loop for the next one.  */
3665     }
3666
3667   return result;
3668 }
3669
3670 /* Returns true if a fresh line has been loaded.  */
3671 template <bool lexing_raw_string>
3672 static bool
3673 get_fresh_line_impl (cpp_reader *pfile)
3674 {
3675   /* We can't get a new line until we leave the current directive, unless we
3676      are lexing a raw string, in which case it will be OK as long as we don't
3677      pop the current buffer.  */
3678   if (!lexing_raw_string && pfile->state.in_directive)
3679     return false;
3680
3681   for (;;)
3682     {
3683       cpp_buffer *buffer = pfile->buffer;
3684
3685       if (!buffer->need_line)
3686         return true;
3687
3688       if (buffer->next_line < buffer->rlimit)
3689         {
3690           _cpp_clean_line (pfile);
3691           return true;
3692         }
3693
3694       /* We can't change buffers until we leave the current directive.  */
3695       if (lexing_raw_string && pfile->state.in_directive)
3696         return false;
3697
3698       /* First, get out of parsing arguments state.  */
3699       if (pfile->state.parsing_args)
3700         return false;
3701
3702       /* End of buffer.  Non-empty files should end in a newline.  */
3703       if (buffer->buf != buffer->rlimit
3704           && buffer->next_line > buffer->rlimit
3705           && !buffer->from_stage3)
3706         {
3707           /* Clip to buffer size.  */
3708           buffer->next_line = buffer->rlimit;
3709         }
3710
3711       if (buffer->prev && !buffer->return_at_eof)
3712         _cpp_pop_buffer (pfile);
3713       else
3714         {
3715           /* End of translation.  Do not pop the buffer yet. Increment
3716              line number so that the EOF token is on a line of its own
3717              (_cpp_lex_direct doesn't increment in that case, because
3718              it's hard for it to distinguish this special case). */
3719           CPP_INCREMENT_LINE (pfile, 0);
3720           return false;
3721         }
3722     }
3723 }
3724
3725 bool
3726 _cpp_get_fresh_line (cpp_reader *pfile)
3727 {
3728   return get_fresh_line_impl<false> (pfile);
3729 }
3730
3731
/* Helper for one-or-two character operators.  Relies on `buffer' and
   `result' being in scope where it is used.  Sets result->type to
   ELSE_TYPE; then, if the character at buffer->cur equals CHAR,
   consumes that character and sets result->type to THEN_TYPE instead.
   The arguments are parenthesized so that compound expressions (for
   example a conditional expression passed as CHAR) expand as one
   operand.  */
#define IF_NEXT_IS(CHAR, THEN_TYPE, ELSE_TYPE)          \
  do                                                    \
    {                                                   \
      result->type = (ELSE_TYPE);                       \
      if (*buffer->cur == (CHAR))                       \
        {                                               \
          buffer->cur++;                                \
          result->type = (THEN_TYPE);                   \
        }                                               \
    }                                                   \
  while (0)
3740
3741 /* Lex a token into pfile->cur_token, which is also incremented, to
3742    get diagnostics pointing to the correct location.
3743
3744    Does not handle issues such as token lookahead, multiple-include
3745    optimization, directives, skipping etc.  This function is only
3746    suitable for use by _cpp_lex_token, and in special cases like
3747    lex_expansion_token which doesn't care for any of these issues.
3748
3749    When meeting a newline, returns CPP_EOF if parsing a directive,
3750    otherwise returns to the start of the token buffer if permissible.
3751    Returns a pointer to the lexed token.  */
3752 cpp_token *
3753 _cpp_lex_direct (cpp_reader *pfile)
3754 {
3755   cppchar_t c;
3756   cpp_buffer *buffer;
3757   const unsigned char *comment_start;
3758   bool fallthrough_comment = false; /* Set once a fallthrough comment has been seen in this call.  */
3759   cpp_token *result = pfile->cur_token++;
3760
3761  fresh_line: /* Also re-entered after a newline outside a deferred pragma.  */
3762   result->flags = 0;
3763   buffer = pfile->buffer;
3764   if (buffer->need_line)
3765     {
3766       if (pfile->state.in_deferred_pragma)
3767         {
3768           /* This can happen in cases like:
3769              #define loop(x) whatever
3770              #pragma omp loop
3771              where when trying to expand loop we need to peek
3772              next token after loop, but aren't still in_deferred_pragma
3773              mode but are in in_directive mode, so buffer->need_line
3774              is set, a CPP_EOF is peeked.  */
3775           result->type = CPP_PRAGMA_EOL;
3776           pfile->state.in_deferred_pragma = false;
3777           if (!pfile->state.pragma_allow_expansion)
3778             pfile->state.prevent_expansion--;
3779           return result;
3780         }
3781       if (!_cpp_get_fresh_line (pfile))
3782         {
3783           result->type = CPP_EOF;
3784           /* Not a real EOF in a directive or arg parsing -- we refuse
3785              to advance to the next file now, and will once we're out
3786              of those modes.  */
3787           if (!pfile->state.in_directive && !pfile->state.parsing_args)
3788             {
3789               /* Tell the compiler the line number of the EOF token.  */
3790               result->src_loc = pfile->line_table->highest_line;
3791               result->flags = BOL;
3792               /* Now pop the buffer that _cpp_get_fresh_line did not.  */
3793               _cpp_pop_buffer (pfile);
3794             }
3795           return result;
3796         }
3797       if (buffer != pfile->buffer) /* The buffer changed while fetching the line.  */
3798         fallthrough_comment = false;
3799       if (!pfile->keep_tokens)
3800         {
3801           pfile->cur_run = &pfile->base_run; /* Restart at the first slot of the base token run.  */
3802           result = pfile->base_run.base;
3803           pfile->cur_token = result + 1;
3804         }
3805       result->flags = BOL;
3806       if (pfile->state.parsing_args == 2)
3807         result->flags |= PREV_WHITE;
3808     }
3809   buffer = pfile->buffer;
3810  update_tokens_line: /* Re-entered after a comment that is not being saved.  */
3811   result->src_loc = pfile->line_table->highest_line;
3812
3813  skipped_white: /* Re-entered after horizontal whitespace.  */
3814   if (buffer->cur >= buffer->notes[buffer->cur_note].pos
3815       && !pfile->overlaid_buffer)
3816     {
3817       _cpp_process_line_notes (pfile, false);
3818       result->src_loc = pfile->line_table->highest_line;
3819     }
3820   c = *buffer->cur++;
3821
3822   if (pfile->forced_token_location)
3823     result->src_loc = pfile->forced_token_location;
3824   else
3825     result->src_loc = linemap_position_for_column (pfile->line_table,
3826                                           CPP_BUF_COLUMN (buffer, buffer->cur));
3827
3828   switch (c)
3829     {
3830     case ' ': case '\t': case '\f': case '\v': case '\0':
3831       result->flags |= PREV_WHITE;
3832       skip_whitespace (pfile, c);
3833       goto skipped_white;
3834
3835     case '\n':
3836       /* Increment the line, unless this is the last line ...  */
3837       if (buffer->cur < buffer->rlimit
3838           /* ... or this is a #include, (where _cpp_stack_file needs to
3839              unwind by one line) ...  */
3840           || (pfile->state.in_directive > 1
3841               /* ... except traditional-cpp increments this elsewhere.  */
3842               && !CPP_OPTION (pfile, traditional)))
3843         CPP_INCREMENT_LINE (pfile, 0);
3844       buffer->need_line = true;
3845       if (pfile->state.in_deferred_pragma)
3846         {
3847           /* Produce the PRAGMA_EOL on this line.  File reading
3848              ensures there is always a \n at end of the buffer, thus
3849              in a deferred pragma we always see CPP_PRAGMA_EOL before
3850              any CPP_EOF.  */
3851           result->type = CPP_PRAGMA_EOL;
3852           result->flags &= ~PREV_WHITE;
3853           pfile->state.in_deferred_pragma = false;
3854           if (!pfile->state.pragma_allow_expansion)
3855             pfile->state.prevent_expansion--;
3856           return result;
3857         }
3858       goto fresh_line;
3859
3860     case '0': case '1': case '2': case '3': case '4':
3861     case '5': case '6': case '7': case '8': case '9':
3862       {
3863         struct normalize_state nst = INITIAL_NORMALIZE_STATE;
3864         result->type = CPP_NUMBER;
3865         lex_number (pfile, &result->val.str, &nst);
3866         warn_about_normalization (pfile, result, &nst, false);
3867         break;
3868       }
3869
3870     case 'L':
3871     case 'u':
3872     case 'U':
3873     case 'R':
3874       /* 'L', 'u', 'U', 'u8' or 'R' may introduce wide characters,
3875          wide strings or raw strings.  */
3876       if (c == 'L' || CPP_OPTION (pfile, rliterals)
3877           || (c != 'R' && CPP_OPTION (pfile, uliterals)))
3878         {
3879           if ((*buffer->cur == '\'' && c != 'R')
3880               || *buffer->cur == '"'
3881               || (*buffer->cur == 'R'
3882                   && c != 'R'
3883                   && buffer->cur[1] == '"'
3884                   && CPP_OPTION (pfile, rliterals))
3885               || (*buffer->cur == '8'
3886                   && c == 'u'
3887                   && ((buffer->cur[1] == '"' || (buffer->cur[1] == '\''
3888                                 && CPP_OPTION (pfile, utf8_char_literals)))
3889                       || (buffer->cur[1] == 'R' && buffer->cur[2] == '"'
3890                           && CPP_OPTION (pfile, rliterals)))))
3891             {
3892               lex_string (pfile, result, buffer->cur - 1);
3893               break;
3894             }
3895         }
3896       /* Fall through: not a literal prefix here, so lex it as an identifier.  */
3897
3898     case '_':
3899     case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
3900     case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
3901     case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
3902     case 's': case 't':           case 'v': case 'w': case 'x': /* 'u' is handled above.  */
3903     case 'y': case 'z':
3904     case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
3905     case 'G': case 'H': case 'I': case 'J': case 'K': /* 'L' is handled above.  */
3906     case 'M': case 'N': case 'O': case 'P': case 'Q': /* 'R' is handled above.  */
3907     case 'S': case 'T':           case 'V': case 'W': case 'X': /* 'U' is handled above.  */
3908     case 'Y': case 'Z':
3909       result->type = CPP_NAME;
3910       {
3911         struct normalize_state nst = INITIAL_NORMALIZE_STATE;
3912         result->val.node.node = lex_identifier (pfile, buffer->cur - 1, false,
3913                                                 &nst,
3914                                                 &result->val.node.spelling);
3915         warn_about_normalization (pfile, result, &nst, true);
3916       }
3917
3918       /* Convert named operators to their proper types.  */
3919       if (result->val.node.node->flags & NODE_OPERATOR)
3920         {
3921           result->flags |= NAMED_OP;
3922           result->type = (enum cpp_ttype) result->val.node.node->directive_index;
3923         }
3924
3925       /* Signal FALLTHROUGH comment followed by another token.  */
3926       if (fallthrough_comment)
3927         result->flags |= PREV_FALLTHROUGH;
3928       break;
3929
3930     case '\'':
3931     case '"':
3932       lex_string (pfile, result, buffer->cur - 1);
3933       break;
3934
3935     case '/':
3936       /* A potential block or line comment.  */
3937       comment_start = buffer->cur;
3938       c = *buffer->cur;
3939
3940       if (c == '*')
3941         {
3942           if (_cpp_skip_block_comment (pfile))
3943             cpp_error (pfile, CPP_DL_ERROR, "unterminated comment");
3944         }
3945       else if (c == '/' && ! CPP_OPTION (pfile, traditional))
3946         {
3947           /* Don't warn for system headers.  */
3948           if (_cpp_in_system_header (pfile))
3949             ;
3950           /* Warn about comments if pedantically GNUC89, and not
3951              in system headers.  */
3952           else if (CPP_OPTION (pfile, lang) == CLK_GNUC89
3953                    && CPP_PEDANTIC (pfile)
3954                    && ! buffer->warned_cplusplus_comments)
3955             {
3956               if (cpp_error (pfile, CPP_DL_PEDWARN,
3957                              "C++ style comments are not allowed in ISO C90"))
3958                 cpp_error (pfile, CPP_DL_NOTE,
3959                            "(this will be reported only once per input file)");
3960               buffer->warned_cplusplus_comments = 1;
3961             }
3962           /* Or if specifically desired via -Wc90-c99-compat.  */
3963           else if (CPP_OPTION (pfile, cpp_warn_c90_c99_compat) > 0
3964                    && ! CPP_OPTION (pfile, cplusplus)
3965                    && ! buffer->warned_cplusplus_comments)
3966             {
3967               if (cpp_error (pfile, CPP_DL_WARNING,
3968                              "C++ style comments are incompatible with C90"))
3969                 cpp_error (pfile, CPP_DL_NOTE,
3970                            "(this will be reported only once per input file)");
3971               buffer->warned_cplusplus_comments = 1;
3972             }
3973           /* In C89/C94, C++ style comments are forbidden.  */
3974           else if ((CPP_OPTION (pfile, lang) == CLK_STDC89
3975                     || CPP_OPTION (pfile, lang) == CLK_STDC94))
3976             {
3977               /* But don't be confused about valid code such as
3978                  - // immediately followed by *,
3979                  - // in a preprocessing directive,
3980                  - // in an #if 0 block.  */
3981               if (buffer->cur[1] == '*'
3982                   || pfile->state.in_directive
3983                   || pfile->state.skipping)
3984                 {
3985                   result->type = CPP_DIV;
3986                   break;
3987                 }
3988               else if (! buffer->warned_cplusplus_comments)
3989                 {
3990                   if (cpp_error (pfile, CPP_DL_ERROR,
3991                                  "C++ style comments are not allowed in "
3992                                  "ISO C90"))
3993                     cpp_error (pfile, CPP_DL_NOTE,
3994                                "(this will be reported only once per input "
3995                                "file)");
3996                   buffer->warned_cplusplus_comments = 1;
3997                 }
3998             }
3999           if (skip_line_comment (pfile) && CPP_OPTION (pfile, warn_comments))
4000             cpp_warning (pfile, CPP_W_COMMENTS, "multi-line comment");
4001         }
4002       else if (c == '=')
4003         {
4004           buffer->cur++;
4005           result->type = CPP_DIV_EQ;
4006           break;
4007         }
4008       else
4009         {
4010           result->type = CPP_DIV;
4011           break;
4012         }
4013
4014       if (fallthrough_comment_p (pfile, comment_start)) /* Only reached when a comment was skipped above.  */
4015         fallthrough_comment = true;
4016
4017       if (pfile->cb.comment)
4018         {
4019           size_t len = pfile->buffer->cur - comment_start;
4020           pfile->cb.comment (pfile, result->src_loc, comment_start - 1,
4021                              len + 1);
4022         }
4023
4024       if (!pfile->state.save_comments)
4025         {
4026           result->flags |= PREV_WHITE;
4027           goto update_tokens_line; /* Comment discarded; lex the token that follows it.  */
4028         }
4029
4030       if (fallthrough_comment)
4031         result->flags |= PREV_FALLTHROUGH;
4032
4033       /* Save the comment as a token in its own right.  */
4034       save_comment (pfile, result, comment_start, c);
4035       break;
4036
4037     case '<':
4038       if (pfile->state.angled_headers)
4039         {
4040           lex_string (pfile, result, buffer->cur - 1);
4041           if (result->type != CPP_LESS) /* Otherwise fall back to lexing '<' as an operator.  */
4042             break;
4043         }
4044
4045       result->type = CPP_LESS;
4046       if (*buffer->cur == '=')
4047         {
4048           buffer->cur++, result->type = CPP_LESS_EQ;
4049           if (*buffer->cur == '>'
4050               && CPP_OPTION (pfile, cplusplus)
4051               && CPP_OPTION (pfile, lang) >= CLK_GNUCXX20)
4052             buffer->cur++, result->type = CPP_SPACESHIP;
4053         }
4054       else if (*buffer->cur == '<')
4055         {
4056           buffer->cur++;
4057           IF_NEXT_IS ('=', CPP_LSHIFT_EQ, CPP_LSHIFT);
4058         }
4059       else if (CPP_OPTION (pfile, digraphs))
4060         {
4061           if (*buffer->cur == ':')
4062             {
4063               /* C++11 [2.5/3 lex.pptoken], "Otherwise, if the next
4064                  three characters are <:: and the subsequent character
4065                  is neither : nor >, the < is treated as a preprocessor
4066                  token by itself".  */
4067               if (CPP_OPTION (pfile, cplusplus)
4068                   && CPP_OPTION (pfile, lang) != CLK_CXX98
4069                   && CPP_OPTION (pfile, lang) != CLK_GNUCXX
4070                   && buffer->cur[1] == ':'
4071                   && buffer->cur[2] != ':' && buffer->cur[2] != '>')
4072                 break;
4073
4074               buffer->cur++;
4075               result->flags |= DIGRAPH;
4076               result->type = CPP_OPEN_SQUARE;
4077             }
4078           else if (*buffer->cur == '%')
4079             {
4080               buffer->cur++;
4081               result->flags |= DIGRAPH;
4082               result->type = CPP_OPEN_BRACE;
4083             }
4084         }
4085       break;
4086
4087     case '>':
4088       result->type = CPP_GREATER;
4089       if (*buffer->cur == '=')
4090         buffer->cur++, result->type = CPP_GREATER_EQ;
4091       else if (*buffer->cur == '>')
4092         {
4093           buffer->cur++;
4094           IF_NEXT_IS ('=', CPP_RSHIFT_EQ, CPP_RSHIFT);
4095         }
4096       break;
4097
4098     case '%':
4099       result->type = CPP_MOD;
4100       if (*buffer->cur == '=')
4101         buffer->cur++, result->type = CPP_MOD_EQ;
4102       else if (CPP_OPTION (pfile, digraphs))
4103         {
4104           if (*buffer->cur == ':')
4105             {
4106               buffer->cur++;
4107               result->flags |= DIGRAPH;
4108               result->type = CPP_HASH;
4109               if (*buffer->cur == '%' && buffer->cur[1] == ':')
4110                 buffer->cur += 2, result->type = CPP_PASTE, result->val.token_no = 0;
4111             }
4112           else if (*buffer->cur == '>')
4113             {
4114               buffer->cur++;
4115               result->flags |= DIGRAPH;
4116               result->type = CPP_CLOSE_BRACE;
4117             }
4118         }
4119       break;
4120
4121     case '.':
4122       result->type = CPP_DOT;
4123       if (ISDIGIT (*buffer->cur))
4124         {
4125           struct normalize_state nst = INITIAL_NORMALIZE_STATE;
4126           result->type = CPP_NUMBER;
4127           lex_number (pfile, &result->val.str, &nst);
4128           warn_about_normalization (pfile, result, &nst, false);
4129         }
4130       else if (*buffer->cur == '.' && buffer->cur[1] == '.')
4131         buffer->cur += 2, result->type = CPP_ELLIPSIS;
4132       else if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
4133         buffer->cur++, result->type = CPP_DOT_STAR;
4134       break;
4135
4136     case '+':
4137       result->type = CPP_PLUS;
4138       if (*buffer->cur == '+')
4139         buffer->cur++, result->type = CPP_PLUS_PLUS;
4140       else if (*buffer->cur == '=')
4141         buffer->cur++, result->type = CPP_PLUS_EQ;
4142       break;
4143
4144     case '-':
4145       result->type = CPP_MINUS;
4146       if (*buffer->cur == '>')
4147         {
4148           buffer->cur++;
4149           result->type = CPP_DEREF;
4150           if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
4151             buffer->cur++, result->type = CPP_DEREF_STAR;
4152         }
4153       else if (*buffer->cur == '-')
4154         buffer->cur++, result->type = CPP_MINUS_MINUS;
4155       else if (*buffer->cur == '=')
4156         buffer->cur++, result->type = CPP_MINUS_EQ;
4157       break;
4158
4159     case '&':
4160       result->type = CPP_AND;
4161       if (*buffer->cur == '&')
4162         buffer->cur++, result->type = CPP_AND_AND;
4163       else if (*buffer->cur == '=')
4164         buffer->cur++, result->type = CPP_AND_EQ;
4165       break;
4166
4167     case '|':
4168       result->type = CPP_OR;
4169       if (*buffer->cur == '|')
4170         buffer->cur++, result->type = CPP_OR_OR;
4171       else if (*buffer->cur == '=')
4172         buffer->cur++, result->type = CPP_OR_EQ;
4173       break;
4174
4175     case ':':
4176       result->type = CPP_COLON;
4177       if (*buffer->cur == ':' && CPP_OPTION (pfile, scope))
4178         buffer->cur++, result->type = CPP_SCOPE;
4179       else if (*buffer->cur == '>' && CPP_OPTION (pfile, digraphs))
4180         {
4181           buffer->cur++;
4182           result->flags |= DIGRAPH;
4183           result->type = CPP_CLOSE_SQUARE;
4184         }
4185       break;
4186
4187     case '*': IF_NEXT_IS ('=', CPP_MULT_EQ, CPP_MULT); break;
4188     case '=': IF_NEXT_IS ('=', CPP_EQ_EQ, CPP_EQ); break;
4189     case '!': IF_NEXT_IS ('=', CPP_NOT_EQ, CPP_NOT); break;
4190     case '^': IF_NEXT_IS ('=', CPP_XOR_EQ, CPP_XOR); break;
4191     case '#': IF_NEXT_IS ('#', CPP_PASTE, CPP_HASH); result->val.token_no = 0; break;
4192
4193     case '?': result->type = CPP_QUERY; break;
4194     case '~': result->type = CPP_COMPL; break;
4195     case ',': result->type = CPP_COMMA; break;
4196     case '(': result->type = CPP_OPEN_PAREN; break;
4197     case ')': result->type = CPP_CLOSE_PAREN; break;
4198     case '[': result->type = CPP_OPEN_SQUARE; break;
4199     case ']': result->type = CPP_CLOSE_SQUARE; break;
4200     case '{': result->type = CPP_OPEN_BRACE; break;
4201     case '}': result->type = CPP_CLOSE_BRACE; break;
4202     case ';': result->type = CPP_SEMICOLON; break;
4203
4204       /* @ is a punctuator in Objective-C.  */
4205     case '@': result->type = CPP_ATSIGN; break;
4206
4207     default:
4208       {
4209         const uchar *base = --buffer->cur;
4210         static int no_warn_cnt; /* Continuation bytes still to pass without a warning; persists across calls.  */
4211
4212         /* Check for an extended identifier ($ or UCN or UTF-8).  */
4213         struct normalize_state nst = INITIAL_NORMALIZE_STATE;
4214         if (forms_identifier_p (pfile, true, &nst))
4215           {
4216             result->type = CPP_NAME;
4217             result->val.node.node = lex_identifier (pfile, base, true, &nst,
4218                                                     &result->val.node.spelling);
4219             warn_about_normalization (pfile, result, &nst, true);
4220             break;
4221           }
4222
4223         /* Otherwise this will form a CPP_OTHER token.  Parse valid UTF-8 as a
4224            single token.  */
4225         buffer->cur++;
4226         if (c >= utf8_signifier)
4227           {
4228             const uchar *pstr = base;
4229             cppchar_t s;
4230             if (_cpp_valid_utf8 (pfile, &pstr, buffer->rlimit, 0, NULL, &s))
4231               {
4232                 if (s > UCS_LIMIT && CPP_OPTION (pfile, cpp_warn_invalid_utf8))
4233                   {
4234                     buffer->cur = base;
4235                     _cpp_warn_invalid_utf8 (pfile);
4236                   }
4237                 buffer->cur = pstr; /* The token covers the whole sequence.  */
4238               }
4239             else if (CPP_OPTION (pfile, cpp_warn_invalid_utf8))
4240               {
4241                 buffer->cur = base;
4242                 const uchar *end = _cpp_warn_invalid_utf8 (pfile);
4243                 buffer->cur = base + 1; /* The token is just this one byte.  */
4244                 no_warn_cnt = end - buffer->cur;
4245               }
4246           }
4247         else if (c >= utf8_continuation
4248                  && CPP_OPTION (pfile, cpp_warn_invalid_utf8))
4249           {
4250             if (no_warn_cnt)
4251               --no_warn_cnt;
4252             else
4253               {
4254                 buffer->cur = base;
4255                 _cpp_warn_invalid_utf8 (pfile);
4256                 buffer->cur = base + 1;
4257               }
4258           }
4259         create_literal (pfile, result, base, buffer->cur - base, CPP_OTHER);
4260         break;
4261       }
4262
4263     }
4264
4265   /* Potentially convert the location of the token to a range.  */
4266   if (result->src_loc >= RESERVED_LOCATION_COUNT
4267       && result->type != CPP_EOF)
4268     {
4269       /* Ensure that any line notes are processed, so that we have the
4270          correct physical line/column for the end-point of the token even
4271          when a logical line is split via one or more backslashes.  */
4272       if (buffer->cur >= buffer->notes[buffer->cur_note].pos
4273           && !pfile->overlaid_buffer)
4274         _cpp_process_line_notes (pfile, false);
4275
4276       source_range tok_range;
4277       tok_range.m_start = result->src_loc;
4278       tok_range.m_finish
4279         = linemap_position_for_column (pfile->line_table,
4280                                        CPP_BUF_COLUMN (buffer, buffer->cur));
4281
4282       result->src_loc = COMBINE_LOCATION_DATA (pfile->line_table,
4283                                                result->src_loc,
4284                                                tok_range, NULL, 0);
4285     }
4286
4287   return result;
4288 }
4289
4290 /* An upper bound on the number of bytes needed to spell TOKEN.
4291    Does not include preceding whitespace.  */
4292 unsigned int
4293 cpp_token_len (const cpp_token *token)
4294 {
4295   unsigned int len;
4296
4297   switch (TOKEN_SPELL (token))
4298     {
4299     default:            len = 6;                                break;
4300     case SPELL_LITERAL: len = token->val.str.len;               break;
4301     case SPELL_IDENT:   len = NODE_LEN (token->val.node.node) * 10;     break;
4302     }
4303
4304   return len;
4305 }
4306
/* Decode the UTF-8 sequence that starts at NAME and store the
   corresponding universal character name, in the form \UXXXXXXXX
   with lower-case hex digits, at BUFFER.  Exactly 10 bytes are
   written and no terminating NUL is added.  Returns the number of
   bytes of NAME that make up the sequence, as given by the count of
   leading one bits in the first byte.  Aborts if any byte after the
   first is not of the form 10xxxxxx.  */

static size_t
utf8_to_ucn (unsigned char *buffer, const unsigned char *name)
{
  static const char hex_digits[] = "0123456789abcdef";
  int seq_len = 0;
  unsigned lead = *name;
  unsigned long code_point;

  /* Each leading one bit of the first byte accounts for one byte of
     the encoded sequence.  */
  while (lead & 0x80)
    {
      seq_len++;
      lead <<= 1;
    }

  /* The payload of the first byte is whatever follows the length
     prefix; every trailing byte contributes six more bits.  */
  code_point = *name & (0x7F >> seq_len);
  for (int k = 1; k < seq_len; k++)
    {
      const unsigned char cont = name[k];

      /* Ill-formed UTF-8.  */
      if ((cont & ~0x3F) != 0x80)
        abort ();

      code_point = (code_point << 6) | (cont & 0x3F);
    }

  *buffer++ = '\\';
  *buffer++ = 'U';
  for (int shift = 28; shift >= 0; shift -= 4)
    *buffer++ = hex_digits[(code_point >> shift) & 0xF];

  return seq_len;
}
4340
4341 /* Given a token TYPE corresponding to a digraph, return a pointer to
4342    the spelling of the digraph.  */
4343 static const unsigned char *
4344 cpp_digraph2name (enum cpp_ttype type)
4345 {
4346   return digraph_spellings[(int) type - (int) CPP_FIRST_DIGRAPH];
4347 }
4348
4349 /* Write the spelling of an identifier IDENT, using UCNs, to BUFFER.
4350    The buffer must already contain the enough space to hold the
4351    token's spelling.  Returns a pointer to the character after the
4352    last character written.  */
4353 unsigned char *
4354 _cpp_spell_ident_ucns (unsigned char *buffer, cpp_hashnode *ident)
4355 {
4356   size_t i;
4357   const unsigned char *name = NODE_NAME (ident);
4358
4359   for (i = 0; i < NODE_LEN (ident); i++)
4360     if (name[i] & ~0x7F)
4361       {
4362         i += utf8_to_ucn (buffer, name + i) - 1;
4363         buffer += 10;
4364       }
4365     else
4366       *buffer++ = name[i];
4367
4368   return buffer;
4369 }
4370
4371 /* Write the spelling of a token TOKEN to BUFFER.  The buffer must
4372    already contain the enough space to hold the token's spelling.
4373    Returns a pointer to the character after the last character written.
4374    FORSTRING is true if this is to be the spelling after translation
4375    phase 1 (with the original spelling of extended identifiers), false
4376    if extended identifiers should always be written using UCNs (there is
4377    no option for always writing them in the internal UTF-8 form).
4378    FIXME: Would be nice if we didn't need the PFILE argument.  */
4379 unsigned char *
4380 cpp_spell_token (cpp_reader *pfile, const cpp_token *token,
4381                  unsigned char *buffer, bool forstring)
4382 {
4383   switch (TOKEN_SPELL (token))
4384     {
4385     case SPELL_OPERATOR:
4386       {
4387         const unsigned char *spelling;
4388         unsigned char c;
4389
4390         if (token->flags & DIGRAPH)
4391           spelling = cpp_digraph2name (token->type);
4392         else if (token->flags & NAMED_OP)
4393           goto spell_ident;
4394         else
4395           spelling = TOKEN_NAME (token);
4396
4397         while ((c = *spelling++) != '\0')
4398           *buffer++ = c;
4399       }
4400       break;
4401
4402     spell_ident:
4403     case SPELL_IDENT:
4404       if (forstring)
4405         {
4406           memcpy (buffer, NODE_NAME (token->val.node.spelling),
4407                   NODE_LEN (token->val.node.spelling));
4408           buffer += NODE_LEN (token->val.node.spelling);
4409         }
4410       else
4411         buffer = _cpp_spell_ident_ucns (buffer, token->val.node.node);
4412       break;
4413
4414     case SPELL_LITERAL:
4415       memcpy (buffer, token->val.str.text, token->val.str.len);
4416       buffer += token->val.str.len;
4417       break;
4418
4419     case SPELL_NONE:
4420       cpp_error (pfile, CPP_DL_ICE,
4421                  "unspellable token %s", TOKEN_NAME (token));
4422       break;
4423     }
4424
4425   return buffer;
4426 }
4427
4428 /* Returns TOKEN spelt as a null-terminated string.  The string is
4429    freed when the reader is destroyed.  Useful for diagnostics.  */
4430 unsigned char *
4431 cpp_token_as_text (cpp_reader *pfile, const cpp_token *token)
4432 {
4433   unsigned int len = cpp_token_len (token) + 1;
4434   unsigned char *start = _cpp_unaligned_alloc (pfile, len), *end;
4435
4436   end = cpp_spell_token (pfile, token, start, false);
4437   end[0] = '\0';
4438
4439   return start;
4440 }
4441
4442 /* Returns a pointer to a string which spells the token defined by
4443    TYPE and FLAGS.  Used by C front ends, which really should move to
4444    using cpp_token_as_text.  */
4445 const char *
4446 cpp_type2name (enum cpp_ttype type, unsigned char flags)
4447 {
4448   if (flags & DIGRAPH)
4449     return (const char *) cpp_digraph2name (type);
4450   else if (flags & NAMED_OP)
4451     return cpp_named_operator2name (type);
4452
4453   return (const char *) token_spellings[type].name;
4454 }
4455
4456 /* Writes the spelling of token to FP, without any preceding space.
4457    Separated from cpp_spell_token for efficiency - to avoid stdio
4458    double-buffering.  */
4459 void
4460 cpp_output_token (const cpp_token *token, FILE *fp)
4461 {
4462   switch (TOKEN_SPELL (token))
4463     {
4464     case SPELL_OPERATOR:
4465       {
4466         const unsigned char *spelling;
4467         int c;
4468
4469         if (token->flags & DIGRAPH)
4470           spelling = cpp_digraph2name (token->type);
4471         else if (token->flags & NAMED_OP)
4472           goto spell_ident;
4473         else
4474           spelling = TOKEN_NAME (token);
4475
4476         c = *spelling;
4477         do
4478           putc (c, fp);
4479         while ((c = *++spelling) != '\0');
4480       }
4481       break;
4482
4483     spell_ident:
4484     case SPELL_IDENT:
4485       {
4486         size_t i;
4487         const unsigned char * name = NODE_NAME (token->val.node.node);
4488
4489         for (i = 0; i < NODE_LEN (token->val.node.node); i++)
4490           if (name[i] & ~0x7F)
4491             {
4492               unsigned char buffer[10];
4493               i += utf8_to_ucn (buffer, name + i) - 1;
4494               fwrite (buffer, 1, 10, fp);
4495             }
4496           else
4497             fputc (NODE_NAME (token->val.node.node)[i], fp);
4498       }
4499       break;
4500
4501     case SPELL_LITERAL:
4502       if (token->type == CPP_HEADER_NAME)
4503         fputc ('"', fp);
4504       fwrite (token->val.str.text, 1, token->val.str.len, fp);
4505       if (token->type == CPP_HEADER_NAME)
4506         fputc ('"', fp);
4507       break;
4508
4509     case SPELL_NONE:
4510       /* An error, most probably.  */
4511       break;
4512     }
4513 }
4514
4515 /* Compare two tokens.  */
4516 int
4517 _cpp_equiv_tokens (const cpp_token *a, const cpp_token *b)
4518 {
4519   if (a->type == b->type && a->flags == b->flags)
4520     switch (TOKEN_SPELL (a))
4521       {
4522       default:                  /* Keep compiler happy.  */
4523       case SPELL_OPERATOR:
4524         /* token_no is used to track where multiple consecutive ##
4525            tokens were originally located.  */
4526         return (a->type != CPP_PASTE || a->val.token_no == b->val.token_no);
4527       case SPELL_NONE:
4528         return (a->type != CPP_MACRO_ARG
4529                 || (a->val.macro_arg.arg_no == b->val.macro_arg.arg_no
4530                     && a->val.macro_arg.spelling == b->val.macro_arg.spelling));
4531       case SPELL_IDENT:
4532         return (a->val.node.node == b->val.node.node
4533                 && a->val.node.spelling == b->val.node.spelling);
4534       case SPELL_LITERAL:
4535         return (a->val.str.len == b->val.str.len
4536                 && !memcmp (a->val.str.text, b->val.str.text,
4537                             a->val.str.len));
4538       }
4539
4540   return 0;
4541 }
4542
4543 /* Returns nonzero if a space should be inserted to avoid an
4544    accidental token paste for output.  For simplicity, it is
4545    conservative, and occasionally advises a space where one is not
4546    needed, e.g. "." and ".2".  */
4547 int
4548 cpp_avoid_paste (cpp_reader *pfile, const cpp_token *token1,
4549                  const cpp_token *token2)
4550 {
4551   enum cpp_ttype a = token1->type, b = token2->type;
4552   cppchar_t c;
4553
4554   if (token1->flags & NAMED_OP)
4555     a = CPP_NAME;
4556   if (token2->flags & NAMED_OP)
4557     b = CPP_NAME;
4558
4559   c = EOF;
4560   if (token2->flags & DIGRAPH)
4561     c = digraph_spellings[(int) b - (int) CPP_FIRST_DIGRAPH][0];
4562   else if (token_spellings[b].category == SPELL_OPERATOR)
4563     c = token_spellings[b].name[0];
4564
4565   /* Quickly get everything that can paste with an '='.  */
4566   if ((int) a <= (int) CPP_LAST_EQ && c == '=')
4567     return 1;
4568
4569   switch (a)
4570     {
4571     case CPP_GREATER:   return c == '>';
4572     case CPP_LESS:      return c == '<' || c == '%' || c == ':';
4573     case CPP_PLUS:      return c == '+';
4574     case CPP_MINUS:     return c == '-' || c == '>';
4575     case CPP_DIV:       return c == '/' || c == '*'; /* Comments.  */
4576     case CPP_MOD:       return c == ':' || c == '>';
4577     case CPP_AND:       return c == '&';
4578     case CPP_OR:        return c == '|';
4579     case CPP_COLON:     return c == ':' || c == '>';
4580     case CPP_DEREF:     return c == '*';
4581     case CPP_DOT:       return c == '.' || c == '%' || b == CPP_NUMBER;
4582     case CPP_HASH:      return c == '#' || c == '%'; /* Digraph form.  */
4583     case CPP_PRAGMA:
4584     case CPP_NAME:      return ((b == CPP_NUMBER
4585                                  && name_p (pfile, &token2->val.str))
4586                                 || b == CPP_NAME
4587                                 || b == CPP_CHAR || b == CPP_STRING); /* L */
4588     case CPP_NUMBER:    return (b == CPP_NUMBER || b == CPP_NAME
4589                                 || b == CPP_CHAR
4590                                 || c == '.' || c == '+' || c == '-');
4591                                       /* UCNs */
4592     case CPP_OTHER:     return ((token1->val.str.text[0] == '\\'
4593                                  && b == CPP_NAME)
4594                                 || (CPP_OPTION (pfile, objc)
4595                                     && token1->val.str.text[0] == '@'
4596                                     && (b == CPP_NAME || b == CPP_STRING)));
4597     case CPP_LESS_EQ:   return c == '>';
4598     case CPP_STRING:
4599     case CPP_WSTRING:
4600     case CPP_UTF8STRING:
4601     case CPP_STRING16:
4602     case CPP_STRING32:  return (CPP_OPTION (pfile, user_literals)
4603                                 && (b == CPP_NAME
4604                                     || (TOKEN_SPELL (token2) == SPELL_LITERAL
4605                                         && ISIDST (token2->val.str.text[0]))));
4606
4607     default:            break;
4608     }
4609
4610   return 0;
4611 }
4612
4613 /* Output all the remaining tokens on the current line, and a newline
4614    character, to FP.  Leading whitespace is removed.  If there are
4615    macros, special token padding is not performed.  */
4616 void
4617 cpp_output_line (cpp_reader *pfile, FILE *fp)
4618 {
4619   const cpp_token *token;
4620
4621   token = cpp_get_token (pfile);
4622   while (token->type != CPP_EOF)
4623     {
4624       cpp_output_token (token, fp);
4625       token = cpp_get_token (pfile);
4626       if (token->flags & PREV_WHITE)
4627         putc (' ', fp);
4628     }
4629
4630   putc ('\n', fp);
4631 }
4632
4633 /* Return a string representation of all the remaining tokens on the
4634    current line.  The result is allocated using xmalloc and must be
4635    freed by the caller.  */
4636 unsigned char *
4637 cpp_output_line_to_string (cpp_reader *pfile, const unsigned char *dir_name)
4638 {
4639   const cpp_token *token;
4640   unsigned int out = dir_name ? ustrlen (dir_name) : 0;
4641   unsigned int alloced = 120 + out;
4642   unsigned char *result = (unsigned char *) xmalloc (alloced);
4643
4644   /* If DIR_NAME is empty, there are no initial contents.  */
4645   if (dir_name)
4646     {
4647       sprintf ((char *) result, "#%s ", dir_name);
4648       out += 2;
4649     }
4650
4651   token = cpp_get_token (pfile);
4652   while (token->type != CPP_EOF)
4653     {
4654       unsigned char *last;
4655       /* Include room for a possible space and the terminating nul.  */
4656       unsigned int len = cpp_token_len (token) + 2;
4657
4658       if (out + len > alloced)
4659         {
4660           alloced *= 2;
4661           if (out + len > alloced)
4662             alloced = out + len;
4663           result = (unsigned char *) xrealloc (result, alloced);
4664         }
4665
4666       last = cpp_spell_token (pfile, token, &result[out], 0);
4667       out = last - result;
4668
4669       token = cpp_get_token (pfile);
4670       if (token->flags & PREV_WHITE)
4671         result[out++] = ' ';
4672     }
4673
4674   result[out] = '\0';
4675   return result;
4676 }
4677
/* Memory buffers.  Changing these three constants can have a dramatic
   effect on performance.  The values here are reasonable defaults,
   but might be tuned.  If you adjust them, be sure to test across a
   range of uses of cpplib, including heavy nested function-like macro
   expansion.  Also check the change in peak memory usage (NJAMD is a
   good tool for this).

   MIN_BUFF_SIZE is the smallest buffer new_buff will create.
   BUFF_SIZE_UPPER_BOUND (MIN_SIZE) is the largest buffer considered a
   reasonable fit for a request of MIN_SIZE bytes.
   EXTENDED_BUFF_SIZE (BUFF, MIN_EXTRA) is the size requested when
   extending BUFF: MIN_EXTRA plus twice the bytes between BUFF->cur
   and BUFF->limit.  Every macro argument is parenthesized so that
   an expression argument keeps its own precedence.  */
#define MIN_BUFF_SIZE 8000
#define BUFF_SIZE_UPPER_BOUND(MIN_SIZE) (MIN_BUFF_SIZE + (MIN_SIZE) * 3 / 2)
#define EXTENDED_BUFF_SIZE(BUFF, MIN_EXTRA) \
        ((MIN_EXTRA) + ((BUFF)->limit - (BUFF)->cur) * 2)

#if MIN_BUFF_SIZE > BUFF_SIZE_UPPER_BOUND (0)
  #error BUFF_SIZE_UPPER_BOUND must be at least as large as MIN_BUFF_SIZE!
#endif
4692
4693 /* Create a new allocation buffer.  Place the control block at the end
4694    of the buffer, so that buffer overflows will cause immediate chaos.  */
4695 static _cpp_buff *
4696 new_buff (size_t len)
4697 {
4698   _cpp_buff *result;
4699   unsigned char *base;
4700
4701   if (len < MIN_BUFF_SIZE)
4702     len = MIN_BUFF_SIZE;
4703   len = CPP_ALIGN (len);
4704
4705 #ifdef ENABLE_VALGRIND_ANNOTATIONS
4706   /* Valgrind warns about uses of interior pointers, so put _cpp_buff
4707      struct first.  */
4708   size_t slen = CPP_ALIGN2 (sizeof (_cpp_buff), 2 * DEFAULT_ALIGNMENT);
4709   base = XNEWVEC (unsigned char, len + slen);
4710   result = (_cpp_buff *) base;
4711   base += slen;
4712 #else
4713   base = XNEWVEC (unsigned char, len + sizeof (_cpp_buff));
4714   result = (_cpp_buff *) (base + len);
4715 #endif
4716   result->base = base;
4717   result->cur = base;
4718   result->limit = base + len;
4719   result->next = NULL;
4720   return result;
4721 }
4722
4723 /* Place a chain of unwanted allocation buffers on the free list.  */
4724 void
4725 _cpp_release_buff (cpp_reader *pfile, _cpp_buff *buff)
4726 {
4727   _cpp_buff *end = buff;
4728
4729   while (end->next)
4730     end = end->next;
4731   end->next = pfile->free_buffs;
4732   pfile->free_buffs = buff;
4733 }
4734
4735 /* Return a free buffer of size at least MIN_SIZE.  */
4736 _cpp_buff *
4737 _cpp_get_buff (cpp_reader *pfile, size_t min_size)
4738 {
4739   _cpp_buff *result, **p;
4740
4741   for (p = &pfile->free_buffs;; p = &(*p)->next)
4742     {
4743       size_t size;
4744
4745       if (*p == NULL)
4746         return new_buff (min_size);
4747       result = *p;
4748       size = result->limit - result->base;
4749       /* Return a buffer that's big enough, but don't waste one that's
4750          way too big.  */
4751       if (size >= min_size && size <= BUFF_SIZE_UPPER_BOUND (min_size))
4752         break;
4753     }
4754
4755   *p = result->next;
4756   result->next = NULL;
4757   result->cur = result->base;
4758   return result;
4759 }
4760
4761 /* Creates a new buffer with enough space to hold the uncommitted
4762    remaining bytes of BUFF, and at least MIN_EXTRA more bytes.  Copies
4763    the excess bytes to the new buffer.  Chains the new buffer after
4764    BUFF, and returns the new buffer.  */
4765 _cpp_buff *
4766 _cpp_append_extend_buff (cpp_reader *pfile, _cpp_buff *buff, size_t min_extra)
4767 {
4768   size_t size = EXTENDED_BUFF_SIZE (buff, min_extra);
4769   _cpp_buff *new_buff = _cpp_get_buff (pfile, size);
4770
4771   buff->next = new_buff;
4772   memcpy (new_buff->base, buff->cur, BUFF_ROOM (buff));
4773   return new_buff;
4774 }
4775
4776 /* Creates a new buffer with enough space to hold the uncommitted
4777    remaining bytes of the buffer pointed to by BUFF, and at least
4778    MIN_EXTRA more bytes.  Copies the excess bytes to the new buffer.
4779    Chains the new buffer before the buffer pointed to by BUFF, and
4780    updates the pointer to point to the new buffer.  */
4781 void
4782 _cpp_extend_buff (cpp_reader *pfile, _cpp_buff **pbuff, size_t min_extra)
4783 {
4784   _cpp_buff *new_buff, *old_buff = *pbuff;
4785   size_t size = EXTENDED_BUFF_SIZE (old_buff, min_extra);
4786
4787   new_buff = _cpp_get_buff (pfile, size);
4788   memcpy (new_buff->base, old_buff->cur, BUFF_ROOM (old_buff));
4789   new_buff->next = old_buff;
4790   *pbuff = new_buff;
4791 }
4792
4793 /* Free a chain of buffers starting at BUFF.  */
4794 void
4795 _cpp_free_buff (_cpp_buff *buff)
4796 {
4797   _cpp_buff *next;
4798
4799   for (; buff; buff = next)
4800     {
4801       next = buff->next;
4802 #ifdef ENABLE_VALGRIND_ANNOTATIONS
4803       free (buff);
4804 #else
4805       free (buff->base);
4806 #endif
4807     }
4808 }
4809
4810 /* Allocate permanent, unaligned storage of length LEN.  */
4811 unsigned char *
4812 _cpp_unaligned_alloc (cpp_reader *pfile, size_t len)
4813 {
4814   _cpp_buff *buff = pfile->u_buff;
4815   unsigned char *result = buff->cur;
4816
4817   if (len > (size_t) (buff->limit - result))
4818     {
4819       buff = _cpp_get_buff (pfile, len);
4820       buff->next = pfile->u_buff;
4821       pfile->u_buff = buff;
4822       result = buff->cur;
4823     }
4824
4825   buff->cur = result + len;
4826   return result;
4827 }
4828
4829 /* Allocate permanent, unaligned storage of length LEN from a_buff.
4830    That buffer is used for growing allocations when saving macro
4831    replacement lists in a #define, and when parsing an answer to an
4832    assertion in #assert, #unassert or #if (and therefore possibly
4833    whilst expanding macros).  It therefore must not be used by any
4834    code that they might call: specifically the lexer and the guts of
4835    the macro expander.
4836
4837    All existing other uses clearly fit this restriction: storing
4838    registered pragmas during initialization.  */
4839 unsigned char *
4840 _cpp_aligned_alloc (cpp_reader *pfile, size_t len)
4841 {
4842   _cpp_buff *buff = pfile->a_buff;
4843   unsigned char *result = buff->cur;
4844
4845   if (len > (size_t) (buff->limit - result))
4846     {
4847       buff = _cpp_get_buff (pfile, len);
4848       buff->next = pfile->a_buff;
4849       pfile->a_buff = buff;
4850       result = buff->cur;
4851     }
4852
4853   buff->cur = result + len;
4854   return result;
4855 }
4856
4857 /* Commit or allocate storage from a buffer.  */
4858
4859 void *
4860 _cpp_commit_buff (cpp_reader *pfile, size_t size)
4861 {
4862   void *ptr = BUFF_FRONT (pfile->a_buff);
4863
4864   if (pfile->hash_table->alloc_subobject)
4865     {
4866       void *copy = pfile->hash_table->alloc_subobject (size);
4867       memcpy (copy, ptr, size);
4868       ptr = copy;
4869     }
4870   else
4871     BUFF_FRONT (pfile->a_buff) += size;
4872
4873   return ptr;
4874 }
4875
4876 /* Say which field of TOK is in use.  */
4877
4878 enum cpp_token_fld_kind
4879 cpp_token_val_index (const cpp_token *tok)
4880 {
4881   switch (TOKEN_SPELL (tok))
4882     {
4883     case SPELL_IDENT:
4884       return CPP_TOKEN_FLD_NODE;
4885     case SPELL_LITERAL:
4886       return CPP_TOKEN_FLD_STR;
4887     case SPELL_OPERATOR:
4888       /* Operands which were originally spelled as ident keep around
4889          the node for the exact spelling.  */
4890       if (tok->flags & NAMED_OP)
4891         return CPP_TOKEN_FLD_NODE;
4892       else if (tok->type == CPP_PASTE)
4893         return CPP_TOKEN_FLD_TOKEN_NO;
4894       else
4895         return CPP_TOKEN_FLD_NONE;
4896     case SPELL_NONE:
4897       if (tok->type == CPP_MACRO_ARG)
4898         return CPP_TOKEN_FLD_ARG_NO;
4899       else if (tok->type == CPP_PADDING)
4900         return CPP_TOKEN_FLD_SOURCE;
4901       else if (tok->type == CPP_PRAGMA)
4902         return CPP_TOKEN_FLD_PRAGMA;
4903       /* fall through */
4904     default:
4905       return CPP_TOKEN_FLD_NONE;
4906     }
4907 }
4908
4909 /* All tokens lexed in R after calling this function will be forced to
4910    have their location_t to be P, until
4911    cpp_stop_forcing_token_locations is called for R.  */
4912
4913 void
4914 cpp_force_token_locations (cpp_reader *r, location_t loc)
4915 {
4916   r->forced_token_location = loc;
4917 }
4918
4919 /* Go back to assigning locations naturally for lexed tokens.  */
4920
4921 void
4922 cpp_stop_forcing_token_locations (cpp_reader *r)
4923 {
4924   r->forced_token_location = 0;
4925 }
4926
/* PEEK points at a backslash.  If that backslash escapes an end of
   line (it is followed by "\n", "\r\n" or a lone "\r"), return a
   pointer to the first character after the line ending, repeating
   for as long as that character is itself a backslash escaping an
   end of line.  A line ending is only skipped when the character
   after it lies strictly before LIMIT; otherwise, or when the
   backslash escapes nothing, the current position is returned
   unchanged.  */

static const unsigned char *
do_peek_backslash (const unsigned char *peek, const unsigned char *limit)
{
  for (;;)
    {
      const unsigned char *after;

      if (__builtin_expect (peek[1] == '\n', true))
        after = peek + 2;
      else if (__builtin_expect (peek[1] == '\r', false))
        /* A carriage return may be followed by a line feed.  */
        after = peek + (peek[2] == '\n' ? 3 : 2);
      else
        break;

      /* If at LIMIT, don't advance.  */
      if (!__builtin_expect (after < limit, true))
        break;

      peek = after;

      /* The user might be perverse and escape several line endings
         in a row.  */
      if (*peek != '\\')
        break;
    }

  return peek;
}
4956
4957 static const unsigned char *
4958 do_peek_next (const unsigned char *peek, const unsigned char *limit)
4959 {
4960   if (__builtin_expect (*peek == '\\', false))
4961     peek = do_peek_backslash (peek, limit);
4962   return peek;
4963 }
4964
/* Step back one character from PEEK without going before BOUND.
   Returns NULL if PEEK is already at BOUND.  Otherwise the character
   before PEEK is examined: if it is a line ending ("\n", "\r\n" or a
   lone "\r") immediately preceded by a backslash, the escaped line
   ending is skipped and the search continues backwards from the
   backslash; in every other case a pointer to the examined character
   is returned.  */
static const unsigned char *
do_peek_prev (const unsigned char *peek, const unsigned char *bound)
{
  if (peek == bound)
    return NULL;

  unsigned char c = *--peek;
  /* A line ending is either a line feed or a carriage return.  The
     second test must be against '\r', not the letter 'r'.  */
  if (__builtin_expect (c == '\n', false)
      || __builtin_expect (c == '\r', false))
    {
      /* Nothing precedes the line ending, so it cannot be escaped.  */
      if (peek == bound)
        return peek;

      /* IX is the offset from PEEK of the character that would have
         to be a backslash for this line ending to be escaped.  */
      int ix = -1;
      if (c == '\n' && peek[ix] == '\r')
        {
          /* "\r\n": look in front of the carriage return as well,
             unless it is the very first character.  */
          if (peek + ix == bound)
            return peek;
          ix--;
        }

      if (peek[ix] == '\\')
        /* Escaped line ending: keep going from the backslash.  */
        return do_peek_prev (peek + ix, bound);

      return peek;
    }
  else
    return peek;
}
4993
4994 /* If PEEK[-1] is identifier MATCH, scan past it and trailing white
4995    space.  Otherwise return NULL.  */
4996
4997 static const unsigned char *
4998 do_peek_ident (const char *match, const unsigned char *peek,
4999                const unsigned char *limit)
5000 {
5001   for (; *++match; peek++)
5002     if (*peek != *match)
5003       {
5004         peek = do_peek_next (peek, limit);
5005         if (*peek != *match)
5006           return NULL;
5007       }
5008
5009   /* Must now not be looking at an identifier char.  */
5010   peek = do_peek_next (peek, limit);
5011   if (ISIDNUM (*peek))
5012     return NULL;
5013
5014   /* Skip control-line whitespace.  */
5015  ws:
5016   while (*peek == ' ' || *peek == '\t')
5017     peek++;
5018   if (__builtin_expect (*peek == '\\', false))
5019     {
5020       peek = do_peek_backslash (peek, limit);
5021       if (*peek != '\\')
5022         goto ws;
5023     }
5024
5025   return peek;
5026 }
5027
5028 /* Are we looking at a module control line starting as PEEK - 1?  */
5029
5030 static bool
5031 do_peek_module (cpp_reader *pfile, unsigned char c,
5032                 const unsigned char *peek, const unsigned char *limit)
5033 {
5034   bool import = false;
5035
5036   if (__builtin_expect (c == 'e', false))
5037     {
5038       if (!((peek[0] == 'x' || peek[0] == '\\')
5039             && (peek = do_peek_ident ("export", peek, limit))))
5040         return false;
5041
5042       /* export, peek for import or module.  No need to peek __import
5043          here.  */
5044       if (peek[0] == 'i')
5045         {
5046           if (!((peek[1] == 'm' || peek[1] == '\\')
5047                 && (peek = do_peek_ident ("import", peek + 1, limit))))
5048             return false;
5049           import = true;
5050         }
5051       else if (peek[0] == 'm')
5052         {
5053           if (!((peek[1] == 'o' || peek[1] == '\\')
5054                 && (peek = do_peek_ident ("module", peek + 1, limit))))
5055             return false;
5056         }
5057       else
5058         return false;
5059     }
5060   else if (__builtin_expect (c == 'i', false))
5061     {
5062       if (!((peek[0] == 'm' || peek[0] == '\\')
5063             && (peek = do_peek_ident ("import", peek, limit))))
5064         return false;
5065       import = true;
5066     }
5067   else if (__builtin_expect (c == '_', false))
5068     {
5069       /* Needed for translated includes.   */
5070       if (!((peek[0] == '_' || peek[0] == '\\')
5071             && (peek = do_peek_ident ("__import", peek, limit))))
5072         return false;
5073       import = true;
5074     }
5075   else if (__builtin_expect (c == 'm', false))
5076     {
5077       if (!((peek[0] == 'o' || peek[0] == '\\')
5078             && (peek = do_peek_ident ("module", peek, limit))))
5079         return false;
5080     }
5081   else
5082     return false;
5083
5084   /* Peek the next character to see if it's good enough.  We'll be at
5085      the first non-whitespace char, including skipping an escaped
5086      newline.  */
5087   /* ... import followed by identifier, ':', '<' or header-name
5088      preprocessing tokens, or module followed by identifier, ':' or
5089      ';' preprocessing tokens.  */
5090   unsigned char p = *peek++;
5091
5092   /* A character literal is ... single quotes, ... optionally preceded
5093      by u8, u, U, or L */
5094   /* A string-literal is a ... double quotes, optionally prefixed by
5095      R, u8, u8R, u, uR, U, UR, L, or LR */
5096   if (p == 'u')
5097     {
5098       peek = do_peek_next (peek, limit);
5099       if (*peek == '8')
5100         {
5101           peek++;
5102           goto peek_u8;
5103         }
5104       goto peek_u;
5105     }
5106   else if (p == 'U' || p == 'L')
5107     {
5108     peek_u8:
5109       peek = do_peek_next (peek, limit);
5110     peek_u:
5111       if (*peek == '\"' || *peek == '\'')
5112         return false;
5113
5114       if (*peek == 'R')
5115         goto peek_R;
5116       /* Identifier. Ok.  */
5117     }
5118   else if (p == 'R')
5119     {
5120     peek_R:
5121       if (CPP_OPTION (pfile, rliterals))
5122         {
5123           peek = do_peek_next (peek, limit);
5124           if (*peek == '\"')
5125             return false;
5126         }
5127       /* Identifier. Ok.  */
5128     }
5129   else if ('Z' - 'A' == 25
5130            ? ((p >= 'A' && p <= 'Z') || (p >= 'a' && p <= 'z') || p == '_')
5131            : ISIDST (p))
5132     {
5133       /* Identifier.  Ok. */
5134     }
5135   else if (p == '<')
5136     {
5137       /* Maybe angle header, ok for import.  Reject
5138          '<=', '<<' digraph:'<:'.  */
5139       if (!import)
5140         return false;
5141       peek = do_peek_next (peek, limit);
5142       if (*peek == '=' || *peek == '<'
5143           || (*peek == ':' && CPP_OPTION (pfile, digraphs)))
5144         return false;
5145     }
5146   else if (p == ';')
5147     {
5148       /* SEMICOLON, ok for module.  */
5149       if (import)
5150         return false;
5151     }
5152   else if (p == '"')
5153     {
5154       /* STRING, ok for import.  */
5155       if (!import)
5156         return false;
5157     }
5158   else if (p == ':')
5159     {
5160       /* Maybe COLON, ok.  Reject '::', digraph:':>'.  */
5161       peek = do_peek_next (peek, limit);
5162       if (*peek == ':' || (*peek == '>' && CPP_OPTION (pfile, digraphs)))
5163         return false;
5164     }
5165   else
5166     /* FIXME: Detect a unicode character, excluding those not
5167        permitted as the initial character. [lex.name]/1.  I presume
5168        we need to check the \[uU] spellings, and directly using
5169        Unicode in say UTF8 form?  Or perhaps we do the phase-1
5170        conversion of UTF8 to universal-character-names?  */
5171     return false;
5172
5173   return true;
5174 }
5175
5176 /* Directives-only scanning.  Somewhat more relaxed than correct
5177    parsing -- some ill-formed programs will not be rejected.  */
5178
5179 void
5180 cpp_directive_only_process (cpp_reader *pfile,
5181                             void *data,
5182                             void (*cb) (cpp_reader *, CPP_DO_task, void *, ...))
5183 {
5184   bool module_p = CPP_OPTION (pfile, module_directives);
5185
5186   do
5187     {
5188     restart:
5189       /* Buffer initialization, but no line cleaning. */
5190       cpp_buffer *buffer = pfile->buffer;
5191       buffer->cur_note = buffer->notes_used = 0;
5192       buffer->cur = buffer->line_base = buffer->next_line;
5193       buffer->need_line = false;
5194       /* Files always end in a newline or carriage return.  We rely on this for
5195          character peeking safety.  */
5196       gcc_assert (buffer->rlimit[0] == '\n' || buffer->rlimit[0] == '\r');
5197
5198       const unsigned char *base = buffer->cur;
5199       unsigned line_count = 0;
5200       const unsigned char *line_start = base;
5201
5202       bool bol = true;
5203       bool raw = false;
5204
5205       const unsigned char *lwm = base;
5206       for (const unsigned char *pos = base, *limit = buffer->rlimit;
5207            pos < limit;)
5208         {
5209           unsigned char c = *pos++;
5210           /* This matches the switch in _cpp_lex_direct.  */
5211           switch (c)
5212             {
5213             case ' ': case '\t': case '\f': case '\v':
5214               /* Whitespace, do nothing.  */
5215               break;
5216
5217             case '\r': /* MAC line ending, or Windows \r\n  */
5218               if (*pos == '\n')
5219                 pos++;
5220               /* FALLTHROUGH */
5221
5222             case '\n':
5223               bol = true;
5224
5225             next_line:
5226               CPP_INCREMENT_LINE (pfile, 0);
5227               line_count++;
5228               line_start = pos;
5229               break;
5230
5231             case '\\':
5232               /* <backslash><newline> is removed, and doesn't undo any
5233                  preceeding escape or whatnot.  */
5234               if (*pos == '\n')
5235                 {
5236                   pos++;
5237                   goto next_line;
5238                 }
5239               else if (*pos == '\r')
5240                 {
5241                   if (pos[1] == '\n')
5242                     pos++;
5243                   pos++;
5244                   goto next_line;
5245                 }
5246               goto dflt;
5247
5248             case '#':
5249               if (bol)
5250                 {
5251                   /* Line directive.  */
5252                   if (pos - 1 > base && !pfile->state.skipping)
5253                     cb (pfile, CPP_DO_print, data,
5254                         line_count, base, pos - 1 - base);
5255
5256                   /* Prep things for directive handling. */
5257                   buffer->next_line = pos;
5258                   buffer->need_line = true;
5259                   bool ok = _cpp_get_fresh_line (pfile);
5260                   gcc_checking_assert (ok);
5261
5262                   /* Ensure proper column numbering for generated
5263                      error messages. */
5264                   buffer->line_base -= pos - line_start;
5265
5266                   _cpp_handle_directive (pfile, line_start + 1 != pos);
5267
5268                   /* Sanitize the line settings.  Duplicate #include's can
5269                      mess things up. */
5270                   // FIXME: Necessary?
5271                   pfile->line_table->highest_location
5272                     = pfile->line_table->highest_line;
5273
5274                   if (!pfile->state.skipping
5275                       && pfile->buffer->next_line < pfile->buffer->rlimit)
5276                     cb (pfile, CPP_DO_location, data,
5277                         pfile->line_table->highest_line);
5278
5279                   goto restart;
5280                 }
5281               goto dflt;
5282
5283             case '/':
5284               {
5285                 const unsigned char *peek = do_peek_next (pos, limit);
5286                 if (!(*peek == '/' || *peek == '*'))
5287                   goto dflt;
5288
5289                 /* Line or block comment  */
5290                 bool is_block = *peek == '*';
5291                 bool star = false;
5292                 bool esc = false;
5293                 location_t sloc
5294                   = linemap_position_for_column (pfile->line_table,
5295                                                  pos - line_start);
5296
5297                 while (pos < limit)
5298                   {
5299                     char c = *pos++;
5300                     switch (c)
5301                       {
5302                       case '\\':
5303                         esc = true;
5304                         break;
5305
5306                       case '\r':
5307                         if (*pos == '\n')
5308                           pos++;
5309                         /* FALLTHROUGH  */
5310
5311                       case '\n':
5312                         {
5313                           CPP_INCREMENT_LINE (pfile, 0);
5314                           line_count++;
5315                           line_start = pos;
5316                           if (!esc && !is_block)
5317                             {
5318                               bol = true;
5319                               goto done_comment;
5320                             }
5321                         }
5322                         if (!esc)
5323                           star = false;
5324                         esc = false;
5325                         break;
5326
5327                       case '*':
5328                         if (pos > peek)
5329                           star = is_block;
5330                         esc = false;
5331                         break;
5332
5333                       case '/':
5334                         if (star)
5335                           goto done_comment;
5336                         /* FALLTHROUGH  */
5337
5338                       default:
5339                         star = false;
5340                         esc = false;
5341                         break;
5342                       }
5343                   }
5344                 if (pos < limit || is_block)
5345                   cpp_error_with_line (pfile, CPP_DL_ERROR, sloc, 0,
5346                                        "unterminated comment");
5347               done_comment:
5348                 lwm = pos;
5349                 break;
5350               }
5351
5352             case '\'':
5353               if (!CPP_OPTION (pfile, digit_separators))
5354                 goto delimited_string;
5355
5356               /* Possibly a number punctuator.  */
5357               if (!ISIDNUM (*do_peek_next (pos, limit)))
5358                 goto delimited_string;
5359
5360               goto quote_peek;
5361
5362             case '\"':
5363               if (!CPP_OPTION (pfile, rliterals))
5364                 goto delimited_string;
5365
5366             quote_peek:
5367               {
5368                 /* For ' see if it's a number punctuator
5369                    \.?<digit>(<digit>|<identifier-nondigit>
5370                    |'<digit>|'<nondigit>|[eEpP]<sign>|\.)* */
5371                 /* For " see if it's a raw string
5372                    {U,L,u,u8}R.  This includes CPP_NUMBER detection,
5373                    because that could be 0e+R.  */
5374                 const unsigned char *peek = pos - 1;
5375                 bool quote_first = c == '"';
5376                 bool quote_eight = false;
5377                 bool maybe_number_start = false;
5378                 bool want_number = false;
5379
5380                 while ((peek = do_peek_prev (peek, lwm)))
5381                   {
5382                     unsigned char p = *peek;
5383                     if (quote_first)
5384                       {
5385                         if (!raw)
5386                           {
5387                             if (p != 'R')
5388                               break;
5389                             raw = true;
5390                             continue;
5391                           }
5392
5393                         quote_first = false;
5394                         if (p == 'L' || p == 'U' || p == 'u')
5395                           ;
5396                         else if (p == '8')
5397                           quote_eight = true;
5398                         else
5399                           goto second_raw;
5400                       }
5401                     else if (quote_eight)
5402                       {
5403                         if (p != 'u')
5404                           {
5405                             raw = false;
5406                             break;
5407                           }
5408                         quote_eight = false;
5409                       }
5410                     else if (c == '"')
5411                       {
5412                       second_raw:;
5413                         if (!want_number && ISIDNUM (p))
5414                           {
5415                             raw = false;
5416                             break;
5417                           }
5418                       }
5419
5420                     if (ISDIGIT (p))
5421                       maybe_number_start = true;
5422                     else if (p == '.')
5423                       want_number = true;
5424                     else if (ISIDNUM (p))
5425                       maybe_number_start = false;
5426                     else if (p == '+' || p == '-')
5427                       {
5428                         if (const unsigned char *peek_prev
5429                             = do_peek_prev (peek, lwm))
5430                           {
5431                             p = *peek_prev;
5432                             if (p == 'e' || p == 'E'
5433                                 || p == 'p' || p == 'P')
5434                               {
5435                                 want_number = true;
5436                                 maybe_number_start = false;
5437                               }
5438                             else
5439                               break;
5440                           }
5441                         else
5442                           break;
5443                       }
5444                     else if (p == '\'' || p == '\"')
5445                       {
5446                         /* If this is lwm, this must be the end of a
5447                            previous string.  So this is a trailing
5448                            literal type, (a) if those are allowed,
5449                              and (b) maybe_start is false.  Otherwise
5450                              this must be a CPP_NUMBER because we've
5451                              met another ', and we'd have checked that
5452                              in its own right.  */
5453                         if (peek == lwm && CPP_OPTION (pfile, uliterals))
5454                           {
5455                             if  (!maybe_number_start && !want_number)
5456                               /* Must be a literal type.  */
5457                               raw = false;
5458                           }
5459                         else if (p == '\''
5460                                  && CPP_OPTION (pfile, digit_separators))
5461                           maybe_number_start = true;
5462                         break;
5463                       }
5464                     else if (c == '\'')
5465                       break;
5466                     else if (!quote_first && !quote_eight)
5467                       break;
5468                   }
5469
5470                 if (maybe_number_start)
5471                   {
5472                     if (c == '\'')
5473                       /* A CPP NUMBER.  */
5474                       goto dflt;
5475                     raw = false;
5476                   }
5477
5478                 goto delimited_string;
5479               }
5480
5481             delimited_string:
5482               {
5483                 /* (Possibly raw) string or char literal.  */
5484                 unsigned char end = c;
5485                 int delim_len = -1;
5486                 const unsigned char *delim = NULL;
5487                 location_t sloc = linemap_position_for_column (pfile->line_table,
5488                                                                pos - line_start);
5489                 int esc = 0;
5490
5491                 if (raw)
5492                   {
5493                     /* There can be no line breaks in the delimiter.  */
5494                     delim = pos;
5495                     for (delim_len = 0; (c = *pos++) != '('; delim_len++)
5496                       {
5497                         if (delim_len == 16)
5498                           {
5499                             cpp_error_with_line (pfile, CPP_DL_ERROR,
5500                                                  sloc, 0,
5501                                                  "raw string delimiter"
5502                                                  " longer than %d"
5503                                                  " characters",
5504                                                  delim_len);
5505                             raw = false;
5506                             pos = delim;
5507                             break;
5508                           }
5509                         if (strchr (") \\\t\v\f\n", c))
5510                           {
5511                             cpp_error_with_line (pfile, CPP_DL_ERROR,
5512                                                  sloc, 0,
5513                                                  "invalid character '%c'"
5514                                                  " in raw string"
5515                                                  " delimiter", c);
5516                             raw = false;
5517                             pos = delim;
5518                             break;
5519                           }
5520                         if (pos >= limit)
5521                           goto bad_string;
5522                       }
5523                   }
5524
5525                 while (pos < limit)
5526                   {
5527                     char c = *pos++;
5528                     switch (c)
5529                       {
5530                       case '\\':
5531                         if (!raw)
5532                           esc++;
5533                         break;
5534
5535                       case '\r':
5536                         if (*pos == '\n')
5537                           pos++;
5538                         /* FALLTHROUGH  */
5539
5540                       case '\n':
5541                         {
5542                           CPP_INCREMENT_LINE (pfile, 0);
5543                           line_count++;
5544                           line_start = pos;
5545                         }
5546                         if (esc)
5547                           esc--;
5548                         break;
5549
5550                       case ')':
5551                         if (raw
5552                             && pos + delim_len + 1 < limit
5553                             && pos[delim_len] == end
5554                             && !memcmp (delim, pos, delim_len))
5555                           {
5556                             pos += delim_len + 1;
5557                             raw = false;
5558                             goto done_string;
5559                           }
5560                         break;
5561
5562                       default:
5563                         if (!raw && !(esc & 1) && c == end)
5564                           goto done_string;
5565                         esc = 0;
5566                         break;
5567                       }
5568                   }
5569               bad_string:
5570                 cpp_error_with_line (pfile, CPP_DL_ERROR, sloc, 0,
5571                                      "unterminated literal");
5572
5573               done_string:
5574                 raw = false;
5575                 lwm = pos - 1;
5576               }
5577               goto dflt;
5578
5579             case '_':
5580             case 'e':
5581             case 'i':
5582             case 'm':
5583               if (bol && module_p && !pfile->state.skipping
5584                   && do_peek_module (pfile, c, pos, limit))
5585                 {
5586                   /* We've seen the start of a module control line.
5587                      Start up the tokenizer.  */
5588                   pos--; /* Backup over the first character.  */
5589
5590                   /* Backup over whitespace to start of line.  */
5591                   while (pos > line_start
5592                          && (pos[-1] == ' ' || pos[-1] == '\t'))
5593                     pos--;
5594
5595                   if (pos > base)
5596                     cb (pfile, CPP_DO_print, data, line_count, base, pos - base);
5597
5598                   /* Prep things for directive handling. */
5599                   buffer->next_line = pos;
5600                   buffer->need_line = true;
5601
5602                   /* Now get tokens until the PRAGMA_EOL.  */
5603                   do
5604                     {
5605                       location_t spelling;
5606                       const cpp_token *tok
5607                         = cpp_get_token_with_location (pfile, &spelling);
5608
5609                       gcc_assert (pfile->state.in_deferred_pragma
5610                                   || tok->type == CPP_PRAGMA_EOL);
5611                       cb (pfile, CPP_DO_token, data, tok, spelling);
5612                     }
5613                   while (pfile->state.in_deferred_pragma);
5614
5615                   if (pfile->buffer->next_line < pfile->buffer->rlimit)
5616                     cb (pfile, CPP_DO_location, data,
5617                         pfile->line_table->highest_line);
5618
5619                   pfile->mi_valid = false;
5620                   goto restart;
5621                 }
5622               goto dflt;
5623
5624             default:
5625             dflt:
5626               bol = false;
5627               pfile->mi_valid = false;
5628               break;
5629             }
5630         }
5631
5632       if (buffer->rlimit > base && !pfile->state.skipping)
5633         {
5634           const unsigned char *limit = buffer->rlimit;
5635           /* If the file was not newline terminated, add rlimit, which is
5636              guaranteed to point to a newline, to the end of our range.  */
5637           if (limit[-1] != '\n')
5638             {
5639               limit++;
5640               CPP_INCREMENT_LINE (pfile, 0);
5641               line_count++;
5642             }
5643           cb (pfile, CPP_DO_print, data, line_count, base, limit - base);
5644         }
5645
5646       _cpp_pop_buffer (pfile);
5647     }
5648   while (pfile->buffer);
5649 }