libcpp/lex.cc

   1 /* CPP Library - lexical analysis.
   2    Copyright (C) 2000-2022 Free Software Foundation, Inc.
   3    Contributed by Per Bothner, 1994-95.
   4    Based on CCCP program by Paul Rubin, June 1986
   5    Adapted to ANSI C, Richard Stallman, Jan 1987
   6    Broken out to separate file, Zack Weinberg, Mar 2000
   7
   8 This program is free software; you can redistribute it and/or modify it
   9 under the terms of the GNU General Public License as published by the
  10 Free Software Foundation; either version 3, or (at your option) any
  11 later version.
  12
  13 This program is distributed in the hope that it will be useful,
  14 but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16 GNU General Public License for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with this program; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "cpplib.h"
  25 #include "internal.h"
  26
/* Category stored for each token type in token_spellings.
   SPELL_OPERATOR is used by the OP rows of TTYPE_TABLE, which carry a
   fixed spelling string; the remaining categories are selected by the
   second argument of each TK row.
   NOTE(review): what each non-operator category implies for how a token
   is spelled is decided by the users of TOKEN_SPELL - confirm there.  */
enum spell_type
{
  SPELL_OPERATOR = 0,
  SPELL_IDENT,
  SPELL_LITERAL,
  SPELL_NONE
};

/* One row of token_spellings: the spelling category of a token type
   together with the string recorded for it by the OP/TK table macros.  */
struct token_spelling
{
  enum spell_type category;
  const unsigned char *name;
};
  40
/* Spellings of the six digraph punctuators.
   NOTE(review): presumably indexed by digraph kind where tokens are
   spelled - confirm the index order at the use site.  */
static const unsigned char *const digraph_spellings[] =
{ UC"%:", UC"%:%:", UC"<:", UC":>", UC"<%", UC"%>" };

/* Build token_spellings by expanding TTYPE_TABLE.  An OP (e, s) row
   becomes { SPELL_OPERATOR, s }; a TK (e, s) row becomes category
   SPELL_<s> with the stringified enumerator name E as its string.
   The helper macros are undefined again right after the expansion.  */
#define OP(e, s) { SPELL_OPERATOR, UC s  },
#define TK(e, s) { SPELL_ ## s,    UC #e },
static const struct token_spelling token_spellings[N_TTYPES] = { TTYPE_TABLE };
#undef OP
#undef TK

/* Fetch the spelling category, respectively the recorded string, for
   TOKEN, using TOKEN->type as the index into token_spellings.  */
#define TOKEN_SPELL(token) (token_spellings[(token)->type].category)
#define TOKEN_NAME(token) (token_spellings[(token)->type].name)
  52
/* Prototypes for helpers that are private to this file (all have
   internal linkage).  Their definitions appear later in the file.  */
static void add_line_note (cpp_buffer *, const uchar *, unsigned int);
static int skip_line_comment (cpp_reader *);
static void skip_whitespace (cpp_reader *, cppchar_t);
static void lex_string (cpp_reader *, cpp_token *, const uchar *);
static void save_comment (cpp_reader *, cpp_token *, const uchar *, cppchar_t);
static void store_comment (cpp_reader *, cpp_token *);
static void create_literal (cpp_reader *, cpp_token *, const uchar *,
                            unsigned int, enum cpp_ttype);
static bool warn_in_comment (cpp_reader *, _cpp_line_note *);
static int name_p (cpp_reader *, const cpp_string *);
static tokenrun *next_tokenrun (tokenrun *);

static _cpp_buff *new_buff (size_t);
  66
  67
  68 /* Utility routine:
  69
  70    Compares, the token TOKEN to the NUL-terminated string STRING.
  71    TOKEN must be a CPP_NAME.  Returns 1 for equal, 0 for unequal.  */
  72 int
  73 cpp_ideq (const cpp_token *token, const char *string)
  74 {
  75   if (token->type != CPP_NAME)
  76     return 0;
  77
  78   return !ustrcmp (NODE_NAME (token->val.node.node), (const uchar *) string);
  79 }
  80
  81 /* Record a note TYPE at byte POS into the current cleaned logical
  82    line.  */
  83 static void
  84 add_line_note (cpp_buffer *buffer, const uchar *pos, unsigned int type)
  85 {
  86   if (buffer->notes_used == buffer->notes_cap)
  87     {
  88       buffer->notes_cap = buffer->notes_cap * 2 + 200;
  89       buffer->notes = XRESIZEVEC (_cpp_line_note, buffer->notes,
  90                                   buffer->notes_cap);
  91     }
  92
  93   buffer->notes[buffer->notes_used].pos = pos;
  94   buffer->notes[buffer->notes_used].type = type;
  95   buffer->notes_used++;
  96 }
  97
  98 \f
  99 /* Fast path to find line special characters using optimized character
 100    scanning algorithms.  Anything complicated falls back to the slow
 101    path below.  Since this loop is very hot it's worth doing these kinds
 102    of optimizations.
 103
 104    One of the paths through the ifdefs should provide
 105
 106      const uchar *search_line_fast (const uchar *s, const uchar *end);
 107
 108    Between S and END, search for \n, \r, \\, ?.  Return a pointer to
 109    the found character.
 110
 111    Note that the last character of the buffer is *always* a newline,
 112    as forced by _cpp_convert_input.  This fact can be used to avoid
 113    explicitly looking for the end of the buffer.  */
 114
/* Configure gives us an ifdef test.  Give WORDS_BIGENDIAN the value 0
   when it is not defined, so that it can be tested in ordinary
   expressions such as "if (WORDS_BIGENDIAN)" below.  */
#ifndef WORDS_BIGENDIAN
#define WORDS_BIGENDIAN 0
#endif

/* We'd like the largest integer that fits into a register.  There's nothing
   in <stdint.h> that gives us that.  For most hosts this is unsigned long,
   but MS decided on an LLP64 model.  Thankfully when building with GCC we
   can get the "real" word size.  */
#ifdef __GNUC__
typedef unsigned int word_type __attribute__((__mode__(__word__)));
#else
typedef unsigned long word_type;
#endif

/* The code below is only expecting sizes 4 or 8.
   Die at compile-time if this expectation is violated: the array length
   below evaluates to 1 when the size is acceptable and to -1, which is
   not a valid array length, otherwise.  */
typedef char check_word_type_size
  [(sizeof(word_type) == 8 || sizeof(word_type) == 4) * 2 - 1];
 134
 135 /* Return X with the first N bytes forced to values that won't match one
 136    of the interesting characters.  Note that NUL is not interesting.  */
 137
 138 static inline word_type
 139 acc_char_mask_misalign (word_type val, unsigned int n)
 140 {
 141   word_type mask = -1;
 142   if (WORDS_BIGENDIAN)
 143     mask >>= n * 8;
 144   else
 145     mask <<= n * 8;
 146   return val & mask;
 147 }
 148
 149 /* Return X replicated to all byte positions within WORD_TYPE.  */
 150
 151 static inline word_type
 152 acc_char_replicate (uchar x)
 153 {
 154   word_type ret;
 155
 156   ret = (x << 24) | (x << 16) | (x << 8) | x;
 157   if (sizeof(word_type) == 8)
 158     ret = (ret << 16 << 16) | ret;
 159   return ret;
 160 }
 161
 162 /* Return non-zero if some byte of VAL is (probably) C.  */
 163
 164 static inline word_type
 165 acc_char_cmp (word_type val, word_type c)
 166 {
 167 #if defined(__GNUC__) && defined(__alpha__)
 168   /* We can get exact results using a compare-bytes instruction.
 169      Get (val == c) via (0 >= (val ^ c)).  */
 170   return __builtin_alpha_cmpbge (0, val ^ c);
 171 #else
 172   word_type magic = 0x7efefefeU;
 173   if (sizeof(word_type) == 8)
 174     magic = (magic << 16 << 16) | 0xfefefefeU;
 175   magic |= 1;
 176
 177   val ^= c;
 178   return ((val + magic) ^ ~val) & ~magic;
 179 #endif
 180 }
 181
 182 /* Given the result of acc_char_cmp is non-zero, return the index of
 183    the found character.  If this was a false positive, return -1.  */
 184
 185 static inline int
 186 acc_char_index (word_type cmp ATTRIBUTE_UNUSED,
 187                 word_type val ATTRIBUTE_UNUSED)
 188 {
 189 #if defined(__GNUC__) && defined(__alpha__) && !WORDS_BIGENDIAN
 190   /* The cmpbge instruction sets *bits* of the result corresponding to
 191      matches in the bytes with no false positives.  */
 192   return __builtin_ctzl (cmp);
 193 #else
 194   unsigned int i;
 195
 196   /* ??? It would be nice to force unrolling here,
 197      and have all of these constants folded.  */
 198   for (i = 0; i < sizeof(word_type); ++i)
 199     {
 200       uchar c;
 201       if (WORDS_BIGENDIAN)
 202         c = (val >> (sizeof(word_type) - i - 1) * 8) & 0xff;
 203       else
 204         c = (val >> i * 8) & 0xff;
 205
 206       if (c == '\n' || c == '\r' || c == '\\' || c == '?')
 207         return i;
 208     }
 209
 210   return -1;
 211 #endif
 212 }
 213
 214 /* A version of the fast scanner using bit fiddling techniques.
 215
 216    For 32-bit words, one would normally perform 16 comparisons and
 217    16 branches.  With this algorithm one performs 24 arithmetic
 218    operations and one branch.  Whether this is faster with a 32-bit
 219    word size is going to be somewhat system dependent.
 220
 221    For 64-bit words, we eliminate twice the number of comparisons
 222    and branches without increasing the number of arithmetic operations.
 223    It's almost certainly going to be a win with 64-bit word size.  */
 224
 225 static const uchar * search_line_acc_char (const uchar *, const uchar *)
 226   ATTRIBUTE_UNUSED;
 227
 228 static const uchar *
 229 search_line_acc_char (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 230 {
 231   const word_type repl_nl = acc_char_replicate ('\n');
 232   const word_type repl_cr = acc_char_replicate ('\r');
 233   const word_type repl_bs = acc_char_replicate ('\\');
 234   const word_type repl_qm = acc_char_replicate ('?');
 235
 236   unsigned int misalign;
 237   const word_type *p;
 238   word_type val, t;
 239
 240   /* Align the buffer.  Mask out any bytes from before the beginning.  */
 241   p = (word_type *)((uintptr_t)s & -sizeof(word_type));
 242   val = *p;
 243   misalign = (uintptr_t)s & (sizeof(word_type) - 1);
 244   if (misalign)
 245     val = acc_char_mask_misalign (val, misalign);
 246
 247   /* Main loop.  */
 248   while (1)
 249     {
 250       t  = acc_char_cmp (val, repl_nl);
 251       t |= acc_char_cmp (val, repl_cr);
 252       t |= acc_char_cmp (val, repl_bs);
 253       t |= acc_char_cmp (val, repl_qm);
 254
 255       if (__builtin_expect (t != 0, 0))
 256         {
 257           int i = acc_char_index (t, val);
 258           if (i >= 0)
 259             return (const uchar *)p + i;
 260         }
 261
 262       val = *++p;
 263     }
 264 }
 265
 266 /* Disable on Solaris 2/x86 until the following problem can be properly
 267    autoconfed:
 268
 269    The Solaris 10+ assembler tags objects with the instruction set
 270    extensions used, so SSE4.2 executables cannot run on machines that
 271    don't support that extension.  */
 272
 273 #if (GCC_VERSION >= 4005) && (__GNUC__ >= 5 || !defined(__PIC__)) && (defined(__i386__) || defined(__x86_64__)) && !(defined(__sun__) && defined(__svr4__))
 274
/* Replicated character data to be shared between implementations.
   Recall that outside of a context with vector support we can't
   define compatible vector types, therefore these are all defined
   in terms of raw characters.

   Row 0 is newline, row 1 carriage return, row 2 backslash and row 3
   question mark, each repeated 16 times.  The table is 16-byte aligned
   and each row is 16 bytes long, so a row can be read as an 8-byte or
   a 16-byte vector by the scanners below.  */
static const char repl_chars[4][16] __attribute__((aligned(16))) = {
  { '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
    '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n' },
  { '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
    '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r' },
  { '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
    '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\' },
  { '?', '?', '?', '?', '?', '?', '?', '?',
    '?', '?', '?', '?', '?', '?', '?', '?' },
};
 289
/* A version of the fast scanner using MMX vectorized byte compare insns.

   This uses the PMOVMSKB instruction which was introduced with "MMX2",
   which was packaged into SSE1; it is also present in the AMD MMX
   extension.  Mark the function as using "sse" so that we emit a real
   "emms" instruction, rather than the 3dNOW "femms" instruction.

   Returns a pointer to the first \n, \r, \\ or ? at or after S.  END
   is unused: the loop has no end-of-buffer test and relies on one of
   those characters being present.  */

static const uchar *
#ifndef __SSE__
__attribute__((__target__("sse")))
#endif
search_line_mmx (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
{
  typedef char v8qi __attribute__ ((__vector_size__ (8)));
  typedef int __m64 __attribute__ ((__vector_size__ (8), __may_alias__));

  /* Only the first 8 bytes of each 16-byte row are used here.  */
  const v8qi repl_nl = *(const v8qi *)repl_chars[0];
  const v8qi repl_cr = *(const v8qi *)repl_chars[1];
  const v8qi repl_bs = *(const v8qi *)repl_chars[2];
  const v8qi repl_qm = *(const v8qi *)repl_chars[3];

  unsigned int misalign, found, mask;
  const v8qi *p;
  v8qi data, t, c;

  /* Align the source pointer.  While MMX doesn't generate unaligned data
     faults, this allows us to safely scan to the end of the buffer without
     reading beyond the end of the last page.  */
  misalign = (uintptr_t)s & 7;
  p = (const v8qi *)((uintptr_t)s & -8);
  data = *p;

  /* Create a mask for the bytes that are valid within the first
     8-byte block.  The Idea here is that the AND with the mask
     within the loop is "free", since we need some AND or TEST
     insn in order to set the flags for the branch anyway.  */
  mask = -1u << misalign;

  /* Main loop processing 8 bytes at a time.  The first block has
     already been loaded, so enter the loop past the load; every later
     iteration loads the next block and accepts all of its bytes.  */
  goto start;
  do
    {
      data = *++p;
      mask = -1;

    start:
      /* OR together the per-byte equality results for the four
         characters, then collect one bit per byte into FOUND.  */
      t = __builtin_ia32_pcmpeqb(data, repl_nl);
      c = __builtin_ia32_pcmpeqb(data, repl_cr);
      t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
      c = __builtin_ia32_pcmpeqb(data, repl_bs);
      t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
      c = __builtin_ia32_pcmpeqb(data, repl_qm);
      t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
      found = __builtin_ia32_pmovmskb (t);
      found &= mask;
    }
  while (!found);

  __builtin_ia32_emms ();

  /* FOUND contains 1 in bits for which we matched a relevant
     character.  Conversion to the byte index is trivial.  */
  found = __builtin_ctz(found);
  return (const uchar *)p + found;
}
 355
/* A version of the fast scanner using SSE2 vectorized byte compare insns.

   Returns a pointer to the first \n, \r, \\ or ? at or after S.  END
   is unused: the loop has no end-of-buffer test and relies on one of
   those characters being present.  Only aligned 16-byte blocks are
   read, starting with the block that contains S.  */

static const uchar *
#ifndef __SSE2__
__attribute__((__target__("sse2")))
#endif
search_line_sse2 (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
{
  typedef char v16qi __attribute__ ((__vector_size__ (16)));

  const v16qi repl_nl = *(const v16qi *)repl_chars[0];
  const v16qi repl_cr = *(const v16qi *)repl_chars[1];
  const v16qi repl_bs = *(const v16qi *)repl_chars[2];
  const v16qi repl_qm = *(const v16qi *)repl_chars[3];

  unsigned int misalign, found, mask;
  const v16qi *p;
  v16qi data, t;

  /* Align the source pointer.  */
  misalign = (uintptr_t)s & 15;
  p = (const v16qi *)((uintptr_t)s & -16);
  data = *p;

  /* Create a mask for the bytes that are valid within the first
     16-byte block.  The Idea here is that the AND with the mask
     within the loop is "free", since we need some AND or TEST
     insn in order to set the flags for the branch anyway.  */
  mask = -1u << misalign;

  /* Main loop processing 16 bytes at a time.  The first block has
     already been loaded, so enter the loop past the load; every later
     iteration loads the next block and accepts all of its bytes.  */
  goto start;
  do
    {
      data = *++p;
      mask = -1;

    start:
      /* Per-byte equality against each of the four characters, merged
         and then reduced to one bit per byte in FOUND.  */
      t  = data == repl_nl;
      t |= data == repl_cr;
      t |= data == repl_bs;
      t |= data == repl_qm;
      found = __builtin_ia32_pmovmskb128 (t);
      found &= mask;
    }
  while (!found);

  /* FOUND contains 1 in bits for which we matched a relevant
     character.  Conversion to the byte index is trivial.  */
  found = __builtin_ctz(found);
  return (const uchar *)p + found;
}
 408
#ifdef HAVE_SSE4
/* A version of the fast scanner using SSE 4.2 vectorized string insns.

   Returns a pointer to the first \n, \r, \\ or ? at or after S.  END
   is only used to decide whether an unaligned 16-byte read at S could
   run off the end of both the buffer and the page; the main loop has
   no end-of-buffer test and relies on one of those characters being
   present.  */

static const uchar *
#ifndef __SSE4_2__
__attribute__((__target__("sse4.2")))
#endif
search_line_sse42 (const uchar *s, const uchar *end)
{
  typedef char v16qi __attribute__ ((__vector_size__ (16)));
  /* The set of characters to look for; the explicit length of 4 passed
     to the instruction below covers exactly these initializers.  */
  static const v16qi search = { '\n', '\r', '?', '\\' };

  uintptr_t si = (uintptr_t)s;
  uintptr_t index;

  /* Check for unaligned input.  */
  if (si & 15)
    {
      v16qi sv;

      if (__builtin_expect (end - s < 16, 0)
          && __builtin_expect ((si & 0xfff) > 0xff0, 0))
        {
          /* There are less than 16 bytes left in the buffer, and less
             than 16 bytes left on the page.  Reading 16 bytes at this
             point might generate a spurious page fault.  Defer to the
             SSE2 implementation, which already handles alignment.  */
          return search_line_sse2 (s, end);
        }

      /* ??? The builtin doesn't understand that the PCMPESTRI read from
         memory need not be aligned.  */
      sv = __builtin_ia32_loaddqu ((const char *) s);
      index = __builtin_ia32_pcmpestri128 (search, 4, sv, 16, 0);

      /* An index below 16 is treated as a match within these 16 bytes.  */
      if (__builtin_expect (index < 16, 0))
        goto found;

      /* Advance the pointer to an aligned address.  We will re-scan a
         few bytes, but we no longer need care for reading past the
         end of a page, since we're guaranteed a match.  */
      s = (const uchar *)((si + 15) & -16);
    }

  /* Main loop, processing 16 bytes at a time.  */
#ifdef __GCC_ASM_FLAG_OUTPUTS__
  while (1)
    {
      char f;

      /* By using inline assembly instead of the builtin,
         we can use the result, as well as the flags set.  F receives
         the carry flag; the loop ends once it is set.  */
      __asm ("%vpcmpestri\t$0, %2, %3"
             : "=c"(index), "=@ccc"(f)
             : "m"(*s), "x"(search), "a"(4), "d"(16));
      if (f)
        break;

      s += 16;
    }
#else
  /* The assembly loop adds 16 before each compare, so start one block
     early.  */
  s -= 16;
  /* By doing the whole loop in inline assembly,
     we can make proper use of the flags set.  */
  __asm (      ".balign 16\n"
        "0:     add $16, %1\n"
        "       %vpcmpestri\t$0, (%1), %2\n"
        "       jnc 0b"
        : "=&c"(index), "+r"(s)
        : "x"(search), "a"(4), "d"(16));
#endif

 found:
  /* INDEX is the offset of the match within the 16 bytes at S.  */
  return s + index;
}

#else
/* Work around out-dated assemblers without sse4 support.  */
#define search_line_sse42 search_line_sse2
#endif
 489
 490 /* Check the CPU capabilities.  */
 491
 492 #include "../gcc/config/i386/cpuid.h"
 493
/* Signature shared by the line scanner implementations above, and the
   pointer through which the implementation chosen by
   init_vectorized_lexer is called.  */
typedef const uchar * (*search_line_fast_type) (const uchar *, const uchar *);
static search_line_fast_type search_line_fast;

/* Tells _cpp_init_lexer that init_vectorized_lexer exists and has to
   be called.  */
#define HAVE_init_vectorized_lexer 1
 498 static inline void
 499 init_vectorized_lexer (void)
 500 {
 501   unsigned dummy, ecx = 0, edx = 0;
 502   search_line_fast_type impl = search_line_acc_char;
 503   int minimum = 0;
 504
 505 #if defined(__SSE4_2__)
 506   minimum = 3;
 507 #elif defined(__SSE2__)
 508   minimum = 2;
 509 #elif defined(__SSE__)
 510   minimum = 1;
 511 #endif
 512
 513   if (minimum == 3)
 514     impl = search_line_sse42;
 515   else if (__get_cpuid (1, &dummy, &dummy, &ecx, &edx) || minimum == 2)
 516     {
 517       if (minimum == 3 || (ecx & bit_SSE4_2))
 518         impl = search_line_sse42;
 519       else if (minimum == 2 || (edx & bit_SSE2))
 520         impl = search_line_sse2;
 521       else if (minimum == 1 || (edx & bit_SSE))
 522         impl = search_line_mmx;
 523     }
 524   else if (__get_cpuid (0x80000001, &dummy, &dummy, &dummy, &edx))
 525     {
 526       if (minimum == 1
 527           || (edx & (bit_MMXEXT | bit_CMOV)) == (bit_MMXEXT | bit_CMOV))
 528         impl = search_line_mmx;
 529     }
 530
 531   search_line_fast = impl;
 532 }
 533
 534 #elif (GCC_VERSION >= 4005) && defined(_ARCH_PWR8) && defined(__ALTIVEC__)
 535
/* A version of the fast scanner using AltiVec vectorized byte compares
   and VSX unaligned loads (when VSX is available).  This is otherwise
   the same as the AltiVec version.

   Returns a pointer to the first \n, \r, \\ or ? at or after S.  END
   is unused: the loop has no end-of-buffer test and relies on one of
   those characters being present.  Because the loads start at S
   itself, no alignment fix-up or leading-byte mask is needed here.  */

ATTRIBUTE_NO_SANITIZE_UNDEFINED
static const uchar *
search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
{
  typedef __attribute__((altivec(vector))) unsigned char vc;

  const vc repl_nl = {
    '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
    '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
  };
  const vc repl_cr = {
    '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
    '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
  };
  const vc repl_bs = {
    '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
    '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
  };
  const vc repl_qm = {
    '?', '?', '?', '?', '?', '?', '?', '?',
    '?', '?', '?', '?', '?', '?', '?', '?',
  };
  const vc zero = { 0 };

  vc data, t;

  /* Main loop processing 16 bytes at a time.  */
  do
    {
      vc m_nl, m_cr, m_bs, m_qm;

      data = __builtin_vec_vsx_ld (0, s);
      s += 16;

      m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
      m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
      m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
      m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
      t = (m_nl | m_cr) | (m_bs | m_qm);

      /* T now contains 0xff in bytes for which we matched one of the relevant
         characters.  We want to exit the loop if any byte in T is non-zero.
         Below is the expansion of vec_any_ne(t, zero).  */
    }
  while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));

  /* Restore s to point to the 16 bytes we just processed.  */
  s -= 16;

  {
    /* Number of unsigned longs that make up one vector.  */
#define N  (sizeof(vc) / sizeof(long))

    union {
      vc v;
      /* Statically assert that N is 2 or 4.  */
      unsigned long l[(N == 2 || N == 4) ? N : -1];
    } u;
    unsigned long l, i = 0;

    u.v = t;

    /* Find the first word of T that is non-zero.  S is advanced past
       every all-zero word that is skipped.  The last word is taken
       without a test because the loop above only exits when some byte
       of T is non-zero.  */
    switch (N)
      {
      case 4:
        l = u.l[i++];
        if (l != 0)
          break;
        s += sizeof(unsigned long);
        l = u.l[i++];
        if (l != 0)
          break;
        s += sizeof(unsigned long);
        /* FALLTHRU */
      case 2:
        l = u.l[i++];
        if (l != 0)
          break;
        s += sizeof(unsigned long);
        l = u.l[i];
      }

    /* L now contains 0xff in bytes for which we matched one of the
       relevant characters.  We can find the byte index by finding
       its bit index and dividing by 8.  The first byte in memory is
       the most significant one on big-endian and the least significant
       one on little-endian, hence the choice of count below.  */
#ifdef __BIG_ENDIAN__
    l = __builtin_clzl(l) >> 3;
#else
    l = __builtin_ctzl(l) >> 3;
#endif
    return s + l;

#undef N
  }
}
 635
 636 #elif (GCC_VERSION >= 4005) && defined(__ALTIVEC__) && defined (__BIG_ENDIAN__)
 637
/* A version of the fast scanner using AltiVec vectorized byte compares.
   This cannot be used for little endian because vec_lvsl/lvsr are
   deprecated for little endian and the code won't work properly.  */
/* ??? Unfortunately, attribute(target("altivec")) is not yet supported,
   so we can't compile this function without -maltivec on the command line
   (or implied by some other switch).  */

/* Returns a pointer to the first \n, \r, \\ or ? at or after S.  END is
   unused: the loop has no end-of-buffer test and relies on one of those
   characters being present.  */
static const uchar *
search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
{
  typedef __attribute__((altivec(vector))) unsigned char vc;

  const vc repl_nl = {
    '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
    '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
  };
  const vc repl_cr = {
    '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
    '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
  };
  const vc repl_bs = {
    '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
    '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
  };
  const vc repl_qm = {
    '?', '?', '?', '?', '?', '?', '?', '?',
    '?', '?', '?', '?', '?', '?', '?', '?',
  };
  const vc ones = {
    -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1,
  };
  const vc zero = { 0 };

  vc data, mask, t;

  /* Altivec loads automatically mask addresses with -16.  This lets us
     issue the first load as early as possible.  */
  data = __builtin_vec_ld(0, (const vc *)s);

  /* Discard bytes before the beginning of the buffer.  Do this by
     beginning with all ones and shifting in zeros according to the
     mis-alignment.  The LVSR instruction pulls the exact shift we
     want from the address.  */
  mask = __builtin_vec_lvsr(0, s);
  mask = __builtin_vec_perm(zero, ones, mask);
  data &= mask;

  /* While altivec loads mask addresses, we still need to align S so
     that the offset we compute at the end is correct.  */
  s = (const uchar *)((uintptr_t)s & -16);

  /* Main loop processing 16 bytes at a time.  The first block is
     already loaded and masked, so enter the loop past the load.  */
  goto start;
  do
    {
      vc m_nl, m_cr, m_bs, m_qm;

      s += 16;
      data = __builtin_vec_ld(0, (const vc *)s);

    start:
      m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
      m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
      m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
      m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
      t = (m_nl | m_cr) | (m_bs | m_qm);

      /* T now contains 0xff in bytes for which we matched one of the relevant
         characters.  We want to exit the loop if any byte in T is non-zero.
         Below is the expansion of vec_any_ne(t, zero).  */
    }
  while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));

  {
    /* Number of unsigned longs that make up one vector.  */
#define N  (sizeof(vc) / sizeof(long))

    union {
      vc v;
      /* Statically assert that N is 2 or 4.  */
      unsigned long l[(N == 2 || N == 4) ? N : -1];
    } u;
    unsigned long l, i = 0;

    u.v = t;

    /* Find the first word of T that is non-zero.  S is advanced past
       every all-zero word that is skipped.  The last word is taken
       without a test because the loop above only exits when some byte
       of T is non-zero.  */
    switch (N)
      {
      case 4:
        l = u.l[i++];
        if (l != 0)
          break;
        s += sizeof(unsigned long);
        l = u.l[i++];
        if (l != 0)
          break;
        s += sizeof(unsigned long);
        /* FALLTHROUGH */
      case 2:
        l = u.l[i++];
        if (l != 0)
          break;
        s += sizeof(unsigned long);
        l = u.l[i];
      }

    /* L now contains 0xff in bytes for which we matched one of the
       relevant characters.  We can find the byte index by finding
       its bit index and dividing by 8.  This variant is big-endian
       only, so the first byte in memory is the most significant one.  */
    l = __builtin_clzl(l) >> 3;
    return s + l;

#undef N
  }
}
 754
 755 #elif defined (__ARM_NEON) && defined (__ARM_64BIT_STATE)
 756 #include "arm_neon.h"
 757
/* This doesn't have to be the exact page size, but no system may use
   a size smaller than this.  ARMv8 requires a minimum page size of
   4k.  The impact of being conservative here is a small number of
   cases will take the slightly slower entry path into the main
   loop.  */

#define AARCH64_MIN_PAGE_SIZE 4096

/* A version of the fast scanner using AArch64 NEON byte compares.
   Returns a pointer to the first \n, \r, \\ or ? at or after S.  END is
   unused: the loop has no end-of-buffer test and relies on one of those
   characters being present.  */
static const uchar *
search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
{
  const uint8x16_t repl_nl = vdupq_n_u8 ('\n');
  const uint8x16_t repl_cr = vdupq_n_u8 ('\r');
  const uint8x16_t repl_bs = vdupq_n_u8 ('\\');
  const uint8x16_t repl_qm = vdupq_n_u8 ('?');
  /* One distinct bit per byte within each 8-byte half.  ANDing it with
     a compare result and summing the lanes (after shifting one half up
     by 8 bits) yields the 16-bit match mask that __builtin_ctz is
     applied to below.  */
  const uint8x16_t xmask = (uint8x16_t) vdupq_n_u64 (0x8040201008040201ULL);

#ifdef __ARM_BIG_ENDIAN
  const int16x8_t shift = {8, 8, 8, 8, 0, 0, 0, 0};
#else
  const int16x8_t shift = {0, 0, 0, 0, 8, 8, 8, 8};
#endif

  unsigned int found;
  const uint8_t *p;
  uint8x16_t data;
  uint8x16_t t;
  uint16x8_t m;
  uint8x16_t u, v, w;

  /* Align the source pointer.  */
  p = (const uint8_t *)((uintptr_t)s & -16);

  /* Assuming random string start positions, with a 4k page size we'll take
     the slow path about 0.37% of the time.  */
  if (__builtin_expect ((AARCH64_MIN_PAGE_SIZE
                         - (((uintptr_t) s) & (AARCH64_MIN_PAGE_SIZE - 1)))
                        < 16, 0))
    {
      /* Slow path: the string starts near a possible page boundary.
         Load the aligned block containing S and ignore the bytes that
         precede S by masking them out of the match mask.  */
      uint32_t misalign, mask;

      misalign = (uintptr_t)s & 15;
      mask = (-1u << misalign) & 0xffff;
      data = vld1q_u8 (p);
      t = vceqq_u8 (data, repl_nl);
      u = vceqq_u8 (data, repl_cr);
      v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
      w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
      t = vorrq_u8 (v, w);
      t = vandq_u8 (t, xmask);
      m = vpaddlq_u8 (t);
      m = vshlq_u16 (m, shift);
      found = vaddvq_u16 (m);
      found &= mask;
      if (found)
        return (const uchar*)p + __builtin_ctz (found);
    }
  else
    {
      /* Fast path: load 16 bytes directly from S, aligned or not.  A
         hit here is located relative to S rather than P (see the
         return statement).  */
      data = vld1q_u8 ((const uint8_t *) s);
      t = vceqq_u8 (data, repl_nl);
      u = vceqq_u8 (data, repl_cr);
      v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
      w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
      t = vorrq_u8 (v, w);
      if (__builtin_expect (vpaddd_u64 ((uint64x2_t)t) != 0, 0))
        goto done;
    }

  /* Main loop: aligned 16-byte blocks, starting with the block after
     the one that contains S.  */
  do
    {
      p += 16;
      data = vld1q_u8 (p);
      t = vceqq_u8 (data, repl_nl);
      u = vceqq_u8 (data, repl_cr);
      v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
      w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
      t = vorrq_u8 (v, w);
    } while (!vpaddd_u64 ((uint64x2_t)t));

done:
  /* Now that we've found the terminating substring, work out precisely where
     we need to stop.  P is still below S only when we arrived here from
     the unaligned first load, in which case the match offset is
     relative to S.  */
  t = vandq_u8 (t, xmask);
  m = vpaddlq_u8 (t);
  m = vshlq_u16 (m, shift);
  found = vaddvq_u16 (m);
  return (((((uintptr_t) p) < (uintptr_t) s) ? s : (const uchar *)p)
          + __builtin_ctz (found));
}
 849
 850 #elif defined (__ARM_NEON)
 851 #include "arm_neon.h"
 852
/* A version of the fast scanner using 32-bit ARM NEON byte compares.
   Returns a pointer to the first \n, \r, \\ or ? at or after S.  END is
   unused: the loop has no end-of-buffer test and relies on one of those
   characters being present.  Only aligned 16-byte blocks are read,
   starting with the block that contains S.  */
static const uchar *
search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
{
  const uint8x16_t repl_nl = vdupq_n_u8 ('\n');
  const uint8x16_t repl_cr = vdupq_n_u8 ('\r');
  const uint8x16_t repl_bs = vdupq_n_u8 ('\\');
  const uint8x16_t repl_qm = vdupq_n_u8 ('?');
  /* One distinct bit per byte within each 8-byte half; used to turn a
     compare result into a bit mask.  */
  const uint8x16_t xmask = (uint8x16_t) vdupq_n_u64 (0x8040201008040201ULL);

  unsigned int misalign, found, mask;
  const uint8_t *p;
  uint8x16_t data;

  /* Align the source pointer.  */
  misalign = (uintptr_t)s & 15;
  p = (const uint8_t *)((uintptr_t)s & -16);
  data = vld1q_u8 (p);

  /* Create a mask for the bytes that are valid within the first
     16-byte block.  The Idea here is that the AND with the mask
     within the loop is "free", since we need some AND or TEST
     insn in order to set the flags for the branch anyway.  */
  mask = (-1u << misalign) & 0xffff;

  /* Main loop, processing 16 bytes at a time.  The first block has
     already been loaded, so enter the loop past the load.  */
  goto start;

  do
    {
      uint8x8_t l;
      uint16x4_t m;
      uint32x2_t n;
      uint8x16_t t, u, v, w;

      p += 16;
      data = vld1q_u8 (p);
      mask = 0xffff;

    start:
      t = vceqq_u8 (data, repl_nl);
      u = vceqq_u8 (data, repl_cr);
      v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
      w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
      t = vandq_u8 (vorrq_u8 (v, w), xmask);
      /* Pairwise additions collapse the sixteen single-bit bytes into
         two 8-bit masks, one in each 32-bit lane of N.  */
      l = vpadd_u8 (vget_low_u8 (t), vget_high_u8 (t));
      m = vpaddl_u8 (l);
      n = vpaddl_u16 (m);

      /* Shifting the 64-bit view right by 24 brings the upper lane's
         mask down to bits 8..15 of lane 0, giving one 16-bit mask.  */
      found = vget_lane_u32 ((uint32x2_t) vorr_u64 ((uint64x1_t) n,
              vshr_n_u64 ((uint64x1_t) n, 24)), 0);
      found &= mask;
    }
  while (!found);

  /* FOUND contains 1 in bits for which we matched a relevant
     character.  Conversion to the byte index is trivial.  */
  found = __builtin_ctz (found);
  return (const uchar *)p + found;
}
 912
 913 #else
 914
 915 /* Fallback when none of the preceding preprocessor cases applies:
 916    we only have one accelerated alternative, so search_line_fast is
        simply another name for search_line_acc_char.  Use a direct call so
        that we encourage inlining.  */
 917
 918 #define search_line_fast  search_line_acc_char
 919
 920 #endif
 921
 922 /* Initialize the lexer if needed.  */
 923
 924 void
 925 _cpp_init_lexer (void)
 926 {
 927 #ifdef HAVE_init_vectorized_lexer
 928   init_vectorized_lexer ();
 929 #endif
 930 }
 931
 932 /* Returns with a logical line that contains no escaped newlines or
 933    trigraphs.  This is a time-critical inner loop.

        The line starts at buffer->next_line of PFILE's current buffer and is
        cleaned in place: escaped newlines are spliced out and, when the
        trigraphs option is enabled, trigraphs are replaced.  A line note is
        added for every splice and for every trigraph seen.  On return
        buffer->cur and buffer->line_base point at the start of the line, the
        cleaned line is terminated by '\n', and buffer->next_line points one
        past the last byte consumed.  */
 934 void
 935 _cpp_clean_line (cpp_reader *pfile)
 936 {
 937   cpp_buffer *buffer;
       /* S is the read position, D the write position (they only differ once
          the slow path has started), P is a scratch pointer used to look
          backwards from a line terminator.  */
 938   const uchar *s;
 939   uchar c, *d, *p;
 940
 941   buffer = pfile->buffer;
 942   buffer->cur_note = buffer->notes_used = 0;
 943   buffer->cur = buffer->line_base = buffer->next_line;
 944   buffer->need_line = false;
 945   s = buffer->next_line;
 946
 947   if (!buffer->from_stage3)
 948     {
           /* Most recent backslash seen on the fast path, if any.  */
 949       const uchar *pbackslash = NULL;
 950
 951       /* Fast path.  This is the common case of an un-escaped line with
 952          no trigraphs.  The primary win here is by not writing any
 953          data back to memory until we have to.  */
 954       while (1)
 955         {
 956           /* Perform an optimized search for \n, \r, \\, ?.  */
 957           s = search_line_fast (s, buffer->rlimit);
 958
 959           c = *s;
 960           if (c == '\\')
 961             {
 962               /* Record the location of the backslash and continue.  */
 963               pbackslash = s++;
 964             }
 965           else if (__builtin_expect (c == '?', 0))
 966             {
 967               if (__builtin_expect (s[1] == '?', false)
 968                    && _cpp_trigraph_map[s[2]])
 969                 {
 970                   /* Have a trigraph.  We may or may not have to convert
 971                      it.  Add a line note regardless, for -Wtrigraphs.  */
 972                   add_line_note (buffer, s, s[2]);
 973                   if (CPP_OPTION (pfile, trigraphs))
 974                     {
 975                       /* We do, and that means we have to switch to the
 976                          slow path.  */
                           /* The replacement is stored over the first '?';
                              S is left on the trigraph's last byte because
                              the slow path pre-increments before reading.  */
 977                       d = (uchar *) s;
 978                       *d = _cpp_trigraph_map[s[2]];
 979                       s += 2;
 980                       goto slow_path;
 981                     }
 982                 }
 983               /* Not a trigraph.  Continue on fast-path.  */
 984               s++;
 985             }
 986           else
 987             break;
 988         }
 989
 990       /* This must be \r or \n.  We're either done, or we'll be forced
 991          to write back to the buffer and continue on the slow path.  */
 992       d = (uchar *) s;
 993
 994       if (__builtin_expect (s == buffer->rlimit, false))
 995         goto done;
 996
 997       /* DOS line ending? */
 998       if (__builtin_expect (c == '\r', false) && s[1] == '\n')
 999         {
1000           s++;
1001           if (s == buffer->rlimit)
1002             goto done;
1003         }
1004
1005       if (__builtin_expect (pbackslash == NULL, true))
1006         goto done;
1007
1008       /* Check for escaped newline.  */
           /* Step back over horizontal whitespace before the terminator; the
              newline is escaped only if the byte before that is the last
              backslash recorded above.  */
1009       p = d;
1010       while (is_nvspace (p[-1]))
1011         p--;
1012       if (p - 1 != pbackslash)
1013         goto done;
1014
1015       /* Have an escaped newline; process it and proceed to
1016          the slow path.  */
           /* The note type is ' ' when whitespace separated the backslash from
              the newline, '\\' otherwise.  D is set two before P because the
              slow path pre-increments it, so the next byte stored lands on
              the backslash at P - 1.  */
1017       add_line_note (buffer, p - 1, p != d ? ' ' : '\\');
1018       d = p - 2;
1019       buffer->next_line = p - 1;
1020
1021     slow_path:
           /* Copy byte by byte from S to D.  Within this loop
              buffer->next_line marks where the text after the latest splice
              begins, which bounds the backwards scans below.  */
1022       while (1)
1023         {
1024           c = *++s;
1025           *++d = c;
1026
1027           if (c == '\n' || c == '\r')
1028             {
1029               /* Handle DOS line endings.  */
1030               if (c == '\r' && s != buffer->rlimit && s[1] == '\n')
1031                 s++;
1032               if (s == buffer->rlimit)
1033                 break;
1034
1035               /* Escaped?  */
1036               p = d;
1037               while (p != buffer->next_line && is_nvspace (p[-1]))
1038                 p--;
1039               if (p == buffer->next_line || p[-1] != '\\')
1040                 break;
1041
1042               add_line_note (buffer, p - 1, p != d ? ' ': '\\');
1043               d = p - 2;
1044               buffer->next_line = p - 1;
1045             }
1046           else if (c == '?' && s[1] == '?' && _cpp_trigraph_map[s[2]])
1047             {
1048               /* Add a note regardless, for the benefit of -Wtrigraphs.  */
1049               add_line_note (buffer, d, s[2]);
1050               if (CPP_OPTION (pfile, trigraphs))
1051                 {
1052                   *d = _cpp_trigraph_map[s[2]];
1053                   s += 2;
1054                 }
1055             }
1056         }
1057     }
1058   else
1059     {
           /* from_stage3 buffers get no trigraph or escaped-newline handling;
              only the end of the line is located.  */
1060       while (*s != '\n' && *s != '\r')
1061         s++;
1062       d = (uchar *) s;
1063
1064       /* Handle DOS line endings.  */
1065       if (*s == '\r' && s + 1 != buffer->rlimit && s[1] == '\n')
1066         s++;
1067     }
1068
1069  done:
       /* Terminate the cleaned line at D and resume the next call just past
          the last byte read at S.  */
1070   *d = '\n';
1071   /* A sentinel note that should never be processed.  */
1072   add_line_note (buffer, d + 1, '\n');
1073   buffer->next_line = s + 1;
1074 }
1075
1076 /* Return true if the trigraph indicated by NOTE should be warned
1077    about in a comment.  */
1078 static bool
1079 warn_in_comment (cpp_reader *pfile, _cpp_line_note *note)
1080 {
1081   const uchar *p;
1082
1083   /* Within comments we don't warn about trigraphs, unless the
1084      trigraph forms an escaped newline, as that may change
1085      behavior.  */
1086   if (note->type != '/')
1087     return false;
1088
1089   /* If -trigraphs, then this was an escaped newline iff the next note
1090      is coincident.  */
1091   if (CPP_OPTION (pfile, trigraphs))
1092     return note[1].pos == note->pos;
1093
1094   /* Otherwise, see if this forms an escaped newline.  */
1095   p = note->pos + 3;
1096   while (is_nvspace (*p))
1097     p++;
1098
1099   /* There might have been escaped newlines between the trigraph and the
1100      newline we found.  Hence the position test.  */
1101   return (*p == '\n' && p < note[1].pos);
1102 }
1103
1104 /* Process the notes created by add_line_note as far as the current
1105    location.  */
1106 void
1107 _cpp_process_line_notes (cpp_reader *pfile, int in_comment)
1108 {
1109   cpp_buffer *buffer = pfile->buffer;
1110
1111   for (;;)
1112     {
1113       _cpp_line_note *note = &buffer->notes[buffer->cur_note];
1114       unsigned int col;
1115
1116       if (note->pos > buffer->cur)
1117         break;
1118
1119       buffer->cur_note++;
1120       col = CPP_BUF_COLUMN (buffer, note->pos + 1);
1121
1122       if (note->type == '\\' || note->type == ' ')
1123         {
1124           if (note->type == ' ' && !in_comment)
1125             cpp_error_with_line (pfile, CPP_DL_WARNING, pfile->line_table->highest_line, col,
1126                                  "backslash and newline separated by space");
1127
1128           if (buffer->next_line > buffer->rlimit)
1129             {
1130               cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line, col,
1131                                    "backslash-newline at end of file");
1132               /* Prevent "no newline at end of file" warning.  */
1133               buffer->next_line = buffer->rlimit;
1134             }
1135
1136           buffer->line_base = note->pos;
1137           CPP_INCREMENT_LINE (pfile, 0);
1138         }
1139       else if (_cpp_trigraph_map[note->type])
1140         {
1141           if (CPP_OPTION (pfile, warn_trigraphs)
1142               && (!in_comment || warn_in_comment (pfile, note)))
1143             {
1144               if (CPP_OPTION (pfile, trigraphs))
1145                 cpp_warning_with_line (pfile, CPP_W_TRIGRAPHS,
1146                                        pfile->line_table->highest_line, col,
1147                                        "trigraph ??%c converted to %c",
1148                                        note->type,
1149                                        (int) _cpp_trigraph_map[note->type]);
1150               else
1151                 {
1152                   cpp_warning_with_line
1153                     (pfile, CPP_W_TRIGRAPHS,
1154                      pfile->line_table->highest_line, col,
1155                      "trigraph ??%c ignored, use -trigraphs to enable",
1156                      note->type);
1157                 }
1158             }
1159         }
1160       else if (note->type == 0)
1161         /* Already processed in lex_raw_string.  */;
1162       else
1163         abort ();
1164     }
1165 }
1166
1167 namespace bidi {
1168   enum class kind {
1169     NONE, LRE, RLE, LRO, RLO, LRI, RLI, FSI, PDF, PDI, LTR, RTL
1170   };
1171
1172   /* All the UTF-8 encodings of bidi characters start with E2.  */
1173   constexpr uchar utf8_start = 0xe2;
1174
1175   struct context
1176   {
1177     context () {}
1178     context (location_t loc, kind k, bool pdf, bool ucn)
1179     : m_loc (loc), m_kind (k), m_pdf (pdf), m_ucn (ucn)
1180     {
1181     }
1182
1183     kind get_pop_kind () const
1184     {
1185       return m_pdf ? kind::PDF : kind::PDI;
1186     }
1187     bool ucn_p () const
1188     {
1189       return m_ucn;
1190     }
1191
1192     location_t m_loc;
1193     kind m_kind;
1194     unsigned m_pdf : 1;
1195     unsigned m_ucn : 1;
1196   };
1197
1198   /* A vector holding currently open bidi contexts.  We use a char for
1199      each context, its LSB is 1 if it represents a PDF context, 0 if it
1200      represents a PDI context.  The next bit is 1 if this context was open
1201      by a bidi character written as a UCN, and 0 when it was UTF-8.  */
1202   semi_embedded_vec <context, 16> vec;
1203
1204   /* Close the whole comment/identifier/string literal/character constant
1205      context.  */
1206   void on_close ()
1207   {
1208     vec.truncate (0);
1209   }
1210
1211   /* Pop the last element in the vector.  */
1212   void pop ()
1213   {
1214     unsigned int len = vec.count ();
1215     gcc_checking_assert (len > 0);
1216     vec.truncate (len - 1);
1217   }
1218
1219   /* Return the pop kind of the context of the Ith element.  */
1220   kind pop_kind_at (unsigned int i)
1221   {
1222     return vec[i].get_pop_kind ();
1223   }
1224
1225   /* Return the pop kind of the context that is currently opened.  */
1226   kind current_ctx ()
1227   {
1228     unsigned int len = vec.count ();
1229     if (len == 0)
1230       return kind::NONE;
1231     return vec[len - 1].get_pop_kind ();
1232   }
1233
1234   /* Return true if the current context comes from a UCN origin, that is,
1235      the bidi char which started this bidi context was written as a UCN.  */
1236   bool current_ctx_ucn_p ()
1237   {
1238     unsigned int len = vec.count ();
1239     gcc_checking_assert (len > 0);
1240     return vec[len - 1].m_ucn;
1241   }
1242
1243   location_t current_ctx_loc ()
1244   {
1245     unsigned int len = vec.count ();
1246     gcc_checking_assert (len > 0);
1247     return vec[len - 1].m_loc;
1248   }
1249
1250   /* We've read a bidi char, update the current vector as necessary.
1251      LOC is only valid when K is not kind::NONE.  */
1252   void on_char (kind k, bool ucn_p, location_t loc)
1253   {
1254     switch (k)
1255       {
1256       case kind::LRE:
1257       case kind::RLE:
1258       case kind::LRO:
1259       case kind::RLO:
1260         vec.push (context (loc, k, true, ucn_p));
1261         break;
1262       case kind::LRI:
1263       case kind::RLI:
1264       case kind::FSI:
1265         vec.push (context (loc, k, false, ucn_p));
1266         break;
1267       /* PDF terminates the scope of the last LRE, RLE, LRO, or RLO
1268          whose scope has not yet been terminated.  */
1269       case kind::PDF:
1270         if (current_ctx () == kind::PDF)
1271           pop ();
1272         break;
1273       /* PDI terminates the scope of the last LRI, RLI, or FSI whose
1274          scope has not yet been terminated, as well as the scopes of
1275          any subsequent LREs, RLEs, LROs, or RLOs whose scopes have not
1276          yet been terminated.  */
1277       case kind::PDI:
1278         for (int i = vec.count () - 1; i >= 0; --i)
1279           if (pop_kind_at (i) == kind::PDI)
1280             {
1281               vec.truncate (i);
1282               break;
1283             }
1284         break;
1285       case kind::LTR:
1286       case kind::RTL:
1287         /* These aren't popped by a PDF/PDI.  */
1288         break;
1289       ATTR_LIKELY case kind::NONE:
1290         break;
1291       default:
1292         abort ();
1293       }
1294   }
1295
1296   /* Return a descriptive string for K.  */
1297   const char *to_str (kind k)
1298   {
1299     switch (k)
1300       {
1301       case kind::LRE:
1302         return "U+202A (LEFT-TO-RIGHT EMBEDDING)";
1303       case kind::RLE:
1304         return "U+202B (RIGHT-TO-LEFT EMBEDDING)";
1305       case kind::LRO:
1306         return "U+202D (LEFT-TO-RIGHT OVERRIDE)";
1307       case kind::RLO:
1308         return "U+202E (RIGHT-TO-LEFT OVERRIDE)";
1309       case kind::LRI:
1310         return "U+2066 (LEFT-TO-RIGHT ISOLATE)";
1311       case kind::RLI:
1312         return "U+2067 (RIGHT-TO-LEFT ISOLATE)";
1313       case kind::FSI:
1314         return "U+2068 (FIRST STRONG ISOLATE)";
1315       case kind::PDF:
1316         return "U+202C (POP DIRECTIONAL FORMATTING)";
1317       case kind::PDI:
1318         return "U+2069 (POP DIRECTIONAL ISOLATE)";
1319       case kind::LTR:
1320         return "U+200E (LEFT-TO-RIGHT MARK)";
1321       case kind::RTL:
1322         return "U+200F (RIGHT-TO-LEFT MARK)";
1323       default:
1324         abort ();
1325       }
1326   }
1327 }
1328
1329 /* Get location_t for the range of bytes [START, START + NUM_BYTES)
1330    within the current line in FILE, with the caret at START.  */
1331
1332 static location_t
1333 get_location_for_byte_range_in_cur_line (cpp_reader *pfile,
1334                                          const unsigned char *const start,
1335                                          size_t num_bytes)
1336 {
1337   gcc_checking_assert (num_bytes > 0);
1338
1339   /* CPP_BUF_COLUMN and linemap_position_for_column both refer
1340      to offsets in bytes, but CPP_BUF_COLUMN is 0-based,
1341      whereas linemap_position_for_column is 1-based.  */
1342
1343   /* Get 0-based offsets within the line.  */
1344   size_t start_offset = CPP_BUF_COLUMN (pfile->buffer, start);
1345   size_t end_offset = start_offset + num_bytes - 1;
1346
1347   /* Now convert to location_t, where "columns" are 1-based byte offsets.  */
1348   location_t start_loc = linemap_position_for_column (pfile->line_table,
1349                                                       start_offset + 1);
1350   location_t end_loc = linemap_position_for_column (pfile->line_table,
1351                                                      end_offset + 1);
1352
1353   if (start_loc == end_loc)
1354     return start_loc;
1355
1356   source_range src_range;
1357   src_range.m_start = start_loc;
1358   src_range.m_finish = end_loc;
1359   location_t combined_loc = COMBINE_LOCATION_DATA (pfile->line_table,
1360                                                    start_loc,
1361                                                    src_range,
1362                                                    NULL);
1363   return combined_loc;
1364 }
1365
1366 /* Parse a sequence of 3 bytes starting with P and return its bidi code.  */
1367
1368 static bidi::kind
1369 get_bidi_utf8_1 (const unsigned char *const p)
1370 {
1371   gcc_checking_assert (p[0] == bidi::utf8_start);
1372
1373   if (p[1] == 0x80)
1374     switch (p[2])
1375       {
1376       case 0xaa:
1377         return bidi::kind::LRE;
1378       case 0xab:
1379         return bidi::kind::RLE;
1380       case 0xac:
1381         return bidi::kind::PDF;
1382       case 0xad:
1383         return bidi::kind::LRO;
1384       case 0xae:
1385         return bidi::kind::RLO;
1386       case 0x8e:
1387         return bidi::kind::LTR;
1388       case 0x8f:
1389         return bidi::kind::RTL;
1390       default:
1391         break;
1392       }
1393   else if (p[1] == 0x81)
1394     switch (p[2])
1395       {
1396       case 0xa6:
1397         return bidi::kind::LRI;
1398       case 0xa7:
1399         return bidi::kind::RLI;
1400       case 0xa8:
1401         return bidi::kind::FSI;
1402       case 0xa9:
1403         return bidi::kind::PDI;
1404       default:
1405         break;
1406       }
1407
1408   return bidi::kind::NONE;
1409 }
1410
1411 /* Parse a sequence of 3 bytes starting with P and return its bidi code.
1412    If the kind is not NONE, write the location to *OUT.*/
1413
1414 static bidi::kind
1415 get_bidi_utf8 (cpp_reader *pfile, const unsigned char *const p, location_t *out)
1416 {
1417   bidi::kind result = get_bidi_utf8_1 (p);
1418   if (result != bidi::kind::NONE)
1419     {
1420       /* We have a sequence of 3 bytes starting at P.  */
1421       *out = get_location_for_byte_range_in_cur_line (pfile, p, 3);
1422     }
1423   return result;
1424 }
1425
1426 /* Parse a UCN where P points just past \u or \U and return its bidi code.  */
1427
1428 static bidi::kind
1429 get_bidi_ucn_1 (const unsigned char *p, bool is_U)
1430 {
1431   /* 6.4.3 Universal Character Names
1432       \u hex-quad
1433       \U hex-quad hex-quad
1434      where \unnnn means \U0000nnnn.  */
1435
1436   if (is_U)
1437     {
1438       if (p[0] != '0' || p[1] != '0' || p[2] != '0' || p[3] != '0')
1439         return bidi::kind::NONE;
1440       /* Skip 4B so we can treat \u and \U the same below.  */
1441       p += 4;
1442     }
1443
1444   /* All code points we are looking for start with 20xx.  */
1445   if (p[0] != '2' || p[1] != '0')
1446     return bidi::kind::NONE;
1447   else if (p[2] == '2')
1448     switch (p[3])
1449       {
1450       case 'a':
1451       case 'A':
1452         return bidi::kind::LRE;
1453       case 'b':
1454       case 'B':
1455         return bidi::kind::RLE;
1456       case 'c':
1457       case 'C':
1458         return bidi::kind::PDF;
1459       case 'd':
1460       case 'D':
1461         return bidi::kind::LRO;
1462       case 'e':
1463       case 'E':
1464         return bidi::kind::RLO;
1465       default:
1466         break;
1467       }
1468   else if (p[2] == '6')
1469     switch (p[3])
1470       {
1471       case '6':
1472         return bidi::kind::LRI;
1473       case '7':
1474         return bidi::kind::RLI;
1475       case '8':
1476         return bidi::kind::FSI;
1477       case '9':
1478         return bidi::kind::PDI;
1479       default:
1480         break;
1481       }
1482   else if (p[2] == '0')
1483     switch (p[3])
1484       {
1485       case 'e':
1486       case 'E':
1487         return bidi::kind::LTR;
1488       case 'f':
1489       case 'F':
1490         return bidi::kind::RTL;
1491       default:
1492         break;
1493       }
1494
1495   return bidi::kind::NONE;
1496 }
1497
1498 /* Parse a UCN where P points just past \u or \U and return its bidi code.
1499    If the kind is not NONE, write the location to *OUT.*/
1500
1501 static bidi::kind
1502 get_bidi_ucn (cpp_reader *pfile,  const unsigned char *p, bool is_U,
1503               location_t *out)
1504 {
1505   bidi::kind result = get_bidi_ucn_1 (p, is_U);
1506   if (result != bidi::kind::NONE)
1507     {
1508       const unsigned char *start = p - 2;
1509       size_t num_bytes = 2 + (is_U ? 8 : 4);
1510       *out = get_location_for_byte_range_in_cur_line (pfile, start, num_bytes);
1511     }
1512   return result;
1513 }
1514
1515 /* Subclass of rich_location for reporting on unpaired UTF-8
1516    bidirectional control character(s).
1517    Escape the source lines on output, and show all unclosed
1518    bidi context, labelling everything.  */
1519
1520 class unpaired_bidi_rich_location : public rich_location
1521 {
1522  public:
1523   class custom_range_label : public range_label
1524   {
1525    public:
1526      label_text get_text (unsigned range_idx) const FINAL OVERRIDE
1527      {
1528        /* range 0 is the primary location; each subsequent range i + 1
1529           is for bidi::vec[i].  */
1530        if (range_idx > 0)
1531          {
1532            const bidi::context &ctxt (bidi::vec[range_idx - 1]);
1533            return label_text::borrow (bidi::to_str (ctxt.m_kind));
1534          }
1535        else
1536          return label_text::borrow (_("end of bidirectional context"));
1537      }
1538   };
1539
1540   unpaired_bidi_rich_location (cpp_reader *pfile, location_t loc)
1541   : rich_location (pfile->line_table, loc, &m_custom_label)
1542   {
1543     set_escape_on_output (true);
1544     for (unsigned i = 0; i < bidi::vec.count (); i++)
1545       add_range (bidi::vec[i].m_loc,
1546                  SHOW_RANGE_WITHOUT_CARET,
1547                  &m_custom_label);
1548   }
1549
1550  private:
1551    custom_range_label m_custom_label;
1552 };
1553
1554 /* We're closing a bidi context, that is, we've encountered a newline,
1555    are closing a C-style comment, or are at the end of a string literal,
1556    character constant, or identifier.  Warn if this context was not
1557    properly terminated by a PDI or PDF.  P points to the last character
1558    in this context.  */
1559
1560 static void
1561 maybe_warn_bidi_on_close (cpp_reader *pfile, const uchar *p)
1562 {
1563   const auto warn_bidi = CPP_OPTION (pfile, cpp_warn_bidirectional);
1564   if (bidi::vec.count () > 0
1565       && (warn_bidi & bidirectional_unpaired
1566           && (!bidi::current_ctx_ucn_p ()
1567               || (warn_bidi & bidirectional_ucn))))
1568     {
1569       const location_t loc
1570         = linemap_position_for_column (pfile->line_table,
1571                                        CPP_BUF_COLUMN (pfile->buffer, p));
1572       unpaired_bidi_rich_location rich_loc (pfile, loc);
1573       /* cpp_callbacks doesn't yet have a way to handle singular vs plural
1574          forms of a diagnostic, so fake it for now.  */
1575       if (bidi::vec.count () > 1)
1576         cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
1577                         "unpaired UTF-8 bidirectional control characters "
1578                         "detected");
1579       else
1580         cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
1581                         "unpaired UTF-8 bidirectional control character "
1582                         "detected");
1583     }
1584   /* We're done with this context.  */
1585   bidi::on_close ();
1586 }
1587
1588 /* We're at the beginning or in the middle of an identifier/comment/string
1589    literal/character constant.  Warn if we've encountered a bidi character.
1590    KIND says which bidi control character it was; UCN_P is true iff this bidi
1591    control character was written as a UCN.  LOC is the location of the
1592    character, but is only valid if KIND != bidi::kind::NONE.  */
1593
1594 static void
1595 maybe_warn_bidi_on_char (cpp_reader *pfile, bidi::kind kind,
1596                          bool ucn_p, location_t loc)
1597 {
1598   if (__builtin_expect (kind == bidi::kind::NONE, 1))
1599     return;
1600
1601   const auto warn_bidi = CPP_OPTION (pfile, cpp_warn_bidirectional);
1602
1603   if (warn_bidi & (bidirectional_unpaired|bidirectional_any))
1604     {
1605       rich_location rich_loc (pfile->line_table, loc);
1606       rich_loc.set_escape_on_output (true);
1607
1608       /* It seems excessive to warn about a PDI/PDF that is closing
1609          an opened context because we've already warned about the
1610          opening character.  Except warn when we have a UCN x UTF-8
1611          mismatch, if UCN checking is enabled.  */
1612       if (kind == bidi::current_ctx ())
1613         {
1614           if (warn_bidi == (bidirectional_unpaired|bidirectional_ucn)
1615               && bidi::current_ctx_ucn_p () != ucn_p)
1616             {
1617               rich_loc.add_range (bidi::current_ctx_loc ());
1618               cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
1619                               "UTF-8 vs UCN mismatch when closing "
1620                               "a context by \"%s\"", bidi::to_str (kind));
1621             }
1622         }
1623       else if (warn_bidi & bidirectional_any
1624                && (!ucn_p || (warn_bidi & bidirectional_ucn)))
1625         {
1626           if (kind == bidi::kind::PDF || kind == bidi::kind::PDI)
1627             cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
1628                             "\"%s\" is closing an unopened context",
1629                             bidi::to_str (kind));
1630           else
1631             cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
1632                             "found problematic Unicode character \"%s\"",
1633                             bidi::to_str (kind));
1634         }
1635     }
1636   /* We're done with this context.  */
1637   bidi::on_char (kind, ucn_p, loc);
1638 }
1639
1640 /* Skip a C-style block comment.  We find the end of the comment by
1641    seeing if an asterisk is before every '/' we encounter.  Returns
1642    nonzero if comment terminated by EOF, zero otherwise.
1643
1644    Buffer->cur points to the initial asterisk of the comment.  */
1645 bool
1646 _cpp_skip_block_comment (cpp_reader *pfile)
1647 {
1648   cpp_buffer *buffer = pfile->buffer;
1649   const uchar *cur = buffer->cur;
1650   uchar c;
1651   const bool warn_bidi_p = pfile->warn_bidi_p ();
1652
1653   cur++;
1654   if (*cur == '/')
1655     cur++;
1656
1657   for (;;)
1658     {
1659       /* People like decorating comments with '*', so check for '/'
1660          instead for efficiency.  */
1661       c = *cur++;
1662
1663       if (c == '/')
1664         {
1665           if (cur[-2] == '*')
1666             {
1667               if (warn_bidi_p)
1668                 maybe_warn_bidi_on_close (pfile, cur);
1669               break;
1670             }
1671
1672           /* Warn about potential nested comments, but not if the '/'
1673              comes immediately before the true comment delimiter.
1674              Don't bother to get it right across escaped newlines.  */
1675           if (CPP_OPTION (pfile, warn_comments)
1676               && cur[0] == '*' && cur[1] != '/')
1677             {
1678               buffer->cur = cur;
1679               cpp_warning_with_line (pfile, CPP_W_COMMENTS,
1680                                      pfile->line_table->highest_line,
1681                                      CPP_BUF_COL (buffer),
1682                                      "\"/*\" within comment");
1683             }
1684         }
1685       else if (c == '\n')
1686         {
1687           unsigned int cols;
1688           buffer->cur = cur - 1;
1689           if (warn_bidi_p)
1690             maybe_warn_bidi_on_close (pfile, cur);
1691           _cpp_process_line_notes (pfile, true);
1692           if (buffer->next_line >= buffer->rlimit)
1693             return true;
1694           _cpp_clean_line (pfile);
1695
1696           cols = buffer->next_line - buffer->line_base;
1697           CPP_INCREMENT_LINE (pfile, cols);
1698
1699           cur = buffer->cur;
1700         }
1701       /* If this is a beginning of a UTF-8 encoding, it might be
1702          a bidirectional control character.  */
1703       else if (__builtin_expect (c == bidi::utf8_start, 0) && warn_bidi_p)
1704         {
1705           location_t loc;
1706           bidi::kind kind = get_bidi_utf8 (pfile, cur - 1, &loc);
1707           maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
1708         }
1709     }
1710
1711   buffer->cur = cur;
1712   _cpp_process_line_notes (pfile, true);
1713   return false;
1714 }
1715
1716 /* Skip a C++ line comment, leaving buffer->cur pointing to the
1717    terminating newline.  Handles escaped newlines.  Returns nonzero
1718    if a multiline comment.  */
1719 static int
1720 skip_line_comment (cpp_reader *pfile)
1721 {
1722   cpp_buffer *buffer = pfile->buffer;
1723   location_t orig_line = pfile->line_table->highest_line;
1724   const bool warn_bidi_p = pfile->warn_bidi_p ();
1725
1726   if (!warn_bidi_p)
1727     while (*buffer->cur != '\n')
1728       buffer->cur++;
1729   else
1730     {
1731       while (*buffer->cur != '\n'
1732              && *buffer->cur != bidi::utf8_start)
1733         buffer->cur++;
1734       if (__builtin_expect (*buffer->cur == bidi::utf8_start, 0))
1735         {
1736           while (*buffer->cur != '\n')
1737             {
1738               if (__builtin_expect (*buffer->cur == bidi::utf8_start, 0))
1739                 {
1740                   location_t loc;
1741                   bidi::kind kind = get_bidi_utf8 (pfile, buffer->cur, &loc);
1742                   maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
1743                 }
1744               buffer->cur++;
1745             }
1746           maybe_warn_bidi_on_close (pfile, buffer->cur);
1747         }
1748     }
1749
1750   _cpp_process_line_notes (pfile, true);
1751   return orig_line != pfile->line_table->highest_line;
1752 }
1753
1754 /* Skips whitespace, saving the next non-whitespace character.  */
1755 static void
1756 skip_whitespace (cpp_reader *pfile, cppchar_t c)
1757 {
1758   cpp_buffer *buffer = pfile->buffer;
1759   bool saw_NUL = false;
1760
1761   do
1762     {
1763       /* Horizontal space always OK.  */
1764       if (c == ' ' || c == '\t')
1765         ;
1766       /* Just \f \v or \0 left.  */
1767       else if (c == '\0')
1768         saw_NUL = true;
1769       else if (pfile->state.in_directive && CPP_PEDANTIC (pfile))
1770         cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line,
1771                              CPP_BUF_COL (buffer),
1772                              "%s in preprocessing directive",
1773                              c == '\f' ? "form feed" : "vertical tab");
1774
1775       c = *buffer->cur++;
1776     }
1777   /* We only want non-vertical space, i.e. ' ' \t \f \v \0.  */
1778   while (is_nvspace (c));
1779
1780   if (saw_NUL)
1781     {
1782       encoding_rich_location rich_loc (pfile);
1783       cpp_error_at (pfile, CPP_DL_WARNING, &rich_loc,
1784                     "null character(s) ignored");
1785     }
1786
1787   buffer->cur--;
1788 }
1789
1790 /* See if the characters of a number token are valid in a name (no
1791    '.', '+' or '-').  */
1792 static int
1793 name_p (cpp_reader *pfile, const cpp_string *string)
1794 {
1795   unsigned int i;
1796
1797   for (i = 0; i < string->len; i++)
1798     if (!is_idchar (string->text[i]))
1799       return 0;
1800
1801   return 1;
1802 }
1803
1804 /* After parsing an identifier or other sequence, produce a warning about
1805    sequences not in NFC/NFKC.  */
1806 static void
1807 warn_about_normalization (cpp_reader *pfile,
1808                           const cpp_token *token,
1809                           const struct normalize_state *s)
1810 {
1811   if (CPP_OPTION (pfile, warn_normalize) < NORMALIZE_STATE_RESULT (s)
1812       && !pfile->state.skipping)
1813     {
1814       location_t loc = token->src_loc;
1815
1816       /* If possible, create a location range for the token.  */
1817       if (loc >= RESERVED_LOCATION_COUNT
1818           && token->type != CPP_EOF
1819           /* There must be no line notes to process.  */
1820           && (!(pfile->buffer->cur
1821                 >= pfile->buffer->notes[pfile->buffer->cur_note].pos
1822                 && !pfile->overlaid_buffer)))
1823         {
1824           source_range tok_range;
1825           tok_range.m_start = loc;
1826           tok_range.m_finish
1827             = linemap_position_for_column (pfile->line_table,
1828                                            CPP_BUF_COLUMN (pfile->buffer,
1829                                                            pfile->buffer->cur));
1830           loc = COMBINE_LOCATION_DATA (pfile->line_table,
1831                                        loc, tok_range, NULL);
1832         }
1833
1834       encoding_rich_location rich_loc (pfile, loc);
1835
1836       /* Make sure that the token is printed using UCNs, even
1837          if we'd otherwise happily print UTF-8.  */
1838       unsigned char *buf = XNEWVEC (unsigned char, cpp_token_len (token));
1839       size_t sz;
1840
1841       sz = cpp_spell_token (pfile, token, buf, false) - buf;
1842       if (NORMALIZE_STATE_RESULT (s) == normalized_C)
1843         cpp_warning_at (pfile, CPP_W_NORMALIZE, &rich_loc,
1844                         "`%.*s' is not in NFKC", (int) sz, buf);
1845       else if (CPP_OPTION (pfile, cplusplus))
1846         cpp_pedwarning_at (pfile, CPP_W_NORMALIZE, &rich_loc,
1847                                   "`%.*s' is not in NFC", (int) sz, buf);
1848       else
1849         cpp_warning_at (pfile, CPP_W_NORMALIZE, &rich_loc,
1850                         "`%.*s' is not in NFC", (int) sz, buf);
1851       free (buf);
1852     }
1853 }
1854
1855 static const cppchar_t utf8_signifier = 0xC0;
1856
1857 /* Returns TRUE if the sequence starting at buffer->cur is valid in
1858    an identifier.  FIRST is TRUE if this starts an identifier.  */
1859
1860 static bool
1861 forms_identifier_p (cpp_reader *pfile, int first,
1862                     struct normalize_state *state)
1863 {
1864   cpp_buffer *buffer = pfile->buffer;
1865   const bool warn_bidi_p = pfile->warn_bidi_p ();
1866
1867   if (*buffer->cur == '$')
1868     {
1869       if (!CPP_OPTION (pfile, dollars_in_ident))
1870         return false;
1871
1872       buffer->cur++;
1873       if (CPP_OPTION (pfile, warn_dollars) && !pfile->state.skipping)
1874         {
1875           CPP_OPTION (pfile, warn_dollars) = 0;
1876           cpp_error (pfile, CPP_DL_PEDWARN, "'$' in identifier or number");
1877         }
1878
1879       return true;
1880     }
1881
1882   /* Is this a syntactically valid UCN or a valid UTF-8 char?  */
1883   if (CPP_OPTION (pfile, extended_identifiers))
1884     {
1885       cppchar_t s;
1886       if (*buffer->cur >= utf8_signifier)
1887         {
1888           if (__builtin_expect (*buffer->cur == bidi::utf8_start, 0)
1889               && warn_bidi_p)
1890             {
1891               location_t loc;
1892               bidi::kind kind = get_bidi_utf8 (pfile, buffer->cur, &loc);
1893               maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
1894             }
1895           if (_cpp_valid_utf8 (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
1896                                state, &s))
1897             return true;
1898         }
1899       else if (*buffer->cur == '\\'
1900                && (buffer->cur[1] == 'u' || buffer->cur[1] == 'U'))
1901         {
1902           buffer->cur += 2;
1903           if (warn_bidi_p)
1904             {
1905               location_t loc;
1906               bidi::kind kind = get_bidi_ucn (pfile,
1907                                               buffer->cur,
1908                                               buffer->cur[-1] == 'U',
1909                                               &loc);
1910               maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/true, loc);
1911             }
1912           if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
1913                               state, &s, NULL, NULL))
1914             return true;
1915           buffer->cur -= 2;
1916         }
1917     }
1918
1919   return false;
1920 }
1921
1922 /* Helper function to issue error about improper __VA_OPT__ use.  */
1923 static void
1924 maybe_va_opt_error (cpp_reader *pfile)
1925 {
1926   if (CPP_PEDANTIC (pfile) && !CPP_OPTION (pfile, va_opt))
1927     {
1928       /* __VA_OPT__ should not be accepted at all, but allow it in
1929          system headers.  */
1930       if (!_cpp_in_system_header (pfile))
1931         cpp_error (pfile, CPP_DL_PEDWARN,
1932                    "__VA_OPT__ is not available until C++20");
1933     }
1934   else if (!pfile->state.va_args_ok)
1935     {
1936       /* __VA_OPT__ should only appear in the replacement list of a
1937          variadic macro.  */
1938       cpp_error (pfile, CPP_DL_PEDWARN,
1939                  "__VA_OPT__ can only appear in the expansion"
1940                  " of a C++20 variadic macro");
1941     }
1942 }
1943
1944 /* Helper function to get the cpp_hashnode of the identifier BASE.  */
1945 static cpp_hashnode *
1946 lex_identifier_intern (cpp_reader *pfile, const uchar *base)
1947 {
1948   cpp_hashnode *result;
1949   const uchar *cur;
1950   unsigned int len;
1951   unsigned int hash = HT_HASHSTEP (0, *base);
1952
1953   cur = base + 1;
1954   while (ISIDNUM (*cur))
1955     {
1956       hash = HT_HASHSTEP (hash, *cur);
1957       cur++;
1958     }
1959   len = cur - base;
1960   hash = HT_HASHFINISH (hash, len);
1961   result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
1962                                               base, len, hash, HT_ALLOC));
1963
1964   /* Rarely, identifiers require diagnostics when lexed.  */
1965   if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
1966                         && !pfile->state.skipping, 0))
1967     {
1968       /* It is allowed to poison the same identifier twice.  */
1969       if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
1970         cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
1971                    NODE_NAME (result));
1972
1973       /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
1974          replacement list of a variadic macro.  */
1975       if (result == pfile->spec_nodes.n__VA_ARGS__
1976           && !pfile->state.va_args_ok)
1977         {
1978           if (CPP_OPTION (pfile, cplusplus))
1979             cpp_error (pfile, CPP_DL_PEDWARN,
1980                        "__VA_ARGS__ can only appear in the expansion"
1981                        " of a C++11 variadic macro");
1982           else
1983             cpp_error (pfile, CPP_DL_PEDWARN,
1984                        "__VA_ARGS__ can only appear in the expansion"
1985                        " of a C99 variadic macro");
1986         }
1987
1988       if (result == pfile->spec_nodes.n__VA_OPT__)
1989         maybe_va_opt_error (pfile);
1990
1991       /* For -Wc++-compat, warn about use of C++ named operators.  */
1992       if (result->flags & NODE_WARN_OPERATOR)
1993         cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
1994                      "identifier \"%s\" is a special operator name in C++",
1995                      NODE_NAME (result));
1996     }
1997
1998   return result;
1999 }
2000
2001 /* Get the cpp_hashnode of an identifier specified by NAME in
2002    the current cpp_reader object.  If none is found, NULL is returned.  */
2003 cpp_hashnode *
2004 _cpp_lex_identifier (cpp_reader *pfile, const char *name)
2005 {
2006   cpp_hashnode *result;
2007   result = lex_identifier_intern (pfile, (uchar *) name);
2008   return result;
2009 }
2010
2011 /* Lex an identifier starting at BUFFER->CUR - 1.  */
2012 static cpp_hashnode *
2013 lex_identifier (cpp_reader *pfile, const uchar *base, bool starts_ucn,
2014                 struct normalize_state *nst, cpp_hashnode **spelling)
2015 {
2016   cpp_hashnode *result;
2017   const uchar *cur;
2018   unsigned int len;
2019   unsigned int hash = HT_HASHSTEP (0, *base);
2020   const bool warn_bidi_p = pfile->warn_bidi_p ();
2021
2022   cur = pfile->buffer->cur;
2023   if (! starts_ucn)
2024     {
2025       while (ISIDNUM (*cur))
2026         {
2027           hash = HT_HASHSTEP (hash, *cur);
2028           cur++;
2029         }
2030       NORMALIZE_STATE_UPDATE_IDNUM (nst, *(cur - 1));
2031     }
2032   pfile->buffer->cur = cur;
2033   if (starts_ucn || forms_identifier_p (pfile, false, nst))
2034     {
2035       /* Slower version for identifiers containing UCNs
2036          or extended chars (including $).  */
2037       do {
2038         while (ISIDNUM (*pfile->buffer->cur))
2039           {
2040             NORMALIZE_STATE_UPDATE_IDNUM (nst, *pfile->buffer->cur);
2041             pfile->buffer->cur++;
2042           }
2043       } while (forms_identifier_p (pfile, false, nst));
2044       if (warn_bidi_p)
2045         maybe_warn_bidi_on_close (pfile, pfile->buffer->cur);
2046       result = _cpp_interpret_identifier (pfile, base,
2047                                           pfile->buffer->cur - base);
2048       *spelling = cpp_lookup (pfile, base, pfile->buffer->cur - base);
2049     }
2050   else
2051     {
2052       len = cur - base;
2053       hash = HT_HASHFINISH (hash, len);
2054
2055       result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
2056                                                   base, len, hash, HT_ALLOC));
2057       *spelling = result;
2058     }
2059
2060   /* Rarely, identifiers require diagnostics when lexed.  */
2061   if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
2062                         && !pfile->state.skipping, 0))
2063     {
2064       /* It is allowed to poison the same identifier twice.  */
2065       if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
2066         cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
2067                    NODE_NAME (result));
2068
2069       /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
2070          replacement list of a variadic macro.  */
2071       if (result == pfile->spec_nodes.n__VA_ARGS__
2072           && !pfile->state.va_args_ok)
2073         {
2074           if (CPP_OPTION (pfile, cplusplus))
2075             cpp_error (pfile, CPP_DL_PEDWARN,
2076                        "__VA_ARGS__ can only appear in the expansion"
2077                        " of a C++11 variadic macro");
2078           else
2079             cpp_error (pfile, CPP_DL_PEDWARN,
2080                        "__VA_ARGS__ can only appear in the expansion"
2081                        " of a C99 variadic macro");
2082         }
2083
2084       /* __VA_OPT__ should only appear in the replacement list of a
2085          variadic macro.  */
2086       if (result == pfile->spec_nodes.n__VA_OPT__)
2087         maybe_va_opt_error (pfile);
2088
2089       /* For -Wc++-compat, warn about use of C++ named operators.  */
2090       if (result->flags & NODE_WARN_OPERATOR)
2091         cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
2092                      "identifier \"%s\" is a special operator name in C++",
2093                      NODE_NAME (result));
2094     }
2095
2096   return result;
2097 }
2098
2099 /* Lex a number to NUMBER starting at BUFFER->CUR - 1.  */
2100 static void
2101 lex_number (cpp_reader *pfile, cpp_string *number,
2102             struct normalize_state *nst)
2103 {
2104   const uchar *cur;
2105   const uchar *base;
2106   uchar *dest;
2107
2108   base = pfile->buffer->cur - 1;
2109   do
2110     {
2111       const uchar *adj_digit_sep = NULL;
2112       cur = pfile->buffer->cur;
2113
2114       /* N.B. ISIDNUM does not include $.  */
2115       while (ISIDNUM (*cur)
2116              || (*cur == '.' && !DIGIT_SEP (cur[-1]))
2117              || DIGIT_SEP (*cur)
2118              || (VALID_SIGN (*cur, cur[-1]) && !DIGIT_SEP (cur[-2])))
2119         {
2120           NORMALIZE_STATE_UPDATE_IDNUM (nst, *cur);
2121           /* Adjacent digit separators do not form part of the pp-number syntax.
2122              However, they can safely be diagnosed here as an error, since '' is
2123              not a valid preprocessing token.  */
2124           if (DIGIT_SEP (*cur) && DIGIT_SEP (cur[-1]) && !adj_digit_sep)
2125             adj_digit_sep = cur;
2126           cur++;
2127         }
2128       /* A number can't end with a digit separator.  */
2129       while (cur > pfile->buffer->cur && DIGIT_SEP (cur[-1]))
2130         --cur;
2131       if (adj_digit_sep && adj_digit_sep < cur)
2132         cpp_error (pfile, CPP_DL_ERROR, "adjacent digit separators");
2133
2134       pfile->buffer->cur = cur;
2135     }
2136   while (forms_identifier_p (pfile, false, nst));
2137
2138   number->len = cur - base;
2139   dest = _cpp_unaligned_alloc (pfile, number->len + 1);
2140   memcpy (dest, base, number->len);
2141   dest[number->len] = '\0';
2142   number->text = dest;
2143 }
2144
2145 /* Create a token of type TYPE with a literal spelling.  */
2146 static void
2147 create_literal (cpp_reader *pfile, cpp_token *token, const uchar *base,
2148                 unsigned int len, enum cpp_ttype type)
2149 {
2150   token->type = type;
2151   token->val.str.len = len;
2152   token->val.str.text = cpp_alloc_token_string (pfile, base, len);
2153 }
2154
2155 const uchar *
2156 cpp_alloc_token_string (cpp_reader *pfile,
2157                         const unsigned char *ptr, unsigned len)
2158 {
2159   uchar *dest = _cpp_unaligned_alloc (pfile, len + 1);
2160
2161   dest[len] = 0;
2162   memcpy (dest, ptr, len);
2163   return dest;
2164 }
2165
2166 /* A pair of raw buffer pointers.  The currently open one is [1], the
2167    first one is [0].  Used for string literal lexing.  */
2168 struct lit_accum {
2169   _cpp_buff *first;
2170   _cpp_buff *last;
2171   const uchar *rpos;
2172   size_t accum;
2173
2174   lit_accum ()
2175     : first (NULL), last (NULL), rpos (0), accum (0)
2176   {
2177   }
2178
2179   void append (cpp_reader *, const uchar *, size_t);
2180
2181   void read_begin (cpp_reader *);
2182   bool reading_p () const
2183   {
2184     return rpos != NULL;
2185   }
2186   char read_char ()
2187   {
2188     char c = *rpos++;
2189     if (rpos == BUFF_FRONT (last))
2190       rpos = NULL;
2191     return c;
2192   }
2193 };
2194
2195 /* Subroutine of lex_raw_string: Append LEN chars from BASE to the buffer
2196    sequence from *FIRST_BUFF_P to LAST_BUFF_P.  */
2197
2198 void
2199 lit_accum::append (cpp_reader *pfile, const uchar *base, size_t len)
2200 {
2201   if (!last)
2202     /* Starting.  */
2203     first = last = _cpp_get_buff (pfile, len);
2204   else if (len > BUFF_ROOM (last))
2205     {
2206       /* There is insufficient room in the buffer.  Copy what we can,
2207          and then either extend or create a new one.  */
2208       size_t room = BUFF_ROOM (last);
2209       memcpy (BUFF_FRONT (last), base, room);
2210       BUFF_FRONT (last) += room;
2211       base += room;
2212       len -= room;
2213       accum += room;
2214
2215       gcc_checking_assert (!rpos);
2216
2217       last = _cpp_append_extend_buff (pfile, last, len);
2218     }
2219
2220   memcpy (BUFF_FRONT (last), base, len);
2221   BUFF_FRONT (last) += len;
2222   accum += len;
2223 }
2224
2225 void
2226 lit_accum::read_begin (cpp_reader *pfile)
2227 {
2228   /* We never accumulate more than 4 chars to read.  */
2229   if (BUFF_ROOM (last) < 4)
2230
2231     last = _cpp_append_extend_buff (pfile, last, 4);
2232   rpos = BUFF_FRONT (last);
2233 }
2234
2235 /* Returns true if a macro has been defined.
2236    This might not work if compile with -save-temps,
2237    or preprocess separately from compilation.  */
2238
2239 static bool
2240 is_macro(cpp_reader *pfile, const uchar *base)
2241 {
2242   const uchar *cur = base;
2243   if (! ISIDST (*cur))
2244     return false;
2245   unsigned int hash = HT_HASHSTEP (0, *cur);
2246   ++cur;
2247   while (ISIDNUM (*cur))
2248     {
2249       hash = HT_HASHSTEP (hash, *cur);
2250       ++cur;
2251     }
2252   hash = HT_HASHFINISH (hash, cur - base);
2253
2254   cpp_hashnode *result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
2255                                         base, cur - base, hash, HT_NO_INSERT));
2256
2257   return result && cpp_macro_p (result);
2258 }
2259
2260 /* Returns true if a literal suffix does not have the expected form
2261    and is defined as a macro.  */
2262
2263 static bool
2264 is_macro_not_literal_suffix(cpp_reader *pfile, const uchar *base)
2265 {
2266   /* User-defined literals outside of namespace std must start with a single
2267      underscore, so assume anything of that form really is a UDL suffix.
2268      We don't need to worry about UDLs defined inside namespace std because
2269      their names are reserved, so cannot be used as macro names in valid
2270      programs.  */
2271   if (base[0] == '_' && base[1] != '_')
2272     return false;
2273   return is_macro (pfile, base);
2274 }
2275
2276 /* Lexes a raw string.  The stored string contains the spelling,
2277    including double quotes, delimiter string, '(' and ')', any leading
2278    'L', 'u', 'U' or 'u8' and 'R' modifier.  The created token contains
2279    the type of the literal, or CPP_OTHER if it was not properly
2280    terminated.
2281
2282    BASE is the start of the token.  Updates pfile->buffer->cur to just
2283    after the lexed string.
2284
2285    The spelling is NUL-terminated, but it is not guaranteed that this
2286    is the first NUL since embedded NULs are preserved.  */
     /* Line splices and trigraphs that were recorded as line notes are
        put back into the spelling; the restored text is collected in a
        lit_accum and copied into the final spelling at the end.  If the
        string is still open when no further line can be read, TOKEN is
        turned into CPP_EOF and the current buffer is popped.  */
2287
2288 static void
2289 lex_raw_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
2290 {
2291   const uchar *pos = base;
2292   const bool warn_bidi_p = pfile->warn_bidi_p ();
2293
2294   /* 'tis a pity this information isn't passed down from the lexer's
2295      initial categorization of the token.  */
2296   enum cpp_ttype type = CPP_STRING;
2297
       /* Step over the encoding prefix, if any, choosing the token type.  */
2298   if (*pos == 'L')
2299     {
2300       type = CPP_WSTRING;
2301       pos++;
2302     }
2303   else if (*pos == 'U')
2304     {
2305       type = CPP_STRING32;
2306       pos++;
2307     }
2308   else if (*pos == 'u')
2309     {
2310       if (pos[1] == '8')
2311         {
2312           type = CPP_UTF8STRING;
2313           pos++;
2314         }
2315       else
2316         type = CPP_STRING16;
2317       pos++;
2318     }
2319
2320   gcc_checking_assert (pos[0] == 'R' && pos[1] == '"');
2321   pos += 2;
2322
2323   _cpp_line_note *note = &pfile->buffer->notes[pfile->buffer->cur_note];
2324
2325   /* Skip notes before the ".  */
2326   while (note->pos < pos)
2327     ++note;
2328
2329   lit_accum accum;
2330
       /* PREFIX receives the delimiter characters followed by a closing
          '"', which is why it has room for 16 + 1 bytes.  PHASE is
          PHASE_PREFIX while the delimiter is being collected and
          PHASE_NONE inside the string body; any value from PHASE_SUFFIX
          upwards is the index into PREFIX of the next character that
          has to match after a ')' was seen.  */
2331   uchar prefix[17];
2332   unsigned prefix_len = 0;
2333   enum Phase
2334   {
2335    PHASE_PREFIX = -2,
2336    PHASE_NONE = -1,
2337    PHASE_SUFFIX = 0
2338   } phase = PHASE_PREFIX;
2339
2340   for (;;)
2341     {
2342       gcc_checking_assert (note->pos >= pos);
2343
2344       /* Undo any escaped newlines and trigraphs.  */
2345       if (!accum.reading_p () && note->pos == pos)
2346         switch (note->type)
2347           {
2348           case '\\':
2349           case ' ':
2350             /* Restore backslash followed by newline.  */
2351             accum.append (pfile, base, pos - base);
2352             base = pos;
2353             accum.read_begin (pfile);
2354             accum.append (pfile, UC"\\", 1);
2355
2356           after_backslash:
2357             if (note->type == ' ')
2358               /* GNU backslash whitespace newline extension.  FIXME
2359                  could be any sequence of non-vertical space.  When we
2360                  can properly restore any such sequence, we should
2361                  mark this note as handled so _cpp_process_line_notes
2362                  doesn't warn.  */
2363               accum.append (pfile, UC" ", 1);
2364
2365             accum.append (pfile, UC"\n", 1);
2366             note++;
2367             break;
2368
2369           case '\n':
2370             /* This can happen for ??/<NEWLINE> when trigraphs are not
2371                being interpreted.  */
2372             gcc_checking_assert (!CPP_OPTION (pfile, trigraphs));
2373             note->type = 0;
2374             note++;
2375             break;
2376
2377           default:
2378             gcc_checking_assert (_cpp_trigraph_map[note->type]);
2379
2380             /* Don't warn about this trigraph in
2381                _cpp_process_line_notes, since trigraphs show up as
2382                trigraphs in raw strings.  */
2383             uchar type = note->type;
2384             note->type = 0;
2385
2386             if (CPP_OPTION (pfile, trigraphs))
2387               {
2388                 accum.append (pfile, base, pos - base);
2389                 base = pos;
2390                 accum.read_begin (pfile);
2391                 accum.append (pfile, UC"??", 2);
2392                 accum.append (pfile, &type, 1);
2393
2394                 /* ??/ followed by newline gets two line notes, one for
2395                    the trigraph and one for the backslash/newline.  */
2396                 if (type == '/' && note[1].pos == pos)
2397                   {
2398                     note++;
2399                     gcc_assert (note->type == '\\' || note->type == ' ');
2400                     goto after_backslash;
2401                   }
2402                 /* Skip the replacement character.  */
2403                 base = ++pos;
2404               }
2405
2406             note++;
2407             break;
2408           }
2409
2410       /* Now get a char to process.  Either from an expanded note, or
2411          from the line buffer.  */
2412       bool read_note = accum.reading_p ();
2413       char c = read_note ? accum.read_char () : *pos++;
2414
2415       if (phase == PHASE_PREFIX)
2416         {
2417           if (c == '(')
2418             {
2419               /* Done.  */
2420               phase = PHASE_NONE;
2421               prefix[prefix_len++] = '"';
2422             }
2423           else if (prefix_len < 16
2424                    /* Prefix chars are any of the basic character set,
2425                       [lex.charset] except for '
2426                       ()\\\t\v\f\n'. Optimized for a contiguous
2427                       alphabet.  */
2428                    /* Unlike a switch, this collapses down to one or
2429                       two shift and bitmask operations on an ASCII
2430                       system, with an outlier or two.   */
2431                    && (('Z' - 'A' == 25
2432                         ? ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'))
2433                         : ISIDST (c))
2434                        || (c >= '0' && c <= '9')
2435                        || c == '_' || c == '{' || c == '}'
2436                        || c == '[' || c == ']' || c == '#'
2437                        || c == '<' || c == '>' || c == '%'
2438                        || c == ':' || c == ';' || c == '.' || c == '?'
2439                        || c == '*' || c == '+' || c == '-' || c == '/'
2440                        || c == '^' || c == '&' || c == '|' || c == '~'
2441                        || c == '!' || c == '=' || c == ','
2442                        || c == '"' || c == '\''))
2443             prefix[prefix_len++] = c;
2444           else
2445             {
2446               /* Something is wrong.  */
2447               int col = CPP_BUF_COLUMN (pfile->buffer, pos) + read_note;
2448               if (prefix_len == 16)
2449                 cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
2450                                      col, "raw string delimiter longer "
2451                                      "than 16 characters");
2452               else if (c == '\n')
2453                 cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
2454                                      col, "invalid new-line in raw "
2455                                      "string delimiter");
2456               else
2457                 cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
2458                                      col, "invalid character '%c' in "
2459                                      "raw string delimiter", c);
2460               type = CPP_OTHER;
2461               phase = PHASE_NONE;
2462               /* Continue until we get a close quote, that's probably
2463                  the best failure mode.  */
2464               prefix_len = 0;
2465             }
               /* A newline is not consumed here; it falls through so the
                  newline handling further down sees it.  */
2466           if (c != '\n')
2467             continue;
2468         }
2469
           /* A ')' was seen earlier: check C against the next expected
              character of the closing delimiter.  A mismatch returns to
              the string body; matching the last one ends the literal.  */
2470       if (phase != PHASE_NONE)
2471         {
2472           if (prefix[phase] != c)
2473             phase = PHASE_NONE;
2474           else if (unsigned (phase + 1) == prefix_len)
2475             break;
2476           else
2477             {
2478               phase = Phase (phase + 1);
2479               continue;
2480             }
2481         }
2482
2483       if (!prefix_len && c == '"')
2484         /* Failure mode lexing.  */
2485         goto out;
2486       else if (prefix_len && c == ')')
2487         phase = PHASE_SUFFIX;
2488       else if (!read_note && c == '\n')
2489         {
               /* End of the current line inside the string.  Put the
                  newline back so the buffer position is on it.  */
2490           pos--;
2491           pfile->buffer->cur = pos;
2492           if (pfile->state.in_directive
2493               || (pfile->state.parsing_args
2494                   && pfile->buffer->next_line >= pfile->buffer->rlimit))
2495             {
2496               cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, 0,
2497                                    "unterminated raw string");
2498               type = CPP_OTHER;
2499               goto out;
2500             }
2501
               /* Save the rest of this line, newline included (hence
                  the + 1), before moving to the next line.  */
2502           accum.append (pfile, base, pos - base + 1);
2503           _cpp_process_line_notes (pfile, false);
2504
2505           if (pfile->buffer->next_line < pfile->buffer->rlimit)
2506             CPP_INCREMENT_LINE (pfile, 0);
2507           pfile->buffer->need_line = true;
2508
2509           if (!_cpp_get_fresh_line (pfile))
2510             {
2511               /* We ran out of file and failed to get a line.  */
2512               location_t src_loc = token->src_loc;
2513               token->type = CPP_EOF;
2514               /* Tell the compiler the line number of the EOF token.  */
2515               token->src_loc = pfile->line_table->highest_line;
2516               token->flags = BOL;
2517               if (accum.first)
2518                 _cpp_release_buff (pfile, accum.first);
2519               cpp_error_with_line (pfile, CPP_DL_ERROR, src_loc, 0,
2520                                    "unterminated raw string");
2521               /* Now pop the buffer that _cpp_get_fresh_line did not.  */
2522               _cpp_pop_buffer (pfile);
2523               return;
2524             }
2525
               /* Continue lexing at the start of the fresh line.  */
2526           pos = base = pfile->buffer->cur;
2527           note = &pfile->buffer->notes[pfile->buffer->cur_note];
2528         }
2529       else if (__builtin_expect ((unsigned char) c == bidi::utf8_start, 0)
2530                && warn_bidi_p)
2531         {
2532           location_t loc;
2533           bidi::kind kind = get_bidi_utf8 (pfile, pos - 1, &loc);
2534           maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
2535         }
2536     }
2537
2538   if (warn_bidi_p)
2539     maybe_warn_bidi_on_close (pfile, pos);
2540
2541   if (CPP_OPTION (pfile, user_literals))
2542     {
2543       /* If a string format macro, say from inttypes.h, is placed touching
2544          a string literal it could be parsed as a C++11 user-defined string
2545          literal thus breaking the program.  */
2546       if (is_macro_not_literal_suffix (pfile, pos))
2547         {
2548           /* Raise a warning, but do not consume subsequent tokens.  */
2549           if (CPP_OPTION (pfile, warn_literal_suffix) && !pfile->state.skipping)
2550             cpp_warning_with_line (pfile, CPP_W_LITERAL_SUFFIX,
2551                                    token->src_loc, 0,
2552                                    "invalid suffix on literal; C++11 requires "
2553                                    "a space between literal and string macro");
2554         }
2555       /* Grab user defined literal suffix.  */
2556       else if (ISIDST (*pos))
2557         {
2558           type = cpp_userdef_string_add_type (type);
2559           ++pos;
2560
2561           while (ISIDNUM (*pos))
2562             ++pos;
2563         }
2564     }
2565
2566  out:
       /* Build the token spelling.  With nothing accumulated it is the
          span [BASE, POS) of the line buffer; otherwise it is the
          accumulated buffers in order followed by that span.  */
2567   pfile->buffer->cur = pos;
2568   if (!accum.accum)
2569     create_literal (pfile, token, base, pos - base, type);
2570   else
2571     {
2572       size_t extra_len = pos - base;
2573       uchar *dest = _cpp_unaligned_alloc (pfile, accum.accum + extra_len + 1);
2574
2575       token->type = type;
2576       token->val.str.len = accum.accum + extra_len;
2577       token->val.str.text = dest;
2578       for (_cpp_buff *buf = accum.first; buf; buf = buf->next)
2579         {
2580           size_t len = BUFF_FRONT (buf) - buf->base;
2581           memcpy (dest, buf->base, len);
2582           dest += len;
2583         }
2584       _cpp_release_buff (pfile, accum.first);
2585       memcpy (dest, base, extra_len);
2586       dest[extra_len] = '\0';
2587     }
2588 }
2589
2590 /* Lexes a string, character constant, or angle-bracketed header file
2591    name.  The stored string contains the spelling, including opening
2592    quote and any leading 'L', 'u', 'U' or 'u8' and optional
2593    'R' modifier.  It returns the type of the literal, or CPP_OTHER
2594    if it was not properly terminated, or CPP_LESS for an unterminated
2595    header name which must be relexed as normal tokens.
2596
2597    The spelling is NUL-terminated, but it is not guaranteed that this
2598    is the first NUL since embedded NULs are preserved.  */
2599 static void
2600 lex_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
2601 {
2602   bool saw_NUL = false;
2603   const uchar *cur;
2604   cppchar_t terminator;
2605   enum cpp_ttype type;
2606
2607   cur = base;
2608   terminator = *cur++;
2609   if (terminator == 'L' || terminator == 'U')
2610     terminator = *cur++;
2611   else if (terminator == 'u')
2612     {
2613       terminator = *cur++;
2614       if (terminator == '8')
2615         terminator = *cur++;
2616     }
2617   if (terminator == 'R')
2618     {
2619       lex_raw_string (pfile, token, base);
2620       return;
2621     }
2622   if (terminator == '"')
2623     type = (*base == 'L' ? CPP_WSTRING :
2624             *base == 'U' ? CPP_STRING32 :
2625             *base == 'u' ? (base[1] == '8' ? CPP_UTF8STRING : CPP_STRING16)
2626                          : CPP_STRING);
2627   else if (terminator == '\'')
2628     type = (*base == 'L' ? CPP_WCHAR :
2629             *base == 'U' ? CPP_CHAR32 :
2630             *base == 'u' ? (base[1] == '8' ? CPP_UTF8CHAR : CPP_CHAR16)
2631                          : CPP_CHAR);
2632   else
2633     terminator = '>', type = CPP_HEADER_NAME;
2634
2635   const bool warn_bidi_p = pfile->warn_bidi_p ();
2636   for (;;)
2637     {
2638       cppchar_t c = *cur++;
2639
2640       /* In #include-style directives, terminators are not escapable.  */
2641       if (c == '\\' && !pfile->state.angled_headers && *cur != '\n')
2642         {
2643           if ((cur[0] == 'u' || cur[0] == 'U') && warn_bidi_p)
2644             {
2645               location_t loc;
2646               bidi::kind kind = get_bidi_ucn (pfile, cur + 1, cur[0] == 'U',
2647                                               &loc);
2648               maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/true, loc);
2649             }
2650           cur++;
2651         }
2652       else if (c == terminator)
2653         {
2654           if (warn_bidi_p)
2655             maybe_warn_bidi_on_close (pfile, cur - 1);
2656           break;
2657         }
2658       else if (c == '\n')
2659         {
2660           cur--;
2661           /* Unmatched quotes always yield undefined behavior, but
2662              greedy lexing means that what appears to be an unterminated
2663              header name may actually be a legitimate sequence of tokens.  */
2664           if (terminator == '>')
2665             {
2666               token->type = CPP_LESS;
2667               return;
2668             }
2669           type = CPP_OTHER;
2670           break;
2671         }
2672       else if (c == '\0')
2673         saw_NUL = true;
2674       else if (__builtin_expect (c == bidi::utf8_start, 0) && warn_bidi_p)
2675         {
2676           location_t loc;
2677           bidi::kind kind = get_bidi_utf8 (pfile, cur - 1, &loc);
2678           maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
2679         }
2680     }
2681
2682   if (saw_NUL && !pfile->state.skipping)
2683     cpp_error (pfile, CPP_DL_WARNING,
2684                "null character(s) preserved in literal");
2685
2686   if (type == CPP_OTHER && CPP_OPTION (pfile, lang) != CLK_ASM)
2687     cpp_error (pfile, CPP_DL_PEDWARN, "missing terminating %c character",
2688                (int) terminator);
2689
2690   if (CPP_OPTION (pfile, user_literals))
2691     {
2692       /* If a string format macro, say from inttypes.h, is placed touching
2693          a string literal it could be parsed as a C++11 user-defined string
2694          literal thus breaking the program.  */
2695       if (is_macro_not_literal_suffix (pfile, cur))
2696         {
2697           /* Raise a warning, but do not consume subsequent tokens.  */
2698           if (CPP_OPTION (pfile, warn_literal_suffix) && !pfile->state.skipping)
2699             cpp_warning_with_line (pfile, CPP_W_LITERAL_SUFFIX,
2700                                    token->src_loc, 0,
2701                                    "invalid suffix on literal; C++11 requires "
2702                                    "a space between literal and string macro");
2703         }
2704       /* Grab user defined literal suffix.  */
2705       else if (ISIDST (*cur))
2706         {
2707           type = cpp_userdef_char_add_type (type);
2708           type = cpp_userdef_string_add_type (type);
2709           ++cur;
2710
2711           while (ISIDNUM (*cur))
2712             ++cur;
2713         }
2714     }
2715   else if (CPP_OPTION (pfile, cpp_warn_cxx11_compat)
2716            && is_macro (pfile, cur)
2717            && !pfile->state.skipping)
2718     cpp_warning_with_line (pfile, CPP_W_CXX11_COMPAT,
2719                            token->src_loc, 0, "C++11 requires a space "
2720                            "between string literal and macro");
2721
2722   pfile->buffer->cur = cur;
2723   create_literal (pfile, token, base, cur - base, type);
2724 }
2725
2726 /* Return the comment table. The client may not make any assumption
2727    about the ordering of the table.  */
2728 cpp_comment_table *
2729 cpp_get_comments (cpp_reader *pfile)
2730 {
2731   return &pfile->comments;
2732 }
2733
2734 /* Append a comment to the end of the comment table. */
2735 static void
2736 store_comment (cpp_reader *pfile, cpp_token *token)
2737 {
2738   int len;
2739
2740   if (pfile->comments.allocated == 0)
2741     {
2742       pfile->comments.allocated = 256;
2743       pfile->comments.entries = (cpp_comment *) xmalloc
2744         (pfile->comments.allocated * sizeof (cpp_comment));
2745     }
2746
2747   if (pfile->comments.count == pfile->comments.allocated)
2748     {
2749       pfile->comments.allocated *= 2;
2750       pfile->comments.entries = (cpp_comment *) xrealloc
2751         (pfile->comments.entries,
2752          pfile->comments.allocated * sizeof (cpp_comment));
2753     }
2754
2755   len = token->val.str.len;
2756
2757   /* Copy comment. Note, token may not be NULL terminated. */
2758   pfile->comments.entries[pfile->comments.count].comment =
2759     (char *) xmalloc (sizeof (char) * (len + 1));
2760   memcpy (pfile->comments.entries[pfile->comments.count].comment,
2761           token->val.str.text, len);
2762   pfile->comments.entries[pfile->comments.count].comment[len] = '\0';
2763
2764   /* Set source location. */
2765   pfile->comments.entries[pfile->comments.count].sloc = token->src_loc;
2766
2767   /* Increment the count of entries in the comment table. */
2768   pfile->comments.count++;
2769 }
2770
2771 /* The stored comment includes the comment start and any terminator.  */
2772 static void
2773 save_comment (cpp_reader *pfile, cpp_token *token, const unsigned char *from,
2774               cppchar_t type)
2775 {
2776   unsigned char *buffer;
2777   unsigned int len, clen, i;
2778
2779   len = pfile->buffer->cur - from + 1; /* + 1 for the initial '/'.  */
2780
2781   /* C++ comments probably (not definitely) have moved past a new
2782      line, which we don't want to save in the comment.  */
2783   if (is_vspace (pfile->buffer->cur[-1]))
2784     len--;
2785
2786   /* If we are currently in a directive or in argument parsing, then
2787      we need to store all C++ comments as C comments internally, and
2788      so we need to allocate a little extra space in that case.
2789
2790      Note that the only time we encounter a directive here is
2791      when we are saving comments in a "#define".  */
2792   clen = ((pfile->state.in_directive || pfile->state.parsing_args)
2793           && type == '/') ? len + 2 : len;
2794
2795   buffer = _cpp_unaligned_alloc (pfile, clen);
2796
2797   token->type = CPP_COMMENT;
2798   token->val.str.len = clen;
2799   token->val.str.text = buffer;
2800
2801   buffer[0] = '/';
2802   memcpy (buffer + 1, from, len - 1);
2803
2804   /* Finish conversion to a C comment, if necessary.  */
2805   if ((pfile->state.in_directive || pfile->state.parsing_args) && type == '/')
2806     {
2807       buffer[1] = '*';
2808       buffer[clen - 2] = '*';
2809       buffer[clen - 1] = '/';
2810       /* As there can be in a C++ comments illegal sequences for C comments
2811          we need to filter them out.  */
2812       for (i = 2; i < (clen - 2); i++)
2813         if (buffer[i] == '/' && (buffer[i - 1] == '*' || buffer[i + 1] == '*'))
2814           buffer[i] = '|';
2815     }
2816
2817   /* Finally store this comment for use by clients of libcpp. */
2818   store_comment (pfile, token);
2819 }
2820
2821 /* Returns true if comment at COMMENT_START is a recognized FALLTHROUGH
2822    comment.

        COMMENT_START points at the character that follows the comment's
        opening '/': a '*' there means a C block comment, anything else is
        handled as a C++ line comment.  Length checks are made against
        PFILE->buffer->cur, which is taken to be the end of the text to
        examine.

        How much is accepted depends on the cpp_warn_implicit_fallthrough
        option:
          1      every comment is accepted, whatever it contains;
          2      the comment only has to contain a fallthrough-like phrase
                 somewhere, matched without regard to case;
          3, 4   the whole comment has to be one of a fixed set of
                 spellings (described by the comments in the body; level 4
                 replaces the last, case-variant group with a plain
                 FALLTHRU / FALLTHROUGH match) and must be followed
                 directly by the closing star-slash of a block comment or
                 by the newline that ends a line comment;
          any other value: no comment is accepted.  */
2823
2824 static bool
2825 fallthrough_comment_p (cpp_reader *pfile, const unsigned char *comment_start)
2826 {
2827   const unsigned char *from = comment_start + 1;
2828
2829   switch (CPP_OPTION (pfile, cpp_warn_implicit_fallthrough))
2830     {
2831       /* For both -Wimplicit-fallthrough=0 and -Wimplicit-fallthrough=5 we
2832          don't recognize any comments.  The latter only checks attributes,
2833          the former doesn't warn.  */
2834     case 0:
2835     default:
2836       return false;
2837       /* -Wimplicit-fallthrough=1 considers any comment, no matter what
2838          content it has.  */
2839     case 1:
2840       return true;
2841     case 2:
2842       /* -Wimplicit-fallthrough=2 looks for (case insensitive)
2843          .*falls?[ \t-]*thr(u|ough).* regex.  */
2844       for (; (size_t) (pfile->buffer->cur - from) >= sizeof "fallthru" - 1;
2845            from++)
2846         {
2847           /* Is there anything like strpbrk with upper boundary, or
2848              memchr looking for 2 characters rather than just one?  */
2849           if (from[0] != 'f' && from[0] != 'F')
2850             continue;
2851           if (from[1] != 'a' && from[1] != 'A')
2852             continue;
2853           if (from[2] != 'l' && from[2] != 'L')
2854             continue;
2855           if (from[3] != 'l' && from[3] != 'L')
2856             continue;
               /* From here on FROM is advanced past the text already matched,
                  so a failed test below resumes the search after it rather
                  than at the character following the 'f'.  */
2857           from += sizeof "fall" - 1;
2858           if (from[0] == 's' || from[0] == 'S')
2859             from++;
2860           while (*from == ' ' || *from == '\t' || *from == '-')
2861             from++;
2862           if (from[0] != 't' && from[0] != 'T')
2863             continue;
2864           if (from[1] != 'h' && from[1] != 'H')
2865             continue;
2866           if (from[2] != 'r' && from[2] != 'R')
2867             continue;
2868           if (from[3] == 'u' || from[3] == 'U')
2869             return true;
2870           if (from[3] != 'o' && from[3] != 'O')
2871             continue;
2872           if (from[4] != 'u' && from[4] != 'U')
2873             continue;
2874           if (from[5] != 'g' && from[5] != 'G')
2875             continue;
2876           if (from[6] != 'h' && from[6] != 'H')
2877             continue;
2878           return true;
2879         }
2880       return false;
2881     case 3:
2882     case 4:
2883       break;
2884     }
2885
       /* Levels 3 and 4 only.  Each branch below advances FROM past the text
          it recognizes; the test at the end of the function then requires
          the comment to end exactly there.  */
2886   /* Whole comment contents:
2887      -fallthrough
2888      @fallthrough@
2889    */
2890   if (*from == '-' || *from == '@')
2891     {
2892       size_t len = sizeof "fallthrough" - 1;
2893       if ((size_t) (pfile->buffer->cur - from - 1) < len)
2894         return false;
2895       if (memcmp (from + 1, "fallthrough", len))
2896         return false;
2897       if (*from == '@')
2898         {
2899           if (from[len + 1] != '@')
2900             return false;
2901           len++;
2902         }
2903       from += 1 + len;
2904     }
2905   /* Whole comment contents (regex):
2906      lint -fallthrough[ \t]*
2907    */
2908   else if (*from == 'l')
2909     {
2910       size_t len = sizeof "int -fallthrough" - 1;
2911       if ((size_t) (pfile->buffer->cur - from - 1) < len)
2912         return false;
2913       if (memcmp (from + 1, "int -fallthrough", len))
2914         return false;
2915       from += 1 + len;
2916       while (*from == ' ' || *from == '\t')
2917         from++;
2918     }
2919   /* Whole comment contents (regex):
2920      [ \t]*FALLTHR(U|OUGH)[ \t]*
2921    */
2922   else if (CPP_OPTION (pfile, cpp_warn_implicit_fallthrough) == 4)
2923     {
2924       while (*from == ' ' || *from == '\t')
2925         from++;
2926       if ((size_t) (pfile->buffer->cur - from)  < sizeof "FALLTHRU" - 1)
2927         return false;
2928       if (memcmp (from, "FALLTHR", sizeof "FALLTHR" - 1))
2929         return false;
2930       from += sizeof "FALLTHR" - 1;
2931       if (*from == 'U')
2932         from++;
2933       else if ((size_t) (pfile->buffer->cur - from)  < sizeof "OUGH" - 1)
2934         return false;
2935       else if (memcmp (from, "OUGH", sizeof "OUGH" - 1))
2936         return false;
2937       else
2938         from += sizeof "OUGH" - 1;
2939       while (*from == ' ' || *from == '\t')
2940         from++;
2941     }
2942   /* Whole comment contents (regex):
2943      [ \t.!]*(ELSE,? |INTENTIONAL(LY)? )?FALL(S | |-)?THR(OUGH|U)[ \t.!]*(-[^\n\r]*)?
2944      [ \t.!]*(Else,? |Intentional(ly)? )?Fall((s | |-)[Tt]|t)hr(ough|u)[ \t.!]*(-[^\n\r]*)?
2945      [ \t.!]*([Ee]lse,? |[Ii]ntentional(ly)? )?fall(s | |-)?thr(ough|u)[ \t.!]*(-[^\n\r]*)?
2946    */
2947   else
2948     {
2949       while (*from == ' ' || *from == '\t' || *from == '.' || *from == '!')
2950         from++;
           /* F tracks the first letter of the word currently being matched.
              ALL_UPPER is set once a word has been seen spelled entirely in
              capitals; the remaining words must then be upper case too.  */
2951       unsigned char f = *from;
2952       bool all_upper = false;
2953       if (f == 'E' || f == 'e')
2954         {
2955           if ((size_t) (pfile->buffer->cur - from)
2956               < sizeof "else fallthru" - 1)
2957             return false;
2958           if (f == 'E' && memcmp (from + 1, "LSE", sizeof "LSE" - 1) == 0)
2959             all_upper = true;
2960           else if (memcmp (from + 1, "lse", sizeof "lse" - 1))
2961             return false;
2962           from += sizeof "else" - 1;
2963           if (*from == ',')
2964             from++;
2965           if (*from != ' ')
2966             return false;
2967           from++;
2968           if (all_upper && *from == 'f')
2969             return false;
2970           if (f == 'e' && *from == 'F')
2971             return false;
2972           f = *from;
2973         }
2974       else if (f == 'I' || f == 'i')
2975         {
2976           if ((size_t) (pfile->buffer->cur - from)
2977               < sizeof "intentional fallthru" - 1)
2978             return false;
2979           if (f == 'I' && memcmp (from + 1, "NTENTIONAL",
2980                                   sizeof "NTENTIONAL" - 1) == 0)
2981             all_upper = true;
2982           else if (memcmp (from + 1, "ntentional",
2983                            sizeof "ntentional" - 1))
2984             return false;
2985           from += sizeof "intentional" - 1;
2986           if (*from == ' ')
2987             {
2988               from++;
2989               if (all_upper && *from == 'f')
2990                 return false;
2991             }
2992           else if (all_upper)
2993             {
2994               if (memcmp (from, "LY F", sizeof "LY F" - 1))
2995                 return false;
2996               from += sizeof "LY " - 1;
2997             }
2998           else
2999             {
3000               if (memcmp (from, "ly ", sizeof "ly " - 1))
3001                 return false;
3002               from += sizeof "ly " - 1;
3003             }
3004           if (f == 'i' && *from == 'F')
3005             return false;
3006           f = *from;
3007         }
3008       if (f != 'F' && f != 'f')
3009         return false;
3010       if ((size_t) (pfile->buffer->cur - from) < sizeof "fallthru" - 1)
3011         return false;
3012       if (f == 'F' && memcmp (from + 1, "ALL", sizeof "ALL" - 1) == 0)
3013         all_upper = true;
3014       else if (all_upper)
3015         return false;
3016       else if (memcmp (from + 1, "all", sizeof "all" - 1))
3017         return false;
3018       from += sizeof "fall" - 1;
3019       if (*from == (all_upper ? 'S' : 's') && from[1] == ' ')
3020         from += 2;
3021       else if (*from == ' ' || *from == '-')
3022         from++;
3023       else if (*from != (all_upper ? 'T' : 't'))
3024         return false;
3025       if ((f == 'f' || *from != 'T') && (all_upper || *from != 't'))
3026         return false;
3027       if ((size_t) (pfile->buffer->cur - from) < sizeof "thru" - 1)
3028         return false;
3029       if (memcmp (from + 1, all_upper ? "HRU" : "hru", sizeof "hru" - 1))
3030         {
3031           if ((size_t) (pfile->buffer->cur - from) < sizeof "through" - 1)
3032             return false;
3033           if (memcmp (from + 1, all_upper ? "HROUGH" : "hrough",
3034                       sizeof "hrough" - 1))
3035             return false;
3036           from += sizeof "through" - 1;
3037         }
3038       else
3039         from += sizeof "thru" - 1;
3040       while (*from == ' ' || *from == '\t' || *from == '.' || *from == '!')
3041         from++;
           /* A '-' after the keyword introduces free-form trailing text.
              Skip it up to the end of the line or a NUL; in a block comment
              also stop at a '*' that is followed by '/'.  */
3042       if (*from == '-')
3043         {
3044           from++;
3045           if (*comment_start == '*')
3046             {
3047               do
3048                 {
3049                   while (*from && *from != '*'
3050                          && *from != '\n' && *from != '\r')
3051                     from++;
3052                   if (*from != '*' || from[1] == '/')
3053                     break;
3054                   from++;
3055                 }
3056               while (1);
3057             }
3058           else
3059             while (*from && *from != '\n' && *from != '\r')
3060               from++;
3061         }
3062     }
       /* Nothing but the end of the comment may follow the text matched
          above.  */
3063   /* C block comment.  */
3064   if (*comment_start == '*')
3065     {
3066       if (*from != '*' || from[1] != '/')
3067         return false;
3068     }
3069   /* C++ line comment.  */
3070   else if (*from != '\n')
3071     return false;
3072
3073   return true;
3074 }
3075
3076 /* Allocate COUNT tokens for RUN.  */
3077 void
3078 _cpp_init_tokenrun (tokenrun *run, unsigned int count)
3079 {
3080   run->base = XNEWVEC (cpp_token, count);
3081   run->limit = run->base + count;
3082   run->next = NULL;
3083 }
3084
3085 /* Returns the next tokenrun, or creates one if there is none.  */
3086 static tokenrun *
3087 next_tokenrun (tokenrun *run)
3088 {
3089   if (run->next == NULL)
3090     {
3091       run->next = XNEW (tokenrun);
3092       run->next->prev = run;
3093       _cpp_init_tokenrun (run->next, 250);
3094     }
3095
3096   return run->next;
3097 }
3098
3099 /* Return the number of not yet processed token in a given
3100    context.  */
3101 int
3102 _cpp_remaining_tokens_num_in_context (cpp_context *context)
3103 {
3104   if (context->tokens_kind == TOKENS_KIND_DIRECT)
3105     return (LAST (context).token - FIRST (context).token);
3106   else if (context->tokens_kind == TOKENS_KIND_INDIRECT
3107            || context->tokens_kind == TOKENS_KIND_EXTENDED)
3108     return (LAST (context).ptoken - FIRST (context).ptoken);
3109   else
3110       abort ();
3111 }
3112
3113 /* Returns the token present at index INDEX in a given context.  If
3114    INDEX is zero, the next token to be processed is returned.  */
3115 static const cpp_token*
3116 _cpp_token_from_context_at (cpp_context *context, int index)
3117 {
3118   if (context->tokens_kind == TOKENS_KIND_DIRECT)
3119     return &(FIRST (context).token[index]);
3120   else if (context->tokens_kind == TOKENS_KIND_INDIRECT
3121            || context->tokens_kind == TOKENS_KIND_EXTENDED)
3122     return FIRST (context).ptoken[index];
3123  else
3124    abort ();
3125 }
3126
3127 /* Look ahead in the input stream.  */
3128 const cpp_token *
3129 cpp_peek_token (cpp_reader *pfile, int index)
3130 {
3131   cpp_context *context = pfile->context;
3132   const cpp_token *peektok;
3133   int count;
3134
3135   /* First, scan through any pending cpp_context objects.  */
3136   while (context->prev)
3137     {
3138       ptrdiff_t sz = _cpp_remaining_tokens_num_in_context (context);
3139
3140       if (index < (int) sz)
3141         return _cpp_token_from_context_at (context, index);
3142       index -= (int) sz;
3143       context = context->prev;
3144     }
3145
3146   /* We will have to read some new tokens after all (and do so
3147      without invalidating preceding tokens).  */
3148   count = index;
3149   pfile->keep_tokens++;
3150
3151   /* For peeked tokens temporarily disable line_change reporting,
3152      until the tokens are parsed for real.  */
3153   void (*line_change) (cpp_reader *, const cpp_token *, int)
3154     = pfile->cb.line_change;
3155   pfile->cb.line_change = NULL;
3156
3157   do
3158     {
3159       peektok = _cpp_lex_token (pfile);
3160       if (peektok->type == CPP_EOF)
3161         {
3162           index--;
3163           break;
3164         }
3165       else if (peektok->type == CPP_PRAGMA)
3166         {
3167           /* Don't peek past a pragma.  */
3168           if (peektok == &pfile->directive_result)
3169             /* Save the pragma in the buffer.  */
3170             *pfile->cur_token++ = *peektok;
3171           index--;
3172           break;
3173         }
3174     }
3175   while (index--);
3176
3177   _cpp_backup_tokens_direct (pfile, count - index);
3178   pfile->keep_tokens--;
3179   pfile->cb.line_change = line_change;
3180
3181   return peektok;
3182 }
3183
3184 /* Allocate a single token that is invalidated at the same time as the
3185    rest of the tokens on the line.  Has its line and col set to the
3186    same as the last lexed token, so that diagnostics appear in the
3187    right place.  */
3188 cpp_token *
3189 _cpp_temp_token (cpp_reader *pfile)
3190 {
3191   cpp_token *old, *result;
3192   ptrdiff_t sz = pfile->cur_run->limit - pfile->cur_token;
3193   ptrdiff_t la = (ptrdiff_t) pfile->lookaheads;
3194
3195   old = pfile->cur_token - 1;
3196   /* Any pre-existing lookaheads must not be clobbered.  */
3197   if (la)
3198     {
3199       if (sz <= la)
3200         {
3201           tokenrun *next = next_tokenrun (pfile->cur_run);
3202
3203           if (sz < la)
3204             memmove (next->base + 1, next->base,
3205                      (la - sz) * sizeof (cpp_token));
3206
3207           next->base[0] = pfile->cur_run->limit[-1];
3208         }
3209
3210       if (sz > 1)
3211         memmove (pfile->cur_token + 1, pfile->cur_token,
3212                  MIN (la, sz - 1) * sizeof (cpp_token));
3213     }
3214
3215   if (!sz && pfile->cur_token == pfile->cur_run->limit)
3216     {
3217       pfile->cur_run = next_tokenrun (pfile->cur_run);
3218       pfile->cur_token = pfile->cur_run->base;
3219     }
3220
3221   result = pfile->cur_token++;
3222   result->src_loc = old->src_loc;
3223   return result;
3224 }
3225
3226 /* We're at the beginning of a logical line (so not in
3227   directives-mode) and RESULT is a CPP_NAME with NODE_MODULE set.  See
3228   if we should enter deferred_pragma mode to tokenize the rest of the
3229   line as a module control-line.

       To decide, one token is peeked after an `export' keyword and one more
       after the module/import keyword, using _cpp_lex_direct.  When the line
       qualifies, the keyword tokens are flagged NO_EXPAND, their nodes are
       replaced by the second-column entries of spec_nodes.n_modules, and
       the lexer is left in deferred pragma mode.  Otherwise the state that
       was changed for peeking is restored.  In both cases the peeked tokens
       are pushed back at the end, except for a lone end-of-line token.  */
3230
3231 static void
3232 cpp_maybe_module_directive (cpp_reader *pfile, cpp_token *result)
3233 {
3234   unsigned backup = 0; /* Tokens we peeked.  */
3235   cpp_hashnode *node = result->val.node.node;
3236   cpp_token *peek = result;
3237   cpp_token *keyword = peek;
3238   cpp_hashnode *(&n_modules)[spec_nodes::M_HWM][2] = pfile->spec_nodes.n_modules;
       /* Stays zero for `module'; non-zero for the import forms, where it is
          later stored in state.directive_file_token.
          NOTE(review): the meaning of the "+ 2" and "+ 16" terms below is
          defined by the consumer of directive_file_token, which is not
          visible here -- confirm before changing them.  */
3239   int header_count = 0;
3240
3241   /* Make sure the incoming state is as we expect it.  This way we
3242      can restore it using constants.  */
3243   gcc_checking_assert (!pfile->state.in_deferred_pragma
3244                        && !pfile->state.skipping
3245                        && !pfile->state.parsing_args
3246                        && !pfile->state.angled_headers
3247                        && (pfile->state.save_comments
3248                            == !CPP_OPTION (pfile, discard_comments)));
3249
3250   /* Enter directives mode sufficiently for peeking.  We don't have
3251      to actually set in_directive.  */
3252   pfile->state.in_deferred_pragma = true;
3253
3254   /* These two fields are needed to process tokenization in deferred
3255      pragma mode.  They are not used outside deferred pragma mode or
3256      directives mode.  */
3257   pfile->state.pragma_allow_expansion = true;
3258   pfile->directive_line = result->src_loc;
3259
3260   /* Saving comments is incompatible with directives mode.   */
3261   pfile->state.save_comments = 0;
3262
       /* An `export' may precede the real keyword; step over it.  */
3263   if (node == n_modules[spec_nodes::M_EXPORT][0])
3264     {
3265       peek = _cpp_lex_direct (pfile);
3266       keyword = peek;
3267       backup++;
3268       if (keyword->type != CPP_NAME)
3269         goto not_module;
3270       node = keyword->val.node.node;
3271       if (!(node->flags & NODE_MODULE))
3272         goto not_module;
3273     }
3274
3275   if (node == n_modules[spec_nodes::M__IMPORT][0])
3276     /* __import  */
3277     header_count = backup + 2 + 16;
3278   else if (node == n_modules[spec_nodes::M_IMPORT][0])
3279     /* import  */
3280     header_count = backup + 2 + (CPP_OPTION (pfile, preprocessed) ? 16 : 0);
3281   else if (node == n_modules[spec_nodes::M_MODULE][0])
3282     ; /* module  */
3283   else
3284     goto not_module;
3285
3286   /* We've seen [export] {module|import|__import}.  Check the next token.  */
3287   if (header_count)
3288     /* After '{,__}import' a header name may appear.  */
3289     pfile->state.angled_headers = true;
3290   peek = _cpp_lex_direct (pfile);
3291   backup++;
3292
3293   /* ... import followed by identifier, ':', '<' or
3294      header-name preprocessing tokens, or module
3295      followed by cpp-identifier, ':' or ';' preprocessing
3296      tokens.  C++ keywords are not yet relevant.  */
3297   if (peek->type == CPP_NAME
3298       || peek->type == CPP_COLON
3299       ||  (header_count
3300            ? (peek->type == CPP_LESS
3301               || (peek->type == CPP_STRING && peek->val.str.text[0] != 'R')
3302               || peek->type == CPP_HEADER_NAME)
3303            : peek->type == CPP_SEMICOLON))
3304     {
3305       pfile->state.pragma_allow_expansion = !CPP_OPTION (pfile, preprocessed);
3306       if (!pfile->state.pragma_allow_expansion)
3307         pfile->state.prevent_expansion++;
3308
3309       if (!header_count && linemap_included_from
3310           (LINEMAPS_LAST_ORDINARY_MAP (pfile->line_table)))
3311         cpp_error_with_line (pfile, CPP_DL_ERROR, keyword->src_loc, 0,
3312                              "module control-line cannot be in included file");
3313
3314       /* The first one or two tokens cannot be macro names.  */
           /* BACKUP is 1 for a bare keyword and 2 after `export'; a non-zero
              IX selects KEYWORD, IX == 0 selects RESULT.  */
3315       for (int ix = backup; ix--;)
3316         {
3317           cpp_token *tok = ix ? keyword : result;
3318           cpp_hashnode *node = tok->val.node.node;
3319
3320           /* Don't attempt to expand the token.  */
3321           tok->flags |= NO_EXPAND;
3322           if (_cpp_defined_macro_p (node)
3323               && _cpp_maybe_notify_macro_use (pfile, node, tok->src_loc)
3324               && !cpp_fun_like_macro_p (node))
3325             cpp_error_with_line (pfile, CPP_DL_ERROR, tok->src_loc, 0,
3326                                  "module control-line \"%s\" cannot be"
3327                                  " an object-like macro",
3328                                  NODE_NAME (node));
3329         }
3330
3331       /* Map to underbar variants.  */
3332       keyword->val.node.node = n_modules[header_count
3333                                          ? spec_nodes::M_IMPORT
3334                                          : spec_nodes::M_MODULE][1];
3335       if (backup != 1)
3336         result->val.node.node = n_modules[spec_nodes::M_EXPORT][1];
3337
3338       /* Maybe tell the tokenizer we expect a header-name down the
3339          road.  */
3340       pfile->state.directive_file_token = header_count;
3341     }
3342   else
3343     {
3344     not_module:
3345       /* Drop out of directive mode.  */
3346       /* We asserted save_comments had this value upon entry.  */
3347       pfile->state.save_comments
3348         = !CPP_OPTION (pfile, discard_comments);
3349       pfile->state.in_deferred_pragma = false;
3350       /* Do not let this remain on.  */
3351       pfile->state.angled_headers = false;
3352     }
3353
3354   /* In either case we want to backup the peeked tokens.  */
3355   if (backup)
3356     {
3357       /* If we saw EOL, we should drop it, because this isn't a module
3358          control-line after all.  */
3359       bool eol = peek->type == CPP_PRAGMA_EOL;
3360       if (!eol || backup > 1)
3361         {
3362           /* Put the peeked tokens back.  */
3363           _cpp_backup_tokens_direct (pfile, backup);
3364           /* But if the last one was an EOL, forget it.  */
3365           if (eol)
3366             pfile->lookaheads--;
3367         }
3368     }
3369 }
3370
3371 /* Lex a token into RESULT (external interface).  Takes care of issues
3372    like directive handling, token lookahead, multiple include
3373    optimization and skipping.  */
3374 const cpp_token *
3375 _cpp_lex_token (cpp_reader *pfile)
3376 {
3377   cpp_token *result;
3378
3379   for (;;)
3380     {
3381       if (pfile->cur_token == pfile->cur_run->limit)
3382         {
3383           pfile->cur_run = next_tokenrun (pfile->cur_run);
3384           pfile->cur_token = pfile->cur_run->base;
3385         }
3386       /* We assume that the current token is somewhere in the current
3387          run.  */
3388       if (pfile->cur_token < pfile->cur_run->base
3389           || pfile->cur_token >= pfile->cur_run->limit)
3390         abort ();
3391
3392       if (pfile->lookaheads)
3393         {
3394           pfile->lookaheads--;
3395           result = pfile->cur_token++;
3396         }
3397       else
3398         result = _cpp_lex_direct (pfile);
3399
3400       if (result->flags & BOL)
3401         {
3402           /* Is this a directive.  If _cpp_handle_directive returns
3403              false, it is an assembler #.  */
3404           if (result->type == CPP_HASH
3405               /* 6.10.3 p 11: Directives in a list of macro arguments
3406                  gives undefined behavior.  This implementation
3407                  handles the directive as normal.  */
3408               && pfile->state.parsing_args != 1)
3409             {
3410               if (_cpp_handle_directive (pfile, result->flags & PREV_WHITE))
3411                 {
3412                   if (pfile->directive_result.type == CPP_PADDING)
3413                     continue;
3414                   result = &pfile->directive_result;
3415                 }
3416             }
3417           else if (pfile->state.in_deferred_pragma)
3418             result = &pfile->directive_result;
3419           else if (result->type == CPP_NAME
3420                    && (result->val.node.node->flags & NODE_MODULE)
3421                    && !pfile->state.skipping
3422                    /* Unlike regular directives, we do not deal with
3423                       tokenizing module directives as macro arguments.
3424                       That's not permitted.  */
3425                    && !pfile->state.parsing_args)
3426             {
3427               /* P1857.  Before macro expansion, At start of logical
3428                  line ... */
3429               /* We don't have to consider lookaheads at this point.  */
3430               gcc_checking_assert (!pfile->lookaheads);
3431
3432               cpp_maybe_module_directive (pfile, result);
3433             }
3434
3435           if (pfile->cb.line_change && !pfile->state.skipping)
3436             pfile->cb.line_change (pfile, result, pfile->state.parsing_args);
3437         }
3438
3439       /* We don't skip tokens in directives.  */
3440       if (pfile->state.in_directive || pfile->state.in_deferred_pragma)
3441         break;
3442
3443       /* Outside a directive, invalidate controlling macros.  At file
3444          EOF, _cpp_lex_direct takes care of popping the buffer, so we never
3445          get here and MI optimization works.  */
3446       pfile->mi_valid = false;
3447
3448       if (!pfile->state.skipping || result->type == CPP_EOF)
3449         break;
3450     }
3451
3452   return result;
3453 }
3454
3455 /* Returns true if a fresh line has been loaded.  */
3456 bool
3457 _cpp_get_fresh_line (cpp_reader *pfile)
3458 {
3459   /* We can't get a new line until we leave the current directive.  */
3460   if (pfile->state.in_directive)
3461     return false;
3462
3463   for (;;)
3464     {
3465       cpp_buffer *buffer = pfile->buffer;
3466
3467       if (!buffer->need_line)
3468         return true;
3469
3470       if (buffer->next_line < buffer->rlimit)
3471         {
3472           _cpp_clean_line (pfile);
3473           return true;
3474         }
3475
3476       /* First, get out of parsing arguments state.  */
3477       if (pfile->state.parsing_args)
3478         return false;
3479
3480       /* End of buffer.  Non-empty files should end in a newline.  */
3481       if (buffer->buf != buffer->rlimit
3482           && buffer->next_line > buffer->rlimit
3483           && !buffer->from_stage3)
3484         {
3485           /* Clip to buffer size.  */
3486           buffer->next_line = buffer->rlimit;
3487         }
3488
3489       if (buffer->prev && !buffer->return_at_eof)
3490         _cpp_pop_buffer (pfile);
3491       else
3492         {
3493           /* End of translation.  Do not pop the buffer yet. Increment
3494              line number so that the EOF token is on a line of its own
3495              (_cpp_lex_direct doesn't increment in that case, because
3496              it's hard for it to distinguish this special case). */
3497           CPP_INCREMENT_LINE (pfile, 0);
3498           return false;
3499         }
3500     }
3501 }
3502
/* Set result->type from the next character in the buffer: when it is
   CHAR, consume it and use THEN_TYPE, otherwise leave the buffer alone
   and use ELSE_TYPE.  Relies on variables named `buffer' and `result'
   being in scope where the macro is used.  */
#define IF_NEXT_IS(CHAR, THEN_TYPE, ELSE_TYPE)          \
  do                                                    \
    {                                                   \
      result->type = ELSE_TYPE;                         \
      if (*buffer->cur == CHAR)                         \
        {                                               \
          buffer->cur++;                                \
          result->type = THEN_TYPE;                     \
        }                                               \
    }                                                   \
  while (0)
3511
3512 /* Lex a token into pfile->cur_token, which is also incremented, to
3513    get diagnostics pointing to the correct location.
3514
3515    Does not handle issues such as token lookahead, multiple-include
3516    optimization, directives, skipping etc.  This function is only
3517    suitable for use by _cpp_lex_token, and in special cases like
3518    lex_expansion_token which doesn't care for any of these issues.
3519
3520    When meeting a newline, returns CPP_EOF if parsing a directive,
3521    otherwise returns to the start of the token buffer if permissible.
3522    Returns a pointer to the lexed token.  On the normal exit path a
        token location that is not a reserved one (and whose type is not
        CPP_EOF) is converted to a range that ends at the current buffer
        position.  */
3523 cpp_token *
3524 _cpp_lex_direct (cpp_reader *pfile)
3525 {
3526   cppchar_t c;
3527   cpp_buffer *buffer;
3528   const unsigned char *comment_start;
3529   bool fallthrough_comment = false;
       /* Claim the next token slot.  RESULT may be redirected below to the
          first slot of the base run when tokens need not be kept.  */
3530   cpp_token *result = pfile->cur_token++;
3531
       /* Entered on function entry and again after every newline that
          does not terminate a deferred pragma.  */
3532  fresh_line:
3533   result->flags = 0;
3534   buffer = pfile->buffer;
3535   if (buffer->need_line)
3536     {
3537       if (pfile->state.in_deferred_pragma)
3538         {
3539           /* This can happen in cases like:
3540              #define loop(x) whatever
3541              #pragma omp loop
3542              where when trying to expand loop we need to peek
3543              next token after loop, but aren't still in_deferred_pragma
3544              mode but are in in_directive mode, so buffer->need_line
3545              is set, a CPP_EOF is peeked.  */
3546           result->type = CPP_PRAGMA_EOL;
3547           pfile->state.in_deferred_pragma = false;
3548           if (!pfile->state.pragma_allow_expansion)
3549             pfile->state.prevent_expansion--;
3550           return result;
3551         }
3552       if (!_cpp_get_fresh_line (pfile))
3553         {
3554           result->type = CPP_EOF;
3555           /* Not a real EOF in a directive or arg parsing -- we refuse
3556              to advance to the next file now, and will once we're out
3557              of those modes.  */
3558           if (!pfile->state.in_directive && !pfile->state.parsing_args)
3559             {
3560               /* Tell the compiler the line number of the EOF token.  */
3561               result->src_loc = pfile->line_table->highest_line;
3562               result->flags = BOL;
3563               /* Now pop the buffer that _cpp_get_fresh_line did not.  */
3564               _cpp_pop_buffer (pfile);
3565             }
3566           return result;
3567         }
           /* pfile->buffer differs from the buffer sampled above, so a
              pending FALLTHROUGH comment from the old buffer is dropped.  */
3568       if (buffer != pfile->buffer)
3569         fallthrough_comment = false;
           /* Tokens need not be kept: reuse the first slot of the base
              run for this token.  */
3570       if (!pfile->keep_tokens)
3571         {
3572           pfile->cur_run = &pfile->base_run;
3573           result = pfile->base_run.base;
3574           pfile->cur_token = result + 1;
3575         }
3576       result->flags = BOL;
3577       if (pfile->state.parsing_args == 2)
3578         result->flags |= PREV_WHITE;
3579     }
3580   buffer = pfile->buffer;
       /* Re-entered after a comment has been skipped rather than saved.  */
3581  update_tokens_line:
3582   result->src_loc = pfile->line_table->highest_line;
3583
       /* Re-entered after a run of horizontal whitespace.  */
3584  skipped_white:
3585   if (buffer->cur >= buffer->notes[buffer->cur_note].pos
3586       && !pfile->overlaid_buffer)
3587     {
3588       _cpp_process_line_notes (pfile, false);
3589       result->src_loc = pfile->line_table->highest_line;
3590     }
       /* Fetch the first character of the token; from here on
          buffer->cur - 1 addresses that character.  */
3591   c = *buffer->cur++;
3592
3593   if (pfile->forced_token_location)
3594     result->src_loc = pfile->forced_token_location;
3595   else
3596     result->src_loc = linemap_position_for_column (pfile->line_table,
3597                                           CPP_BUF_COLUMN (buffer, buffer->cur));
3598
3599   switch (c)
3600     {
3601     case ' ': case '\t': case '\f': case '\v': case '\0':
3602       result->flags |= PREV_WHITE;
3603       skip_whitespace (pfile, c);
3604       goto skipped_white;
3605
3606     case '\n':
3607       /* Increment the line, unless this is the last line ...  */
3608       if (buffer->cur < buffer->rlimit
3609           /* ... or this is a #include, (where _cpp_stack_file needs to
3610              unwind by one line) ...  */
3611           || (pfile->state.in_directive > 1
3612               /* ... except traditional-cpp increments this elsewhere.  */
3613               && !CPP_OPTION (pfile, traditional)))
3614         CPP_INCREMENT_LINE (pfile, 0);
3615       buffer->need_line = true;
3616       if (pfile->state.in_deferred_pragma)
3617         {
3618           /* Produce the PRAGMA_EOL on this line.  File reading
3619              ensures there is always a \n at end of the buffer, thus
3620              in a deferred pragma we always see CPP_PRAGMA_EOL before
3621              any CPP_EOF.  */
3622           result->type = CPP_PRAGMA_EOL;
3623           result->flags &= ~PREV_WHITE;
3624           pfile->state.in_deferred_pragma = false;
3625           if (!pfile->state.pragma_allow_expansion)
3626             pfile->state.prevent_expansion--;
3627           return result;
3628         }
3629       goto fresh_line;
3630
3631     case '0': case '1': case '2': case '3': case '4':
3632     case '5': case '6': case '7': case '8': case '9':
3633       {
3634         struct normalize_state nst = INITIAL_NORMALIZE_STATE;
3635         result->type = CPP_NUMBER;
3636         lex_number (pfile, &result->val.str, &nst);
3637         warn_about_normalization (pfile, result, &nst);
3638         break;
3639       }
3640
3641     case 'L':
3642     case 'u':
3643     case 'U':
3644     case 'R':
3645       /* 'L', 'u', 'U', 'u8' or 'R' may introduce wide characters,
3646          wide strings or raw strings.  */
3647       if (c == 'L' || CPP_OPTION (pfile, rliterals)
3648           || (c != 'R' && CPP_OPTION (pfile, uliterals)))
3649         {
3650           if ((*buffer->cur == '\'' && c != 'R')
3651               || *buffer->cur == '"'
3652               || (*buffer->cur == 'R'
3653                   && c != 'R'
3654                   && buffer->cur[1] == '"'
3655                   && CPP_OPTION (pfile, rliterals))
3656               || (*buffer->cur == '8'
3657                   && c == 'u'
3658                   && ((buffer->cur[1] == '"' || (buffer->cur[1] == '\''
3659                                 && CPP_OPTION (pfile, utf8_char_literals)))
3660                       || (buffer->cur[1] == 'R' && buffer->cur[2] == '"'
3661                           && CPP_OPTION (pfile, rliterals)))))
3662             {
3663               lex_string (pfile, result, buffer->cur - 1);
3664               break;
3665             }
3666         }
3667       /* Fall through.  */
3668
         /* The letters 'u', 'L', 'R' and 'U' are absent from the lists
            below because they are handled by the case above, which falls
            through to here when no literal prefix is recognised.  */
3669     case '_':
3670     case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
3671     case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
3672     case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
3673     case 's': case 't':           case 'v': case 'w': case 'x':
3674     case 'y': case 'z':
3675     case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
3676     case 'G': case 'H': case 'I': case 'J': case 'K':
3677     case 'M': case 'N': case 'O': case 'P': case 'Q':
3678     case 'S': case 'T':           case 'V': case 'W': case 'X':
3679     case 'Y': case 'Z':
3680       result->type = CPP_NAME;
3681       {
3682         struct normalize_state nst = INITIAL_NORMALIZE_STATE;
3683         result->val.node.node = lex_identifier (pfile, buffer->cur - 1, false,
3684                                                 &nst,
3685                                                 &result->val.node.spelling);
3686         warn_about_normalization (pfile, result, &nst);
3687       }
3688
3689       /* Convert named operators to their proper types.  */
3690       if (result->val.node.node->flags & NODE_OPERATOR)
3691         {
3692           result->flags |= NAMED_OP;
3693           result->type = (enum cpp_ttype) result->val.node.node->directive_index;
3694         }
3695
3696       /* Signal FALLTHROUGH comment followed by another token.  */
3697       if (fallthrough_comment)
3698         result->flags |= PREV_FALLTHROUGH;
3699       break;
3700
3701     case '\'':
3702     case '"':
3703       lex_string (pfile, result, buffer->cur - 1);
3704       break;
3705
3706     case '/':
3707       /* A potential block or line comment.  */
           /* COMMENT_START addresses the character after the '/', and C is
              reloaded with that character without consuming it.  */
3708       comment_start = buffer->cur;
3709       c = *buffer->cur;
3710
3711       if (c == '*')
3712         {
3713           if (_cpp_skip_block_comment (pfile))
3714             cpp_error (pfile, CPP_DL_ERROR, "unterminated comment");
3715         }
3716       else if (c == '/' && ! CPP_OPTION (pfile, traditional))
3717         {
3718           /* Don't warn for system headers.  */
3719           if (_cpp_in_system_header (pfile))
3720             ;
3721           /* Warn about comments if pedantically GNUC89, and not
3722              in system headers.  */
3723           else if (CPP_OPTION (pfile, lang) == CLK_GNUC89
3724                    && CPP_PEDANTIC (pfile)
3725                    && ! buffer->warned_cplusplus_comments)
3726             {
3727               if (cpp_error (pfile, CPP_DL_PEDWARN,
3728                              "C++ style comments are not allowed in ISO C90"))
3729                 cpp_error (pfile, CPP_DL_NOTE,
3730                            "(this will be reported only once per input file)");
3731               buffer->warned_cplusplus_comments = 1;
3732             }
3733           /* Or if specifically desired via -Wc90-c99-compat.  */
3734           else if (CPP_OPTION (pfile, cpp_warn_c90_c99_compat) > 0
3735                    && ! CPP_OPTION (pfile, cplusplus)
3736                    && ! buffer->warned_cplusplus_comments)
3737             {
3738               if (cpp_error (pfile, CPP_DL_WARNING,
3739                              "C++ style comments are incompatible with C90"))
3740                 cpp_error (pfile, CPP_DL_NOTE,
3741                            "(this will be reported only once per input file)");
3742               buffer->warned_cplusplus_comments = 1;
3743             }
3744           /* In C89/C94, C++ style comments are forbidden.  */
3745           else if ((CPP_OPTION (pfile, lang) == CLK_STDC89
3746                     || CPP_OPTION (pfile, lang) == CLK_STDC94))
3747             {
3748               /* But don't be confused about valid code such as
3749                  - // immediately followed by *,
3750                  - // in a preprocessing directive,
3751                  - // in an #if 0 block.  */
3752               if (buffer->cur[1] == '*'
3753                   || pfile->state.in_directive
3754                   || pfile->state.skipping)
3755                 {
3756                   result->type = CPP_DIV;
3757                   break;
3758                 }
3759               else if (! buffer->warned_cplusplus_comments)
3760                 {
3761                   if (cpp_error (pfile, CPP_DL_ERROR,
3762                                  "C++ style comments are not allowed in "
3763                                  "ISO C90"))
3764                     cpp_error (pfile, CPP_DL_NOTE,
3765                                "(this will be reported only once per input "
3766                                "file)");
3767                   buffer->warned_cplusplus_comments = 1;
3768                 }
3769             }
3770           if (skip_line_comment (pfile) && CPP_OPTION (pfile, warn_comments))
3771             cpp_warning (pfile, CPP_W_COMMENTS, "multi-line comment");
3772         }
3773       else if (c == '=')
3774         {
3775           buffer->cur++;
3776           result->type = CPP_DIV_EQ;
3777           break;
3778         }
3779       else
3780         {
3781           result->type = CPP_DIV;
3782           break;
3783         }
3784
           /* Only the two comment branches above reach this point; every
              other branch has already left the switch.  */
3785       if (fallthrough_comment_p (pfile, comment_start))
3786         fallthrough_comment = true;
3787
3788       if (pfile->cb.comment)
3789         {
               /* The callback is handed the text starting at the leading
                  '/', hence comment_start - 1 and len + 1.  */
3790           size_t len = pfile->buffer->cur - comment_start;
3791           pfile->cb.comment (pfile, result->src_loc, comment_start - 1,
3792                              len + 1);
3793         }
3794
           /* Comments are not being saved: record the comment as preceding
              whitespace and lex the next token into the same slot.  */
3795       if (!pfile->state.save_comments)
3796         {
3797           result->flags |= PREV_WHITE;
3798           goto update_tokens_line;
3799         }
3800
3801       if (fallthrough_comment)
3802         result->flags |= PREV_FALLTHROUGH;
3803
3804       /* Save the comment as a token in its own right.  */
3805       save_comment (pfile, result, comment_start, c);
3806       break;
3807
3808     case '<':
3809       if (pfile->state.angled_headers)
3810         {
               /* If lex_string leaves any type other than CPP_LESS in
                  RESULT the token is complete; otherwise continue with
                  ordinary '<' handling below.  */
3811           lex_string (pfile, result, buffer->cur - 1);
3812           if (result->type != CPP_LESS)
3813             break;
3814         }
3815
3816       result->type = CPP_LESS;
3817       if (*buffer->cur == '=')
3818         {
3819           buffer->cur++, result->type = CPP_LESS_EQ;
3820           if (*buffer->cur == '>'
3821               && CPP_OPTION (pfile, cplusplus)
3822               && CPP_OPTION (pfile, lang) >= CLK_GNUCXX20)
3823             buffer->cur++, result->type = CPP_SPACESHIP;
3824         }
3825       else if (*buffer->cur == '<')
3826         {
3827           buffer->cur++;
3828           IF_NEXT_IS ('=', CPP_LSHIFT_EQ, CPP_LSHIFT);
3829         }
3830       else if (CPP_OPTION (pfile, digraphs))
3831         {
3832           if (*buffer->cur == ':')
3833             {
3834               /* C++11 [2.5/3 lex.pptoken], "Otherwise, if the next
3835                  three characters are <:: and the subsequent character
3836                  is neither : nor >, the < is treated as a preprocessor
3837                  token by itself".  */
3838               if (CPP_OPTION (pfile, cplusplus)
3839                   && CPP_OPTION (pfile, lang) != CLK_CXX98
3840                   && CPP_OPTION (pfile, lang) != CLK_GNUCXX
3841                   && buffer->cur[1] == ':'
3842                   && buffer->cur[2] != ':' && buffer->cur[2] != '>')
3843                 break;
3844
3845               buffer->cur++;
3846               result->flags |= DIGRAPH;
3847               result->type = CPP_OPEN_SQUARE;
3848             }
3849           else if (*buffer->cur == '%')
3850             {
3851               buffer->cur++;
3852               result->flags |= DIGRAPH;
3853               result->type = CPP_OPEN_BRACE;
3854             }
3855         }
3856       break;
3857
3858     case '>':
3859       result->type = CPP_GREATER;
3860       if (*buffer->cur == '=')
3861         buffer->cur++, result->type = CPP_GREATER_EQ;
3862       else if (*buffer->cur == '>')
3863         {
3864           buffer->cur++;
3865           IF_NEXT_IS ('=', CPP_RSHIFT_EQ, CPP_RSHIFT);
3866         }
3867       break;
3868
3869     case '%':
3870       result->type = CPP_MOD;
3871       if (*buffer->cur == '=')
3872         buffer->cur++, result->type = CPP_MOD_EQ;
3873       else if (CPP_OPTION (pfile, digraphs))
3874         {
3875           if (*buffer->cur == ':')
3876             {
3877               buffer->cur++;
3878               result->flags |= DIGRAPH;
3879               result->type = CPP_HASH;
                   /* "%:%:" is the digraph spelling of the paste operator.  */
3880               if (*buffer->cur == '%' && buffer->cur[1] == ':')
3881                 buffer->cur += 2, result->type = CPP_PASTE, result->val.token_no = 0;
3882             }
3883           else if (*buffer->cur == '>')
3884             {
3885               buffer->cur++;
3886               result->flags |= DIGRAPH;
3887               result->type = CPP_CLOSE_BRACE;
3888             }
3889         }
3890       break;
3891
3892     case '.':
3893       result->type = CPP_DOT;
3894       if (ISDIGIT (*buffer->cur))
3895         {
3896           struct normalize_state nst = INITIAL_NORMALIZE_STATE;
3897           result->type = CPP_NUMBER;
3898           lex_number (pfile, &result->val.str, &nst);
3899           warn_about_normalization (pfile, result, &nst);
3900         }
3901       else if (*buffer->cur == '.' && buffer->cur[1] == '.')
3902         buffer->cur += 2, result->type = CPP_ELLIPSIS;
3903       else if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
3904         buffer->cur++, result->type = CPP_DOT_STAR;
3905       break;
3906
3907     case '+':
3908       result->type = CPP_PLUS;
3909       if (*buffer->cur == '+')
3910         buffer->cur++, result->type = CPP_PLUS_PLUS;
3911       else if (*buffer->cur == '=')
3912         buffer->cur++, result->type = CPP_PLUS_EQ;
3913       break;
3914
3915     case '-':
3916       result->type = CPP_MINUS;
3917       if (*buffer->cur == '>')
3918         {
3919           buffer->cur++;
3920           result->type = CPP_DEREF;
3921           if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
3922             buffer->cur++, result->type = CPP_DEREF_STAR;
3923         }
3924       else if (*buffer->cur == '-')
3925         buffer->cur++, result->type = CPP_MINUS_MINUS;
3926       else if (*buffer->cur == '=')
3927         buffer->cur++, result->type = CPP_MINUS_EQ;
3928       break;
3929
3930     case '&':
3931       result->type = CPP_AND;
3932       if (*buffer->cur == '&')
3933         buffer->cur++, result->type = CPP_AND_AND;
3934       else if (*buffer->cur == '=')
3935         buffer->cur++, result->type = CPP_AND_EQ;
3936       break;
3937
3938     case '|':
3939       result->type = CPP_OR;
3940       if (*buffer->cur == '|')
3941         buffer->cur++, result->type = CPP_OR_OR;
3942       else if (*buffer->cur == '=')
3943         buffer->cur++, result->type = CPP_OR_EQ;
3944       break;
3945
3946     case ':':
3947       result->type = CPP_COLON;
3948       if (*buffer->cur == ':' && CPP_OPTION (pfile, scope))
3949         buffer->cur++, result->type = CPP_SCOPE;
3950       else if (*buffer->cur == '>' && CPP_OPTION (pfile, digraphs))
3951         {
3952           buffer->cur++;
3953           result->flags |= DIGRAPH;
3954           result->type = CPP_CLOSE_SQUARE;
3955         }
3956       break;
3957
3958     case '*': IF_NEXT_IS ('=', CPP_MULT_EQ, CPP_MULT); break;
3959     case '=': IF_NEXT_IS ('=', CPP_EQ_EQ, CPP_EQ); break;
3960     case '!': IF_NEXT_IS ('=', CPP_NOT_EQ, CPP_NOT); break;
3961     case '^': IF_NEXT_IS ('=', CPP_XOR_EQ, CPP_XOR); break;
3962     case '#': IF_NEXT_IS ('#', CPP_PASTE, CPP_HASH); result->val.token_no = 0; break;
3963
3964     case '?': result->type = CPP_QUERY; break;
3965     case '~': result->type = CPP_COMPL; break;
3966     case ',': result->type = CPP_COMMA; break;
3967     case '(': result->type = CPP_OPEN_PAREN; break;
3968     case ')': result->type = CPP_CLOSE_PAREN; break;
3969     case '[': result->type = CPP_OPEN_SQUARE; break;
3970     case ']': result->type = CPP_CLOSE_SQUARE; break;
3971     case '{': result->type = CPP_OPEN_BRACE; break;
3972     case '}': result->type = CPP_CLOSE_BRACE; break;
3973     case ';': result->type = CPP_SEMICOLON; break;
3974
3975       /* @ is a punctuator in Objective-C.  */
3976     case '@': result->type = CPP_ATSIGN; break;
3977
3978     default:
3979       {
             /* Back up so that BASE (and buffer->cur) address C again.  */
3980         const uchar *base = --buffer->cur;
3981
3982         /* Check for an extended identifier ($ or UCN or UTF-8).  */
3983         struct normalize_state nst = INITIAL_NORMALIZE_STATE;
3984         if (forms_identifier_p (pfile, true, &nst))
3985           {
3986             result->type = CPP_NAME;
3987             result->val.node.node = lex_identifier (pfile, base, true, &nst,
3988                                                     &result->val.node.spelling);
3989             warn_about_normalization (pfile, result, &nst);
3990             break;
3991           }
3992
3993         /* Otherwise this will form a CPP_OTHER token.  Parse valid UTF-8 as a
3994            single token.  */
3995         buffer->cur++;
3996         if (c >= utf8_signifier)
3997           {
3998             const uchar *pstr = base;
3999             cppchar_t s;
4000             if (_cpp_valid_utf8 (pfile, &pstr, buffer->rlimit, 0, NULL, &s))
4001               buffer->cur = pstr;
4002           }
4003         create_literal (pfile, result, base, buffer->cur - base, CPP_OTHER);
4004         break;
4005       }
4006
4007     }
4008
4009   /* Potentially convert the location of the token to a range.  */
4010   if (result->src_loc >= RESERVED_LOCATION_COUNT
4011       && result->type != CPP_EOF)
4012     {
4013       /* Ensure that any line notes are processed, so that we have the
4014          correct physical line/column for the end-point of the token even
4015          when a logical line is split via one or more backslashes.  */
4016       if (buffer->cur >= buffer->notes[buffer->cur_note].pos
4017           && !pfile->overlaid_buffer)
4018         _cpp_process_line_notes (pfile, false);
4019
4020       source_range tok_range;
4021       tok_range.m_start = result->src_loc;
4022       tok_range.m_finish
4023         = linemap_position_for_column (pfile->line_table,
4024                                        CPP_BUF_COLUMN (buffer, buffer->cur));
4025
4026       result->src_loc = COMBINE_LOCATION_DATA (pfile->line_table,
4027                                                result->src_loc,
4028                                                tok_range, NULL);
4029     }
4030
4031   return result;
4032 }
4033
4034 /* An upper bound on the number of bytes needed to spell TOKEN.
4035    Does not include preceding whitespace.  */
4036 unsigned int
4037 cpp_token_len (const cpp_token *token)
4038 {
4039   unsigned int len;
4040
4041   switch (TOKEN_SPELL (token))
4042     {
4043     default:            len = 6;                                break;
4044     case SPELL_LITERAL: len = token->val.str.len;               break;
4045     case SPELL_IDENT:   len = NODE_LEN (token->val.node.node) * 10;     break;
4046     }
4047
4048   return len;
4049 }
4050
/* Decode the UTF-8 sequence starting at NAME and write the equivalent
   \U escape (a backslash, 'U' and eight lower-case hex digits) to
   BUFFER.  Exactly 10 bytes are always written to BUFFER; no NUL is
   appended.

   Returns the number of bytes consumed from NAME, i.e. the count of
   leading one bits in the first byte (0 if that byte has its top bit
   clear).  Calls abort () if a continuation byte is not of the form
   10xxxxxx.  */

static size_t
utf8_to_ucn (unsigned char *buffer, const unsigned char *name)
{
  static const char hex_digits[] = "0123456789abcdef";
  unsigned probe = *name;
  unsigned long code_point;
  int seq_len = 0;
  int k, shift;

  /* The number of leading one bits in the first byte gives the
     length of the whole sequence.  */
  while (probe & 0x80)
    {
      seq_len++;
      probe <<= 1;
    }

  /* Payload bits of the lead byte.  */
  code_point = *name & (0x7F >> seq_len);

  /* Each continuation byte contributes its low six bits.  */
  for (k = 1; k < seq_len; k++)
    {
      const unsigned char trail = name[k];

      /* Ill-formed UTF-8.  */
      if ((trail & ~0x3F) != 0x80)
        abort ();

      code_point = (code_point << 6) | (trail & 0x3F);
    }

  buffer[0] = '\\';
  buffer[1] = 'U';
  /* Most significant nibble first.  */
  for (shift = 28, k = 2; shift >= 0; shift -= 4, k++)
    buffer[k] = hex_digits[(code_point >> shift) & 0xF];

  return seq_len;
}
4084
4085 /* Given a token TYPE corresponding to a digraph, return a pointer to
4086    the spelling of the digraph.  */
4087 static const unsigned char *
4088 cpp_digraph2name (enum cpp_ttype type)
4089 {
4090   return digraph_spellings[(int) type - (int) CPP_FIRST_DIGRAPH];
4091 }
4092
4093 /* Write the spelling of an identifier IDENT, using UCNs, to BUFFER.
4094    The buffer must already contain the enough space to hold the
4095    token's spelling.  Returns a pointer to the character after the
4096    last character written.  */
4097 unsigned char *
4098 _cpp_spell_ident_ucns (unsigned char *buffer, cpp_hashnode *ident)
4099 {
4100   size_t i;
4101   const unsigned char *name = NODE_NAME (ident);
4102
4103   for (i = 0; i < NODE_LEN (ident); i++)
4104     if (name[i] & ~0x7F)
4105       {
4106         i += utf8_to_ucn (buffer, name + i) - 1;
4107         buffer += 10;
4108       }
4109     else
4110       *buffer++ = name[i];
4111
4112   return buffer;
4113 }
4114
4115 /* Write the spelling of a token TOKEN to BUFFER.  The buffer must
4116    already contain the enough space to hold the token's spelling.
4117    Returns a pointer to the character after the last character written.
4118    FORSTRING is true if this is to be the spelling after translation
4119    phase 1 (with the original spelling of extended identifiers), false
4120    if extended identifiers should always be written using UCNs (there is
4121    no option for always writing them in the internal UTF-8 form).
4122    FIXME: Would be nice if we didn't need the PFILE argument.  */
4123 unsigned char *
4124 cpp_spell_token (cpp_reader *pfile, const cpp_token *token,
4125                  unsigned char *buffer, bool forstring)
4126 {
4127   switch (TOKEN_SPELL (token))
4128     {
4129     case SPELL_OPERATOR:
4130       {
4131         const unsigned char *spelling;
4132         unsigned char c;
4133
4134         if (token->flags & DIGRAPH)
4135           spelling = cpp_digraph2name (token->type);
4136         else if (token->flags & NAMED_OP)
4137           goto spell_ident;
4138         else
4139           spelling = TOKEN_NAME (token);
4140
4141         while ((c = *spelling++) != '\0')
4142           *buffer++ = c;
4143       }
4144       break;
4145
4146     spell_ident:
4147     case SPELL_IDENT:
4148       if (forstring)
4149         {
4150           memcpy (buffer, NODE_NAME (token->val.node.spelling),
4151                   NODE_LEN (token->val.node.spelling));
4152           buffer += NODE_LEN (token->val.node.spelling);
4153         }
4154       else
4155         buffer = _cpp_spell_ident_ucns (buffer, token->val.node.node);
4156       break;
4157
4158     case SPELL_LITERAL:
4159       memcpy (buffer, token->val.str.text, token->val.str.len);
4160       buffer += token->val.str.len;
4161       break;
4162
4163     case SPELL_NONE:
4164       cpp_error (pfile, CPP_DL_ICE,
4165                  "unspellable token %s", TOKEN_NAME (token));
4166       break;
4167     }
4168
4169   return buffer;
4170 }
4171
4172 /* Returns TOKEN spelt as a null-terminated string.  The string is
4173    freed when the reader is destroyed.  Useful for diagnostics.  */
4174 unsigned char *
4175 cpp_token_as_text (cpp_reader *pfile, const cpp_token *token)
4176 {
4177   unsigned int len = cpp_token_len (token) + 1;
4178   unsigned char *start = _cpp_unaligned_alloc (pfile, len), *end;
4179
4180   end = cpp_spell_token (pfile, token, start, false);
4181   end[0] = '\0';
4182
4183   return start;
4184 }
4185
4186 /* Returns a pointer to a string which spells the token defined by
4187    TYPE and FLAGS.  Used by C front ends, which really should move to
4188    using cpp_token_as_text.  */
4189 const char *
4190 cpp_type2name (enum cpp_ttype type, unsigned char flags)
4191 {
4192   if (flags & DIGRAPH)
4193     return (const char *) cpp_digraph2name (type);
4194   else if (flags & NAMED_OP)
4195     return cpp_named_operator2name (type);
4196
4197   return (const char *) token_spellings[type].name;
4198 }
4199
4200 /* Writes the spelling of token to FP, without any preceding space.
4201    Separated from cpp_spell_token for efficiency - to avoid stdio
4202    double-buffering.  */
4203 void
4204 cpp_output_token (const cpp_token *token, FILE *fp)
4205 {
4206   switch (TOKEN_SPELL (token))
4207     {
4208     case SPELL_OPERATOR:
4209       {
4210         const unsigned char *spelling;
4211         int c;
4212
4213         if (token->flags & DIGRAPH)
4214           spelling = cpp_digraph2name (token->type);
4215         else if (token->flags & NAMED_OP)
4216           goto spell_ident;
4217         else
4218           spelling = TOKEN_NAME (token);
4219
4220         c = *spelling;
4221         do
4222           putc (c, fp);
4223         while ((c = *++spelling) != '\0');
4224       }
4225       break;
4226
4227     spell_ident:
4228     case SPELL_IDENT:
4229       {
4230         size_t i;
4231         const unsigned char * name = NODE_NAME (token->val.node.node);
4232
4233         for (i = 0; i < NODE_LEN (token->val.node.node); i++)
4234           if (name[i] & ~0x7F)
4235             {
4236               unsigned char buffer[10];
4237               i += utf8_to_ucn (buffer, name + i) - 1;
4238               fwrite (buffer, 1, 10, fp);
4239             }
4240           else
4241             fputc (NODE_NAME (token->val.node.node)[i], fp);
4242       }
4243       break;
4244
4245     case SPELL_LITERAL:
4246       if (token->type == CPP_HEADER_NAME)
4247         fputc ('"', fp);
4248       fwrite (token->val.str.text, 1, token->val.str.len, fp);
4249       if (token->type == CPP_HEADER_NAME)
4250         fputc ('"', fp);
4251       break;
4252
4253     case SPELL_NONE:
4254       /* An error, most probably.  */
4255       break;
4256     }
4257 }
4258
4259 /* Compare two tokens.  */
4260 int
4261 _cpp_equiv_tokens (const cpp_token *a, const cpp_token *b)
4262 {
4263   if (a->type == b->type && a->flags == b->flags)
4264     switch (TOKEN_SPELL (a))
4265       {
4266       default:                  /* Keep compiler happy.  */
4267       case SPELL_OPERATOR:
4268         /* token_no is used to track where multiple consecutive ##
4269            tokens were originally located.  */
4270         return (a->type != CPP_PASTE || a->val.token_no == b->val.token_no);
4271       case SPELL_NONE:
4272         return (a->type != CPP_MACRO_ARG
4273                 || (a->val.macro_arg.arg_no == b->val.macro_arg.arg_no
4274                     && a->val.macro_arg.spelling == b->val.macro_arg.spelling));
4275       case SPELL_IDENT:
4276         return (a->val.node.node == b->val.node.node
4277                 && a->val.node.spelling == b->val.node.spelling);
4278       case SPELL_LITERAL:
4279         return (a->val.str.len == b->val.str.len
4280                 && !memcmp (a->val.str.text, b->val.str.text,
4281                             a->val.str.len));
4282       }
4283
4284   return 0;
4285 }
4286
4287 /* Returns nonzero if a space should be inserted to avoid an
4288    accidental token paste for output.  For simplicity, it is
4289    conservative, and occasionally advises a space where one is not
4290    needed, e.g. "." and ".2".  */
4291 int
4292 cpp_avoid_paste (cpp_reader *pfile, const cpp_token *token1,
4293                  const cpp_token *token2)
4294 {
4295   enum cpp_ttype a = token1->type, b = token2->type;
4296   cppchar_t c;
4297
4298   if (token1->flags & NAMED_OP)
4299     a = CPP_NAME;
4300   if (token2->flags & NAMED_OP)
4301     b = CPP_NAME;
4302
4303   c = EOF;
4304   if (token2->flags & DIGRAPH)
4305     c = digraph_spellings[(int) b - (int) CPP_FIRST_DIGRAPH][0];
4306   else if (token_spellings[b].category == SPELL_OPERATOR)
4307     c = token_spellings[b].name[0];
4308
4309   /* Quickly get everything that can paste with an '='.  */
4310   if ((int) a <= (int) CPP_LAST_EQ && c == '=')
4311     return 1;
4312
4313   switch (a)
4314     {
4315     case CPP_GREATER:   return c == '>';
4316     case CPP_LESS:      return c == '<' || c == '%' || c == ':';
4317     case CPP_PLUS:      return c == '+';
4318     case CPP_MINUS:     return c == '-' || c == '>';
4319     case CPP_DIV:       return c == '/' || c == '*'; /* Comments.  */
4320     case CPP_MOD:       return c == ':' || c == '>';
4321     case CPP_AND:       return c == '&';
4322     case CPP_OR:        return c == '|';
4323     case CPP_COLON:     return c == ':' || c == '>';
4324     case CPP_DEREF:     return c == '*';
4325     case CPP_DOT:       return c == '.' || c == '%' || b == CPP_NUMBER;
4326     case CPP_HASH:      return c == '#' || c == '%'; /* Digraph form.  */
4327     case CPP_PRAGMA:
4328     case CPP_NAME:      return ((b == CPP_NUMBER
4329                                  && name_p (pfile, &token2->val.str))
4330                                 || b == CPP_NAME
4331                                 || b == CPP_CHAR || b == CPP_STRING); /* L */
4332     case CPP_NUMBER:    return (b == CPP_NUMBER || b == CPP_NAME
4333                                 || b == CPP_CHAR
4334                                 || c == '.' || c == '+' || c == '-');
4335                                       /* UCNs */
4336     case CPP_OTHER:     return ((token1->val.str.text[0] == '\\'
4337                                  && b == CPP_NAME)
4338                                 || (CPP_OPTION (pfile, objc)
4339                                     && token1->val.str.text[0] == '@'
4340                                     && (b == CPP_NAME || b == CPP_STRING)));
4341     case CPP_LESS_EQ:   return c == '>';
4342     case CPP_STRING:
4343     case CPP_WSTRING:
4344     case CPP_UTF8STRING:
4345     case CPP_STRING16:
4346     case CPP_STRING32:  return (CPP_OPTION (pfile, user_literals)
4347                                 && (b == CPP_NAME
4348                                     || (TOKEN_SPELL (token2) == SPELL_LITERAL
4349                                         && ISIDST (token2->val.str.text[0]))));
4350
4351     default:            break;
4352     }
4353
4354   return 0;
4355 }
4356
4357 /* Output all the remaining tokens on the current line, and a newline
4358    character, to FP.  Leading whitespace is removed.  If there are
4359    macros, special token padding is not performed.  */
4360 void
4361 cpp_output_line (cpp_reader *pfile, FILE *fp)
4362 {
4363   const cpp_token *token;
4364
4365   token = cpp_get_token (pfile);
4366   while (token->type != CPP_EOF)
4367     {
4368       cpp_output_token (token, fp);
4369       token = cpp_get_token (pfile);
4370       if (token->flags & PREV_WHITE)
4371         putc (' ', fp);
4372     }
4373
4374   putc ('\n', fp);
4375 }
4376
4377 /* Return a string representation of all the remaining tokens on the
4378    current line.  The result is allocated using xmalloc and must be
4379    freed by the caller.  */
4380 unsigned char *
4381 cpp_output_line_to_string (cpp_reader *pfile, const unsigned char *dir_name)
4382 {
4383   const cpp_token *token;
4384   unsigned int out = dir_name ? ustrlen (dir_name) : 0;
4385   unsigned int alloced = 120 + out;
4386   unsigned char *result = (unsigned char *) xmalloc (alloced);
4387
4388   /* If DIR_NAME is empty, there are no initial contents.  */
4389   if (dir_name)
4390     {
4391       sprintf ((char *) result, "#%s ", dir_name);
4392       out += 2;
4393     }
4394
4395   token = cpp_get_token (pfile);
4396   while (token->type != CPP_EOF)
4397     {
4398       unsigned char *last;
4399       /* Include room for a possible space and the terminating nul.  */
4400       unsigned int len = cpp_token_len (token) + 2;
4401
4402       if (out + len > alloced)
4403         {
4404           alloced *= 2;
4405           if (out + len > alloced)
4406             alloced = out + len;
4407           result = (unsigned char *) xrealloc (result, alloced);
4408         }
4409
4410       last = cpp_spell_token (pfile, token, &result[out], 0);
4411       out = last - result;
4412
4413       token = cpp_get_token (pfile);
4414       if (token->flags & PREV_WHITE)
4415         result[out++] = ' ';
4416     }
4417
4418   result[out] = '\0';
4419   return result;
4420 }
4421
/* Memory buffers.  Changing these three constants can have a dramatic
   effect on performance.  The values here are reasonable defaults,
   but might be tuned.  If you adjust them, be sure to test across a
   range of uses of cpplib, including heavy nested function-like macro
   expansion.  Also check the change in peak memory usage (NJAMD is a
   good tool for this).

   MIN_BUFF_SIZE is the smallest data area new_buff will allocate.
   BUFF_SIZE_UPPER_BOUND (MIN_SIZE) is the largest free buffer that
   _cpp_get_buff will hand out for a request of MIN_SIZE bytes.
   EXTENDED_BUFF_SIZE (BUFF, MIN_EXTRA) is MIN_EXTRA plus twice the
   distance from BUFF->cur to BUFF->limit.  Every macro argument is
   parenthesized so that arbitrary expressions expand correctly.  */
#define MIN_BUFF_SIZE 8000
#define BUFF_SIZE_UPPER_BOUND(MIN_SIZE) (MIN_BUFF_SIZE + (MIN_SIZE) * 3 / 2)
#define EXTENDED_BUFF_SIZE(BUFF, MIN_EXTRA) \
        ((MIN_EXTRA) + ((BUFF)->limit - (BUFF)->cur) * 2)

#if MIN_BUFF_SIZE > BUFF_SIZE_UPPER_BOUND (0)
  #error BUFF_SIZE_UPPER_BOUND must be at least as large as MIN_BUFF_SIZE!
#endif
4436
4437 /* Create a new allocation buffer.  Place the control block at the end
4438    of the buffer, so that buffer overflows will cause immediate chaos.  */
4439 static _cpp_buff *
4440 new_buff (size_t len)
4441 {
4442   _cpp_buff *result;
4443   unsigned char *base;
4444
4445   if (len < MIN_BUFF_SIZE)
4446     len = MIN_BUFF_SIZE;
4447   len = CPP_ALIGN (len);
4448
4449 #ifdef ENABLE_VALGRIND_ANNOTATIONS
4450   /* Valgrind warns about uses of interior pointers, so put _cpp_buff
4451      struct first.  */
4452   size_t slen = CPP_ALIGN2 (sizeof (_cpp_buff), 2 * DEFAULT_ALIGNMENT);
4453   base = XNEWVEC (unsigned char, len + slen);
4454   result = (_cpp_buff *) base;
4455   base += slen;
4456 #else
4457   base = XNEWVEC (unsigned char, len + sizeof (_cpp_buff));
4458   result = (_cpp_buff *) (base + len);
4459 #endif
4460   result->base = base;
4461   result->cur = base;
4462   result->limit = base + len;
4463   result->next = NULL;
4464   return result;
4465 }
4466
4467 /* Place a chain of unwanted allocation buffers on the free list.  */
4468 void
4469 _cpp_release_buff (cpp_reader *pfile, _cpp_buff *buff)
4470 {
4471   _cpp_buff *end = buff;
4472
4473   while (end->next)
4474     end = end->next;
4475   end->next = pfile->free_buffs;
4476   pfile->free_buffs = buff;
4477 }
4478
4479 /* Return a free buffer of size at least MIN_SIZE.  */
4480 _cpp_buff *
4481 _cpp_get_buff (cpp_reader *pfile, size_t min_size)
4482 {
4483   _cpp_buff *result, **p;
4484
4485   for (p = &pfile->free_buffs;; p = &(*p)->next)
4486     {
4487       size_t size;
4488
4489       if (*p == NULL)
4490         return new_buff (min_size);
4491       result = *p;
4492       size = result->limit - result->base;
4493       /* Return a buffer that's big enough, but don't waste one that's
4494          way too big.  */
4495       if (size >= min_size && size <= BUFF_SIZE_UPPER_BOUND (min_size))
4496         break;
4497     }
4498
4499   *p = result->next;
4500   result->next = NULL;
4501   result->cur = result->base;
4502   return result;
4503 }
4504
4505 /* Creates a new buffer with enough space to hold the uncommitted
4506    remaining bytes of BUFF, and at least MIN_EXTRA more bytes.  Copies
4507    the excess bytes to the new buffer.  Chains the new buffer after
4508    BUFF, and returns the new buffer.  */
4509 _cpp_buff *
4510 _cpp_append_extend_buff (cpp_reader *pfile, _cpp_buff *buff, size_t min_extra)
4511 {
4512   size_t size = EXTENDED_BUFF_SIZE (buff, min_extra);
4513   _cpp_buff *new_buff = _cpp_get_buff (pfile, size);
4514
4515   buff->next = new_buff;
4516   memcpy (new_buff->base, buff->cur, BUFF_ROOM (buff));
4517   return new_buff;
4518 }
4519
4520 /* Creates a new buffer with enough space to hold the uncommitted
4521    remaining bytes of the buffer pointed to by BUFF, and at least
4522    MIN_EXTRA more bytes.  Copies the excess bytes to the new buffer.
4523    Chains the new buffer before the buffer pointed to by BUFF, and
4524    updates the pointer to point to the new buffer.  */
4525 void
4526 _cpp_extend_buff (cpp_reader *pfile, _cpp_buff **pbuff, size_t min_extra)
4527 {
4528   _cpp_buff *new_buff, *old_buff = *pbuff;
4529   size_t size = EXTENDED_BUFF_SIZE (old_buff, min_extra);
4530
4531   new_buff = _cpp_get_buff (pfile, size);
4532   memcpy (new_buff->base, old_buff->cur, BUFF_ROOM (old_buff));
4533   new_buff->next = old_buff;
4534   *pbuff = new_buff;
4535 }
4536
4537 /* Free a chain of buffers starting at BUFF.  */
4538 void
4539 _cpp_free_buff (_cpp_buff *buff)
4540 {
4541   _cpp_buff *next;
4542
4543   for (; buff; buff = next)
4544     {
4545       next = buff->next;
4546 #ifdef ENABLE_VALGRIND_ANNOTATIONS
4547       free (buff);
4548 #else
4549       free (buff->base);
4550 #endif
4551     }
4552 }
4553
4554 /* Allocate permanent, unaligned storage of length LEN.  */
4555 unsigned char *
4556 _cpp_unaligned_alloc (cpp_reader *pfile, size_t len)
4557 {
4558   _cpp_buff *buff = pfile->u_buff;
4559   unsigned char *result = buff->cur;
4560
4561   if (len > (size_t) (buff->limit - result))
4562     {
4563       buff = _cpp_get_buff (pfile, len);
4564       buff->next = pfile->u_buff;
4565       pfile->u_buff = buff;
4566       result = buff->cur;
4567     }
4568
4569   buff->cur = result + len;
4570   return result;
4571 }
4572
4573 /* Allocate permanent, unaligned storage of length LEN from a_buff.
4574    That buffer is used for growing allocations when saving macro
4575    replacement lists in a #define, and when parsing an answer to an
4576    assertion in #assert, #unassert or #if (and therefore possibly
4577    whilst expanding macros).  It therefore must not be used by any
4578    code that they might call: specifically the lexer and the guts of
4579    the macro expander.
4580
4581    All existing other uses clearly fit this restriction: storing
4582    registered pragmas during initialization.  */
4583 unsigned char *
4584 _cpp_aligned_alloc (cpp_reader *pfile, size_t len)
4585 {
4586   _cpp_buff *buff = pfile->a_buff;
4587   unsigned char *result = buff->cur;
4588
4589   if (len > (size_t) (buff->limit - result))
4590     {
4591       buff = _cpp_get_buff (pfile, len);
4592       buff->next = pfile->a_buff;
4593       pfile->a_buff = buff;
4594       result = buff->cur;
4595     }
4596
4597   buff->cur = result + len;
4598   return result;
4599 }
4600
4601 /* Commit or allocate storage from a buffer.  */
4602
4603 void *
4604 _cpp_commit_buff (cpp_reader *pfile, size_t size)
4605 {
4606   void *ptr = BUFF_FRONT (pfile->a_buff);
4607
4608   if (pfile->hash_table->alloc_subobject)
4609     {
4610       void *copy = pfile->hash_table->alloc_subobject (size);
4611       memcpy (copy, ptr, size);
4612       ptr = copy;
4613     }
4614   else
4615     BUFF_FRONT (pfile->a_buff) += size;
4616
4617   return ptr;
4618 }
4619
4620 /* Say which field of TOK is in use.  */
4621
4622 enum cpp_token_fld_kind
4623 cpp_token_val_index (const cpp_token *tok)
4624 {
4625   switch (TOKEN_SPELL (tok))
4626     {
4627     case SPELL_IDENT:
4628       return CPP_TOKEN_FLD_NODE;
4629     case SPELL_LITERAL:
4630       return CPP_TOKEN_FLD_STR;
4631     case SPELL_OPERATOR:
4632       /* Operands which were originally spelled as ident keep around
4633          the node for the exact spelling.  */
4634       if (tok->flags & NAMED_OP)
4635         return CPP_TOKEN_FLD_NODE;
4636       else if (tok->type == CPP_PASTE)
4637         return CPP_TOKEN_FLD_TOKEN_NO;
4638       else
4639         return CPP_TOKEN_FLD_NONE;
4640     case SPELL_NONE:
4641       if (tok->type == CPP_MACRO_ARG)
4642         return CPP_TOKEN_FLD_ARG_NO;
4643       else if (tok->type == CPP_PADDING)
4644         return CPP_TOKEN_FLD_SOURCE;
4645       else if (tok->type == CPP_PRAGMA)
4646         return CPP_TOKEN_FLD_PRAGMA;
4647       /* fall through */
4648     default:
4649       return CPP_TOKEN_FLD_NONE;
4650     }
4651 }
4652
4653 /* All tokens lexed in R after calling this function will be forced to
4654    have their location_t to be P, until
4655    cpp_stop_forcing_token_locations is called for R.  */
4656
4657 void
4658 cpp_force_token_locations (cpp_reader *r, location_t loc)
4659 {
4660   r->forced_token_location = loc;
4661 }
4662
4663 /* Go back to assigning locations naturally for lexed tokens.  */
4664
4665 void
4666 cpp_stop_forcing_token_locations (cpp_reader *r)
4667 {
4668   r->forced_token_location = 0;
4669 }
4670
/* PEEK points at a backslash.  While that backslash escapes an end of
   line ("\n", "\r" or "\r\n"), look past it, repeating if the next
   character is again a backslash.  A position at or beyond LIMIT is
   never stepped to; in that case the last backslash reached is
   returned.  */

static const unsigned char *
do_peek_backslash (const unsigned char *peek, const unsigned char *limit)
{
  for (;;)
    {
      const unsigned char *after;

      if (__builtin_expect (peek[1] == '\n', true))
        after = peek + 2;
      else if (__builtin_expect (peek[1] == '\r', false))
        /* A '\n' directly after the '\r' belongs to the same line
           ending.  */
        after = peek + (peek[2] == '\n' ? 3 : 2);
      else
        /* Not an escaped end of line.  */
        return peek;

      if (!__builtin_expect (after < limit, true))
        return peek;

      peek = after;
      if (*peek != '\\')
        return peek;

      /* The user might be perverse: another backslash follows.  */
    }
}
4700
4701 static const unsigned char *
4702 do_peek_next (const unsigned char *peek, const unsigned char *limit)
4703 {
4704   if (__builtin_expect (*peek == '\\', false))
4705     peek = do_peek_backslash (peek, limit);
4706   return peek;
4707 }
4708
/* Step backwards from PEEK and return a pointer to the preceding
   character, never reading below BOUND.  Returns NULL when PEEK is
   already at BOUND.

   If the preceding character is a line ending ("\n", "\r" or the
   pair "\r\n") that is itself preceded by a backslash, the escaped
   line ending is skipped and the search continues from the backslash.
   A line ending that is not escaped, or that sits at BOUND, is
   returned as-is.  */
static const unsigned char *
do_peek_prev (const unsigned char *peek, const unsigned char *bound)
{
  if (peek == bound)
    return NULL;

  unsigned char c = *--peek;
  /* Test for carriage return '\r', not the letter 'r': otherwise a
     backslash followed by a bare CR is not recognized as a line
     splice, and the two characters '\\' 'r' are wrongly taken for
     one.  */
  if (__builtin_expect (c == '\n', false)
      || __builtin_expect (c == '\r', false))
    {
      if (peek == bound)
        return peek;

      /* Offset from PEEK of the character that would have to be the
         escaping backslash.  */
      int ix = -1;
      if (c == '\n' && peek[ix] == '\r')
        {
          /* "\r\n" pair: the backslash, if any, is one further
             back.  */
          if (peek + ix == bound)
            return peek;
          ix--;
        }

      if (peek[ix] == '\\')
        return do_peek_prev (peek + ix, bound);

      return peek;
    }
  else
    return peek;
}
4737
4738 /* If PEEK[-1] is identifier MATCH, scan past it and trailing white
4739    space.  Otherwise return NULL.  */
4740
4741 static const unsigned char *
4742 do_peek_ident (const char *match, const unsigned char *peek,
4743                const unsigned char *limit)
4744 {
4745   for (; *++match; peek++)
4746     if (*peek != *match)
4747       {
4748         peek = do_peek_next (peek, limit);
4749         if (*peek != *match)
4750           return NULL;
4751       }
4752
4753   /* Must now not be looking at an identifier char.  */
4754   peek = do_peek_next (peek, limit);
4755   if (ISIDNUM (*peek))
4756     return NULL;
4757
4758   /* Skip control-line whitespace.  */
4759  ws:
4760   while (*peek == ' ' || *peek == '\t')
4761     peek++;
4762   if (__builtin_expect (*peek == '\\', false))
4763     {
4764       peek = do_peek_backslash (peek, limit);
4765       if (*peek != '\\')
4766         goto ws;
4767     }
4768
4769   return peek;
4770 }
4771
4772 /* Are we looking at a module control line starting as PEEK - 1?
        C is tested against the first letter of each candidate keyword
        and PEEK against the letters after it.  The keyword forms
        accepted are "export" followed by "import" or "module",
        "import", "__import" and "module".  After the keyword (and any
        whitespace skipped by do_peek_ident) the next character must
        look like the start of a token that may follow it; anything
        else yields false.  LIMIT is handed on to the peeking helpers.
        PFILE supplies the rliterals and digraphs options.  */
4773
4774 static bool
4775 do_peek_module (cpp_reader *pfile, unsigned char c,
4776                 const unsigned char *peek, const unsigned char *limit)
4777 {
        /* Set when the keyword found is "import" or "__import"; stays
           false for "module".  */
4778   bool import = false;
4779
4780   if (__builtin_expect (c == 'e', false))
4781     {
4782       if (!((peek[0] == 'x' || peek[0] == '\\')
4783             && (peek = do_peek_ident ("export", peek, limit))))
4784         return false;
4785
4786       /* export, peek for import or module.  No need to peek __import
4787          here.  */
4788       if (peek[0] == 'i')
4789         {
4790           if (!((peek[1] == 'm' || peek[1] == '\\')
4791                 && (peek = do_peek_ident ("import", peek + 1, limit))))
4792             return false;
4793           import = true;
4794         }
4795       else if (peek[0] == 'm')
4796         {
4797           if (!((peek[1] == 'o' || peek[1] == '\\')
4798                 && (peek = do_peek_ident ("module", peek + 1, limit))))
4799             return false;
4800         }
4801       else
4802         return false;
4803     }
4804   else if (__builtin_expect (c == 'i', false))
4805     {
4806       if (!((peek[0] == 'm' || peek[0] == '\\')
4807             && (peek = do_peek_ident ("import", peek, limit))))
4808         return false;
4809       import = true;
4810     }
4811   else if (__builtin_expect (c == '_', false))
4812     {
4813       /* Needed for translated includes.   */
4814       if (!((peek[0] == '_' || peek[0] == '\\')
4815             && (peek = do_peek_ident ("__import", peek, limit))))
4816         return false;
4817       import = true;
4818     }
4819   else if (__builtin_expect (c == 'm', false))
4820     {
4821       if (!((peek[0] == 'o' || peek[0] == '\\')
4822             && (peek = do_peek_ident ("module", peek, limit))))
4823         return false;
4824     }
4825   else
4826     return false;
4827
4828   /* Peek the next character to see if it's good enough.  We'll be at
4829      the first non-whitespace char, including skipping an escaped
4830      newline.  */
4831   /* ... import followed by identifier, ':', '<' or header-name
4832      preprocessing tokens, or module followed by identifier, ':' or
4833      ';' preprocessing tokens.  */
4834   unsigned char p = *peek++;
4835
4836   /* A character literal is ... single quotes, ... optionally preceded
4837      by u8, u, U, or L */
4838   /* A string-literal is a ... double quotes, optionally prefixed by
4839      R, u8, u8R, u, uR, U, UR, L, or LR */
        /* The branches below reject the keyword when what follows is a
           prefixed character or string literal rather than an
           identifier that merely starts with the same letters.  */
4840   if (p == 'u')
4841     {
4842       peek = do_peek_next (peek, limit);
4843       if (*peek == '8')
4844         {
4845           peek++;
4846           goto peek_u8;
4847         }
4848       goto peek_u;
4849     }
4850   else if (p == 'U' || p == 'L')
4851     {
4852     peek_u8:
4853       peek = do_peek_next (peek, limit);
4854     peek_u:
4855       if (*peek == '\"' || *peek == '\'')
4856         return false;
4857
           /* NOTE(review): on this jump PEEK still points at the 'R',
              whereas the P == 'R' entry to peek_R has already stepped
              past it, so the '"' test under peek_R appears to examine
              the 'R' itself on this path -- confirm intent.  */
4858       if (*peek == 'R')
4859         goto peek_R;
4860       /* Identifier. Ok.  */
4861     }
4862   else if (p == 'R')
4863     {
4864     peek_R:
4865       if (CPP_OPTION (pfile, rliterals))
4866         {
4867           peek = do_peek_next (peek, limit);
4868           if (*peek == '\"')
4869             return false;
4870         }
4871       /* Identifier. Ok.  */
4872     }
        /* When the letters are contiguous in the execution character
           set, plain range checks are used; otherwise ISIDST decides.  */
4873   else if ('Z' - 'A' == 25
4874            ? ((p >= 'A' && p <= 'Z') || (p >= 'a' && p <= 'z') || p == '_')
4875            : ISIDST (p))
4876     {
4877       /* Identifier.  Ok. */
4878     }
4879   else if (p == '<')
4880     {
4881       /* Maybe angle header, ok for import.  Reject
4882          '<=', '<<' digraph:'<:'.  */
4883       if (!import)
4884         return false;
4885       peek = do_peek_next (peek, limit);
4886       if (*peek == '=' || *peek == '<'
4887           || (*peek == ':' && CPP_OPTION (pfile, digraphs)))
4888         return false;
4889     }
4890   else if (p == ';')
4891     {
4892       /* SEMICOLON, ok for module.  */
4893       if (import)
4894         return false;
4895     }
4896   else if (p == '"')
4897     {
4898       /* STRING, ok for import.  */
4899       if (!import)
4900         return false;
4901     }
4902   else if (p == ':')
4903     {
4904       /* Maybe COLON, ok.  Reject '::', digraph:':>'.  */
4905       peek = do_peek_next (peek, limit);
4906       if (*peek == ':' || (*peek == '>' && CPP_OPTION (pfile, digraphs)))
4907         return false;
4908     }
4909   else
4910     /* FIXME: Detect a unicode character, excluding those not
4911        permitted as the initial character. [lex.name]/1.  I presume
4912        we need to check the \[uU] spellings, and directly using
4913        Unicode in say UTF8 form?  Or perhaps we do the phase-1
4914        conversion of UTF8 to universal-character-names?  */
4915     return false;
4916
4917   return true;
4918 }
4919
4920 /* Directives-only scanning.  Somewhat more relaxed than correct
4921    parsing -- some ill-formed programs will not be rejected.  */
4922
4923 void
4924 cpp_directive_only_process (cpp_reader *pfile,
4925                             void *data,
4926                             void (*cb) (cpp_reader *, CPP_DO_task, void *, ...))
4927 {
4928   bool module_p = CPP_OPTION (pfile, module_directives);
4929
4930   do
4931     {
4932     restart:
4933       /* Buffer initialization, but no line cleaning. */
4934       cpp_buffer *buffer = pfile->buffer;
4935       buffer->cur_note = buffer->notes_used = 0;
4936       buffer->cur = buffer->line_base = buffer->next_line;
4937       buffer->need_line = false;
4938       /* Files always end in a newline or carriage return.  We rely on this for
4939          character peeking safety.  */
4940       gcc_assert (buffer->rlimit[0] == '\n' || buffer->rlimit[0] == '\r');
4941
4942       const unsigned char *base = buffer->cur;
4943       unsigned line_count = 0;
4944       const unsigned char *line_start = base;
4945
4946       bool bol = true;
4947       bool raw = false;
4948
4949       const unsigned char *lwm = base;
4950       for (const unsigned char *pos = base, *limit = buffer->rlimit;
4951            pos < limit;)
4952         {
4953           unsigned char c = *pos++;
4954           /* This matches the switch in _cpp_lex_direct.  */
4955           switch (c)
4956             {
4957             case ' ': case '\t': case '\f': case '\v':
4958               /* Whitespace, do nothing.  */
4959               break;
4960
4961             case '\r': /* MAC line ending, or Windows \r\n  */
4962               if (*pos == '\n')
4963                 pos++;
4964               /* FALLTHROUGH */
4965
4966             case '\n':
4967               bol = true;
4968
4969             next_line:
4970               CPP_INCREMENT_LINE (pfile, 0);
4971               line_count++;
4972               line_start = pos;
4973               break;
4974
4975             case '\\':
4976               /* <backslash><newline> is removed, and doesn't undo any
4977                  preceding escape or whatnot.  */
4978               if (*pos == '\n')
4979                 {
4980                   pos++;
4981                   goto next_line;
4982                 }
4983               else if (*pos == '\r')
4984                 {
4985                   if (pos[1] == '\n')
4986                     pos++;
4987                   pos++;
4988                   goto next_line;
4989                 }
4990               goto dflt;
4991
4992             case '#':
4993               if (bol)
4994                 {
4995                   /* Line directive.  */
4996                   if (pos - 1 > base && !pfile->state.skipping)
4997                     cb (pfile, CPP_DO_print, data,
4998                         line_count, base, pos - 1 - base);
4999
5000                   /* Prep things for directive handling. */
5001                   buffer->next_line = pos;
5002                   buffer->need_line = true;
5003                   bool ok = _cpp_get_fresh_line (pfile);
5004                   gcc_checking_assert (ok);
5005
5006                   /* Ensure proper column numbering for generated
5007                      error messages. */
5008                   buffer->line_base -= pos - line_start;
5009
5010                   _cpp_handle_directive (pfile, line_start + 1 != pos);
5011
5012                   /* Sanitize the line settings.  Duplicate #include's can
5013                      mess things up. */
5014                   // FIXME: Necessary?
5015                   pfile->line_table->highest_location
5016                     = pfile->line_table->highest_line;
5017
5018                   if (!pfile->state.skipping
5019                       && pfile->buffer->next_line < pfile->buffer->rlimit)
5020                     cb (pfile, CPP_DO_location, data,
5021                         pfile->line_table->highest_line);
5022
5023                   goto restart;
5024                 }
5025               goto dflt;
5026
5027             case '/':
5028               {
5029                 const unsigned char *peek = do_peek_next (pos, limit);
5030                 if (!(*peek == '/' || *peek == '*'))
5031                   goto dflt;
5032
5033                 /* Line or block comment  */
5034                 bool is_block = *peek == '*';
5035                 bool star = false;
5036                 bool esc = false;
5037                 location_t sloc
5038                   = linemap_position_for_column (pfile->line_table,
5039                                                  pos - line_start);
5040
5041                 while (pos < limit)
5042                   {
5043                     char c = *pos++;
5044                     switch (c)
5045                       {
5046                       case '\\':
5047                         esc = true;
5048                         break;
5049
5050                       case '\r':
5051                         if (*pos == '\n')
5052                           pos++;
5053                         /* FALLTHROUGH  */
5054
5055                       case '\n':
5056                         {
5057                           CPP_INCREMENT_LINE (pfile, 0);
5058                           line_count++;
5059                           line_start = pos;
5060                           if (!esc && !is_block)
5061                             {
5062                               bol = true;
5063                               goto done_comment;
5064                             }
5065                         }
5066                         if (!esc)
5067                           star = false;
5068                         esc = false;
5069                         break;
5070
5071                       case '*':
5072                         if (pos > peek)
5073                           star = is_block;
5074                         esc = false;
5075                         break;
5076
5077                       case '/':
5078                         if (star)
5079                           goto done_comment;
5080                         /* FALLTHROUGH  */
5081
5082                       default:
5083                         star = false;
5084                         esc = false;
5085                         break;
5086                       }
5087                   }
5088                 if (pos < limit || is_block)
5089                   cpp_error_with_line (pfile, CPP_DL_ERROR, sloc, 0,
5090                                        "unterminated comment");
5091               done_comment:
5092                 lwm = pos;
5093                 break;
5094               }
5095
5096             case '\'':
5097               if (!CPP_OPTION (pfile, digit_separators))
5098                 goto delimited_string;
5099
5100               /* Possibly a number punctuator.  */
5101               if (!ISIDNUM (*do_peek_next (pos, limit)))
5102                 goto delimited_string;
5103
5104               goto quote_peek;
5105
5106             case '\"':
5107               if (!CPP_OPTION (pfile, rliterals))
5108                 goto delimited_string;
5109
5110             quote_peek:
5111               {
5112                 /* For ' see if it's a number punctuator
5113                    \.?<digit>(<digit>|<identifier-nondigit>
5114                    |'<digit>|'<nondigit>|[eEpP]<sign>|\.)* */
5115                 /* For " see if it's a raw string
5116                    {U,L,u,u8}R.  This includes CPP_NUMBER detection,
5117                    because that could be 0e+R.  */
5118                 const unsigned char *peek = pos - 1;
5119                 bool quote_first = c == '"';
5120                 bool quote_eight = false;
5121                 bool maybe_number_start = false;
5122                 bool want_number = false;
5123
5124                 while ((peek = do_peek_prev (peek, lwm)))
5125                   {
5126                     unsigned char p = *peek;
5127                     if (quote_first)
5128                       {
5129                         if (!raw)
5130                           {
5131                             if (p != 'R')
5132                               break;
5133                             raw = true;
5134                             continue;
5135                           }
5136
5137                         quote_first = false;
5138                         if (p == 'L' || p == 'U' || p == 'u')
5139                           ;
5140                         else if (p == '8')
5141                           quote_eight = true;
5142                         else
5143                           goto second_raw;
5144                       }
5145                     else if (quote_eight)
5146                       {
5147                         if (p != 'u')
5148                           {
5149                             raw = false;
5150                             break;
5151                           }
5152                         quote_eight = false;
5153                       }
5154                     else if (c == '"')
5155                       {
5156                       second_raw:;
5157                         if (!want_number && ISIDNUM (p))
5158                           {
5159                             raw = false;
5160                             break;
5161                           }
5162                       }
5163
5164                     if (ISDIGIT (p))
5165                       maybe_number_start = true;
5166                     else if (p == '.')
5167                       want_number = true;
5168                     else if (ISIDNUM (p))
5169                       maybe_number_start = false;
5170                     else if (p == '+' || p == '-')
5171                       {
5172                         if (const unsigned char *peek_prev
5173                             = do_peek_prev (peek, lwm))
5174                           {
5175                             p = *peek_prev;
5176                             if (p == 'e' || p == 'E'
5177                                 || p == 'p' || p == 'P')
5178                               {
5179                                 want_number = true;
5180                                 maybe_number_start = false;
5181                               }
5182                             else
5183                               break;
5184                           }
5185                         else
5186                           break;
5187                       }
5188                     else if (p == '\'' || p == '\"')
5189                       {
5190                         /* If this is lwm, this must be the end of a
5191                            previous string.  So this is a trailing
5192                            literal type, (a) if those are allowed,
5193                              and (b) maybe_start is false.  Otherwise
5194                              this must be a CPP_NUMBER because we've
5195                              met another ', and we'd have checked that
5196                              in its own right.  */
5197                         if (peek == lwm && CPP_OPTION (pfile, uliterals))
5198                           {
5199                             if  (!maybe_number_start && !want_number)
5200                               /* Must be a literal type.  */
5201                               raw = false;
5202                           }
5203                         else if (p == '\''
5204                                  && CPP_OPTION (pfile, digit_separators))
5205                           maybe_number_start = true;
5206                         break;
5207                       }
5208                     else if (c == '\'')
5209                       break;
5210                     else if (!quote_first && !quote_eight)
5211                       break;
5212                   }
5213
5214                 if (maybe_number_start)
5215                   {
5216                     if (c == '\'')
5217                       /* A CPP NUMBER.  */
5218                       goto dflt;
5219                     raw = false;
5220                   }
5221
5222                 goto delimited_string;
5223               }
5224
5225             delimited_string:
5226               {
5227                 /* (Possibly raw) string or char literal.  */
5228                 unsigned char end = c;
5229                 int delim_len = -1;
5230                 const unsigned char *delim = NULL;
5231                 location_t sloc = linemap_position_for_column (pfile->line_table,
5232                                                                pos - line_start);
5233                 int esc = 0;
5234
5235                 if (raw)
5236                   {
5237                     /* There can be no line breaks in the delimiter.  */
5238                     delim = pos;
5239                     for (delim_len = 0; (c = *pos++) != '('; delim_len++)
5240                       {
5241                         if (delim_len == 16)
5242                           {
5243                             cpp_error_with_line (pfile, CPP_DL_ERROR,
5244                                                  sloc, 0,
5245                                                  "raw string delimiter"
5246                                                  " longer than %d"
5247                                                  " characters",
5248                                                  delim_len);
5249                             raw = false;
5250                             pos = delim;
5251                             break;
5252                           }
5253                         if (strchr (") \\\t\v\f\n", c))
5254                           {
5255                             cpp_error_with_line (pfile, CPP_DL_ERROR,
5256                                                  sloc, 0,
5257                                                  "invalid character '%c'"
5258                                                  " in raw string"
5259                                                  " delimiter", c);
5260                             raw = false;
5261                             pos = delim;
5262                             break;
5263                           }
5264                         if (pos >= limit)
5265                           goto bad_string;
5266                       }
5267                   }
5268
5269                 while (pos < limit)
5270                   {
5271                     char c = *pos++;
5272                     switch (c)
5273                       {
5274                       case '\\':
5275                         if (!raw)
5276                           esc++;
5277                         break;
5278
5279                       case '\r':
5280                         if (*pos == '\n')
5281                           pos++;
5282                         /* FALLTHROUGH  */
5283
5284                       case '\n':
5285                         {
5286                           CPP_INCREMENT_LINE (pfile, 0);
5287                           line_count++;
5288                           line_start = pos;
5289                         }
5290                         if (esc)
5291                           esc--;
5292                         break;
5293
5294                       case ')':
5295                         if (raw
5296                             && pos + delim_len + 1 < limit
5297                             && pos[delim_len] == end
5298                             && !memcmp (delim, pos, delim_len))
5299                           {
5300                             pos += delim_len + 1;
5301                             raw = false;
5302                             goto done_string;
5303                           }
5304                         break;
5305
5306                       default:
5307                         if (!raw && !(esc & 1) && c == end)
5308                           goto done_string;
5309                         esc = 0;
5310                         break;
5311                       }
5312                   }
5313               bad_string:
5314                 cpp_error_with_line (pfile, CPP_DL_ERROR, sloc, 0,
5315                                      "unterminated literal");
5316
5317               done_string:
5318                 raw = false;
5319                 lwm = pos - 1;
5320               }
5321               goto dflt;
5322
5323             case '_':
5324             case 'e':
5325             case 'i':
5326             case 'm':
5327               if (bol && module_p && !pfile->state.skipping
5328                   && do_peek_module (pfile, c, pos, limit))
5329                 {
5330                   /* We've seen the start of a module control line.
5331                      Start up the tokenizer.  */
5332                   pos--; /* Backup over the first character.  */
5333
5334                   /* Backup over whitespace to start of line.  */
5335                   while (pos > line_start
5336                          && (pos[-1] == ' ' || pos[-1] == '\t'))
5337                     pos--;
5338
5339                   if (pos > base)
5340                     cb (pfile, CPP_DO_print, data, line_count, base, pos - base);
5341
5342                   /* Prep things for directive handling. */
5343                   buffer->next_line = pos;
5344                   buffer->need_line = true;
5345
5346                   /* Now get tokens until the PRAGMA_EOL.  */
5347                   do
5348                     {
5349                       location_t spelling;
5350                       const cpp_token *tok
5351                         = cpp_get_token_with_location (pfile, &spelling);
5352
5353                       gcc_assert (pfile->state.in_deferred_pragma
5354                                   || tok->type == CPP_PRAGMA_EOL);
5355                       cb (pfile, CPP_DO_token, data, tok, spelling);
5356                     }
5357                   while (pfile->state.in_deferred_pragma);
5358
5359                   if (pfile->buffer->next_line < pfile->buffer->rlimit)
5360                     cb (pfile, CPP_DO_location, data,
5361                         pfile->line_table->highest_line);
5362
5363                   pfile->mi_valid = false;
5364                   goto restart;
5365                 }
5366               goto dflt;
5367
5368             default:
5369             dflt:
5370               bol = false;
5371               pfile->mi_valid = false;
5372               break;
5373             }
5374         }
5375
5376       if (buffer->rlimit > base && !pfile->state.skipping)
5377         {
5378           const unsigned char *limit = buffer->rlimit;
5379           /* If the file was not newline terminated, add rlimit, which is
5380              guaranteed to point to a newline, to the end of our range.  */
5381           if (limit[-1] != '\n')
5382             {
5383               limit++;
5384               CPP_INCREMENT_LINE (pfile, 0);
5385               line_count++;
5386             }
5387           cb (pfile, CPP_DO_print, data, line_count, base, limit - base);
5388         }
5389
5390       _cpp_pop_buffer (pfile);
5391     }
5392   while (pfile->buffer);
5393 }