libcpp/lex.c

   1 /* CPP Library - lexical analysis.
   2    Copyright (C) 2000-2022 Free Software Foundation, Inc.
   3    Contributed by Per Bothner, 1994-95.
   4    Based on CCCP program by Paul Rubin, June 1986
   5    Adapted to ANSI C, Richard Stallman, Jan 1987
   6    Broken out to separate file, Zack Weinberg, Mar 2000
   7
   8 This program is free software; you can redistribute it and/or modify it
   9 under the terms of the GNU General Public License as published by the
  10 Free Software Foundation; either version 3, or (at your option) any
  11 later version.
  12
  13 This program is distributed in the hope that it will be useful,
  14 but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16 GNU General Public License for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with this program; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "cpplib.h"
  25 #include "internal.h"
  26
  27 enum spell_type
  28 {
  29   SPELL_OPERATOR = 0,
  30   SPELL_IDENT,
  31   SPELL_LITERAL,
  32   SPELL_NONE
  33 };
  34
  35 struct token_spelling
  36 {
  37   enum spell_type category;
  38   const unsigned char *name;
  39 };
  40
  41 static const unsigned char *const digraph_spellings[] =
  42 { UC"%:", UC"%:%:", UC"<:", UC":>", UC"<%", UC"%>" };
  43
  44 #define OP(e, s) { SPELL_OPERATOR, UC s  },
  45 #define TK(e, s) { SPELL_ ## s,    UC #e },
  46 static const struct token_spelling token_spellings[N_TTYPES] = { TTYPE_TABLE };
  47 #undef OP
  48 #undef TK
  49
  50 #define TOKEN_SPELL(token) (token_spellings[(token)->type].category)
  51 #define TOKEN_NAME(token) (token_spellings[(token)->type].name)
  52
  53 static void add_line_note (cpp_buffer *, const uchar *, unsigned int);
  54 static int skip_line_comment (cpp_reader *);
  55 static void skip_whitespace (cpp_reader *, cppchar_t);
  56 static void lex_string (cpp_reader *, cpp_token *, const uchar *);
  57 static void save_comment (cpp_reader *, cpp_token *, const uchar *, cppchar_t);
  58 static void store_comment (cpp_reader *, cpp_token *);
  59 static void create_literal (cpp_reader *, cpp_token *, const uchar *,
  60                             unsigned int, enum cpp_ttype);
  61 static bool warn_in_comment (cpp_reader *, _cpp_line_note *);
  62 static int name_p (cpp_reader *, const cpp_string *);
  63 static tokenrun *next_tokenrun (tokenrun *);
  64
  65 static _cpp_buff *new_buff (size_t);
  66
  67
  68 /* Utility routine:
  69
  70    Compares, the token TOKEN to the NUL-terminated string STRING.
  71    TOKEN must be a CPP_NAME.  Returns 1 for equal, 0 for unequal.  */
  72 int
  73 cpp_ideq (const cpp_token *token, const char *string)
  74 {
  75   if (token->type != CPP_NAME)
  76     return 0;
  77
  78   return !ustrcmp (NODE_NAME (token->val.node.node), (const uchar *) string);
  79 }
  80
  81 /* Record a note TYPE at byte POS into the current cleaned logical
  82    line.  */
  83 static void
  84 add_line_note (cpp_buffer *buffer, const uchar *pos, unsigned int type)
  85 {
  86   if (buffer->notes_used == buffer->notes_cap)
  87     {
  88       buffer->notes_cap = buffer->notes_cap * 2 + 200;
  89       buffer->notes = XRESIZEVEC (_cpp_line_note, buffer->notes,
  90                                   buffer->notes_cap);
  91     }
  92
  93   buffer->notes[buffer->notes_used].pos = pos;
  94   buffer->notes[buffer->notes_used].type = type;
  95   buffer->notes_used++;
  96 }
  97
  98 \f
  99 /* Fast path to find line special characters using optimized character
 100    scanning algorithms.  Anything complicated falls back to the slow
 101    path below.  Since this loop is very hot it's worth doing these kinds
 102    of optimizations.
 103
 104    One of the paths through the ifdefs should provide
 105
 106      const uchar *search_line_fast (const uchar *s, const uchar *end);
 107
 108    Between S and END, search for \n, \r, \\, ?.  Return a pointer to
 109    the found character.
 110
 111    Note that the last character of the buffer is *always* a newline,
 112    as forced by _cpp_convert_input.  This fact can be used to avoid
 113    explicitly looking for the end of the buffer.  */
 114
 115 /* Configure gives us an ifdef test.  */
 116 #ifndef WORDS_BIGENDIAN
 117 #define WORDS_BIGENDIAN 0
 118 #endif
 119
 120 /* We'd like the largest integer that fits into a register.  There's nothing
 121    in <stdint.h> that gives us that.  For most hosts this is unsigned long,
 122    but MS decided on an LLP64 model.  Thankfully when building with GCC we
 123    can get the "real" word size.  */
 124 #ifdef __GNUC__
 125 typedef unsigned int word_type __attribute__((__mode__(__word__)));
 126 #else
 127 typedef unsigned long word_type;
 128 #endif
 129
 130 /* The code below is only expecting sizes 4 or 8.
 131    Die at compile-time if this expectation is violated.  */
 132 typedef char check_word_type_size
 133   [(sizeof(word_type) == 8 || sizeof(word_type) == 4) * 2 - 1];
 134
 135 /* Return X with the first N bytes forced to values that won't match one
 136    of the interesting characters.  Note that NUL is not interesting.  */
 137
 138 static inline word_type
 139 acc_char_mask_misalign (word_type val, unsigned int n)
 140 {
 141   word_type mask = -1;
 142   if (WORDS_BIGENDIAN)
 143     mask >>= n * 8;
 144   else
 145     mask <<= n * 8;
 146   return val & mask;
 147 }
 148
 149 /* Return X replicated to all byte positions within WORD_TYPE.  */
 150
 151 static inline word_type
 152 acc_char_replicate (uchar x)
 153 {
 154   word_type ret;
 155
 156   ret = (x << 24) | (x << 16) | (x << 8) | x;
 157   if (sizeof(word_type) == 8)
 158     ret = (ret << 16 << 16) | ret;
 159   return ret;
 160 }
 161
 162 /* Return non-zero if some byte of VAL is (probably) C.  */
 163
 164 static inline word_type
 165 acc_char_cmp (word_type val, word_type c)
 166 {
 167 #if defined(__GNUC__) && defined(__alpha__)
 168   /* We can get exact results using a compare-bytes instruction.
 169      Get (val == c) via (0 >= (val ^ c)).  */
 170   return __builtin_alpha_cmpbge (0, val ^ c);
 171 #else
 172   word_type magic = 0x7efefefeU;
 173   if (sizeof(word_type) == 8)
 174     magic = (magic << 16 << 16) | 0xfefefefeU;
 175   magic |= 1;
 176
 177   val ^= c;
 178   return ((val + magic) ^ ~val) & ~magic;
 179 #endif
 180 }
 181
 182 /* Given the result of acc_char_cmp is non-zero, return the index of
 183    the found character.  If this was a false positive, return -1.  */
 184
 185 static inline int
 186 acc_char_index (word_type cmp ATTRIBUTE_UNUSED,
 187                 word_type val ATTRIBUTE_UNUSED)
 188 {
 189 #if defined(__GNUC__) && defined(__alpha__) && !WORDS_BIGENDIAN
 190   /* The cmpbge instruction sets *bits* of the result corresponding to
 191      matches in the bytes with no false positives.  */
 192   return __builtin_ctzl (cmp);
 193 #else
 194   unsigned int i;
 195
 196   /* ??? It would be nice to force unrolling here,
 197      and have all of these constants folded.  */
 198   for (i = 0; i < sizeof(word_type); ++i)
 199     {
 200       uchar c;
 201       if (WORDS_BIGENDIAN)
 202         c = (val >> (sizeof(word_type) - i - 1) * 8) & 0xff;
 203       else
 204         c = (val >> i * 8) & 0xff;
 205
 206       if (c == '\n' || c == '\r' || c == '\\' || c == '?')
 207         return i;
 208     }
 209
 210   return -1;
 211 #endif
 212 }
 213
 214 /* A version of the fast scanner using bit fiddling techniques.
 215
 216    For 32-bit words, one would normally perform 16 comparisons and
 217    16 branches.  With this algorithm one performs 24 arithmetic
 218    operations and one branch.  Whether this is faster with a 32-bit
 219    word size is going to be somewhat system dependent.
 220
 221    For 64-bit words, we eliminate twice the number of comparisons
 222    and branches without increasing the number of arithmetic operations.
 223    It's almost certainly going to be a win with 64-bit word size.  */
 224
 225 static const uchar * search_line_acc_char (const uchar *, const uchar *)
 226   ATTRIBUTE_UNUSED;
 227
 228 static const uchar *
 229 search_line_acc_char (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 230 {
 231   const word_type repl_nl = acc_char_replicate ('\n');
 232   const word_type repl_cr = acc_char_replicate ('\r');
 233   const word_type repl_bs = acc_char_replicate ('\\');
 234   const word_type repl_qm = acc_char_replicate ('?');
 235
 236   unsigned int misalign;
 237   const word_type *p;
 238   word_type val, t;
 239
 240   /* Align the buffer.  Mask out any bytes from before the beginning.  */
 241   p = (word_type *)((uintptr_t)s & -sizeof(word_type));
 242   val = *p;
 243   misalign = (uintptr_t)s & (sizeof(word_type) - 1);
 244   if (misalign)
 245     val = acc_char_mask_misalign (val, misalign);
 246
 247   /* Main loop.  */
 248   while (1)
 249     {
 250       t  = acc_char_cmp (val, repl_nl);
 251       t |= acc_char_cmp (val, repl_cr);
 252       t |= acc_char_cmp (val, repl_bs);
 253       t |= acc_char_cmp (val, repl_qm);
 254
 255       if (__builtin_expect (t != 0, 0))
 256         {
 257           int i = acc_char_index (t, val);
 258           if (i >= 0)
 259             return (const uchar *)p + i;
 260         }
 261
 262       val = *++p;
 263     }
 264 }
 265
 266 /* Disable on Solaris 2/x86 until the following problem can be properly
 267    autoconfed:
 268
 269    The Solaris 10+ assembler tags objects with the instruction set
 270    extensions used, so SSE4.2 executables cannot run on machines that
 271    don't support that extension.  */
 272
 273 #if (GCC_VERSION >= 4005) && (__GNUC__ >= 5 || !defined(__PIC__)) && (defined(__i386__) || defined(__x86_64__)) && !(defined(__sun__) && defined(__svr4__))
 274
 275 /* Replicated character data to be shared between implementations.
 276    Recall that outside of a context with vector support we can't
 277    define compatible vector types, therefore these are all defined
 278    in terms of raw characters.  */
 279 static const char repl_chars[4][16] __attribute__((aligned(16))) = {
 280   { '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
 281     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n' },
 282   { '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
 283     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r' },
 284   { '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
 285     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\' },
 286   { '?', '?', '?', '?', '?', '?', '?', '?',
 287     '?', '?', '?', '?', '?', '?', '?', '?' },
 288 };
 289
 290 /* A version of the fast scanner using MMX vectorized byte compare insns.
 291
 292    This uses the PMOVMSKB instruction which was introduced with "MMX2",
 293    which was packaged into SSE1; it is also present in the AMD MMX
 294    extension.  Mark the function as using "sse" so that we emit a real
 295    "emms" instruction, rather than the 3dNOW "femms" instruction.  */
 296
 297 static const uchar *
 298 #ifndef __SSE__
 299 __attribute__((__target__("sse")))
 300 #endif
 301 search_line_mmx (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 302 {
 303   typedef char v8qi __attribute__ ((__vector_size__ (8)));
 304   typedef int __m64 __attribute__ ((__vector_size__ (8), __may_alias__));
 305
 306   const v8qi repl_nl = *(const v8qi *)repl_chars[0];
 307   const v8qi repl_cr = *(const v8qi *)repl_chars[1];
 308   const v8qi repl_bs = *(const v8qi *)repl_chars[2];
 309   const v8qi repl_qm = *(const v8qi *)repl_chars[3];
 310
 311   unsigned int misalign, found, mask;
 312   const v8qi *p;
 313   v8qi data, t, c;
 314
 315   /* Align the source pointer.  While MMX doesn't generate unaligned data
 316      faults, this allows us to safely scan to the end of the buffer without
 317      reading beyond the end of the last page.  */
 318   misalign = (uintptr_t)s & 7;
 319   p = (const v8qi *)((uintptr_t)s & -8);
 320   data = *p;
 321
 322   /* Create a mask for the bytes that are valid within the first
 323      16-byte block.  The Idea here is that the AND with the mask
 324      within the loop is "free", since we need some AND or TEST
 325      insn in order to set the flags for the branch anyway.  */
 326   mask = -1u << misalign;
 327
 328   /* Main loop processing 8 bytes at a time.  */
 329   goto start;
 330   do
 331     {
 332       data = *++p;
 333       mask = -1;
 334
 335     start:
 336       t = __builtin_ia32_pcmpeqb(data, repl_nl);
 337       c = __builtin_ia32_pcmpeqb(data, repl_cr);
 338       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
 339       c = __builtin_ia32_pcmpeqb(data, repl_bs);
 340       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
 341       c = __builtin_ia32_pcmpeqb(data, repl_qm);
 342       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
 343       found = __builtin_ia32_pmovmskb (t);
 344       found &= mask;
 345     }
 346   while (!found);
 347
 348   __builtin_ia32_emms ();
 349
 350   /* FOUND contains 1 in bits for which we matched a relevant
 351      character.  Conversion to the byte index is trivial.  */
 352   found = __builtin_ctz(found);
 353   return (const uchar *)p + found;
 354 }
 355
 356 /* A version of the fast scanner using SSE2 vectorized byte compare insns.  */
 357
 358 static const uchar *
 359 #ifndef __SSE2__
 360 __attribute__((__target__("sse2")))
 361 #endif
 362 search_line_sse2 (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 363 {
 364   typedef char v16qi __attribute__ ((__vector_size__ (16)));
 365
 366   const v16qi repl_nl = *(const v16qi *)repl_chars[0];
 367   const v16qi repl_cr = *(const v16qi *)repl_chars[1];
 368   const v16qi repl_bs = *(const v16qi *)repl_chars[2];
 369   const v16qi repl_qm = *(const v16qi *)repl_chars[3];
 370
 371   unsigned int misalign, found, mask;
 372   const v16qi *p;
 373   v16qi data, t;
 374
 375   /* Align the source pointer.  */
 376   misalign = (uintptr_t)s & 15;
 377   p = (const v16qi *)((uintptr_t)s & -16);
 378   data = *p;
 379
 380   /* Create a mask for the bytes that are valid within the first
 381      16-byte block.  The Idea here is that the AND with the mask
 382      within the loop is "free", since we need some AND or TEST
 383      insn in order to set the flags for the branch anyway.  */
 384   mask = -1u << misalign;
 385
 386   /* Main loop processing 16 bytes at a time.  */
 387   goto start;
 388   do
 389     {
 390       data = *++p;
 391       mask = -1;
 392
 393     start:
 394       t  = data == repl_nl;
 395       t |= data == repl_cr;
 396       t |= data == repl_bs;
 397       t |= data == repl_qm;
 398       found = __builtin_ia32_pmovmskb128 (t);
 399       found &= mask;
 400     }
 401   while (!found);
 402
 403   /* FOUND contains 1 in bits for which we matched a relevant
 404      character.  Conversion to the byte index is trivial.  */
 405   found = __builtin_ctz(found);
 406   return (const uchar *)p + found;
 407 }
 408
 409 #ifdef HAVE_SSE4
 410 /* A version of the fast scanner using SSE 4.2 vectorized string insns.

        S is the first byte to examine.  END is consulted only to decide
        whether an unaligned 16-byte load starting at S is safe.  Returns a
        pointer to the first '\n', '\r', '?' or '\\' at or after S.  */
 411
 412 static const uchar *
 413 #ifndef __SSE4_2__
 414 __attribute__((__target__("sse4.2")))
 415 #endif
 416 search_line_sse42 (const uchar *s, const uchar *end)
 417 {
 418   typedef char v16qi __attribute__ ((__vector_size__ (16)));
       /* The four characters of interest.  The remaining lanes are zero and
          are excluded by the explicit length 4 passed to PCMPESTRI below.  */
 419   static const v16qi search = { '\n', '\r', '?', '\\' };
 420
 421   uintptr_t si = (uintptr_t)s;
 422   uintptr_t index;
 423
 424   /* Check for unaligned input.  */
 425   if (si & 15)
 426     {
 427       v16qi sv;
 428
 429       if (__builtin_expect (end - s < 16, 0)
 430           && __builtin_expect ((si & 0xfff) > 0xff0, 0))
 431         {
 432           /* There are less than 16 bytes left in the buffer, and less
 433              than 16 bytes left on the page.  Reading 16 bytes at this
 434              point might generate a spurious page fault.  Defer to the
 435              SSE2 implementation, which already handles alignment.  */
 436           return search_line_sse2 (s, end);
 437         }
 438
 439       /* ??? The builtin doesn't understand that the PCMPESTRI read from
 440          memory need not be aligned.  */
 441       sv = __builtin_ia32_loaddqu ((const char *) s);
 442       index = __builtin_ia32_pcmpestri128 (search, 4, sv, 16, 0);
 443
           /* An index below 16 is the offset of a match within this block.  */
 444       if (__builtin_expect (index < 16, 0))
 445         goto found;
 446
 447       /* Advance the pointer to an aligned address.  We will re-scan a
 448          few bytes, but we no longer need care for reading past the
 449          end of a page, since we're guaranteed a match.  */
 450       s = (const uchar *)((si + 15) & -16);
 451     }
 452
 453   /* Main loop, processing 16 bytes at a time.  */
 454 #ifdef __GCC_ASM_FLAG_OUTPUTS__
 455   while (1)
 456     {
 457       char f;
 458
 459       /* By using inline assembly instead of the builtin,
 460          we can use the result, as well as the flags set.  */
 461       __asm ("%vpcmpestri\t$0, %2, %3"
 462              : "=c"(index), "=@ccc"(f)
 463              : "m"(*s), "x"(search), "a"(4), "d"(16));
           /* F is the carry flag output; leave the loop once it is set.  */
 464       if (f)
 465         break;
 466
 467       s += 16;
 468     }
 469 #else
       /* Pre-bias S: the asm loop adds 16 before each comparison.  */
 470   s -= 16;
 471   /* By doing the whole loop in inline assembly,
 472      we can make proper use of the flags set.  */
 473   __asm (      ".balign 16\n"
 474         "0:     add $16, %1\n"
 475         "       %vpcmpestri\t$0, (%1), %2\n"
 476         "       jnc 0b"
 477         : "=&c"(index), "+r"(s)
 478         : "x"(search), "a"(4), "d"(16));
 479 #endif
 480
 481  found:
       /* INDEX is the offset of the match within the 16 bytes at S.  */
 482   return s + index;
 483 }
 484
 485 #else
 486 /* Work around out-dated assemblers without sse4 support.  */
 487 #define search_line_sse42 search_line_sse2
 488 #endif
 489
 490 /* Check the CPU capabilities.  */
 491
 492 #include "../gcc/config/i386/cpuid.h"
 493
 494 typedef const uchar * (*search_line_fast_type) (const uchar *, const uchar *);
 495 static search_line_fast_type search_line_fast;
 496
 497 #define HAVE_init_vectorized_lexer 1
 498 static inline void
 499 init_vectorized_lexer (void)
 500 {
 501   unsigned dummy, ecx = 0, edx = 0;
 502   search_line_fast_type impl = search_line_acc_char;
 503   int minimum = 0;
 504
 505 #if defined(__SSE4_2__)
 506   minimum = 3;
 507 #elif defined(__SSE2__)
 508   minimum = 2;
 509 #elif defined(__SSE__)
 510   minimum = 1;
 511 #endif
 512
 513   if (minimum == 3)
 514     impl = search_line_sse42;
 515   else if (__get_cpuid (1, &dummy, &dummy, &ecx, &edx) || minimum == 2)
 516     {
 517       if (minimum == 3 || (ecx & bit_SSE4_2))
 518         impl = search_line_sse42;
 519       else if (minimum == 2 || (edx & bit_SSE2))
 520         impl = search_line_sse2;
 521       else if (minimum == 1 || (edx & bit_SSE))
 522         impl = search_line_mmx;
 523     }
 524   else if (__get_cpuid (0x80000001, &dummy, &dummy, &dummy, &edx))
 525     {
 526       if (minimum == 1
 527           || (edx & (bit_MMXEXT | bit_CMOV)) == (bit_MMXEXT | bit_CMOV))
 528         impl = search_line_mmx;
 529     }
 530
 531   search_line_fast = impl;
 532 }
 533
 534 #elif (GCC_VERSION >= 4005) && defined(_ARCH_PWR8) && defined(__ALTIVEC__)
 535
 536 /* A version of the fast scanner using AltiVec vectorized byte compares
 537    and VSX unaligned loads (when VSX is available).  This is otherwise
 538    the same as the AltiVec version.

        Scans forward from S in 16-byte steps and returns a pointer to the
        first '\n', '\r', '\\' or '?' found.  END is unused.  */
 539
 540 ATTRIBUTE_NO_SANITIZE_UNDEFINED
 541 static const uchar *
 542 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 543 {
 544   typedef __attribute__((altivec(vector))) unsigned char vc;
 545
       /* One vector per interesting character, replicated to all 16 bytes.  */
 546   const vc repl_nl = {
 547     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
 548     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
 549   };
 550   const vc repl_cr = {
 551     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
 552     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
 553   };
 554   const vc repl_bs = {
 555     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
 556     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
 557   };
 558   const vc repl_qm = {
 559     '?', '?', '?', '?', '?', '?', '?', '?',
 560     '?', '?', '?', '?', '?', '?', '?', '?',
 561   };
 562   const vc zero = { 0 };
 563
 564   vc data, t;
 565
 566   /* Main loop processing 16 bytes at a time.  */
 567   do
 568     {
 569       vc m_nl, m_cr, m_bs, m_qm;
 570
           /* Load the 16 bytes starting at S, then step S past them.  */
 571       data = __builtin_vec_vsx_ld (0, s);
 572       s += 16;
 573
 574       m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
 575       m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
 576       m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
 577       m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
 578       t = (m_nl | m_cr) | (m_bs | m_qm);
 579
 580       /* T now contains 0xff in bytes for which we matched one of the relevant
 581          characters.  We want to exit the loop if any byte in T is non-zero.
 582          Below is the expansion of vec_any_ne(t, zero).  */
 583     }
 584   while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));
 585
 586   /* Restore S to point to the 16 bytes we just processed.  */
 587   s -= 16;
 588
 589   {
 590 #define N  (sizeof(vc) / sizeof(long))
 591
         /* View the 16-byte match vector T as N unsigned longs so that the
            first non-zero word can be located with scalar code.  */
 592     union {
 593       vc v;
 594       /* Statically assert that N is 2 or 4.  */
 595       unsigned long l[(N == 2 || N == 4) ? N : -1];
 596     } u;
 597     unsigned long l, i = 0;
 598
 599     u.v = t;
 600
 601     /* Find the first word of T that is non-zero.  S is advanced past
            each all-zero word that is skipped.  */
 602     switch (N)
 603       {
 604       case 4:
 605         l = u.l[i++];
 606         if (l != 0)
 607           break;
 608         s += sizeof(unsigned long);
 609         l = u.l[i++];
 610         if (l != 0)
 611           break;
 612         s += sizeof(unsigned long);
 613         /* FALLTHRU */
 614       case 2:
 615         l = u.l[i++];
 616         if (l != 0)
 617           break;
 618         s += sizeof(unsigned long);
 619         l = u.l[i];
 620       }
 621
 622     /* L now contains 0xff in bytes for which we matched one of the
 623        relevant characters.  We can find the byte index by finding
 624        its bit index and dividing by 8.  The first byte in memory is
            the most significant byte of L on a big-endian host and the
            least significant one otherwise, hence clz versus ctz.  */
 625 #ifdef __BIG_ENDIAN__
 626     l = __builtin_clzl(l) >> 3;
 627 #else
 628     l = __builtin_ctzl(l) >> 3;
 629 #endif
 630     return s + l;
 631
 632 #undef N
 633   }
 634 }
 635
 636 #elif (GCC_VERSION >= 4005) && defined(__ALTIVEC__) && defined (__BIG_ENDIAN__)
 637
 638 /* A version of the fast scanner using AltiVec vectorized byte compares.
 639    This cannot be used for little endian because vec_lvsl/lvsr are
 640    deprecated for little endian and the code won't work properly.

        Returns a pointer to the first '\n', '\r', '\\' or '?' at or after
        S.  END is unused.  */
 641 /* ??? Unfortunately, attribute(target("altivec")) is not yet supported,
 642    so we can't compile this function without -maltivec on the command line
 643    (or implied by some other switch).  */
 644
 645 static const uchar *
 646 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 647 {
 648   typedef __attribute__((altivec(vector))) unsigned char vc;
 649
       /* One vector per interesting character, replicated to all 16 bytes.  */
 650   const vc repl_nl = {
 651     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
 652     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
 653   };
 654   const vc repl_cr = {
 655     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
 656     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
 657   };
 658   const vc repl_bs = {
 659     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
 660     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
 661   };
 662   const vc repl_qm = {
 663     '?', '?', '?', '?', '?', '?', '?', '?',
 664     '?', '?', '?', '?', '?', '?', '?', '?',
 665   };
 666   const vc ones = {
 667     -1, -1, -1, -1, -1, -1, -1, -1,
 668     -1, -1, -1, -1, -1, -1, -1, -1,
 669   };
 670   const vc zero = { 0 };
 671
 672   vc data, mask, t;
 673
 674   /* Altivec loads automatically mask addresses with -16.  This lets us
 675      issue the first load as early as possible.  */
 676   data = __builtin_vec_ld(0, (const vc *)s);
 677
 678   /* Discard bytes before the beginning of the buffer.  Do this by
 679      beginning with all ones and shifting in zeros according to the
 680      mis-alignment.  The LVSR instruction pulls the exact shift we
 681      want from the address.  */
 682   mask = __builtin_vec_lvsr(0, s);
 683   mask = __builtin_vec_perm(zero, ones, mask);
 684   data &= mask;
 685
 686   /* While altivec loads mask addresses, we still need to align S so
 687      that the offset we compute at the end is correct.  */
 688   s = (const uchar *)((uintptr_t)s & -16);
 689
 690   /* Main loop processing 16 bytes at a time.  The first block is already
          loaded and masked, so enter the loop at the comparison step.  */
 691   goto start;
 692   do
 693     {
 694       vc m_nl, m_cr, m_bs, m_qm;
 695
 696       s += 16;
 697       data = __builtin_vec_ld(0, (const vc *)s);
 698
 699     start:
 700       m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
 701       m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
 702       m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
 703       m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
 704       t = (m_nl | m_cr) | (m_bs | m_qm);
 705
 706       /* T now contains 0xff in bytes for which we matched one of the relevant
 707          characters.  We want to exit the loop if any byte in T is non-zero.
 708          Below is the expansion of vec_any_ne(t, zero).  */
 709     }
 710   while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));
 711
 712   {
 713 #define N  (sizeof(vc) / sizeof(long))
 714
         /* View the 16-byte match vector T as N unsigned longs so that the
            first non-zero word can be located with scalar code.  */
 715     union {
 716       vc v;
 717       /* Statically assert that N is 2 or 4.  */
 718       unsigned long l[(N == 2 || N == 4) ? N : -1];
 719     } u;
 720     unsigned long l, i = 0;
 721
 722     u.v = t;
 723
 724     /* Find the first word of T that is non-zero.  S is advanced past
            each all-zero word that is skipped.  */
 725     switch (N)
 726       {
 727       case 4:
 728         l = u.l[i++];
 729         if (l != 0)
 730           break;
 731         s += sizeof(unsigned long);
 732         l = u.l[i++];
 733         if (l != 0)
 734           break;
 735         s += sizeof(unsigned long);
 736         /* FALLTHROUGH */
 737       case 2:
 738         l = u.l[i++];
 739         if (l != 0)
 740           break;
 741         s += sizeof(unsigned long);
 742         l = u.l[i];
 743       }
 744
 745     /* L now contains 0xff in bytes for which we matched one of the
 746        relevant characters.  We can find the byte index by finding
 747        its bit index and dividing by 8.  This variant is big-endian
            only, so the first byte in memory is the most significant
            byte of L and the leading-zero count is used.  */
 748     l = __builtin_clzl(l) >> 3;
 749     return s + l;
 750
 751 #undef N
 752   }
 753 }
 754
 755 #elif defined (__ARM_NEON) && defined (__ARM_64BIT_STATE)
 756 #include "arm_neon.h"
 757
 758 /* This doesn't have to be the exact page size, but no system may use
 759    a size smaller than this.  ARMv8 requires a minimum page size of
 760    4k.  The impact of being conservative here is a small number of
 761    cases will take the slightly slower entry path into the main
 762    loop.  */
 763
 764 #define AARCH64_MIN_PAGE_SIZE 4096
 765
     /* A version of the fast scanner using AArch64 NEON byte compares.
        Returns a pointer to the first '\n', '\r', '\\' or '?' at or after
        S.  END is unused.  */
 766 static const uchar *
 767 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 768 {
 769   const uint8x16_t repl_nl = vdupq_n_u8 ('\n');
 770   const uint8x16_t repl_cr = vdupq_n_u8 ('\r');
 771   const uint8x16_t repl_bs = vdupq_n_u8 ('\\');
 772   const uint8x16_t repl_qm = vdupq_n_u8 ('?');
       /* Gives each byte of an 8-byte half its own bit (0x01 ... 0x80), so
          that the 0xff match bytes can be folded into a bit mask.  */
 773   const uint8x16_t xmask = (uint8x16_t) vdupq_n_u64 (0x8040201008040201ULL);
 774
       /* Per-lane shift that moves the bits of one 8-byte half up by 8 when
          the two halves are combined into a single 16-bit mask.  */
 775 #ifdef __ARM_BIG_ENDIAN
 776   const int16x8_t shift = {8, 8, 8, 8, 0, 0, 0, 0};
 777 #else
 778   const int16x8_t shift = {0, 0, 0, 0, 8, 8, 8, 8};
 779 #endif
 780
 781   unsigned int found;
 782   const uint8_t *p;
 783   uint8x16_t data;
 784   uint8x16_t t;
 785   uint16x8_t m;
 786   uint8x16_t u, v, w;
 787
 788   /* Align the source pointer.  */
 789   p = (const uint8_t *)((uintptr_t)s & -16);
 790
 791   /* Assuming random string start positions, with a 4k page size we'll take
 792      the slow path about 0.37% of the time.  */
 793   if (__builtin_expect ((AARCH64_MIN_PAGE_SIZE
 794                          - (((uintptr_t) s) & (AARCH64_MIN_PAGE_SIZE - 1)))
 795                         < 16, 0))
 796     {
 797       /* Slow path: the string starts near a possible page boundary.  */
 798       uint32_t misalign, mask;
 799
           /* Load the aligned block containing S and keep only the match
              bits for bytes at or after S.  */
 800       misalign = (uintptr_t)s & 15;
 801       mask = (-1u << misalign) & 0xffff;
 802       data = vld1q_u8 (p);
 803       t = vceqq_u8 (data, repl_nl);
 804       u = vceqq_u8 (data, repl_cr);
 805       v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
 806       w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
 807       t = vorrq_u8 (v, w);
 808       t = vandq_u8 (t, xmask);
 809       m = vpaddlq_u8 (t);
 810       m = vshlq_u16 (m, shift);
 811       found = vaddvq_u16 (m);
 812       found &= mask;
 813       if (found)
 814         return (const uchar*)p + __builtin_ctz (found);
 815     }
 816   else
 817     {
           /* Fast path: an unaligned 16-byte load starting at S itself.  */
 818       data = vld1q_u8 ((const uint8_t *) s);
 819       t = vceqq_u8 (data, repl_nl);
 820       u = vceqq_u8 (data, repl_cr);
 821       v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
 822       w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
 823       t = vorrq_u8 (v, w);
 824       if (__builtin_expect (vpaddd_u64 ((uint64x2_t)t) != 0, 0))
 825         goto done;
 826     }
 827
       /* Main loop: aligned 16-byte blocks, starting with the one after P.  */
 828   do
 829     {
 830       p += 16;
 831       data = vld1q_u8 (p);
 832       t = vceqq_u8 (data, repl_nl);
 833       u = vceqq_u8 (data, repl_cr);
 834       v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
 835       w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
 836       t = vorrq_u8 (v, w);
 837     } while (!vpaddd_u64 ((uint64x2_t)t));
 838
 839 done:
 840   /* Now that we've found the terminating substring, work out precisely where
 841      we need to stop.  */
 842   t = vandq_u8 (t, xmask);
 843   m = vpaddlq_u8 (t);
 844   m = vshlq_u16 (m, shift);
 845   found = vaddvq_u16 (m);
       /* When the match came from the initial unaligned load, P (S rounded
          down) is below S and the offset is relative to S; otherwise it is
          relative to P.  */
 846   return (((((uintptr_t) p) < (uintptr_t) s) ? s : (const uchar *)p)
 847           + __builtin_ctz (found));
 848 }
 849
 850 #elif defined (__ARM_NEON)
 851 #include "arm_neon.h"
 852
 853 static const uchar *
 854 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 855 {
 856   const uint8x16_t repl_nl = vdupq_n_u8 ('\n');
 857   const uint8x16_t repl_cr = vdupq_n_u8 ('\r');
 858   const uint8x16_t repl_bs = vdupq_n_u8 ('\\');
 859   const uint8x16_t repl_qm = vdupq_n_u8 ('?');
 860   const uint8x16_t xmask = (uint8x16_t) vdupq_n_u64 (0x8040201008040201ULL);
 861
 862   unsigned int misalign, found, mask;
 863   const uint8_t *p;
 864   uint8x16_t data;
 865
 866   /* Align the source pointer.  */
 867   misalign = (uintptr_t)s & 15;
 868   p = (const uint8_t *)((uintptr_t)s & -16);
 869   data = vld1q_u8 (p);
 870
 871   /* Create a mask for the bytes that are valid within the first
 872      16-byte block.  The Idea here is that the AND with the mask
 873      within the loop is "free", since we need some AND or TEST
 874      insn in order to set the flags for the branch anyway.  */
 875   mask = (-1u << misalign) & 0xffff;
 876
 877   /* Main loop, processing 16 bytes at a time.  */
 878   goto start;
 879
 880   do
 881     {
 882       uint8x8_t l;
 883       uint16x4_t m;
 884       uint32x2_t n;
 885       uint8x16_t t, u, v, w;
 886
 887       p += 16;
 888       data = vld1q_u8 (p);
 889       mask = 0xffff;
 890
 891     start:
 892       t = vceqq_u8 (data, repl_nl);
 893       u = vceqq_u8 (data, repl_cr);
 894       v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
 895       w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
 896       t = vandq_u8 (vorrq_u8 (v, w), xmask);
 897       l = vpadd_u8 (vget_low_u8 (t), vget_high_u8 (t));
 898       m = vpaddl_u8 (l);
 899       n = vpaddl_u16 (m);
 900
 901       found = vget_lane_u32 ((uint32x2_t) vorr_u64 ((uint64x1_t) n,
 902               vshr_n_u64 ((uint64x1_t) n, 24)), 0);
 903       found &= mask;
 904     }
 905   while (!found);
 906
 907   /* FOUND contains 1 in bits for which we matched a relevant
 908      character.  Conversion to the byte index is trivial.  */
 909   found = __builtin_ctz (found);
 910   return (const uchar *)p + found;
 911 }
 912
 913 #else
 914
 915 /* We only have one accelerated alternative.  Use a direct call so that
 916    we encourage inlining.  */
 917
 918 #define search_line_fast  search_line_acc_char
 919
 920 #endif
 921
 922 /* Initialize the lexer if needed.  */
 923
 924 void
 925 _cpp_init_lexer (void)
 926 {
 927 #ifdef HAVE_init_vectorized_lexer
 928   init_vectorized_lexer ();
 929 #endif
 930 }
 931
 932 /* Returns with a logical line that contains no escaped newlines or
 933    trigraphs.  This is a time-critical inner loop.  */
 934 void
 935 _cpp_clean_line (cpp_reader *pfile)
 936 {
 937   cpp_buffer *buffer;
 938   const uchar *s;
 939   uchar c, *d, *p;
 940
 941   buffer = pfile->buffer;
 942   buffer->cur_note = buffer->notes_used = 0;
 943   buffer->cur = buffer->line_base = buffer->next_line;
 944   buffer->need_line = false;
 945   s = buffer->next_line;
 946
 947   if (!buffer->from_stage3)
 948     {
 949       const uchar *pbackslash = NULL;
 950
 951       /* Fast path.  This is the common case of an un-escaped line with
 952          no trigraphs.  The primary win here is by not writing any
 953          data back to memory until we have to.  */
 954       while (1)
 955         {
 956           /* Perform an optimized search for \n, \r, \\, ?.  */
 957           s = search_line_fast (s, buffer->rlimit);
 958
 959           c = *s;
 960           if (c == '\\')
 961             {
 962               /* Record the location of the backslash and continue.  */
 963               pbackslash = s++;
 964             }
 965           else if (__builtin_expect (c == '?', 0))
 966             {
 967               if (__builtin_expect (s[1] == '?', false)
 968                    && _cpp_trigraph_map[s[2]])
 969                 {
 970                   /* Have a trigraph.  We may or may not have to convert
 971                      it.  Add a line note regardless, for -Wtrigraphs.  */
 972                   add_line_note (buffer, s, s[2]);
 973                   if (CPP_OPTION (pfile, trigraphs))
 974                     {
 975                       /* We do, and that means we have to switch to the
 976                          slow path.  */
 977                       d = (uchar *) s;
 978                       *d = _cpp_trigraph_map[s[2]];
 979                       s += 2;
 980                       goto slow_path;
 981                     }
 982                 }
 983               /* Not a trigraph.  Continue on fast-path.  */
 984               s++;
 985             }
 986           else
 987             break;
 988         }
 989
 990       /* This must be \r or \n.  We're either done, or we'll be forced
 991          to write back to the buffer and continue on the slow path.  */
 992       d = (uchar *) s;
 993
 994       if (__builtin_expect (s == buffer->rlimit, false))
 995         goto done;
 996
 997       /* DOS line ending? */
 998       if (__builtin_expect (c == '\r', false) && s[1] == '\n')
 999         {
1000           s++;
1001           if (s == buffer->rlimit)
1002             goto done;
1003         }
1004
1005       if (__builtin_expect (pbackslash == NULL, true))
1006         goto done;
1007
1008       /* Check for escaped newline.  */
1009       p = d;
1010       while (is_nvspace (p[-1]))
1011         p--;
1012       if (p - 1 != pbackslash)
1013         goto done;
1014
1015       /* Have an escaped newline; process it and proceed to
1016          the slow path.  */
1017       add_line_note (buffer, p - 1, p != d ? ' ' : '\\');
1018       d = p - 2;
1019       buffer->next_line = p - 1;
1020
1021     slow_path:
1022       while (1)
1023         {
1024           c = *++s;
1025           *++d = c;
1026
1027           if (c == '\n' || c == '\r')
1028             {
1029               /* Handle DOS line endings.  */
1030               if (c == '\r' && s != buffer->rlimit && s[1] == '\n')
1031                 s++;
1032               if (s == buffer->rlimit)
1033                 break;
1034
1035               /* Escaped?  */
1036               p = d;
1037               while (p != buffer->next_line && is_nvspace (p[-1]))
1038                 p--;
1039               if (p == buffer->next_line || p[-1] != '\\')
1040                 break;
1041
1042               add_line_note (buffer, p - 1, p != d ? ' ': '\\');
1043               d = p - 2;
1044               buffer->next_line = p - 1;
1045             }
1046           else if (c == '?' && s[1] == '?' && _cpp_trigraph_map[s[2]])
1047             {
1048               /* Add a note regardless, for the benefit of -Wtrigraphs.  */
1049               add_line_note (buffer, d, s[2]);
1050               if (CPP_OPTION (pfile, trigraphs))
1051                 {
1052                   *d = _cpp_trigraph_map[s[2]];
1053                   s += 2;
1054                 }
1055             }
1056         }
1057     }
1058   else
1059     {
1060       while (*s != '\n' && *s != '\r')
1061         s++;
1062       d = (uchar *) s;
1063
1064       /* Handle DOS line endings.  */
1065       if (*s == '\r' && s + 1 != buffer->rlimit && s[1] == '\n')
1066         s++;
1067     }
1068
1069  done:
1070   *d = '\n';
1071   /* A sentinel note that should never be processed.  */
1072   add_line_note (buffer, d + 1, '\n');
1073   buffer->next_line = s + 1;
1074 }
1075
1076 /* Return true if the trigraph indicated by NOTE should be warned
1077    about in a comment.  */
1078 static bool
1079 warn_in_comment (cpp_reader *pfile, _cpp_line_note *note)
1080 {
1081   const uchar *p;
1082
1083   /* Within comments we don't warn about trigraphs, unless the
1084      trigraph forms an escaped newline, as that may change
1085      behavior.  */
1086   if (note->type != '/')
1087     return false;
1088
1089   /* If -trigraphs, then this was an escaped newline iff the next note
1090      is coincident.  */
1091   if (CPP_OPTION (pfile, trigraphs))
1092     return note[1].pos == note->pos;
1093
1094   /* Otherwise, see if this forms an escaped newline.  */
1095   p = note->pos + 3;
1096   while (is_nvspace (*p))
1097     p++;
1098
1099   /* There might have been escaped newlines between the trigraph and the
1100      newline we found.  Hence the position test.  */
1101   return (*p == '\n' && p < note[1].pos);
1102 }
1103
1104 /* Process the notes created by add_line_note as far as the current
1105    location.  */
1106 void
1107 _cpp_process_line_notes (cpp_reader *pfile, int in_comment)
1108 {
1109   cpp_buffer *buffer = pfile->buffer;
1110
1111   for (;;)
1112     {
1113       _cpp_line_note *note = &buffer->notes[buffer->cur_note];
1114       unsigned int col;
1115
1116       if (note->pos > buffer->cur)
1117         break;
1118
1119       buffer->cur_note++;
1120       col = CPP_BUF_COLUMN (buffer, note->pos + 1);
1121
1122       if (note->type == '\\' || note->type == ' ')
1123         {
1124           if (note->type == ' ' && !in_comment)
1125             cpp_error_with_line (pfile, CPP_DL_WARNING, pfile->line_table->highest_line, col,
1126                                  "backslash and newline separated by space");
1127
1128           if (buffer->next_line > buffer->rlimit)
1129             {
1130               cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line, col,
1131                                    "backslash-newline at end of file");
1132               /* Prevent "no newline at end of file" warning.  */
1133               buffer->next_line = buffer->rlimit;
1134             }
1135
1136           buffer->line_base = note->pos;
1137           CPP_INCREMENT_LINE (pfile, 0);
1138         }
1139       else if (_cpp_trigraph_map[note->type])
1140         {
1141           if (CPP_OPTION (pfile, warn_trigraphs)
1142               && (!in_comment || warn_in_comment (pfile, note)))
1143             {
1144               if (CPP_OPTION (pfile, trigraphs))
1145                 cpp_warning_with_line (pfile, CPP_W_TRIGRAPHS,
1146                                        pfile->line_table->highest_line, col,
1147                                        "trigraph ??%c converted to %c",
1148                                        note->type,
1149                                        (int) _cpp_trigraph_map[note->type]);
1150               else
1151                 {
1152                   cpp_warning_with_line
1153                     (pfile, CPP_W_TRIGRAPHS,
1154                      pfile->line_table->highest_line, col,
1155                      "trigraph ??%c ignored, use -trigraphs to enable",
1156                      note->type);
1157                 }
1158             }
1159         }
1160       else if (note->type == 0)
1161         /* Already processed in lex_raw_string.  */;
1162       else
1163         abort ();
1164     }
1165 }
1166
1167 namespace bidi {
1168   enum class kind {
1169     NONE, LRE, RLE, LRO, RLO, LRI, RLI, FSI, PDF, PDI, LTR, RTL
1170   };
1171
1172   /* All the UTF-8 encodings of bidi characters start with E2.  */
1173   constexpr uchar utf8_start = 0xe2;
1174
1175   struct context
1176   {
1177     context () {}
1178     context (location_t loc, kind k, bool pdf, bool ucn)
1179     : m_loc (loc), m_kind (k), m_pdf (pdf), m_ucn (ucn)
1180     {
1181     }
1182
1183     kind get_pop_kind () const
1184     {
1185       return m_pdf ? kind::PDF : kind::PDI;
1186     }
1187     bool ucn_p () const
1188     {
1189       return m_ucn;
1190     }
1191
1192     location_t m_loc;
1193     kind m_kind;
1194     unsigned m_pdf : 1;
1195     unsigned m_ucn : 1;
1196   };
1197
1198   /* A vector holding currently open bidi contexts.  We use a char for
1199      each context, its LSB is 1 if it represents a PDF context, 0 if it
1200      represents a PDI context.  The next bit is 1 if this context was open
1201      by a bidi character written as a UCN, and 0 when it was UTF-8.  */
1202   semi_embedded_vec <context, 16> vec;
1203
1204   /* Close the whole comment/identifier/string literal/character constant
1205      context.  */
1206   void on_close ()
1207   {
1208     vec.truncate (0);
1209   }
1210
1211   /* Pop the last element in the vector.  */
1212   void pop ()
1213   {
1214     unsigned int len = vec.count ();
1215     gcc_checking_assert (len > 0);
1216     vec.truncate (len - 1);
1217   }
1218
1219   /* Return the pop kind of the context of the Ith element.  */
1220   kind pop_kind_at (unsigned int i)
1221   {
1222     return vec[i].get_pop_kind ();
1223   }
1224
1225   /* Return the pop kind of the context that is currently opened.  */
1226   kind current_ctx ()
1227   {
1228     unsigned int len = vec.count ();
1229     if (len == 0)
1230       return kind::NONE;
1231     return vec[len - 1].get_pop_kind ();
1232   }
1233
1234   /* Return true if the current context comes from a UCN origin, that is,
1235      the bidi char which started this bidi context was written as a UCN.  */
1236   bool current_ctx_ucn_p ()
1237   {
1238     unsigned int len = vec.count ();
1239     gcc_checking_assert (len > 0);
1240     return vec[len - 1].m_ucn;
1241   }
1242
1243   location_t current_ctx_loc ()
1244   {
1245     unsigned int len = vec.count ();
1246     gcc_checking_assert (len > 0);
1247     return vec[len - 1].m_loc;
1248   }
1249
1250   /* We've read a bidi char, update the current vector as necessary.
1251      LOC is only valid when K is not kind::NONE.  */
1252   void on_char (kind k, bool ucn_p, location_t loc)
1253   {
1254     switch (k)
1255       {
1256       case kind::LRE:
1257       case kind::RLE:
1258       case kind::LRO:
1259       case kind::RLO:
1260         vec.push (context (loc, k, true, ucn_p));
1261         break;
1262       case kind::LRI:
1263       case kind::RLI:
1264       case kind::FSI:
1265         vec.push (context (loc, k, false, ucn_p));
1266         break;
1267       /* PDF terminates the scope of the last LRE, RLE, LRO, or RLO
1268          whose scope has not yet been terminated.  */
1269       case kind::PDF:
1270         if (current_ctx () == kind::PDF)
1271           pop ();
1272         break;
1273       /* PDI terminates the scope of the last LRI, RLI, or FSI whose
1274          scope has not yet been terminated, as well as the scopes of
1275          any subsequent LREs, RLEs, LROs, or RLOs whose scopes have not
1276          yet been terminated.  */
1277       case kind::PDI:
1278         for (int i = vec.count () - 1; i >= 0; --i)
1279           if (pop_kind_at (i) == kind::PDI)
1280             {
1281               vec.truncate (i);
1282               break;
1283             }
1284         break;
1285       case kind::LTR:
1286       case kind::RTL:
1287         /* These aren't popped by a PDF/PDI.  */
1288         break;
1289       ATTR_LIKELY case kind::NONE:
1290         break;
1291       default:
1292         abort ();
1293       }
1294   }
1295
1296   /* Return a descriptive string for K.  */
1297   const char *to_str (kind k)
1298   {
1299     switch (k)
1300       {
1301       case kind::LRE:
1302         return "U+202A (LEFT-TO-RIGHT EMBEDDING)";
1303       case kind::RLE:
1304         return "U+202B (RIGHT-TO-LEFT EMBEDDING)";
1305       case kind::LRO:
1306         return "U+202D (LEFT-TO-RIGHT OVERRIDE)";
1307       case kind::RLO:
1308         return "U+202E (RIGHT-TO-LEFT OVERRIDE)";
1309       case kind::LRI:
1310         return "U+2066 (LEFT-TO-RIGHT ISOLATE)";
1311       case kind::RLI:
1312         return "U+2067 (RIGHT-TO-LEFT ISOLATE)";
1313       case kind::FSI:
1314         return "U+2068 (FIRST STRONG ISOLATE)";
1315       case kind::PDF:
1316         return "U+202C (POP DIRECTIONAL FORMATTING)";
1317       case kind::PDI:
1318         return "U+2069 (POP DIRECTIONAL ISOLATE)";
1319       case kind::LTR:
1320         return "U+200E (LEFT-TO-RIGHT MARK)";
1321       case kind::RTL:
1322         return "U+200F (RIGHT-TO-LEFT MARK)";
1323       default:
1324         abort ();
1325       }
1326   }
1327 }
1328
1329 /* Get location_t for the range of bytes [START, START + NUM_BYTES)
1330    within the current line in FILE, with the caret at START.  */
1331
1332 static location_t
1333 get_location_for_byte_range_in_cur_line (cpp_reader *pfile,
1334                                          const unsigned char *const start,
1335                                          size_t num_bytes)
1336 {
1337   gcc_checking_assert (num_bytes > 0);
1338
1339   /* CPP_BUF_COLUMN and linemap_position_for_column both refer
1340      to offsets in bytes, but CPP_BUF_COLUMN is 0-based,
1341      whereas linemap_position_for_column is 1-based.  */
1342
1343   /* Get 0-based offsets within the line.  */
1344   size_t start_offset = CPP_BUF_COLUMN (pfile->buffer, start);
1345   size_t end_offset = start_offset + num_bytes - 1;
1346
1347   /* Now convert to location_t, where "columns" are 1-based byte offsets.  */
1348   location_t start_loc = linemap_position_for_column (pfile->line_table,
1349                                                       start_offset + 1);
1350   location_t end_loc = linemap_position_for_column (pfile->line_table,
1351                                                      end_offset + 1);
1352
1353   if (start_loc == end_loc)
1354     return start_loc;
1355
1356   source_range src_range;
1357   src_range.m_start = start_loc;
1358   src_range.m_finish = end_loc;
1359   location_t combined_loc = COMBINE_LOCATION_DATA (pfile->line_table,
1360                                                    start_loc,
1361                                                    src_range,
1362                                                    NULL);
1363   return combined_loc;
1364 }
1365
1366 /* Parse a sequence of 3 bytes starting with P and return its bidi code.  */
1367
1368 static bidi::kind
1369 get_bidi_utf8_1 (const unsigned char *const p)
1370 {
1371   gcc_checking_assert (p[0] == bidi::utf8_start);
1372
1373   if (p[1] == 0x80)
1374     switch (p[2])
1375       {
1376       case 0xaa:
1377         return bidi::kind::LRE;
1378       case 0xab:
1379         return bidi::kind::RLE;
1380       case 0xac:
1381         return bidi::kind::PDF;
1382       case 0xad:
1383         return bidi::kind::LRO;
1384       case 0xae:
1385         return bidi::kind::RLO;
1386       case 0x8e:
1387         return bidi::kind::LTR;
1388       case 0x8f:
1389         return bidi::kind::RTL;
1390       default:
1391         break;
1392       }
1393   else if (p[1] == 0x81)
1394     switch (p[2])
1395       {
1396       case 0xa6:
1397         return bidi::kind::LRI;
1398       case 0xa7:
1399         return bidi::kind::RLI;
1400       case 0xa8:
1401         return bidi::kind::FSI;
1402       case 0xa9:
1403         return bidi::kind::PDI;
1404       default:
1405         break;
1406       }
1407
1408   return bidi::kind::NONE;
1409 }
1410
1411 /* Parse a sequence of 3 bytes starting with P and return its bidi code.
1412    If the kind is not NONE, write the location to *OUT.*/
1413
1414 static bidi::kind
1415 get_bidi_utf8 (cpp_reader *pfile, const unsigned char *const p, location_t *out)
1416 {
1417   bidi::kind result = get_bidi_utf8_1 (p);
1418   if (result != bidi::kind::NONE)
1419     {
1420       /* We have a sequence of 3 bytes starting at P.  */
1421       *out = get_location_for_byte_range_in_cur_line (pfile, p, 3);
1422     }
1423   return result;
1424 }
1425
1426 /* Parse a UCN where P points just past \u or \U and return its bidi code.  */
1427
1428 static bidi::kind
1429 get_bidi_ucn_1 (const unsigned char *p, bool is_U)
1430 {
1431   /* 6.4.3 Universal Character Names
1432       \u hex-quad
1433       \U hex-quad hex-quad
1434      where \unnnn means \U0000nnnn.  */
1435
1436   if (is_U)
1437     {
1438       if (p[0] != '0' || p[1] != '0' || p[2] != '0' || p[3] != '0')
1439         return bidi::kind::NONE;
1440       /* Skip 4B so we can treat \u and \U the same below.  */
1441       p += 4;
1442     }
1443
1444   /* All code points we are looking for start with 20xx.  */
1445   if (p[0] != '2' || p[1] != '0')
1446     return bidi::kind::NONE;
1447   else if (p[2] == '2')
1448     switch (p[3])
1449       {
1450       case 'a':
1451       case 'A':
1452         return bidi::kind::LRE;
1453       case 'b':
1454       case 'B':
1455         return bidi::kind::RLE;
1456       case 'c':
1457       case 'C':
1458         return bidi::kind::PDF;
1459       case 'd':
1460       case 'D':
1461         return bidi::kind::LRO;
1462       case 'e':
1463       case 'E':
1464         return bidi::kind::RLO;
1465       default:
1466         break;
1467       }
1468   else if (p[2] == '6')
1469     switch (p[3])
1470       {
1471       case '6':
1472         return bidi::kind::LRI;
1473       case '7':
1474         return bidi::kind::RLI;
1475       case '8':
1476         return bidi::kind::FSI;
1477       case '9':
1478         return bidi::kind::PDI;
1479       default:
1480         break;
1481       }
1482   else if (p[2] == '0')
1483     switch (p[3])
1484       {
1485       case 'e':
1486       case 'E':
1487         return bidi::kind::LTR;
1488       case 'f':
1489       case 'F':
1490         return bidi::kind::RTL;
1491       default:
1492         break;
1493       }
1494
1495   return bidi::kind::NONE;
1496 }
1497
1498 /* Parse a UCN where P points just past \u or \U and return its bidi code.
1499    If the kind is not NONE, write the location to *OUT.*/
1500
1501 static bidi::kind
1502 get_bidi_ucn (cpp_reader *pfile,  const unsigned char *p, bool is_U,
1503               location_t *out)
1504 {
1505   bidi::kind result = get_bidi_ucn_1 (p, is_U);
1506   if (result != bidi::kind::NONE)
1507     {
1508       const unsigned char *start = p - 2;
1509       size_t num_bytes = 2 + (is_U ? 8 : 4);
1510       *out = get_location_for_byte_range_in_cur_line (pfile, start, num_bytes);
1511     }
1512   return result;
1513 }
1514
1515 /* Subclass of rich_location for reporting on unpaired UTF-8
1516    bidirectional control character(s).
1517    Escape the source lines on output, and show all unclosed
1518    bidi context, labelling everything.  */
1519
1520 class unpaired_bidi_rich_location : public rich_location
1521 {
1522  public:
1523   class custom_range_label : public range_label
1524   {
1525    public:
1526      label_text get_text (unsigned range_idx) const FINAL OVERRIDE
1527      {
1528        /* range 0 is the primary location; each subsequent range i + 1
1529           is for bidi::vec[i].  */
1530        if (range_idx > 0)
1531          {
1532            const bidi::context &ctxt (bidi::vec[range_idx - 1]);
1533            return label_text::borrow (bidi::to_str (ctxt.m_kind));
1534          }
1535        else
1536          return label_text::borrow (_("end of bidirectional context"));
1537      }
1538   };
1539
1540   unpaired_bidi_rich_location (cpp_reader *pfile, location_t loc)
1541   : rich_location (pfile->line_table, loc, &m_custom_label)
1542   {
1543     set_escape_on_output (true);
1544     for (unsigned i = 0; i < bidi::vec.count (); i++)
1545       add_range (bidi::vec[i].m_loc,
1546                  SHOW_RANGE_WITHOUT_CARET,
1547                  &m_custom_label);
1548   }
1549
1550  private:
1551    custom_range_label m_custom_label;
1552 };
1553
1554 /* We're closing a bidi context, that is, we've encountered a newline,
1555    are closing a C-style comment, or are at the end of a string literal,
1556    character constant, or identifier.  Warn if this context was not
1557    properly terminated by a PDI or PDF.  P points to the last character
1558    in this context.  */
1559
1560 static void
1561 maybe_warn_bidi_on_close (cpp_reader *pfile, const uchar *p)
1562 {
1563   if (CPP_OPTION (pfile, cpp_warn_bidirectional) == bidirectional_unpaired
1564       && bidi::vec.count () > 0)
1565     {
1566       const location_t loc
1567         = linemap_position_for_column (pfile->line_table,
1568                                        CPP_BUF_COLUMN (pfile->buffer, p));
1569       unpaired_bidi_rich_location rich_loc (pfile, loc);
1570       /* cpp_callbacks doesn't yet have a way to handle singular vs plural
1571          forms of a diagnostic, so fake it for now.  */
1572       if (bidi::vec.count () > 1)
1573         cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
1574                         "unpaired UTF-8 bidirectional control characters "
1575                         "detected");
1576       else
1577         cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
1578                         "unpaired UTF-8 bidirectional control character "
1579                         "detected");
1580     }
1581   /* We're done with this context.  */
1582   bidi::on_close ();
1583 }
1584
1585 /* We're at the beginning or in the middle of an identifier/comment/string
1586    literal/character constant.  Warn if we've encountered a bidi character.
1587    KIND says which bidi control character it was; UCN_P is true iff this bidi
1588    control character was written as a UCN.  LOC is the location of the
1589    character, but is only valid if KIND != bidi::kind::NONE.  */
1590
1591 static void
1592 maybe_warn_bidi_on_char (cpp_reader *pfile, bidi::kind kind,
1593                          bool ucn_p, location_t loc)
1594 {
1595   if (__builtin_expect (kind == bidi::kind::NONE, 1))
1596     return;
1597
1598   const auto warn_bidi = CPP_OPTION (pfile, cpp_warn_bidirectional);
1599
1600   if (warn_bidi != bidirectional_none)
1601     {
1602       rich_location rich_loc (pfile->line_table, loc);
1603       rich_loc.set_escape_on_output (true);
1604
1605       /* It seems excessive to warn about a PDI/PDF that is closing
1606          an opened context because we've already warned about the
1607          opening character.  Except warn when we have a UCN x UTF-8
1608          mismatch.  */
1609       if (kind == bidi::current_ctx ())
1610         {
1611           if (warn_bidi == bidirectional_unpaired
1612               && bidi::current_ctx_ucn_p () != ucn_p)
1613             {
1614               rich_loc.add_range (bidi::current_ctx_loc ());
1615               cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
1616                               "UTF-8 vs UCN mismatch when closing "
1617                               "a context by \"%s\"", bidi::to_str (kind));
1618             }
1619         }
1620       else if (warn_bidi == bidirectional_any)
1621         {
1622           if (kind == bidi::kind::PDF || kind == bidi::kind::PDI)
1623             cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
1624                             "\"%s\" is closing an unopened context",
1625                             bidi::to_str (kind));
1626           else
1627             cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
1628                             "found problematic Unicode character \"%s\"",
1629                             bidi::to_str (kind));
1630         }
1631     }
1632   /* We're done with this context.  */
1633   bidi::on_char (kind, ucn_p, loc);
1634 }
1635
1636 /* Skip a C-style block comment.  We find the end of the comment by
1637    seeing if an asterisk is before every '/' we encounter.  Returns
1638    nonzero if comment terminated by EOF, zero otherwise.
1639
1640    Buffer->cur points to the initial asterisk of the comment.  */
1641 bool
1642 _cpp_skip_block_comment (cpp_reader *pfile)
1643 {
1644   cpp_buffer *buffer = pfile->buffer;
1645   const uchar *cur = buffer->cur;
1646   uchar c;
1647   const bool warn_bidi_p = pfile->warn_bidi_p ();
1648
1649   cur++;
1650   if (*cur == '/')
1651     cur++;
1652
1653   for (;;)
1654     {
1655       /* People like decorating comments with '*', so check for '/'
1656          instead for efficiency.  */
1657       c = *cur++;
1658
1659       if (c == '/')
1660         {
1661           if (cur[-2] == '*')
1662             {
1663               if (warn_bidi_p)
1664                 maybe_warn_bidi_on_close (pfile, cur);
1665               break;
1666             }
1667
1668           /* Warn about potential nested comments, but not if the '/'
1669              comes immediately before the true comment delimiter.
1670              Don't bother to get it right across escaped newlines.  */
1671           if (CPP_OPTION (pfile, warn_comments)
1672               && cur[0] == '*' && cur[1] != '/')
1673             {
1674               buffer->cur = cur;
1675               cpp_warning_with_line (pfile, CPP_W_COMMENTS,
1676                                      pfile->line_table->highest_line,
1677                                      CPP_BUF_COL (buffer),
1678                                      "\"/*\" within comment");
1679             }
1680         }
1681       else if (c == '\n')
1682         {
1683           unsigned int cols;
1684           buffer->cur = cur - 1;
1685           if (warn_bidi_p)
1686             maybe_warn_bidi_on_close (pfile, cur);
1687           _cpp_process_line_notes (pfile, true);
1688           if (buffer->next_line >= buffer->rlimit)
1689             return true;
1690           _cpp_clean_line (pfile);
1691
1692           cols = buffer->next_line - buffer->line_base;
1693           CPP_INCREMENT_LINE (pfile, cols);
1694
1695           cur = buffer->cur;
1696         }
1697       /* If this is a beginning of a UTF-8 encoding, it might be
1698          a bidirectional control character.  */
1699       else if (__builtin_expect (c == bidi::utf8_start, 0) && warn_bidi_p)
1700         {
1701           location_t loc;
1702           bidi::kind kind = get_bidi_utf8 (pfile, cur - 1, &loc);
1703           maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
1704         }
1705     }
1706
1707   buffer->cur = cur;
1708   _cpp_process_line_notes (pfile, true);
1709   return false;
1710 }
1711
1712 /* Skip a C++ line comment, leaving buffer->cur pointing to the
1713    terminating newline.  Handles escaped newlines.  Returns nonzero
1714    if a multiline comment.  */
1715 static int
1716 skip_line_comment (cpp_reader *pfile)
1717 {
1718   cpp_buffer *buffer = pfile->buffer;
1719   location_t orig_line = pfile->line_table->highest_line;
1720   const bool warn_bidi_p = pfile->warn_bidi_p ();
1721
1722   if (!warn_bidi_p)
1723     while (*buffer->cur != '\n')
1724       buffer->cur++;
1725   else
1726     {
1727       while (*buffer->cur != '\n'
1728              && *buffer->cur != bidi::utf8_start)
1729         buffer->cur++;
1730       if (__builtin_expect (*buffer->cur == bidi::utf8_start, 0))
1731         {
1732           while (*buffer->cur != '\n')
1733             {
1734               if (__builtin_expect (*buffer->cur == bidi::utf8_start, 0))
1735                 {
1736                   location_t loc;
1737                   bidi::kind kind = get_bidi_utf8 (pfile, buffer->cur, &loc);
1738                   maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
1739                 }
1740               buffer->cur++;
1741             }
1742           maybe_warn_bidi_on_close (pfile, buffer->cur);
1743         }
1744     }
1745
1746   _cpp_process_line_notes (pfile, true);
1747   return orig_line != pfile->line_table->highest_line;
1748 }
1749
1750 /* Skips whitespace, saving the next non-whitespace character.  */
1751 static void
1752 skip_whitespace (cpp_reader *pfile, cppchar_t c)
1753 {
1754   cpp_buffer *buffer = pfile->buffer;
1755   bool saw_NUL = false;
1756
1757   do
1758     {
1759       /* Horizontal space always OK.  */
1760       if (c == ' ' || c == '\t')
1761         ;
1762       /* Just \f \v or \0 left.  */
1763       else if (c == '\0')
1764         saw_NUL = true;
1765       else if (pfile->state.in_directive && CPP_PEDANTIC (pfile))
1766         cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line,
1767                              CPP_BUF_COL (buffer),
1768                              "%s in preprocessing directive",
1769                              c == '\f' ? "form feed" : "vertical tab");
1770
1771       c = *buffer->cur++;
1772     }
1773   /* We only want non-vertical space, i.e. ' ' \t \f \v \0.  */
1774   while (is_nvspace (c));
1775
1776   if (saw_NUL)
1777     {
1778       encoding_rich_location rich_loc (pfile);
1779       cpp_error_at (pfile, CPP_DL_WARNING, &rich_loc,
1780                     "null character(s) ignored");
1781     }
1782
1783   buffer->cur--;
1784 }
1785
1786 /* See if the characters of a number token are valid in a name (no
1787    '.', '+' or '-').  */
1788 static int
1789 name_p (cpp_reader *pfile, const cpp_string *string)
1790 {
1791   unsigned int i;
1792
1793   for (i = 0; i < string->len; i++)
1794     if (!is_idchar (string->text[i]))
1795       return 0;
1796
1797   return 1;
1798 }
1799
1800 /* After parsing an identifier or other sequence, produce a warning about
1801    sequences not in NFC/NFKC.  */
1802 static void
1803 warn_about_normalization (cpp_reader *pfile,
1804                           const cpp_token *token,
1805                           const struct normalize_state *s)
1806 {
1807   if (CPP_OPTION (pfile, warn_normalize) < NORMALIZE_STATE_RESULT (s)
1808       && !pfile->state.skipping)
1809     {
1810       location_t loc = token->src_loc;
1811
1812       /* If possible, create a location range for the token.  */
1813       if (loc >= RESERVED_LOCATION_COUNT
1814           && token->type != CPP_EOF
1815           /* There must be no line notes to process.  */
1816           && (!(pfile->buffer->cur
1817                 >= pfile->buffer->notes[pfile->buffer->cur_note].pos
1818                 && !pfile->overlaid_buffer)))
1819         {
1820           source_range tok_range;
1821           tok_range.m_start = loc;
1822           tok_range.m_finish
1823             = linemap_position_for_column (pfile->line_table,
1824                                            CPP_BUF_COLUMN (pfile->buffer,
1825                                                            pfile->buffer->cur));
1826           loc = COMBINE_LOCATION_DATA (pfile->line_table,
1827                                        loc, tok_range, NULL);
1828         }
1829
1830       encoding_rich_location rich_loc (pfile, loc);
1831
1832       /* Make sure that the token is printed using UCNs, even
1833          if we'd otherwise happily print UTF-8.  */
1834       unsigned char *buf = XNEWVEC (unsigned char, cpp_token_len (token));
1835       size_t sz;
1836
1837       sz = cpp_spell_token (pfile, token, buf, false) - buf;
1838       if (NORMALIZE_STATE_RESULT (s) == normalized_C)
1839         cpp_warning_at (pfile, CPP_W_NORMALIZE, &rich_loc,
1840                         "`%.*s' is not in NFKC", (int) sz, buf);
1841       else if (CPP_OPTION (pfile, cplusplus))
1842         cpp_pedwarning_at (pfile, CPP_W_NORMALIZE, &rich_loc,
1843                                   "`%.*s' is not in NFC", (int) sz, buf);
1844       else
1845         cpp_warning_at (pfile, CPP_W_NORMALIZE, &rich_loc,
1846                         "`%.*s' is not in NFC", (int) sz, buf);
1847       free (buf);
1848     }
1849 }
1850
/* Threshold used by forms_identifier_p: a byte at or above this value
   is checked as the start of a UTF-8 sequence.  */
static const cppchar_t utf8_signifier = 0xC0;
1852
1853 /* Returns TRUE if the sequence starting at buffer->cur is valid in
1854    an identifier.  FIRST is TRUE if this starts an identifier.  */
1855
1856 static bool
1857 forms_identifier_p (cpp_reader *pfile, int first,
1858                     struct normalize_state *state)
1859 {
1860   cpp_buffer *buffer = pfile->buffer;
1861   const bool warn_bidi_p = pfile->warn_bidi_p ();
1862
1863   if (*buffer->cur == '$')
1864     {
1865       if (!CPP_OPTION (pfile, dollars_in_ident))
1866         return false;
1867
1868       buffer->cur++;
1869       if (CPP_OPTION (pfile, warn_dollars) && !pfile->state.skipping)
1870         {
1871           CPP_OPTION (pfile, warn_dollars) = 0;
1872           cpp_error (pfile, CPP_DL_PEDWARN, "'$' in identifier or number");
1873         }
1874
1875       return true;
1876     }
1877
1878   /* Is this a syntactically valid UCN or a valid UTF-8 char?  */
1879   if (CPP_OPTION (pfile, extended_identifiers))
1880     {
1881       cppchar_t s;
1882       if (*buffer->cur >= utf8_signifier)
1883         {
1884           if (__builtin_expect (*buffer->cur == bidi::utf8_start, 0)
1885               && warn_bidi_p)
1886             {
1887               location_t loc;
1888               bidi::kind kind = get_bidi_utf8 (pfile, buffer->cur, &loc);
1889               maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
1890             }
1891           if (_cpp_valid_utf8 (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
1892                                state, &s))
1893             return true;
1894         }
1895       else if (*buffer->cur == '\\'
1896                && (buffer->cur[1] == 'u' || buffer->cur[1] == 'U'))
1897         {
1898           buffer->cur += 2;
1899           if (warn_bidi_p)
1900             {
1901               location_t loc;
1902               bidi::kind kind = get_bidi_ucn (pfile,
1903                                               buffer->cur,
1904                                               buffer->cur[-1] == 'U',
1905                                               &loc);
1906               maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/true, loc);
1907             }
1908           if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
1909                               state, &s, NULL, NULL))
1910             return true;
1911           buffer->cur -= 2;
1912         }
1913     }
1914
1915   return false;
1916 }
1917
1918 /* Helper function to issue error about improper __VA_OPT__ use.  */
1919 static void
1920 maybe_va_opt_error (cpp_reader *pfile)
1921 {
1922   if (CPP_PEDANTIC (pfile) && !CPP_OPTION (pfile, va_opt))
1923     {
1924       /* __VA_OPT__ should not be accepted at all, but allow it in
1925          system headers.  */
1926       if (!_cpp_in_system_header (pfile))
1927         cpp_error (pfile, CPP_DL_PEDWARN,
1928                    "__VA_OPT__ is not available until C++20");
1929     }
1930   else if (!pfile->state.va_args_ok)
1931     {
1932       /* __VA_OPT__ should only appear in the replacement list of a
1933          variadic macro.  */
1934       cpp_error (pfile, CPP_DL_PEDWARN,
1935                  "__VA_OPT__ can only appear in the expansion"
1936                  " of a C++20 variadic macro");
1937     }
1938 }
1939
1940 /* Helper function to get the cpp_hashnode of the identifier BASE.  */
1941 static cpp_hashnode *
1942 lex_identifier_intern (cpp_reader *pfile, const uchar *base)
1943 {
1944   cpp_hashnode *result;
1945   const uchar *cur;
1946   unsigned int len;
1947   unsigned int hash = HT_HASHSTEP (0, *base);
1948
1949   cur = base + 1;
1950   while (ISIDNUM (*cur))
1951     {
1952       hash = HT_HASHSTEP (hash, *cur);
1953       cur++;
1954     }
1955   len = cur - base;
1956   hash = HT_HASHFINISH (hash, len);
1957   result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
1958                                               base, len, hash, HT_ALLOC));
1959
1960   /* Rarely, identifiers require diagnostics when lexed.  */
1961   if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
1962                         && !pfile->state.skipping, 0))
1963     {
1964       /* It is allowed to poison the same identifier twice.  */
1965       if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
1966         cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
1967                    NODE_NAME (result));
1968
1969       /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
1970          replacement list of a variadic macro.  */
1971       if (result == pfile->spec_nodes.n__VA_ARGS__
1972           && !pfile->state.va_args_ok)
1973         {
1974           if (CPP_OPTION (pfile, cplusplus))
1975             cpp_error (pfile, CPP_DL_PEDWARN,
1976                        "__VA_ARGS__ can only appear in the expansion"
1977                        " of a C++11 variadic macro");
1978           else
1979             cpp_error (pfile, CPP_DL_PEDWARN,
1980                        "__VA_ARGS__ can only appear in the expansion"
1981                        " of a C99 variadic macro");
1982         }
1983
1984       if (result == pfile->spec_nodes.n__VA_OPT__)
1985         maybe_va_opt_error (pfile);
1986
1987       /* For -Wc++-compat, warn about use of C++ named operators.  */
1988       if (result->flags & NODE_WARN_OPERATOR)
1989         cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
1990                      "identifier \"%s\" is a special operator name in C++",
1991                      NODE_NAME (result));
1992     }
1993
1994   return result;
1995 }
1996
1997 /* Get the cpp_hashnode of an identifier specified by NAME in
1998    the current cpp_reader object.  If none is found, NULL is returned.  */
1999 cpp_hashnode *
2000 _cpp_lex_identifier (cpp_reader *pfile, const char *name)
2001 {
2002   cpp_hashnode *result;
2003   result = lex_identifier_intern (pfile, (uchar *) name);
2004   return result;
2005 }
2006
2007 /* Lex an identifier starting at BUFFER->CUR - 1.  */
2008 static cpp_hashnode *
2009 lex_identifier (cpp_reader *pfile, const uchar *base, bool starts_ucn,
2010                 struct normalize_state *nst, cpp_hashnode **spelling)
2011 {
2012   cpp_hashnode *result;
2013   const uchar *cur;
2014   unsigned int len;
2015   unsigned int hash = HT_HASHSTEP (0, *base);
2016   const bool warn_bidi_p = pfile->warn_bidi_p ();
2017
2018   cur = pfile->buffer->cur;
2019   if (! starts_ucn)
2020     {
2021       while (ISIDNUM (*cur))
2022         {
2023           hash = HT_HASHSTEP (hash, *cur);
2024           cur++;
2025         }
2026       NORMALIZE_STATE_UPDATE_IDNUM (nst, *(cur - 1));
2027     }
2028   pfile->buffer->cur = cur;
2029   if (starts_ucn || forms_identifier_p (pfile, false, nst))
2030     {
2031       /* Slower version for identifiers containing UCNs
2032          or extended chars (including $).  */
2033       do {
2034         while (ISIDNUM (*pfile->buffer->cur))
2035           {
2036             NORMALIZE_STATE_UPDATE_IDNUM (nst, *pfile->buffer->cur);
2037             pfile->buffer->cur++;
2038           }
2039       } while (forms_identifier_p (pfile, false, nst));
2040       if (warn_bidi_p)
2041         maybe_warn_bidi_on_close (pfile, pfile->buffer->cur);
2042       result = _cpp_interpret_identifier (pfile, base,
2043                                           pfile->buffer->cur - base);
2044       *spelling = cpp_lookup (pfile, base, pfile->buffer->cur - base);
2045     }
2046   else
2047     {
2048       len = cur - base;
2049       hash = HT_HASHFINISH (hash, len);
2050
2051       result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
2052                                                   base, len, hash, HT_ALLOC));
2053       *spelling = result;
2054     }
2055
2056   /* Rarely, identifiers require diagnostics when lexed.  */
2057   if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
2058                         && !pfile->state.skipping, 0))
2059     {
2060       /* It is allowed to poison the same identifier twice.  */
2061       if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
2062         cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
2063                    NODE_NAME (result));
2064
2065       /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
2066          replacement list of a variadic macro.  */
2067       if (result == pfile->spec_nodes.n__VA_ARGS__
2068           && !pfile->state.va_args_ok)
2069         {
2070           if (CPP_OPTION (pfile, cplusplus))
2071             cpp_error (pfile, CPP_DL_PEDWARN,
2072                        "__VA_ARGS__ can only appear in the expansion"
2073                        " of a C++11 variadic macro");
2074           else
2075             cpp_error (pfile, CPP_DL_PEDWARN,
2076                        "__VA_ARGS__ can only appear in the expansion"
2077                        " of a C99 variadic macro");
2078         }
2079
2080       /* __VA_OPT__ should only appear in the replacement list of a
2081          variadic macro.  */
2082       if (result == pfile->spec_nodes.n__VA_OPT__)
2083         maybe_va_opt_error (pfile);
2084
2085       /* For -Wc++-compat, warn about use of C++ named operators.  */
2086       if (result->flags & NODE_WARN_OPERATOR)
2087         cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
2088                      "identifier \"%s\" is a special operator name in C++",
2089                      NODE_NAME (result));
2090     }
2091
2092   return result;
2093 }
2094
2095 /* Lex a number to NUMBER starting at BUFFER->CUR - 1.  */
2096 static void
2097 lex_number (cpp_reader *pfile, cpp_string *number,
2098             struct normalize_state *nst)
2099 {
2100   const uchar *cur;
2101   const uchar *base;
2102   uchar *dest;
2103
2104   base = pfile->buffer->cur - 1;
2105   do
2106     {
2107       const uchar *adj_digit_sep = NULL;
2108       cur = pfile->buffer->cur;
2109
2110       /* N.B. ISIDNUM does not include $.  */
2111       while (ISIDNUM (*cur)
2112              || (*cur == '.' && !DIGIT_SEP (cur[-1]))
2113              || DIGIT_SEP (*cur)
2114              || (VALID_SIGN (*cur, cur[-1]) && !DIGIT_SEP (cur[-2])))
2115         {
2116           NORMALIZE_STATE_UPDATE_IDNUM (nst, *cur);
2117           /* Adjacent digit separators do not form part of the pp-number syntax.
2118              However, they can safely be diagnosed here as an error, since '' is
2119              not a valid preprocessing token.  */
2120           if (DIGIT_SEP (*cur) && DIGIT_SEP (cur[-1]) && !adj_digit_sep)
2121             adj_digit_sep = cur;
2122           cur++;
2123         }
2124       /* A number can't end with a digit separator.  */
2125       while (cur > pfile->buffer->cur && DIGIT_SEP (cur[-1]))
2126         --cur;
2127       if (adj_digit_sep && adj_digit_sep < cur)
2128         cpp_error (pfile, CPP_DL_ERROR, "adjacent digit separators");
2129
2130       pfile->buffer->cur = cur;
2131     }
2132   while (forms_identifier_p (pfile, false, nst));
2133
2134   number->len = cur - base;
2135   dest = _cpp_unaligned_alloc (pfile, number->len + 1);
2136   memcpy (dest, base, number->len);
2137   dest[number->len] = '\0';
2138   number->text = dest;
2139 }
2140
2141 /* Create a token of type TYPE with a literal spelling.  */
2142 static void
2143 create_literal (cpp_reader *pfile, cpp_token *token, const uchar *base,
2144                 unsigned int len, enum cpp_ttype type)
2145 {
2146   token->type = type;
2147   token->val.str.len = len;
2148   token->val.str.text = cpp_alloc_token_string (pfile, base, len);
2149 }
2150
2151 const uchar *
2152 cpp_alloc_token_string (cpp_reader *pfile,
2153                         const unsigned char *ptr, unsigned len)
2154 {
2155   uchar *dest = _cpp_unaligned_alloc (pfile, len + 1);
2156
2157   dest[len] = 0;
2158   memcpy (dest, ptr, len);
2159   return dest;
2160 }
2161
2162 /* A pair of raw buffer pointers.  The currently open one is [1], the
2163    first one is [0].  Used for string literal lexing.  */
2164 struct lit_accum {
2165   _cpp_buff *first;
2166   _cpp_buff *last;
2167   const uchar *rpos;
2168   size_t accum;
2169
2170   lit_accum ()
2171     : first (NULL), last (NULL), rpos (0), accum (0)
2172   {
2173   }
2174
2175   void append (cpp_reader *, const uchar *, size_t);
2176
2177   void read_begin (cpp_reader *);
2178   bool reading_p () const
2179   {
2180     return rpos != NULL;
2181   }
2182   char read_char ()
2183   {
2184     char c = *rpos++;
2185     if (rpos == BUFF_FRONT (last))
2186       rpos = NULL;
2187     return c;
2188   }
2189 };
2190
2191 /* Subroutine of lex_raw_string: Append LEN chars from BASE to the buffer
2192    sequence from *FIRST_BUFF_P to LAST_BUFF_P.  */
2193
2194 void
2195 lit_accum::append (cpp_reader *pfile, const uchar *base, size_t len)
2196 {
2197   if (!last)
2198     /* Starting.  */
2199     first = last = _cpp_get_buff (pfile, len);
2200   else if (len > BUFF_ROOM (last))
2201     {
2202       /* There is insufficient room in the buffer.  Copy what we can,
2203          and then either extend or create a new one.  */
2204       size_t room = BUFF_ROOM (last);
2205       memcpy (BUFF_FRONT (last), base, room);
2206       BUFF_FRONT (last) += room;
2207       base += room;
2208       len -= room;
2209       accum += room;
2210
2211       gcc_checking_assert (!rpos);
2212
2213       last = _cpp_append_extend_buff (pfile, last, len);
2214     }
2215
2216   memcpy (BUFF_FRONT (last), base, len);
2217   BUFF_FRONT (last) += len;
2218   accum += len;
2219 }
2220
2221 void
2222 lit_accum::read_begin (cpp_reader *pfile)
2223 {
2224   /* We never accumulate more than 4 chars to read.  */
2225   if (BUFF_ROOM (last) < 4)
2226
2227     last = _cpp_append_extend_buff (pfile, last, 4);
2228   rpos = BUFF_FRONT (last);
2229 }
2230
2231 /* Returns true if a macro has been defined.
2232    This might not work if compile with -save-temps,
2233    or preprocess separately from compilation.  */
2234
2235 static bool
2236 is_macro(cpp_reader *pfile, const uchar *base)
2237 {
2238   const uchar *cur = base;
2239   if (! ISIDST (*cur))
2240     return false;
2241   unsigned int hash = HT_HASHSTEP (0, *cur);
2242   ++cur;
2243   while (ISIDNUM (*cur))
2244     {
2245       hash = HT_HASHSTEP (hash, *cur);
2246       ++cur;
2247     }
2248   hash = HT_HASHFINISH (hash, cur - base);
2249
2250   cpp_hashnode *result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
2251                                         base, cur - base, hash, HT_NO_INSERT));
2252
2253   return result && cpp_macro_p (result);
2254 }
2255
2256 /* Returns true if a literal suffix does not have the expected form
2257    and is defined as a macro.  */
2258
2259 static bool
2260 is_macro_not_literal_suffix(cpp_reader *pfile, const uchar *base)
2261 {
2262   /* User-defined literals outside of namespace std must start with a single
2263      underscore, so assume anything of that form really is a UDL suffix.
2264      We don't need to worry about UDLs defined inside namespace std because
2265      their names are reserved, so cannot be used as macro names in valid
2266      programs.  */
2267   if (base[0] == '_' && base[1] != '_')
2268     return false;
2269   return is_macro (pfile, base);
2270 }
2271
/* Lexes a raw string.  The stored string contains the spelling,
   including double quotes, delimiter string, '(' and ')', any leading
   'L', 'u', 'U' or 'u8' and 'R' modifier.  The created token contains
   the type of the literal, or CPP_OTHER if it was not properly
   terminated.

   BASE is the start of the token.  Updates pfile->buffer->cur to just
   after the lexed string.

   The spelling is NUL-terminated, but it is not guaranteed that this
   is the first NUL since embedded NULs are preserved.

   The scan runs in phases.  First the delimiter up to the '(' is
   collected.  Then the body is scanned, and after every ')' the
   following characters are compared against the delimiter plus the
   closing '"'.  Escaped newlines and trigraphs that earlier processing
   recorded as line notes are put back into the spelling as they are
   met.  When the body runs over the end of a line, the text so far is
   saved in an accumulator and a fresh line is fetched.  If the file
   ends first, TOKEN becomes CPP_EOF and the buffer is popped.  */

static void
lex_raw_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
{
  const uchar *pos = base;
  const bool warn_bidi_p = pfile->warn_bidi_p ();

  /* 'tis a pity this information isn't passed down from the lexer's
     initial categorization of the token.  */
  enum cpp_ttype type = CPP_STRING;

  /* Work out the literal type from the encoding prefix and step over
     the prefix.  */
  if (*pos == 'L')
    {
      type = CPP_WSTRING;
      pos++;
    }
  else if (*pos == 'U')
    {
      type = CPP_STRING32;
      pos++;
    }
  else if (*pos == 'u')
    {
      if (pos[1] == '8')
        {
          type = CPP_UTF8STRING;
          pos++;
        }
      else
        type = CPP_STRING16;
      pos++;
    }

  gcc_checking_assert (pos[0] == 'R' && pos[1] == '"');
  pos += 2;

  _cpp_line_note *note = &pfile->buffer->notes[pfile->buffer->cur_note];

  /* Skip notes before the ".  */
  while (note->pos < pos)
    ++note;

  /* Text that cannot be taken straight from the current line buffer
     (restored notes, earlier lines) is collected here.  */
  lit_accum accum;

  /* The delimiter (at most 16 characters) followed by the closing '"',
     which is added once the '(' is seen.  */
  uchar prefix[17];
  unsigned prefix_len = 0;
  /* PHASE_PREFIX: still collecting the delimiter.  PHASE_NONE: in the
     body, not currently matching.  Values from PHASE_SUFFIX upwards:
     index into PREFIX of the next character expected while matching
     the closing sequence after a ')'.  */
  enum Phase
  {
   PHASE_PREFIX = -2,
   PHASE_NONE = -1,
   PHASE_SUFFIX = 0
  } phase = PHASE_PREFIX;

  for (;;)
    {
      gcc_checking_assert (note->pos >= pos);

      /* Undo any escaped newlines and trigraphs.  */
      if (!accum.reading_p () && note->pos == pos)
        switch (note->type)
          {
          case '\\':
          case ' ':
            /* Restore backslash followed by newline.  */
            accum.append (pfile, base, pos - base);
            base = pos;
            /* Everything appended from here on is read back through
               accum.read_char below, before the line buffer is read
               again.  */
            accum.read_begin (pfile);
            accum.append (pfile, UC"\\", 1);

          after_backslash:
            if (note->type == ' ')
              /* GNU backslash whitespace newline extension.  FIXME
                 could be any sequence of non-vertical space.  When we
                 can properly restore any such sequence, we should
                 mark this note as handled so _cpp_process_line_notes
                 doesn't warn.  */
              accum.append (pfile, UC" ", 1);

            accum.append (pfile, UC"\n", 1);
            note++;
            break;

          case '\n':
            /* This can happen for ??/<NEWLINE> when trigraphs are not
               being interpreted.  */
            gcc_checking_assert (!CPP_OPTION (pfile, trigraphs));
            note->type = 0;
            note++;
            break;

          default:
            gcc_checking_assert (_cpp_trigraph_map[note->type]);

            /* Don't warn about this trigraph in
               _cpp_process_line_notes, since trigraphs show up as
               trigraphs in raw strings.  */
            uchar type = note->type;
            note->type = 0;

            if (CPP_OPTION (pfile, trigraphs))
              {
                /* Put the three-character trigraph spelling back in
                   place of its replacement character.  */
                accum.append (pfile, base, pos - base);
                base = pos;
                accum.read_begin (pfile);
                accum.append (pfile, UC"??", 2);
                accum.append (pfile, &type, 1);

                /* ??/ followed by newline gets two line notes, one for
                   the trigraph and one for the backslash/newline.  */
                if (type == '/' && note[1].pos == pos)
                  {
                    note++;
                    gcc_assert (note->type == '\\' || note->type == ' ');
                    goto after_backslash;
                  }
                /* Skip the replacement character.  */
                base = ++pos;
              }

            note++;
            break;
          }

      /* Now get a char to process.  Either from an expanded note, or
         from the line buffer.  */
      bool read_note = accum.reading_p ();
      char c = read_note ? accum.read_char () : *pos++;

      if (phase == PHASE_PREFIX)
        {
          if (c == '(')
            {
              /* Done.  */
              phase = PHASE_NONE;
              /* The closing quote is matched as the last character of
                 the delimiter.  */
              prefix[prefix_len++] = '"';
            }
          else if (prefix_len < 16
                   /* Prefix chars are any of the basic character set,
                      [lex.charset] except for '
                      ()\\\t\v\f\n'. Optimized for a contiguous
                      alphabet.  */
                   /* Unlike a switch, this collapses down to one or
                      two shift and bitmask operations on an ASCII
                      system, with an outlier or two.   */
                   && (('Z' - 'A' == 25
                        ? ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'))
                        : ISIDST (c))
                       || (c >= '0' && c <= '9')
                       || c == '_' || c == '{' || c == '}'
                       || c == '[' || c == ']' || c == '#'
                       || c == '<' || c == '>' || c == '%'
                       || c == ':' || c == ';' || c == '.' || c == '?'
                       || c == '*' || c == '+' || c == '-' || c == '/'
                       || c == '^' || c == '&' || c == '|' || c == '~'
                       || c == '!' || c == '=' || c == ','
                       || c == '"' || c == '\''))
            prefix[prefix_len++] = c;
          else
            {
              /* Something is wrong.  */
              int col = CPP_BUF_COLUMN (pfile->buffer, pos) + read_note;
              if (prefix_len == 16)
                cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
                                     col, "raw string delimiter longer "
                                     "than 16 characters");
              else if (c == '\n')
                cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
                                     col, "invalid new-line in raw "
                                     "string delimiter");
              else
                cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
                                     col, "invalid character '%c' in "
                                     "raw string delimiter", c);
              type = CPP_OTHER;
              phase = PHASE_NONE;
              /* Continue until we get a close quote, that's probably
                 the best failure mode.  */
              prefix_len = 0;
            }
          /* A newline still needs the end-of-line handling below.  */
          if (c != '\n')
            continue;
        }

      if (phase != PHASE_NONE)
        {
          /* We are matching the closing sequence after a ')'.  */
          if (prefix[phase] != c)
            phase = PHASE_NONE;
          else if (unsigned (phase + 1) == prefix_len)
            /* The whole delimiter and the closing quote matched.  */
            break;
          else
            {
              phase = Phase (phase + 1);
              continue;
            }
        }

      if (!prefix_len && c == '"')
        /* Failure mode lexing.  */
        goto out;
      else if (prefix_len && c == ')')
        /* Start matching the closing sequence at the next character.  */
        phase = PHASE_SUFFIX;
      else if (!read_note && c == '\n')
        {
          /* The line buffer has ended inside the raw string.  */
          pos--;
          pfile->buffer->cur = pos;
          if (pfile->state.in_directive
              || (pfile->state.parsing_args
                  && pfile->buffer->next_line >= pfile->buffer->rlimit))
            {
              cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, 0,
                                   "unterminated raw string");
              type = CPP_OTHER;
              goto out;
            }

          /* Save the rest of this line, including the newline, before
             moving on to the next line.  */
          accum.append (pfile, base, pos - base + 1);
          _cpp_process_line_notes (pfile, false);

          if (pfile->buffer->next_line < pfile->buffer->rlimit)
            CPP_INCREMENT_LINE (pfile, 0);
          pfile->buffer->need_line = true;

          if (!_cpp_get_fresh_line (pfile))
            {
              /* We ran out of file and failed to get a line.  */
              location_t src_loc = token->src_loc;
              token->type = CPP_EOF;
              /* Tell the compiler the line number of the EOF token.  */
              token->src_loc = pfile->line_table->highest_line;
              token->flags = BOL;
              if (accum.first)
                _cpp_release_buff (pfile, accum.first);
              cpp_error_with_line (pfile, CPP_DL_ERROR, src_loc, 0,
                                   "unterminated raw string");
              /* Now pop the buffer that _cpp_get_fresh_line did not.  */
              _cpp_pop_buffer (pfile);
              return;
            }

          /* Carry on from the start of the fresh line, using its
             line notes.  */
          pos = base = pfile->buffer->cur;
          note = &pfile->buffer->notes[pfile->buffer->cur_note];
        }
      else if (__builtin_expect ((unsigned char) c == bidi::utf8_start, 0)
               && warn_bidi_p)
        {
          location_t loc;
          bidi::kind kind = get_bidi_utf8 (pfile, pos - 1, &loc);
          maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
        }
    }

  /* We only get here through the break above, i.e. when the raw
     string was properly terminated.  */
  if (warn_bidi_p)
    maybe_warn_bidi_on_close (pfile, pos);

  if (CPP_OPTION (pfile, user_literals))
    {
      /* If a string format macro, say from inttypes.h, is placed touching
         a string literal it could be parsed as a C++11 user-defined string
         literal thus breaking the program.  */
      if (is_macro_not_literal_suffix (pfile, pos))
        {
          /* Raise a warning, but do not consume subsequent tokens.  */
          if (CPP_OPTION (pfile, warn_literal_suffix) && !pfile->state.skipping)
            cpp_warning_with_line (pfile, CPP_W_LITERAL_SUFFIX,
                                   token->src_loc, 0,
                                   "invalid suffix on literal; C++11 requires "
                                   "a space between literal and string macro");
        }
      /* Grab user defined literal suffix.  */
      else if (ISIDST (*pos))
        {
          type = cpp_userdef_string_add_type (type);
          ++pos;

          while (ISIDNUM (*pos))
            ++pos;
        }
    }

 out:
  pfile->buffer->cur = pos;
  if (!accum.accum)
    /* Nothing was accumulated, so the whole spelling is the stretch
       from BASE to POS in the line buffer.  */
    create_literal (pfile, token, base, pos - base, type);
  else
    {
      /* The spelling is the accumulated buffers followed by the
         stretch from BASE to POS in the current line buffer.  */
      size_t extra_len = pos - base;
      uchar *dest = _cpp_unaligned_alloc (pfile, accum.accum + extra_len + 1);

      token->type = type;
      token->val.str.len = accum.accum + extra_len;
      token->val.str.text = dest;
      for (_cpp_buff *buf = accum.first; buf; buf = buf->next)
        {
          size_t len = BUFF_FRONT (buf) - buf->base;
          memcpy (dest, buf->base, len);
          dest += len;
        }
      _cpp_release_buff (pfile, accum.first);
      memcpy (dest, base, extra_len);
      dest[extra_len] = '\0';
    }
}
2585
2586 /* Lexes a string, character constant, or angle-bracketed header file
2587    name.  The stored string contains the spelling, including opening
2588    quote and any leading 'L', 'u', 'U' or 'u8' and optional
2589    'R' modifier.  It returns the type of the literal, or CPP_OTHER
2590    if it was not properly terminated, or CPP_LESS for an unterminated
2591    header name which must be relexed as normal tokens.
2592
2593    The spelling is NUL-terminated, but it is not guaranteed that this
2594    is the first NUL since embedded NULs are preserved.  */
2595 static void
2596 lex_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
2597 {
2598   bool saw_NUL = false;
2599   const uchar *cur;
2600   cppchar_t terminator;
2601   enum cpp_ttype type;
2602
2603   cur = base;
2604   terminator = *cur++;
2605   if (terminator == 'L' || terminator == 'U')
2606     terminator = *cur++;
2607   else if (terminator == 'u')
2608     {
2609       terminator = *cur++;
2610       if (terminator == '8')
2611         terminator = *cur++;
2612     }
2613   if (terminator == 'R')
2614     {
2615       lex_raw_string (pfile, token, base);
2616       return;
2617     }
2618   if (terminator == '"')
2619     type = (*base == 'L' ? CPP_WSTRING :
2620             *base == 'U' ? CPP_STRING32 :
2621             *base == 'u' ? (base[1] == '8' ? CPP_UTF8STRING : CPP_STRING16)
2622                          : CPP_STRING);
2623   else if (terminator == '\'')
2624     type = (*base == 'L' ? CPP_WCHAR :
2625             *base == 'U' ? CPP_CHAR32 :
2626             *base == 'u' ? (base[1] == '8' ? CPP_UTF8CHAR : CPP_CHAR16)
2627                          : CPP_CHAR);
2628   else
2629     terminator = '>', type = CPP_HEADER_NAME;
2630
2631   const bool warn_bidi_p = pfile->warn_bidi_p ();
2632   for (;;)
2633     {
2634       cppchar_t c = *cur++;
2635
2636       /* In #include-style directives, terminators are not escapable.  */
2637       if (c == '\\' && !pfile->state.angled_headers && *cur != '\n')
2638         {
2639           if ((cur[0] == 'u' || cur[0] == 'U') && warn_bidi_p)
2640             {
2641               location_t loc;
2642               bidi::kind kind = get_bidi_ucn (pfile, cur + 1, cur[0] == 'U',
2643                                               &loc);
2644               maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/true, loc);
2645             }
2646           cur++;
2647         }
2648       else if (c == terminator)
2649         {
2650           if (warn_bidi_p)
2651             maybe_warn_bidi_on_close (pfile, cur - 1);
2652           break;
2653         }
2654       else if (c == '\n')
2655         {
2656           cur--;
2657           /* Unmatched quotes always yield undefined behavior, but
2658              greedy lexing means that what appears to be an unterminated
2659              header name may actually be a legitimate sequence of tokens.  */
2660           if (terminator == '>')
2661             {
2662               token->type = CPP_LESS;
2663               return;
2664             }
2665           type = CPP_OTHER;
2666           break;
2667         }
2668       else if (c == '\0')
2669         saw_NUL = true;
2670       else if (__builtin_expect (c == bidi::utf8_start, 0) && warn_bidi_p)
2671         {
2672           location_t loc;
2673           bidi::kind kind = get_bidi_utf8 (pfile, cur - 1, &loc);
2674           maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
2675         }
2676     }
2677
2678   if (saw_NUL && !pfile->state.skipping)
2679     cpp_error (pfile, CPP_DL_WARNING,
2680                "null character(s) preserved in literal");
2681
2682   if (type == CPP_OTHER && CPP_OPTION (pfile, lang) != CLK_ASM)
2683     cpp_error (pfile, CPP_DL_PEDWARN, "missing terminating %c character",
2684                (int) terminator);
2685
2686   if (CPP_OPTION (pfile, user_literals))
2687     {
2688       /* If a string format macro, say from inttypes.h, is placed touching
2689          a string literal it could be parsed as a C++11 user-defined string
2690          literal thus breaking the program.  */
2691       if (is_macro_not_literal_suffix (pfile, cur))
2692         {
2693           /* Raise a warning, but do not consume subsequent tokens.  */
2694           if (CPP_OPTION (pfile, warn_literal_suffix) && !pfile->state.skipping)
2695             cpp_warning_with_line (pfile, CPP_W_LITERAL_SUFFIX,
2696                                    token->src_loc, 0,
2697                                    "invalid suffix on literal; C++11 requires "
2698                                    "a space between literal and string macro");
2699         }
2700       /* Grab user defined literal suffix.  */
2701       else if (ISIDST (*cur))
2702         {
2703           type = cpp_userdef_char_add_type (type);
2704           type = cpp_userdef_string_add_type (type);
2705           ++cur;
2706
2707           while (ISIDNUM (*cur))
2708             ++cur;
2709         }
2710     }
2711   else if (CPP_OPTION (pfile, cpp_warn_cxx11_compat)
2712            && is_macro (pfile, cur)
2713            && !pfile->state.skipping)
2714     cpp_warning_with_line (pfile, CPP_W_CXX11_COMPAT,
2715                            token->src_loc, 0, "C++11 requires a space "
2716                            "between string literal and macro");
2717
2718   pfile->buffer->cur = cur;
2719   create_literal (pfile, token, base, cur - base, type);
2720 }
2721
2722 /* Return the comment table. The client may not make any assumption
2723    about the ordering of the table.  */
2724 cpp_comment_table *
2725 cpp_get_comments (cpp_reader *pfile)
2726 {
2727   return &pfile->comments;
2728 }
2729
2730 /* Append a comment to the end of the comment table. */
2731 static void
2732 store_comment (cpp_reader *pfile, cpp_token *token)
2733 {
2734   int len;
2735
2736   if (pfile->comments.allocated == 0)
2737     {
2738       pfile->comments.allocated = 256;
2739       pfile->comments.entries = (cpp_comment *) xmalloc
2740         (pfile->comments.allocated * sizeof (cpp_comment));
2741     }
2742
2743   if (pfile->comments.count == pfile->comments.allocated)
2744     {
2745       pfile->comments.allocated *= 2;
2746       pfile->comments.entries = (cpp_comment *) xrealloc
2747         (pfile->comments.entries,
2748          pfile->comments.allocated * sizeof (cpp_comment));
2749     }
2750
2751   len = token->val.str.len;
2752
2753   /* Copy comment. Note, token may not be NULL terminated. */
2754   pfile->comments.entries[pfile->comments.count].comment =
2755     (char *) xmalloc (sizeof (char) * (len + 1));
2756   memcpy (pfile->comments.entries[pfile->comments.count].comment,
2757           token->val.str.text, len);
2758   pfile->comments.entries[pfile->comments.count].comment[len] = '\0';
2759
2760   /* Set source location. */
2761   pfile->comments.entries[pfile->comments.count].sloc = token->src_loc;
2762
2763   /* Increment the count of entries in the comment table. */
2764   pfile->comments.count++;
2765 }
2766
2767 /* The stored comment includes the comment start and any terminator.  */
2768 static void
2769 save_comment (cpp_reader *pfile, cpp_token *token, const unsigned char *from,
2770               cppchar_t type)
2771 {
2772   unsigned char *buffer;
2773   unsigned int len, clen, i;
2774
2775   len = pfile->buffer->cur - from + 1; /* + 1 for the initial '/'.  */
2776
2777   /* C++ comments probably (not definitely) have moved past a new
2778      line, which we don't want to save in the comment.  */
2779   if (is_vspace (pfile->buffer->cur[-1]))
2780     len--;
2781
2782   /* If we are currently in a directive or in argument parsing, then
2783      we need to store all C++ comments as C comments internally, and
2784      so we need to allocate a little extra space in that case.
2785
2786      Note that the only time we encounter a directive here is
2787      when we are saving comments in a "#define".  */
2788   clen = ((pfile->state.in_directive || pfile->state.parsing_args)
2789           && type == '/') ? len + 2 : len;
2790
2791   buffer = _cpp_unaligned_alloc (pfile, clen);
2792
2793   token->type = CPP_COMMENT;
2794   token->val.str.len = clen;
2795   token->val.str.text = buffer;
2796
2797   buffer[0] = '/';
2798   memcpy (buffer + 1, from, len - 1);
2799
2800   /* Finish conversion to a C comment, if necessary.  */
2801   if ((pfile->state.in_directive || pfile->state.parsing_args) && type == '/')
2802     {
2803       buffer[1] = '*';
2804       buffer[clen - 2] = '*';
2805       buffer[clen - 1] = '/';
2806       /* As there can be in a C++ comments illegal sequences for C comments
2807          we need to filter them out.  */
2808       for (i = 2; i < (clen - 2); i++)
2809         if (buffer[i] == '/' && (buffer[i - 1] == '*' || buffer[i + 1] == '*'))
2810           buffer[i] = '|';
2811     }
2812
2813   /* Finally store this comment for use by clients of libcpp. */
2814   store_comment (pfile, token);
2815 }
2816
2817 /* Returns true if comment at COMMENT_START is a recognized FALLTHROUGH
2818    comment.
        
        COMMENT_START is inspected to tell the comment kinds apart: a '*'
        there means a C block comment, anything else is treated as a C++
        line comment.  Matching starts at the character after COMMENT_START
        and pfile->buffer->cur is used as the end of the available text in
        the length checks.

        What is accepted depends on the cpp_warn_implicit_fallthrough option:
          0, or any value not listed here: no comment is recognized;
          1: every comment is recognized;
          2: the comment only has to contain a fallthrough phrase somewhere
             (case insensitive);
          3 and 4: the whole comment has to match one of the patterns
             documented in the body.  "-fallthrough", "@fallthrough@" and
             "lint -fallthrough" are tried at both levels; beyond those,
             level 4 only tries the strict upper-case FALLTHR(U|OUGH) form
             while level 3 tries the three broader regexes.
        For levels 3 and 4 the match must be followed directly by the end
        of the comment: the closing delimiter of a block comment, or the
        newline for a line comment.  */
2819
2820 static bool
2821 fallthrough_comment_p (cpp_reader *pfile, const unsigned char *comment_start)
2822 {
2823   const unsigned char *from = comment_start + 1;
2824
2825   switch (CPP_OPTION (pfile, cpp_warn_implicit_fallthrough))
2826     {
2827       /* For both -Wimplicit-fallthrough=0 and -Wimplicit-fallthrough=5 we
2828          don't recognize any comments.  The latter only checks attributes,
2829          the former doesn't warn.  */
2830     case 0:
2831     default:
2832       return false;
2833       /* -Wimplicit-fallthrough=1 considers any comment, no matter what
2834          content it has.  */
2835     case 1:
2836       return true;
2837     case 2:
2838       /* -Wimplicit-fallthrough=2 looks for (case insensitive)
2839          .*falls?[ \t-]*thr(u|ough).* regex.  */
2840       for (; (size_t) (pfile->buffer->cur - from) >= sizeof "fallthru" - 1;
2841            from++)
2842         {
2843           /* Is there anything like strpbrk with upper boundary, or
2844              memchr looking for 2 characters rather than just one?  */
2845           if (from[0] != 'f' && from[0] != 'F')
2846             continue;
2847           if (from[1] != 'a' && from[1] != 'A')
2848             continue;
2849           if (from[2] != 'l' && from[2] != 'L')
2850             continue;
2851           if (from[3] != 'l' && from[3] != 'L')
2852             continue;
2853           from += sizeof "fall" - 1;
2854           if (from[0] == 's' || from[0] == 'S')
2855             from++;
2856           while (*from == ' ' || *from == '\t' || *from == '-')
2857             from++;
2858           if (from[0] != 't' && from[0] != 'T')
2859             continue;
2860           if (from[1] != 'h' && from[1] != 'H')
2861             continue;
2862           if (from[2] != 'r' && from[2] != 'R')
2863             continue;
2864           if (from[3] == 'u' || from[3] == 'U')
2865             return true;
2866           if (from[3] != 'o' && from[3] != 'O')
2867             continue;
2868           if (from[4] != 'u' && from[4] != 'U')
2869             continue;
2870           if (from[5] != 'g' && from[5] != 'G')
2871             continue;
2872           if (from[6] != 'h' && from[6] != 'H')
2873             continue;
2874           return true;
2875         }
2876       return false;
2877     case 3:
2878     case 4:
2879       break;
2880     }
2881
2882   /* Whole comment contents:
2883      -fallthrough
2884      @fallthrough@
2885    */
2886   if (*from == '-' || *from == '@')
2887     {
2888       size_t len = sizeof "fallthrough" - 1;
2889       if ((size_t) (pfile->buffer->cur - from - 1) < len)
2890         return false;
2891       if (memcmp (from + 1, "fallthrough", len))
2892         return false;
2893       if (*from == '@')
2894         {
2895           if (from[len + 1] != '@')
2896             return false;
2897           len++;
2898         }
2899       from += 1 + len;
2900     }
2901   /* Whole comment contents (regex):
2902      lint -fallthrough[ \t]*
2903    */
2904   else if (*from == 'l')
2905     {
2906       size_t len = sizeof "int -fallthrough" - 1;
2907       if ((size_t) (pfile->buffer->cur - from - 1) < len)
2908         return false;
2909       if (memcmp (from + 1, "int -fallthrough", len))
2910         return false;
2911       from += 1 + len;
2912       while (*from == ' ' || *from == '\t')
2913         from++;
2914     }
2915   /* Whole comment contents (regex):
2916      [ \t]*FALLTHR(U|OUGH)[ \t]*
2917    */
2918   else if (CPP_OPTION (pfile, cpp_warn_implicit_fallthrough) == 4)
2919     {
2920       while (*from == ' ' || *from == '\t')
2921         from++;
2922       if ((size_t) (pfile->buffer->cur - from)  < sizeof "FALLTHRU" - 1)
2923         return false;
2924       if (memcmp (from, "FALLTHR", sizeof "FALLTHR" - 1))
2925         return false;
2926       from += sizeof "FALLTHR" - 1;
2927       if (*from == 'U')
2928         from++;
2929       else if ((size_t) (pfile->buffer->cur - from)  < sizeof "OUGH" - 1)
2930         return false;
2931       else if (memcmp (from, "OUGH", sizeof "OUGH" - 1))
2932         return false;
2933       else
2934         from += sizeof "OUGH" - 1;
2935       while (*from == ' ' || *from == '\t')
2936         from++;
2937     }
2938   /* Whole comment contents (regex):
2939      [ \t.!]*(ELSE,? |INTENTIONAL(LY)? )?FALL(S | |-)?THR(OUGH|U)[ \t.!]*(-[^\n\r]*)?
2940      [ \t.!]*(Else,? |Intentional(ly)? )?Fall((s | |-)[Tt]|t)hr(ough|u)[ \t.!]*(-[^\n\r]*)?
2941      [ \t.!]*([Ee]lse,? |[Ii]ntentional(ly)? )?fall(s | |-)?thr(ough|u)[ \t.!]*(-[^\n\r]*)?
2942    */
2943   else
2944     {
2945       while (*from == ' ' || *from == '\t' || *from == '.' || *from == '!')
2946         from++;
2947       unsigned char f = *from;
2948       bool all_upper = false;
2949       if (f == 'E' || f == 'e')
2950         {
2951           if ((size_t) (pfile->buffer->cur - from)
2952               < sizeof "else fallthru" - 1)
2953             return false;
2954           if (f == 'E' && memcmp (from + 1, "LSE", sizeof "LSE" - 1) == 0)
2955             all_upper = true;
2956           else if (memcmp (from + 1, "lse", sizeof "lse" - 1))
2957             return false;
2958           from += sizeof "else" - 1;
2959           if (*from == ',')
2960             from++;
2961           if (*from != ' ')
2962             return false;
2963           from++;
2964           if (all_upper && *from == 'f')
2965             return false;
2966           if (f == 'e' && *from == 'F')
2967             return false;
2968           f = *from;
2969         }
2970       else if (f == 'I' || f == 'i')
2971         {
2972           if ((size_t) (pfile->buffer->cur - from)
2973               < sizeof "intentional fallthru" - 1)
2974             return false;
2975           if (f == 'I' && memcmp (from + 1, "NTENTIONAL",
2976                                   sizeof "NTENTIONAL" - 1) == 0)
2977             all_upper = true;
2978           else if (memcmp (from + 1, "ntentional",
2979                            sizeof "ntentional" - 1))
2980             return false;
2981           from += sizeof "intentional" - 1;
2982           if (*from == ' ')
2983             {
2984               from++;
2985               if (all_upper && *from == 'f')
2986                 return false;
2987             }
2988           else if (all_upper)
2989             {
2990               if (memcmp (from, "LY F", sizeof "LY F" - 1))
2991                 return false;
2992               from += sizeof "LY " - 1;
2993             }
2994           else
2995             {
2996               if (memcmp (from, "ly ", sizeof "ly " - 1))
2997                 return false;
2998               from += sizeof "ly " - 1;
2999             }
3000           if (f == 'i' && *from == 'F')
3001             return false;
3002           f = *from;
3003         }
3004       if (f != 'F' && f != 'f')
3005         return false;
3006       if ((size_t) (pfile->buffer->cur - from) < sizeof "fallthru" - 1)
3007         return false;
3008       if (f == 'F' && memcmp (from + 1, "ALL", sizeof "ALL" - 1) == 0)
3009         all_upper = true;
3010       else if (all_upper)
3011         return false;
3012       else if (memcmp (from + 1, "all", sizeof "all" - 1))
3013         return false;
3014       from += sizeof "fall" - 1;
3015       if (*from == (all_upper ? 'S' : 's') && from[1] == ' ')
3016         from += 2;
3017       else if (*from == ' ' || *from == '-')
3018         from++;
3019       else if (*from != (all_upper ? 'T' : 't'))
3020         return false;
3021       if ((f == 'f' || *from != 'T') && (all_upper || *from != 't'))
3022         return false;
3023       if ((size_t) (pfile->buffer->cur - from) < sizeof "thru" - 1)
3024         return false;
3025       if (memcmp (from + 1, all_upper ? "HRU" : "hru", sizeof "hru" - 1))
3026         {
3027           if ((size_t) (pfile->buffer->cur - from) < sizeof "through" - 1)
3028             return false;
3029           if (memcmp (from + 1, all_upper ? "HROUGH" : "hrough",
3030                       sizeof "hrough" - 1))
3031             return false;
3032           from += sizeof "through" - 1;
3033         }
3034       else
3035         from += sizeof "thru" - 1;
3036       while (*from == ' ' || *from == '\t' || *from == '.' || *from == '!')
3037         from++;
3038       if (*from == '-')
3039         {
3040           from++;
3041           if (*comment_start == '*')
3042             {
3043               do
3044                 {
3045                   while (*from && *from != '*'
3046                          && *from != '\n' && *from != '\r')
3047                     from++;
3048                   if (*from != '*' || from[1] == '/')
3049                     break;
3050                   from++;
3051                 }
3052               while (1);
3053             }
3054           else
3055             while (*from && *from != '\n' && *from != '\r')
3056               from++;
3057         }
3058     }
3059   /* C block comment.  */
3060   if (*comment_start == '*')
3061     {
3062       if (*from != '*' || from[1] != '/')
3063         return false;
3064     }
3065   /* C++ line comment.  */
3066   else if (*from != '\n')
3067     return false;
3068
3069   return true;
3070 }
3071
3072 /* Allocate COUNT tokens for RUN.  */
3073 void
3074 _cpp_init_tokenrun (tokenrun *run, unsigned int count)
3075 {
3076   run->base = XNEWVEC (cpp_token, count);
3077   run->limit = run->base + count;
3078   run->next = NULL;
3079 }
3080
3081 /* Returns the next tokenrun, or creates one if there is none.  */
3082 static tokenrun *
3083 next_tokenrun (tokenrun *run)
3084 {
3085   if (run->next == NULL)
3086     {
3087       run->next = XNEW (tokenrun);
3088       run->next->prev = run;
3089       _cpp_init_tokenrun (run->next, 250);
3090     }
3091
3092   return run->next;
3093 }
3094
3095 /* Return the number of not yet processed token in a given
3096    context.  */
3097 int
3098 _cpp_remaining_tokens_num_in_context (cpp_context *context)
3099 {
3100   if (context->tokens_kind == TOKENS_KIND_DIRECT)
3101     return (LAST (context).token - FIRST (context).token);
3102   else if (context->tokens_kind == TOKENS_KIND_INDIRECT
3103            || context->tokens_kind == TOKENS_KIND_EXTENDED)
3104     return (LAST (context).ptoken - FIRST (context).ptoken);
3105   else
3106       abort ();
3107 }
3108
3109 /* Returns the token present at index INDEX in a given context.  If
3110    INDEX is zero, the next token to be processed is returned.  */
3111 static const cpp_token*
3112 _cpp_token_from_context_at (cpp_context *context, int index)
3113 {
3114   if (context->tokens_kind == TOKENS_KIND_DIRECT)
3115     return &(FIRST (context).token[index]);
3116   else if (context->tokens_kind == TOKENS_KIND_INDIRECT
3117            || context->tokens_kind == TOKENS_KIND_EXTENDED)
3118     return FIRST (context).ptoken[index];
3119  else
3120    abort ();
3121 }
3122
3123 /* Look ahead in the input stream.  */
3124 const cpp_token *
3125 cpp_peek_token (cpp_reader *pfile, int index)
3126 {
3127   cpp_context *context = pfile->context;
3128   const cpp_token *peektok;
3129   int count;
3130
3131   /* First, scan through any pending cpp_context objects.  */
3132   while (context->prev)
3133     {
3134       ptrdiff_t sz = _cpp_remaining_tokens_num_in_context (context);
3135
3136       if (index < (int) sz)
3137         return _cpp_token_from_context_at (context, index);
3138       index -= (int) sz;
3139       context = context->prev;
3140     }
3141
3142   /* We will have to read some new tokens after all (and do so
3143      without invalidating preceding tokens).  */
3144   count = index;
3145   pfile->keep_tokens++;
3146
3147   /* For peeked tokens temporarily disable line_change reporting,
3148      until the tokens are parsed for real.  */
3149   void (*line_change) (cpp_reader *, const cpp_token *, int)
3150     = pfile->cb.line_change;
3151   pfile->cb.line_change = NULL;
3152
3153   do
3154     {
3155       peektok = _cpp_lex_token (pfile);
3156       if (peektok->type == CPP_EOF)
3157         {
3158           index--;
3159           break;
3160         }
3161       else if (peektok->type == CPP_PRAGMA)
3162         {
3163           /* Don't peek past a pragma.  */
3164           if (peektok == &pfile->directive_result)
3165             /* Save the pragma in the buffer.  */
3166             *pfile->cur_token++ = *peektok;
3167           index--;
3168           break;
3169         }
3170     }
3171   while (index--);
3172
3173   _cpp_backup_tokens_direct (pfile, count - index);
3174   pfile->keep_tokens--;
3175   pfile->cb.line_change = line_change;
3176
3177   return peektok;
3178 }
3179
3180 /* Allocate a single token that is invalidated at the same time as the
3181    rest of the tokens on the line.  Has its line and col set to the
3182    same as the last lexed token, so that diagnostics appear in the
3183    right place.  */
3184 cpp_token *
3185 _cpp_temp_token (cpp_reader *pfile)
3186 {
3187   cpp_token *old, *result;
3188   ptrdiff_t sz = pfile->cur_run->limit - pfile->cur_token;
3189   ptrdiff_t la = (ptrdiff_t) pfile->lookaheads;
3190
3191   old = pfile->cur_token - 1;
3192   /* Any pre-existing lookaheads must not be clobbered.  */
3193   if (la)
3194     {
3195       if (sz <= la)
3196         {
3197           tokenrun *next = next_tokenrun (pfile->cur_run);
3198
3199           if (sz < la)
3200             memmove (next->base + 1, next->base,
3201                      (la - sz) * sizeof (cpp_token));
3202
3203           next->base[0] = pfile->cur_run->limit[-1];
3204         }
3205
3206       if (sz > 1)
3207         memmove (pfile->cur_token + 1, pfile->cur_token,
3208                  MIN (la, sz - 1) * sizeof (cpp_token));
3209     }
3210
3211   if (!sz && pfile->cur_token == pfile->cur_run->limit)
3212     {
3213       pfile->cur_run = next_tokenrun (pfile->cur_run);
3214       pfile->cur_token = pfile->cur_run->base;
3215     }
3216
3217   result = pfile->cur_token++;
3218   result->src_loc = old->src_loc;
3219   return result;
3220 }
3221
3222 /* We're at the beginning of a logical line (so not in
3223   directives-mode) and RESULT is a CPP_NAME with NODE_MODULE set.  See
3224   if we should enter deferred_pragma mode to tokenize the rest of the
3225   line as a module control-line.

       The function peeks one or two tokens with _cpp_lex_direct: the
       keyword after a leading "export", if any, and the token following
       the module/import/__import keyword.

       If the line qualifies, deferred-pragma mode stays on, the keyword
       tokens are flagged NO_EXPAND (with an error if one of them is defined
       as an object-like macro), their nodes are replaced by the [1]
       entries of n_modules, and state.directive_file_token is set to
       HEADER_COUNT.  Otherwise save_comments, in_deferred_pragma and
       angled_headers are put back to their values on entry.

       In both cases the peeked tokens are handed back with
       _cpp_backup_tokens_direct, except that a peeked CPP_PRAGMA_EOL is
       dropped rather than replayed.  */
3226
3227 static void
3228 cpp_maybe_module_directive (cpp_reader *pfile, cpp_token *result)
3229 {
3230   unsigned backup = 0; /* Tokens we peeked.  */
3231   cpp_hashnode *node = result->val.node.node;
3232   cpp_token *peek = result;
3233   cpp_token *keyword = peek;
3234   cpp_hashnode *(&n_modules)[spec_nodes::M_HWM][2] = pfile->spec_nodes.n_modules;
3235   int header_count = 0;
3236
3237   /* Make sure the incoming state is as we expect it.  This way we
3238      can restore it using constants.  */
3239   gcc_checking_assert (!pfile->state.in_deferred_pragma
3240                        && !pfile->state.skipping
3241                        && !pfile->state.parsing_args
3242                        && !pfile->state.angled_headers
3243                        && (pfile->state.save_comments
3244                            == !CPP_OPTION (pfile, discard_comments)));
3245
3246   /* Enter directives mode sufficiently for peeking.  We don't have
3247      to actually set in_directive.  */
3248   pfile->state.in_deferred_pragma = true;
3249
3250   /* These two fields are needed to process tokenization in deferred
3251      pragma mode.  They are not used outside deferred pragma mode or
3252      directives mode.  */
3253   pfile->state.pragma_allow_expansion = true;
3254   pfile->directive_line = result->src_loc;
3255
3256   /* Saving comments is incompatible with directives mode.   */
3257   pfile->state.save_comments = 0;
3258
3259   if (node == n_modules[spec_nodes::M_EXPORT][0])
3260     {
3261       peek = _cpp_lex_direct (pfile);
3262       keyword = peek;
3263       backup++;
3264       if (keyword->type != CPP_NAME)
3265         goto not_module;
3266       node = keyword->val.node.node;
3267       if (!(node->flags & NODE_MODULE))
3268         goto not_module;
3269     }
3270
3271   if (node == n_modules[spec_nodes::M__IMPORT][0])
3272     /* __import  */
3273     header_count = backup + 2 + 16;
3274   else if (node == n_modules[spec_nodes::M_IMPORT][0])
3275     /* import  */
3276     header_count = backup + 2 + (CPP_OPTION (pfile, preprocessed) ? 16 : 0);
3277   else if (node == n_modules[spec_nodes::M_MODULE][0])
3278     ; /* module  */
3279   else
3280     goto not_module;
3281
3282   /* We've seen [export] {module|import|__import}.  Check the next token.  */
3283   if (header_count)
3284     /* After '{,__}import' a header name may appear.  */
3285     pfile->state.angled_headers = true;
3286   peek = _cpp_lex_direct (pfile);
3287   backup++;
3288
3289   /* ... import followed by identifier, ':', '<' or
3290      header-name preprocessing tokens, or module
3291      followed by cpp-identifier, ':' or ';' preprocessing
3292      tokens.  C++ keywords are not yet relevant.  */
3293   if (peek->type == CPP_NAME
3294       || peek->type == CPP_COLON
3295       ||  (header_count
3296            ? (peek->type == CPP_LESS
3297               || (peek->type == CPP_STRING && peek->val.str.text[0] != 'R')
3298               || peek->type == CPP_HEADER_NAME)
3299            : peek->type == CPP_SEMICOLON))
3300     {
3301       pfile->state.pragma_allow_expansion = !CPP_OPTION (pfile, preprocessed);
3302       if (!pfile->state.pragma_allow_expansion)
3303         pfile->state.prevent_expansion++;
3304
3305       if (!header_count && linemap_included_from
3306           (LINEMAPS_LAST_ORDINARY_MAP (pfile->line_table)))
3307         cpp_error_with_line (pfile, CPP_DL_ERROR, keyword->src_loc, 0,
3308                              "module control-line cannot be in included file");
3309
3310       /* The first one or two tokens cannot be macro names.  */
3311       for (int ix = backup; ix--;)
3312         {
3313           cpp_token *tok = ix ? keyword : result;
3314           cpp_hashnode *node = tok->val.node.node;
3315
3316           /* Don't attempt to expand the token.  */
3317           tok->flags |= NO_EXPAND;
3318           if (_cpp_defined_macro_p (node)
3319               && _cpp_maybe_notify_macro_use (pfile, node, tok->src_loc)
3320               && !cpp_fun_like_macro_p (node))
3321             cpp_error_with_line (pfile, CPP_DL_ERROR, tok->src_loc, 0,
3322                                  "module control-line \"%s\" cannot be"
3323                                  " an object-like macro",
3324                                  NODE_NAME (node));
3325         }
3326
3327       /* Map to underbar variants.  */
3328       keyword->val.node.node = n_modules[header_count
3329                                          ? spec_nodes::M_IMPORT
3330                                          : spec_nodes::M_MODULE][1];
3331       if (backup != 1)
3332         result->val.node.node = n_modules[spec_nodes::M_EXPORT][1];
3333
3334       /* Maybe tell the tokenizer we expect a header-name down the
3335          road.  */
3336       pfile->state.directive_file_token = header_count;
3337     }
3338   else
3339     {
3340     not_module:
3341       /* Drop out of directive mode.  */
3342       /* We asserted save_comments had this value upon entry.  */
3343       pfile->state.save_comments
3344         = !CPP_OPTION (pfile, discard_comments);
3345       pfile->state.in_deferred_pragma = false;
3346       /* Do not let this remain on.  */
3347       pfile->state.angled_headers = false;
3348     }
3349
3350   /* In either case we want to backup the peeked tokens.  */
3351   if (backup)
3352     {
3353       /* If we saw EOL, we should drop it, because this isn't a module
3354          control-line after all.  */
3355       bool eol = peek->type == CPP_PRAGMA_EOL;
3356       if (!eol || backup > 1)
3357         {
3358           /* Put the peeked tokens back.  */
3359           _cpp_backup_tokens_direct (pfile, backup);
3360           /* But if the last one was an EOL, forget it.  */
3361           if (eol)
3362             pfile->lookaheads--;
3363         }
3364     }
3365 }
3366
3367 /* Lex a token into RESULT (external interface).  Takes care of issues
3368    like directive handling, token lookahead, multiple include
3369    optimization and skipping.  */
3370 const cpp_token *
3371 _cpp_lex_token (cpp_reader *pfile)
3372 {
3373   cpp_token *result;
3374
3375   for (;;)
3376     {
3377       if (pfile->cur_token == pfile->cur_run->limit)
3378         {
3379           pfile->cur_run = next_tokenrun (pfile->cur_run);
3380           pfile->cur_token = pfile->cur_run->base;
3381         }
3382       /* We assume that the current token is somewhere in the current
3383          run.  */
3384       if (pfile->cur_token < pfile->cur_run->base
3385           || pfile->cur_token >= pfile->cur_run->limit)
3386         abort ();
3387
3388       if (pfile->lookaheads)
3389         {
3390           pfile->lookaheads--;
3391           result = pfile->cur_token++;
3392         }
3393       else
3394         result = _cpp_lex_direct (pfile);
3395
3396       if (result->flags & BOL)
3397         {
3398           /* Is this a directive.  If _cpp_handle_directive returns
3399              false, it is an assembler #.  */
3400           if (result->type == CPP_HASH
3401               /* 6.10.3 p 11: Directives in a list of macro arguments
3402                  gives undefined behavior.  This implementation
3403                  handles the directive as normal.  */
3404               && pfile->state.parsing_args != 1)
3405             {
3406               if (_cpp_handle_directive (pfile, result->flags & PREV_WHITE))
3407                 {
3408                   if (pfile->directive_result.type == CPP_PADDING)
3409                     continue;
3410                   result = &pfile->directive_result;
3411                 }
3412             }
3413           else if (pfile->state.in_deferred_pragma)
3414             result = &pfile->directive_result;
3415           else if (result->type == CPP_NAME
3416                    && (result->val.node.node->flags & NODE_MODULE)
3417                    && !pfile->state.skipping
3418                    /* Unlike regular directives, we do not deal with
3419                       tokenizing module directives as macro arguments.
3420                       That's not permitted.  */
3421                    && !pfile->state.parsing_args)
3422             {
3423               /* P1857.  Before macro expansion, At start of logical
3424                  line ... */
3425               /* We don't have to consider lookaheads at this point.  */
3426               gcc_checking_assert (!pfile->lookaheads);
3427
3428               cpp_maybe_module_directive (pfile, result);
3429             }
3430
3431           if (pfile->cb.line_change && !pfile->state.skipping)
3432             pfile->cb.line_change (pfile, result, pfile->state.parsing_args);
3433         }
3434
3435       /* We don't skip tokens in directives.  */
3436       if (pfile->state.in_directive || pfile->state.in_deferred_pragma)
3437         break;
3438
3439       /* Outside a directive, invalidate controlling macros.  At file
3440          EOF, _cpp_lex_direct takes care of popping the buffer, so we never
3441          get here and MI optimization works.  */
3442       pfile->mi_valid = false;
3443
3444       if (!pfile->state.skipping || result->type == CPP_EOF)
3445         break;
3446     }
3447
3448   return result;
3449 }
3450
3451 /* Returns true if a fresh line has been loaded.  */
3452 bool
3453 _cpp_get_fresh_line (cpp_reader *pfile)
3454 {
3455   /* We can't get a new line until we leave the current directive.  */
3456   if (pfile->state.in_directive)
3457     return false;
3458
3459   for (;;)
3460     {
3461       cpp_buffer *buffer = pfile->buffer;
3462
3463       if (!buffer->need_line)
3464         return true;
3465
3466       if (buffer->next_line < buffer->rlimit)
3467         {
3468           _cpp_clean_line (pfile);
3469           return true;
3470         }
3471
3472       /* First, get out of parsing arguments state.  */
3473       if (pfile->state.parsing_args)
3474         return false;
3475
3476       /* End of buffer.  Non-empty files should end in a newline.  */
3477       if (buffer->buf != buffer->rlimit
3478           && buffer->next_line > buffer->rlimit
3479           && !buffer->from_stage3)
3480         {
3481           /* Clip to buffer size.  */
3482           buffer->next_line = buffer->rlimit;
3483         }
3484
3485       if (buffer->prev && !buffer->return_at_eof)
3486         _cpp_pop_buffer (pfile);
3487       else
3488         {
3489           /* End of translation.  Do not pop the buffer yet. Increment
3490              line number so that the EOF token is on a line of its own
3491              (_cpp_lex_direct doesn't increment in that case, because
3492              it's hard for it to distinguish this special case). */
3493           CPP_INCREMENT_LINE (pfile, 0);
3494           return false;
3495         }
3496     }
3497 }
3498
/* Set result->type to THEN_TYPE and step buffer->cur past the next
   character if that character is CHAR; otherwise set result->type to
   ELSE_TYPE and leave buffer->cur alone.  Relies on variables named
   `result' and `buffer' being in scope where the macro is used.  */
#define IF_NEXT_IS(CHAR, THEN_TYPE, ELSE_TYPE)          \
  do                                                    \
    {                                                   \
      if (*buffer->cur == CHAR)                         \
        {                                               \
          buffer->cur++;                                \
          result->type = THEN_TYPE;                     \
        }                                               \
      else                                              \
        result->type = ELSE_TYPE;                       \
    }                                                   \
  while (0)
3507
3508 /* Lex a token into pfile->cur_token, which is also incremented, to
3509    get diagnostics pointing to the correct location.
3510
3511    Does not handle issues such as token lookahead, multiple-include
3512    optimization, directives, skipping etc.  This function is only
3513    suitable for use by _cpp_lex_token, and in special cases like
3514    lex_expansion_token which doesn't care for any of these issues.
3515
3516    When meeting a newline, returns CPP_EOF if parsing a directive,
3517    otherwise returns to the start of the token buffer if permissible.
3518    Returns a pointer to the lexed token.  */
3519 cpp_token *
3520 _cpp_lex_direct (cpp_reader *pfile)
3521 {
3522   cppchar_t c;
3523   cpp_buffer *buffer;
3524   const unsigned char *comment_start;
3525   bool fallthrough_comment = false;
3526   cpp_token *result = pfile->cur_token++;
3527
3528  fresh_line: /* Entered on the first pass and again after each newline that does not end a deferred pragma.  */
3529   result->flags = 0;
3530   buffer = pfile->buffer;
3531   if (buffer->need_line)
3532     {
3533       if (pfile->state.in_deferred_pragma)
3534         {
3535           /* This can happen in cases like:
3536              #define loop(x) whatever
3537              #pragma omp loop
3538              where when trying to expand loop we need to peek
3539              next token after loop, but aren't still in_deferred_pragma
3540              mode but are in in_directive mode, so buffer->need_line
3541              is set, a CPP_EOF is peeked.  */
3542           result->type = CPP_PRAGMA_EOL;
3543           pfile->state.in_deferred_pragma = false;
3544           if (!pfile->state.pragma_allow_expansion)
3545             pfile->state.prevent_expansion--;
3546           return result;
3547         }
3548       if (!_cpp_get_fresh_line (pfile))
3549         {
3550           result->type = CPP_EOF;
3551           /* Not a real EOF in a directive or arg parsing -- we refuse
3552              to advance to the next file now, and will once we're out
3553              of those modes.  */
3554           if (!pfile->state.in_directive && !pfile->state.parsing_args)
3555             {
3556               /* Tell the compiler the line number of the EOF token.  */
3557               result->src_loc = pfile->line_table->highest_line;
3558               result->flags = BOL;
3559               /* Now pop the buffer that _cpp_get_fresh_line did not.  */
3560               _cpp_pop_buffer (pfile);
3561             }
3562           return result;
3563         }
3564       if (buffer != pfile->buffer)
3565         fallthrough_comment = false;
3566       if (!pfile->keep_tokens)
3567         {
3568           pfile->cur_run = &pfile->base_run;
3569           result = pfile->base_run.base;
3570           pfile->cur_token = result + 1;
3571         }
3572       result->flags = BOL;
3573       if (pfile->state.parsing_args == 2)
3574         result->flags |= PREV_WHITE;
3575     }
3576   buffer = pfile->buffer;
3577  update_tokens_line: /* Also re-entered after a comment that is not saved as a token.  */
3578   result->src_loc = pfile->line_table->highest_line;
3579
3580  skipped_white: /* Re-entered after each run of whitespace; flags set so far are kept.  */
3581   if (buffer->cur >= buffer->notes[buffer->cur_note].pos
3582       && !pfile->overlaid_buffer)
3583     {
3584       _cpp_process_line_notes (pfile, false);
3585       result->src_loc = pfile->line_table->highest_line;
3586     }
3587   c = *buffer->cur++;
3588
3589   if (pfile->forced_token_location)
3590     result->src_loc = pfile->forced_token_location;
3591   else
3592     result->src_loc = linemap_position_for_column (pfile->line_table,
3593                                           CPP_BUF_COLUMN (buffer, buffer->cur));
3594
3595   switch (c)
3596     {
3597     case ' ': case '\t': case '\f': case '\v': case '\0':
3598       result->flags |= PREV_WHITE;
3599       skip_whitespace (pfile, c);
3600       goto skipped_white;
3601
3602     case '\n':
3603       /* Increment the line, unless this is the last line ...  */
3604       if (buffer->cur < buffer->rlimit
3605           /* ... or this is a #include, (where _cpp_stack_file needs to
3606              unwind by one line) ...  */
3607           || (pfile->state.in_directive > 1
3608               /* ... except traditional-cpp increments this elsewhere.  */
3609               && !CPP_OPTION (pfile, traditional)))
3610         CPP_INCREMENT_LINE (pfile, 0);
3611       buffer->need_line = true;
3612       if (pfile->state.in_deferred_pragma)
3613         {
3614           /* Produce the PRAGMA_EOL on this line.  File reading
3615              ensures there is always a \n at end of the buffer, thus
3616              in a deferred pragma we always see CPP_PRAGMA_EOL before
3617              any CPP_EOF.  */
3618           result->type = CPP_PRAGMA_EOL;
3619           result->flags &= ~PREV_WHITE;
3620           pfile->state.in_deferred_pragma = false;
3621           if (!pfile->state.pragma_allow_expansion)
3622             pfile->state.prevent_expansion--;
3623           return result;
3624         }
3625       goto fresh_line;
3626
3627     case '0': case '1': case '2': case '3': case '4':
3628     case '5': case '6': case '7': case '8': case '9':
3629       {
3630         struct normalize_state nst = INITIAL_NORMALIZE_STATE;
3631         result->type = CPP_NUMBER;
3632         lex_number (pfile, &result->val.str, &nst);
3633         warn_about_normalization (pfile, result, &nst);
3634         break;
3635       }
3636
3637     case 'L':
3638     case 'u':
3639     case 'U':
3640     case 'R':
3641       /* 'L', 'u', 'U', 'u8' or 'R' may introduce wide characters,
3642          wide strings or raw strings.  */
3643       if (c == 'L' || CPP_OPTION (pfile, rliterals)
3644           || (c != 'R' && CPP_OPTION (pfile, uliterals)))
3645         {
3646           if ((*buffer->cur == '\'' && c != 'R')
3647               || *buffer->cur == '"'
3648               || (*buffer->cur == 'R'
3649                   && c != 'R'
3650                   && buffer->cur[1] == '"'
3651                   && CPP_OPTION (pfile, rliterals))
3652               || (*buffer->cur == '8'
3653                   && c == 'u'
3654                   && ((buffer->cur[1] == '"' || (buffer->cur[1] == '\''
3655                                 && CPP_OPTION (pfile, utf8_char_literals)))
3656                       || (buffer->cur[1] == 'R' && buffer->cur[2] == '"'
3657                           && CPP_OPTION (pfile, rliterals)))))
3658             {
3659               lex_string (pfile, result, buffer->cur - 1);
3660               break;
3661             }
3662         }
3663       /* Fall through: the letter does not start a literal here, so it is lexed as the start of an identifier.  */
3664
3665     case '_':
3666     case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
3667     case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
3668     case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
3669     case 's': case 't':           case 'v': case 'w': case 'x':
3670     case 'y': case 'z':
3671     case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
3672     case 'G': case 'H': case 'I': case 'J': case 'K':
3673     case 'M': case 'N': case 'O': case 'P': case 'Q':
3674     case 'S': case 'T':           case 'V': case 'W': case 'X':
3675     case 'Y': case 'Z': /* 'u', 'L', 'R' and 'U' are absent here: the case above handles them and falls through.  */
3676       result->type = CPP_NAME;
3677       {
3678         struct normalize_state nst = INITIAL_NORMALIZE_STATE;
3679         result->val.node.node = lex_identifier (pfile, buffer->cur - 1, false,
3680                                                 &nst,
3681                                                 &result->val.node.spelling);
3682         warn_about_normalization (pfile, result, &nst);
3683       }
3684
3685       /* Convert named operators to their proper types.  */
3686       if (result->val.node.node->flags & NODE_OPERATOR)
3687         {
3688           result->flags |= NAMED_OP;
3689           result->type = (enum cpp_ttype) result->val.node.node->directive_index;
3690         }
3691
3692       /* Signal FALLTHROUGH comment followed by another token.  */
3693       if (fallthrough_comment)
3694         result->flags |= PREV_FALLTHROUGH;
3695       break;
3696
3697     case '\'':
3698     case '"':
3699       lex_string (pfile, result, buffer->cur - 1);
3700       break;
3701
3702     case '/':
3703       /* A potential block or line comment.  */
3704       comment_start = buffer->cur;
3705       c = *buffer->cur;
3706
3707       if (c == '*')
3708         {
3709           if (_cpp_skip_block_comment (pfile))
3710             cpp_error (pfile, CPP_DL_ERROR, "unterminated comment");
3711         }
3712       else if (c == '/' && ! CPP_OPTION (pfile, traditional))
3713         {
3714           /* Don't warn for system headers.  */
3715           if (_cpp_in_system_header (pfile))
3716             ;
3717           /* Warn about comments if pedantically GNUC89, and not
3718              in system headers.  */
3719           else if (CPP_OPTION (pfile, lang) == CLK_GNUC89
3720                    && CPP_PEDANTIC (pfile)
3721                    && ! buffer->warned_cplusplus_comments)
3722             {
3723               if (cpp_error (pfile, CPP_DL_PEDWARN,
3724                              "C++ style comments are not allowed in ISO C90"))
3725                 cpp_error (pfile, CPP_DL_NOTE,
3726                            "(this will be reported only once per input file)");
3727               buffer->warned_cplusplus_comments = 1;
3728             }
3729           /* Or if specifically desired via -Wc90-c99-compat.  */
3730           else if (CPP_OPTION (pfile, cpp_warn_c90_c99_compat) > 0
3731                    && ! CPP_OPTION (pfile, cplusplus)
3732                    && ! buffer->warned_cplusplus_comments)
3733             {
3734               if (cpp_error (pfile, CPP_DL_WARNING,
3735                              "C++ style comments are incompatible with C90"))
3736                 cpp_error (pfile, CPP_DL_NOTE,
3737                            "(this will be reported only once per input file)");
3738               buffer->warned_cplusplus_comments = 1;
3739             }
3740           /* In C89/C94, C++ style comments are forbidden.  */
3741           else if ((CPP_OPTION (pfile, lang) == CLK_STDC89
3742                     || CPP_OPTION (pfile, lang) == CLK_STDC94))
3743             {
3744               /* But don't be confused about valid code such as
3745                  - // immediately followed by *,
3746                  - // in a preprocessing directive,
3747                  - // in an #if 0 block.  */
3748               if (buffer->cur[1] == '*'
3749                   || pfile->state.in_directive
3750                   || pfile->state.skipping)
3751                 {
3752                   result->type = CPP_DIV;
3753                   break;
3754                 }
3755               else if (! buffer->warned_cplusplus_comments)
3756                 {
3757                   if (cpp_error (pfile, CPP_DL_ERROR,
3758                                  "C++ style comments are not allowed in "
3759                                  "ISO C90"))
3760                     cpp_error (pfile, CPP_DL_NOTE,
3761                                "(this will be reported only once per input "
3762                                "file)");
3763                   buffer->warned_cplusplus_comments = 1;
3764                 }
3765             }
3766           if (skip_line_comment (pfile) && CPP_OPTION (pfile, warn_comments))
3767             cpp_warning (pfile, CPP_W_COMMENTS, "multi-line comment");
3768         }
3769       else if (c == '=')
3770         {
3771           buffer->cur++;
3772           result->type = CPP_DIV_EQ;
3773           break;
3774         }
3775       else
3776         {
3777           result->type = CPP_DIV;
3778           break;
3779         }
3780
3781       if (fallthrough_comment_p (pfile, comment_start))
3782         fallthrough_comment = true;
3783
3784       if (pfile->cb.comment)
3785         {
3786           size_t len = pfile->buffer->cur - comment_start; /* COMMENT_START is just past the leading '/', hence the -1/+1 below.  */
3787           pfile->cb.comment (pfile, result->src_loc, comment_start - 1,
3788                              len + 1);
3789         }
3790
3791       if (!pfile->state.save_comments)
3792         {
3793           result->flags |= PREV_WHITE;
3794           goto update_tokens_line;
3795         }
3796
3797       if (fallthrough_comment)
3798         result->flags |= PREV_FALLTHROUGH;
3799
3800       /* Save the comment as a token in its own right.  */
3801       save_comment (pfile, result, comment_start, c);
3802       break;
3803
3804     case '<':
3805       if (pfile->state.angled_headers)
3806         {
3807           lex_string (pfile, result, buffer->cur - 1);
3808           if (result->type != CPP_LESS)
3809             break;
3810         }
3811
3812       result->type = CPP_LESS;
3813       if (*buffer->cur == '=')
3814         {
3815           buffer->cur++, result->type = CPP_LESS_EQ;
3816           if (*buffer->cur == '>'
3817               && CPP_OPTION (pfile, cplusplus)
3818               && CPP_OPTION (pfile, lang) >= CLK_GNUCXX20)
3819             buffer->cur++, result->type = CPP_SPACESHIP;
3820         }
3821       else if (*buffer->cur == '<')
3822         {
3823           buffer->cur++;
3824           IF_NEXT_IS ('=', CPP_LSHIFT_EQ, CPP_LSHIFT);
3825         }
3826       else if (CPP_OPTION (pfile, digraphs))
3827         {
3828           if (*buffer->cur == ':')
3829             {
3830               /* C++11 [2.5/3 lex.pptoken], "Otherwise, if the next
3831                  three characters are <:: and the subsequent character
3832                  is neither : nor >, the < is treated as a preprocessor
3833                  token by itself".  */
3834               if (CPP_OPTION (pfile, cplusplus)
3835                   && CPP_OPTION (pfile, lang) != CLK_CXX98
3836                   && CPP_OPTION (pfile, lang) != CLK_GNUCXX
3837                   && buffer->cur[1] == ':'
3838                   && buffer->cur[2] != ':' && buffer->cur[2] != '>')
3839                 break;
3840
3841               buffer->cur++;
3842               result->flags |= DIGRAPH;
3843               result->type = CPP_OPEN_SQUARE;
3844             }
3845           else if (*buffer->cur == '%')
3846             {
3847               buffer->cur++;
3848               result->flags |= DIGRAPH;
3849               result->type = CPP_OPEN_BRACE;
3850             }
3851         }
3852       break;
3853
3854     case '>':
3855       result->type = CPP_GREATER;
3856       if (*buffer->cur == '=')
3857         buffer->cur++, result->type = CPP_GREATER_EQ;
3858       else if (*buffer->cur == '>')
3859         {
3860           buffer->cur++;
3861           IF_NEXT_IS ('=', CPP_RSHIFT_EQ, CPP_RSHIFT);
3862         }
3863       break;
3864
3865     case '%':
3866       result->type = CPP_MOD;
3867       if (*buffer->cur == '=')
3868         buffer->cur++, result->type = CPP_MOD_EQ;
3869       else if (CPP_OPTION (pfile, digraphs))
3870         {
3871           if (*buffer->cur == ':')
3872             {
3873               buffer->cur++;
3874               result->flags |= DIGRAPH;
3875               result->type = CPP_HASH;
3876               if (*buffer->cur == '%' && buffer->cur[1] == ':')
3877                 buffer->cur += 2, result->type = CPP_PASTE, result->val.token_no = 0;
3878             }
3879           else if (*buffer->cur == '>')
3880             {
3881               buffer->cur++;
3882               result->flags |= DIGRAPH;
3883               result->type = CPP_CLOSE_BRACE;
3884             }
3885         }
3886       break;
3887
3888     case '.':
3889       result->type = CPP_DOT;
3890       if (ISDIGIT (*buffer->cur))
3891         {
3892           struct normalize_state nst = INITIAL_NORMALIZE_STATE;
3893           result->type = CPP_NUMBER;
3894           lex_number (pfile, &result->val.str, &nst);
3895           warn_about_normalization (pfile, result, &nst);
3896         }
3897       else if (*buffer->cur == '.' && buffer->cur[1] == '.')
3898         buffer->cur += 2, result->type = CPP_ELLIPSIS;
3899       else if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
3900         buffer->cur++, result->type = CPP_DOT_STAR;
3901       break;
3902
3903     case '+':
3904       result->type = CPP_PLUS;
3905       if (*buffer->cur == '+')
3906         buffer->cur++, result->type = CPP_PLUS_PLUS;
3907       else if (*buffer->cur == '=')
3908         buffer->cur++, result->type = CPP_PLUS_EQ;
3909       break;
3910
3911     case '-':
3912       result->type = CPP_MINUS;
3913       if (*buffer->cur == '>')
3914         {
3915           buffer->cur++;
3916           result->type = CPP_DEREF;
3917           if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
3918             buffer->cur++, result->type = CPP_DEREF_STAR;
3919         }
3920       else if (*buffer->cur == '-')
3921         buffer->cur++, result->type = CPP_MINUS_MINUS;
3922       else if (*buffer->cur == '=')
3923         buffer->cur++, result->type = CPP_MINUS_EQ;
3924       break;
3925
3926     case '&':
3927       result->type = CPP_AND;
3928       if (*buffer->cur == '&')
3929         buffer->cur++, result->type = CPP_AND_AND;
3930       else if (*buffer->cur == '=')
3931         buffer->cur++, result->type = CPP_AND_EQ;
3932       break;
3933
3934     case '|':
3935       result->type = CPP_OR;
3936       if (*buffer->cur == '|')
3937         buffer->cur++, result->type = CPP_OR_OR;
3938       else if (*buffer->cur == '=')
3939         buffer->cur++, result->type = CPP_OR_EQ;
3940       break;
3941
3942     case ':':
3943       result->type = CPP_COLON;
3944       if (*buffer->cur == ':' && CPP_OPTION (pfile, scope))
3945         buffer->cur++, result->type = CPP_SCOPE;
3946       else if (*buffer->cur == '>' && CPP_OPTION (pfile, digraphs))
3947         {
3948           buffer->cur++;
3949           result->flags |= DIGRAPH;
3950           result->type = CPP_CLOSE_SQUARE;
3951         }
3952       break;
3953
3954     case '*': IF_NEXT_IS ('=', CPP_MULT_EQ, CPP_MULT); break;
3955     case '=': IF_NEXT_IS ('=', CPP_EQ_EQ, CPP_EQ); break;
3956     case '!': IF_NEXT_IS ('=', CPP_NOT_EQ, CPP_NOT); break;
3957     case '^': IF_NEXT_IS ('=', CPP_XOR_EQ, CPP_XOR); break;
3958     case '#': IF_NEXT_IS ('#', CPP_PASTE, CPP_HASH); result->val.token_no = 0; break;
3959
3960     case '?': result->type = CPP_QUERY; break;
3961     case '~': result->type = CPP_COMPL; break;
3962     case ',': result->type = CPP_COMMA; break;
3963     case '(': result->type = CPP_OPEN_PAREN; break;
3964     case ')': result->type = CPP_CLOSE_PAREN; break;
3965     case '[': result->type = CPP_OPEN_SQUARE; break;
3966     case ']': result->type = CPP_CLOSE_SQUARE; break;
3967     case '{': result->type = CPP_OPEN_BRACE; break;
3968     case '}': result->type = CPP_CLOSE_BRACE; break;
3969     case ';': result->type = CPP_SEMICOLON; break;
3970
3971       /* @ is a punctuator in Objective-C.  */
3972     case '@': result->type = CPP_ATSIGN; break;
3973
3974     default:
3975       {
3976         const uchar *base = --buffer->cur;
3977
3978         /* Check for an extended identifier ($ or UCN or UTF-8).  */
3979         struct normalize_state nst = INITIAL_NORMALIZE_STATE;
3980         if (forms_identifier_p (pfile, true, &nst))
3981           {
3982             result->type = CPP_NAME;
3983             result->val.node.node = lex_identifier (pfile, base, true, &nst,
3984                                                     &result->val.node.spelling);
3985             warn_about_normalization (pfile, result, &nst);
3986             break;
3987           }
3988
3989         /* Otherwise this will form a CPP_OTHER token.  Parse valid UTF-8 as a
3990            single token.  */
3991         buffer->cur++;
3992         if (c >= utf8_signifier)
3993           {
3994             const uchar *pstr = base;
3995             cppchar_t s;
3996             if (_cpp_valid_utf8 (pfile, &pstr, buffer->rlimit, 0, NULL, &s))
3997               buffer->cur = pstr;
3998           }
3999         create_literal (pfile, result, base, buffer->cur - base, CPP_OTHER);
4000         break;
4001       }
4002
4003     }
4004
4005   /* Potentially convert the location of the token to a range.  */
4006   if (result->src_loc >= RESERVED_LOCATION_COUNT
4007       && result->type != CPP_EOF)
4008     {
4009       /* Ensure that any line notes are processed, so that we have the
4010          correct physical line/column for the end-point of the token even
4011          when a logical line is split via one or more backslashes.  */
4012       if (buffer->cur >= buffer->notes[buffer->cur_note].pos
4013           && !pfile->overlaid_buffer)
4014         _cpp_process_line_notes (pfile, false);
4015
4016       source_range tok_range;
4017       tok_range.m_start = result->src_loc;
4018       tok_range.m_finish
4019         = linemap_position_for_column (pfile->line_table,
4020                                        CPP_BUF_COLUMN (buffer, buffer->cur));
4021
4022       result->src_loc = COMBINE_LOCATION_DATA (pfile->line_table,
4023                                                result->src_loc,
4024                                                tok_range, NULL);
4025     }
4026
4027   return result;
4028 }
4029
4030 /* An upper bound on the number of bytes needed to spell TOKEN.
4031    Does not include preceding whitespace.  */
4032 unsigned int
4033 cpp_token_len (const cpp_token *token)
4034 {
4035   unsigned int len;
4036
4037   switch (TOKEN_SPELL (token))
4038     {
4039     default:            len = 6;                                break;
4040     case SPELL_LITERAL: len = token->val.str.len;               break;
4041     case SPELL_IDENT:   len = NODE_LEN (token->val.node.node) * 10;     break;
4042     }
4043
4044   return len;
4045 }
4046
/* Decode the UTF-8 sequence that starts at NAME and write the
   corresponding \UXXXXXXXX escape into BUFFER.  Exactly 10 bytes are
   written (lowercase hex digits, no terminating NUL).  Returns the
   number of bytes of NAME that made up the sequence.  Aborts if a
   continuation byte is ill-formed.  */

static size_t
utf8_to_ucn (unsigned char *buffer, const unsigned char *name)
{
  static const char hex_digits[] = "0123456789abcdef";
  const unsigned char *in = name;
  unsigned char *out = buffer;
  unsigned lead = *in;
  unsigned long code_point;
  int seq_len = 0;
  int remaining;
  int shift;

  /* The number of leading one bits in the first byte is the length of
     the whole sequence.  */
  while (lead & 0x80)
    {
      seq_len++;
      lead <<= 1;
    }

  /* What is left of the first byte below the length marker forms the
     most significant bits of the code point.  */
  code_point = *in & (0x7F >> seq_len);

  /* Every following byte contributes six more bits.  */
  for (remaining = seq_len - 1; remaining > 0; remaining--)
    {
      in++;
      code_point = (code_point << 6) | (*in & 0x3F);

      /* Ill-formed UTF-8: a continuation byte must look like 10xxxxxx.  */
      if ((*in & 0xC0) != 0x80)
        abort ();
    }

  *out++ = '\\';
  *out++ = 'U';
  for (shift = 28; shift >= 0; shift -= 4)
    *out++ = hex_digits[(code_point >> shift) & 0xF];

  return seq_len;
}
4080
4081 /* Given a token TYPE corresponding to a digraph, return a pointer to
4082    the spelling of the digraph.  */
4083 static const unsigned char *
4084 cpp_digraph2name (enum cpp_ttype type)
4085 {
4086   return digraph_spellings[(int) type - (int) CPP_FIRST_DIGRAPH];
4087 }
4088
4089 /* Write the spelling of an identifier IDENT, using UCNs, to BUFFER.
4090    The buffer must already contain the enough space to hold the
4091    token's spelling.  Returns a pointer to the character after the
4092    last character written.  */
4093 unsigned char *
4094 _cpp_spell_ident_ucns (unsigned char *buffer, cpp_hashnode *ident)
4095 {
4096   size_t i;
4097   const unsigned char *name = NODE_NAME (ident);
4098
4099   for (i = 0; i < NODE_LEN (ident); i++)
4100     if (name[i] & ~0x7F)
4101       {
4102         i += utf8_to_ucn (buffer, name + i) - 1;
4103         buffer += 10;
4104       }
4105     else
4106       *buffer++ = name[i];
4107
4108   return buffer;
4109 }
4110
4111 /* Write the spelling of a token TOKEN to BUFFER.  The buffer must
4112    already contain the enough space to hold the token's spelling.
4113    Returns a pointer to the character after the last character written.
4114    FORSTRING is true if this is to be the spelling after translation
4115    phase 1 (with the original spelling of extended identifiers), false
4116    if extended identifiers should always be written using UCNs (there is
4117    no option for always writing them in the internal UTF-8 form).
4118    FIXME: Would be nice if we didn't need the PFILE argument.  */
4119 unsigned char *
4120 cpp_spell_token (cpp_reader *pfile, const cpp_token *token,
4121                  unsigned char *buffer, bool forstring)
4122 {
4123   switch (TOKEN_SPELL (token))
4124     {
4125     case SPELL_OPERATOR:
4126       {
4127         const unsigned char *spelling;
4128         unsigned char c;
4129
4130         if (token->flags & DIGRAPH)
4131           spelling = cpp_digraph2name (token->type);
4132         else if (token->flags & NAMED_OP)
4133           goto spell_ident;
4134         else
4135           spelling = TOKEN_NAME (token);
4136
4137         while ((c = *spelling++) != '\0')
4138           *buffer++ = c;
4139       }
4140       break;
4141
4142     spell_ident:
4143     case SPELL_IDENT:
4144       if (forstring)
4145         {
4146           memcpy (buffer, NODE_NAME (token->val.node.spelling),
4147                   NODE_LEN (token->val.node.spelling));
4148           buffer += NODE_LEN (token->val.node.spelling);
4149         }
4150       else
4151         buffer = _cpp_spell_ident_ucns (buffer, token->val.node.node);
4152       break;
4153
4154     case SPELL_LITERAL:
4155       memcpy (buffer, token->val.str.text, token->val.str.len);
4156       buffer += token->val.str.len;
4157       break;
4158
4159     case SPELL_NONE:
4160       cpp_error (pfile, CPP_DL_ICE,
4161                  "unspellable token %s", TOKEN_NAME (token));
4162       break;
4163     }
4164
4165   return buffer;
4166 }
4167
4168 /* Returns TOKEN spelt as a null-terminated string.  The string is
4169    freed when the reader is destroyed.  Useful for diagnostics.  */
4170 unsigned char *
4171 cpp_token_as_text (cpp_reader *pfile, const cpp_token *token)
4172 {
4173   unsigned int len = cpp_token_len (token) + 1;
4174   unsigned char *start = _cpp_unaligned_alloc (pfile, len), *end;
4175
4176   end = cpp_spell_token (pfile, token, start, false);
4177   end[0] = '\0';
4178
4179   return start;
4180 }
4181
4182 /* Returns a pointer to a string which spells the token defined by
4183    TYPE and FLAGS.  Used by C front ends, which really should move to
4184    using cpp_token_as_text.  */
4185 const char *
4186 cpp_type2name (enum cpp_ttype type, unsigned char flags)
4187 {
4188   if (flags & DIGRAPH)
4189     return (const char *) cpp_digraph2name (type);
4190   else if (flags & NAMED_OP)
4191     return cpp_named_operator2name (type);
4192
4193   return (const char *) token_spellings[type].name;
4194 }
4195
4196 /* Writes the spelling of token to FP, without any preceding space.
4197    Separated from cpp_spell_token for efficiency - to avoid stdio
4198    double-buffering.  */
4199 void
4200 cpp_output_token (const cpp_token *token, FILE *fp)
4201 {
4202   switch (TOKEN_SPELL (token))
4203     {
4204     case SPELL_OPERATOR:
4205       {
4206         const unsigned char *spelling;
4207         int c;
4208
4209         if (token->flags & DIGRAPH)
4210           spelling = cpp_digraph2name (token->type);
4211         else if (token->flags & NAMED_OP)
4212           goto spell_ident;
4213         else
4214           spelling = TOKEN_NAME (token);
4215
4216         c = *spelling;
4217         do
4218           putc (c, fp);
4219         while ((c = *++spelling) != '\0');
4220       }
4221       break;
4222
4223     spell_ident:
4224     case SPELL_IDENT:
4225       {
4226         size_t i;
4227         const unsigned char * name = NODE_NAME (token->val.node.node);
4228
4229         for (i = 0; i < NODE_LEN (token->val.node.node); i++)
4230           if (name[i] & ~0x7F)
4231             {
4232               unsigned char buffer[10];
4233               i += utf8_to_ucn (buffer, name + i) - 1;
4234               fwrite (buffer, 1, 10, fp);
4235             }
4236           else
4237             fputc (NODE_NAME (token->val.node.node)[i], fp);
4238       }
4239       break;
4240
4241     case SPELL_LITERAL:
4242       if (token->type == CPP_HEADER_NAME)
4243         fputc ('"', fp);
4244       fwrite (token->val.str.text, 1, token->val.str.len, fp);
4245       if (token->type == CPP_HEADER_NAME)
4246         fputc ('"', fp);
4247       break;
4248
4249     case SPELL_NONE:
4250       /* An error, most probably.  */
4251       break;
4252     }
4253 }
4254
4255 /* Compare two tokens.  */
4256 int
4257 _cpp_equiv_tokens (const cpp_token *a, const cpp_token *b)
4258 {
4259   if (a->type == b->type && a->flags == b->flags)
4260     switch (TOKEN_SPELL (a))
4261       {
4262       default:                  /* Keep compiler happy.  */
4263       case SPELL_OPERATOR:
4264         /* token_no is used to track where multiple consecutive ##
4265            tokens were originally located.  */
4266         return (a->type != CPP_PASTE || a->val.token_no == b->val.token_no);
4267       case SPELL_NONE:
4268         return (a->type != CPP_MACRO_ARG
4269                 || (a->val.macro_arg.arg_no == b->val.macro_arg.arg_no
4270                     && a->val.macro_arg.spelling == b->val.macro_arg.spelling));
4271       case SPELL_IDENT:
4272         return (a->val.node.node == b->val.node.node
4273                 && a->val.node.spelling == b->val.node.spelling);
4274       case SPELL_LITERAL:
4275         return (a->val.str.len == b->val.str.len
4276                 && !memcmp (a->val.str.text, b->val.str.text,
4277                             a->val.str.len));
4278       }
4279
4280   return 0;
4281 }
4282
4283 /* Returns nonzero if a space should be inserted to avoid an
4284    accidental token paste for output.  For simplicity, it is
4285    conservative, and occasionally advises a space where one is not
4286    needed, e.g. "." and ".2".  */
4287 int
4288 cpp_avoid_paste (cpp_reader *pfile, const cpp_token *token1,
4289                  const cpp_token *token2)
4290 {
4291   enum cpp_ttype a = token1->type, b = token2->type;
4292   cppchar_t c;
4293
4294   if (token1->flags & NAMED_OP)
4295     a = CPP_NAME;
4296   if (token2->flags & NAMED_OP)
4297     b = CPP_NAME;
4298
4299   c = EOF;
4300   if (token2->flags & DIGRAPH)
4301     c = digraph_spellings[(int) b - (int) CPP_FIRST_DIGRAPH][0];
4302   else if (token_spellings[b].category == SPELL_OPERATOR)
4303     c = token_spellings[b].name[0];
4304
4305   /* Quickly get everything that can paste with an '='.  */
4306   if ((int) a <= (int) CPP_LAST_EQ && c == '=')
4307     return 1;
4308
4309   switch (a)
4310     {
4311     case CPP_GREATER:   return c == '>';
4312     case CPP_LESS:      return c == '<' || c == '%' || c == ':';
4313     case CPP_PLUS:      return c == '+';
4314     case CPP_MINUS:     return c == '-' || c == '>';
4315     case CPP_DIV:       return c == '/' || c == '*'; /* Comments.  */
4316     case CPP_MOD:       return c == ':' || c == '>';
4317     case CPP_AND:       return c == '&';
4318     case CPP_OR:        return c == '|';
4319     case CPP_COLON:     return c == ':' || c == '>';
4320     case CPP_DEREF:     return c == '*';
4321     case CPP_DOT:       return c == '.' || c == '%' || b == CPP_NUMBER;
4322     case CPP_HASH:      return c == '#' || c == '%'; /* Digraph form.  */
4323     case CPP_PRAGMA:
4324     case CPP_NAME:      return ((b == CPP_NUMBER
4325                                  && name_p (pfile, &token2->val.str))
4326                                 || b == CPP_NAME
4327                                 || b == CPP_CHAR || b == CPP_STRING); /* L */
4328     case CPP_NUMBER:    return (b == CPP_NUMBER || b == CPP_NAME
4329                                 || b == CPP_CHAR
4330                                 || c == '.' || c == '+' || c == '-');
4331                                       /* UCNs */
4332     case CPP_OTHER:     return ((token1->val.str.text[0] == '\\'
4333                                  && b == CPP_NAME)
4334                                 || (CPP_OPTION (pfile, objc)
4335                                     && token1->val.str.text[0] == '@'
4336                                     && (b == CPP_NAME || b == CPP_STRING)));
4337     case CPP_LESS_EQ:   return c == '>';
4338     case CPP_STRING:
4339     case CPP_WSTRING:
4340     case CPP_UTF8STRING:
4341     case CPP_STRING16:
4342     case CPP_STRING32:  return (CPP_OPTION (pfile, user_literals)
4343                                 && (b == CPP_NAME
4344                                     || (TOKEN_SPELL (token2) == SPELL_LITERAL
4345                                         && ISIDST (token2->val.str.text[0]))));
4346
4347     default:            break;
4348     }
4349
4350   return 0;
4351 }
4352
4353 /* Output all the remaining tokens on the current line, and a newline
4354    character, to FP.  Leading whitespace is removed.  If there are
4355    macros, special token padding is not performed.  */
4356 void
4357 cpp_output_line (cpp_reader *pfile, FILE *fp)
4358 {
4359   const cpp_token *token;
4360
4361   token = cpp_get_token (pfile);
4362   while (token->type != CPP_EOF)
4363     {
4364       cpp_output_token (token, fp);
4365       token = cpp_get_token (pfile);
4366       if (token->flags & PREV_WHITE)
4367         putc (' ', fp);
4368     }
4369
4370   putc ('\n', fp);
4371 }
4372
4373 /* Return a string representation of all the remaining tokens on the
4374    current line.  The result is allocated using xmalloc and must be
4375    freed by the caller.  */
4376 unsigned char *
4377 cpp_output_line_to_string (cpp_reader *pfile, const unsigned char *dir_name)
4378 {
4379   const cpp_token *token;
4380   unsigned int out = dir_name ? ustrlen (dir_name) : 0;
4381   unsigned int alloced = 120 + out;
4382   unsigned char *result = (unsigned char *) xmalloc (alloced);
4383
4384   /* If DIR_NAME is empty, there are no initial contents.  */
4385   if (dir_name)
4386     {
4387       sprintf ((char *) result, "#%s ", dir_name);
4388       out += 2;
4389     }
4390
4391   token = cpp_get_token (pfile);
4392   while (token->type != CPP_EOF)
4393     {
4394       unsigned char *last;
4395       /* Include room for a possible space and the terminating nul.  */
4396       unsigned int len = cpp_token_len (token) + 2;
4397
4398       if (out + len > alloced)
4399         {
4400           alloced *= 2;
4401           if (out + len > alloced)
4402             alloced = out + len;
4403           result = (unsigned char *) xrealloc (result, alloced);
4404         }
4405
4406       last = cpp_spell_token (pfile, token, &result[out], 0);
4407       out = last - result;
4408
4409       token = cpp_get_token (pfile);
4410       if (token->flags & PREV_WHITE)
4411         result[out++] = ' ';
4412     }
4413
4414   result[out] = '\0';
4415   return result;
4416 }
4417
/* Memory buffers.  Changing these three constants can have a dramatic
   effect on performance.  The values here are reasonable defaults,
   but might be tuned.  If you adjust them, be sure to test across a
   range of uses of cpplib, including heavy nested function-like macro
   expansion.  Also check the change in peak memory usage (NJAMD is a
   good tool for this).

   MIN_BUFF_SIZE is the smallest buffer ever allocated.
   BUFF_SIZE_UPPER_BOUND (MIN_SIZE) is the largest free buffer that is
   still handed out for a request of MIN_SIZE bytes.
   EXTENDED_BUFF_SIZE (BUFF, MIN_EXTRA) is the size requested when
   extending BUFF: MIN_EXTRA plus twice the room left in BUFF.  Every
   macro argument is parenthesized so that compound expressions group
   as the caller wrote them.  */
#define MIN_BUFF_SIZE 8000
#define BUFF_SIZE_UPPER_BOUND(MIN_SIZE) (MIN_BUFF_SIZE + (MIN_SIZE) * 3 / 2)
#define EXTENDED_BUFF_SIZE(BUFF, MIN_EXTRA) \
        ((MIN_EXTRA) + ((BUFF)->limit - (BUFF)->cur) * 2)

#if MIN_BUFF_SIZE > BUFF_SIZE_UPPER_BOUND (0)
  #error BUFF_SIZE_UPPER_BOUND must be at least as large as MIN_BUFF_SIZE!
#endif
4432
4433 /* Create a new allocation buffer.  Place the control block at the end
4434    of the buffer, so that buffer overflows will cause immediate chaos.  */
4435 static _cpp_buff *
4436 new_buff (size_t len)
4437 {
4438   _cpp_buff *result;
4439   unsigned char *base;
4440
4441   if (len < MIN_BUFF_SIZE)
4442     len = MIN_BUFF_SIZE;
4443   len = CPP_ALIGN (len);
4444
4445 #ifdef ENABLE_VALGRIND_ANNOTATIONS
4446   /* Valgrind warns about uses of interior pointers, so put _cpp_buff
4447      struct first.  */
4448   size_t slen = CPP_ALIGN2 (sizeof (_cpp_buff), 2 * DEFAULT_ALIGNMENT);
4449   base = XNEWVEC (unsigned char, len + slen);
4450   result = (_cpp_buff *) base;
4451   base += slen;
4452 #else
4453   base = XNEWVEC (unsigned char, len + sizeof (_cpp_buff));
4454   result = (_cpp_buff *) (base + len);
4455 #endif
4456   result->base = base;
4457   result->cur = base;
4458   result->limit = base + len;
4459   result->next = NULL;
4460   return result;
4461 }
4462
4463 /* Place a chain of unwanted allocation buffers on the free list.  */
4464 void
4465 _cpp_release_buff (cpp_reader *pfile, _cpp_buff *buff)
4466 {
4467   _cpp_buff *end = buff;
4468
4469   while (end->next)
4470     end = end->next;
4471   end->next = pfile->free_buffs;
4472   pfile->free_buffs = buff;
4473 }
4474
4475 /* Return a free buffer of size at least MIN_SIZE.  */
4476 _cpp_buff *
4477 _cpp_get_buff (cpp_reader *pfile, size_t min_size)
4478 {
4479   _cpp_buff *result, **p;
4480
4481   for (p = &pfile->free_buffs;; p = &(*p)->next)
4482     {
4483       size_t size;
4484
4485       if (*p == NULL)
4486         return new_buff (min_size);
4487       result = *p;
4488       size = result->limit - result->base;
4489       /* Return a buffer that's big enough, but don't waste one that's
4490          way too big.  */
4491       if (size >= min_size && size <= BUFF_SIZE_UPPER_BOUND (min_size))
4492         break;
4493     }
4494
4495   *p = result->next;
4496   result->next = NULL;
4497   result->cur = result->base;
4498   return result;
4499 }
4500
4501 /* Creates a new buffer with enough space to hold the uncommitted
4502    remaining bytes of BUFF, and at least MIN_EXTRA more bytes.  Copies
4503    the excess bytes to the new buffer.  Chains the new buffer after
4504    BUFF, and returns the new buffer.  */
4505 _cpp_buff *
4506 _cpp_append_extend_buff (cpp_reader *pfile, _cpp_buff *buff, size_t min_extra)
4507 {
4508   size_t size = EXTENDED_BUFF_SIZE (buff, min_extra);
4509   _cpp_buff *new_buff = _cpp_get_buff (pfile, size);
4510
4511   buff->next = new_buff;
4512   memcpy (new_buff->base, buff->cur, BUFF_ROOM (buff));
4513   return new_buff;
4514 }
4515
4516 /* Creates a new buffer with enough space to hold the uncommitted
4517    remaining bytes of the buffer pointed to by BUFF, and at least
4518    MIN_EXTRA more bytes.  Copies the excess bytes to the new buffer.
4519    Chains the new buffer before the buffer pointed to by BUFF, and
4520    updates the pointer to point to the new buffer.  */
4521 void
4522 _cpp_extend_buff (cpp_reader *pfile, _cpp_buff **pbuff, size_t min_extra)
4523 {
4524   _cpp_buff *new_buff, *old_buff = *pbuff;
4525   size_t size = EXTENDED_BUFF_SIZE (old_buff, min_extra);
4526
4527   new_buff = _cpp_get_buff (pfile, size);
4528   memcpy (new_buff->base, old_buff->cur, BUFF_ROOM (old_buff));
4529   new_buff->next = old_buff;
4530   *pbuff = new_buff;
4531 }
4532
4533 /* Free a chain of buffers starting at BUFF.  */
4534 void
4535 _cpp_free_buff (_cpp_buff *buff)
4536 {
4537   _cpp_buff *next;
4538
4539   for (; buff; buff = next)
4540     {
4541       next = buff->next;
4542 #ifdef ENABLE_VALGRIND_ANNOTATIONS
4543       free (buff);
4544 #else
4545       free (buff->base);
4546 #endif
4547     }
4548 }
4549
4550 /* Allocate permanent, unaligned storage of length LEN.  */
4551 unsigned char *
4552 _cpp_unaligned_alloc (cpp_reader *pfile, size_t len)
4553 {
4554   _cpp_buff *buff = pfile->u_buff;
4555   unsigned char *result = buff->cur;
4556
4557   if (len > (size_t) (buff->limit - result))
4558     {
4559       buff = _cpp_get_buff (pfile, len);
4560       buff->next = pfile->u_buff;
4561       pfile->u_buff = buff;
4562       result = buff->cur;
4563     }
4564
4565   buff->cur = result + len;
4566   return result;
4567 }
4568
4569 /* Allocate permanent, unaligned storage of length LEN from a_buff.
4570    That buffer is used for growing allocations when saving macro
4571    replacement lists in a #define, and when parsing an answer to an
4572    assertion in #assert, #unassert or #if (and therefore possibly
4573    whilst expanding macros).  It therefore must not be used by any
4574    code that they might call: specifically the lexer and the guts of
4575    the macro expander.
4576
4577    All existing other uses clearly fit this restriction: storing
4578    registered pragmas during initialization.  */
4579 unsigned char *
4580 _cpp_aligned_alloc (cpp_reader *pfile, size_t len)
4581 {
4582   _cpp_buff *buff = pfile->a_buff;
4583   unsigned char *result = buff->cur;
4584
4585   if (len > (size_t) (buff->limit - result))
4586     {
4587       buff = _cpp_get_buff (pfile, len);
4588       buff->next = pfile->a_buff;
4589       pfile->a_buff = buff;
4590       result = buff->cur;
4591     }
4592
4593   buff->cur = result + len;
4594   return result;
4595 }
4596
4597 /* Commit or allocate storage from a buffer.  */
4598
4599 void *
4600 _cpp_commit_buff (cpp_reader *pfile, size_t size)
4601 {
4602   void *ptr = BUFF_FRONT (pfile->a_buff);
4603
4604   if (pfile->hash_table->alloc_subobject)
4605     {
4606       void *copy = pfile->hash_table->alloc_subobject (size);
4607       memcpy (copy, ptr, size);
4608       ptr = copy;
4609     }
4610   else
4611     BUFF_FRONT (pfile->a_buff) += size;
4612
4613   return ptr;
4614 }
4615
4616 /* Say which field of TOK is in use.  */
4617
4618 enum cpp_token_fld_kind
4619 cpp_token_val_index (const cpp_token *tok)
4620 {
4621   switch (TOKEN_SPELL (tok))
4622     {
4623     case SPELL_IDENT:
4624       return CPP_TOKEN_FLD_NODE;
4625     case SPELL_LITERAL:
4626       return CPP_TOKEN_FLD_STR;
4627     case SPELL_OPERATOR:
4628       /* Operands which were originally spelled as ident keep around
4629          the node for the exact spelling.  */
4630       if (tok->flags & NAMED_OP)
4631         return CPP_TOKEN_FLD_NODE;
4632       else if (tok->type == CPP_PASTE)
4633         return CPP_TOKEN_FLD_TOKEN_NO;
4634       else
4635         return CPP_TOKEN_FLD_NONE;
4636     case SPELL_NONE:
4637       if (tok->type == CPP_MACRO_ARG)
4638         return CPP_TOKEN_FLD_ARG_NO;
4639       else if (tok->type == CPP_PADDING)
4640         return CPP_TOKEN_FLD_SOURCE;
4641       else if (tok->type == CPP_PRAGMA)
4642         return CPP_TOKEN_FLD_PRAGMA;
4643       /* fall through */
4644     default:
4645       return CPP_TOKEN_FLD_NONE;
4646     }
4647 }
4648
/* All tokens lexed in R after calling this function will be forced to
   have their location_t set to LOC, until
   cpp_stop_forcing_token_locations is called for R.  */

void
cpp_force_token_locations (cpp_reader *r, location_t loc)
{
  /* Record the forced location in the reader.  */
  r->forced_token_location = loc;
}
4658
4659 /* Go back to assigning locations naturally for lexed tokens.  */
4660
4661 void
4662 cpp_stop_forcing_token_locations (cpp_reader *r)
4663 {
4664   r->forced_token_location = 0;
4665 }
4666
/* PEEK points at a backslash.  If that backslash escapes an end of
   line ('\n', a lone '\r', or "\r\n"), return a pointer to the first
   character after the line ending, repeating for any further escaped
   newlines that follow immediately.  If the character after the line
   ending would be at or beyond LIMIT, do not advance.  A backslash
   that does not escape a line ending is returned unchanged.  */

static const unsigned char *
do_peek_backslash (const unsigned char *peek, const unsigned char *limit)
{
  for (;;)
    {
      const unsigned char *resume;

      if (__builtin_expect (peek[1] == '\n', true))
        resume = peek + 2;
      else if (__builtin_expect (peek[1] == '\r', false))
        /* A lone CR and a CRLF pair each count as one line ending.  */
        resume = peek + (peek[2] == '\n' ? 3 : 2);
      else
        /* Not an escaped newline: stay on the backslash.  */
        return peek;

      if (!__builtin_expect (resume < limit, true))
        return peek;

      peek = resume;
      if (*peek != '\\')
        return peek;
      /* The user might be perverse: another backslash follows, so go
         round again.  */
    }
}
4696
/* Return PEEK, unless it points at a backslash, in which case return
   whatever do_peek_backslash makes of it (the position after any
   escaped newline, bounded by LIMIT).  */
static const unsigned char *
do_peek_next (const unsigned char *peek, const unsigned char *limit)
{
  return (__builtin_expect (*peek == '\\', false)
          ? do_peek_backslash (peek, limit)
          : peek);
}
4704
/* Step backwards from PEEK towards BOUND and return a pointer to the
   previous character, or NULL if PEEK is already at BOUND.  If the
   previous character is a line ending ('\n', a lone '\r', or "\r\n")
   that is escaped by a preceding backslash, the whole escaped newline
   is skipped and the search continues before the backslash.  An
   unescaped line ending is returned as is.  Nothing below BOUND is
   ever read.  */
static const unsigned char *
do_peek_prev (const unsigned char *peek, const unsigned char *bound)
{
  if (peek == bound)
    return NULL;

  unsigned char c = *--peek;
  /* Both '\n' and the carriage return '\r' can end a line.  This must
     test the control character '\r', not the letter 'r'.  */
  if (__builtin_expect (c == '\n', false)
      || __builtin_expect (c == '\r', false))
    {
      if (peek == bound)
        return peek;

      /* IX is the offset from PEEK of the character before the line
         ending; a "\r\n" pair pushes it back one further.  */
      int ix = -1;
      if (c == '\n' && peek[ix] == '\r')
        {
          if (peek + ix == bound)
            return peek;
          ix--;
        }

      /* An escaped newline is invisible: carry on from the backslash.  */
      if (peek[ix] == '\\')
        return do_peek_prev (peek + ix, bound);

      return peek;
    }
  else
    return peek;
}
4733
/* PEEK[-1] has already been matched against the first character of
   the identifier MATCH.  Check that the rest of MATCH follows at PEEK
   (escaped newlines may be interleaved) and that the identifier ends
   there.  On success return the position after any trailing blanks,
   tabs and escaped newlines; otherwise return NULL.  */

static const unsigned char *
do_peek_ident (const char *match, const unsigned char *peek,
               const unsigned char *limit)
{
  /* The first character of MATCH was consumed by the caller, hence
     the pre-increment.  */
  while (*++match != '\0')
    {
      if (*peek != *match)
        {
          /* Retry once after skipping a possible escaped newline.  */
          peek = do_peek_next (peek, limit);
          if (*peek != *match)
            return NULL;
        }
      peek++;
    }

  /* Must now not be looking at an identifier char.  */
  peek = do_peek_next (peek, limit);
  if (ISIDNUM (*peek))
    return NULL;

  /* Skip control-line whitespace.  */
  for (;;)
    {
      while (*peek == ' ' || *peek == '\t')
        peek++;

      if (!__builtin_expect (*peek == '\\', false))
        break;

      peek = do_peek_backslash (peek, limit);
      /* Still on a backslash: it did not escape a newline we could
         step over, so stop here.  */
      if (*peek == '\\')
        break;
    }

  return peek;
}
4767
4768 /* Are we looking at a module control line starting as PEEK - 1?  */
4769
4770 static bool
4771 do_peek_module (cpp_reader *pfile, unsigned char c,
4772                 const unsigned char *peek, const unsigned char *limit)
4773 {
4774   bool import = false;
4775
4776   if (__builtin_expect (c == 'e', false))
4777     {
4778       if (!((peek[0] == 'x' || peek[0] == '\\')
4779             && (peek = do_peek_ident ("export", peek, limit))))
4780         return false;
4781
4782       /* export, peek for import or module.  No need to peek __import
4783          here.  */
4784       if (peek[0] == 'i')
4785         {
4786           if (!((peek[1] == 'm' || peek[1] == '\\')
4787                 && (peek = do_peek_ident ("import", peek + 1, limit))))
4788             return false;
4789           import = true;
4790         }
4791       else if (peek[0] == 'm')
4792         {
4793           if (!((peek[1] == 'o' || peek[1] == '\\')
4794                 && (peek = do_peek_ident ("module", peek + 1, limit))))
4795             return false;
4796         }
4797       else
4798         return false;
4799     }
4800   else if (__builtin_expect (c == 'i', false))
4801     {
4802       if (!((peek[0] == 'm' || peek[0] == '\\')
4803             && (peek = do_peek_ident ("import", peek, limit))))
4804         return false;
4805       import = true;
4806     }
4807   else if (__builtin_expect (c == '_', false))
4808     {
4809       /* Needed for translated includes.   */
4810       if (!((peek[0] == '_' || peek[0] == '\\')
4811             && (peek = do_peek_ident ("__import", peek, limit))))
4812         return false;
4813       import = true;
4814     }
4815   else if (__builtin_expect (c == 'm', false))
4816     {
4817       if (!((peek[0] == 'o' || peek[0] == '\\')
4818             && (peek = do_peek_ident ("module", peek, limit))))
4819         return false;
4820     }
4821   else
4822     return false;
4823
4824   /* Peek the next character to see if it's good enough.  We'll be at
4825      the first non-whitespace char, including skipping an escaped
4826      newline.  */
4827   /* ... import followed by identifier, ':', '<' or header-name
4828      preprocessing tokens, or module followed by identifier, ':' or
4829      ';' preprocessing tokens.  */
4830   unsigned char p = *peek++;
4831
4832   /* A character literal is ... single quotes, ... optionally preceded
4833      by u8, u, U, or L */
4834   /* A string-literal is a ... double quotes, optionally prefixed by
4835      R, u8, u8R, u, uR, U, UR, L, or LR */
4836   if (p == 'u')
4837     {
4838       peek = do_peek_next (peek, limit);
4839       if (*peek == '8')
4840         {
4841           peek++;
4842           goto peek_u8;
4843         }
4844       goto peek_u;
4845     }
4846   else if (p == 'U' || p == 'L')
4847     {
4848     peek_u8:
4849       peek = do_peek_next (peek, limit);
4850     peek_u:
4851       if (*peek == '\"' || *peek == '\'')
4852         return false;
4853
4854       if (*peek == 'R')
4855         goto peek_R;
4856       /* Identifier. Ok.  */
4857     }
4858   else if (p == 'R')
4859     {
4860     peek_R:
4861       if (CPP_OPTION (pfile, rliterals))
4862         {
4863           peek = do_peek_next (peek, limit);
4864           if (*peek == '\"')
4865             return false;
4866         }
4867       /* Identifier. Ok.  */
4868     }
4869   else if ('Z' - 'A' == 25
4870            ? ((p >= 'A' && p <= 'Z') || (p >= 'a' && p <= 'z') || p == '_')
4871            : ISIDST (p))
4872     {
4873       /* Identifier.  Ok. */
4874     }
4875   else if (p == '<')
4876     {
4877       /* Maybe angle header, ok for import.  Reject
4878          '<=', '<<' digraph:'<:'.  */
4879       if (!import)
4880         return false;
4881       peek = do_peek_next (peek, limit);
4882       if (*peek == '=' || *peek == '<'
4883           || (*peek == ':' && CPP_OPTION (pfile, digraphs)))
4884         return false;
4885     }
4886   else if (p == ';')
4887     {
4888       /* SEMICOLON, ok for module.  */
4889       if (import)
4890         return false;
4891     }
4892   else if (p == '"')
4893     {
4894       /* STRING, ok for import.  */
4895       if (!import)
4896         return false;
4897     }
4898   else if (p == ':')
4899     {
4900       /* Maybe COLON, ok.  Reject '::', digraph:':>'.  */
4901       peek = do_peek_next (peek, limit);
4902       if (*peek == ':' || (*peek == '>' && CPP_OPTION (pfile, digraphs)))
4903         return false;
4904     }
4905   else
4906     /* FIXME: Detect a unicode character, excluding those not
4907        permitted as the initial character. [lex.name]/1.  I presume
4908        we need to check the \[uU] spellings, and directly using
4909        Unicode in say UTF8 form?  Or perhaps we do the phase-1
4910        conversion of UTF8 to universal-character-names?  */
4911     return false;
4912
4913   return true;
4914 }
4915
4916 /* Directives-only scanning.  Somewhat more relaxed than correct
4917    parsing -- some ill-formed programs will not be rejected.  */
4918
4919 void
4920 cpp_directive_only_process (cpp_reader *pfile,
4921                             void *data,
4922                             void (*cb) (cpp_reader *, CPP_DO_task, void *, ...))
4923 {
4924   bool module_p = CPP_OPTION (pfile, module_directives);
4925
4926   do
4927     {
4928     restart:
4929       /* Buffer initialization, but no line cleaning. */
4930       cpp_buffer *buffer = pfile->buffer;
4931       buffer->cur_note = buffer->notes_used = 0;
4932       buffer->cur = buffer->line_base = buffer->next_line;
4933       buffer->need_line = false;
4934       /* Files always end in a newline or carriage return.  We rely on this for
4935          character peeking safety.  */
4936       gcc_assert (buffer->rlimit[0] == '\n' || buffer->rlimit[0] == '\r');
4937
4938       const unsigned char *base = buffer->cur;
4939       unsigned line_count = 0;
4940       const unsigned char *line_start = base;
4941
4942       bool bol = true;
4943       bool raw = false;
4944
4945       const unsigned char *lwm = base;
4946       for (const unsigned char *pos = base, *limit = buffer->rlimit;
4947            pos < limit;)
4948         {
4949           unsigned char c = *pos++;
4950           /* This matches the switch in _cpp_lex_direct.  */
4951           switch (c)
4952             {
4953             case ' ': case '\t': case '\f': case '\v':
4954               /* Whitespace, do nothing.  */
4955               break;
4956
4957             case '\r': /* MAC line ending, or Windows \r\n  */
4958               if (*pos == '\n')
4959                 pos++;
4960               /* FALLTHROUGH */
4961
4962             case '\n':
4963               bol = true;
4964
4965             next_line:
4966               CPP_INCREMENT_LINE (pfile, 0);
4967               line_count++;
4968               line_start = pos;
4969               break;
4970
4971             case '\\':
4972               /* <backslash><newline> is removed, and doesn't undo any
4973                  preceeding escape or whatnot.  */
4974               if (*pos == '\n')
4975                 {
4976                   pos++;
4977                   goto next_line;
4978                 }
4979               else if (*pos == '\r')
4980                 {
4981                   if (pos[1] == '\n')
4982                     pos++;
4983                   pos++;
4984                   goto next_line;
4985                 }
4986               goto dflt;
4987
4988             case '#':
4989               if (bol)
4990                 {
4991                   /* Line directive.  */
4992                   if (pos - 1 > base && !pfile->state.skipping)
4993                     cb (pfile, CPP_DO_print, data,
4994                         line_count, base, pos - 1 - base);
4995
4996                   /* Prep things for directive handling. */
4997                   buffer->next_line = pos;
4998                   buffer->need_line = true;
4999                   bool ok = _cpp_get_fresh_line (pfile);
5000                   gcc_checking_assert (ok);
5001
5002                   /* Ensure proper column numbering for generated
5003                      error messages. */
5004                   buffer->line_base -= pos - line_start;
5005
5006                   _cpp_handle_directive (pfile, line_start + 1 != pos);
5007
5008                   /* Sanitize the line settings.  Duplicate #include's can
5009                      mess things up. */
5010                   // FIXME: Necessary?
5011                   pfile->line_table->highest_location
5012                     = pfile->line_table->highest_line;
5013
5014                   if (!pfile->state.skipping
5015                       && pfile->buffer->next_line < pfile->buffer->rlimit)
5016                     cb (pfile, CPP_DO_location, data,
5017                         pfile->line_table->highest_line);
5018
5019                   goto restart;
5020                 }
5021               goto dflt;
5022
5023             case '/':
5024               {
5025                 const unsigned char *peek = do_peek_next (pos, limit);
5026                 if (!(*peek == '/' || *peek == '*'))
5027                   goto dflt;
5028
5029                 /* Line or block comment  */
5030                 bool is_block = *peek == '*';
5031                 bool star = false;
5032                 bool esc = false;
5033                 location_t sloc
5034                   = linemap_position_for_column (pfile->line_table,
5035                                                  pos - line_start);
5036
5037                 while (pos < limit)
5038                   {
5039                     char c = *pos++;
5040                     switch (c)
5041                       {
5042                       case '\\':
5043                         esc = true;
5044                         break;
5045
5046                       case '\r':
5047                         if (*pos == '\n')
5048                           pos++;
5049                         /* FALLTHROUGH  */
5050
5051                       case '\n':
5052                         {
5053                           CPP_INCREMENT_LINE (pfile, 0);
5054                           line_count++;
5055                           line_start = pos;
5056                           if (!esc && !is_block)
5057                             {
5058                               bol = true;
5059                               goto done_comment;
5060                             }
5061                         }
5062                         if (!esc)
5063                           star = false;
5064                         esc = false;
5065                         break;
5066
5067                       case '*':
5068                         if (pos > peek)
5069                           star = is_block;
5070                         esc = false;
5071                         break;
5072
5073                       case '/':
5074                         if (star)
5075                           goto done_comment;
5076                         /* FALLTHROUGH  */
5077
5078                       default:
5079                         star = false;
5080                         esc = false;
5081                         break;
5082                       }
5083                   }
5084                 if (pos < limit || is_block)
5085                   cpp_error_with_line (pfile, CPP_DL_ERROR, sloc, 0,
5086                                        "unterminated comment");
5087               done_comment:
5088                 lwm = pos;
5089                 break;
5090               }
5091
5092             case '\'':
5093               if (!CPP_OPTION (pfile, digit_separators))
5094                 goto delimited_string;
5095
5096               /* Possibly a number punctuator.  */
5097               if (!ISIDNUM (*do_peek_next (pos, limit)))
5098                 goto delimited_string;
5099
5100               goto quote_peek;
5101
5102             case '\"':
5103               if (!CPP_OPTION (pfile, rliterals))
5104                 goto delimited_string;
5105
5106             quote_peek:
5107               {
5108                 /* For ' see if it's a number punctuator
5109                    \.?<digit>(<digit>|<identifier-nondigit>
5110                    |'<digit>|'<nondigit>|[eEpP]<sign>|\.)* */
5111                 /* For " see if it's a raw string
5112                    {U,L,u,u8}R.  This includes CPP_NUMBER detection,
5113                    because that could be 0e+R.  */
5114                 const unsigned char *peek = pos - 1;
5115                 bool quote_first = c == '"';
5116                 bool quote_eight = false;
5117                 bool maybe_number_start = false;
5118                 bool want_number = false;
5119
5120                 while ((peek = do_peek_prev (peek, lwm)))
5121                   {
5122                     unsigned char p = *peek;
5123                     if (quote_first)
5124                       {
5125                         if (!raw)
5126                           {
5127                             if (p != 'R')
5128                               break;
5129                             raw = true;
5130                             continue;
5131                           }
5132
5133                         quote_first = false;
5134                         if (p == 'L' || p == 'U' || p == 'u')
5135                           ;
5136                         else if (p == '8')
5137                           quote_eight = true;
5138                         else
5139                           goto second_raw;
5140                       }
5141                     else if (quote_eight)
5142                       {
5143                         if (p != 'u')
5144                           {
5145                             raw = false;
5146                             break;
5147                           }
5148                         quote_eight = false;
5149                       }
5150                     else if (c == '"')
5151                       {
5152                       second_raw:;
5153                         if (!want_number && ISIDNUM (p))
5154                           {
5155                             raw = false;
5156                             break;
5157                           }
5158                       }
5159
5160                     if (ISDIGIT (p))
5161                       maybe_number_start = true;
5162                     else if (p == '.')
5163                       want_number = true;
5164                     else if (ISIDNUM (p))
5165                       maybe_number_start = false;
5166                     else if (p == '+' || p == '-')
5167                       {
5168                         if (const unsigned char *peek_prev
5169                             = do_peek_prev (peek, lwm))
5170                           {
5171                             p = *peek_prev;
5172                             if (p == 'e' || p == 'E'
5173                                 || p == 'p' || p == 'P')
5174                               {
5175                                 want_number = true;
5176                                 maybe_number_start = false;
5177                               }
5178                             else
5179                               break;
5180                           }
5181                         else
5182                           break;
5183                       }
5184                     else if (p == '\'' || p == '\"')
5185                       {
5186                         /* If this is lwm, this must be the end of a
5187                            previous string.  So this is a trailing
5188                            literal type, (a) if those are allowed,
5189                              and (b) maybe_start is false.  Otherwise
5190                              this must be a CPP_NUMBER because we've
5191                              met another ', and we'd have checked that
5192                              in its own right.  */
5193                         if (peek == lwm && CPP_OPTION (pfile, uliterals))
5194                           {
5195                             if  (!maybe_number_start && !want_number)
5196                               /* Must be a literal type.  */
5197                               raw = false;
5198                           }
5199                         else if (p == '\''
5200                                  && CPP_OPTION (pfile, digit_separators))
5201                           maybe_number_start = true;
5202                         break;
5203                       }
5204                     else if (c == '\'')
5205                       break;
5206                     else if (!quote_first && !quote_eight)
5207                       break;
5208                   }
5209
5210                 if (maybe_number_start)
5211                   {
5212                     if (c == '\'')
5213                       /* A CPP NUMBER.  */
5214                       goto dflt;
5215                     raw = false;
5216                   }
5217
5218                 goto delimited_string;
5219               }
5220
5221             delimited_string:
5222               {
5223                 /* (Possibly raw) string or char literal.  */
5224                 unsigned char end = c;
5225                 int delim_len = -1;
5226                 const unsigned char *delim = NULL;
5227                 location_t sloc = linemap_position_for_column (pfile->line_table,
5228                                                                pos - line_start);
5229                 int esc = 0;
5230
5231                 if (raw)
5232                   {
5233                     /* There can be no line breaks in the delimiter.  */
5234                     delim = pos;
5235                     for (delim_len = 0; (c = *pos++) != '('; delim_len++)
5236                       {
5237                         if (delim_len == 16)
5238                           {
5239                             cpp_error_with_line (pfile, CPP_DL_ERROR,
5240                                                  sloc, 0,
5241                                                  "raw string delimiter"
5242                                                  " longer than %d"
5243                                                  " characters",
5244                                                  delim_len);
5245                             raw = false;
5246                             pos = delim;
5247                             break;
5248                           }
5249                         if (strchr (") \\\t\v\f\n", c))
5250                           {
5251                             cpp_error_with_line (pfile, CPP_DL_ERROR,
5252                                                  sloc, 0,
5253                                                  "invalid character '%c'"
5254                                                  " in raw string"
5255                                                  " delimiter", c);
5256                             raw = false;
5257                             pos = delim;
5258                             break;
5259                           }
5260                         if (pos >= limit)
5261                           goto bad_string;
5262                       }
5263                   }
5264
5265                 while (pos < limit)
5266                   {
5267                     char c = *pos++;
5268                     switch (c)
5269                       {
5270                       case '\\':
5271                         if (!raw)
5272                           esc++;
5273                         break;
5274
5275                       case '\r':
5276                         if (*pos == '\n')
5277                           pos++;
5278                         /* FALLTHROUGH  */
5279
5280                       case '\n':
5281                         {
5282                           CPP_INCREMENT_LINE (pfile, 0);
5283                           line_count++;
5284                           line_start = pos;
5285                         }
5286                         if (esc)
5287                           esc--;
5288                         break;
5289
5290                       case ')':
5291                         if (raw
5292                             && pos + delim_len + 1 < limit
5293                             && pos[delim_len] == end
5294                             && !memcmp (delim, pos, delim_len))
5295                           {
5296                             pos += delim_len + 1;
5297                             raw = false;
5298                             goto done_string;
5299                           }
5300                         break;
5301
5302                       default:
5303                         if (!raw && !(esc & 1) && c == end)
5304                           goto done_string;
5305                         esc = 0;
5306                         break;
5307                       }
5308                   }
5309               bad_string:
5310                 cpp_error_with_line (pfile, CPP_DL_ERROR, sloc, 0,
5311                                      "unterminated literal");
5312
5313               done_string:
5314                 raw = false;
5315                 lwm = pos - 1;
5316               }
5317               goto dflt;
5318
5319             case '_':
5320             case 'e':
5321             case 'i':
5322             case 'm':
5323               if (bol && module_p && !pfile->state.skipping
5324                   && do_peek_module (pfile, c, pos, limit))
5325                 {
5326                   /* We've seen the start of a module control line.
5327                      Start up the tokenizer.  */
5328                   pos--; /* Backup over the first character.  */
5329
5330                   /* Backup over whitespace to start of line.  */
5331                   while (pos > line_start
5332                          && (pos[-1] == ' ' || pos[-1] == '\t'))
5333                     pos--;
5334
5335                   if (pos > base)
5336                     cb (pfile, CPP_DO_print, data, line_count, base, pos - base);
5337
5338                   /* Prep things for directive handling. */
5339                   buffer->next_line = pos;
5340                   buffer->need_line = true;
5341
5342                   /* Now get tokens until the PRAGMA_EOL.  */
5343                   do
5344                     {
5345                       location_t spelling;
5346                       const cpp_token *tok
5347                         = cpp_get_token_with_location (pfile, &spelling);
5348
5349                       gcc_assert (pfile->state.in_deferred_pragma
5350                                   || tok->type == CPP_PRAGMA_EOL);
5351                       cb (pfile, CPP_DO_token, data, tok, spelling);
5352                     }
5353                   while (pfile->state.in_deferred_pragma);
5354
5355                   if (pfile->buffer->next_line < pfile->buffer->rlimit)
5356                     cb (pfile, CPP_DO_location, data,
5357                         pfile->line_table->highest_line);
5358
5359                   pfile->mi_valid = false;
5360                   goto restart;
5361                 }
5362               goto dflt;
5363
5364             default:
5365             dflt:
5366               bol = false;
5367               pfile->mi_valid = false;
5368               break;
5369             }
5370         }
5371
5372       if (buffer->rlimit > base && !pfile->state.skipping)
5373         {
5374           const unsigned char *limit = buffer->rlimit;
5375           /* If the file was not newline terminated, add rlimit, which is
5376              guaranteed to point to a newline, to the end of our range.  */
5377           if (limit[-1] != '\n')
5378             {
5379               limit++;
5380               CPP_INCREMENT_LINE (pfile, 0);
5381               line_count++;
5382             }
5383           cb (pfile, CPP_DO_print, data, line_count, base, limit - base);
5384         }
5385
5386       _cpp_pop_buffer (pfile);
5387     }
5388   while (pfile->buffer);
5389 }