libcpp/lex.c

   1 /* CPP Library - lexical analysis.
   2    Copyright (C) 2000-2021 Free Software Foundation, Inc.
   3    Contributed by Per Bothner, 1994-95.
   4    Based on CCCP program by Paul Rubin, June 1986
   5    Adapted to ANSI C, Richard Stallman, Jan 1987
   6    Broken out to separate file, Zack Weinberg, Mar 2000
   7
   8 This program is free software; you can redistribute it and/or modify it
   9 under the terms of the GNU General Public License as published by the
  10 Free Software Foundation; either version 3, or (at your option) any
  11 later version.
  12
  13 This program is distributed in the hope that it will be useful,
  14 but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16 GNU General Public License for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with this program; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "cpplib.h"
  25 #include "internal.h"
  26 /* How the spelling of a token type is recorded in token_spellings.  */
  27 enum spell_type
  28 {
  29   SPELL_OPERATOR = 0,  /* Used by OP() rows: NAME holds the operator text.  */
  30   SPELL_IDENT,         /* This and the following categories are selected  */
  31   SPELL_LITERAL,       /* by TK() rows through SPELL_ ## s; NAME then     */
  32   SPELL_NONE           /* holds the stringified enumerator name.          */
  33 };
  34 /* One row of the token_spellings table, indexed by token type.  */
  35 struct token_spelling
  36 {
  37   enum spell_type category;   /* Which kind of spelling applies.  */
  38   const unsigned char *name;  /* Operator text (OP) or enumerator name (TK).  */
  39 };
  40 /* Spellings of the six digraph forms, as unsigned character strings.  */
  41 static const unsigned char *const digraph_spellings[] =
  42 { UC"%:", UC"%:%:", UC"<:", UC":>", UC"<%", UC"%>" };
  43 /* Expand TTYPE_TABLE into one token_spelling row per token type.  */
  44 #define OP(e, s) { SPELL_OPERATOR, UC s  },
  45 #define TK(e, s) { SPELL_ ## s,    UC #e },
  46 static const struct token_spelling token_spellings[N_TTYPES] = { TTYPE_TABLE };
  47 #undef OP
  48 #undef TK
  49 /* Look up the spelling category / name for TOKEN by its type field.  */
  50 #define TOKEN_SPELL(token) (token_spellings[(token)->type].category)
  51 #define TOKEN_NAME(token) (token_spellings[(token)->type].name)
  52 /* Forward declarations of this file's local (static) helpers.  */
  53 static void add_line_note (cpp_buffer *, const uchar *, unsigned int);
  54 static int skip_line_comment (cpp_reader *);
  55 static void skip_whitespace (cpp_reader *, cppchar_t);
  56 static void lex_string (cpp_reader *, cpp_token *, const uchar *);
  57 static void save_comment (cpp_reader *, cpp_token *, const uchar *, cppchar_t);
  58 static void store_comment (cpp_reader *, cpp_token *);
  59 static void create_literal (cpp_reader *, cpp_token *, const uchar *,
  60                             unsigned int, enum cpp_ttype);
  61 static bool warn_in_comment (cpp_reader *, _cpp_line_note *);
  62 static int name_p (cpp_reader *, const cpp_string *);
  63 static tokenrun *next_tokenrun (tokenrun *);
  64
  65 static _cpp_buff *new_buff (size_t);
  66
  67
  68 /* Identifier comparison helper.
  69
  70    Return 1 when TOKEN is a CPP_NAME whose identifier spelling equals the
  71    NUL-terminated STRING, and 0 otherwise (including any non-name TOKEN).  */
  72 int
  73 cpp_ideq (const cpp_token *token, const char *string)
  74 {
  75   const uchar *wanted = (const uchar *) string;
  76
  77   return (token->type == CPP_NAME
  78           && ustrcmp (NODE_NAME (token->val.node.node), wanted) == 0);
  79 }
  80
  81 /* Append a note of kind TYPE, located at POS, to BUFFER's note array
  82    for the logical line being cleaned, growing the array when full.  */
  83 static void
  84 add_line_note (cpp_buffer *buffer, const uchar *pos, unsigned int type)
  85 {
  86   _cpp_line_note *slot;
  87   if (buffer->notes_cap == buffer->notes_used)
  88     {
  89       buffer->notes_cap = 2 * buffer->notes_cap + 200;
  90       buffer->notes = XRESIZEVEC (_cpp_line_note, buffer->notes,
  91                                   buffer->notes_cap);
  92     }
  93   slot = &buffer->notes[buffer->notes_used++];
  94   slot->pos = pos;
  95   slot->type = type;
  96 }
  97
  98 \f
  99 /* Fast path to find line special characters using optimized character
 100    scanning algorithms.  Anything complicated falls back to the slow
 101    path below.  Since this loop is very hot it's worth doing these kinds
 102    of optimizations.
 103
 104    One of the paths through the ifdefs should provide
 105
 106      const uchar *search_line_fast (const uchar *s, const uchar *end);
 107
 108    Between S and END, search for \n, \r, \\, ?.  Return a pointer to
 109    the found character.
 110
 111    Note that the last character of the buffer is *always* a newline,
 112    as forced by _cpp_convert_input.  This fact can be used to avoid
 113    explicitly looking for the end of the buffer.  */
 114
 115 /* Configure gives us an ifdef test; default it to 0 so plain `if' works.  */
 116 #ifndef WORDS_BIGENDIAN
 117 #define WORDS_BIGENDIAN 0
 118 #endif
 119
 120 /* We'd like the largest integer that fits into a register.  There's nothing
 121    in <stdint.h> that gives us that.  For most hosts this is unsigned long,
 122    but MS decided on an LLP64 model.  Thankfully when building with GCC we
 123    can get the "real" word size.  */
 124 #ifdef __GNUC__
 125 typedef unsigned int word_type __attribute__((__mode__(__word__)));
 126 #else
 127 typedef unsigned long word_type;
 128 #endif
 129
 130 /* The code below is only expecting sizes 4 or 8.  The array size below is 1
 131    for those sizes and -1 otherwise, so we die at compile-time if violated.  */
 132 typedef char check_word_type_size
 133   [(sizeof(word_type) == 8 || sizeof(word_type) == 4) * 2 - 1];
 134
 135 /* Return VAL with its first N bytes (in memory order) cleared to NUL,
 136    which is never one of the characters the scanner searches for.  */
 137
 138 static inline word_type
 139 acc_char_mask_misalign (word_type val, unsigned int n)
 140 {
 141   const unsigned int nbits = n * 8;
 142   const word_type all_ones = -1;
 143
 144   /* The first bytes in memory are the most significant ones on a
 145      big-endian host and the least significant ones otherwise.  */
 146   return val & (WORDS_BIGENDIAN ? all_ones >> nbits : all_ones << nbits);
 147 }
 148
 149 /* Return X replicated to all byte positions within WORD_TYPE.  The work is
 150    done in word_type: shifting the int-promoted X overflows for X >= 0x80.  */
 151 static inline word_type
 152 acc_char_replicate (uchar x)
 153 {
 154   word_type ret = x;
 155   ret |= ret << 8;
 156   ret |= ret << 16;
 157   if (sizeof(word_type) == 8)
 158     ret |= ret << 16 << 16;
 159   return ret;
 160 }
 161
 162 /* Return non-zero when some byte of VAL (probably) equals the byte that
 163    is replicated throughout C.  The generic path may give false positives.  */
 164 static inline word_type
 165 acc_char_cmp (word_type val, word_type c)
 166 {
 167   const word_type diff = val ^ c;  /* Zero byte wherever VAL matches C.  */
 168 #if defined(__GNUC__) && defined(__alpha__)
 169   /* The compare-bytes instruction gives exact per-byte results:
 170      (diff == 0) is obtained bytewise as (0 >= diff).  */
 171   return __builtin_alpha_cmpbge (0, diff);
 172 #else
 173   /* Build 0x7efefeff, or 0x7efefefefefefeff for 64-bit words.  */
 174   word_type holes = 0x7efefefeU;
 175   if (sizeof(word_type) == 8)
 176     holes = (holes << 16 << 16) | 0xfefefefeU;
 177   holes |= 1;
 178   return ((diff + holes) ^ ~diff) & ~holes;
 179 #endif
 180 }
 181
 182 /* CMP is a non-zero result of acc_char_cmp for the word VAL.  Return the
 183    memory-order index of the first byte of VAL that really is one of the
 184    searched-for characters, or -1 if CMP was a false positive.  */
 185 static inline int
 186 acc_char_index (word_type cmp ATTRIBUTE_UNUSED,
 187                 word_type val ATTRIBUTE_UNUSED)
 188 {
 189 #if defined(__GNUC__) && defined(__alpha__) && !WORDS_BIGENDIAN
 190   /* cmpbge yields one exact result *bit* per byte, so the lowest set
 191      bit directly names the first matching byte.  */
 192   return __builtin_ctzl (cmp);
 193 #else
 194   unsigned int idx = 0;
 195
 196   /* ??? It would be nice to force unrolling here,
 197      and have all of these constants folded.  */
 198   while (idx < sizeof(word_type))
 199     {
 200       /* Byte IDX in memory order: counted from the most significant end
 201          on big-endian hosts, from the least significant end otherwise.  */
 202       unsigned int pos = WORDS_BIGENDIAN ? sizeof(word_type) - idx - 1 : idx;
 203       uchar byte = (val >> pos * 8) & 0xff;
 204
 205       if (byte == '\n' || byte == '\r' || byte == '\\' || byte == '?')
 206         return idx;
 207       ++idx;
 208     }
 209
 210   return -1;
 211 #endif
 212 }
 213
 214 /* Portable fast scanner: bit fiddling tests a whole word_type at a time.
 215
 216    For 32-bit words, one would normally perform 16 comparisons and
 217    16 branches.  With this algorithm one performs 24 arithmetic
 218    operations and one branch.  Whether this is faster with a 32-bit
 219    word size is going to be somewhat system dependent.
 220
 221    For 64-bit words, we eliminate twice the number of comparisons
 222    and branches without increasing the number of arithmetic operations.
 223    It's almost certainly going to be a win with 64-bit word size.  */
 224
 225 static const uchar * search_line_acc_char (const uchar *, const uchar *)
 226   ATTRIBUTE_UNUSED;
 227
 228 static const uchar *
 229 search_line_acc_char (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 230 {
 231   const word_type nl_word = acc_char_replicate ('\n');
 232   const word_type cr_word = acc_char_replicate ('\r');
 233   const word_type bs_word = acc_char_replicate ('\\');
 234   const word_type qm_word = acc_char_replicate ('?');
 235
 236   /* Round S down to a word boundary, remembering how many bytes of
 237      that first word lie before S.  */
 238   const unsigned int lead = (uintptr_t)s & (sizeof(word_type) - 1);
 239   const word_type *wp = (word_type *)((uintptr_t)s & -sizeof(word_type));
 240   word_type chunk = *wp;
 241
 242   /* Bytes from before the beginning must never produce a match.  */
 243   if (lead)
 244     chunk = acc_char_mask_misalign (chunk, lead);
 245
 246   /* END is never consulted; the loop exits only on a real match.  */
 247   for (;; chunk = *++wp)
 248     {
 249       word_type hits;
 250
 251       hits  = acc_char_cmp (chunk, nl_word);
 252       hits |= acc_char_cmp (chunk, cr_word);
 253       hits |= acc_char_cmp (chunk, bs_word);
 254       hits |= acc_char_cmp (chunk, qm_word);
 255
 256       if (__builtin_expect (hits != 0, 0))
 257         {
 258           /* A negative index means HITS was a false positive.  */
 259           const int idx = acc_char_index (hits, chunk);
 260           if (idx >= 0)
 261             return (const uchar *)wp + idx;
 262         }
 263     }
 264 }
 265
 266 /* Disable on Solaris 2/x86 until the following problem can be properly
 267    autoconfed:
 268
 269    The Solaris 10+ assembler tags objects with the instruction set
 270    extensions used, so SSE4.2 executables cannot run on machines that
 271    don't support that extension.  */
 272
 273 #if (GCC_VERSION >= 4005) && (__GNUC__ >= 5 || !defined(__PIC__)) && (defined(__i386__) || defined(__x86_64__)) && !(defined(__sun__) && defined(__svr4__))
 274
 275 /* Replicated character data to be shared between implementations.
 276    Recall that outside of a context with vector support we can't
 277    define compatible vector types, therefore these are all defined
 278    in terms of raw characters.  Rows: '\n', '\r', '\\', '?'.  */
 279 static const char repl_chars[4][16] __attribute__((aligned(16))) = {
 280   { '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
 281     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n' },
 282   { '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
 283     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r' },
 284   { '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
 285     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\' },
 286   { '?', '?', '?', '?', '?', '?', '?', '?',
 287     '?', '?', '?', '?', '?', '?', '?', '?' },
 288 };
 289
 290 /* A version of the fast scanner using MMX vectorized byte compare insns.
 291
 292    This uses the PMOVMSKB instruction which was introduced with "MMX2",
 293    which was packaged into SSE1; it is also present in the AMD MMX
 294    extension.  Mark the function as using "sse" so that we emit a real
 295    "emms" instruction, rather than the 3dNOW "femms" instruction.
 296    Returns a pointer to the first \n, \r, \\ or ? at or after S.  */
 297 static const uchar *
 298 #ifndef __SSE__
 299 __attribute__((__target__("sse")))
 300 #endif
 301 search_line_mmx (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 302 {
 303   typedef char v8qi __attribute__ ((__vector_size__ (8)));
 304   typedef int __m64 __attribute__ ((__vector_size__ (8), __may_alias__));
 305
 306   const v8qi repl_nl = *(const v8qi *)repl_chars[0];
 307   const v8qi repl_cr = *(const v8qi *)repl_chars[1];
 308   const v8qi repl_bs = *(const v8qi *)repl_chars[2];
 309   const v8qi repl_qm = *(const v8qi *)repl_chars[3];
 310
 311   unsigned int misalign, found, mask;
 312   const v8qi *p;
 313   v8qi data, t, c;
 314
 315   /* Align the source pointer.  While MMX doesn't generate unaligned data
 316      faults, this allows us to safely scan to the end of the buffer without
 317      reading beyond the end of the last page.  */
 318   misalign = (uintptr_t)s & 7;
 319   p = (const v8qi *)((uintptr_t)s & -8);
 320   data = *p;
 321
 322   /* Create a mask for the bytes that are valid within the first
 323      8-byte block.  The idea here is that the AND with the mask
 324      within the loop is "free", since we need some AND or TEST
 325      insn in order to set the flags for the branch anyway.  */
 326   mask = -1u << misalign;
 327
 328   /* Main loop processing 8 bytes at a time.  */
 329   goto start;  /* The first block is already loaded in DATA.  */
 330   do
 331     {
 332       data = *++p;
 333       mask = -1;  /* Every byte of a later block is valid.  */
 334
 335     start:
 336       t = __builtin_ia32_pcmpeqb(data, repl_nl);
 337       c = __builtin_ia32_pcmpeqb(data, repl_cr);
 338       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
 339       c = __builtin_ia32_pcmpeqb(data, repl_bs);
 340       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
 341       c = __builtin_ia32_pcmpeqb(data, repl_qm);
 342       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
 343       found = __builtin_ia32_pmovmskb (t);
 344       found &= mask;  /* Drop matches in bytes that precede S.  */
 345     }
 346   while (!found);
 347
 348   __builtin_ia32_emms ();
 349
 350   /* FOUND contains 1 in bits for which we matched a relevant
 351      character.  Conversion to the byte index is trivial.  */
 352   found = __builtin_ctz(found);
 353   return (const uchar *)p + found;
 354 }
 355
 356 /* A version of the fast scanner using SSE2 vectorized byte compare insns.
 357    Returns a pointer to the first \n, \r, \\ or ? at or after S.  */
 358 static const uchar *
 359 #ifndef __SSE2__
 360 __attribute__((__target__("sse2")))
 361 #endif
 362 search_line_sse2 (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 363 {
 364   typedef char v16qi __attribute__ ((__vector_size__ (16)));
 365
 366   const v16qi repl_nl = *(const v16qi *)repl_chars[0];
 367   const v16qi repl_cr = *(const v16qi *)repl_chars[1];
 368   const v16qi repl_bs = *(const v16qi *)repl_chars[2];
 369   const v16qi repl_qm = *(const v16qi *)repl_chars[3];
 370
 371   unsigned int misalign, found, mask;
 372   const v16qi *p;
 373   v16qi data, t;
 374
 375   /* Align the source pointer.  */
 376   misalign = (uintptr_t)s & 15;
 377   p = (const v16qi *)((uintptr_t)s & -16);
 378   data = *p;
 379
 380   /* Create a mask for the bytes that are valid within the first
 381      16-byte block.  The idea here is that the AND with the mask
 382      within the loop is "free", since we need some AND or TEST
 383      insn in order to set the flags for the branch anyway.  */
 384   mask = -1u << misalign;
 385
 386   /* Main loop processing 16 bytes at a time.  */
 387   goto start;  /* The first block is already loaded in DATA.  */
 388   do
 389     {
 390       data = *++p;
 391       mask = -1;  /* Every byte of a later block is valid.  */
 392
 393     start:
 394       t  = data == repl_nl;
 395       t |= data == repl_cr;
 396       t |= data == repl_bs;
 397       t |= data == repl_qm;
 398       found = __builtin_ia32_pmovmskb128 (t);
 399       found &= mask;  /* Drop matches in bytes that precede S.  */
 400     }
 401   while (!found);
 402
 403   /* FOUND contains 1 in bits for which we matched a relevant
 404      character.  Conversion to the byte index is trivial.  */
 405   found = __builtin_ctz(found);
 406   return (const uchar *)p + found;
 407 }
 408
 409 #ifdef HAVE_SSE4
 410 /* A version of the fast scanner using SSE 4.2 vectorized string insns.
 411    Returns a pointer to the first \n, \r, ? or \\ at or after S.  */
 412 static const uchar *
 413 #ifndef __SSE4_2__
 414 __attribute__((__target__("sse4.2")))
 415 #endif
 416 search_line_sse42 (const uchar *s, const uchar *end)
 417 {
 418   typedef char v16qi __attribute__ ((__vector_size__ (16)));
 419   static const v16qi search = { '\n', '\r', '?', '\\' };  /* 4 needles.  */
 420
 421   uintptr_t si = (uintptr_t)s;
 422   uintptr_t index;
 423
 424   /* Check for unaligned input.  */
 425   if (si & 15)
 426     {
 427       v16qi sv;
 428
 429       if (__builtin_expect (end - s < 16, 0)
 430           && __builtin_expect ((si & 0xfff) > 0xff0, 0))
 431         {
 432           /* There are fewer than 16 bytes left in the buffer, and fewer
 433              than 16 bytes left on the page.  Reading 16 bytes at this
 434              point might generate a spurious page fault.  Defer to the
 435              SSE2 implementation, which already handles alignment.  */
 436           return search_line_sse2 (s, end);
 437         }
 438
 439       /* ??? The builtin doesn't understand that the PCMPESTRI read from
 440          memory need not be aligned.  */
 441       sv = __builtin_ia32_loaddqu ((const char *) s);
 442       index = __builtin_ia32_pcmpestri128 (search, 4, sv, 16, 0);
 443
 444       if (__builtin_expect (index < 16, 0))  /* Match in the first 16 bytes.  */
 445         goto found;
 446
 447       /* Advance the pointer to an aligned address.  We will re-scan a
 448          few bytes, but we no longer need care for reading past the
 449          end of a page, since we're guaranteed a match.  */
 450       s = (const uchar *)((si + 15) & -16);
 451     }
 452
 453   /* Main loop, processing 16 bytes at a time.  */
 454 #ifdef __GCC_ASM_FLAG_OUTPUTS__
 455   while (1)
 456     {
 457       char f;
 458
 459       /* By using inline assembly instead of the builtin,
 460          we can use the result, as well as the flags set.  */
 461       __asm ("%vpcmpestri\t$0, %2, %3"
 462              : "=c"(index), "=@ccc"(f)
 463              : "m"(*s), "x"(search), "a"(4), "d"(16));
 464       if (f)  /* Carry flag output: leave the loop once it is set.  */
 465         break;
 466
 467       s += 16;
 468     }
 469 #else
 470   s -= 16;  /* Compensate for the add at the top of the asm loop.  */
 471   /* By doing the whole loop in inline assembly,
 472      we can make proper use of the flags set.  */
 473   __asm (      ".balign 16\n"
 474         "0:     add $16, %1\n"
 475         "       %vpcmpestri\t$0, (%1), %2\n"
 476         "       jnc 0b"
 477         : "=&c"(index), "+r"(s)
 478         : "x"(search), "a"(4), "d"(16));
 479 #endif
 480
 481  found:
 482   return s + index;
 483 }
 484
 485 #else
 486 /* Work around out-dated assemblers without sse4 support.  */
 487 #define search_line_sse42 search_line_sse2
 488 #endif
 489
 490 /* Check the CPU capabilities.  */
 491
 492 #include "../gcc/config/i386/cpuid.h"
 493
 494 typedef const uchar * (*search_line_fast_type) (const uchar *, const uchar *);
 495 static search_line_fast_type search_line_fast;
 496 /* Point search_line_fast at the best scanner this host can run.  */
 497 #define HAVE_init_vectorized_lexer 1
 498 static inline void
 499 init_vectorized_lexer (void)
 500 {
 501   /* ISA level assumed at compile time: 0 none, 1 SSE, 2 SSE2, 3 SSE4.2.  */
 502 #if defined(__SSE4_2__)
 503   const int baseline = 3;
 504 #elif defined(__SSE2__)
 505   const int baseline = 2;
 506 #elif defined(__SSE__)
 507   const int baseline = 1;
 508 #else
 509   const int baseline = 0;
 510 #endif
 511   unsigned scratch, ecx = 0, edx = 0;
 512   search_line_fast_type chosen = search_line_acc_char;
 513
 514   if (baseline == 3)
 515     chosen = search_line_sse42;
 516   else if (__get_cpuid (1, &scratch, &scratch, &ecx, &edx) || baseline == 2)
 517     {
 518       if (baseline == 3 || (ecx & bit_SSE4_2))
 519         chosen = search_line_sse42;
 520       else if (baseline == 2 || (edx & bit_SSE2))
 521         chosen = search_line_sse2;
 522       else if (baseline == 1 || (edx & bit_SSE))
 523         chosen = search_line_mmx;
 524     }
 525   else if (__get_cpuid (0x80000001, &scratch, &scratch, &scratch, &edx))
 526     {
 527       if (baseline == 1
 528           || (edx & (bit_MMXEXT | bit_CMOV)) == (bit_MMXEXT | bit_CMOV))
 529         chosen = search_line_mmx;
 530     }
 531   search_line_fast = chosen;
 532 }
 533
 534 #elif (GCC_VERSION >= 4005) && defined(_ARCH_PWR8) && defined(__ALTIVEC__)
 535
 536 /* A version of the fast scanner using AltiVec vectorized byte compares
 537    and VSX unaligned loads (when VSX is available).  This is otherwise
 538    the same as the AltiVec version.  */
 539
 540 ATTRIBUTE_NO_SANITIZE_UNDEFINED
 541 static const uchar *
 542 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 543 {
 544   typedef __attribute__((altivec(vector))) unsigned char vc;
 545
 546   const vc repl_nl = {
 547     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
 548     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
 549   };
 550   const vc repl_cr = {
 551     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
 552     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
 553   };
 554   const vc repl_bs = {
 555     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
 556     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
 557   };
 558   const vc repl_qm = {
 559     '?', '?', '?', '?', '?', '?', '?', '?',
 560     '?', '?', '?', '?', '?', '?', '?', '?',
 561   };
 562   const vc zero = { 0 };
 563
 564   vc data, t;
 565
 566   /* Main loop processing 16 bytes at a time.  */
 567   do
 568     {
 569       vc m_nl, m_cr, m_bs, m_qm;
 570
 571       data = __builtin_vec_vsx_ld (0, s);  /* Loads from S itself, unaligned.  */
 572       s += 16;
 573
 574       m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
 575       m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
 576       m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
 577       m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
 578       t = (m_nl | m_cr) | (m_bs | m_qm);
 579
 580       /* T now contains 0xff in bytes for which we matched one of the relevant
 581          characters.  We want to exit the loop if any byte in T is non-zero.
 582          Below is the expansion of vec_any_ne(t, zero).  */
 583     }
 584   while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));
 585
 586   /* Restore s to point to the 16 bytes we just processed.  */
 587   s -= 16;
 588
 589   {
 590 #define N  (sizeof(vc) / sizeof(long))
 591
 592     union {
 593       vc v;
 594       /* Statically assert that N is 2 or 4.  */
 595       unsigned long l[(N == 2 || N == 4) ? N : -1];
 596     } u;
 597     unsigned long l, i = 0;
 598
 599     u.v = t;
 600
 601     /* Find the first word of T that is non-zero.  */
 602     switch (N)
 603       {
 604       case 4:
 605         l = u.l[i++];
 606         if (l != 0)
 607           break;
 608         s += sizeof(unsigned long);
 609         l = u.l[i++];
 610         if (l != 0)
 611           break;
 612         s += sizeof(unsigned long);
 613         /* FALLTHRU */
 614       case 2:
 615         l = u.l[i++];
 616         if (l != 0)
 617           break;
 618         s += sizeof(unsigned long);
 619         l = u.l[i];  /* Last word: non-zero, since the loop saw a match.  */
 620       }
 621
 622     /* L now contains 0xff in bytes for which we matched one of the
 623        relevant characters.  We can find the byte index by finding
 624        its bit index and dividing by 8.  */
 625 #ifdef __BIG_ENDIAN__
 626     l = __builtin_clzl(l) >> 3;
 627 #else
 628     l = __builtin_ctzl(l) >> 3;
 629 #endif
 630     return s + l;
 631
 632 #undef N
 633   }
 634 }
 635
 636 #elif (GCC_VERSION >= 4005) && defined(__ALTIVEC__) && defined (__BIG_ENDIAN__)
 637
 638 /* A version of the fast scanner using AltiVec vectorized byte compares.
 639    This cannot be used for little endian because vec_lvsl/lvsr are
 640    deprecated for little endian and the code won't work properly.  */
 641 /* ??? Unfortunately, attribute(target("altivec")) is not yet supported,
 642    so we can't compile this function without -maltivec on the command line
 643    (or implied by some other switch).  */
 644
 645 static const uchar *
 646 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 647 {
 648   typedef __attribute__((altivec(vector))) unsigned char vc;
 649
 650   const vc repl_nl = {
 651     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
 652     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
 653   };
 654   const vc repl_cr = {
 655     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
 656     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
 657   };
 658   const vc repl_bs = {
 659     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
 660     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
 661   };
 662   const vc repl_qm = {
 663     '?', '?', '?', '?', '?', '?', '?', '?',
 664     '?', '?', '?', '?', '?', '?', '?', '?',
 665   };
 666   const vc ones = {
 667     -1, -1, -1, -1, -1, -1, -1, -1,
 668     -1, -1, -1, -1, -1, -1, -1, -1,
 669   };
 670   const vc zero = { 0 };
 671
 672   vc data, mask, t;
 673
 674   /* Altivec loads automatically mask addresses with -16.  This lets us
 675      issue the first load as early as possible.  */
 676   data = __builtin_vec_ld(0, (const vc *)s);
 677
 678   /* Discard bytes before the beginning of the buffer.  Do this by
 679      beginning with all ones and shifting in zeros according to the
 680      mis-alignment.  The LVSR instruction pulls the exact shift we
 681      want from the address.  */
 682   mask = __builtin_vec_lvsr(0, s);
 683   mask = __builtin_vec_perm(zero, ones, mask);
 684   data &= mask;
 685
 686   /* While altivec loads mask addresses, we still need to align S so
 687      that the offset we compute at the end is correct.  */
 688   s = (const uchar *)((uintptr_t)s & -16);
 689
 690   /* Main loop processing 16 bytes at a time.  */
 691   goto start;  /* The first block is already loaded and masked in DATA.  */
 692   do
 693     {
 694       vc m_nl, m_cr, m_bs, m_qm;
 695
 696       s += 16;
 697       data = __builtin_vec_ld(0, (const vc *)s);
 698
 699     start:
 700       m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
 701       m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
 702       m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
 703       m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
 704       t = (m_nl | m_cr) | (m_bs | m_qm);
 705
 706       /* T now contains 0xff in bytes for which we matched one of the relevant
 707          characters.  We want to exit the loop if any byte in T is non-zero.
 708          Below is the expansion of vec_any_ne(t, zero).  */
 709     }
 710   while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));
 711
 712   {
 713 #define N  (sizeof(vc) / sizeof(long))
 714
 715     union {
 716       vc v;
 717       /* Statically assert that N is 2 or 4.  */
 718       unsigned long l[(N == 2 || N == 4) ? N : -1];
 719     } u;
 720     unsigned long l, i = 0;
 721
 722     u.v = t;
 723
 724     /* Find the first word of T that is non-zero.  */
 725     switch (N)
 726       {
 727       case 4:
 728         l = u.l[i++];
 729         if (l != 0)
 730           break;
 731         s += sizeof(unsigned long);
 732         l = u.l[i++];
 733         if (l != 0)
 734           break;
 735         s += sizeof(unsigned long);
 736         /* FALLTHROUGH */
 737       case 2:
 738         l = u.l[i++];
 739         if (l != 0)
 740           break;
 741         s += sizeof(unsigned long);
 742         l = u.l[i];  /* Last word: non-zero, since the loop saw a match.  */
 743       }
 744
 745     /* L now contains 0xff in bytes for which we matched one of the
 746        relevant characters.  We can find the byte index by finding
 747        its bit index and dividing by 8.  This variant is big-endian
 748        only, so the first byte in memory is the most significant.  */
 749     l = __builtin_clzl(l) >> 3;
 750     return s + l;
 751
 752 #undef N
 753   }
 754 }
 754
 755 #elif defined (__ARM_NEON) && defined (__ARM_64BIT_STATE)
 756 #include "arm_neon.h"
 757
 758 /* This doesn't have to be the exact page size, but no system may use
 759    a size smaller than this.  ARMv8 requires a minimum page size of
 760    4k.  The impact of being conservative here is a small number of
 761    cases will take the slightly slower entry path into the main
 762    loop.  */
 763
 764 #define AARCH64_MIN_PAGE_SIZE 4096
 765 /* A version of the fast scanner using AArch64 NEON byte compares.  */
 766 static const uchar *
 767 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 768 {
 769   const uint8x16_t repl_nl = vdupq_n_u8 ('\n');
 770   const uint8x16_t repl_cr = vdupq_n_u8 ('\r');
 771   const uint8x16_t repl_bs = vdupq_n_u8 ('\\');
 772   const uint8x16_t repl_qm = vdupq_n_u8 ('?');
 773   const uint8x16_t xmask = (uint8x16_t) vdupq_n_u64 (0x8040201008040201ULL);
 774
 775 #ifdef __ARM_BIG_ENDIAN
 776   const int16x8_t shift = {8, 8, 8, 8, 0, 0, 0, 0};
 777 #else
 778   const int16x8_t shift = {0, 0, 0, 0, 8, 8, 8, 8};
 779 #endif
 780
 781   unsigned int found;
 782   const uint8_t *p;
 783   uint8x16_t data;
 784   uint8x16_t t;
 785   uint16x8_t m;
 786   uint8x16_t u, v, w;
 787
 788   /* Align the source pointer.  */
 789   p = (const uint8_t *)((uintptr_t)s & -16);
 790
 791   /* Assuming random string start positions, with a 4k page size we'll take
 792      the slow path about 0.37% of the time.  */
 793   if (__builtin_expect ((AARCH64_MIN_PAGE_SIZE
 794                          - (((uintptr_t) s) & (AARCH64_MIN_PAGE_SIZE - 1)))
 795                         < 16, 0))
 796     {
 797       /* Slow path: the string starts near a possible page boundary.  */
 798       uint32_t misalign, mask;
 799
 800       misalign = (uintptr_t)s & 15;
 801       mask = (-1u << misalign) & 0xffff;  /* Bits for bytes at or after S.  */
 802       data = vld1q_u8 (p);
 803       t = vceqq_u8 (data, repl_nl);
 804       u = vceqq_u8 (data, repl_cr);
 805       v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
 806       w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
 807       t = vorrq_u8 (v, w);
 808       t = vandq_u8 (t, xmask);
 809       m = vpaddlq_u8 (t);
 810       m = vshlq_u16 (m, shift);
 811       found = vaddvq_u16 (m);
 812       found &= mask;
 813       if (found)
 814         return (const uchar*)p + __builtin_ctz (found);
 815     }
 816   else
 817     {
 818       data = vld1q_u8 ((const uint8_t *) s);  /* Unaligned load from S itself.  */
 819       t = vceqq_u8 (data, repl_nl);
 820       u = vceqq_u8 (data, repl_cr);
 821       v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
 822       w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
 823       t = vorrq_u8 (v, w);
 824       if (__builtin_expect (vpaddd_u64 ((uint64x2_t)t) != 0, 0))
 825         goto done;
 826     }
 827
 828   do
 829     {
 830       p += 16;
 831       data = vld1q_u8 (p);
 832       t = vceqq_u8 (data, repl_nl);
 833       u = vceqq_u8 (data, repl_cr);
 834       v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
 835       w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
 836       t = vorrq_u8 (v, w);
 837     } while (!vpaddd_u64 ((uint64x2_t)t));
 838
 839 done:
 840   /* Now that we've found the terminating substring, work out precisely where
 841      we need to stop.  */
 842   t = vandq_u8 (t, xmask);
 843   m = vpaddlq_u8 (t);
 844   m = vshlq_u16 (m, shift);
 845   found = vaddvq_u16 (m);
 846   return (((((uintptr_t) p) < (uintptr_t) s) ? s : (const uchar *)p)
 847           + __builtin_ctz (found));  /* P < S: hit was in the load from S.  */
 848 }
 849
 850 #elif defined (__ARM_NEON)
 851 #include "arm_neon.h"
 852 /* A version of the fast scanner using NEON outside the 64-bit state.  */
 853 static const uchar *
 854 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 855 {
 856   const uint8x16_t repl_nl = vdupq_n_u8 ('\n');
 857   const uint8x16_t repl_cr = vdupq_n_u8 ('\r');
 858   const uint8x16_t repl_bs = vdupq_n_u8 ('\\');
 859   const uint8x16_t repl_qm = vdupq_n_u8 ('?');
 860   const uint8x16_t xmask = (uint8x16_t) vdupq_n_u64 (0x8040201008040201ULL);
 861
 862   unsigned int misalign, found, mask;
 863   const uint8_t *p;
 864   uint8x16_t data;
 865
 866   /* Align the source pointer.  */
 867   misalign = (uintptr_t)s & 15;
 868   p = (const uint8_t *)((uintptr_t)s & -16);
 869   data = vld1q_u8 (p);
 870
 871   /* Create a mask for the bytes that are valid within the first
 872      16-byte block.  The idea here is that the AND with the mask
 873      within the loop is "free", since we need some AND or TEST
 874      insn in order to set the flags for the branch anyway.  */
 875   mask = (-1u << misalign) & 0xffff;
 876
 877   /* Main loop, processing 16 bytes at a time.  */
 878   goto start;  /* The first block is already loaded in DATA.  */
 879
 880   do
 881     {
 882       uint8x8_t l;
 883       uint16x4_t m;
 884       uint32x2_t n;
 885       uint8x16_t t, u, v, w;
 886
 887       p += 16;
 888       data = vld1q_u8 (p);
 889       mask = 0xffff;  /* Every byte of a later block is valid.  */
 890
 891     start:
 892       t = vceqq_u8 (data, repl_nl);
 893       u = vceqq_u8 (data, repl_cr);
 894       v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
 895       w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
 896       t = vandq_u8 (vorrq_u8 (v, w), xmask);
 897       l = vpadd_u8 (vget_low_u8 (t), vget_high_u8 (t));
 898       m = vpaddl_u8 (l);
 899       n = vpaddl_u16 (m);
 900
 901       found = vget_lane_u32 ((uint32x2_t) vorr_u64 ((uint64x1_t) n,
 902               vshr_n_u64 ((uint64x1_t) n, 24)), 0);
 903       found &= mask;  /* Drop matches in bytes that precede S.  */
 904     }
 905   while (!found);
 906
 907   /* FOUND contains 1 in bits for which we matched a relevant
 908      character.  Conversion to the byte index is trivial.  */
 909   found = __builtin_ctz (found);
 910   return (const uchar *)p + found;
 911 }
 912
 913 #else
 914
 915 /* We only have one accelerated alternative.  Use a direct call so that
 916    we encourage inlining.  */
 917
 918 #define search_line_fast  search_line_acc_char
 919
 920 #endif
 921
/* Initialize the lexer if needed.  When HAVE_init_vectorized_lexer is
   defined this calls init_vectorized_lexer; in every other
   configuration the function body is empty.  */

void
_cpp_init_lexer (void)
{
#ifdef HAVE_init_vectorized_lexer
  init_vectorized_lexer ();
#endif
}
 931
/* Returns with a logical line that contains no escaped newlines or
   trigraphs.  This is a time-critical inner loop.

   Scanning starts at buffer->next_line.  Before scanning, buffer->cur
   and buffer->line_base are reset to that position, the line-note
   list is emptied and need_line is cleared.  On return a '\n' has
   been stored at the end of the logical line, a sentinel note has
   been added just past it, and buffer->next_line points past the
   terminator that was consumed.

   If buffer->from_stage3 is set, only the line terminator is located.
   Otherwise backslash-newline sequences are removed by copying the
   following text down over them, and every trigraph gets a line note;
   trigraphs are replaced in place only when the trigraphs option is
   enabled.  */
void
_cpp_clean_line (cpp_reader *pfile)
{
  cpp_buffer *buffer;
  /* S is the read position; D is the write position (it only differs
     from S once the slow path has started compacting the line); P is
     a scratch pointer for scanning backwards over whitespace.  */
  const uchar *s;
  uchar c, *d, *p;

  buffer = pfile->buffer;
  /* Forget the notes recorded for the previous line.  */
  buffer->cur_note = buffer->notes_used = 0;
  buffer->cur = buffer->line_base = buffer->next_line;
  buffer->need_line = false;
  s = buffer->next_line;

  if (!buffer->from_stage3)
    {
      /* Most recent backslash seen on the fast path, if any.  */
      const uchar *pbackslash = NULL;

      /* Fast path.  This is the common case of an un-escaped line with
         no trigraphs.  The primary win here is by not writing any
         data back to memory until we have to.  */
      while (1)
        {
          /* Perform an optimized search for \n, \r, \\, ?.  */
          s = search_line_fast (s, buffer->rlimit);

          c = *s;
          if (c == '\\')
            {
              /* Record the location of the backslash and continue.  */
              pbackslash = s++;
            }
          else if (__builtin_expect (c == '?', 0))
            {
              if (__builtin_expect (s[1] == '?', false)
                   && _cpp_trigraph_map[s[2]])
                {
                  /* Have a trigraph.  We may or may not have to convert
                     it.  Add a line note regardless, for -Wtrigraphs.  */
                  add_line_note (buffer, s, s[2]);
                  if (CPP_OPTION (pfile, trigraphs))
                    {
                      /* We do, and that means we have to switch to the
                         slow path.  The first '?' is overwritten with
                         the replacement character and S is left on the
                         trigraph's last character, which the slow
                         path's pre-increment then steps over.  */
                      d = (uchar *) s;
                      *d = _cpp_trigraph_map[s[2]];
                      s += 2;
                      goto slow_path;
                    }
                }
              /* Not a trigraph.  Continue on fast-path.  */
              s++;
            }
          else
            break;
        }

      /* This must be \r or \n.  We're either done, or we'll be forced
         to write back to the buffer and continue on the slow path.  */
      d = (uchar *) s;

      /* The terminator is the byte at rlimit: nothing follows it, so
         there is no DOS pair or escape to look for.  */
      if (__builtin_expect (s == buffer->rlimit, false))
        goto done;

      /* DOS line ending? */
      if (__builtin_expect (c == '\r', false) && s[1] == '\n')
        {
          s++;
          if (s == buffer->rlimit)
            goto done;
        }

      /* No backslash on this line, so the newline cannot be escaped.  */
      if (__builtin_expect (pbackslash == NULL, true))
        goto done;

      /* Check for escaped newline.  Walk back from the terminator over
         any non-vertical whitespace; the newline is escaped only if
         the character before that run is the last backslash seen.  */
      p = d;
      while (is_nvspace (p[-1]))
        p--;
      if (p - 1 != pbackslash)
        goto done;

      /* Have an escaped newline; process it and proceed to
         the slow path.  The note type is ' ' when whitespace separated
         the backslash from the newline and '\\' otherwise.  */
      add_line_note (buffer, p - 1, p != d ? ' ' : '\\');
      /* D is pre-incremented before each store below, so starting it
         at P - 2 makes the next character land on the backslash.  */
      d = p - 2;
      /* From here on next_line marks where the backward whitespace
         scan in the slow path has to stop.  */
      buffer->next_line = p - 1;

    slow_path:
      /* Slow path: copy the rest of the logical line down to D one
         character at a time, handling further escaped newlines and
         trigraphs on the way.  */
      while (1)
        {
          c = *++s;
          *++d = c;

          if (c == '\n' || c == '\r')
            {
              /* Handle DOS line endings.  */
              if (c == '\r' && s != buffer->rlimit && s[1] == '\n')
                s++;
              if (s == buffer->rlimit)
                break;

              /* Escaped?  */
              p = d;
              while (p != buffer->next_line && is_nvspace (p[-1]))
                p--;
              if (p == buffer->next_line || p[-1] != '\\')
                break;

              /* Another escaped newline: note it and rewind D so the
                 following text overwrites the backslash.  */
              add_line_note (buffer, p - 1, p != d ? ' ': '\\');
              d = p - 2;
              buffer->next_line = p - 1;
            }
          else if (c == '?' && s[1] == '?' && _cpp_trigraph_map[s[2]])
            {
              /* Add a note regardless, for the benefit of -Wtrigraphs.  */
              add_line_note (buffer, d, s[2]);
              if (CPP_OPTION (pfile, trigraphs))
                {
                  /* Replace the '?' just copied and skip the remaining
                     two source characters of the trigraph.  */
                  *d = _cpp_trigraph_map[s[2]];
                  s += 2;
                }
            }
        }
    }
  else
    {
      /* from_stage3 is set: no splicing or trigraph handling, just
         find the line terminator.  */
      while (*s != '\n' && *s != '\r')
        s++;
      d = (uchar *) s;

      /* Handle DOS line endings.  */
      if (*s == '\r' && s + 1 != buffer->rlimit && s[1] == '\n')
        s++;
    }

 done:
  /* Terminate the cleaned line with a newline, whatever terminator
     (\r, \r\n or \n) was found in the input.  */
  *d = '\n';
  /* A sentinel note that should never be processed.  */
  add_line_note (buffer, d + 1, '\n');
  buffer->next_line = s + 1;
}
1075
1076 /* Return true if the trigraph indicated by NOTE should be warned
1077    about in a comment.  */
1078 static bool
1079 warn_in_comment (cpp_reader *pfile, _cpp_line_note *note)
1080 {
1081   const uchar *p;
1082
1083   /* Within comments we don't warn about trigraphs, unless the
1084      trigraph forms an escaped newline, as that may change
1085      behavior.  */
1086   if (note->type != '/')
1087     return false;
1088
1089   /* If -trigraphs, then this was an escaped newline iff the next note
1090      is coincident.  */
1091   if (CPP_OPTION (pfile, trigraphs))
1092     return note[1].pos == note->pos;
1093
1094   /* Otherwise, see if this forms an escaped newline.  */
1095   p = note->pos + 3;
1096   while (is_nvspace (*p))
1097     p++;
1098
1099   /* There might have been escaped newlines between the trigraph and the
1100      newline we found.  Hence the position test.  */
1101   return (*p == '\n' && p < note[1].pos);
1102 }
1103
1104 /* Process the notes created by add_line_note as far as the current
1105    location.  */
1106 void
1107 _cpp_process_line_notes (cpp_reader *pfile, int in_comment)
1108 {
1109   cpp_buffer *buffer = pfile->buffer;
1110
1111   for (;;)
1112     {
1113       _cpp_line_note *note = &buffer->notes[buffer->cur_note];
1114       unsigned int col;
1115
1116       if (note->pos > buffer->cur)
1117         break;
1118
1119       buffer->cur_note++;
1120       col = CPP_BUF_COLUMN (buffer, note->pos + 1);
1121
1122       if (note->type == '\\' || note->type == ' ')
1123         {
1124           if (note->type == ' ' && !in_comment)
1125             cpp_error_with_line (pfile, CPP_DL_WARNING, pfile->line_table->highest_line, col,
1126                                  "backslash and newline separated by space");
1127
1128           if (buffer->next_line > buffer->rlimit)
1129             {
1130               cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line, col,
1131                                    "backslash-newline at end of file");
1132               /* Prevent "no newline at end of file" warning.  */
1133               buffer->next_line = buffer->rlimit;
1134             }
1135
1136           buffer->line_base = note->pos;
1137           CPP_INCREMENT_LINE (pfile, 0);
1138         }
1139       else if (_cpp_trigraph_map[note->type])
1140         {
1141           if (CPP_OPTION (pfile, warn_trigraphs)
1142               && (!in_comment || warn_in_comment (pfile, note)))
1143             {
1144               if (CPP_OPTION (pfile, trigraphs))
1145                 cpp_warning_with_line (pfile, CPP_W_TRIGRAPHS,
1146                                        pfile->line_table->highest_line, col,
1147                                        "trigraph ??%c converted to %c",
1148                                        note->type,
1149                                        (int) _cpp_trigraph_map[note->type]);
1150               else
1151                 {
1152                   cpp_warning_with_line
1153                     (pfile, CPP_W_TRIGRAPHS,
1154                      pfile->line_table->highest_line, col,
1155                      "trigraph ??%c ignored, use -trigraphs to enable",
1156                      note->type);
1157                 }
1158             }
1159         }
1160       else if (note->type == 0)
1161         /* Already processed in lex_raw_string.  */;
1162       else
1163         abort ();
1164     }
1165 }
1166
1167 /* Skip a C-style block comment.  We find the end of the comment by
1168    seeing if an asterisk is before every '/' we encounter.  Returns
1169    nonzero if comment terminated by EOF, zero otherwise.
1170
1171    Buffer->cur points to the initial asterisk of the comment.  */
1172 bool
1173 _cpp_skip_block_comment (cpp_reader *pfile)
1174 {
1175   cpp_buffer *buffer = pfile->buffer;
1176   const uchar *cur = buffer->cur;
1177   uchar c;
1178
1179   cur++;
1180   if (*cur == '/')
1181     cur++;
1182
1183   for (;;)
1184     {
1185       /* People like decorating comments with '*', so check for '/'
1186          instead for efficiency.  */
1187       c = *cur++;
1188
1189       if (c == '/')
1190         {
1191           if (cur[-2] == '*')
1192             break;
1193
1194           /* Warn about potential nested comments, but not if the '/'
1195              comes immediately before the true comment delimiter.
1196              Don't bother to get it right across escaped newlines.  */
1197           if (CPP_OPTION (pfile, warn_comments)
1198               && cur[0] == '*' && cur[1] != '/')
1199             {
1200               buffer->cur = cur;
1201               cpp_warning_with_line (pfile, CPP_W_COMMENTS,
1202                                      pfile->line_table->highest_line,
1203                                      CPP_BUF_COL (buffer),
1204                                      "\"/*\" within comment");
1205             }
1206         }
1207       else if (c == '\n')
1208         {
1209           unsigned int cols;
1210           buffer->cur = cur - 1;
1211           _cpp_process_line_notes (pfile, true);
1212           if (buffer->next_line >= buffer->rlimit)
1213             return true;
1214           _cpp_clean_line (pfile);
1215
1216           cols = buffer->next_line - buffer->line_base;
1217           CPP_INCREMENT_LINE (pfile, cols);
1218
1219           cur = buffer->cur;
1220         }
1221     }
1222
1223   buffer->cur = cur;
1224   _cpp_process_line_notes (pfile, true);
1225   return false;
1226 }
1227
1228 /* Skip a C++ line comment, leaving buffer->cur pointing to the
1229    terminating newline.  Handles escaped newlines.  Returns nonzero
1230    if a multiline comment.  */
1231 static int
1232 skip_line_comment (cpp_reader *pfile)
1233 {
1234   cpp_buffer *buffer = pfile->buffer;
1235   location_t orig_line = pfile->line_table->highest_line;
1236
1237   while (*buffer->cur != '\n')
1238     buffer->cur++;
1239
1240   _cpp_process_line_notes (pfile, true);
1241   return orig_line != pfile->line_table->highest_line;
1242 }
1243
1244 /* Skips whitespace, saving the next non-whitespace character.  */
1245 static void
1246 skip_whitespace (cpp_reader *pfile, cppchar_t c)
1247 {
1248   cpp_buffer *buffer = pfile->buffer;
1249   bool saw_NUL = false;
1250
1251   do
1252     {
1253       /* Horizontal space always OK.  */
1254       if (c == ' ' || c == '\t')
1255         ;
1256       /* Just \f \v or \0 left.  */
1257       else if (c == '\0')
1258         saw_NUL = true;
1259       else if (pfile->state.in_directive && CPP_PEDANTIC (pfile))
1260         cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line,
1261                              CPP_BUF_COL (buffer),
1262                              "%s in preprocessing directive",
1263                              c == '\f' ? "form feed" : "vertical tab");
1264
1265       c = *buffer->cur++;
1266     }
1267   /* We only want non-vertical space, i.e. ' ' \t \f \v \0.  */
1268   while (is_nvspace (c));
1269
1270   if (saw_NUL)
1271     cpp_error (pfile, CPP_DL_WARNING, "null character(s) ignored");
1272
1273   buffer->cur--;
1274 }
1275
1276 /* See if the characters of a number token are valid in a name (no
1277    '.', '+' or '-').  */
1278 static int
1279 name_p (cpp_reader *pfile, const cpp_string *string)
1280 {
1281   unsigned int i;
1282
1283   for (i = 0; i < string->len; i++)
1284     if (!is_idchar (string->text[i]))
1285       return 0;
1286
1287   return 1;
1288 }
1289
1290 /* After parsing an identifier or other sequence, produce a warning about
1291    sequences not in NFC/NFKC.  */
1292 static void
1293 warn_about_normalization (cpp_reader *pfile,
1294                           const cpp_token *token,
1295                           const struct normalize_state *s)
1296 {
1297   if (CPP_OPTION (pfile, warn_normalize) < NORMALIZE_STATE_RESULT (s)
1298       && !pfile->state.skipping)
1299     {
1300       /* Make sure that the token is printed using UCNs, even
1301          if we'd otherwise happily print UTF-8.  */
1302       unsigned char *buf = XNEWVEC (unsigned char, cpp_token_len (token));
1303       size_t sz;
1304
1305       sz = cpp_spell_token (pfile, token, buf, false) - buf;
1306       if (NORMALIZE_STATE_RESULT (s) == normalized_C)
1307         cpp_warning_with_line (pfile, CPP_W_NORMALIZE, token->src_loc, 0,
1308                                "`%.*s' is not in NFKC", (int) sz, buf);
1309       else
1310         cpp_warning_with_line (pfile, CPP_W_NORMALIZE, token->src_loc, 0,
1311                                "`%.*s' is not in NFC", (int) sz, buf);
1312       free (buf);
1313     }
1314 }
1315
1316 static const cppchar_t utf8_signifier = 0xC0;
1317
1318 /* Returns TRUE if the sequence starting at buffer->cur is valid in
1319    an identifier.  FIRST is TRUE if this starts an identifier.  */
1320 static bool
1321 forms_identifier_p (cpp_reader *pfile, int first,
1322                     struct normalize_state *state)
1323 {
1324   cpp_buffer *buffer = pfile->buffer;
1325
1326   if (*buffer->cur == '$')
1327     {
1328       if (!CPP_OPTION (pfile, dollars_in_ident))
1329         return false;
1330
1331       buffer->cur++;
1332       if (CPP_OPTION (pfile, warn_dollars) && !pfile->state.skipping)
1333         {
1334           CPP_OPTION (pfile, warn_dollars) = 0;
1335           cpp_error (pfile, CPP_DL_PEDWARN, "'$' in identifier or number");
1336         }
1337
1338       return true;
1339     }
1340
1341   /* Is this a syntactically valid UCN or a valid UTF-8 char?  */
1342   if (CPP_OPTION (pfile, extended_identifiers))
1343     {
1344       cppchar_t s;
1345       if (*buffer->cur >= utf8_signifier)
1346         {
1347           if (_cpp_valid_utf8 (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
1348                                state, &s))
1349             return true;
1350         }
1351       else if (*buffer->cur == '\\'
1352                && (buffer->cur[1] == 'u' || buffer->cur[1] == 'U'))
1353         {
1354           buffer->cur += 2;
1355           if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
1356                               state, &s, NULL, NULL))
1357             return true;
1358           buffer->cur -= 2;
1359         }
1360     }
1361
1362   return false;
1363 }
1364
1365 /* Helper function to issue error about improper __VA_OPT__ use.  */
1366 static void
1367 maybe_va_opt_error (cpp_reader *pfile)
1368 {
1369   if (CPP_PEDANTIC (pfile) && !CPP_OPTION (pfile, va_opt))
1370     {
1371       /* __VA_OPT__ should not be accepted at all, but allow it in
1372          system headers.  */
1373       if (!_cpp_in_system_header (pfile))
1374         cpp_error (pfile, CPP_DL_PEDWARN,
1375                    "__VA_OPT__ is not available until C++20");
1376     }
1377   else if (!pfile->state.va_args_ok)
1378     {
1379       /* __VA_OPT__ should only appear in the replacement list of a
1380          variadic macro.  */
1381       cpp_error (pfile, CPP_DL_PEDWARN,
1382                  "__VA_OPT__ can only appear in the expansion"
1383                  " of a C++20 variadic macro");
1384     }
1385 }
1386
1387 /* Helper function to get the cpp_hashnode of the identifier BASE.  */
1388 static cpp_hashnode *
1389 lex_identifier_intern (cpp_reader *pfile, const uchar *base)
1390 {
1391   cpp_hashnode *result;
1392   const uchar *cur;
1393   unsigned int len;
1394   unsigned int hash = HT_HASHSTEP (0, *base);
1395
1396   cur = base + 1;
1397   while (ISIDNUM (*cur))
1398     {
1399       hash = HT_HASHSTEP (hash, *cur);
1400       cur++;
1401     }
1402   len = cur - base;
1403   hash = HT_HASHFINISH (hash, len);
1404   result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
1405                                               base, len, hash, HT_ALLOC));
1406
1407   /* Rarely, identifiers require diagnostics when lexed.  */
1408   if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
1409                         && !pfile->state.skipping, 0))
1410     {
1411       /* It is allowed to poison the same identifier twice.  */
1412       if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
1413         cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
1414                    NODE_NAME (result));
1415
1416       /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
1417          replacement list of a variadic macro.  */
1418       if (result == pfile->spec_nodes.n__VA_ARGS__
1419           && !pfile->state.va_args_ok)
1420         {
1421           if (CPP_OPTION (pfile, cplusplus))
1422             cpp_error (pfile, CPP_DL_PEDWARN,
1423                        "__VA_ARGS__ can only appear in the expansion"
1424                        " of a C++11 variadic macro");
1425           else
1426             cpp_error (pfile, CPP_DL_PEDWARN,
1427                        "__VA_ARGS__ can only appear in the expansion"
1428                        " of a C99 variadic macro");
1429         }
1430
1431       if (result == pfile->spec_nodes.n__VA_OPT__)
1432         maybe_va_opt_error (pfile);
1433
1434       /* For -Wc++-compat, warn about use of C++ named operators.  */
1435       if (result->flags & NODE_WARN_OPERATOR)
1436         cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
1437                      "identifier \"%s\" is a special operator name in C++",
1438                      NODE_NAME (result));
1439     }
1440
1441   return result;
1442 }
1443
1444 /* Get the cpp_hashnode of an identifier specified by NAME in
1445    the current cpp_reader object.  If none is found, NULL is returned.  */
1446 cpp_hashnode *
1447 _cpp_lex_identifier (cpp_reader *pfile, const char *name)
1448 {
1449   cpp_hashnode *result;
1450   result = lex_identifier_intern (pfile, (uchar *) name);
1451   return result;
1452 }
1453
1454 /* Lex an identifier starting at BUFFER->CUR - 1.  */
1455 static cpp_hashnode *
1456 lex_identifier (cpp_reader *pfile, const uchar *base, bool starts_ucn,
1457                 struct normalize_state *nst, cpp_hashnode **spelling)
1458 {
1459   cpp_hashnode *result;
1460   const uchar *cur;
1461   unsigned int len;
1462   unsigned int hash = HT_HASHSTEP (0, *base);
1463
1464   cur = pfile->buffer->cur;
1465   if (! starts_ucn)
1466     {
1467       while (ISIDNUM (*cur))
1468         {
1469           hash = HT_HASHSTEP (hash, *cur);
1470           cur++;
1471         }
1472       NORMALIZE_STATE_UPDATE_IDNUM (nst, *(cur - 1));
1473     }
1474   pfile->buffer->cur = cur;
1475   if (starts_ucn || forms_identifier_p (pfile, false, nst))
1476     {
1477       /* Slower version for identifiers containing UCNs
1478          or extended chars (including $).  */
1479       do {
1480         while (ISIDNUM (*pfile->buffer->cur))
1481           {
1482             NORMALIZE_STATE_UPDATE_IDNUM (nst, *pfile->buffer->cur);
1483             pfile->buffer->cur++;
1484           }
1485       } while (forms_identifier_p (pfile, false, nst));
1486       result = _cpp_interpret_identifier (pfile, base,
1487                                           pfile->buffer->cur - base);
1488       *spelling = cpp_lookup (pfile, base, pfile->buffer->cur - base);
1489     }
1490   else
1491     {
1492       len = cur - base;
1493       hash = HT_HASHFINISH (hash, len);
1494
1495       result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
1496                                                   base, len, hash, HT_ALLOC));
1497       *spelling = result;
1498     }
1499
1500   /* Rarely, identifiers require diagnostics when lexed.  */
1501   if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
1502                         && !pfile->state.skipping, 0))
1503     {
1504       /* It is allowed to poison the same identifier twice.  */
1505       if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
1506         cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
1507                    NODE_NAME (result));
1508
1509       /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
1510          replacement list of a variadic macro.  */
1511       if (result == pfile->spec_nodes.n__VA_ARGS__
1512           && !pfile->state.va_args_ok)
1513         {
1514           if (CPP_OPTION (pfile, cplusplus))
1515             cpp_error (pfile, CPP_DL_PEDWARN,
1516                        "__VA_ARGS__ can only appear in the expansion"
1517                        " of a C++11 variadic macro");
1518           else
1519             cpp_error (pfile, CPP_DL_PEDWARN,
1520                        "__VA_ARGS__ can only appear in the expansion"
1521                        " of a C99 variadic macro");
1522         }
1523
1524       /* __VA_OPT__ should only appear in the replacement list of a
1525          variadic macro.  */
1526       if (result == pfile->spec_nodes.n__VA_OPT__)
1527         maybe_va_opt_error (pfile);
1528
1529       /* For -Wc++-compat, warn about use of C++ named operators.  */
1530       if (result->flags & NODE_WARN_OPERATOR)
1531         cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
1532                      "identifier \"%s\" is a special operator name in C++",
1533                      NODE_NAME (result));
1534     }
1535
1536   return result;
1537 }
1538
1539 /* Lex a number to NUMBER starting at BUFFER->CUR - 1.  */
1540 static void
1541 lex_number (cpp_reader *pfile, cpp_string *number,
1542             struct normalize_state *nst)
1543 {
1544   const uchar *cur;
1545   const uchar *base;
1546   uchar *dest;
1547
1548   base = pfile->buffer->cur - 1;
1549   do
1550     {
1551       const uchar *adj_digit_sep = NULL;
1552       cur = pfile->buffer->cur;
1553
1554       /* N.B. ISIDNUM does not include $.  */
1555       while (ISIDNUM (*cur)
1556              || (*cur == '.' && !DIGIT_SEP (cur[-1]))
1557              || DIGIT_SEP (*cur)
1558              || (VALID_SIGN (*cur, cur[-1]) && !DIGIT_SEP (cur[-2])))
1559         {
1560           NORMALIZE_STATE_UPDATE_IDNUM (nst, *cur);
1561           /* Adjacent digit separators do not form part of the pp-number syntax.
1562              However, they can safely be diagnosed here as an error, since '' is
1563              not a valid preprocessing token.  */
1564           if (DIGIT_SEP (*cur) && DIGIT_SEP (cur[-1]) && !adj_digit_sep)
1565             adj_digit_sep = cur;
1566           cur++;
1567         }
1568       /* A number can't end with a digit separator.  */
1569       while (cur > pfile->buffer->cur && DIGIT_SEP (cur[-1]))
1570         --cur;
1571       if (adj_digit_sep && adj_digit_sep < cur)
1572         cpp_error (pfile, CPP_DL_ERROR, "adjacent digit separators");
1573
1574       pfile->buffer->cur = cur;
1575     }
1576   while (forms_identifier_p (pfile, false, nst));
1577
1578   number->len = cur - base;
1579   dest = _cpp_unaligned_alloc (pfile, number->len + 1);
1580   memcpy (dest, base, number->len);
1581   dest[number->len] = '\0';
1582   number->text = dest;
1583 }
1584
1585 /* Create a token of type TYPE with a literal spelling.  */
1586 static void
1587 create_literal (cpp_reader *pfile, cpp_token *token, const uchar *base,
1588                 unsigned int len, enum cpp_ttype type)
1589 {
1590   token->type = type;
1591   token->val.str.len = len;
1592   token->val.str.text = cpp_alloc_token_string (pfile, base, len);
1593 }
1594
1595 const uchar *
1596 cpp_alloc_token_string (cpp_reader *pfile,
1597                         const unsigned char *ptr, unsigned len)
1598 {
1599   uchar *dest = _cpp_unaligned_alloc (pfile, len + 1);
1600
1601   dest[len] = 0;
1602   memcpy (dest, ptr, len);
1603   return dest;
1604 }
1605
1606 /* A pair of raw buffer pointers.  The currently open one is [1], the
1607    first one is [0].  Used for string literal lexing.  */
1608 struct lit_accum {
1609   _cpp_buff *first;
1610   _cpp_buff *last;
1611   const uchar *rpos;
1612   size_t accum;
1613
1614   lit_accum ()
1615     : first (NULL), last (NULL), rpos (0), accum (0)
1616   {
1617   }
1618
1619   void append (cpp_reader *, const uchar *, size_t);
1620
1621   void read_begin (cpp_reader *);
1622   bool reading_p () const
1623   {
1624     return rpos != NULL;
1625   }
1626   char read_char ()
1627   {
1628     char c = *rpos++;
1629     if (rpos == BUFF_FRONT (last))
1630       rpos = NULL;
1631     return c;
1632   }
1633 };
1634
1635 /* Subroutine of lex_raw_string: Append LEN chars from BASE to the buffer
1636    sequence from *FIRST_BUFF_P to LAST_BUFF_P.  */
1637
1638 void
1639 lit_accum::append (cpp_reader *pfile, const uchar *base, size_t len)
1640 {
1641   if (!last)
1642     /* Starting.  */
1643     first = last = _cpp_get_buff (pfile, len);
1644   else if (len > BUFF_ROOM (last))
1645     {
1646       /* There is insufficient room in the buffer.  Copy what we can,
1647          and then either extend or create a new one.  */
1648       size_t room = BUFF_ROOM (last);
1649       memcpy (BUFF_FRONT (last), base, room);
1650       BUFF_FRONT (last) += room;
1651       base += room;
1652       len -= room;
1653       accum += room;
1654
1655       gcc_checking_assert (!rpos);
1656
1657       last = _cpp_append_extend_buff (pfile, last, len);
1658     }
1659
1660   memcpy (BUFF_FRONT (last), base, len);
1661   BUFF_FRONT (last) += len;
1662   accum += len;
1663 }
1664
1665 void
1666 lit_accum::read_begin (cpp_reader *pfile)
1667 {
1668   /* We never accumulate more than 4 chars to read.  */
1669   if (BUFF_ROOM (last) < 4)
1670
1671     last = _cpp_append_extend_buff (pfile, last, 4);
1672   rpos = BUFF_FRONT (last);
1673 }
1674
1675 /* Returns true if a macro has been defined.
1676    This might not work if compile with -save-temps,
1677    or preprocess separately from compilation.  */
1678
1679 static bool
1680 is_macro(cpp_reader *pfile, const uchar *base)
1681 {
1682   const uchar *cur = base;
1683   if (! ISIDST (*cur))
1684     return false;
1685   unsigned int hash = HT_HASHSTEP (0, *cur);
1686   ++cur;
1687   while (ISIDNUM (*cur))
1688     {
1689       hash = HT_HASHSTEP (hash, *cur);
1690       ++cur;
1691     }
1692   hash = HT_HASHFINISH (hash, cur - base);
1693
1694   cpp_hashnode *result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
1695                                         base, cur - base, hash, HT_NO_INSERT));
1696
1697   return result && cpp_macro_p (result);
1698 }
1699
1700 /* Returns true if a literal suffix does not have the expected form
1701    and is defined as a macro.  */
1702
1703 static bool
1704 is_macro_not_literal_suffix(cpp_reader *pfile, const uchar *base)
1705 {
1706   /* User-defined literals outside of namespace std must start with a single
1707      underscore, so assume anything of that form really is a UDL suffix.
1708      We don't need to worry about UDLs defined inside namespace std because
1709      their names are reserved, so cannot be used as macro names in valid
1710      programs.  */
1711   if (base[0] == '_' && base[1] != '_')
1712     return false;
1713   return is_macro (pfile, base);
1714 }
1715
1716 /* Lexes a raw string.  The stored string contains the spelling,
1717    including double quotes, delimiter string, '(' and ')', any leading
1718    'L', 'u', 'U' or 'u8' and 'R' modifier.  The created token contains
1719    the type of the literal, or CPP_OTHER if it was not properly
1720    terminated.
1721
1722    BASE is the start of the token.  Updates pfile->buffer->cur to just
1723    after the lexed string.
1724
1725    The spelling is NUL-terminated, but it is not guaranteed that this
1726    is the first NUL since embedded NULs are preserved.  */
1727
1728 static void
1729 lex_raw_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
1730 {
1731   const uchar *pos = base;
1732
1733   /* 'tis a pity this information isn't passed down from the lexer's
1734      initial categorization of the token.  */
1735   enum cpp_ttype type = CPP_STRING;
1736
1737   if (*pos == 'L')
1738     {
1739       type = CPP_WSTRING;
1740       pos++;
1741     }
1742   else if (*pos == 'U')
1743     {
1744       type = CPP_STRING32;
1745       pos++;
1746     }
1747   else if (*pos == 'u')
1748     {
1749       if (pos[1] == '8')
1750         {
1751           type = CPP_UTF8STRING;
1752           pos++;
1753         }
1754       else
1755         type = CPP_STRING16;
1756       pos++;
1757     }
1758
1759   gcc_checking_assert (pos[0] == 'R' && pos[1] == '"');
1760   pos += 2;
1761
1762   _cpp_line_note *note = &pfile->buffer->notes[pfile->buffer->cur_note];
1763
1764   /* Skip notes before the ".  */
1765   while (note->pos < pos)
1766     ++note;
1767
1768   lit_accum accum;
1769
1770   uchar prefix[17];
1771   unsigned prefix_len = 0;
1772   enum Phase
1773   {
1774    PHASE_PREFIX = -2,
1775    PHASE_NONE = -1,
1776    PHASE_SUFFIX = 0
1777   } phase = PHASE_PREFIX;
1778
1779   for (;;)
1780     {
1781       gcc_checking_assert (note->pos >= pos);
1782
1783       /* Undo any escaped newlines and trigraphs.  */
1784       if (!accum.reading_p () && note->pos == pos)
1785         switch (note->type)
1786           {
1787           case '\\':
1788           case ' ':
1789             /* Restore backslash followed by newline.  */
1790             accum.append (pfile, base, pos - base);
1791             base = pos;
1792             accum.read_begin (pfile);
1793             accum.append (pfile, UC"\\", 1);
1794
1795           after_backslash:
1796             if (note->type == ' ')
1797               /* GNU backslash whitespace newline extension.  FIXME
1798                  could be any sequence of non-vertical space.  When we
1799                  can properly restore any such sequence, we should
1800                  mark this note as handled so _cpp_process_line_notes
1801                  doesn't warn.  */
1802               accum.append (pfile, UC" ", 1);
1803
1804             accum.append (pfile, UC"\n", 1);
1805             note++;
1806             break;
1807
1808           case '\n':
1809             /* This can happen for ??/<NEWLINE> when trigraphs are not
1810                being interpretted.  */
1811             gcc_checking_assert (!CPP_OPTION (pfile, trigraphs));
1812             note->type = 0;
1813             note++;
1814             break;
1815
1816           default:
1817             gcc_checking_assert (_cpp_trigraph_map[note->type]);
1818
1819             /* Don't warn about this trigraph in
1820                _cpp_process_line_notes, since trigraphs show up as
1821                trigraphs in raw strings.  */
1822             uchar type = note->type;
1823             note->type = 0;
1824
1825             if (CPP_OPTION (pfile, trigraphs))
1826               {
1827                 accum.append (pfile, base, pos - base);
1828                 base = pos;
1829                 accum.read_begin (pfile);
1830                 accum.append (pfile, UC"??", 2);
1831                 accum.append (pfile, &type, 1);
1832
1833                 /* ??/ followed by newline gets two line notes, one for
1834                    the trigraph and one for the backslash/newline.  */
1835                 if (type == '/' && note[1].pos == pos)
1836                   {
1837                     note++;
1838                     gcc_assert (note->type == '\\' || note->type == ' ');
1839                     goto after_backslash;
1840                   }
1841                 /* Skip the replacement character.  */
1842                 base = ++pos;
1843               }
1844
1845             note++;
1846             break;
1847           }
1848
1849       /* Now get a char to process.  Either from an expanded note, or
1850          from the line buffer.  */
1851       bool read_note = accum.reading_p ();
1852       char c = read_note ? accum.read_char () : *pos++;
1853
1854       if (phase == PHASE_PREFIX)
1855         {
1856           if (c == '(')
1857             {
1858               /* Done.  */
1859               phase = PHASE_NONE;
1860               prefix[prefix_len++] = '"';
1861             }
1862           else if (prefix_len < 16
1863                    /* Prefix chars are any of the basic character set,
1864                       [lex.charset] except for '
1865                       ()\\\t\v\f\n'. Optimized for a contiguous
1866                       alphabet.  */
1867                    /* Unlike a switch, this collapses down to one or
1868                       two shift and bitmask operations on an ASCII
1869                       system, with an outlier or two.   */
1870                    && (('Z' - 'A' == 25
1871                         ? ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'))
1872                         : ISIDST (c))
1873                        || (c >= '0' && c <= '9')
1874                        || c == '_' || c == '{' || c == '}'
1875                        || c == '[' || c == ']' || c == '#'
1876                        || c == '<' || c == '>' || c == '%'
1877                        || c == ':' || c == ';' || c == '.' || c == '?'
1878                        || c == '*' || c == '+' || c == '-' || c == '/'
1879                        || c == '^' || c == '&' || c == '|' || c == '~'
1880                        || c == '!' || c == '=' || c == ','
1881                        || c == '"' || c == '\''))
1882             prefix[prefix_len++] = c;
1883           else
1884             {
1885               /* Something is wrong.  */
1886               int col = CPP_BUF_COLUMN (pfile->buffer, pos) + read_note;
1887               if (prefix_len == 16)
1888                 cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
1889                                      col, "raw string delimiter longer "
1890                                      "than 16 characters");
1891               else if (c == '\n')
1892                 cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
1893                                      col, "invalid new-line in raw "
1894                                      "string delimiter");
1895               else
1896                 cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
1897                                      col, "invalid character '%c' in "
1898                                      "raw string delimiter", c);
1899               type = CPP_OTHER;
1900               phase = PHASE_NONE;
1901               /* Continue until we get a close quote, that's probably
1902                  the best failure mode.  */
1903               prefix_len = 0;
1904             }
1905           if (c != '\n')
1906             continue;
1907         }
1908
1909       if (phase != PHASE_NONE)
1910         {
1911           if (prefix[phase] != c)
1912             phase = PHASE_NONE;
1913           else if (unsigned (phase + 1) == prefix_len)
1914             break;
1915           else
1916             {
1917               phase = Phase (phase + 1);
1918               continue;
1919             }
1920         }
1921
1922       if (!prefix_len && c == '"')
1923         /* Failure mode lexing.  */
1924         goto out;
1925       else if (prefix_len && c == ')')
1926         phase = PHASE_SUFFIX;
1927       else if (!read_note && c == '\n')
1928         {
1929           pos--;
1930           pfile->buffer->cur = pos;
1931           if (pfile->state.in_directive
1932               || (pfile->state.parsing_args
1933                   && pfile->buffer->next_line >= pfile->buffer->rlimit))
1934             {
1935               cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, 0,
1936                                    "unterminated raw string");
1937               type = CPP_OTHER;
1938               goto out;
1939             }
1940
1941           accum.append (pfile, base, pos - base + 1);
1942           _cpp_process_line_notes (pfile, false);
1943
1944           if (pfile->buffer->next_line < pfile->buffer->rlimit)
1945             CPP_INCREMENT_LINE (pfile, 0);
1946           pfile->buffer->need_line = true;
1947
1948           if (!_cpp_get_fresh_line (pfile))
1949             {
1950               /* We ran out of file and failed to get a line.  */
1951               location_t src_loc = token->src_loc;
1952               token->type = CPP_EOF;
1953               /* Tell the compiler the line number of the EOF token.  */
1954               token->src_loc = pfile->line_table->highest_line;
1955               token->flags = BOL;
1956               if (accum.first)
1957                 _cpp_release_buff (pfile, accum.first);
1958               cpp_error_with_line (pfile, CPP_DL_ERROR, src_loc, 0,
1959                                    "unterminated raw string");
1960               /* Now pop the buffer that _cpp_get_fresh_line did not.  */
1961               _cpp_pop_buffer (pfile);
1962               return;
1963             }
1964
1965           pos = base = pfile->buffer->cur;
1966           note = &pfile->buffer->notes[pfile->buffer->cur_note];
1967         }
1968     }
1969
1970   if (CPP_OPTION (pfile, user_literals))
1971     {
1972       /* If a string format macro, say from inttypes.h, is placed touching
1973          a string literal it could be parsed as a C++11 user-defined string
1974          literal thus breaking the program.  */
1975       if (is_macro_not_literal_suffix (pfile, pos))
1976         {
1977           /* Raise a warning, but do not consume subsequent tokens.  */
1978           if (CPP_OPTION (pfile, warn_literal_suffix) && !pfile->state.skipping)
1979             cpp_warning_with_line (pfile, CPP_W_LITERAL_SUFFIX,
1980                                    token->src_loc, 0,
1981                                    "invalid suffix on literal; C++11 requires "
1982                                    "a space between literal and string macro");
1983         }
1984       /* Grab user defined literal suffix.  */
1985       else if (ISIDST (*pos))
1986         {
1987           type = cpp_userdef_string_add_type (type);
1988           ++pos;
1989
1990           while (ISIDNUM (*pos))
1991             ++pos;
1992         }
1993     }
1994
1995  out:
1996   pfile->buffer->cur = pos;
1997   if (!accum.accum)
1998     create_literal (pfile, token, base, pos - base, type);
1999   else
2000     {
2001       size_t extra_len = pos - base;
2002       uchar *dest = _cpp_unaligned_alloc (pfile, accum.accum + extra_len + 1);
2003
2004       token->type = type;
2005       token->val.str.len = accum.accum + extra_len;
2006       token->val.str.text = dest;
2007       for (_cpp_buff *buf = accum.first; buf; buf = buf->next)
2008         {
2009           size_t len = BUFF_FRONT (buf) - buf->base;
2010           memcpy (dest, buf->base, len);
2011           dest += len;
2012         }
2013       _cpp_release_buff (pfile, accum.first);
2014       memcpy (dest, base, extra_len);
2015       dest[extra_len] = '\0';
2016     }
2017 }
2018
2019 /* Lexes a string, character constant, or angle-bracketed header file
2020    name whose spelling starts at BASE.  The stored string contains the
2021    spelling, including opening quote and any leading 'L', 'u', 'U' or
2022    'u8'; an 'R' modifier hands the literal over to lex_raw_string.
2023    Nothing is returned: TOKEN is created with the literal's type, or
2024    CPP_OTHER if it was not properly terminated; for an unterminated
2025    header name only TOKEN's type is set, to CPP_LESS, so that it can be
2026    relexed as normal tokens.  The spelling is NUL-terminated, but it is not
2027    guaranteed that this is the first NUL since embedded NULs are preserved.  */
2028 static void
2029 lex_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
2030 {
2031   bool saw_NUL = false;
2032   const uchar *cur;
2033   cppchar_t terminator;
2034   enum cpp_ttype type;
2035   /* Step over any L, U, u or u8 prefix to the character that follows it.  */
2036   cur = base;
2037   terminator = *cur++;
2038   if (terminator == 'L' || terminator == 'U')
2039     terminator = *cur++;
2040   else if (terminator == 'u')
2041     {
2042       terminator = *cur++;
2043       if (terminator == '8')
2044         terminator = *cur++;
2045     }
2046   if (terminator == 'R')
2047     {
2048       lex_raw_string (pfile, token, base);
2049       return;
2050     }
2051   if (terminator == '"')  /* The prefix at BASE selects the string type.  */
2052     type = (*base == 'L' ? CPP_WSTRING :
2053             *base == 'U' ? CPP_STRING32 :
2054             *base == 'u' ? (base[1] == '8' ? CPP_UTF8STRING : CPP_STRING16)
2055                          : CPP_STRING);
2056   else if (terminator == '\'')  /* Likewise for character constants.  */
2057     type = (*base == 'L' ? CPP_WCHAR :
2058             *base == 'U' ? CPP_CHAR32 :
2059             *base == 'u' ? (base[1] == '8' ? CPP_UTF8CHAR : CPP_CHAR16)
2060                          : CPP_CHAR);
2061   else  /* Neither quote: treat as a header name that ends at '>'.  */
2062     terminator = '>', type = CPP_HEADER_NAME;
2063   /* Scan for the terminator, stopping early at the end of the line.  */
2064   for (;;)
2065     {
2066       cppchar_t c = *cur++;
2067
2068       /* In #include-style directives, terminators are not escapable.  */
2069       if (c == '\\' && !pfile->state.angled_headers && *cur != '\n')
2070         cur++;  /* Skip the escaped char so that it cannot terminate.  */
2071       else if (c == terminator)
2072         break;
2073       else if (c == '\n')
2074         {
2075           cur--;  /* Leave the newline itself unconsumed.  */
2076           /* Unmatched quotes always yield undefined behavior, but
2077              greedy lexing means that what appears to be an unterminated
2078              header name may actually be a legitimate sequence of tokens.  */
2079           if (terminator == '>')
2080             {
2081               token->type = CPP_LESS;
2082               return;
2083             }
2084           type = CPP_OTHER;
2085           break;
2086         }
2087       else if (c == '\0')
2088         saw_NUL = true;
2089     }
2090
2091   if (saw_NUL && !pfile->state.skipping)
2092     cpp_error (pfile, CPP_DL_WARNING,
2093                "null character(s) preserved in literal");
2094   /* An unterminated literal is not diagnosed when the language is asm.  */
2095   if (type == CPP_OTHER && CPP_OPTION (pfile, lang) != CLK_ASM)
2096     cpp_error (pfile, CPP_DL_PEDWARN, "missing terminating %c character",
2097                (int) terminator);
2098
2099   if (CPP_OPTION (pfile, user_literals))
2100     {
2101       /* If a string format macro, say from inttypes.h, is placed touching
2102          a string literal it could be parsed as a C++11 user-defined string
2103          literal thus breaking the program.  */
2104       if (is_macro_not_literal_suffix (pfile, cur))
2105         {
2106           /* Raise a warning, but do not consume subsequent tokens.  */
2107           if (CPP_OPTION (pfile, warn_literal_suffix) && !pfile->state.skipping)
2108             cpp_warning_with_line (pfile, CPP_W_LITERAL_SUFFIX,
2109                                    token->src_loc, 0,
2110                                    "invalid suffix on literal; C++11 requires "
2111                                    "a space between literal and string macro");
2112         }
2113       /* Grab user defined literal suffix.  */
2114       else if (ISIDST (*cur))
2115         {
2116           type = cpp_userdef_char_add_type (type);
2117           type = cpp_userdef_string_add_type (type);  /* Presumably a no-op for char types; TODO confirm.  */
2118           ++cur;
2119
2120           while (ISIDNUM (*cur))
2121             ++cur;
2122         }
2123     }
2124   else if (CPP_OPTION (pfile, cpp_warn_cxx11_compat)
2125            && is_macro (pfile, cur)
2126            && !pfile->state.skipping)
2127     cpp_warning_with_line (pfile, CPP_W_CXX11_COMPAT,
2128                            token->src_loc, 0, "C++11 requires a space "
2129                            "between string literal and macro");
2130   /* Commit the consumed text and turn it into TOKEN.  */
2131   pfile->buffer->cur = cur;
2132   create_literal (pfile, token, base, cur - base, type);
2133 }
2134
2135 /* Expose PFILE's comment table; callers must not assume any entry order.  */
2136 cpp_comment_table *
2137 cpp_get_comments (cpp_reader *pfile)
2138 {
2139   cpp_comment_table *table = &pfile->comments;
2140   return table;
2141 }
2142
2143 /* Record TOKEN's spelling and location as a new entry at the end of
2144    PFILE's comment table, growing the table as required.  */
2145 static void
2146 store_comment (cpp_reader *pfile, cpp_token *token)
2147 {
2148   cpp_comment_table *table = &pfile->comments;
2149
2150   /* Make room: start with 256 slots, double whenever the table is full.  */
2151   if (table->allocated == 0)
2152     {
2153       table->allocated = 256;
2154       table->entries
2155         = (cpp_comment *) xmalloc (table->allocated * sizeof (cpp_comment));
2156     }
2157
2158   if (table->count == table->allocated)
2159     {
2160       table->allocated *= 2;
2161       table->entries
2162         = (cpp_comment *) xrealloc (table->entries,
2163                                     table->allocated * sizeof (cpp_comment));
2164     }
2165
2166   cpp_comment *slot = &table->entries[table->count];
2167   int text_len = token->val.str.len;
2168
2169   /* Duplicate the text; the token's spelling may lack a terminating NUL,
2170      so add one explicitly.  */
2171   slot->comment = (char *) xmalloc (sizeof (char) * (text_len + 1));
2172   memcpy (slot->comment, token->val.str.text, text_len);
2173   slot->comment[text_len] = '\0';
2174
2175   /* Remember where the comment came from, then publish the new entry.  */
2176   slot->sloc = token->src_loc;
2177   table->count++;
2178 }
2179
2180 /* Turn the comment that runs from FROM to the current buffer position
2181    into a CPP_COMMENT token and record it in the comment table.  FROM
2182    points just past the comment's initial '/', and TYPE is '/' for a
2183    C++ line comment.  The stored text includes the comment start and
2184    any terminator.  */
2185 static void
2186 save_comment (cpp_reader *pfile, cpp_token *token, const unsigned char *from,
2187               cppchar_t type)
2188 {
2189   const unsigned char *end = pfile->buffer->cur;
2190
2191   /* Count the text after the initial '/' plus that '/' itself, but
2192      drop a trailing newline: a C++ comment has probably (not
2193      definitely) been lexed past one.  */
2194   unsigned int text_len = end - from + 1;
2195   if (is_vspace (end[-1]))
2196     text_len -= 1;
2197
2198   /* Inside a directive (in practice only when saving comments in a
2199      "#define") or while collecting macro arguments, C++ comments are
2200      stored as C comments, which takes two extra bytes for the
2201      closing delimiter.  */
2202   const bool to_c_comment
2203     = (pfile->state.in_directive || pfile->state.parsing_args) && type == '/';
2204   unsigned int total_len = to_c_comment ? text_len + 2 : text_len;
2205
2206   unsigned char *text = _cpp_unaligned_alloc (pfile, total_len);
2207   token->type = CPP_COMMENT;
2208   token->val.str.len = total_len;
2209   token->val.str.text = text;
2210
2211   text[0] = '/';
2212   memcpy (text + 1, from, text_len - 1);
2213
2214   if (to_c_comment)
2215     {
2216       /* Rewrite the opening "//" as an opening C delimiter, append the
2217          closing one, and defuse any '/' next to a '*' in between so the
2218          body cannot open or close a C comment by itself.  */
2219       text[1] = '*';
2220       text[total_len - 2] = '*';
2221       text[total_len - 1] = '/';
2222       for (unsigned int k = 2; k < total_len - 2; k++)
2223         if (text[k] == '/' && (text[k - 1] == '*' || text[k + 1] == '*'))
2224           text[k] = '|';
2225     }
2226
2227   store_comment (pfile, token);
2228 }
2229
2230 /* Returns true if comment at COMMENT_START is a recognized FALLTHROUGH
2231    comment at the current cpp_warn_implicit_fallthrough level.  */
2232 /* COMMENT_START is the opener's second char, '*' for a block comment.  */
2233 static bool
2234 fallthrough_comment_p (cpp_reader *pfile, const unsigned char *comment_start)
2235 {
2236   const unsigned char *from = comment_start + 1;
2237   /* FROM scans the comment's text; pfile->buffer->cur bounds the checks.  */
2238   switch (CPP_OPTION (pfile, cpp_warn_implicit_fallthrough))
2239     {
2240       /* For both -Wimplicit-fallthrough=0 and -Wimplicit-fallthrough=5 we
2241          don't recognize any comments.  The latter only checks attributes,
2242          the former doesn't warn.  */
2243     case 0:
2244     default:
2245       return false;
2246       /* -Wimplicit-fallthrough=1 considers any comment, no matter what
2247          content it has.  */
2248     case 1:
2249       return true;
2250     case 2:
2251       /* -Wimplicit-fallthrough=2 looks for (case insensitive)
2252          .*falls?[ \t-]*thr(u|ough).* regex.  */
2253       for (; (size_t) (pfile->buffer->cur - from) >= sizeof "fallthru" - 1;
2254            from++)
2255         {
2256           /* Is there anything like strpbrk with upper boundary, or
2257              memchr looking for 2 characters rather than just one?  */
2258           if (from[0] != 'f' && from[0] != 'F')
2259             continue;
2260           if (from[1] != 'a' && from[1] != 'A')
2261             continue;
2262           if (from[2] != 'l' && from[2] != 'L')
2263             continue;
2264           if (from[3] != 'l' && from[3] != 'L')
2265             continue;
2266           from += sizeof "fall" - 1;
2267           if (from[0] == 's' || from[0] == 'S')
2268             from++;
2269           while (*from == ' ' || *from == '\t' || *from == '-')
2270             from++;
2271           if (from[0] != 't' && from[0] != 'T')
2272             continue;
2273           if (from[1] != 'h' && from[1] != 'H')
2274             continue;
2275           if (from[2] != 'r' && from[2] != 'R')
2276             continue;
2277           if (from[3] == 'u' || from[3] == 'U')
2278             return true;
2279           if (from[3] != 'o' && from[3] != 'O')
2280             continue;
2281           if (from[4] != 'u' && from[4] != 'U')
2282             continue;
2283           if (from[5] != 'g' && from[5] != 'G')
2284             continue;
2285           if (from[6] != 'h' && from[6] != 'H')
2286             continue;
2287           return true;
2288         }
2289       return false;
2290     case 3:
2291     case 4:
2292       break;
2293     }
2294   /* Levels 3 and 4 require the whole comment to match a known form.  */
2295   /* Whole comment contents:
2296      -fallthrough
2297      @fallthrough@
2298    */
2299   if (*from == '-' || *from == '@')
2300     {
2301       size_t len = sizeof "fallthrough" - 1;
2302       if ((size_t) (pfile->buffer->cur - from - 1) < len)
2303         return false;
2304       if (memcmp (from + 1, "fallthrough", len))
2305         return false;
2306       if (*from == '@')
2307         {
2308           if (from[len + 1] != '@')
2309             return false;
2310           len++;
2311         }
2312       from += 1 + len;
2313     }
2314   /* Whole comment contents (regex):
2315      lint -fallthrough[ \t]*
2316    */
2317   else if (*from == 'l')
2318     {
2319       size_t len = sizeof "int -fallthrough" - 1;
2320       if ((size_t) (pfile->buffer->cur - from - 1) < len)
2321         return false;
2322       if (memcmp (from + 1, "int -fallthrough", len))
2323         return false;
2324       from += 1 + len;
2325       while (*from == ' ' || *from == '\t')
2326         from++;
2327     }
2328   /* Whole comment contents (regex):
2329      [ \t]*FALLTHR(U|OUGH)[ \t]*
2330    */
2331   else if (CPP_OPTION (pfile, cpp_warn_implicit_fallthrough) == 4)
2332     {
2333       while (*from == ' ' || *from == '\t')
2334         from++;
2335       if ((size_t) (pfile->buffer->cur - from)  < sizeof "FALLTHRU" - 1)
2336         return false;
2337       if (memcmp (from, "FALLTHR", sizeof "FALLTHR" - 1))
2338         return false;
2339       from += sizeof "FALLTHR" - 1;
2340       if (*from == 'U')
2341         from++;
2342       else if ((size_t) (pfile->buffer->cur - from)  < sizeof "OUGH" - 1)
2343         return false;
2344       else if (memcmp (from, "OUGH", sizeof "OUGH" - 1))
2345         return false;
2346       else
2347         from += sizeof "OUGH" - 1;
2348       while (*from == ' ' || *from == '\t')
2349         from++;
2350     }
2351   /* Whole comment contents (regex):
2352      [ \t.!]*(ELSE,? |INTENTIONAL(LY)? )?FALL(S | |-)?THR(OUGH|U)[ \t.!]*(-[^\n\r]*)?
2353      [ \t.!]*(Else,? |Intentional(ly)? )?Fall((s | |-)[Tt]|t)hr(ough|u)[ \t.!]*(-[^\n\r]*)?
2354      [ \t.!]*([Ee]lse,? |[Ii]ntentional(ly)? )?fall(s | |-)?thr(ough|u)[ \t.!]*(-[^\n\r]*)?
2355    */
2356   else
2357     {
2358       while (*from == ' ' || *from == '\t' || *from == '.' || *from == '!')
2359         from++;
2360       unsigned char f = *from;
2361       bool all_upper = false;
2362       if (f == 'E' || f == 'e')
2363         {
2364           if ((size_t) (pfile->buffer->cur - from)
2365               < sizeof "else fallthru" - 1)
2366             return false;
2367           if (f == 'E' && memcmp (from + 1, "LSE", sizeof "LSE" - 1) == 0)
2368             all_upper = true;
2369           else if (memcmp (from + 1, "lse", sizeof "lse" - 1))
2370             return false;
2371           from += sizeof "else" - 1;
2372           if (*from == ',')
2373             from++;
2374           if (*from != ' ')
2375             return false;
2376           from++;
2377           if (all_upper && *from == 'f')
2378             return false;
2379           if (f == 'e' && *from == 'F')
2380             return false;
2381           f = *from;
2382         }
2383       else if (f == 'I' || f == 'i')
2384         {
2385           if ((size_t) (pfile->buffer->cur - from)
2386               < sizeof "intentional fallthru" - 1)
2387             return false;
2388           if (f == 'I' && memcmp (from + 1, "NTENTIONAL",
2389                                   sizeof "NTENTIONAL" - 1) == 0)
2390             all_upper = true;
2391           else if (memcmp (from + 1, "ntentional",
2392                            sizeof "ntentional" - 1))
2393             return false;
2394           from += sizeof "intentional" - 1;
2395           if (*from == ' ')
2396             {
2397               from++;
2398               if (all_upper && *from == 'f')
2399                 return false;
2400             }
2401           else if (all_upper)
2402             {
2403               if (memcmp (from, "LY F", sizeof "LY F" - 1))
2404                 return false;
2405               from += sizeof "LY " - 1;
2406             }
2407           else
2408             {
2409               if (memcmp (from, "ly ", sizeof "ly " - 1))
2410                 return false;
2411               from += sizeof "ly " - 1;
2412             }
2413           if (f == 'i' && *from == 'F')
2414             return false;
2415           f = *from;
2416         }
2417       if (f != 'F' && f != 'f')
2418         return false;
2419       if ((size_t) (pfile->buffer->cur - from) < sizeof "fallthru" - 1)
2420         return false;
2421       if (f == 'F' && memcmp (from + 1, "ALL", sizeof "ALL" - 1) == 0)
2422         all_upper = true;
2423       else if (all_upper)
2424         return false;
2425       else if (memcmp (from + 1, "all", sizeof "all" - 1))
2426         return false;
2427       from += sizeof "fall" - 1;
2428       if (*from == (all_upper ? 'S' : 's') && from[1] == ' ')
2429         from += 2;
2430       else if (*from == ' ' || *from == '-')
2431         from++;
2432       else if (*from != (all_upper ? 'T' : 't'))
2433         return false;
2434       if ((f == 'f' || *from != 'T') && (all_upper || *from != 't'))  /* 'T' needs a capital 'F'; 't' needs !all_upper.  */
2435         return false;
2436       if ((size_t) (pfile->buffer->cur - from) < sizeof "thru" - 1)
2437         return false;
2438       if (memcmp (from + 1, all_upper ? "HRU" : "hru", sizeof "hru" - 1))
2439         {
2440           if ((size_t) (pfile->buffer->cur - from) < sizeof "through" - 1)
2441             return false;
2442           if (memcmp (from + 1, all_upper ? "HROUGH" : "hrough",
2443                       sizeof "hrough" - 1))
2444             return false;
2445           from += sizeof "through" - 1;
2446         }
2447       else
2448         from += sizeof "thru" - 1;
2449       while (*from == ' ' || *from == '\t' || *from == '.' || *from == '!')
2450         from++;
2451       if (*from == '-')  /* Optional free-text tail introduced by '-'.  */
2452         {
2453           from++;
2454           if (*comment_start == '*')
2455             {
2456               do
2457                 {
2458                   while (*from && *from != '*'
2459                          && *from != '\n' && *from != '\r')
2460                     from++;
2461                   if (*from != '*' || from[1] == '/')  /* Stop at the closing delimiter, a line end or a NUL.  */
2462                     break;
2463                   from++;
2464                 }
2465               while (1);
2466             }
2467           else
2468             while (*from && *from != '\n' && *from != '\r')
2469               from++;
2470         }
2471     }
2472   /* C block comment.  */
2473   if (*comment_start == '*')
2474     {
2475       if (*from != '*' || from[1] != '/')
2476         return false;
2477     }
2478   /* C++ line comment.  */
2479   else if (*from != '\n')
2480     return false;
2481
2482   return true;
2483 }
2484
2485 /* Give RUN fresh storage for COUNT tokens; it has no successor yet.  */
2486 void
2487 _cpp_init_tokenrun (tokenrun *run, unsigned int count)
2488 {
2489   run->next = NULL;
2490   run->base = XNEWVEC (cpp_token, count);
2491   run->limit = &run->base[count];
2492 }
2493
2494 /* Return the run after RUN, first linking in a new 250-token run if none.  */
2495 static tokenrun *
2496 next_tokenrun (tokenrun *run)
2497 {
2498   if (run->next)
2499     return run->next;
2500
2501   tokenrun *fresh = XNEW (tokenrun);
2502   fresh->prev = run;
2503   run->next = fresh;
2504   _cpp_init_tokenrun (fresh, 250);
2505   return fresh;
2506 }
2507
2508 /* Count the tokens of CONTEXT that have not been processed yet.  Aborts
2509    on a context whose tokens_kind is not one of the three known kinds.  */
2510 int
2511 _cpp_remaining_tokens_num_in_context (cpp_context *context)
2512 {
2513   if (context->tokens_kind == TOKENS_KIND_DIRECT)
2514     return LAST (context).token - FIRST (context).token;
2515
2516   if (context->tokens_kind != TOKENS_KIND_INDIRECT
2517       && context->tokens_kind != TOKENS_KIND_EXTENDED)
2518     abort ();
2519   return LAST (context).ptoken - FIRST (context).ptoken;
2520 }
2521
2522 /* Fetch the token INDEX positions past the next unprocessed token of
2523    CONTEXT; INDEX zero yields that next token itself.  */
2524 static const cpp_token *
2525 _cpp_token_from_context_at (cpp_context *context, int index)
2526 {
2527   if (context->tokens_kind == TOKENS_KIND_DIRECT)
2528     return FIRST (context).token + index;
2529
2530   if (context->tokens_kind != TOKENS_KIND_INDIRECT
2531       && context->tokens_kind != TOKENS_KIND_EXTENDED)
2532     abort ();
2533   return *(FIRST (context).ptoken + index);
2534 }
2535
2536 /* Look ahead in the input stream and return the token INDEX places ahead, 0 being the next.  */
2537 const cpp_token *
2538 cpp_peek_token (cpp_reader *pfile, int index)
2539 {
2540   cpp_context *context = pfile->context;
2541   const cpp_token *peektok;
2542   int count;
2543
2544   /* First, scan through any pending cpp_context objects.  */
2545   while (context->prev)
2546     {
2547       ptrdiff_t sz = _cpp_remaining_tokens_num_in_context (context);
2548
2549       if (index < (int) sz)
2550         return _cpp_token_from_context_at (context, index);
2551       index -= (int) sz;  /* Skip this context's tokens and try its parent.  */
2552       context = context->prev;
2553     }
2554
2555   /* We will have to read some new tokens after all (and do so
2556      without invalidating preceding tokens).  */
2557   count = index;  /* Remember how many tokens were still wanted here.  */
2558   pfile->keep_tokens++;
2559
2560   /* For peeked tokens temporarily disable line_change reporting,
2561      until the tokens are parsed for real.  */
2562   void (*line_change) (cpp_reader *, const cpp_token *, int)
2563     = pfile->cb.line_change;
2564   pfile->cb.line_change = NULL;
2565
2566   do
2567     {
2568       peektok = _cpp_lex_token (pfile);
2569       if (peektok->type == CPP_EOF)
2570         {
2571           index--;  /* Keeps COUNT - INDEX equal to the tokens lexed.  */
2572           break;
2573         }
2574       else if (peektok->type == CPP_PRAGMA)
2575         {
2576           /* Don't peek past a pragma.  */
2577           if (peektok == &pfile->directive_result)
2578             /* Save the pragma in the buffer.  */
2579             *pfile->cur_token++ = *peektok;
2580           index--;
2581           break;
2582         }
2583     }
2584   while (index--);
2585   /* COUNT - INDEX tokens were lexed above; back up over all of them.  */
2586   _cpp_backup_tokens_direct (pfile, count - index);
2587   pfile->keep_tokens--;
2588   pfile->cb.line_change = line_change;
2589   /* Either the requested token or the EOF/pragma that stopped the scan.  */
2590   return peektok;
2591 }
2592
2593 /* Allocate a single token that is invalidated at the same time as the
2594    rest of the tokens on the line.  Its src_loc is copied from the token
2595    preceding PFILE's cur_token (the last lexed token), so that diagnostics
2596    appear in the right place.  Pending lookahead tokens are preserved.  */
2597 cpp_token *
2598 _cpp_temp_token (cpp_reader *pfile)
2599 {
2600   cpp_token *old, *result;
2601   ptrdiff_t sz = pfile->cur_run->limit - pfile->cur_token;  /* Slots left in the current run.  */
2602   ptrdiff_t la = (ptrdiff_t) pfile->lookaheads;  /* Presumably the lookahead tokens stored from cur_token on; TODO confirm.  */
2603
2604   old = pfile->cur_token - 1;  /* Source of the new token's location.  */
2605   /* Any pre-existing lookaheads must not be clobbered.  */
2606   if (la)
2607     {
2608       if (sz <= la)  /* Moving them up one slot spills into the next run.  */
2609         {
2610           tokenrun *next = next_tokenrun (pfile->cur_run);
2611
2612           if (sz < la)  /* Lookaheads already in NEXT move up one slot.  */
2613             memmove (next->base + 1, next->base,
2614                      (la - sz) * sizeof (cpp_token));
2615
2616           next->base[0] = pfile->cur_run->limit[-1];  /* Last token of this run.  */
2617         }
2618
2619       if (sz > 1)  /* Shift the lookaheads that stay in this run.  */
2620         memmove (pfile->cur_token + 1, pfile->cur_token,
2621                  MIN (la, sz - 1) * sizeof (cpp_token));
2622     }
2623   /* With the current run exhausted, the new token starts the next run.  */
2624   if (!sz && pfile->cur_token == pfile->cur_run->limit)
2625     {
2626       pfile->cur_run = next_tokenrun (pfile->cur_run);
2627       pfile->cur_token = pfile->cur_run->base;
2628     }
2629
2630   result = pfile->cur_token++;
2631   result->src_loc = old->src_loc;
2632   return result;
2633 }
2634
2635 /* We're at the beginning of a logical line (so not in
2636   directives-mode) and RESULT is a CPP_NAME with NODE_MODULE set.  See
2637   if we should enter deferred_pragma mode to tokenize the rest of the
2638   line as a module control-line.  */
     /* PFILE is the reader whose lexer state is updated in place; RESULT
        is the token that begins the line.  Up to two further tokens are
        peeked with _cpp_lex_direct and are normally pushed back again
        before returning (see the end of the function).  If a
        control-line is recognized, in_deferred_pragma is left set, the
        leading keyword tokens are flagged NO_EXPAND and remapped to the
        [1] entries of n_modules, and directive_file_token receives
        HEADER_COUNT.  Otherwise save_comments, in_deferred_pragma and
        angled_headers are put back to the values asserted on entry.  */
2639
2640 static void
2641 cpp_maybe_module_directive (cpp_reader *pfile, cpp_token *result)
2642 {
2643   unsigned backup = 0; /* Tokens we peeked.  */
2644   cpp_hashnode *node = result->val.node.node;
2645   cpp_token *peek = result;
2646   cpp_token *keyword = peek;
2647   cpp_hashnode *(&n_modules)[spec_nodes::M_HWM][2] = pfile->spec_nodes.n_modules;
       /* Stays zero for `module'; nonzero for the import forms.  */
2648   int header_count = 0;
2649
2650   /* Make sure the incoming state is as we expect it.  This way we
2651      can restore it using constants.  */
2652   gcc_checking_assert (!pfile->state.in_deferred_pragma
2653                        && !pfile->state.skipping
2654                        && !pfile->state.parsing_args
2655                        && !pfile->state.angled_headers
2656                        && (pfile->state.save_comments
2657                            == !CPP_OPTION (pfile, discard_comments)));
2658
2659   /* Enter directives mode sufficiently for peeking.  We don't have
2660      to actually set in_directive.  */
2661   pfile->state.in_deferred_pragma = true;
2662
2663   /* These two fields are needed to process tokenization in deferred
2664      pragma mode.  They are not used outside deferred pragma mode or
2665      directives mode.  */
2666   pfile->state.pragma_allow_expansion = true;
2667   pfile->directive_line = result->src_loc;
2668
2669   /* Saving comments is incompatible with directives mode.   */
2670   pfile->state.save_comments = 0;
2671
       /* A leading `export' must be followed by another NODE_MODULE
          name; peek it and treat that token as the keyword.  */
2672   if (node == n_modules[spec_nodes::M_EXPORT][0])
2673     {
2674       peek = _cpp_lex_direct (pfile);
2675       keyword = peek;
2676       backup++;
2677       if (keyword->type != CPP_NAME)
2678         goto not_module;
2679       node = keyword->val.node.node;
2680       if (!(node->flags & NODE_MODULE))
2681         goto not_module;
2682     }
2683
       /* NOTE(review): the significance of the 2 and 16 addends is
          defined by whatever consumes directive_file_token, which is
          not visible in this function -- confirm there.  */
2684   if (node == n_modules[spec_nodes::M__IMPORT][0])
2685     /* __import  */
2686     header_count = backup + 2 + 16;
2687   else if (node == n_modules[spec_nodes::M_IMPORT][0])
2688     /* import  */
2689     header_count = backup + 2 + (CPP_OPTION (pfile, preprocessed) ? 16 : 0);
2690   else if (node == n_modules[spec_nodes::M_MODULE][0])
2691     ; /* module  */
2692   else
2693     goto not_module;
2694
2695   /* We've seen [export] {module|import|__import}.  Check the next token.  */
2696   if (header_count)
2697     /* After '{,__}import' a header name may appear.  */
2698     pfile->state.angled_headers = true;
2699   peek = _cpp_lex_direct (pfile);
2700   backup++;
2701
2702   /* ... import followed by identifier, ':', '<' or
2703      header-name preprocessing tokens, or module
2704      followed by cpp-identifier, ':' or ';' preprocessing
2705      tokens.  C++ keywords are not yet relevant.  */
2706   if (peek->type == CPP_NAME
2707       || peek->type == CPP_COLON
2708       ||  (header_count
2709            ? (peek->type == CPP_LESS
2710               || (peek->type == CPP_STRING && peek->val.str.text[0] != 'R')
2711               || peek->type == CPP_HEADER_NAME)
2712            : peek->type == CPP_SEMICOLON))
2713     {
           /* When expansion is not allowed, prevent_expansion is bumped
              here; the matching decrement is done where the lexer
              produces CPP_PRAGMA_EOL.  */
2714       pfile->state.pragma_allow_expansion = !CPP_OPTION (pfile, preprocessed);
2715       if (!pfile->state.pragma_allow_expansion)
2716         pfile->state.prevent_expansion++;
2717
2718       if (!header_count && linemap_included_from
2719           (LINEMAPS_LAST_ORDINARY_MAP (pfile->line_table)))
2720         cpp_error_with_line (pfile, CPP_DL_ERROR, keyword->src_loc, 0,
2721                              "module control-line cannot be in included file");
2722
2723       /* The first one or two tokens cannot be macro names.  */
           /* BACKUP is 1 or 2 here.  IX == 0 selects RESULT (the first
              token on the line); IX == 1 is only reached when `export'
              was peeked past and selects KEYWORD.  */
2724       for (int ix = backup; ix--;)
2725         {
2726           cpp_token *tok = ix ? keyword : result;
2727           cpp_hashnode *node = tok->val.node.node;
2728
2729           /* Don't attempt to expand the token.  */
2730           tok->flags |= NO_EXPAND;
2731           if (_cpp_defined_macro_p (node)
2732               && _cpp_maybe_notify_macro_use (pfile, node, tok->src_loc)
2733               && !cpp_fun_like_macro_p (node))
2734             cpp_error_with_line (pfile, CPP_DL_ERROR, tok->src_loc, 0,
2735                                  "module control-line \"%s\" cannot be"
2736                                  " an object-like macro",
2737                                  NODE_NAME (node));
2738         }
2739
2740       /* Map to underbar variants.  */
2741       keyword->val.node.node = n_modules[header_count
2742                                          ? spec_nodes::M_IMPORT
2743                                          : spec_nodes::M_MODULE][1];
2744       if (backup != 1)
2745         result->val.node.node = n_modules[spec_nodes::M_EXPORT][1];
2746
2747       /* Maybe tell the tokenizer we expect a header-name down the
2748          road.  */
2749       pfile->state.directive_file_token = header_count;
2750     }
2751   else
2752     {
         /* Also entered by the gotos above, with BACKUP possibly still
            zero (in which case PEEK is RESULT itself).  */
2753     not_module:
2754       /* Drop out of directive mode.  */
2755       /* We asserted save_comments had this value upon entry.  */
2756       pfile->state.save_comments
2757         = !CPP_OPTION (pfile, discard_comments);
2758       pfile->state.in_deferred_pragma = false;
2759       /* Do not let this remain on.  */
2760       pfile->state.angled_headers = false;
2761     }
2762
2763   /* In either case we want to backup the peeked tokens.  */
2764   if (backup)
2765     {
2766       /* If we saw EOL, we should drop it, because this isn't a module
2767          control-line after all.  */
2768       bool eol = peek->type == CPP_PRAGMA_EOL;
           /* If the only peeked token was the EOL there is nothing to
              push back at all.  */
2769       if (!eol || backup > 1)
2770         {
2771           /* Put the peeked tokens back.  */
2772           _cpp_backup_tokens_direct (pfile, backup);
2773           /* But if the last one was an EOL, forget it.  */
2774           if (eol)
2775             pfile->lookaheads--;
2776         }
2777     }
2778 }
2779
2780 /* Lex a token into RESULT (external interface).  Takes care of issues
2781    like directive handling, token lookahead, multiple include
2782    optimization and skipping.  */
2783 const cpp_token *
2784 _cpp_lex_token (cpp_reader *pfile)
2785 {
2786   cpp_token *result;
2787
2788   for (;;)
2789     {
2790       if (pfile->cur_token == pfile->cur_run->limit)
2791         {
2792           pfile->cur_run = next_tokenrun (pfile->cur_run);
2793           pfile->cur_token = pfile->cur_run->base;
2794         }
2795       /* We assume that the current token is somewhere in the current
2796          run.  */
2797       if (pfile->cur_token < pfile->cur_run->base
2798           || pfile->cur_token >= pfile->cur_run->limit)
2799         abort ();
2800
2801       if (pfile->lookaheads)
2802         {
2803           pfile->lookaheads--;
2804           result = pfile->cur_token++;
2805         }
2806       else
2807         result = _cpp_lex_direct (pfile);
2808
2809       if (result->flags & BOL)
2810         {
2811           /* Is this a directive.  If _cpp_handle_directive returns
2812              false, it is an assembler #.  */
2813           if (result->type == CPP_HASH
2814               /* 6.10.3 p 11: Directives in a list of macro arguments
2815                  gives undefined behavior.  This implementation
2816                  handles the directive as normal.  */
2817               && pfile->state.parsing_args != 1)
2818             {
2819               if (_cpp_handle_directive (pfile, result->flags & PREV_WHITE))
2820                 {
2821                   if (pfile->directive_result.type == CPP_PADDING)
2822                     continue;
2823                   result = &pfile->directive_result;
2824                 }
2825             }
2826           else if (pfile->state.in_deferred_pragma)
2827             result = &pfile->directive_result;
2828           else if (result->type == CPP_NAME
2829                    && (result->val.node.node->flags & NODE_MODULE)
2830                    && !pfile->state.skipping
2831                    /* Unlike regular directives, we do not deal with
2832                       tokenizing module directives as macro arguments.
2833                       That's not permitted.  */
2834                    && !pfile->state.parsing_args)
2835             {
2836               /* P1857.  Before macro expansion, At start of logical
2837                  line ... */
2838               /* We don't have to consider lookaheads at this point.  */
2839               gcc_checking_assert (!pfile->lookaheads);
2840
2841               cpp_maybe_module_directive (pfile, result);
2842             }
2843
2844           if (pfile->cb.line_change && !pfile->state.skipping)
2845             pfile->cb.line_change (pfile, result, pfile->state.parsing_args);
2846         }
2847
2848       /* We don't skip tokens in directives.  */
2849       if (pfile->state.in_directive || pfile->state.in_deferred_pragma)
2850         break;
2851
2852       /* Outside a directive, invalidate controlling macros.  At file
2853          EOF, _cpp_lex_direct takes care of popping the buffer, so we never
2854          get here and MI optimization works.  */
2855       pfile->mi_valid = false;
2856
2857       if (!pfile->state.skipping || result->type == CPP_EOF)
2858         break;
2859     }
2860
2861   return result;
2862 }
2863
2864 /* Returns true if a fresh line has been loaded.  */
2865 bool
2866 _cpp_get_fresh_line (cpp_reader *pfile)
2867 {
2868   /* We can't get a new line until we leave the current directive.  */
2869   if (pfile->state.in_directive)
2870     return false;
2871
2872   for (;;)
2873     {
2874       cpp_buffer *buffer = pfile->buffer;
2875
2876       if (!buffer->need_line)
2877         return true;
2878
2879       if (buffer->next_line < buffer->rlimit)
2880         {
2881           _cpp_clean_line (pfile);
2882           return true;
2883         }
2884
2885       /* First, get out of parsing arguments state.  */
2886       if (pfile->state.parsing_args)
2887         return false;
2888
2889       /* End of buffer.  Non-empty files should end in a newline.  */
2890       if (buffer->buf != buffer->rlimit
2891           && buffer->next_line > buffer->rlimit
2892           && !buffer->from_stage3)
2893         {
2894           /* Clip to buffer size.  */
2895           buffer->next_line = buffer->rlimit;
2896         }
2897
2898       if (buffer->prev && !buffer->return_at_eof)
2899         _cpp_pop_buffer (pfile);
2900       else
2901         {
2902           /* End of translation.  Do not pop the buffer yet. Increment
2903              line number so that the EOF token is on a line of its own
2904              (_cpp_lex_direct doesn't increment in that case, because
2905              it's hard for it to distinguish this special case). */
2906           CPP_INCREMENT_LINE (pfile, 0);
2907           return false;
2908         }
2909     }
2910 }
2911
/* Set result->type to THEN_TYPE and consume one character if the next
   unread character in `buffer' is CHAR; otherwise set it to ELSE_TYPE
   and consume nothing.  Expects `buffer' and `result' to be in scope
   at the point of use.  */
#define IF_NEXT_IS(CHAR, THEN_TYPE, ELSE_TYPE)          \
  do                                                    \
    {                                                   \
      if (*buffer->cur == CHAR)                         \
        {                                               \
          buffer->cur++;                                \
          result->type = THEN_TYPE;                     \
        }                                               \
      else                                              \
        result->type = ELSE_TYPE;                       \
    }                                                   \
  while (0)
2920
2921 /* Lex a token into pfile->cur_token, which is also incremented, to
2922    get diagnostics pointing to the correct location.
2923
2924    Does not handle issues such as token lookahead, multiple-include
2925    optimization, directives, skipping etc.  This function is only
2926    suitable for use by _cpp_lex_token, and in special cases like
2927    lex_expansion_token which doesn't care for any of these issues.
2928
2929    When meeting a newline, returns CPP_EOF if parsing a directive,
2930    otherwise returns to the start of the token buffer if permissible.
2931    Returns a pointer to the lexed token.  */
2932 cpp_token *
2933 _cpp_lex_direct (cpp_reader *pfile)
2934 {
2935   cppchar_t c;
2936   cpp_buffer *buffer;
2937   const unsigned char *comment_start;
2938   bool fallthrough_comment = false;
       /* Provisionally claim the next token slot; it is replaced below
          if the token buffer is rewound at the start of a line.  */
2939   cpp_token *result = pfile->cur_token++;
2940
       /* Entered initially, and again after each newline consumed
          outside a deferred pragma.  */
2941  fresh_line:
2942   result->flags = 0;
2943   buffer = pfile->buffer;
2944   if (buffer->need_line)
2945     {
2946       gcc_assert (!pfile->state.in_deferred_pragma);
2947       if (!_cpp_get_fresh_line (pfile))
2948         {
2949           result->type = CPP_EOF;
2950           /* Not a real EOF in a directive or arg parsing -- we refuse
2951              to advance to the next file now, and will once we're out
2952              of those modes.  */
2953           if (!pfile->state.in_directive && !pfile->state.parsing_args)
2954             {
2955               /* Tell the compiler the line number of the EOF token.  */
2956               result->src_loc = pfile->line_table->highest_line;
2957               result->flags = BOL;
2958               /* Now pop the buffer that _cpp_get_fresh_line did not.  */
2959               _cpp_pop_buffer (pfile);
2960             }
2961           return result;
2962         }
           /* A pending fallthrough comment does not carry over into a
              different buffer.  */
2963       if (buffer != pfile->buffer)
2964         fallthrough_comment = false;
           /* Unless tokens must be kept, rewind to the first slot of
              the base run and lex into that instead.  */
2965       if (!pfile->keep_tokens)
2966         {
2967           pfile->cur_run = &pfile->base_run;
2968           result = pfile->base_run.base;
2969           pfile->cur_token = result + 1;
2970         }
2971       result->flags = BOL;
2972       if (pfile->state.parsing_args == 2)
2973         result->flags |= PREV_WHITE;
2974     }
2975   buffer = pfile->buffer;
       /* Re-entered after a comment that is not saved as a token.  */
2976  update_tokens_line:
2977   result->src_loc = pfile->line_table->highest_line;
2978
       /* Re-entered after whitespace (or a NUL) has been skipped.  */
2979  skipped_white:
2980   if (buffer->cur >= buffer->notes[buffer->cur_note].pos
2981       && !pfile->overlaid_buffer)
2982     {
2983       _cpp_process_line_notes (pfile, false);
2984       result->src_loc = pfile->line_table->highest_line;
2985     }
2986   c = *buffer->cur++;
2987
       /* Note that buffer->cur now points one past C.  */
2988   if (pfile->forced_token_location)
2989     result->src_loc = pfile->forced_token_location;
2990   else
2991     result->src_loc = linemap_position_for_column (pfile->line_table,
2992                                           CPP_BUF_COLUMN (buffer, buffer->cur));
2993
2994   switch (c)
2995     {
2996     case ' ': case '\t': case '\f': case '\v': case '\0':
2997       result->flags |= PREV_WHITE;
2998       skip_whitespace (pfile, c);
2999       goto skipped_white;
3000
3001     case '\n':
3002       /* Increment the line, unless this is the last line ...  */
3003       if (buffer->cur < buffer->rlimit
3004           /* ... or this is a #include, (where _cpp_stack_file needs to
3005              unwind by one line) ...  */
3006           || (pfile->state.in_directive > 1
3007               /* ... except traditional-cpp increments this elsewhere.  */
3008               && !CPP_OPTION (pfile, traditional)))
3009         CPP_INCREMENT_LINE (pfile, 0);
3010       buffer->need_line = true;
3011       if (pfile->state.in_deferred_pragma)
3012         {
3013           /* Produce the PRAGMA_EOL on this line.  File reading
3014              ensures there is always a \n at end of the buffer, thus
3015              in a deferred pragma we always see CPP_PRAGMA_EOL before
3016              any CPP_EOF.  */
3017           result->type = CPP_PRAGMA_EOL;
3018           result->flags &= ~PREV_WHITE;
3019           pfile->state.in_deferred_pragma = false;
3020           if (!pfile->state.pragma_allow_expansion)
3021             pfile->state.prevent_expansion--;
3022           return result;
3023         }
3024       goto fresh_line;
3025
3026     case '0': case '1': case '2': case '3': case '4':
3027     case '5': case '6': case '7': case '8': case '9':
3028       {
3029         struct normalize_state nst = INITIAL_NORMALIZE_STATE;
3030         result->type = CPP_NUMBER;
3031         lex_number (pfile, &result->val.str, &nst);
3032         warn_about_normalization (pfile, result, &nst);
3033         break;
3034       }
3035
3036     case 'L':
3037     case 'u':
3038     case 'U':
3039     case 'R':
3040       /* 'L', 'u', 'U', 'u8' or 'R' may introduce wide characters,
3041          wide strings or raw strings.  */
3042       if (c == 'L' || CPP_OPTION (pfile, rliterals)
3043           || (c != 'R' && CPP_OPTION (pfile, uliterals)))
3044         {
3045           if ((*buffer->cur == '\'' && c != 'R')
3046               || *buffer->cur == '"'
3047               || (*buffer->cur == 'R'
3048                   && c != 'R'
3049                   && buffer->cur[1] == '"'
3050                   && CPP_OPTION (pfile, rliterals))
3051               || (*buffer->cur == '8'
3052                   && c == 'u'
3053                   && ((buffer->cur[1] == '"' || (buffer->cur[1] == '\''
3054                                 && CPP_OPTION (pfile, utf8_char_literals)))
3055                       || (buffer->cur[1] == 'R' && buffer->cur[2] == '"'
3056                           && CPP_OPTION (pfile, rliterals)))))
3057             {
3058               lex_string (pfile, result, buffer->cur - 1);
3059               break;
3060             }
3061         }
3062       /* Fall through.  */
3063
         /* 'L', 'u', 'U' and 'R' are absent from the labels below
            because the case above handles them and falls through to
            here when they do not begin a literal.  */
3064     case '_':
3065     case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
3066     case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
3067     case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
3068     case 's': case 't':           case 'v': case 'w': case 'x':
3069     case 'y': case 'z':
3070     case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
3071     case 'G': case 'H': case 'I': case 'J': case 'K':
3072     case 'M': case 'N': case 'O': case 'P': case 'Q':
3073     case 'S': case 'T':           case 'V': case 'W': case 'X':
3074     case 'Y': case 'Z':
3075       result->type = CPP_NAME;
3076       {
3077         struct normalize_state nst = INITIAL_NORMALIZE_STATE;
3078         result->val.node.node = lex_identifier (pfile, buffer->cur - 1, false,
3079                                                 &nst,
3080                                                 &result->val.node.spelling);
3081         warn_about_normalization (pfile, result, &nst);
3082       }
3083
3084       /* Convert named operators to their proper types.  */
3085       if (result->val.node.node->flags & NODE_OPERATOR)
3086         {
3087           result->flags |= NAMED_OP;
3088           result->type = (enum cpp_ttype) result->val.node.node->directive_index;
3089         }
3090
3091       /* Signal FALLTHROUGH comment followed by another token.  */
3092       if (fallthrough_comment)
3093         result->flags |= PREV_FALLTHROUGH;
3094       break;
3095
3096     case '\'':
3097     case '"':
3098       lex_string (pfile, result, buffer->cur - 1);
3099       break;
3100
3101     case '/':
3102       /* A potential block or line comment.  */
3103       comment_start = buffer->cur;
           /* From here on C holds the character after the '/'.  */
3104       c = *buffer->cur;
3105
3106       if (c == '*')
3107         {
3108           if (_cpp_skip_block_comment (pfile))
3109             cpp_error (pfile, CPP_DL_ERROR, "unterminated comment");
3110         }
3111       else if (c == '/' && ! CPP_OPTION (pfile, traditional))
3112         {
3113           /* Don't warn for system headers.  */
3114           if (_cpp_in_system_header (pfile))
3115             ;
3116           /* Warn about comments if pedantically GNUC89, and not
3117              in system headers.  */
3118           else if (CPP_OPTION (pfile, lang) == CLK_GNUC89
3119                    && CPP_PEDANTIC (pfile)
3120                    && ! buffer->warned_cplusplus_comments)
3121             {
3122               if (cpp_error (pfile, CPP_DL_PEDWARN,
3123                              "C++ style comments are not allowed in ISO C90"))
3124                 cpp_error (pfile, CPP_DL_NOTE,
3125                            "(this will be reported only once per input file)");
3126               buffer->warned_cplusplus_comments = 1;
3127             }
3128           /* Or if specifically desired via -Wc90-c99-compat.  */
3129           else if (CPP_OPTION (pfile, cpp_warn_c90_c99_compat) > 0
3130                    && ! CPP_OPTION (pfile, cplusplus)
3131                    && ! buffer->warned_cplusplus_comments)
3132             {
3133               if (cpp_error (pfile, CPP_DL_WARNING,
3134                              "C++ style comments are incompatible with C90"))
3135                 cpp_error (pfile, CPP_DL_NOTE,
3136                            "(this will be reported only once per input file)");
3137               buffer->warned_cplusplus_comments = 1;
3138             }
3139           /* In C89/C94, C++ style comments are forbidden.  */
3140           else if ((CPP_OPTION (pfile, lang) == CLK_STDC89
3141                     || CPP_OPTION (pfile, lang) == CLK_STDC94))
3142             {
3143               /* But don't be confused about valid code such as
3144                  - // immediately followed by *,
3145                  - // in a preprocessing directive,
3146                  - // in an #if 0 block.  */
3147               if (buffer->cur[1] == '*'
3148                   || pfile->state.in_directive
3149                   || pfile->state.skipping)
3150                 {
3151                   result->type = CPP_DIV;
3152                   break;
3153                 }
3154               else if (! buffer->warned_cplusplus_comments)
3155                 {
3156                   if (cpp_error (pfile, CPP_DL_ERROR,
3157                                  "C++ style comments are not allowed in "
3158                                  "ISO C90"))
3159                     cpp_error (pfile, CPP_DL_NOTE,
3160                                "(this will be reported only once per input "
3161                                "file)");
3162                   buffer->warned_cplusplus_comments = 1;
3163                 }
3164             }
3165           if (skip_line_comment (pfile) && CPP_OPTION (pfile, warn_comments))
3166             cpp_warning (pfile, CPP_W_COMMENTS, "multi-line comment");
3167         }
3168       else if (c == '=')
3169         {
3170           buffer->cur++;
3171           result->type = CPP_DIV_EQ;
3172           break;
3173         }
3174       else
3175         {
3176           result->type = CPP_DIV;
3177           break;
3178         }
3179
           /* Only reached once a block or line comment has been
              skipped; every other path above has left the switch.  */
3180       if (fallthrough_comment_p (pfile, comment_start))
3181         fallthrough_comment = true;
3182
3183       if (pfile->cb.comment)
3184         {
3185           size_t len = pfile->buffer->cur - comment_start;
3186           pfile->cb.comment (pfile, result->src_loc, comment_start - 1,
3187                              len + 1);
3188         }
3189
           /* A discarded comment counts as whitespace before the token
              that follows it.  */
3190       if (!pfile->state.save_comments)
3191         {
3192           result->flags |= PREV_WHITE;
3193           goto update_tokens_line;
3194         }
3195
3196       if (fallthrough_comment)
3197         result->flags |= PREV_FALLTHROUGH;
3198
3199       /* Save the comment as a token in its own right.  */
3200       save_comment (pfile, result, comment_start, c);
3201       break;
3202
3203     case '<':
           /* When a header-name may appear, let lex_string try first;
              any result other than CPP_LESS is final.  */
3204       if (pfile->state.angled_headers)
3205         {
3206           lex_string (pfile, result, buffer->cur - 1);
3207           if (result->type != CPP_LESS)
3208             break;
3209         }
3210
3211       result->type = CPP_LESS;
3212       if (*buffer->cur == '=')
3213         {
3214           buffer->cur++, result->type = CPP_LESS_EQ;
3215           if (*buffer->cur == '>'
3216               && CPP_OPTION (pfile, cplusplus)
3217               && CPP_OPTION (pfile, lang) >= CLK_GNUCXX20)
3218             buffer->cur++, result->type = CPP_SPACESHIP;
3219         }
3220       else if (*buffer->cur == '<')
3221         {
3222           buffer->cur++;
3223           IF_NEXT_IS ('=', CPP_LSHIFT_EQ, CPP_LSHIFT);
3224         }
3225       else if (CPP_OPTION (pfile, digraphs))
3226         {
3227           if (*buffer->cur == ':')
3228             {
3229               /* C++11 [2.5/3 lex.pptoken], "Otherwise, if the next
3230                  three characters are <:: and the subsequent character
3231                  is neither : nor >, the < is treated as a preprocessor
3232                  token by itself".  */
3233               if (CPP_OPTION (pfile, cplusplus)
3234                   && CPP_OPTION (pfile, lang) != CLK_CXX98
3235                   && CPP_OPTION (pfile, lang) != CLK_GNUCXX
3236                   && buffer->cur[1] == ':'
3237                   && buffer->cur[2] != ':' && buffer->cur[2] != '>')
3238                 break;
3239
3240               buffer->cur++;
3241               result->flags |= DIGRAPH;
3242               result->type = CPP_OPEN_SQUARE;
3243             }
3244           else if (*buffer->cur == '%')
3245             {
3246               buffer->cur++;
3247               result->flags |= DIGRAPH;
3248               result->type = CPP_OPEN_BRACE;
3249             }
3250         }
3251       break;
3252
3253     case '>':
3254       result->type = CPP_GREATER;
3255       if (*buffer->cur == '=')
3256         buffer->cur++, result->type = CPP_GREATER_EQ;
3257       else if (*buffer->cur == '>')
3258         {
3259           buffer->cur++;
3260           IF_NEXT_IS ('=', CPP_RSHIFT_EQ, CPP_RSHIFT);
3261         }
3262       break;
3263
3264     case '%':
3265       result->type = CPP_MOD;
3266       if (*buffer->cur == '=')
3267         buffer->cur++, result->type = CPP_MOD_EQ;
3268       else if (CPP_OPTION (pfile, digraphs))
3269         {
3270           if (*buffer->cur == ':')
3271             {
3272               buffer->cur++;
3273               result->flags |= DIGRAPH;
3274               result->type = CPP_HASH;
3275               if (*buffer->cur == '%' && buffer->cur[1] == ':')
3276                 buffer->cur += 2, result->type = CPP_PASTE, result->val.token_no = 0;
3277             }
3278           else if (*buffer->cur == '>')
3279             {
3280               buffer->cur++;
3281               result->flags |= DIGRAPH;
3282               result->type = CPP_CLOSE_BRACE;
3283             }
3284         }
3285       break;
3286
3287     case '.':
3288       result->type = CPP_DOT;
3289       if (ISDIGIT (*buffer->cur))
3290         {
3291           struct normalize_state nst = INITIAL_NORMALIZE_STATE;
3292           result->type = CPP_NUMBER;
3293           lex_number (pfile, &result->val.str, &nst);
3294           warn_about_normalization (pfile, result, &nst);
3295         }
3296       else if (*buffer->cur == '.' && buffer->cur[1] == '.')
3297         buffer->cur += 2, result->type = CPP_ELLIPSIS;
3298       else if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
3299         buffer->cur++, result->type = CPP_DOT_STAR;
3300       break;
3301
3302     case '+':
3303       result->type = CPP_PLUS;
3304       if (*buffer->cur == '+')
3305         buffer->cur++, result->type = CPP_PLUS_PLUS;
3306       else if (*buffer->cur == '=')
3307         buffer->cur++, result->type = CPP_PLUS_EQ;
3308       break;
3309
3310     case '-':
3311       result->type = CPP_MINUS;
3312       if (*buffer->cur == '>')
3313         {
3314           buffer->cur++;
3315           result->type = CPP_DEREF;
3316           if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
3317             buffer->cur++, result->type = CPP_DEREF_STAR;
3318         }
3319       else if (*buffer->cur == '-')
3320         buffer->cur++, result->type = CPP_MINUS_MINUS;
3321       else if (*buffer->cur == '=')
3322         buffer->cur++, result->type = CPP_MINUS_EQ;
3323       break;
3324
3325     case '&':
3326       result->type = CPP_AND;
3327       if (*buffer->cur == '&')
3328         buffer->cur++, result->type = CPP_AND_AND;
3329       else if (*buffer->cur == '=')
3330         buffer->cur++, result->type = CPP_AND_EQ;
3331       break;
3332
3333     case '|':
3334       result->type = CPP_OR;
3335       if (*buffer->cur == '|')
3336         buffer->cur++, result->type = CPP_OR_OR;
3337       else if (*buffer->cur == '=')
3338         buffer->cur++, result->type = CPP_OR_EQ;
3339       break;
3340
3341     case ':':
3342       result->type = CPP_COLON;
3343       if (*buffer->cur == ':' && CPP_OPTION (pfile, scope))
3344         buffer->cur++, result->type = CPP_SCOPE;
3345       else if (*buffer->cur == '>' && CPP_OPTION (pfile, digraphs))
3346         {
3347           buffer->cur++;
3348           result->flags |= DIGRAPH;
3349           result->type = CPP_CLOSE_SQUARE;
3350         }
3351       break;
3352
3353     case '*': IF_NEXT_IS ('=', CPP_MULT_EQ, CPP_MULT); break;
3354     case '=': IF_NEXT_IS ('=', CPP_EQ_EQ, CPP_EQ); break;
3355     case '!': IF_NEXT_IS ('=', CPP_NOT_EQ, CPP_NOT); break;
3356     case '^': IF_NEXT_IS ('=', CPP_XOR_EQ, CPP_XOR); break;
3357     case '#': IF_NEXT_IS ('#', CPP_PASTE, CPP_HASH); result->val.token_no = 0; break;
3358
3359     case '?': result->type = CPP_QUERY; break;
3360     case '~': result->type = CPP_COMPL; break;
3361     case ',': result->type = CPP_COMMA; break;
3362     case '(': result->type = CPP_OPEN_PAREN; break;
3363     case ')': result->type = CPP_CLOSE_PAREN; break;
3364     case '[': result->type = CPP_OPEN_SQUARE; break;
3365     case ']': result->type = CPP_CLOSE_SQUARE; break;
3366     case '{': result->type = CPP_OPEN_BRACE; break;
3367     case '}': result->type = CPP_CLOSE_BRACE; break;
3368     case ';': result->type = CPP_SEMICOLON; break;
3369
3370       /* @ is a punctuator in Objective-C.  */
3371     case '@': result->type = CPP_ATSIGN; break;
3372
3373     default:
3374       {
             /* Step back so that BASE points at the character held in C.  */
3375         const uchar *base = --buffer->cur;
3376
3377         /* Check for an extended identifier ($ or UCN or UTF-8).  */
3378         struct normalize_state nst = INITIAL_NORMALIZE_STATE;
3379         if (forms_identifier_p (pfile, true, &nst))
3380           {
3381             result->type = CPP_NAME;
3382             result->val.node.node = lex_identifier (pfile, base, true, &nst,
3383                                                     &result->val.node.spelling);
3384             warn_about_normalization (pfile, result, &nst);
3385             break;
3386           }
3387
3388         /* Otherwise this will form a CPP_OTHER token.  Parse valid UTF-8 as a
3389            single token.  */
3390         buffer->cur++;
3391         if (c >= utf8_signifier)
3392           {
3393             const uchar *pstr = base;
3394             cppchar_t s;
3395             if (_cpp_valid_utf8 (pfile, &pstr, buffer->rlimit, 0, NULL, &s))
3396               buffer->cur = pstr;
3397           }
3398         create_literal (pfile, result, base, buffer->cur - base, CPP_OTHER);
3399         break;
3400       }
3401
3402     }
3403
3404   /* Potentially convert the location of the token to a range.  */
3405   if (result->src_loc >= RESERVED_LOCATION_COUNT
3406       && result->type != CPP_EOF)
3407     {
3408       /* Ensure that any line notes are processed, so that we have the
3409          correct physical line/column for the end-point of the token even
3410          when a logical line is split via one or more backslashes.  */
3411       if (buffer->cur >= buffer->notes[buffer->cur_note].pos
3412           && !pfile->overlaid_buffer)
3413         _cpp_process_line_notes (pfile, false);
3414
3415       source_range tok_range;
3416       tok_range.m_start = result->src_loc;
3417       tok_range.m_finish
3418         = linemap_position_for_column (pfile->line_table,
3419                                        CPP_BUF_COLUMN (buffer, buffer->cur));
3420
3421       result->src_loc = COMBINE_LOCATION_DATA (pfile->line_table,
3422                                                result->src_loc,
3423                                                tok_range, NULL);
3424     }
3425
3426   return result;
3427 }
3428
3429 /* An upper bound on the number of bytes needed to spell TOKEN.
3430    Does not include preceding whitespace.  */
3431 unsigned int
3432 cpp_token_len (const cpp_token *token)
3433 {
3434   unsigned int len;
3435
3436   switch (TOKEN_SPELL (token))
3437     {
3438     default:            len = 6;                                break;
3439     case SPELL_LITERAL: len = token->val.str.len;               break;
3440     case SPELL_IDENT:   len = NODE_LEN (token->val.node.node) * 10;     break;
3441     }
3442
3443   return len;
3444 }
3445
/* Decode the UTF-8 sequence starting at NAME and store its code point
   in BUFFER as a \U escape with eight lowercase hex digits (exactly 10
   bytes, no terminator).  Returns the number of bytes of NAME that
   made up the sequence.  Aborts if a continuation byte is
   ill-formed.  */

static size_t
utf8_to_ucn (unsigned char *buffer, const unsigned char *name)
{
  static const char hex_digits[] = "0123456789abcdef";
  const unsigned char lead = name[0];
  int seq_len = 0;
  unsigned long code_point;

  /* The number of leading one bits in the first byte gives the length
     of the whole sequence.  */
  for (unsigned mask = 0x80; mask != 0 && (lead & mask) != 0; mask >>= 1)
    seq_len++;

  /* The payload of the first byte is whatever follows the length
     marker.  */
  code_point = lead & (0x7F >> seq_len);

  /* Each continuation byte contributes its low six bits.  */
  for (int k = 1; k < seq_len; k++)
    {
      const unsigned char cont = name[k];

      /* Ill-formed UTF-8.  */
      if ((cont & 0xC0) != 0x80)
        abort ();

      code_point = (code_point << 6) | (cont & 0x3F);
    }

  buffer[0] = '\\';
  buffer[1] = 'U';
  for (int digit = 0; digit < 8; digit++)
    buffer[2 + digit] = hex_digits[(code_point >> (4 * (7 - digit))) & 0xF];

  return seq_len;
}
3479
3480 /* Given a token TYPE corresponding to a digraph, return a pointer to
3481    the spelling of the digraph.  */
3482 static const unsigned char *
3483 cpp_digraph2name (enum cpp_ttype type)
3484 {
3485   return digraph_spellings[(int) type - (int) CPP_FIRST_DIGRAPH];
3486 }
3487
3488 /* Write the spelling of an identifier IDENT, using UCNs, to BUFFER.
3489    The buffer must already contain the enough space to hold the
3490    token's spelling.  Returns a pointer to the character after the
3491    last character written.  */
3492 unsigned char *
3493 _cpp_spell_ident_ucns (unsigned char *buffer, cpp_hashnode *ident)
3494 {
3495   size_t i;
3496   const unsigned char *name = NODE_NAME (ident);
3497
3498   for (i = 0; i < NODE_LEN (ident); i++)
3499     if (name[i] & ~0x7F)
3500       {
3501         i += utf8_to_ucn (buffer, name + i) - 1;
3502         buffer += 10;
3503       }
3504     else
3505       *buffer++ = name[i];
3506
3507   return buffer;
3508 }
3509
3510 /* Write the spelling of a token TOKEN to BUFFER.  The buffer must
3511    already contain the enough space to hold the token's spelling.
3512    Returns a pointer to the character after the last character written.
3513    FORSTRING is true if this is to be the spelling after translation
3514    phase 1 (with the original spelling of extended identifiers), false
3515    if extended identifiers should always be written using UCNs (there is
3516    no option for always writing them in the internal UTF-8 form).
3517    FIXME: Would be nice if we didn't need the PFILE argument.  */
3518 unsigned char *
3519 cpp_spell_token (cpp_reader *pfile, const cpp_token *token,
3520                  unsigned char *buffer, bool forstring)
3521 {
3522   switch (TOKEN_SPELL (token))
3523     {
3524     case SPELL_OPERATOR:
3525       {
3526         const unsigned char *spelling;
3527         unsigned char c;
3528
3529         if (token->flags & DIGRAPH)
3530           spelling = cpp_digraph2name (token->type);
3531         else if (token->flags & NAMED_OP)
3532           goto spell_ident;
3533         else
3534           spelling = TOKEN_NAME (token);
3535
3536         while ((c = *spelling++) != '\0')
3537           *buffer++ = c;
3538       }
3539       break;
3540
3541     spell_ident:
3542     case SPELL_IDENT:
3543       if (forstring)
3544         {
3545           memcpy (buffer, NODE_NAME (token->val.node.spelling),
3546                   NODE_LEN (token->val.node.spelling));
3547           buffer += NODE_LEN (token->val.node.spelling);
3548         }
3549       else
3550         buffer = _cpp_spell_ident_ucns (buffer, token->val.node.node);
3551       break;
3552
3553     case SPELL_LITERAL:
3554       memcpy (buffer, token->val.str.text, token->val.str.len);
3555       buffer += token->val.str.len;
3556       break;
3557
3558     case SPELL_NONE:
3559       cpp_error (pfile, CPP_DL_ICE,
3560                  "unspellable token %s", TOKEN_NAME (token));
3561       break;
3562     }
3563
3564   return buffer;
3565 }
3566
3567 /* Returns TOKEN spelt as a null-terminated string.  The string is
3568    freed when the reader is destroyed.  Useful for diagnostics.  */
3569 unsigned char *
3570 cpp_token_as_text (cpp_reader *pfile, const cpp_token *token)
3571 {
3572   unsigned int len = cpp_token_len (token) + 1;
3573   unsigned char *start = _cpp_unaligned_alloc (pfile, len), *end;
3574
3575   end = cpp_spell_token (pfile, token, start, false);
3576   end[0] = '\0';
3577
3578   return start;
3579 }
3580
3581 /* Returns a pointer to a string which spells the token defined by
3582    TYPE and FLAGS.  Used by C front ends, which really should move to
3583    using cpp_token_as_text.  */
3584 const char *
3585 cpp_type2name (enum cpp_ttype type, unsigned char flags)
3586 {
3587   if (flags & DIGRAPH)
3588     return (const char *) cpp_digraph2name (type);
3589   else if (flags & NAMED_OP)
3590     return cpp_named_operator2name (type);
3591
3592   return (const char *) token_spellings[type].name;
3593 }
3594
3595 /* Writes the spelling of token to FP, without any preceding space.
3596    Separated from cpp_spell_token for efficiency - to avoid stdio
3597    double-buffering.  */
3598 void
3599 cpp_output_token (const cpp_token *token, FILE *fp)
3600 {
3601   switch (TOKEN_SPELL (token))
3602     {
3603     case SPELL_OPERATOR:
3604       {
3605         const unsigned char *spelling;
3606         int c;
3607
3608         if (token->flags & DIGRAPH)
3609           spelling = cpp_digraph2name (token->type);
3610         else if (token->flags & NAMED_OP)
3611           goto spell_ident;
3612         else
3613           spelling = TOKEN_NAME (token);
3614
3615         c = *spelling;
3616         do
3617           putc (c, fp);
3618         while ((c = *++spelling) != '\0');
3619       }
3620       break;
3621
3622     spell_ident:
3623     case SPELL_IDENT:
3624       {
3625         size_t i;
3626         const unsigned char * name = NODE_NAME (token->val.node.node);
3627
3628         for (i = 0; i < NODE_LEN (token->val.node.node); i++)
3629           if (name[i] & ~0x7F)
3630             {
3631               unsigned char buffer[10];
3632               i += utf8_to_ucn (buffer, name + i) - 1;
3633               fwrite (buffer, 1, 10, fp);
3634             }
3635           else
3636             fputc (NODE_NAME (token->val.node.node)[i], fp);
3637       }
3638       break;
3639
3640     case SPELL_LITERAL:
3641       if (token->type == CPP_HEADER_NAME)
3642         fputc ('"', fp);
3643       fwrite (token->val.str.text, 1, token->val.str.len, fp);
3644       if (token->type == CPP_HEADER_NAME)
3645         fputc ('"', fp);
3646       break;
3647
3648     case SPELL_NONE:
3649       /* An error, most probably.  */
3650       break;
3651     }
3652 }
3653
3654 /* Compare two tokens.  */
3655 int
3656 _cpp_equiv_tokens (const cpp_token *a, const cpp_token *b)
3657 {
3658   if (a->type == b->type && a->flags == b->flags)
3659     switch (TOKEN_SPELL (a))
3660       {
3661       default:                  /* Keep compiler happy.  */
3662       case SPELL_OPERATOR:
3663         /* token_no is used to track where multiple consecutive ##
3664            tokens were originally located.  */
3665         return (a->type != CPP_PASTE || a->val.token_no == b->val.token_no);
3666       case SPELL_NONE:
3667         return (a->type != CPP_MACRO_ARG
3668                 || (a->val.macro_arg.arg_no == b->val.macro_arg.arg_no
3669                     && a->val.macro_arg.spelling == b->val.macro_arg.spelling));
3670       case SPELL_IDENT:
3671         return (a->val.node.node == b->val.node.node
3672                 && a->val.node.spelling == b->val.node.spelling);
3673       case SPELL_LITERAL:
3674         return (a->val.str.len == b->val.str.len
3675                 && !memcmp (a->val.str.text, b->val.str.text,
3676                             a->val.str.len));
3677       }
3678
3679   return 0;
3680 }
3681
3682 /* Returns nonzero if a space should be inserted to avoid an
3683    accidental token paste for output.  For simplicity, it is
3684    conservative, and occasionally advises a space where one is not
3685    needed, e.g. "." and ".2".  */
3686 int
3687 cpp_avoid_paste (cpp_reader *pfile, const cpp_token *token1,
3688                  const cpp_token *token2)
3689 {
3690   enum cpp_ttype a = token1->type, b = token2->type;
3691   cppchar_t c;
3692
3693   if (token1->flags & NAMED_OP)
3694     a = CPP_NAME;
3695   if (token2->flags & NAMED_OP)
3696     b = CPP_NAME;
3697
3698   c = EOF;
3699   if (token2->flags & DIGRAPH)
3700     c = digraph_spellings[(int) b - (int) CPP_FIRST_DIGRAPH][0];
3701   else if (token_spellings[b].category == SPELL_OPERATOR)
3702     c = token_spellings[b].name[0];
3703
3704   /* Quickly get everything that can paste with an '='.  */
3705   if ((int) a <= (int) CPP_LAST_EQ && c == '=')
3706     return 1;
3707
3708   switch (a)
3709     {
3710     case CPP_GREATER:   return c == '>';
3711     case CPP_LESS:      return c == '<' || c == '%' || c == ':';
3712     case CPP_PLUS:      return c == '+';
3713     case CPP_MINUS:     return c == '-' || c == '>';
3714     case CPP_DIV:       return c == '/' || c == '*'; /* Comments.  */
3715     case CPP_MOD:       return c == ':' || c == '>';
3716     case CPP_AND:       return c == '&';
3717     case CPP_OR:        return c == '|';
3718     case CPP_COLON:     return c == ':' || c == '>';
3719     case CPP_DEREF:     return c == '*';
3720     case CPP_DOT:       return c == '.' || c == '%' || b == CPP_NUMBER;
3721     case CPP_HASH:      return c == '#' || c == '%'; /* Digraph form.  */
3722     case CPP_PRAGMA:
3723     case CPP_NAME:      return ((b == CPP_NUMBER
3724                                  && name_p (pfile, &token2->val.str))
3725                                 || b == CPP_NAME
3726                                 || b == CPP_CHAR || b == CPP_STRING); /* L */
3727     case CPP_NUMBER:    return (b == CPP_NUMBER || b == CPP_NAME
3728                                 || b == CPP_CHAR
3729                                 || c == '.' || c == '+' || c == '-');
3730                                       /* UCNs */
3731     case CPP_OTHER:     return ((token1->val.str.text[0] == '\\'
3732                                  && b == CPP_NAME)
3733                                 || (CPP_OPTION (pfile, objc)
3734                                     && token1->val.str.text[0] == '@'
3735                                     && (b == CPP_NAME || b == CPP_STRING)));
3736     case CPP_LESS_EQ:   return c == '>';
3737     case CPP_STRING:
3738     case CPP_WSTRING:
3739     case CPP_UTF8STRING:
3740     case CPP_STRING16:
3741     case CPP_STRING32:  return (CPP_OPTION (pfile, user_literals)
3742                                 && (b == CPP_NAME
3743                                     || (TOKEN_SPELL (token2) == SPELL_LITERAL
3744                                         && ISIDST (token2->val.str.text[0]))));
3745
3746     default:            break;
3747     }
3748
3749   return 0;
3750 }
3751
3752 /* Output all the remaining tokens on the current line, and a newline
3753    character, to FP.  Leading whitespace is removed.  If there are
3754    macros, special token padding is not performed.  */
3755 void
3756 cpp_output_line (cpp_reader *pfile, FILE *fp)
3757 {
3758   const cpp_token *token;
3759
3760   token = cpp_get_token (pfile);
3761   while (token->type != CPP_EOF)
3762     {
3763       cpp_output_token (token, fp);
3764       token = cpp_get_token (pfile);
3765       if (token->flags & PREV_WHITE)
3766         putc (' ', fp);
3767     }
3768
3769   putc ('\n', fp);
3770 }
3771
3772 /* Return a string representation of all the remaining tokens on the
3773    current line.  The result is allocated using xmalloc and must be
3774    freed by the caller.  */
3775 unsigned char *
3776 cpp_output_line_to_string (cpp_reader *pfile, const unsigned char *dir_name)
3777 {
3778   const cpp_token *token;
3779   unsigned int out = dir_name ? ustrlen (dir_name) : 0;
3780   unsigned int alloced = 120 + out;
3781   unsigned char *result = (unsigned char *) xmalloc (alloced);
3782
3783   /* If DIR_NAME is empty, there are no initial contents.  */
3784   if (dir_name)
3785     {
3786       sprintf ((char *) result, "#%s ", dir_name);
3787       out += 2;
3788     }
3789
3790   token = cpp_get_token (pfile);
3791   while (token->type != CPP_EOF)
3792     {
3793       unsigned char *last;
3794       /* Include room for a possible space and the terminating nul.  */
3795       unsigned int len = cpp_token_len (token) + 2;
3796
3797       if (out + len > alloced)
3798         {
3799           alloced *= 2;
3800           if (out + len > alloced)
3801             alloced = out + len;
3802           result = (unsigned char *) xrealloc (result, alloced);
3803         }
3804
3805       last = cpp_spell_token (pfile, token, &result[out], 0);
3806       out = last - result;
3807
3808       token = cpp_get_token (pfile);
3809       if (token->flags & PREV_WHITE)
3810         result[out++] = ' ';
3811     }
3812
3813   result[out] = '\0';
3814   return result;
3815 }
3816
/* Memory buffers.  Changing these three constants can have a dramatic
   effect on performance.  The values here are reasonable defaults,
   but might be tuned.  If you adjust them, be sure to test across a
   range of uses of cpplib, including heavy nested function-like macro
   expansion.  Also check the change in peak memory usage (NJAMD is a
   good tool for this).

   MIN_BUFF_SIZE is the smallest size requested for a new buffer.
   BUFF_SIZE_UPPER_BOUND (MIN_SIZE) is the largest buffer considered
   an acceptable match for a request of MIN_SIZE bytes.
   EXTENDED_BUFF_SIZE (BUFF, MIN_EXTRA) is the size requested when
   extending BUFF: MIN_EXTRA plus twice the bytes between BUFF's cur
   and limit.  Every macro argument is parenthesized so that an
   arbitrary expression may be passed.  */
#define MIN_BUFF_SIZE 8000
#define BUFF_SIZE_UPPER_BOUND(MIN_SIZE) (MIN_BUFF_SIZE + (MIN_SIZE) * 3 / 2)
#define EXTENDED_BUFF_SIZE(BUFF, MIN_EXTRA) \
        ((MIN_EXTRA) + ((BUFF)->limit - (BUFF)->cur) * 2)

#if MIN_BUFF_SIZE > BUFF_SIZE_UPPER_BOUND (0)
  #error BUFF_SIZE_UPPER_BOUND must be at least as large as MIN_BUFF_SIZE!
#endif
3831
3832 /* Create a new allocation buffer.  Place the control block at the end
3833    of the buffer, so that buffer overflows will cause immediate chaos.  */
3834 static _cpp_buff *
3835 new_buff (size_t len)
3836 {
3837   _cpp_buff *result;
3838   unsigned char *base;
3839
3840   if (len < MIN_BUFF_SIZE)
3841     len = MIN_BUFF_SIZE;
3842   len = CPP_ALIGN (len);
3843
3844 #ifdef ENABLE_VALGRIND_ANNOTATIONS
3845   /* Valgrind warns about uses of interior pointers, so put _cpp_buff
3846      struct first.  */
3847   size_t slen = CPP_ALIGN2 (sizeof (_cpp_buff), 2 * DEFAULT_ALIGNMENT);
3848   base = XNEWVEC (unsigned char, len + slen);
3849   result = (_cpp_buff *) base;
3850   base += slen;
3851 #else
3852   base = XNEWVEC (unsigned char, len + sizeof (_cpp_buff));
3853   result = (_cpp_buff *) (base + len);
3854 #endif
3855   result->base = base;
3856   result->cur = base;
3857   result->limit = base + len;
3858   result->next = NULL;
3859   return result;
3860 }
3861
3862 /* Place a chain of unwanted allocation buffers on the free list.  */
3863 void
3864 _cpp_release_buff (cpp_reader *pfile, _cpp_buff *buff)
3865 {
3866   _cpp_buff *end = buff;
3867
3868   while (end->next)
3869     end = end->next;
3870   end->next = pfile->free_buffs;
3871   pfile->free_buffs = buff;
3872 }
3873
3874 /* Return a free buffer of size at least MIN_SIZE.  */
3875 _cpp_buff *
3876 _cpp_get_buff (cpp_reader *pfile, size_t min_size)
3877 {
3878   _cpp_buff *result, **p;
3879
3880   for (p = &pfile->free_buffs;; p = &(*p)->next)
3881     {
3882       size_t size;
3883
3884       if (*p == NULL)
3885         return new_buff (min_size);
3886       result = *p;
3887       size = result->limit - result->base;
3888       /* Return a buffer that's big enough, but don't waste one that's
3889          way too big.  */
3890       if (size >= min_size && size <= BUFF_SIZE_UPPER_BOUND (min_size))
3891         break;
3892     }
3893
3894   *p = result->next;
3895   result->next = NULL;
3896   result->cur = result->base;
3897   return result;
3898 }
3899
3900 /* Creates a new buffer with enough space to hold the uncommitted
3901    remaining bytes of BUFF, and at least MIN_EXTRA more bytes.  Copies
3902    the excess bytes to the new buffer.  Chains the new buffer after
3903    BUFF, and returns the new buffer.  */
3904 _cpp_buff *
3905 _cpp_append_extend_buff (cpp_reader *pfile, _cpp_buff *buff, size_t min_extra)
3906 {
3907   size_t size = EXTENDED_BUFF_SIZE (buff, min_extra);
3908   _cpp_buff *new_buff = _cpp_get_buff (pfile, size);
3909
3910   buff->next = new_buff;
3911   memcpy (new_buff->base, buff->cur, BUFF_ROOM (buff));
3912   return new_buff;
3913 }
3914
3915 /* Creates a new buffer with enough space to hold the uncommitted
3916    remaining bytes of the buffer pointed to by BUFF, and at least
3917    MIN_EXTRA more bytes.  Copies the excess bytes to the new buffer.
3918    Chains the new buffer before the buffer pointed to by BUFF, and
3919    updates the pointer to point to the new buffer.  */
3920 void
3921 _cpp_extend_buff (cpp_reader *pfile, _cpp_buff **pbuff, size_t min_extra)
3922 {
3923   _cpp_buff *new_buff, *old_buff = *pbuff;
3924   size_t size = EXTENDED_BUFF_SIZE (old_buff, min_extra);
3925
3926   new_buff = _cpp_get_buff (pfile, size);
3927   memcpy (new_buff->base, old_buff->cur, BUFF_ROOM (old_buff));
3928   new_buff->next = old_buff;
3929   *pbuff = new_buff;
3930 }
3931
3932 /* Free a chain of buffers starting at BUFF.  */
3933 void
3934 _cpp_free_buff (_cpp_buff *buff)
3935 {
3936   _cpp_buff *next;
3937
3938   for (; buff; buff = next)
3939     {
3940       next = buff->next;
3941 #ifdef ENABLE_VALGRIND_ANNOTATIONS
3942       free (buff);
3943 #else
3944       free (buff->base);
3945 #endif
3946     }
3947 }
3948
3949 /* Allocate permanent, unaligned storage of length LEN.  */
3950 unsigned char *
3951 _cpp_unaligned_alloc (cpp_reader *pfile, size_t len)
3952 {
3953   _cpp_buff *buff = pfile->u_buff;
3954   unsigned char *result = buff->cur;
3955
3956   if (len > (size_t) (buff->limit - result))
3957     {
3958       buff = _cpp_get_buff (pfile, len);
3959       buff->next = pfile->u_buff;
3960       pfile->u_buff = buff;
3961       result = buff->cur;
3962     }
3963
3964   buff->cur = result + len;
3965   return result;
3966 }
3967
3968 /* Allocate permanent, unaligned storage of length LEN from a_buff.
3969    That buffer is used for growing allocations when saving macro
3970    replacement lists in a #define, and when parsing an answer to an
3971    assertion in #assert, #unassert or #if (and therefore possibly
3972    whilst expanding macros).  It therefore must not be used by any
3973    code that they might call: specifically the lexer and the guts of
3974    the macro expander.
3975
3976    All existing other uses clearly fit this restriction: storing
3977    registered pragmas during initialization.  */
3978 unsigned char *
3979 _cpp_aligned_alloc (cpp_reader *pfile, size_t len)
3980 {
3981   _cpp_buff *buff = pfile->a_buff;
3982   unsigned char *result = buff->cur;
3983
3984   if (len > (size_t) (buff->limit - result))
3985     {
3986       buff = _cpp_get_buff (pfile, len);
3987       buff->next = pfile->a_buff;
3988       pfile->a_buff = buff;
3989       result = buff->cur;
3990     }
3991
3992   buff->cur = result + len;
3993   return result;
3994 }
3995
3996 /* Commit or allocate storage from a buffer.  */
3997
3998 void *
3999 _cpp_commit_buff (cpp_reader *pfile, size_t size)
4000 {
4001   void *ptr = BUFF_FRONT (pfile->a_buff);
4002
4003   if (pfile->hash_table->alloc_subobject)
4004     {
4005       void *copy = pfile->hash_table->alloc_subobject (size);
4006       memcpy (copy, ptr, size);
4007       ptr = copy;
4008     }
4009   else
4010     BUFF_FRONT (pfile->a_buff) += size;
4011
4012   return ptr;
4013 }
4014
4015 /* Say which field of TOK is in use.  */
4016
4017 enum cpp_token_fld_kind
4018 cpp_token_val_index (const cpp_token *tok)
4019 {
4020   switch (TOKEN_SPELL (tok))
4021     {
4022     case SPELL_IDENT:
4023       return CPP_TOKEN_FLD_NODE;
4024     case SPELL_LITERAL:
4025       return CPP_TOKEN_FLD_STR;
4026     case SPELL_OPERATOR:
4027       /* Operands which were originally spelled as ident keep around
4028          the node for the exact spelling.  */
4029       if (tok->flags & NAMED_OP)
4030         return CPP_TOKEN_FLD_NODE;
4031       else if (tok->type == CPP_PASTE)
4032         return CPP_TOKEN_FLD_TOKEN_NO;
4033       else
4034         return CPP_TOKEN_FLD_NONE;
4035     case SPELL_NONE:
4036       if (tok->type == CPP_MACRO_ARG)
4037         return CPP_TOKEN_FLD_ARG_NO;
4038       else if (tok->type == CPP_PADDING)
4039         return CPP_TOKEN_FLD_SOURCE;
4040       else if (tok->type == CPP_PRAGMA)
4041         return CPP_TOKEN_FLD_PRAGMA;
4042       /* fall through */
4043     default:
4044       return CPP_TOKEN_FLD_NONE;
4045     }
4046 }
4047
4048 /* All tokens lexed in R after calling this function will be forced to
4049    have their location_t to be P, until
4050    cpp_stop_forcing_token_locations is called for R.  */
4051
4052 void
4053 cpp_force_token_locations (cpp_reader *r, location_t loc)
4054 {
4055   r->forced_token_location = loc;
4056 }
4057
4058 /* Go back to assigning locations naturally for lexed tokens.  */
4059
4060 void
4061 cpp_stop_forcing_token_locations (cpp_reader *r)
4062 {
4063   r->forced_token_location = 0;
4064 }
4065
/* PEEK points at a backslash.  While that backslash escapes an end of
   line ("\n", "\r" or "\r\n"), step past the escaped line ending, and
   keep going if the character reached is another backslash.  Never
   advance to LIMIT or beyond.  Returns the position reached, which is
   PEEK itself when the backslash does not escape an end of line.  */

static const unsigned char *
do_peek_backslash (const unsigned char *peek, const unsigned char *limit)
{
  for (;;)
    {
      const unsigned char *after;

      if (__builtin_expect (peek[1] == '\n', true))
        after = peek + 2;
      else if (__builtin_expect (peek[1] == '\r', false))
        /* A "\r\n" pair counts as a single line ending.  */
        after = peek + (peek[2] == '\n' ? 3 : 2);
      else
        return peek;

      if (!__builtin_expect (after < limit, true))
        return peek;

      peek = after;

      /* The user might be perverse and escape several line endings
         in a row.  */
      if (*peek != '\\')
        return peek;
    }
}
4095
/* Return the position of the next character to examine at PEEK.  If
   PEEK points at a backslash, any escaped line endings are skipped
   with do_peek_backslash, bounded by LIMIT; otherwise PEEK is
   returned unchanged.  */

static const unsigned char *
do_peek_next (const unsigned char *peek, const unsigned char *limit)
{
  return (__builtin_expect (*peek == '\\', false)
          ? do_peek_backslash (peek, limit)
          : peek);
}
4103
/* Step back one character from PEEK without going below BOUND, and
   return a pointer to that character.  Returns NULL if PEEK is
   already at BOUND.

   If the character stepped onto is a line ending ("\n", "\r", or the
   "\n" of a "\r\n" pair) that is immediately preceded by a backslash,
   the line ending is an escaped newline: the search continues
   backwards from the backslash.  A line ending that is not escaped,
   or whose preceding characters would lie below BOUND, is returned as
   is.  */

static const unsigned char *
do_peek_prev (const unsigned char *peek, const unsigned char *bound)
{
  if (peek == bound)
    return NULL;

  unsigned char c = *--peek;
  if (__builtin_expect (c == '\n', false)
      || __builtin_expect (c == '\r', false))
    {
      if (peek == bound)
        return peek;

      /* Offset from PEEK of the character before the line ending.  */
      int ix = -1;
      if (c == '\n' && peek[ix] == '\r')
        {
          if (peek + ix == bound)
            return peek;
          ix--;
        }

      if (peek[ix] == '\\')
        return do_peek_prev (peek + ix, bound);

      return peek;
    }
  else
    return peek;
}
4132
/* Check whether the identifier MATCH starts at PEEK[-1].  The first
   character of MATCH is taken as already matched, so comparison
   starts with MATCH[1] against PEEK[0].  Escaped newlines inside the
   identifier are tolerated.  On success, return the position after
   the identifier and after any following spaces, tabs and escaped
   newlines.  Return NULL if the text differs from MATCH or if the
   identifier continues with another identifier character.  LIMIT
   bounds the skipping of escaped newlines.  */

static const unsigned char *
do_peek_ident (const char *match, const unsigned char *peek,
               const unsigned char *limit)
{
  const char *want = match + 1;

  while (*want != '\0')
    {
      if (*peek != *want)
        {
          /* The mismatch may only be an escaped newline in the middle
             of the identifier; look past it and compare again.  */
          peek = do_peek_next (peek, limit);
          if (*peek != *want)
            return NULL;
        }
      peek++;
      want++;
    }

  /* Must now not be looking at an identifier char.  */
  peek = do_peek_next (peek, limit);
  if (ISIDNUM (*peek))
    return NULL;

  /* Skip control-line whitespace, including escaped newlines.  Stop
     at a backslash that does not escape a newline.  */
  for (;;)
    {
      while (*peek == ' ' || *peek == '\t')
        peek++;

      if (!__builtin_expect (*peek == '\\', false))
        break;

      peek = do_peek_backslash (peek, limit);
      if (*peek == '\\')
        break;
    }

  return peek;
}
4166
4167 /* Are we looking at a module control line starting as PEEK - 1?  */
4168
4169 static bool
4170 do_peek_module (cpp_reader *pfile, unsigned char c,
4171                 const unsigned char *peek, const unsigned char *limit)
4172 {
4173   bool import = false;
4174
4175   if (__builtin_expect (c == 'e', false))
4176     {
4177       if (!((peek[0] == 'x' || peek[0] == '\\')
4178             && (peek = do_peek_ident ("export", peek, limit))))
4179         return false;
4180
4181       /* export, peek for import or module.  No need to peek __import
4182          here.  */
4183       if (peek[0] == 'i')
4184         {
4185           if (!((peek[1] == 'm' || peek[1] == '\\')
4186                 && (peek = do_peek_ident ("import", peek + 1, limit))))
4187             return false;
4188           import = true;
4189         }
4190       else if (peek[0] == 'm')
4191         {
4192           if (!((peek[1] == 'o' || peek[1] == '\\')
4193                 && (peek = do_peek_ident ("module", peek + 1, limit))))
4194             return false;
4195         }
4196       else
4197         return false;
4198     }
4199   else if (__builtin_expect (c == 'i', false))
4200     {
4201       if (!((peek[0] == 'm' || peek[0] == '\\')
4202             && (peek = do_peek_ident ("import", peek, limit))))
4203         return false;
4204       import = true;
4205     }
4206   else if (__builtin_expect (c == '_', false))
4207     {
4208       /* Needed for translated includes.   */
4209       if (!((peek[0] == '_' || peek[0] == '\\')
4210             && (peek = do_peek_ident ("__import", peek, limit))))
4211         return false;
4212       import = true;
4213     }
4214   else if (__builtin_expect (c == 'm', false))
4215     {
4216       if (!((peek[0] == 'o' || peek[0] == '\\')
4217             && (peek = do_peek_ident ("module", peek, limit))))
4218         return false;
4219     }
4220   else
4221     return false;
4222
4223   /* Peek the next character to see if it's good enough.  We'll be at
4224      the first non-whitespace char, including skipping an escaped
4225      newline.  */
4226   /* ... import followed by identifier, ':', '<' or header-name
4227      preprocessing tokens, or module followed by identifier, ':' or
4228      ';' preprocessing tokens.  */
4229   unsigned char p = *peek++;
4230
4231   /* A character literal is ... single quotes, ... optionally preceded
4232      by u8, u, U, or L */
4233   /* A string-literal is a ... double quotes, optionally prefixed by
4234      R, u8, u8R, u, uR, U, UR, L, or LR */
4235   if (p == 'u')
4236     {
4237       peek = do_peek_next (peek, limit);
4238       if (*peek == '8')
4239         {
4240           peek++;
4241           goto peek_u8;
4242         }
4243       goto peek_u;
4244     }
4245   else if (p == 'U' || p == 'L')
4246     {
4247     peek_u8:
4248       peek = do_peek_next (peek, limit);
4249     peek_u:
4250       if (*peek == '\"' || *peek == '\'')
4251         return false;
4252
4253       if (*peek == 'R')
4254         goto peek_R;
4255       /* Identifier. Ok.  */
4256     }
4257   else if (p == 'R')
4258     {
4259     peek_R:
4260       if (CPP_OPTION (pfile, rliterals))
4261         {
4262           peek = do_peek_next (peek, limit);
4263           if (*peek == '\"')
4264             return false;
4265         }
4266       /* Identifier. Ok.  */
4267     }
4268   else if ('Z' - 'A' == 25
4269            ? ((p >= 'A' && p <= 'Z') || (p >= 'a' && p <= 'z') || p == '_')
4270            : ISIDST (p))
4271     {
4272       /* Identifier.  Ok. */
4273     }
4274   else if (p == '<')
4275     {
4276       /* Maybe angle header, ok for import.  Reject
4277          '<=', '<<' digraph:'<:'.  */
4278       if (!import)
4279         return false;
4280       peek = do_peek_next (peek, limit);
4281       if (*peek == '=' || *peek == '<'
4282           || (*peek == ':' && CPP_OPTION (pfile, digraphs)))
4283         return false;
4284     }
4285   else if (p == ';')
4286     {
4287       /* SEMICOLON, ok for module.  */
4288       if (import)
4289         return false;
4290     }
4291   else if (p == '"')
4292     {
4293       /* STRING, ok for import.  */
4294       if (!import)
4295         return false;
4296     }
4297   else if (p == ':')
4298     {
4299       /* Maybe COLON, ok.  Reject '::', digraph:':>'.  */
4300       peek = do_peek_next (peek, limit);
4301       if (*peek == ':' || (*peek == '>' && CPP_OPTION (pfile, digraphs)))
4302         return false;
4303     }
4304   else
4305     /* FIXME: Detect a unicode character, excluding those not
4306        permitted as the initial character. [lex.name]/1.  I presume
4307        we need to check the \[uU] spellings, and directly using
4308        Unicode in say UTF8 form?  Or perhaps we do the phase-1
4309        conversion of UTF8 to universal-character-names?  */
4310     return false;
4311
4312   return true;
4313 }
4314
4315 /* Directives-only scanning.  Somewhat more relaxed than correct
4316    parsing -- some ill-formed programs will not be rejected.  */
4317
4318 void
4319 cpp_directive_only_process (cpp_reader *pfile,
4320                             void *data,
4321                             void (*cb) (cpp_reader *, CPP_DO_task, void *, ...))
4322 {
4323   bool module_p = CPP_OPTION (pfile, module_directives);
4324
4325   do
4326     {
4327     restart:
4328       /* Buffer initialization, but no line cleaning. */
4329       cpp_buffer *buffer = pfile->buffer;
4330       buffer->cur_note = buffer->notes_used = 0;
4331       buffer->cur = buffer->line_base = buffer->next_line;
4332       buffer->need_line = false;
4333       /* Files always end in a newline or carriage return.  We rely on this for
4334          character peeking safety.  */
4335       gcc_assert (buffer->rlimit[0] == '\n' || buffer->rlimit[0] == '\r');
4336
4337       const unsigned char *base = buffer->cur;
4338       unsigned line_count = 0;
4339       const unsigned char *line_start = base;
4340
4341       bool bol = true;
4342       bool raw = false;
4343
4344       const unsigned char *lwm = base;
4345       for (const unsigned char *pos = base, *limit = buffer->rlimit;
4346            pos < limit;)
4347         {
4348           unsigned char c = *pos++;
4349           /* This matches the switch in _cpp_lex_direct.  */
4350           switch (c)
4351             {
4352             case ' ': case '\t': case '\f': case '\v':
4353               /* Whitespace, do nothing.  */
4354               break;
4355
4356             case '\r': /* MAC line ending, or Windows \r\n  */
4357               if (*pos == '\n')
4358                 pos++;
4359               /* FALLTHROUGH */
4360
4361             case '\n':
4362               bol = true;
4363
4364             next_line:
4365               CPP_INCREMENT_LINE (pfile, 0);
4366               line_count++;
4367               line_start = pos;
4368               break;
4369
4370             case '\\':
4371               /* <backslash><newline> is removed, and doesn't undo any
4372                  preceeding escape or whatnot.  */
4373               if (*pos == '\n')
4374                 {
4375                   pos++;
4376                   goto next_line;
4377                 }
4378               else if (*pos == '\r')
4379                 {
4380                   if (pos[1] == '\n')
4381                     pos++;
4382                   pos++;
4383                   goto next_line;
4384                 }
4385               goto dflt;
4386
4387             case '#':
4388               if (bol)
4389                 {
4390                   /* Line directive.  */
4391                   if (pos - 1 > base && !pfile->state.skipping)
4392                     cb (pfile, CPP_DO_print, data,
4393                         line_count, base, pos - 1 - base);
4394
4395                   /* Prep things for directive handling. */
4396                   buffer->next_line = pos;
4397                   buffer->need_line = true;
4398                   bool ok = _cpp_get_fresh_line (pfile);
4399                   gcc_checking_assert (ok);
4400
4401                   /* Ensure proper column numbering for generated
4402                      error messages. */
4403                   buffer->line_base -= pos - line_start;
4404
4405                   _cpp_handle_directive (pfile, line_start + 1 != pos);
4406
4407                   /* Sanitize the line settings.  Duplicate #include's can
4408                      mess things up. */
4409                   // FIXME: Necessary?
4410                   pfile->line_table->highest_location
4411                     = pfile->line_table->highest_line;
4412
4413                   if (!pfile->state.skipping
4414                       && pfile->buffer->next_line < pfile->buffer->rlimit)
4415                     cb (pfile, CPP_DO_location, data,
4416                         pfile->line_table->highest_line);
4417
4418                   goto restart;
4419                 }
4420               goto dflt;
4421
4422             case '/':
4423               {
4424                 const unsigned char *peek = do_peek_next (pos, limit);
4425                 if (!(*peek == '/' || *peek == '*'))
4426                   goto dflt;
4427
4428                 /* Line or block comment  */
4429                 bool is_block = *peek == '*';
4430                 bool star = false;
4431                 bool esc = false;
4432                 location_t sloc
4433                   = linemap_position_for_column (pfile->line_table,
4434                                                  pos - line_start);
4435
4436                 while (pos < limit)
4437                   {
4438                     char c = *pos++;
4439                     switch (c)
4440                       {
4441                       case '\\':
4442                         esc = true;
4443                         break;
4444
4445                       case '\r':
4446                         if (*pos == '\n')
4447                           pos++;
4448                         /* FALLTHROUGH  */
4449
4450                       case '\n':
4451                         {
4452                           CPP_INCREMENT_LINE (pfile, 0);
4453                           line_count++;
4454                           line_start = pos;
4455                           if (!esc && !is_block)
4456                             {
4457                               bol = true;
4458                               goto done_comment;
4459                             }
4460                         }
4461                         if (!esc)
4462                           star = false;
4463                         esc = false;
4464                         break;
4465
4466                       case '*':
4467                         if (pos > peek && !esc)
4468                           star = is_block;
4469                         esc = false;
4470                         break;
4471
4472                       case '/':
4473                         if (star)
4474                           goto done_comment;
4475                         /* FALLTHROUGH  */
4476
4477                       default:
4478                         star = false;
4479                         esc = false;
4480                         break;
4481                       }
4482                   }
4483                 if (pos < limit || is_block)
4484                   cpp_error_with_line (pfile, CPP_DL_ERROR, sloc, 0,
4485                                        "unterminated comment");
4486               done_comment:
4487                 lwm = pos;
4488                 break;
4489               }
4490
4491             case '\'':
4492               if (!CPP_OPTION (pfile, digit_separators))
4493                 goto delimited_string;
4494
4495               /* Possibly a number punctuator.  */
4496               if (!ISIDNUM (*do_peek_next (pos, limit)))
4497                 goto delimited_string;
4498
4499               goto quote_peek;
4500
4501             case '\"':
4502               if (!CPP_OPTION (pfile, rliterals))
4503                 goto delimited_string;
4504
4505             quote_peek:
4506               {
4507                 /* For ' see if it's a number punctuator
4508                    \.?<digit>(<digit>|<identifier-nondigit>
4509                    |'<digit>|'<nondigit>|[eEpP]<sign>|\.)* */
4510                 /* For " see if it's a raw string
4511                    {U,L,u,u8}R.  This includes CPP_NUMBER detection,
4512                    because that could be 0e+R.  */
4513                 const unsigned char *peek = pos - 1;
4514                 bool quote_first = c == '"';
4515                 bool quote_eight = false;
4516                 bool maybe_number_start = false;
4517                 bool want_number = false;
4518
4519                 while ((peek = do_peek_prev (peek, lwm)))
4520                   {
4521                     unsigned char p = *peek;
4522                     if (quote_first)
4523                       {
4524                         if (!raw)
4525                           {
4526                             if (p != 'R')
4527                               break;
4528                             raw = true;
4529                             continue;
4530                           }
4531
4532                         quote_first = false;
4533                         if (p == 'L' || p == 'U' || p == 'u')
4534                           ;
4535                         else if (p == '8')
4536                           quote_eight = true;
4537                         else
4538                           goto second_raw;
4539                       }
4540                     else if (quote_eight)
4541                       {
4542                         if (p != 'u')
4543                           {
4544                             raw = false;
4545                             break;
4546                           }
4547                         quote_eight = false;
4548                       }
4549                     else if (c == '"')
4550                       {
4551                       second_raw:;
4552                         if (!want_number && ISIDNUM (p))
4553                           {
4554                             raw = false;
4555                             break;
4556                           }
4557                       }
4558
4559                     if (ISDIGIT (p))
4560                       maybe_number_start = true;
4561                     else if (p == '.')
4562                       want_number = true;
4563                     else if (ISIDNUM (p))
4564                       maybe_number_start = false;
4565                     else if (p == '+' || p == '-')
4566                       {
4567                         if (const unsigned char *peek_prev
4568                             = do_peek_prev (peek, lwm))
4569                           {
4570                             p = *peek_prev;
4571                             if (p == 'e' || p == 'E'
4572                                 || p == 'p' || p == 'P')
4573                               {
4574                                 want_number = true;
4575                                 maybe_number_start = false;
4576                               }
4577                             else
4578                               break;
4579                           }
4580                         else
4581                           break;
4582                       }
4583                     else if (p == '\'' || p == '\"')
4584                       {
4585                         /* If this is lwm, this must be the end of a
4586                            previous string.  So this is a trailing
4587                            literal type, (a) if those are allowed,
4588                              and (b) maybe_number_start is false.  Otherwise
4589                              this must be a CPP_NUMBER because we've
4590                              met another ', and we'd have checked that
4591                              in its own right.  */
4592                         if (peek == lwm && CPP_OPTION (pfile, uliterals))
4593                           {
4594                             if  (!maybe_number_start && !want_number)
4595                               /* Must be a literal type.  */
4596                               raw = false;
4597                           }
4598                         else if (p == '\''
4599                                  && CPP_OPTION (pfile, digit_separators))
4600                           maybe_number_start = true;
4601                         break;
4602                       }
4603                     else if (c == '\'')
4604                       break;
4605                     else if (!quote_first && !quote_eight)
4606                       break;
4607                   }
4608
4609                 if (maybe_number_start)
4610                   {
4611                     if (c == '\'')
4612                       /* A CPP NUMBER.  */
4613                       goto dflt;
4614                     raw = false;
4615                   }
4616
4617                 goto delimited_string;
4618               }
4619
4620             delimited_string:
4621               {
4622                 /* (Possibly raw) string or char literal.  */
4623                 unsigned char end = c;
4624                 int delim_len = -1;
4625                 const unsigned char *delim = NULL;
4626                 location_t sloc = linemap_position_for_column (pfile->line_table,
4627                                                                pos - line_start);
4628                 int esc = 0;
4629
4630                 if (raw)
4631                   {
4632                     /* There can be no line breaks in the delimiter.  */
4633                     delim = pos;
4634                     for (delim_len = 0; (c = *pos++) != '('; delim_len++)
4635                       {
4636                         if (delim_len == 16)
4637                           {
4638                             cpp_error_with_line (pfile, CPP_DL_ERROR,
4639                                                  sloc, 0,
4640                                                  "raw string delimiter"
4641                                                  " longer than %d"
4642                                                  " characters",
4643                                                  delim_len);
4644                             raw = false;
4645                             pos = delim;
4646                             break;
4647                           }
4648                         if (strchr (") \\\t\v\f\n", c))
4649                           {
4650                             cpp_error_with_line (pfile, CPP_DL_ERROR,
4651                                                  sloc, 0,
4652                                                  "invalid character '%c'"
4653                                                  " in raw string"
4654                                                  " delimiter", c);
4655                             raw = false;
4656                             pos = delim;
4657                             break;
4658                           }
4659                         if (pos >= limit)
4660                           goto bad_string;
4661                       }
4662                   }
4663
4664                 while (pos < limit)
4665                   {
4666                     char c = *pos++;
4667                     switch (c)
4668                       {
4669                       case '\\':
4670                         if (!raw)
4671                           esc++;
4672                         break;
4673
4674                       case '\r':
4675                         if (*pos == '\n')
4676                           pos++;
4677                         /* FALLTHROUGH  */
4678
4679                       case '\n':
4680                         {
4681                           CPP_INCREMENT_LINE (pfile, 0);
4682                           line_count++;
4683                           line_start = pos;
4684                         }
4685                         if (esc)
4686                           esc--;
4687                         break;
4688
4689                       case ')':
4690                         if (raw
4691                             && pos + delim_len + 1 < limit
4692                             && pos[delim_len] == end
4693                             && !memcmp (delim, pos, delim_len))
4694                           {
4695                             pos += delim_len + 1;
4696                             raw = false;
4697                             goto done_string;
4698                           }
4699                         break;
4700
4701                       default:
4702                         if (!raw && !(esc & 1) && c == end)
4703                           goto done_string;
4704                         esc = 0;
4705                         break;
4706                       }
4707                   }
4708               bad_string:
4709                 cpp_error_with_line (pfile, CPP_DL_ERROR, sloc, 0,
4710                                      "unterminated literal");
4711
4712               done_string:
4713                 raw = false;
4714                 lwm = pos - 1;
4715               }
4716               goto dflt;
4717
4718             case '_':
4719             case 'e':
4720             case 'i':
4721             case 'm':
4722               if (bol && module_p && !pfile->state.skipping
4723                   && do_peek_module (pfile, c, pos, limit))
4724                 {
4725                   /* We've seen the start of a module control line.
4726                      Start up the tokenizer.  */
4727                   pos--; /* Backup over the first character.  */
4728
4729                   /* Backup over whitespace to start of line.  */
4730                   while (pos > line_start
4731                          && (pos[-1] == ' ' || pos[-1] == '\t'))
4732                     pos--;
4733
4734                   if (pos > base)
4735                     cb (pfile, CPP_DO_print, data, line_count, base, pos - base);
4736
4737                   /* Prep things for directive handling. */
4738                   buffer->next_line = pos;
4739                   buffer->need_line = true;
4740
4741                   /* Now get tokens until the PRAGMA_EOL.  */
4742                   do
4743                     {
4744                       location_t spelling;
4745                       const cpp_token *tok
4746                         = cpp_get_token_with_location (pfile, &spelling);
4747
4748                       gcc_assert (pfile->state.in_deferred_pragma
4749                                   || tok->type == CPP_PRAGMA_EOL);
4750                       cb (pfile, CPP_DO_token, data, tok, spelling);
4751                     }
4752                   while (pfile->state.in_deferred_pragma);
4753
4754                   if (pfile->buffer->next_line < pfile->buffer->rlimit)
4755                     cb (pfile, CPP_DO_location, data,
4756                         pfile->line_table->highest_line);
4757
4758                   pfile->mi_valid = false;
4759                   goto restart;
4760                 }
4761               goto dflt;
4762
4763             default:
4764             dflt:
4765               bol = false;
4766               pfile->mi_valid = false;
4767               break;
4768             }
4769         }
4770
4771       if (buffer->rlimit > base && !pfile->state.skipping)
4772         {
4773           const unsigned char *limit = buffer->rlimit;
4774           /* If the file was not newline terminated, add rlimit, which is
4775              guaranteed to point to a newline, to the end of our range.  */
4776           if (limit[-1] != '\n')
4777             {
4778               limit++;
4779               CPP_INCREMENT_LINE (pfile, 0);
4780               line_count++;
4781             }
4782           cb (pfile, CPP_DO_print, data, line_count, base, limit - base);
4783         }
4784
4785       _cpp_pop_buffer (pfile);
4786     }
4787   while (pfile->buffer);
4788 }