libcpp/lex.c

   1 /* CPP Library - lexical analysis.
   2    Copyright (C) 2000-2021 Free Software Foundation, Inc.
   3    Contributed by Per Bothner, 1994-95.
   4    Based on CCCP program by Paul Rubin, June 1986
   5    Adapted to ANSI C, Richard Stallman, Jan 1987
   6    Broken out to separate file, Zack Weinberg, Mar 2000
   7
   8 This program is free software; you can redistribute it and/or modify it
   9 under the terms of the GNU General Public License as published by the
  10 Free Software Foundation; either version 3, or (at your option) any
  11 later version.
  12
  13 This program is distributed in the hope that it will be useful,
  14 but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16 GNU General Public License for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with this program; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "cpplib.h"
  25 #include "internal.h"
  26
  27 enum spell_type
  28 {
       /* Spelling category stored for each token type in token_spellings.
          OP entries always use SPELL_OPERATOR; TK entries select one of
          these categories through their second argument.  */
  29   SPELL_OPERATOR = 0,
  30   SPELL_IDENT,
  31   SPELL_LITERAL,
  32   SPELL_NONE
  33 };
  34
  35 struct token_spelling
  36 {
       /* One row of token_spellings: the spelling category of a token
          type and the string associated with it.  */
  37   enum spell_type category;
  38   const unsigned char *name;
  39 };
  40
     /* Spellings of the six digraphs.  */
  41 static const unsigned char *const digraph_spellings[] =
  42 { UC"%:", UC"%:%:", UC"<:", UC":>", UC"<%", UC"%>" };
  43
     /* Build token_spellings from TTYPE_TABLE (defined outside this file).
        OP(e, s) yields the row { SPELL_OPERATOR, s }; TK(e, s) yields
        { SPELL_<s>, "<e>" }, i.e. the stringized enumerator name.  The
        two helper macros are only needed for this expansion.  */
  44 #define OP(e, s) { SPELL_OPERATOR, UC s  },
  45 #define TK(e, s) { SPELL_ ## s,    UC #e },
  46 static const struct token_spelling token_spellings[N_TTYPES] = { TTYPE_TABLE };
  47 #undef OP
  48 #undef TK
  49
     /* Look up the spelling category / name string of TOKEN, indexing
        token_spellings by the token's type.  */
  50 #define TOKEN_SPELL(token) (token_spellings[(token)->type].category)
  51 #define TOKEN_NAME(token) (token_spellings[(token)->type].name)
  52
     /* Prototypes for file-local (static) helpers.  */
  53 static void add_line_note (cpp_buffer *, const uchar *, unsigned int);
  54 static int skip_line_comment (cpp_reader *);
  55 static void skip_whitespace (cpp_reader *, cppchar_t);
  56 static void lex_string (cpp_reader *, cpp_token *, const uchar *);
  57 static void save_comment (cpp_reader *, cpp_token *, const uchar *, cppchar_t);
  58 static void store_comment (cpp_reader *, cpp_token *);
  59 static void create_literal (cpp_reader *, cpp_token *, const uchar *,
  60                             unsigned int, enum cpp_ttype);
  61 static bool warn_in_comment (cpp_reader *, _cpp_line_note *);
  62 static int name_p (cpp_reader *, const cpp_string *);
  63 static tokenrun *next_tokenrun (tokenrun *);
  64
  65 static _cpp_buff *new_buff (size_t);
  66
  67
  68 /* Utility routine:
  69
  70    Compares the token TOKEN to the NUL-terminated string STRING.
  71    Returns 1 if TOKEN is a CPP_NAME whose node name equals STRING, and
        0 otherwise (including when TOKEN is not a CPP_NAME).  */
  72 int
  73 cpp_ideq (const cpp_token *token, const char *string)
  74 {
       /* A token of any other type never compares equal.  */
  75   if (token->type != CPP_NAME)
  76     return 0;
  77
       /* ustrcmp returns 0 on equality, hence the negation.  */
  78   return !ustrcmp (NODE_NAME (token->val.node.node), (const uchar *) string);
  79 }
  80
  81 /* Record a note TYPE at byte POS into the current cleaned logical
  82    line.  The note is appended to BUFFER->notes, which is enlarged
        first if it is already full.  */
  83 static void
  84 add_line_note (cpp_buffer *buffer, const uchar *pos, unsigned int type)
  85 {
       /* Array full: grow to twice the capacity plus 200 entries (the
          constant term makes this work from a capacity of zero too).  */
  86   if (buffer->notes_used == buffer->notes_cap)
  87     {
  88       buffer->notes_cap = buffer->notes_cap * 2 + 200;
  89       buffer->notes = XRESIZEVEC (_cpp_line_note, buffer->notes,
  90                                   buffer->notes_cap);
  91     }
  92
       /* Fill in the next free slot and account for it.  */
  93   buffer->notes[buffer->notes_used].pos = pos;
  94   buffer->notes[buffer->notes_used].type = type;
  95   buffer->notes_used++;
  96 }
  97
  98 \f
  99 /* Fast path to find line special characters using optimized character
 100    scanning algorithms.  Anything complicated falls back to the slow
 101    path below.  Since this loop is very hot it's worth doing these kinds
 102    of optimizations.
 103
 104    One of the paths through the ifdefs should provide
 105
 106      const uchar *search_line_fast (const uchar *s, const uchar *end);
 107
 108    Between S and END, search for \n, \r, \\, ?.  Return a pointer to
 109    the found character.
 110
 111    Note that the last character of the buffer is *always* a newline,
 112    as forced by _cpp_convert_input.  This fact can be used to avoid
 113    explicitly looking for the end of the buffer.  */
 114
 115 /* Configure gives us an ifdef test.  Normalize it to a value that is
        always defined so it can be used in ordinary `if' and `#if'
        expressions below.  */
 116 #ifndef WORDS_BIGENDIAN
 117 #define WORDS_BIGENDIAN 0
 118 #endif
 119
 120 /* We'd like the largest integer that fits into a register.  There's nothing
 121    in <stdint.h> that gives us that.  For most hosts this is unsigned long,
 122    but MS decided on an LLP64 model.  Thankfully when building with GCC we
 123    can get the "real" word size.  */
 124 #ifdef __GNUC__
 125 typedef unsigned int word_type __attribute__((__mode__(__word__)));
 126 #else
 127 typedef unsigned long word_type;
 128 #endif
 129
 130 /* The code below is only expecting sizes 4 or 8.
 131    Die at compile-time if this expectation is violated: the array size
        below evaluates to 1 when the size is acceptable and to -1, which
        is an error, otherwise.  */
 132 typedef char check_word_type_size
 133   [(sizeof(word_type) == 8 || sizeof(word_type) == 4) * 2 - 1];
 134
 135 /* Return VAL with the first N bytes (in memory order) forced to zero,
 136    a value that won't match one of the interesting characters.  Note
        that NUL is not interesting.  N must be smaller than
        sizeof (word_type), because the shift count used is N * 8.  */
 137
 138 static inline word_type
 139 acc_char_mask_misalign (word_type val, unsigned int n)
 140 {
       /* Start from all ones and shift zeros in from the end of the word
          that holds the lowest-addressed bytes.  */
 141   word_type mask = -1;
 142   if (WORDS_BIGENDIAN)
 143     mask >>= n * 8;
 144   else
 145     mask <<= n * 8;
 146   return val & mask;
 147 }
 148
 149 /* Return X replicated to all byte positions within WORD_TYPE.  */
 150
 151 static inline word_type
 152 acc_char_replicate (uchar x)
 153 {
 154   word_type ret = x; /* Widen first: shifting the promoted int would overflow for X >= 0x80.  */
 155
 156   ret = (ret << 24) | (ret << 16) | (ret << 8) | ret;
 157   if (sizeof(word_type) == 8)
 158     ret = (ret << 16 << 16) | ret; /* Two 16-bit shifts stay valid when word_type is 32 bits.  */
 159   return ret;
 160 }
 161
 162 /* Return non-zero if some byte of VAL is (probably) C.  C must already
        be replicated to every byte position of the word.  The generic
        implementation may report false positives, which is why a hit is
        confirmed with acc_char_index.  */
 163
 164 static inline word_type
 165 acc_char_cmp (word_type val, word_type c)
 166 {
 167 #if defined(__GNUC__) && defined(__alpha__)
 168   /* We can get exact results using a compare-bytes instruction.
 169      Get (val == c) via (0 >= (val ^ c)).  */
 170   return __builtin_alpha_cmpbge (0, val ^ c);
 171 #else
       /* MAGIC ends up as 0x7efefeff for 4-byte words and
          0x7efefefefefefeff for 8-byte words, so ~MAGIC selects the top
          bit of the word plus the lowest bit of every byte except the
          least significant one.  */
 172   word_type magic = 0x7efefefeU;
 173   if (sizeof(word_type) == 8)
 174     magic = (magic << 16 << 16) | 0xfefefefeU;
 175   magic |= 1;
 176
       /* After the XOR a byte equal to C is zero.  The expression below
          is non-zero when one of the bits selected by ~MAGIC is left
          unchanged by the addition, in the manner of the classic
          word-at-a-time zero-byte test.  */
 177   val ^= c;
 178   return ((val + magic) ^ ~val) & ~magic;
 179 #endif
 180 }
 181
 182 /* Given the result of acc_char_cmp is non-zero, return the index of
 183    the found character.  If this was a false positive, return -1.
        CMP is the combined acc_char_cmp result and VAL the word that was
        tested; the generic implementation ignores CMP and re-examines the
        bytes of VAL in memory order.  */
 184
 185 static inline int
 186 acc_char_index (word_type cmp ATTRIBUTE_UNUSED,
 187                 word_type val ATTRIBUTE_UNUSED)
 188 {
 189 #if defined(__GNUC__) && defined(__alpha__) && !WORDS_BIGENDIAN
 190   /* The cmpbge instruction sets *bits* of the result corresponding to
 191      matches in the bytes with no false positives.  */
 192   return __builtin_ctzl (cmp);
 193 #else
 194   unsigned int i;
 195
 196   /* ??? It would be nice to force unrolling here,
 197      and have all of these constants folded.  */
 198   for (i = 0; i < sizeof(word_type); ++i)
 199     {
           /* Extract byte I in memory order: the most significant byte
              comes first on big-endian hosts, the least significant one
              on little-endian hosts.  */
 200       uchar c;
 201       if (WORDS_BIGENDIAN)
 202         c = (val >> (sizeof(word_type) - i - 1) * 8) & 0xff;
 203       else
 204         c = (val >> i * 8) & 0xff;
 205
 206       if (c == '\n' || c == '\r' || c == '\\' || c == '?')
 207         return i;
 208     }
 209
       /* No byte really matched.  */
 210   return -1;
 211 #endif
 212 }
 213
 214 /* A version of the fast scanner using bit fiddling techniques.
 215
 216    For 32-bit words, one would normally perform 16 comparisons and
 217    16 branches.  With this algorithm one performs 24 arithmetic
 218    operations and one branch.  Whether this is faster with a 32-bit
 219    word size is going to be somewhat system dependent.
 220
 221    For 64-bit words, we eliminate twice the number of comparisons
 222    and branches without increasing the number of arithmetic operations.
 223    It's almost certainly going to be a win with 64-bit word size.
        
        Returns a pointer to the first \n, \r, \\ or ? at or after S.  END
        is not consulted; the loop has no other exit, so it relies on one
        of those characters being present.  */
 224
 225 static const uchar * search_line_acc_char (const uchar *, const uchar *)
 226   ATTRIBUTE_UNUSED;
 227
 228 static const uchar *
 229 search_line_acc_char (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 230 {
 231   const word_type repl_nl = acc_char_replicate ('\n');
 232   const word_type repl_cr = acc_char_replicate ('\r');
 233   const word_type repl_bs = acc_char_replicate ('\\');
 234   const word_type repl_qm = acc_char_replicate ('?');
 235
 236   unsigned int misalign;
 237   const word_type *p;
 238   word_type val, t;
 239
 240   /* Align the buffer.  Mask out any bytes from before the beginning.  */
       /* The first load covers the whole aligned word containing S, so it
          may include up to sizeof (word_type) - 1 bytes that precede S.  */
 241   p = (word_type *)((uintptr_t)s & -sizeof(word_type));
 242   val = *p;
 243   misalign = (uintptr_t)s & (sizeof(word_type) - 1);
 244   if (misalign)
 245     val = acc_char_mask_misalign (val, misalign);
 246
 247   /* Main loop.  */
 248   while (1)
 249     {
           /* T is non-zero if VAL probably contains one of the four
              characters.  */
 250       t  = acc_char_cmp (val, repl_nl);
 251       t |= acc_char_cmp (val, repl_cr);
 252       t |= acc_char_cmp (val, repl_bs);
 253       t |= acc_char_cmp (val, repl_qm);
 254
 255       if (__builtin_expect (t != 0, 0))
 256         {
               /* Confirm the hit; a negative index means it was a false
                  positive and scanning continues.  */
 257           int i = acc_char_index (t, val);
 258           if (i >= 0)
 259             return (const uchar *)p + i;
 260         }
 261
 262       val = *++p;
 263     }
 264 }
 265
 266 /* Disable on Solaris 2/x86 until the following problem can be properly
 267    autoconfed:
 268
 269    The Solaris 10+ assembler tags objects with the instruction set
 270    extensions used, so SSE4.2 executables cannot run on machines that
 271    don't support that extension.  */
 272
 273 #if (GCC_VERSION >= 4005) && (__GNUC__ >= 5 || !defined(__PIC__)) && (defined(__i386__) || defined(__x86_64__)) && !(defined(__sun__) && defined(__svr4__))
 274
 275 /* Replicated character data to be shared between implementations.
 276    Recall that outside of a context with vector support we can't
 277    define compatible vector types, therefore these are all defined
 278    in terms of raw characters.  Rows are, in order: newline, carriage
        return, backslash and question mark.  The table is 16-byte aligned
        and each row is 16 bytes, so a row can be read as one 8-byte or
        16-byte vector.  */
 279 static const char repl_chars[4][16] __attribute__((aligned(16))) = {
 280   { '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
 281     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n' },
 282   { '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
 283     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r' },
 284   { '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
 285     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\' },
 286   { '?', '?', '?', '?', '?', '?', '?', '?',
 287     '?', '?', '?', '?', '?', '?', '?', '?' },
 288 };
 289
 290 /* A version of the fast scanner using MMX vectorized byte compare insns.
 291
 292    This uses the PMOVMSKB instruction which was introduced with "MMX2",
 293    which was packaged into SSE1; it is also present in the AMD MMX
 294    extension.  Mark the function as using "sse" so that we emit a real
 295    "emms" instruction, rather than the 3dNOW "femms" instruction.
        
        Returns a pointer to the first \n, \r, \\ or ? at or after S.  END
        is unused.  */
 296
 297 static const uchar *
 298 #ifndef __SSE__
 299 __attribute__((__target__("sse")))
 300 #endif
 301 search_line_mmx (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 302 {
 303   typedef char v8qi __attribute__ ((__vector_size__ (8)));
 304   typedef int __m64 __attribute__ ((__vector_size__ (8), __may_alias__));
 305
       /* Only the first 8 bytes of each 16-byte repl_chars row are used.  */
 306   const v8qi repl_nl = *(const v8qi *)repl_chars[0];
 307   const v8qi repl_cr = *(const v8qi *)repl_chars[1];
 308   const v8qi repl_bs = *(const v8qi *)repl_chars[2];
 309   const v8qi repl_qm = *(const v8qi *)repl_chars[3];
 310
 311   unsigned int misalign, found, mask;
 312   const v8qi *p;
 313   v8qi data, t, c;
 314
 315   /* Align the source pointer.  While MMX doesn't generate unaligned data
 316      faults, this allows us to safely scan to the end of the buffer without
 317      reading beyond the end of the last page.  */
 318   misalign = (uintptr_t)s & 7;
 319   p = (const v8qi *)((uintptr_t)s & -8);
 320   data = *p;
 321
 322   /* Create a mask for the bytes that are valid within the first
 323      8-byte block.  The Idea here is that the AND with the mask
 324      within the loop is "free", since we need some AND or TEST
 325      insn in order to set the flags for the branch anyway.  */
 326   mask = -1u << misalign;
 327
 328   /* Main loop processing 8 bytes at a time.  */
       /* Jump into the middle of the loop so that the block already
          loaded above is tested, with its partial mask, before P is
          advanced.  */
 329   goto start;
 330   do
 331     {
 332       data = *++p;
 333       mask = -1;
 334
 335     start:
           /* OR together the byte-wise equality results for the four
              characters, then collapse them to one bit per byte.  */
 336       t = __builtin_ia32_pcmpeqb(data, repl_nl);
 337       c = __builtin_ia32_pcmpeqb(data, repl_cr);
 338       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
 339       c = __builtin_ia32_pcmpeqb(data, repl_bs);
 340       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
 341       c = __builtin_ia32_pcmpeqb(data, repl_qm);
 342       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
 343       found = __builtin_ia32_pmovmskb (t);
 344       found &= mask;
 345     }
 346   while (!found);
 347
 348   __builtin_ia32_emms ();
 349
 350   /* FOUND contains 1 in bits for which we matched a relevant
 351      character.  Conversion to the byte index is trivial.  */
 352   found = __builtin_ctz(found);
 353   return (const uchar *)p + found;
 354 }
 355
 356 /* A version of the fast scanner using SSE2 vectorized byte compare insns.
        Returns a pointer to the first \n, \r, \\ or ? at or after S.  END
        is unused.  */
 357
 358 static const uchar *
 359 #ifndef __SSE2__
 360 __attribute__((__target__("sse2")))
 361 #endif
 362 search_line_sse2 (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 363 {
 364   typedef char v16qi __attribute__ ((__vector_size__ (16)));
 365
 366   const v16qi repl_nl = *(const v16qi *)repl_chars[0];
 367   const v16qi repl_cr = *(const v16qi *)repl_chars[1];
 368   const v16qi repl_bs = *(const v16qi *)repl_chars[2];
 369   const v16qi repl_qm = *(const v16qi *)repl_chars[3];
 370
 371   unsigned int misalign, found, mask;
 372   const v16qi *p;
 373   v16qi data, t;
 374
 375   /* Align the source pointer.  */
       /* The first load therefore covers the aligned 16-byte block that
          contains S, including up to 15 bytes in front of it.  */
 376   misalign = (uintptr_t)s & 15;
 377   p = (const v16qi *)((uintptr_t)s & -16);
 378   data = *p;
 379
 380   /* Create a mask for the bytes that are valid within the first
 381      16-byte block.  The Idea here is that the AND with the mask
 382      within the loop is "free", since we need some AND or TEST
 383      insn in order to set the flags for the branch anyway.  */
 384   mask = -1u << misalign;
 385
 386   /* Main loop processing 16 bytes at a time.  */
       /* Enter in the middle of the loop so the block loaded above is
          tested, with its partial mask, before P is advanced.  */
 387   goto start;
 388   do
 389     {
 390       data = *++p;
 391       mask = -1;
 392
 393     start:
           /* Vector comparisons give a per-byte result; PMOVMSKB then
              gathers one bit per byte into FOUND.  */
 394       t  = data == repl_nl;
 395       t |= data == repl_cr;
 396       t |= data == repl_bs;
 397       t |= data == repl_qm;
 398       found = __builtin_ia32_pmovmskb128 (t);
 399       found &= mask;
 400     }
 401   while (!found);
 402
 403   /* FOUND contains 1 in bits for which we matched a relevant
 404      character.  Conversion to the byte index is trivial.  */
 405   found = __builtin_ctz(found);
 406   return (const uchar *)p + found;
 407 }
 408
 409 #ifdef HAVE_SSE4
 410 /* A version of the fast scanner using SSE 4.2 vectorized string insns.
        Returns a pointer to the first \n, \r, \\ or ? at or after S.  END
        is only used to decide whether an unaligned 16-byte read at S is
        safe.  */
 411
 412 static const uchar *
 413 #ifndef __SSE4_2__
 414 __attribute__((__target__("sse4.2")))
 415 #endif
 416 search_line_sse42 (const uchar *s, const uchar *end)
 417 {
 418   typedef char v16qi __attribute__ ((__vector_size__ (16)));
       /* The set of characters to look for; only the first four elements
          are meaningful, matching the length of 4 passed to PCMPESTRI.  */
 419   static const v16qi search = { '\n', '\r', '?', '\\' };
 420
 421   uintptr_t si = (uintptr_t)s;
 422   uintptr_t index;
 423
 424   /* Check for unaligned input.  */
 425   if (si & 15)
 426     {
 427       v16qi sv;
 428
           /* The 0xfff / 0xff0 constants test the offset of S within a
              4096-byte page.  */
 429       if (__builtin_expect (end - s < 16, 0)
 430           && __builtin_expect ((si & 0xfff) > 0xff0, 0))
 431         {
 432           /* There are less than 16 bytes left in the buffer, and less
 433              than 16 bytes left on the page.  Reading 16 bytes at this
 434              point might generate a spurious page fault.  Defer to the
 435              SSE2 implementation, which already handles alignment.  */
 436           return search_line_sse2 (s, end);
 437         }
 438
 439       /* ??? The builtin doesn't understand that the PCMPESTRI read from
 440          memory need not be aligned.  */
           /* Control byte 0 requests an unsigned-byte "equal any" search
              that reports the lowest matching index; an index of 16 is
              treated as "no match" below.  */
 441       sv = __builtin_ia32_loaddqu ((const char *) s);
 442       index = __builtin_ia32_pcmpestri128 (search, 4, sv, 16, 0);
 443
 444       if (__builtin_expect (index < 16, 0))
 445         goto found;
 446
 447       /* Advance the pointer to an aligned address.  We will re-scan a
 448          few bytes, but we no longer need care for reading past the
 449          end of a page, since we're guaranteed a match.  */
 450       s = (const uchar *)((si + 15) & -16);
 451     }
 452
 453   /* Main loop, processing 16 bytes at a time.  */
 454 #ifdef __GCC_ASM_FLAG_OUTPUTS__
 455   while (1)
 456     {
 457       char f;
 458
 459       /* By using inline assembly instead of the builtin,
 460          we can use the result, as well as the flags set.  */
           /* INDEX is produced in the "c" register and F receives the
              carry flag; the "a" and "d" inputs supply the two lengths
              (4 search characters, 16 data bytes).  */
 461       __asm ("%vpcmpestri\t$0, %2, %3"
 462              : "=c"(index), "=@ccc"(f)
 463              : "m"(*s), "x"(search), "a"(4), "d"(16));
 464       if (f)
 465         break;
 466
 467       s += 16;
 468     }
 469 #else
       /* The assembly loop adds 16 before each comparison, so bias S
          downwards first.  */
 470   s -= 16;
 471   /* By doing the whole loop in inline assembly,
 472      we can make proper use of the flags set.  */
 473   __asm (      ".balign 16\n"
 474         "0:     add $16, %1\n"
 475         "       %vpcmpestri\t$0, (%1), %2\n"
 476         "       jnc 0b"
 477         : "=&c"(index), "+r"(s)
 478         : "x"(search), "a"(4), "d"(16));
 479 #endif
 480
 481  found:
       /* INDEX is the offset of the match within the 16 bytes at S.  */
 482   return s + index;
 483 }
 484
 485 #else
 486 /* Work around out-dated assemblers without sse4 support.  */
 487 #define search_line_sse42 search_line_sse2
 488 #endif
 489
 490 /* Check the CPU capabilities.  */
 491
 492 #include "../gcc/config/i386/cpuid.h"
 493
     /* Signature shared by all scanner implementations, and the pointer
        through which the one selected by init_vectorized_lexer is
        called.  */
 494 typedef const uchar * (*search_line_fast_type) (const uchar *, const uchar *);
 495 static search_line_fast_type search_line_fast;
 496
     /* Choose the implementation stored in search_line_fast.  MINIMUM is
        the instruction-set level already guaranteed by the compile-time
        target macros (3 = SSE4.2, 2 = SSE2, 1 = SSE, 0 = none); CPUID is
        queried for anything beyond that.  Without any usable extension the
        generic search_line_acc_char is kept.  */
 497 #define HAVE_init_vectorized_lexer 1
 498 static inline void
 499 init_vectorized_lexer (void)
 500 {
 501   unsigned dummy, ecx = 0, edx = 0;
 502   search_line_fast_type impl = search_line_acc_char;
 503   int minimum = 0;
 504
 505 #if defined(__SSE4_2__)
 506   minimum = 3;
 507 #elif defined(__SSE2__)
 508   minimum = 2;
 509 #elif defined(__SSE__)
 510   minimum = 1;
 511 #endif
 512
 513   if (minimum == 3)
 514     impl = search_line_sse42;
       /* With MINIMUM == 2 this branch is taken even if the CPUID query
          fails; ECX and EDX then keep their zero initializers and the
          SSE2 version is chosen.  */
 515   else if (__get_cpuid (1, &dummy, &dummy, &ecx, &edx) || minimum == 2)
 516     {
           /* MINIMUM == 3 was already handled above, so only the feature
              bit can select SSE4.2 here.  */
 517       if (minimum == 3 || (ecx & bit_SSE4_2))
 518         impl = search_line_sse42;
 519       else if (minimum == 2 || (edx & bit_SSE2))
 520         impl = search_line_sse2;
 521       else if (minimum == 1 || (edx & bit_SSE))
 522         impl = search_line_mmx;
 523     }
       /* Leaf 1 unavailable: consult extended leaf 0x80000001 for the
          MMXEXT and CMOV bits before using the MMX version.  */
 524   else if (__get_cpuid (0x80000001, &dummy, &dummy, &dummy, &edx))
 525     {
 526       if (minimum == 1
 527           || (edx & (bit_MMXEXT | bit_CMOV)) == (bit_MMXEXT | bit_CMOV))
 528         impl = search_line_mmx;
 529     }
 530
 531   search_line_fast = impl;
 532 }
 533
 534 #elif (GCC_VERSION >= 4005) && defined(_ARCH_PWR8) && defined(__ALTIVEC__)
 535
 536 /* A version of the fast scanner using AltiVec vectorized byte compares
 537    and VSX unaligned loads (when VSX is available).  This is otherwise
 538    the same as the AltiVec version.  Returns a pointer to the first
        \n, \r, \\ or ? at or after S.  END is unused.  */
 539
     /* NOTE(review): the sanitizer is presumably disabled because the loads
        below use S without aligning it -- confirm.  */
 540 ATTRIBUTE_NO_SANITIZE_UNDEFINED
 541 static const uchar *
 542 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 543 {
 544   typedef __attribute__((altivec(vector))) unsigned char vc;
 545
 546   const vc repl_nl = {
 547     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
 548     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
 549   };
 550   const vc repl_cr = {
 551     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
 552     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
 553   };
 554   const vc repl_bs = {
 555     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
 556     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
 557   };
 558   const vc repl_qm = {
 559     '?', '?', '?', '?', '?', '?', '?', '?',
 560     '?', '?', '?', '?', '?', '?', '?', '?',
 561   };
 562   const vc zero = { 0 };
 563
 564   vc data, t;
 565
 566   /* Main loop processing 16 bytes at a time.  */
 567   do
 568     {
 569       vc m_nl, m_cr, m_bs, m_qm;
 570
           /* Load the 16 bytes starting exactly at S; no alignment or
              masking step is needed in this variant.  */
 571       data = __builtin_vec_vsx_ld (0, s);
 572       s += 16;
 573
 574       m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
 575       m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
 576       m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
 577       m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
 578       t = (m_nl | m_cr) | (m_bs | m_qm);
 579
 580       /* T now contains 0xff in bytes for which we matched one of the relevant
 581          characters.  We want to exit the loop if any byte in T is non-zero.
 582          Below is the expansion of vec_any_ne(t, zero).  */
 583     }
 584   while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));
 585
 586   /* Restore s to point to the 16 bytes we just processed.  */
 587   s -= 16;
 588
 589   {
       /* Number of unsigned longs that make up one vector.  */
 590 #define N  (sizeof(vc) / sizeof(long))
 591
 592     union {
 593       vc v;
 594       /* Statically assert that N is 2 or 4.  */
 595       unsigned long l[(N == 2 || N == 4) ? N : -1];
 596     } u;
 597     unsigned long l, i = 0;
 598
 599     u.v = t;
 600
 601     /* Find the first word of T that is non-zero.  */
         /* S is advanced past every all-zero word that is skipped.  The
            last word is not tested because the loop above only exits
            when T is non-zero.  */
 602     switch (N)
 603       {
 604       case 4:
 605         l = u.l[i++];
 606         if (l != 0)
 607           break;
 608         s += sizeof(unsigned long);
 609         l = u.l[i++];
 610         if (l != 0)
 611           break;
 612         s += sizeof(unsigned long);
 613         /* FALLTHRU */
 614       case 2:
 615         l = u.l[i++];
 616         if (l != 0)
 617           break;
 618         s += sizeof(unsigned long);
 619         l = u.l[i];
 620       }
 621
 622     /* L now contains 0xff in bytes for which we matched one of the
 623        relevant characters.  We can find the byte index by finding
 624        its bit index and dividing by 8.  */
         /* The lowest-addressed byte is the most significant one on
            big-endian targets and the least significant one otherwise,
            hence leading versus trailing zero counts.  */
 625 #ifdef __BIG_ENDIAN__
 626     l = __builtin_clzl(l) >> 3;
 627 #else
 628     l = __builtin_ctzl(l) >> 3;
 629 #endif
 630     return s + l;
 631
 632 #undef N
 633   }
 634 }
 635
 636 #elif (GCC_VERSION >= 4005) && defined(__ALTIVEC__) && defined (__BIG_ENDIAN__)
 637
 638 /* A version of the fast scanner using AltiVec vectorized byte compares.
 639    This cannot be used for little endian because vec_lvsl/lvsr are
 640    deprecated for little endian and the code won't work properly.
        Returns a pointer to the first \n, \r, \\ or ? at or after S.  END
        is unused.  */
 641 /* ??? Unfortunately, attribute(target("altivec")) is not yet supported,
 642    so we can't compile this function without -maltivec on the command line
 643    (or implied by some other switch).  */
 644
 645 static const uchar *
 646 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 647 {
 648   typedef __attribute__((altivec(vector))) unsigned char vc;
 649
 650   const vc repl_nl = {
 651     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
 652     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
 653   };
 654   const vc repl_cr = {
 655     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
 656     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
 657   };
 658   const vc repl_bs = {
 659     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
 660     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
 661   };
 662   const vc repl_qm = {
 663     '?', '?', '?', '?', '?', '?', '?', '?',
 664     '?', '?', '?', '?', '?', '?', '?', '?',
 665   };
 666   const vc ones = {
 667     -1, -1, -1, -1, -1, -1, -1, -1,
 668     -1, -1, -1, -1, -1, -1, -1, -1,
 669   };
 670   const vc zero = { 0 };
 671
 672   vc data, mask, t;
 673
 674   /* Altivec loads automatically mask addresses with -16.  This lets us
 675      issue the first load as early as possible.  */
 676   data = __builtin_vec_ld(0, (const vc *)s);
 677
 678   /* Discard bytes before the beginning of the buffer.  Do this by
 679      beginning with all ones and shifting in zeros according to the
 680      mis-alignment.  The LVSR instruction pulls the exact shift we
 681      want from the address.  */
 682   mask = __builtin_vec_lvsr(0, s);
 683   mask = __builtin_vec_perm(zero, ones, mask);
 684   data &= mask;
 685
 686   /* While altivec loads mask addresses, we still need to align S so
 687      that the offset we compute at the end is correct.  */
 688   s = (const uchar *)((uintptr_t)s & -16);
 689
 690   /* Main loop processing 16 bytes at a time.  */
       /* Enter in the middle of the loop so the masked block loaded above
          is tested before S is advanced.  */
 691   goto start;
 692   do
 693     {
 694       vc m_nl, m_cr, m_bs, m_qm;
 695
 696       s += 16;
 697       data = __builtin_vec_ld(0, (const vc *)s);
 698
 699     start:
 700       m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
 701       m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
 702       m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
 703       m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
 704       t = (m_nl | m_cr) | (m_bs | m_qm);
 705
 706       /* T now contains 0xff in bytes for which we matched one of the relevant
 707          characters.  We want to exit the loop if any byte in T is non-zero.
 708          Below is the expansion of vec_any_ne(t, zero).  */
 709     }
 710   while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));
 711
 712   {
       /* Number of unsigned longs that make up one vector.  */
 713 #define N  (sizeof(vc) / sizeof(long))
 714
 715     union {
 716       vc v;
 717       /* Statically assert that N is 2 or 4.  */
 718       unsigned long l[(N == 2 || N == 4) ? N : -1];
 719     } u;
 720     unsigned long l, i = 0;
 721
 722     u.v = t;
 723
 724     /* Find the first word of T that is non-zero.  */
         /* S is advanced past every all-zero word that is skipped.  The
            last word is not tested because the loop above only exits
            when T is non-zero.  */
 725     switch (N)
 726       {
 727       case 4:
 728         l = u.l[i++];
 729         if (l != 0)
 730           break;
 731         s += sizeof(unsigned long);
 732         l = u.l[i++];
 733         if (l != 0)
 734           break;
 735         s += sizeof(unsigned long);
 736         /* FALLTHROUGH */
 737       case 2:
 738         l = u.l[i++];
 739         if (l != 0)
 740           break;
 741         s += sizeof(unsigned long);
 742         l = u.l[i];
 743       }
 744
 745     /* L now contains 0xff in bytes for which we matched one of the
 746        relevant characters.  We can find the byte index by finding
 747        its bit index and dividing by 8.  */
         /* This variant is big-endian only, so the lowest-addressed byte
            is the most significant one: count leading zeros.  */
 748     l = __builtin_clzl(l) >> 3;
 749     return s + l;
 750
 751 #undef N
 752   }
 753 }
 754
 755 #elif defined (__ARM_NEON) && defined (__ARM_64BIT_STATE)
 756 #include "arm_neon.h"
 757
 758 /* This doesn't have to be the exact page size, but no system may use
 759    a size smaller than this.  ARMv8 requires a minimum page size of
 760    4k.  The impact of being conservative here is a small number of
 761    cases will take the slightly slower entry path into the main
 762    loop.  */
 763
 764 #define AARCH64_MIN_PAGE_SIZE 4096
 765
     /* AArch64 Advanced SIMD version of the fast scanner.  Returns a
        pointer to the first \n, \r, \\ or ? at or after S.  END is
        unused.  */
 766 static const uchar *
 767 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 768 {
 769   const uint8x16_t repl_nl = vdupq_n_u8 ('\n');
 770   const uint8x16_t repl_cr = vdupq_n_u8 ('\r');
 771   const uint8x16_t repl_bs = vdupq_n_u8 ('\\');
 772   const uint8x16_t repl_qm = vdupq_n_u8 ('?');
       /* XMASK gives each byte within an 8-byte half its own bit, so that
          matches can later be summed into a bit mask.  */
 773   const uint8x16_t xmask = (uint8x16_t) vdupq_n_u64 (0x8040201008040201ULL);
 774
       /* Per-lane shift counts used to move the bits of one 8-byte half
          eight positions up before the halves are summed.  */
 775 #ifdef __ARM_BIG_ENDIAN
 776   const int16x8_t shift = {8, 8, 8, 8, 0, 0, 0, 0};
 777 #else
 778   const int16x8_t shift = {0, 0, 0, 0, 8, 8, 8, 8};
 779 #endif
 780
 781   unsigned int found;
 782   const uint8_t *p;
 783   uint8x16_t data;
 784   uint8x16_t t;
 785   uint16x8_t m;
 786   uint8x16_t u, v, w;
 787
 788   /* Align the source pointer.  */
 789   p = (const uint8_t *)((uintptr_t)s & -16);
 790
 791   /* Assuming random string start positions, with a 4k page size we'll take
 792      the slow path about 0.37% of the time.  */
       /* The test is true when fewer than 16 bytes separate S from the
          next AARCH64_MIN_PAGE_SIZE boundary, i.e. when a 16-byte load
          starting at S could run into the following page.  */
 793   if (__builtin_expect ((AARCH64_MIN_PAGE_SIZE
 794                          - (((uintptr_t) s) & (AARCH64_MIN_PAGE_SIZE - 1)))
 795                         < 16, 0))
 796     {
 797       /* Slow path: the string starts near a possible page boundary.  */
           /* Load the aligned block containing S and ignore matches in
              the MISALIGN bytes that precede S.  */
 798       uint32_t misalign, mask;
 799
 800       misalign = (uintptr_t)s & 15;
 801       mask = (-1u << misalign) & 0xffff;
 802       data = vld1q_u8 (p);
 803       t = vceqq_u8 (data, repl_nl);
 804       u = vceqq_u8 (data, repl_cr);
 805       v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
 806       w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
 807       t = vorrq_u8 (v, w);
 808       t = vandq_u8 (t, xmask);
 809       m = vpaddlq_u8 (t);
 810       m = vshlq_u16 (m, shift);
 811       found = vaddvq_u16 (m);
 812       found &= mask;
 813       if (found)
 814         return (const uchar*)p + __builtin_ctz (found);
 815     }
 816   else
 817     {
           /* Fast path: load 16 bytes directly from S, aligned or not.  */
 818       data = vld1q_u8 ((const uint8_t *) s);
 819       t = vceqq_u8 (data, repl_nl);
 820       u = vceqq_u8 (data, repl_cr);
 821       v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
 822       w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
 823       t = vorrq_u8 (v, w);
 824       if (__builtin_expect (vpaddd_u64 ((uint64x2_t)t) != 0, 0))
 825         goto done;
 826     }
 827
       /* No match in the first block.  Continue with aligned loads from
          the block after P; following the fast path this may look again
          at some bytes that were already examined.  */
 828   do
 829     {
 830       p += 16;
 831       data = vld1q_u8 (p);
 832       t = vceqq_u8 (data, repl_nl);
 833       u = vceqq_u8 (data, repl_cr);
 834       v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
 835       w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
 836       t = vorrq_u8 (v, w);
 837     } while (!vpaddd_u64 ((uint64x2_t)t));
 838
 839 done:
 840   /* Now that we've found the terminating substring, work out precisely where
 841      we need to stop.  */
 842   t = vandq_u8 (t, xmask);
 843   m = vpaddlq_u8 (t);
 844   m = vshlq_u16 (m, shift);
 845   found = vaddvq_u16 (m);
       /* P is still below S only when the match came from the fast-path
          load at S itself; the offset is then relative to S rather than
          to P.  */
 846   return (((((uintptr_t) p) < (uintptr_t) s) ? s : (const uchar *)p)
 847           + __builtin_ctz (found));
 848 }
 849
 850 #elif defined (__ARM_NEON)
 851 #include "arm_neon.h"
 852
     /* NEON version of the fast scanner for targets not covered by the
        64-bit variant.  Returns a pointer to the first \n, \r, \\ or ? at
        or after S.  END is unused.  */
 853 static const uchar *
 854 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 855 {
 856   const uint8x16_t repl_nl = vdupq_n_u8 ('\n');
 857   const uint8x16_t repl_cr = vdupq_n_u8 ('\r');
 858   const uint8x16_t repl_bs = vdupq_n_u8 ('\\');
 859   const uint8x16_t repl_qm = vdupq_n_u8 ('?');
       /* XMASK gives each byte within an 8-byte half its own bit, so that
          matches can later be summed into a bit mask.  */
 860   const uint8x16_t xmask = (uint8x16_t) vdupq_n_u64 (0x8040201008040201ULL);
 861
 862   unsigned int misalign, found, mask;
 863   const uint8_t *p;
 864   uint8x16_t data;
 865
 866   /* Align the source pointer.  */
 867   misalign = (uintptr_t)s & 15;
 868   p = (const uint8_t *)((uintptr_t)s & -16);
 869   data = vld1q_u8 (p);
 870
 871   /* Create a mask for the bytes that are valid within the first
 872      16-byte block.  The Idea here is that the AND with the mask
 873      within the loop is "free", since we need some AND or TEST
 874      insn in order to set the flags for the branch anyway.  */
 875   mask = (-1u << misalign) & 0xffff;
 876
 877   /* Main loop, processing 16 bytes at a time.  */
       /* Enter in the middle of the loop so the block loaded above is
          tested, with its partial mask, before P is advanced.  */
 878   goto start;
 879
 880   do
 881     {
 882       uint8x8_t l;
 883       uint16x4_t m;
 884       uint32x2_t n;
 885       uint8x16_t t, u, v, w;
 886
 887       p += 16;
 888       data = vld1q_u8 (p);
 889       mask = 0xffff;
 890
 891     start:
 892       t = vceqq_u8 (data, repl_nl);
 893       u = vceqq_u8 (data, repl_cr);
 894       v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
 895       w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
 896       t = vandq_u8 (vorrq_u8 (v, w), xmask);
           /* Successive pairwise additions reduce T to two 32-bit lanes:
              lane 0 holds the match bits of the low eight bytes, lane 1
              those of the high eight bytes.  */
 897       l = vpadd_u8 (vget_low_u8 (t), vget_high_u8 (t));
 898       m = vpaddl_u8 (l);
 899       n = vpaddl_u16 (m);
 900
           /* Viewed as one 64-bit value, shifting right by 24 places the
              high-half bits at bit 8 and up of lane 0, giving one bit per
              byte once ORed with the low-half bits.  */
 901       found = vget_lane_u32 ((uint32x2_t) vorr_u64 ((uint64x1_t) n,
 902               vshr_n_u64 ((uint64x1_t) n, 24)), 0);
 903       found &= mask;
 904     }
 905   while (!found);
 906
 907   /* FOUND contains 1 in bits for which we matched a relevant
 908      character.  Conversion to the byte index is trivial.  */
 909   found = __builtin_ctz (found);
 910   return (const uchar *)p + found;
 911 }
 912
 913 #else
 914
 915 /* We only have one accelerated alternative.  Use a direct call so that
 916    we encourage inlining.  */
 917
 918 #define search_line_fast  search_line_acc_char
 919
 920 #endif
 921
 922 /* Initialize the lexer if needed.  */
 923
 924 void
 925 _cpp_init_lexer (void)
 926 {
 927 #ifdef HAVE_init_vectorized_lexer
 928   init_vectorized_lexer ();
 929 #endif
 930 }
 931
 932 /* Returns with a logical line that contains no escaped newlines or
 933    trigraphs.  This is a time-critical inner loop.  */
 934 void
 935 _cpp_clean_line (cpp_reader *pfile)
 936 {
 937   cpp_buffer *buffer;
 938   const uchar *s;
 939   uchar c, *d, *p;
 940
 941   buffer = pfile->buffer;
 942   buffer->cur_note = buffer->notes_used = 0;
 943   buffer->cur = buffer->line_base = buffer->next_line;
 944   buffer->need_line = false;
 945   s = buffer->next_line;
 946
 947   if (!buffer->from_stage3)
 948     {
 949       const uchar *pbackslash = NULL;
 950
 951       /* Fast path.  This is the common case of an un-escaped line with
 952          no trigraphs.  The primary win here is by not writing any
 953          data back to memory until we have to.  */
 954       while (1)
 955         {
 956           /* Perform an optimized search for \n, \r, \\, ?.  */
 957           s = search_line_fast (s, buffer->rlimit);
 958
 959           c = *s;
 960           if (c == '\\')
 961             {
 962               /* Record the location of the backslash and continue.  */
 963               pbackslash = s++;
 964             }
 965           else if (__builtin_expect (c == '?', 0))
 966             {
 967               if (__builtin_expect (s[1] == '?', false)
 968                    && _cpp_trigraph_map[s[2]])
 969                 {
 970                   /* Have a trigraph.  We may or may not have to convert
 971                      it.  Add a line note regardless, for -Wtrigraphs.  */
 972                   add_line_note (buffer, s, s[2]);
 973                   if (CPP_OPTION (pfile, trigraphs))
 974                     {
 975                       /* We do, and that means we have to switch to the
 976                          slow path.  */
 977                       d = (uchar *) s;
 978                       *d = _cpp_trigraph_map[s[2]];
 979                       s += 2;
 980                       goto slow_path;
 981                     }
 982                 }
 983               /* Not a trigraph.  Continue on fast-path.  */
 984               s++;
 985             }
 986           else
 987             break;
 988         }
 989
 990       /* This must be \r or \n.  We're either done, or we'll be forced
 991          to write back to the buffer and continue on the slow path.  */
 992       d = (uchar *) s;
 993
 994       if (__builtin_expect (s == buffer->rlimit, false))
 995         goto done;
 996
 997       /* DOS line ending? */
 998       if (__builtin_expect (c == '\r', false) && s[1] == '\n')
 999         {
1000           s++;
1001           if (s == buffer->rlimit)
1002             goto done;
1003         }
1004
1005       if (__builtin_expect (pbackslash == NULL, true))
1006         goto done;
1007
1008       /* Check for escaped newline.  */
1009       p = d;
1010       while (is_nvspace (p[-1]))
1011         p--;
1012       if (p - 1 != pbackslash)
1013         goto done;
1014
1015       /* Have an escaped newline; process it and proceed to
1016          the slow path.  */
1017       add_line_note (buffer, p - 1, p != d ? ' ' : '\\');
1018       d = p - 2;
1019       buffer->next_line = p - 1;
1020
1021     slow_path:
1022       while (1)
1023         {
1024           c = *++s;
1025           *++d = c;
1026
1027           if (c == '\n' || c == '\r')
1028             {
1029               /* Handle DOS line endings.  */
1030               if (c == '\r' && s != buffer->rlimit && s[1] == '\n')
1031                 s++;
1032               if (s == buffer->rlimit)
1033                 break;
1034
1035               /* Escaped?  */
1036               p = d;
1037               while (p != buffer->next_line && is_nvspace (p[-1]))
1038                 p--;
1039               if (p == buffer->next_line || p[-1] != '\\')
1040                 break;
1041
1042               add_line_note (buffer, p - 1, p != d ? ' ': '\\');
1043               d = p - 2;
1044               buffer->next_line = p - 1;
1045             }
1046           else if (c == '?' && s[1] == '?' && _cpp_trigraph_map[s[2]])
1047             {
1048               /* Add a note regardless, for the benefit of -Wtrigraphs.  */
1049               add_line_note (buffer, d, s[2]);
1050               if (CPP_OPTION (pfile, trigraphs))
1051                 {
1052                   *d = _cpp_trigraph_map[s[2]];
1053                   s += 2;
1054                 }
1055             }
1056         }
1057     }
1058   else
1059     {
1060       while (*s != '\n' && *s != '\r')
1061         s++;
1062       d = (uchar *) s;
1063
1064       /* Handle DOS line endings.  */
1065       if (*s == '\r' && s + 1 != buffer->rlimit && s[1] == '\n')
1066         s++;
1067     }
1068
1069  done:
1070   *d = '\n';
1071   /* A sentinel note that should never be processed.  */
1072   add_line_note (buffer, d + 1, '\n');
1073   buffer->next_line = s + 1;
1074 }
1075
1076 /* Return true if the trigraph indicated by NOTE should be warned
1077    about in a comment.  */
1078 static bool
1079 warn_in_comment (cpp_reader *pfile, _cpp_line_note *note)
1080 {
1081   const uchar *p;
1082
1083   /* Within comments we don't warn about trigraphs, unless the
1084      trigraph forms an escaped newline, as that may change
1085      behavior.  */
1086   if (note->type != '/')
1087     return false;
1088
1089   /* If -trigraphs, then this was an escaped newline iff the next note
1090      is coincident.  */
1091   if (CPP_OPTION (pfile, trigraphs))
1092     return note[1].pos == note->pos;
1093
1094   /* Otherwise, see if this forms an escaped newline.  */
1095   p = note->pos + 3;
1096   while (is_nvspace (*p))
1097     p++;
1098
1099   /* There might have been escaped newlines between the trigraph and the
1100      newline we found.  Hence the position test.  */
1101   return (*p == '\n' && p < note[1].pos);
1102 }
1103
1104 /* Process the notes created by add_line_note as far as the current
1105    location.  */
1106 void
1107 _cpp_process_line_notes (cpp_reader *pfile, int in_comment)
1108 {
1109   cpp_buffer *buffer = pfile->buffer;
1110
1111   for (;;)
1112     {
1113       _cpp_line_note *note = &buffer->notes[buffer->cur_note];
1114       unsigned int col;
1115
1116       if (note->pos > buffer->cur)
1117         break;
1118
1119       buffer->cur_note++;
1120       col = CPP_BUF_COLUMN (buffer, note->pos + 1);
1121
1122       if (note->type == '\\' || note->type == ' ')
1123         {
1124           if (note->type == ' ' && !in_comment)
1125             cpp_error_with_line (pfile, CPP_DL_WARNING, pfile->line_table->highest_line, col,
1126                                  "backslash and newline separated by space");
1127
1128           if (buffer->next_line > buffer->rlimit)
1129             {
1130               cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line, col,
1131                                    "backslash-newline at end of file");
1132               /* Prevent "no newline at end of file" warning.  */
1133               buffer->next_line = buffer->rlimit;
1134             }
1135
1136           buffer->line_base = note->pos;
1137           CPP_INCREMENT_LINE (pfile, 0);
1138         }
1139       else if (_cpp_trigraph_map[note->type])
1140         {
1141           if (CPP_OPTION (pfile, warn_trigraphs)
1142               && (!in_comment || warn_in_comment (pfile, note)))
1143             {
1144               if (CPP_OPTION (pfile, trigraphs))
1145                 cpp_warning_with_line (pfile, CPP_W_TRIGRAPHS,
1146                                        pfile->line_table->highest_line, col,
1147                                        "trigraph ??%c converted to %c",
1148                                        note->type,
1149                                        (int) _cpp_trigraph_map[note->type]);
1150               else
1151                 {
1152                   cpp_warning_with_line
1153                     (pfile, CPP_W_TRIGRAPHS,
1154                      pfile->line_table->highest_line, col,
1155                      "trigraph ??%c ignored, use -trigraphs to enable",
1156                      note->type);
1157                 }
1158             }
1159         }
1160       else if (note->type == 0)
1161         /* Already processed in lex_raw_string.  */;
1162       else
1163         abort ();
1164     }
1165 }
1166
1167 /* Skip a C-style block comment.  We find the end of the comment by
1168    seeing if an asterisk is before every '/' we encounter.  Returns
1169    nonzero if comment terminated by EOF, zero otherwise.
1170
1171    Buffer->cur points to the initial asterisk of the comment.  */
1172 bool
1173 _cpp_skip_block_comment (cpp_reader *pfile)
1174 {
1175   cpp_buffer *buffer = pfile->buffer;
1176   const uchar *cur = buffer->cur;
1177   uchar c;
1178
1179   cur++;
1180   if (*cur == '/')
1181     cur++;
1182
1183   for (;;)
1184     {
1185       /* People like decorating comments with '*', so check for '/'
1186          instead for efficiency.  */
1187       c = *cur++;
1188
1189       if (c == '/')
1190         {
1191           if (cur[-2] == '*')
1192             break;
1193
1194           /* Warn about potential nested comments, but not if the '/'
1195              comes immediately before the true comment delimiter.
1196              Don't bother to get it right across escaped newlines.  */
1197           if (CPP_OPTION (pfile, warn_comments)
1198               && cur[0] == '*' && cur[1] != '/')
1199             {
1200               buffer->cur = cur;
1201               cpp_warning_with_line (pfile, CPP_W_COMMENTS,
1202                                      pfile->line_table->highest_line,
1203                                      CPP_BUF_COL (buffer),
1204                                      "\"/*\" within comment");
1205             }
1206         }
1207       else if (c == '\n')
1208         {
1209           unsigned int cols;
1210           buffer->cur = cur - 1;
1211           _cpp_process_line_notes (pfile, true);
1212           if (buffer->next_line >= buffer->rlimit)
1213             return true;
1214           _cpp_clean_line (pfile);
1215
1216           cols = buffer->next_line - buffer->line_base;
1217           CPP_INCREMENT_LINE (pfile, cols);
1218
1219           cur = buffer->cur;
1220         }
1221     }
1222
1223   buffer->cur = cur;
1224   _cpp_process_line_notes (pfile, true);
1225   return false;
1226 }
1227
1228 /* Skip a C++ line comment, leaving buffer->cur pointing to the
1229    terminating newline.  Handles escaped newlines.  Returns nonzero
1230    if a multiline comment.  */
1231 static int
1232 skip_line_comment (cpp_reader *pfile)
1233 {
1234   cpp_buffer *buffer = pfile->buffer;
1235   location_t orig_line = pfile->line_table->highest_line;
1236
1237   while (*buffer->cur != '\n')
1238     buffer->cur++;
1239
1240   _cpp_process_line_notes (pfile, true);
1241   return orig_line != pfile->line_table->highest_line;
1242 }
1243
1244 /* Skips whitespace, saving the next non-whitespace character.  */
1245 static void
1246 skip_whitespace (cpp_reader *pfile, cppchar_t c)
1247 {
1248   cpp_buffer *buffer = pfile->buffer;
1249   bool saw_NUL = false;
1250
1251   do
1252     {
1253       /* Horizontal space always OK.  */
1254       if (c == ' ' || c == '\t')
1255         ;
1256       /* Just \f \v or \0 left.  */
1257       else if (c == '\0')
1258         saw_NUL = true;
1259       else if (pfile->state.in_directive && CPP_PEDANTIC (pfile))
1260         cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line,
1261                              CPP_BUF_COL (buffer),
1262                              "%s in preprocessing directive",
1263                              c == '\f' ? "form feed" : "vertical tab");
1264
1265       c = *buffer->cur++;
1266     }
1267   /* We only want non-vertical space, i.e. ' ' \t \f \v \0.  */
1268   while (is_nvspace (c));
1269
1270   if (saw_NUL)
1271     cpp_error (pfile, CPP_DL_WARNING, "null character(s) ignored");
1272
1273   buffer->cur--;
1274 }
1275
1276 /* See if the characters of a number token are valid in a name (no
1277    '.', '+' or '-').  */
1278 static int
1279 name_p (cpp_reader *pfile, const cpp_string *string)
1280 {
1281   unsigned int i;
1282
1283   for (i = 0; i < string->len; i++)
1284     if (!is_idchar (string->text[i]))
1285       return 0;
1286
1287   return 1;
1288 }
1289
1290 /* After parsing an identifier or other sequence, produce a warning about
1291    sequences not in NFC/NFKC.  */
1292 static void
1293 warn_about_normalization (cpp_reader *pfile,
1294                           const cpp_token *token,
1295                           const struct normalize_state *s)
1296 {
1297   if (CPP_OPTION (pfile, warn_normalize) < NORMALIZE_STATE_RESULT (s)
1298       && !pfile->state.skipping)
1299     {
1300       /* Make sure that the token is printed using UCNs, even
1301          if we'd otherwise happily print UTF-8.  */
1302       unsigned char *buf = XNEWVEC (unsigned char, cpp_token_len (token));
1303       size_t sz;
1304
1305       sz = cpp_spell_token (pfile, token, buf, false) - buf;
1306       if (NORMALIZE_STATE_RESULT (s) == normalized_C)
1307         cpp_warning_with_line (pfile, CPP_W_NORMALIZE, token->src_loc, 0,
1308                                "`%.*s' is not in NFKC", (int) sz, buf);
1309       else
1310         cpp_warning_with_line (pfile, CPP_W_NORMALIZE, token->src_loc, 0,
1311                                "`%.*s' is not in NFC", (int) sz, buf);
1312       free (buf);
1313     }
1314 }
1315
1316 static const cppchar_t utf8_signifier = 0xC0;
1317
1318 /* Returns TRUE if the sequence starting at buffer->cur is valid in
1319    an identifier.  FIRST is TRUE if this starts an identifier.  */
1320 static bool
1321 forms_identifier_p (cpp_reader *pfile, int first,
1322                     struct normalize_state *state)
1323 {
1324   cpp_buffer *buffer = pfile->buffer;
1325
1326   if (*buffer->cur == '$')
1327     {
1328       if (!CPP_OPTION (pfile, dollars_in_ident))
1329         return false;
1330
1331       buffer->cur++;
1332       if (CPP_OPTION (pfile, warn_dollars) && !pfile->state.skipping)
1333         {
1334           CPP_OPTION (pfile, warn_dollars) = 0;
1335           cpp_error (pfile, CPP_DL_PEDWARN, "'$' in identifier or number");
1336         }
1337
1338       return true;
1339     }
1340
1341   /* Is this a syntactically valid UCN or a valid UTF-8 char?  */
1342   if (CPP_OPTION (pfile, extended_identifiers))
1343     {
1344       cppchar_t s;
1345       if (*buffer->cur >= utf8_signifier)
1346         {
1347           if (_cpp_valid_utf8 (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
1348                                state, &s))
1349             return true;
1350         }
1351       else if (*buffer->cur == '\\'
1352                && (buffer->cur[1] == 'u' || buffer->cur[1] == 'U'))
1353         {
1354           buffer->cur += 2;
1355           if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
1356                               state, &s, NULL, NULL))
1357             return true;
1358           buffer->cur -= 2;
1359         }
1360     }
1361
1362   return false;
1363 }
1364
1365 /* Helper function to issue error about improper __VA_OPT__ use.  */
1366 static void
1367 maybe_va_opt_error (cpp_reader *pfile)
1368 {
1369   if (CPP_PEDANTIC (pfile) && !CPP_OPTION (pfile, va_opt))
1370     {
1371       /* __VA_OPT__ should not be accepted at all, but allow it in
1372          system headers.  */
1373       if (!_cpp_in_system_header (pfile))
1374         cpp_error (pfile, CPP_DL_PEDWARN,
1375                    "__VA_OPT__ is not available until C++20");
1376     }
1377   else if (!pfile->state.va_args_ok)
1378     {
1379       /* __VA_OPT__ should only appear in the replacement list of a
1380          variadic macro.  */
1381       cpp_error (pfile, CPP_DL_PEDWARN,
1382                  "__VA_OPT__ can only appear in the expansion"
1383                  " of a C++20 variadic macro");
1384     }
1385 }
1386
1387 /* Helper function to get the cpp_hashnode of the identifier BASE.  */
1388 static cpp_hashnode *
1389 lex_identifier_intern (cpp_reader *pfile, const uchar *base)
1390 {
1391   cpp_hashnode *result;
1392   const uchar *cur;
1393   unsigned int len;
1394   unsigned int hash = HT_HASHSTEP (0, *base);
1395
1396   cur = base + 1;
1397   while (ISIDNUM (*cur))
1398     {
1399       hash = HT_HASHSTEP (hash, *cur);
1400       cur++;
1401     }
1402   len = cur - base;
1403   hash = HT_HASHFINISH (hash, len);
1404   result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
1405                                               base, len, hash, HT_ALLOC));
1406
1407   /* Rarely, identifiers require diagnostics when lexed.  */
1408   if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
1409                         && !pfile->state.skipping, 0))
1410     {
1411       /* It is allowed to poison the same identifier twice.  */
1412       if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
1413         cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
1414                    NODE_NAME (result));
1415
1416       /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
1417          replacement list of a variadic macro.  */
1418       if (result == pfile->spec_nodes.n__VA_ARGS__
1419           && !pfile->state.va_args_ok)
1420         {
1421           if (CPP_OPTION (pfile, cplusplus))
1422             cpp_error (pfile, CPP_DL_PEDWARN,
1423                        "__VA_ARGS__ can only appear in the expansion"
1424                        " of a C++11 variadic macro");
1425           else
1426             cpp_error (pfile, CPP_DL_PEDWARN,
1427                        "__VA_ARGS__ can only appear in the expansion"
1428                        " of a C99 variadic macro");
1429         }
1430
1431       if (result == pfile->spec_nodes.n__VA_OPT__)
1432         maybe_va_opt_error (pfile);
1433
1434       /* For -Wc++-compat, warn about use of C++ named operators.  */
1435       if (result->flags & NODE_WARN_OPERATOR)
1436         cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
1437                      "identifier \"%s\" is a special operator name in C++",
1438                      NODE_NAME (result));
1439     }
1440
1441   return result;
1442 }
1443
1444 /* Get the cpp_hashnode of an identifier specified by NAME in
1445    the current cpp_reader object.  If none is found, NULL is returned.  */
1446 cpp_hashnode *
1447 _cpp_lex_identifier (cpp_reader *pfile, const char *name)
1448 {
1449   cpp_hashnode *result;
1450   result = lex_identifier_intern (pfile, (uchar *) name);
1451   return result;
1452 }
1453
1454 /* Lex an identifier starting at BUFFER->CUR - 1.  */
1455 static cpp_hashnode *
1456 lex_identifier (cpp_reader *pfile, const uchar *base, bool starts_ucn,
1457                 struct normalize_state *nst, cpp_hashnode **spelling)
1458 {
1459   cpp_hashnode *result;
1460   const uchar *cur;
1461   unsigned int len;
1462   unsigned int hash = HT_HASHSTEP (0, *base);
1463
1464   cur = pfile->buffer->cur;
1465   if (! starts_ucn)
1466     {
1467       while (ISIDNUM (*cur))
1468         {
1469           hash = HT_HASHSTEP (hash, *cur);
1470           cur++;
1471         }
1472       NORMALIZE_STATE_UPDATE_IDNUM (nst, *(cur - 1));
1473     }
1474   pfile->buffer->cur = cur;
1475   if (starts_ucn || forms_identifier_p (pfile, false, nst))
1476     {
1477       /* Slower version for identifiers containing UCNs
1478          or extended chars (including $).  */
1479       do {
1480         while (ISIDNUM (*pfile->buffer->cur))
1481           {
1482             NORMALIZE_STATE_UPDATE_IDNUM (nst, *pfile->buffer->cur);
1483             pfile->buffer->cur++;
1484           }
1485       } while (forms_identifier_p (pfile, false, nst));
1486       result = _cpp_interpret_identifier (pfile, base,
1487                                           pfile->buffer->cur - base);
1488       *spelling = cpp_lookup (pfile, base, pfile->buffer->cur - base);
1489     }
1490   else
1491     {
1492       len = cur - base;
1493       hash = HT_HASHFINISH (hash, len);
1494
1495       result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
1496                                                   base, len, hash, HT_ALLOC));
1497       *spelling = result;
1498     }
1499
1500   /* Rarely, identifiers require diagnostics when lexed.  */
1501   if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
1502                         && !pfile->state.skipping, 0))
1503     {
1504       /* It is allowed to poison the same identifier twice.  */
1505       if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
1506         cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
1507                    NODE_NAME (result));
1508
1509       /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
1510          replacement list of a variadic macro.  */
1511       if (result == pfile->spec_nodes.n__VA_ARGS__
1512           && !pfile->state.va_args_ok)
1513         {
1514           if (CPP_OPTION (pfile, cplusplus))
1515             cpp_error (pfile, CPP_DL_PEDWARN,
1516                        "__VA_ARGS__ can only appear in the expansion"
1517                        " of a C++11 variadic macro");
1518           else
1519             cpp_error (pfile, CPP_DL_PEDWARN,
1520                        "__VA_ARGS__ can only appear in the expansion"
1521                        " of a C99 variadic macro");
1522         }
1523
1524       /* __VA_OPT__ should only appear in the replacement list of a
1525          variadic macro.  */
1526       if (result == pfile->spec_nodes.n__VA_OPT__)
1527         maybe_va_opt_error (pfile);
1528
1529       /* For -Wc++-compat, warn about use of C++ named operators.  */
1530       if (result->flags & NODE_WARN_OPERATOR)
1531         cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
1532                      "identifier \"%s\" is a special operator name in C++",
1533                      NODE_NAME (result));
1534     }
1535
1536   return result;
1537 }
1538
1539 /* Lex a number to NUMBER starting at BUFFER->CUR - 1.  */
1540 static void
1541 lex_number (cpp_reader *pfile, cpp_string *number,
1542             struct normalize_state *nst)
1543 {
1544   const uchar *cur;
1545   const uchar *base;
1546   uchar *dest;
1547
1548   base = pfile->buffer->cur - 1;
1549   do
1550     {
1551       cur = pfile->buffer->cur;
1552
1553       /* N.B. ISIDNUM does not include $.  */
1554       while (ISIDNUM (*cur) || *cur == '.' || DIGIT_SEP (*cur)
1555              || VALID_SIGN (*cur, cur[-1]))
1556         {
1557           NORMALIZE_STATE_UPDATE_IDNUM (nst, *cur);
1558           cur++;
1559         }
1560       /* A number can't end with a digit separator.  */
1561       while (cur > pfile->buffer->cur && DIGIT_SEP (cur[-1]))
1562         --cur;
1563
1564       pfile->buffer->cur = cur;
1565     }
1566   while (forms_identifier_p (pfile, false, nst));
1567
1568   number->len = cur - base;
1569   dest = _cpp_unaligned_alloc (pfile, number->len + 1);
1570   memcpy (dest, base, number->len);
1571   dest[number->len] = '\0';
1572   number->text = dest;
1573 }
1574
1575 /* Create a token of type TYPE with a literal spelling.  */
1576 static void
1577 create_literal (cpp_reader *pfile, cpp_token *token, const uchar *base,
1578                 unsigned int len, enum cpp_ttype type)
1579 {
1580   token->type = type;
1581   token->val.str.len = len;
1582   token->val.str.text = cpp_alloc_token_string (pfile, base, len);
1583 }
1584
1585 const uchar *
1586 cpp_alloc_token_string (cpp_reader *pfile,
1587                         const unsigned char *ptr, unsigned len)
1588 {
1589   uchar *dest = _cpp_unaligned_alloc (pfile, len + 1);
1590
1591   dest[len] = 0;
1592   memcpy (dest, ptr, len);
1593   return dest;
1594 }
1595
1596 /* A pair of raw buffer pointers.  The currently open one is [1], the
1597    first one is [0].  Used for string literal lexing.  */
1598 struct lit_accum {
1599   _cpp_buff *first;
1600   _cpp_buff *last;
1601   const uchar *rpos;
1602   size_t accum;
1603
1604   lit_accum ()
1605     : first (NULL), last (NULL), rpos (0), accum (0)
1606   {
1607   }
1608
1609   void append (cpp_reader *, const uchar *, size_t);
1610
1611   void read_begin (cpp_reader *);
1612   bool reading_p () const
1613   {
1614     return rpos != NULL;
1615   }
1616   char read_char ()
1617   {
1618     char c = *rpos++;
1619     if (rpos == BUFF_FRONT (last))
1620       rpos = NULL;
1621     return c;
1622   }
1623 };
1624
1625 /* Subroutine of lex_raw_string: Append LEN chars from BASE to the buffer
1626    sequence from *FIRST_BUFF_P to LAST_BUFF_P.  */
1627
1628 void
1629 lit_accum::append (cpp_reader *pfile, const uchar *base, size_t len)
1630 {
1631   if (!last)
1632     /* Starting.  */
1633     first = last = _cpp_get_buff (pfile, len);
1634   else if (len > BUFF_ROOM (last))
1635     {
1636       /* There is insufficient room in the buffer.  Copy what we can,
1637          and then either extend or create a new one.  */
1638       size_t room = BUFF_ROOM (last);
1639       memcpy (BUFF_FRONT (last), base, room);
1640       BUFF_FRONT (last) += room;
1641       base += room;
1642       len -= room;
1643       accum += room;
1644
1645       gcc_checking_assert (!rpos);
1646
1647       last = _cpp_append_extend_buff (pfile, last, len);
1648     }
1649
1650   memcpy (BUFF_FRONT (last), base, len);
1651   BUFF_FRONT (last) += len;
1652   accum += len;
1653 }
1654
1655 void
1656 lit_accum::read_begin (cpp_reader *pfile)
1657 {
1658   /* We never accumulate more than 4 chars to read.  */
1659   if (BUFF_ROOM (last) < 4)
1660
1661     last = _cpp_append_extend_buff (pfile, last, 4);
1662   rpos = BUFF_FRONT (last);
1663 }
1664
1665 /* Returns true if a macro has been defined.
1666    This might not work if compile with -save-temps,
1667    or preprocess separately from compilation.  */
1668
1669 static bool
1670 is_macro(cpp_reader *pfile, const uchar *base)
1671 {
1672   const uchar *cur = base;
1673   if (! ISIDST (*cur))
1674     return false;
1675   unsigned int hash = HT_HASHSTEP (0, *cur);
1676   ++cur;
1677   while (ISIDNUM (*cur))
1678     {
1679       hash = HT_HASHSTEP (hash, *cur);
1680       ++cur;
1681     }
1682   hash = HT_HASHFINISH (hash, cur - base);
1683
1684   cpp_hashnode *result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
1685                                         base, cur - base, hash, HT_NO_INSERT));
1686
1687   return result && cpp_macro_p (result);
1688 }
1689
1690 /* Returns true if a literal suffix does not have the expected form
1691    and is defined as a macro.  */
1692
1693 static bool
1694 is_macro_not_literal_suffix(cpp_reader *pfile, const uchar *base)
1695 {
1696   /* User-defined literals outside of namespace std must start with a single
1697      underscore, so assume anything of that form really is a UDL suffix.
1698      We don't need to worry about UDLs defined inside namespace std because
1699      their names are reserved, so cannot be used as macro names in valid
1700      programs.  */
1701   if (base[0] == '_' && base[1] != '_')
1702     return false;
1703   return is_macro (pfile, base);
1704 }
1705
1706 /* Lexes a raw string.  The stored string contains the spelling,
1707    including double quotes, delimiter string, '(' and ')', any leading
1708    'L', 'u', 'U' or 'u8' and 'R' modifier.  The created token contains
1709    the type of the literal, or CPP_OTHER if it was not properly
1710    terminated.
1711
1712    BASE is the start of the token.  Updates pfile->buffer->cur to just
1713    after the lexed string.
1714
1715    The spelling is NUL-terminated, but it is not guaranteed that this
1716    is the first NUL since embedded NULs are preserved.  */
1717
1718 static void
1719 lex_raw_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
1720 {
1721   const uchar *pos = base;
1722
1723   /* 'tis a pity this information isn't passed down from the lexer's
1724      initial categorization of the token.  */
1725   enum cpp_ttype type = CPP_STRING;
1726
1727   if (*pos == 'L')
1728     {
1729       type = CPP_WSTRING;
1730       pos++;
1731     }
1732   else if (*pos == 'U')
1733     {
1734       type = CPP_STRING32;
1735       pos++;
1736     }
1737   else if (*pos == 'u')
1738     {
1739       if (pos[1] == '8')
1740         {
1741           type = CPP_UTF8STRING;
1742           pos++;
1743         }
1744       else
1745         type = CPP_STRING16;
1746       pos++;
1747     }
1748
1749   gcc_checking_assert (pos[0] == 'R' && pos[1] == '"');
1750   pos += 2;
1751
1752   _cpp_line_note *note = &pfile->buffer->notes[pfile->buffer->cur_note];
1753
1754   /* Skip notes before the ".  */
1755   while (note->pos < pos)
1756     ++note;
1757
1758   lit_accum accum;
1759
1760   uchar prefix[17];
1761   unsigned prefix_len = 0;
1762   enum Phase
1763   {
1764    PHASE_PREFIX = -2,
1765    PHASE_NONE = -1,
1766    PHASE_SUFFIX = 0
1767   } phase = PHASE_PREFIX;
1768
1769   for (;;)
1770     {
1771       gcc_checking_assert (note->pos >= pos);
1772
1773       /* Undo any escaped newlines and trigraphs.  */
1774       if (!accum.reading_p () && note->pos == pos)
1775         switch (note->type)
1776           {
1777           case '\\':
1778           case ' ':
1779             /* Restore backslash followed by newline.  */
1780             accum.append (pfile, base, pos - base);
1781             base = pos;
1782             accum.read_begin (pfile);
1783             accum.append (pfile, UC"\\", 1);
1784
1785           after_backslash:
1786             if (note->type == ' ')
1787               /* GNU backslash whitespace newline extension.  FIXME
1788                  could be any sequence of non-vertical space.  When we
1789                  can properly restore any such sequence, we should
1790                  mark this note as handled so _cpp_process_line_notes
1791                  doesn't warn.  */
1792               accum.append (pfile, UC" ", 1);
1793
1794             accum.append (pfile, UC"\n", 1);
1795             note++;
1796             break;
1797
1798           case '\n':
1799             /* This can happen for ??/<NEWLINE> when trigraphs are not
1800                being interpretted.  */
1801             gcc_checking_assert (!CPP_OPTION (pfile, trigraphs));
1802             note->type = 0;
1803             note++;
1804             break;
1805
1806           default:
1807             gcc_checking_assert (_cpp_trigraph_map[note->type]);
1808
1809             /* Don't warn about this trigraph in
1810                _cpp_process_line_notes, since trigraphs show up as
1811                trigraphs in raw strings.  */
1812             uchar type = note->type;
1813             note->type = 0;
1814
1815             if (CPP_OPTION (pfile, trigraphs))
1816               {
1817                 accum.append (pfile, base, pos - base);
1818                 base = pos;
1819                 accum.read_begin (pfile);
1820                 accum.append (pfile, UC"??", 2);
1821                 accum.append (pfile, &type, 1);
1822
1823                 /* ??/ followed by newline gets two line notes, one for
1824                    the trigraph and one for the backslash/newline.  */
1825                 if (type == '/' && note[1].pos == pos)
1826                   {
1827                     note++;
1828                     gcc_assert (note->type == '\\' || note->type == ' ');
1829                     goto after_backslash;
1830                   }
1831                 /* Skip the replacement character.  */
1832                 base = ++pos;
1833               }
1834
1835             note++;
1836             break;
1837           }
1838
1839       /* Now get a char to process.  Either from an expanded note, or
1840          from the line buffer.  */
1841       bool read_note = accum.reading_p ();
1842       char c = read_note ? accum.read_char () : *pos++;
1843
1844       if (phase == PHASE_PREFIX)
1845         {
1846           if (c == '(')
1847             {
1848               /* Done.  */
1849               phase = PHASE_NONE;
1850               prefix[prefix_len++] = '"';
1851             }
1852           else if (prefix_len < 16
1853                    /* Prefix chars are any of the basic character set,
1854                       [lex.charset] except for '
1855                       ()\\\t\v\f\n'. Optimized for a contiguous
1856                       alphabet.  */
1857                    /* Unlike a switch, this collapses down to one or
1858                       two shift and bitmask operations on an ASCII
1859                       system, with an outlier or two.   */
1860                    && (('Z' - 'A' == 25
1861                         ? ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'))
1862                         : ISIDST (c))
1863                        || (c >= '0' && c <= '9')
1864                        || c == '_' || c == '{' || c == '}'
1865                        || c == '[' || c == ']' || c == '#'
1866                        || c == '<' || c == '>' || c == '%'
1867                        || c == ':' || c == ';' || c == '.' || c == '?'
1868                        || c == '*' || c == '+' || c == '-' || c == '/'
1869                        || c == '^' || c == '&' || c == '|' || c == '~'
1870                        || c == '!' || c == '=' || c == ','
1871                        || c == '"' || c == '\''))
1872             prefix[prefix_len++] = c;
1873           else
1874             {
1875               /* Something is wrong.  */
1876               int col = CPP_BUF_COLUMN (pfile->buffer, pos) + read_note;
1877               if (prefix_len == 16)
1878                 cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
1879                                      col, "raw string delimiter longer "
1880                                      "than 16 characters");
1881               else if (c == '\n')
1882                 cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
1883                                      col, "invalid new-line in raw "
1884                                      "string delimiter");
1885               else
1886                 cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
1887                                      col, "invalid character '%c' in "
1888                                      "raw string delimiter", c);
1889               type = CPP_OTHER;
1890               phase = PHASE_NONE;
1891               /* Continue until we get a close quote, that's probably
1892                  the best failure mode.  */
1893               prefix_len = 0;
1894             }
1895           if (c != '\n')
1896             continue;
1897         }
1898
1899       if (phase != PHASE_NONE)
1900         {
1901           if (prefix[phase] != c)
1902             phase = PHASE_NONE;
1903           else if (unsigned (phase + 1) == prefix_len)
1904             break;
1905           else
1906             {
1907               phase = Phase (phase + 1);
1908               continue;
1909             }
1910         }
1911
1912       if (!prefix_len && c == '"')
1913         /* Failure mode lexing.  */
1914         goto out;
1915       else if (prefix_len && c == ')')
1916         phase = PHASE_SUFFIX;
1917       else if (!read_note && c == '\n')
1918         {
1919           pos--;
1920           pfile->buffer->cur = pos;
1921           if (pfile->state.in_directive
1922               || (pfile->state.parsing_args
1923                   && pfile->buffer->next_line >= pfile->buffer->rlimit))
1924             {
1925               cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, 0,
1926                                    "unterminated raw string");
1927               type = CPP_OTHER;
1928               goto out;
1929             }
1930
1931           accum.append (pfile, base, pos - base + 1);
1932           _cpp_process_line_notes (pfile, false);
1933
1934           if (pfile->buffer->next_line < pfile->buffer->rlimit)
1935             CPP_INCREMENT_LINE (pfile, 0);
1936           pfile->buffer->need_line = true;
1937
1938           if (!_cpp_get_fresh_line (pfile))
1939             {
1940               /* We ran out of file and failed to get a line.  */
1941               location_t src_loc = token->src_loc;
1942               token->type = CPP_EOF;
1943               /* Tell the compiler the line number of the EOF token.  */
1944               token->src_loc = pfile->line_table->highest_line;
1945               token->flags = BOL;
1946               if (accum.first)
1947                 _cpp_release_buff (pfile, accum.first);
1948               cpp_error_with_line (pfile, CPP_DL_ERROR, src_loc, 0,
1949                                    "unterminated raw string");
1950               /* Now pop the buffer that _cpp_get_fresh_line did not.  */
1951               _cpp_pop_buffer (pfile);
1952               return;
1953             }
1954
1955           pos = base = pfile->buffer->cur;
1956           note = &pfile->buffer->notes[pfile->buffer->cur_note];
1957         }
1958     }
1959
1960   if (CPP_OPTION (pfile, user_literals))
1961     {
1962       /* If a string format macro, say from inttypes.h, is placed touching
1963          a string literal it could be parsed as a C++11 user-defined string
1964          literal thus breaking the program.  */
1965       if (is_macro_not_literal_suffix (pfile, pos))
1966         {
1967           /* Raise a warning, but do not consume subsequent tokens.  */
1968           if (CPP_OPTION (pfile, warn_literal_suffix) && !pfile->state.skipping)
1969             cpp_warning_with_line (pfile, CPP_W_LITERAL_SUFFIX,
1970                                    token->src_loc, 0,
1971                                    "invalid suffix on literal; C++11 requires "
1972                                    "a space between literal and string macro");
1973         }
1974       /* Grab user defined literal suffix.  */
1975       else if (ISIDST (*pos))
1976         {
1977           type = cpp_userdef_string_add_type (type);
1978           ++pos;
1979
1980           while (ISIDNUM (*pos))
1981             ++pos;
1982         }
1983     }
1984
1985  out:
1986   pfile->buffer->cur = pos;
1987   if (!accum.accum)
1988     create_literal (pfile, token, base, pos - base, type);
1989   else
1990     {
1991       size_t extra_len = pos - base;
1992       uchar *dest = _cpp_unaligned_alloc (pfile, accum.accum + extra_len + 1);
1993
1994       token->type = type;
1995       token->val.str.len = accum.accum + extra_len;
1996       token->val.str.text = dest;
1997       for (_cpp_buff *buf = accum.first; buf; buf = buf->next)
1998         {
1999           size_t len = BUFF_FRONT (buf) - buf->base;
2000           memcpy (dest, buf->base, len);
2001           dest += len;
2002         }
2003       _cpp_release_buff (pfile, accum.first);
2004       memcpy (dest, base, extra_len);
2005       dest[extra_len] = '\0';
2006     }
2007 }
2008
2009 /* Lexes a string, character constant, or angle-bracketed header file
2010    name.  The stored string contains the spelling, including opening
2011    quote and any leading 'L', 'u', 'U' or 'u8' and optional
2012    'R' modifier.  It returns the type of the literal, or CPP_OTHER
2013    if it was not properly terminated, or CPP_LESS for an unterminated
2014    header name which must be relexed as normal tokens.
2015
2016    The spelling is NUL-terminated, but it is not guaranteed that this
2017    is the first NUL since embedded NULs are preserved.  */
2018 static void
2019 lex_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
2020 {
2021   bool saw_NUL = false;
2022   const uchar *cur;
2023   cppchar_t terminator;
2024   enum cpp_ttype type;
2025
2026   cur = base;
2027   terminator = *cur++;
2028   if (terminator == 'L' || terminator == 'U')
2029     terminator = *cur++;
2030   else if (terminator == 'u')
2031     {
2032       terminator = *cur++;
2033       if (terminator == '8')
2034         terminator = *cur++;
2035     }
2036   if (terminator == 'R')
2037     {
2038       lex_raw_string (pfile, token, base);
2039       return;
2040     }
2041   if (terminator == '"')
2042     type = (*base == 'L' ? CPP_WSTRING :
2043             *base == 'U' ? CPP_STRING32 :
2044             *base == 'u' ? (base[1] == '8' ? CPP_UTF8STRING : CPP_STRING16)
2045                          : CPP_STRING);
2046   else if (terminator == '\'')
2047     type = (*base == 'L' ? CPP_WCHAR :
2048             *base == 'U' ? CPP_CHAR32 :
2049             *base == 'u' ? (base[1] == '8' ? CPP_UTF8CHAR : CPP_CHAR16)
2050                          : CPP_CHAR);
2051   else
2052     terminator = '>', type = CPP_HEADER_NAME;
2053
2054   for (;;)
2055     {
2056       cppchar_t c = *cur++;
2057
2058       /* In #include-style directives, terminators are not escapable.  */
2059       if (c == '\\' && !pfile->state.angled_headers && *cur != '\n')
2060         cur++;
2061       else if (c == terminator)
2062         break;
2063       else if (c == '\n')
2064         {
2065           cur--;
2066           /* Unmatched quotes always yield undefined behavior, but
2067              greedy lexing means that what appears to be an unterminated
2068              header name may actually be a legitimate sequence of tokens.  */
2069           if (terminator == '>')
2070             {
2071               token->type = CPP_LESS;
2072               return;
2073             }
2074           type = CPP_OTHER;
2075           break;
2076         }
2077       else if (c == '\0')
2078         saw_NUL = true;
2079     }
2080
2081   if (saw_NUL && !pfile->state.skipping)
2082     cpp_error (pfile, CPP_DL_WARNING,
2083                "null character(s) preserved in literal");
2084
2085   if (type == CPP_OTHER && CPP_OPTION (pfile, lang) != CLK_ASM)
2086     cpp_error (pfile, CPP_DL_PEDWARN, "missing terminating %c character",
2087                (int) terminator);
2088
2089   if (CPP_OPTION (pfile, user_literals))
2090     {
2091       /* If a string format macro, say from inttypes.h, is placed touching
2092          a string literal it could be parsed as a C++11 user-defined string
2093          literal thus breaking the program.  */
2094       if (is_macro_not_literal_suffix (pfile, cur))
2095         {
2096           /* Raise a warning, but do not consume subsequent tokens.  */
2097           if (CPP_OPTION (pfile, warn_literal_suffix) && !pfile->state.skipping)
2098             cpp_warning_with_line (pfile, CPP_W_LITERAL_SUFFIX,
2099                                    token->src_loc, 0,
2100                                    "invalid suffix on literal; C++11 requires "
2101                                    "a space between literal and string macro");
2102         }
2103       /* Grab user defined literal suffix.  */
2104       else if (ISIDST (*cur))
2105         {
2106           type = cpp_userdef_char_add_type (type);
2107           type = cpp_userdef_string_add_type (type);
2108           ++cur;
2109
2110           while (ISIDNUM (*cur))
2111             ++cur;
2112         }
2113     }
2114   else if (CPP_OPTION (pfile, cpp_warn_cxx11_compat)
2115            && is_macro (pfile, cur)
2116            && !pfile->state.skipping)
2117     cpp_warning_with_line (pfile, CPP_W_CXX11_COMPAT,
2118                            token->src_loc, 0, "C++11 requires a space "
2119                            "between string literal and macro");
2120
2121   pfile->buffer->cur = cur;
2122   create_literal (pfile, token, base, cur - base, type);
2123 }
2124
2125 /* Return the comment table. The client may not make any assumption
2126    about the ordering of the table.  */
2127 cpp_comment_table *
2128 cpp_get_comments (cpp_reader *pfile)
2129 {
2130   return &pfile->comments;
2131 }
2132
2133 /* Append a comment to the end of the comment table. */
2134 static void
2135 store_comment (cpp_reader *pfile, cpp_token *token)
2136 {
2137   int len;
2138
2139   if (pfile->comments.allocated == 0)
2140     {
2141       pfile->comments.allocated = 256;
2142       pfile->comments.entries = (cpp_comment *) xmalloc
2143         (pfile->comments.allocated * sizeof (cpp_comment));
2144     }
2145
2146   if (pfile->comments.count == pfile->comments.allocated)
2147     {
2148       pfile->comments.allocated *= 2;
2149       pfile->comments.entries = (cpp_comment *) xrealloc
2150         (pfile->comments.entries,
2151          pfile->comments.allocated * sizeof (cpp_comment));
2152     }
2153
2154   len = token->val.str.len;
2155
2156   /* Copy comment. Note, token may not be NULL terminated. */
2157   pfile->comments.entries[pfile->comments.count].comment =
2158     (char *) xmalloc (sizeof (char) * (len + 1));
2159   memcpy (pfile->comments.entries[pfile->comments.count].comment,
2160           token->val.str.text, len);
2161   pfile->comments.entries[pfile->comments.count].comment[len] = '\0';
2162
2163   /* Set source location. */
2164   pfile->comments.entries[pfile->comments.count].sloc = token->src_loc;
2165
2166   /* Increment the count of entries in the comment table. */
2167   pfile->comments.count++;
2168 }
2169
2170 /* The stored comment includes the comment start and any terminator.  */
2171 static void
2172 save_comment (cpp_reader *pfile, cpp_token *token, const unsigned char *from,
2173               cppchar_t type)
2174 {
2175   unsigned char *buffer;
2176   unsigned int len, clen, i;
2177
2178   len = pfile->buffer->cur - from + 1; /* + 1 for the initial '/'.  */
2179
2180   /* C++ comments probably (not definitely) have moved past a new
2181      line, which we don't want to save in the comment.  */
2182   if (is_vspace (pfile->buffer->cur[-1]))
2183     len--;
2184
2185   /* If we are currently in a directive or in argument parsing, then
2186      we need to store all C++ comments as C comments internally, and
2187      so we need to allocate a little extra space in that case.
2188
2189      Note that the only time we encounter a directive here is
2190      when we are saving comments in a "#define".  */
2191   clen = ((pfile->state.in_directive || pfile->state.parsing_args)
2192           && type == '/') ? len + 2 : len;
2193
2194   buffer = _cpp_unaligned_alloc (pfile, clen);
2195
2196   token->type = CPP_COMMENT;
2197   token->val.str.len = clen;
2198   token->val.str.text = buffer;
2199
2200   buffer[0] = '/';
2201   memcpy (buffer + 1, from, len - 1);
2202
2203   /* Finish conversion to a C comment, if necessary.  */
2204   if ((pfile->state.in_directive || pfile->state.parsing_args) && type == '/')
2205     {
2206       buffer[1] = '*';
2207       buffer[clen - 2] = '*';
2208       buffer[clen - 1] = '/';
2209       /* As there can be in a C++ comments illegal sequences for C comments
2210          we need to filter them out.  */
2211       for (i = 2; i < (clen - 2); i++)
2212         if (buffer[i] == '/' && (buffer[i - 1] == '*' || buffer[i + 1] == '*'))
2213           buffer[i] = '|';
2214     }
2215
2216   /* Finally store this comment for use by clients of libcpp. */
2217   store_comment (pfile, token);
2218 }
2219
/* Returns true if comment at COMMENT_START is a recognized FALLTHROUGH
   comment.

   COMMENT_START is compared against '*' to tell a C block comment from
   a C++ line comment, and matching starts at the character after it;
   presumably it points at the second character of the comment opener
   (confirm against the caller).  pfile->buffer->cur is used as the
   upper bound of the comment text in the length checks below.

   What is recognized depends on the cpp_warn_implicit_fallthrough
   option: levels 0 and 5 (and anything else not listed) accept
   nothing, level 1 accepts every comment, level 2 searches for a
   case-insensitive match anywhere in the comment, and levels 3 and 4
   require the whole comment to match one of a fixed set of
   patterns.  */

static bool
fallthrough_comment_p (cpp_reader *pfile, const unsigned char *comment_start)
{
  /* FROM is the scanning position; it starts just after
     COMMENT_START.  */
  const unsigned char *from = comment_start + 1;

  switch (CPP_OPTION (pfile, cpp_warn_implicit_fallthrough))
    {
      /* For both -Wimplicit-fallthrough=0 and -Wimplicit-fallthrough=5 we
         don't recognize any comments.  The latter only checks attributes,
         the former doesn't warn.  */
    case 0:
    default:
      return false;
      /* -Wimplicit-fallthrough=1 considers any comment, no matter what
         content it has.  */
    case 1:
      return true;
    case 2:
      /* -Wimplicit-fallthrough=2 looks for (case insensitive)
         .*falls?[ \t-]*thr(u|ough).* regex.  */
      /* Keep trying start positions for as long as at least
         strlen ("fallthru") characters remain before the end of the
         comment.  */
      for (; (size_t) (pfile->buffer->cur - from) >= sizeof "fallthru" - 1;
           from++)
        {
          /* Is there anything like strpbrk with upper boundary, or
             memchr looking for 2 characters rather than just one?  */
          if (from[0] != 'f' && from[0] != 'F')
            continue;
          if (from[1] != 'a' && from[1] != 'A')
            continue;
          if (from[2] != 'l' && from[2] != 'L')
            continue;
          if (from[3] != 'l' && from[3] != 'L')
            continue;
          /* "fall" matched.  FROM is advanced past what has matched so
             far, so if the rest fails the search resumes from here
             rather than from the original 'f'.  */
          from += sizeof "fall" - 1;
          /* Optional 's'.  */
          if (from[0] == 's' || from[0] == 'S')
            from++;
          /* Any run of spaces, tabs and dashes.  */
          while (*from == ' ' || *from == '\t' || *from == '-')
            from++;
          if (from[0] != 't' && from[0] != 'T')
            continue;
          if (from[1] != 'h' && from[1] != 'H')
            continue;
          if (from[2] != 'r' && from[2] != 'R')
            continue;
          /* "thru" is enough; otherwise require the full "through".  */
          if (from[3] == 'u' || from[3] == 'U')
            return true;
          if (from[3] != 'o' && from[3] != 'O')
            continue;
          if (from[4] != 'u' && from[4] != 'U')
            continue;
          if (from[5] != 'g' && from[5] != 'G')
            continue;
          if (from[6] != 'h' && from[6] != 'H')
            continue;
          return true;
        }
      return false;
      /* Levels 3 and 4 leave the switch and use the whole-comment
         matching below.  */
    case 3:
    case 4:
      break;
    }

  /* Whole comment contents:
     -fallthrough
     @fallthrough@
   */
  if (*from == '-' || *from == '@')
    {
      size_t len = sizeof "fallthrough" - 1;
      /* The text after the leading '-' or '@' must be long enough to
         hold "fallthrough".  */
      if ((size_t) (pfile->buffer->cur - from - 1) < len)
        return false;
      if (memcmp (from + 1, "fallthrough", len))
        return false;
      if (*from == '@')
        {
          /* The '@' form needs a closing '@' too; count it in LEN so
             that it is skipped below.  */
          if (from[len + 1] != '@')
            return false;
          len++;
        }
      from += 1 + len;
    }
  /* Whole comment contents (regex):
     lint -fallthrough[ \t]*
   */
  else if (*from == 'l')
    {
      /* The leading 'l' has been seen already; compare the rest.  */
      size_t len = sizeof "int -fallthrough" - 1;
      if ((size_t) (pfile->buffer->cur - from - 1) < len)
        return false;
      if (memcmp (from + 1, "int -fallthrough", len))
        return false;
      from += 1 + len;
      while (*from == ' ' || *from == '\t')
        from++;
    }
  /* Whole comment contents (regex):
     [ \t]*FALLTHR(U|OUGH)[ \t]*
   */
  else if (CPP_OPTION (pfile, cpp_warn_implicit_fallthrough) == 4)
    {
      while (*from == ' ' || *from == '\t')
        from++;
      if ((size_t) (pfile->buffer->cur - from)  < sizeof "FALLTHRU" - 1)
        return false;
      if (memcmp (from, "FALLTHR", sizeof "FALLTHR" - 1))
        return false;
      from += sizeof "FALLTHR" - 1;
      /* Either "U" or "OUGH" must follow.  */
      if (*from == 'U')
        from++;
      else if ((size_t) (pfile->buffer->cur - from)  < sizeof "OUGH" - 1)
        return false;
      else if (memcmp (from, "OUGH", sizeof "OUGH" - 1))
        return false;
      else
        from += sizeof "OUGH" - 1;
      while (*from == ' ' || *from == '\t')
        from++;
    }
  /* Whole comment contents (regex):
     [ \t.!]*(ELSE,? |INTENTIONAL(LY)? )?FALL(S | |-)?THR(OUGH|U)[ \t.!]*(-[^\n\r]*)?
     [ \t.!]*(Else,? |Intentional(ly)? )?Fall((s | |-)[Tt]|t)hr(ough|u)[ \t.!]*(-[^\n\r]*)?
     [ \t.!]*([Ee]lse,? |[Ii]ntentional(ly)? )?fall(s | |-)?thr(ough|u)[ \t.!]*(-[^\n\r]*)?
   */
  else
    {
      while (*from == ' ' || *from == '\t' || *from == '.' || *from == '!')
        from++;
      /* F is the first letter of the word currently being matched;
         ALL_UPPER is set once a word has been seen entirely in upper
         case, after which the remaining words must be upper case
         too.  */
      unsigned char f = *from;
      bool all_upper = false;
      if (f == 'E' || f == 'e')
        {
          /* Optional "else" prefix.  */
          if ((size_t) (pfile->buffer->cur - from)
              < sizeof "else fallthru" - 1)
            return false;
          if (f == 'E' && memcmp (from + 1, "LSE", sizeof "LSE" - 1) == 0)
            all_upper = true;
          else if (memcmp (from + 1, "lse", sizeof "lse" - 1))
            return false;
          from += sizeof "else" - 1;
          /* Optional comma, then a mandatory single space.  */
          if (*from == ',')
            from++;
          if (*from != ' ')
            return false;
          from++;
          /* Reject "ELSE f..." and "else F...".  */
          if (all_upper && *from == 'f')
            return false;
          if (f == 'e' && *from == 'F')
            return false;
          f = *from;
        }
      else if (f == 'I' || f == 'i')
        {
          /* Optional "intentional" or "intentionally" prefix.  */
          if ((size_t) (pfile->buffer->cur - from)
              < sizeof "intentional fallthru" - 1)
            return false;
          if (f == 'I' && memcmp (from + 1, "NTENTIONAL",
                                  sizeof "NTENTIONAL" - 1) == 0)
            all_upper = true;
          else if (memcmp (from + 1, "ntentional",
                           sizeof "ntentional" - 1))
            return false;
          from += sizeof "intentional" - 1;
          if (*from == ' ')
            {
              from++;
              if (all_upper && *from == 'f')
                return false;
            }
          else if (all_upper)
            {
              /* The comparison includes the following 'F', but only
                 "LY " is skipped, leaving FROM on that 'F'.  */
              if (memcmp (from, "LY F", sizeof "LY F" - 1))
                return false;
              from += sizeof "LY " - 1;
            }
          else
            {
              if (memcmp (from, "ly ", sizeof "ly " - 1))
                return false;
              from += sizeof "ly " - 1;
            }
          /* Reject "intentional(ly) F...".  */
          if (f == 'i' && *from == 'F')
            return false;
          f = *from;
        }
      /* Now the mandatory "fall" word.  */
      if (f != 'F' && f != 'f')
        return false;
      if ((size_t) (pfile->buffer->cur - from) < sizeof "fallthru" - 1)
        return false;
      if (f == 'F' && memcmp (from + 1, "ALL", sizeof "ALL" - 1) == 0)
        all_upper = true;
      else if (all_upper)
        return false;
      else if (memcmp (from + 1, "all", sizeof "all" - 1))
        return false;
      from += sizeof "fall" - 1;
      /* Optional separator: "s ", a space or a dash; otherwise the
         "thr..." word must follow immediately.  */
      if (*from == (all_upper ? 'S' : 's') && from[1] == ' ')
        from += 2;
      else if (*from == ' ' || *from == '-')
        from++;
      else if (*from != (all_upper ? 'T' : 't'))
        return false;
      /* Accept 'T' only when the word began with 'F', and 't' only
         when not matching in all upper case.  */
      if ((f == 'f' || *from != 'T') && (all_upper || *from != 't'))
        return false;
      if ((size_t) (pfile->buffer->cur - from) < sizeof "thru" - 1)
        return false;
      /* Try "thru" first and fall back to "through".  */
      if (memcmp (from + 1, all_upper ? "HRU" : "hru", sizeof "hru" - 1))
        {
          if ((size_t) (pfile->buffer->cur - from) < sizeof "through" - 1)
            return false;
          if (memcmp (from + 1, all_upper ? "HROUGH" : "hrough",
                      sizeof "hrough" - 1))
            return false;
          from += sizeof "through" - 1;
        }
      else
        from += sizeof "thru" - 1;
      while (*from == ' ' || *from == '\t' || *from == '.' || *from == '!')
        from++;
      /* After a '-' the rest of the comment is skipped.  */
      if (*from == '-')
        {
          from++;
          if (*comment_start == '*')
            {
              /* Block comment: advance to the closing star-slash,
                 stepping over any '*' that is not followed by '/',
                 and stop early at a NUL, newline or carriage
                 return.  */
              do
                {
                  while (*from && *from != '*'
                         && *from != '\n' && *from != '\r')
                    from++;
                  if (*from != '*' || from[1] == '/')
                    break;
                  from++;
                }
              while (1);
            }
          else
            /* Line comment: advance to a NUL, newline or carriage
               return.  */
            while (*from && *from != '\n' && *from != '\r')
              from++;
        }
    }
  /* Whichever pattern matched, nothing but the end of the comment may
     follow it.  */
  /* C block comment.  */
  if (*comment_start == '*')
    {
      if (*from != '*' || from[1] != '/')
        return false;
    }
  /* C++ line comment.  */
  else if (*from != '\n')
    return false;

  return true;
}
2474
2475 /* Allocate COUNT tokens for RUN.  */
2476 void
2477 _cpp_init_tokenrun (tokenrun *run, unsigned int count)
2478 {
2479   run->base = XNEWVEC (cpp_token, count);
2480   run->limit = run->base + count;
2481   run->next = NULL;
2482 }
2483
2484 /* Returns the next tokenrun, or creates one if there is none.  */
2485 static tokenrun *
2486 next_tokenrun (tokenrun *run)
2487 {
2488   if (run->next == NULL)
2489     {
2490       run->next = XNEW (tokenrun);
2491       run->next->prev = run;
2492       _cpp_init_tokenrun (run->next, 250);
2493     }
2494
2495   return run->next;
2496 }
2497
2498 /* Return the number of not yet processed token in a given
2499    context.  */
2500 int
2501 _cpp_remaining_tokens_num_in_context (cpp_context *context)
2502 {
2503   if (context->tokens_kind == TOKENS_KIND_DIRECT)
2504     return (LAST (context).token - FIRST (context).token);
2505   else if (context->tokens_kind == TOKENS_KIND_INDIRECT
2506            || context->tokens_kind == TOKENS_KIND_EXTENDED)
2507     return (LAST (context).ptoken - FIRST (context).ptoken);
2508   else
2509       abort ();
2510 }
2511
2512 /* Returns the token present at index INDEX in a given context.  If
2513    INDEX is zero, the next token to be processed is returned.  */
2514 static const cpp_token*
2515 _cpp_token_from_context_at (cpp_context *context, int index)
2516 {
2517   if (context->tokens_kind == TOKENS_KIND_DIRECT)
2518     return &(FIRST (context).token[index]);
2519   else if (context->tokens_kind == TOKENS_KIND_INDIRECT
2520            || context->tokens_kind == TOKENS_KIND_EXTENDED)
2521     return FIRST (context).ptoken[index];
2522  else
2523    abort ();
2524 }
2525
2526 /* Look ahead in the input stream.  */
2527 const cpp_token *
2528 cpp_peek_token (cpp_reader *pfile, int index)
2529 {
2530   cpp_context *context = pfile->context;
2531   const cpp_token *peektok;
2532   int count;
2533
2534   /* First, scan through any pending cpp_context objects.  */
2535   while (context->prev)
2536     {
2537       ptrdiff_t sz = _cpp_remaining_tokens_num_in_context (context);
2538
2539       if (index < (int) sz)
2540         return _cpp_token_from_context_at (context, index);
2541       index -= (int) sz;
2542       context = context->prev;
2543     }
2544
2545   /* We will have to read some new tokens after all (and do so
2546      without invalidating preceding tokens).  */
2547   count = index;
2548   pfile->keep_tokens++;
2549
2550   /* For peeked tokens temporarily disable line_change reporting,
2551      until the tokens are parsed for real.  */
2552   void (*line_change) (cpp_reader *, const cpp_token *, int)
2553     = pfile->cb.line_change;
2554   pfile->cb.line_change = NULL;
2555
2556   do
2557     {
2558       peektok = _cpp_lex_token (pfile);
2559       if (peektok->type == CPP_EOF)
2560         {
2561           index--;
2562           break;
2563         }
2564       else if (peektok->type == CPP_PRAGMA)
2565         {
2566           /* Don't peek past a pragma.  */
2567           if (peektok == &pfile->directive_result)
2568             /* Save the pragma in the buffer.  */
2569             *pfile->cur_token++ = *peektok;
2570           index--;
2571           break;
2572         }
2573     }
2574   while (index--);
2575
2576   _cpp_backup_tokens_direct (pfile, count - index);
2577   pfile->keep_tokens--;
2578   pfile->cb.line_change = line_change;
2579
2580   return peektok;
2581 }
2582
2583 /* Allocate a single token that is invalidated at the same time as the
2584    rest of the tokens on the line.  Has its line and col set to the
2585    same as the last lexed token, so that diagnostics appear in the
2586    right place.  */
2587 cpp_token *
2588 _cpp_temp_token (cpp_reader *pfile)
2589 {
2590   cpp_token *old, *result;
2591   ptrdiff_t sz = pfile->cur_run->limit - pfile->cur_token;
2592   ptrdiff_t la = (ptrdiff_t) pfile->lookaheads;
2593
2594   old = pfile->cur_token - 1;
2595   /* Any pre-existing lookaheads must not be clobbered.  */
2596   if (la)
2597     {
2598       if (sz <= la)
2599         {
2600           tokenrun *next = next_tokenrun (pfile->cur_run);
2601
2602           if (sz < la)
2603             memmove (next->base + 1, next->base,
2604                      (la - sz) * sizeof (cpp_token));
2605
2606           next->base[0] = pfile->cur_run->limit[-1];
2607         }
2608
2609       if (sz > 1)
2610         memmove (pfile->cur_token + 1, pfile->cur_token,
2611                  MIN (la, sz - 1) * sizeof (cpp_token));
2612     }
2613
2614   if (!sz && pfile->cur_token == pfile->cur_run->limit)
2615     {
2616       pfile->cur_run = next_tokenrun (pfile->cur_run);
2617       pfile->cur_token = pfile->cur_run->base;
2618     }
2619
2620   result = pfile->cur_token++;
2621   result->src_loc = old->src_loc;
2622   return result;
2623 }
2624
2625 /* We're at the beginning of a logical line (so not in
2626   directives-mode) and RESULT is a CPP_NAME with NODE_MODULE set.  See
2627   if we should enter deferred_pragma mode to tokenize the rest of the
2628   line as a module control-line.

       PFILE is the reader; RESULT is the token already lexed at the
       start of the line.  One or two further tokens are peeked with
       _cpp_lex_direct (two when RESULT is 'export').  If the sequence
       looks like [export] {module|import|__import} followed by an
       acceptable token, the reader is left in deferred-pragma mode, the
       leading keyword tokens are marked NO_EXPAND and rewritten to the
       second entry of their n_modules pair, and
       state.directive_file_token is set.  Otherwise the reader state
       changed here is restored.  In both cases the peeked tokens are
       pushed back with _cpp_backup_tokens_direct so the caller sees
       them again.  */
2629
2630 static void
2631 cpp_maybe_module_directive (cpp_reader *pfile, cpp_token *result)
2632 {
2633   unsigned backup = 0; /* Tokens we peeked.  */
2634   cpp_hashnode *node = result->val.node.node;
2635   cpp_token *peek = result;
2636   cpp_token *keyword = peek;
2637   cpp_hashnode *(&n_modules)[spec_nodes::M_HWM][2] = pfile->spec_nodes.n_modules;
2638   int header_count = 0;
       /* KEYWORD is the token holding module/import/__import: RESULT
          itself, or the token after it when RESULT is 'export'.
          HEADER_COUNT stays zero for 'module' and is nonzero for the
          import forms; it is copied into state.directive_file_token
          below.  */
2639
2640   /* Make sure the incoming state is as we expect it.  This way we
2641      can restore it using constants.  */
2642   gcc_checking_assert (!pfile->state.in_deferred_pragma
2643                        && !pfile->state.skipping
2644                        && !pfile->state.parsing_args
2645                        && !pfile->state.angled_headers
2646                        && (pfile->state.save_comments
2647                            == !CPP_OPTION (pfile, discard_comments)));
2648
2649   /* Enter directives mode sufficiently for peeking.  We don't have
2650      to actually set in_directive.  */
2651   pfile->state.in_deferred_pragma = true;
2652
2653   /* These two fields are needed to process tokenization in deferred
2654      pragma mode.  They are not used outside deferred pragma mode or
2655      directives mode.  */
2656   pfile->state.pragma_allow_expansion = true;
2657   pfile->directive_line = result->src_loc;
2658
2659   /* Saving comments is incompatible with directives mode.   */
2660   pfile->state.save_comments = 0;
2661
       /* 'export' must be followed by another NODE_MODULE name, which
          then becomes the keyword examined below.  */
2662   if (node == n_modules[spec_nodes::M_EXPORT][0])
2663     {
2664       peek = _cpp_lex_direct (pfile);
2665       keyword = peek;
2666       backup++;
2667       if (keyword->type != CPP_NAME)
2668         goto not_module;
2669       node = keyword->val.node.node;
2670       if (!(node->flags & NODE_MODULE))
2671         goto not_module;
2672     }
2673
       /* NOTE(review): the extra 16 is added always for __import and
          only in preprocessed input for import; how the consumer of
          directive_file_token interprets it is not visible here.  */
2674   if (node == n_modules[spec_nodes::M__IMPORT][0])
2675     /* __import  */
2676     header_count = backup + 2 + 16;
2677   else if (node == n_modules[spec_nodes::M_IMPORT][0])
2678     /* import  */
2679     header_count = backup + 2 + (CPP_OPTION (pfile, preprocessed) ? 16 : 0);
2680   else if (node == n_modules[spec_nodes::M_MODULE][0])
2681     ; /* module  */
2682   else
2683     goto not_module;
2684
2685   /* We've seen [export] {module|import|__import}.  Check the next token.  */
2686   if (header_count)
2687     /* After '{,__}import' a header name may appear.  */
2688     pfile->state.angled_headers = true;
2689   peek = _cpp_lex_direct (pfile);
2690   backup++;
2691
2692   /* ... import followed by identifier, ':', '<' or
2693      header-name preprocessing tokens, or module
2694      followed by cpp-identifier, ':' or ';' preprocessing
2695      tokens.  C++ keywords are not yet relevant.  */
2696   if (peek->type == CPP_NAME
2697       || peek->type == CPP_COLON
2698       ||  (header_count
2699            ? (peek->type == CPP_LESS
2700               || (peek->type == CPP_STRING && peek->val.str.text[0] != 'R')
2701               || peek->type == CPP_HEADER_NAME)
2702            : peek->type == CPP_SEMICOLON))
2703     {
           /* Macro expansion on the rest of the line is allowed unless
              the input is already preprocessed; in that case bump
              prevent_expansion (undone when the end of line is lexed in
              deferred-pragma mode).  */
2704       pfile->state.pragma_allow_expansion = !CPP_OPTION (pfile, preprocessed);
2705       if (!pfile->state.pragma_allow_expansion)
2706         pfile->state.prevent_expansion++;
2707
           /* Only the 'module' form (header_count == 0) is diagnosed
              when it appears in an included file.  */
2708       if (!header_count && linemap_included_from
2709           (LINEMAPS_LAST_ORDINARY_MAP (pfile->line_table)))
2710         cpp_error_with_line (pfile, CPP_DL_ERROR, keyword->src_loc, 0,
2711                              "module control-line cannot be in included file");
2712
2713       /* The first one or two tokens cannot be macro names.  */
           /* BACKUP is 1 (keyword only, KEYWORD == RESULT) or 2 (export
              plus keyword), so this visits KEYWORD and then RESULT.  */
2714       for (int ix = backup; ix--;)
2715         {
2716           cpp_token *tok = ix ? keyword : result;
2717           cpp_hashnode *node = tok->val.node.node;
2718
2719           /* Don't attempt to expand the token.  */
2720           tok->flags |= NO_EXPAND;
2721           if (_cpp_defined_macro_p (node)
2722               && _cpp_maybe_notify_macro_use (pfile, node, tok->src_loc)
2723               && !cpp_fun_like_macro_p (node))
2724             cpp_error_with_line (pfile, CPP_DL_ERROR, tok->src_loc, 0,
2725                                  "module control-line \"%s\" cannot be"
2726                                  " an object-like macro",
2727                                  NODE_NAME (node));
2728         }
2729
2730       /* Map to underbar variants.  */
           /* Both import spellings map to the M_IMPORT entry.  RESULT is
              rewritten only when it was a separate 'export' token
              (backup != 1).  */
2731       keyword->val.node.node = n_modules[header_count
2732                                          ? spec_nodes::M_IMPORT
2733                                          : spec_nodes::M_MODULE][1];
2734       if (backup != 1)
2735         result->val.node.node = n_modules[spec_nodes::M_EXPORT][1];
2736
2737       /* Maybe tell the tokenizer we expect a header-name down the
2738          road.  */
2739       pfile->state.directive_file_token = header_count;
2740     }
2741   else
2742     {
2743     not_module:
2744       /* Drop out of directive mode.  */
2745       /* We asserted save_comments had this value upon entry.  */
2746       pfile->state.save_comments
2747         = !CPP_OPTION (pfile, discard_comments);
2748       pfile->state.in_deferred_pragma = false;
2749       /* Do not let this remain on.  */
2750       pfile->state.angled_headers = false;
2751     }
2752
2753   /* In either case we want to backup the peeked tokens.  */
2754   if (backup)
2755     {
2756       /* If we saw EOL, we should drop it, because this isn't a module
2757          control-line after all.  */
           /* A CPP_PRAGMA_EOL peek never satisfies the acceptance test
              above, so EOL here implies the not_module path was taken.
              If the EOL was the only peeked token nothing is backed up.  */
2758       bool eol = peek->type == CPP_PRAGMA_EOL;
2759       if (!eol || backup > 1)
2760         {
2761           /* Put the peeked tokens back  */
2762           _cpp_backup_tokens_direct (pfile, backup);
2763           /* But if the last one was an EOL, forget it.  */
2764           if (eol)
2765             pfile->lookaheads--;
2766         }
2767     }
2768 }
2769
2770 /* Lex a token into RESULT (external interface).  Takes care of issues
2771    like directive handling, token lookahead, multiple include
2772    optimization and skipping.

        Returns a pointer to the token produced: either a slot in the
        current token run (a backed-up lookahead or a freshly lexed
        token from _cpp_lex_direct) or &pfile->directive_result.  Tokens
        lexed while state.skipping is set are discarded and the loop
        continues, except for CPP_EOF and tokens seen while in a
        directive or deferred pragma.  */
2773 const cpp_token *
2774 _cpp_lex_token (cpp_reader *pfile)
2775 {
2776   cpp_token *result;
2777
2778   for (;;)
2779     {
           /* Step to the next token run when the current one is full.  */
2780       if (pfile->cur_token == pfile->cur_run->limit)
2781         {
2782           pfile->cur_run = next_tokenrun (pfile->cur_run);
2783           pfile->cur_token = pfile->cur_run->base;
2784         }
2785       /* We assume that the current token is somewhere in the current
2786          run.  */
2787       if (pfile->cur_token < pfile->cur_run->base
2788           || pfile->cur_token >= pfile->cur_run->limit)
2789         abort ();
2790
           /* With lookaheads pending, the token is already in the run at
              cur_token; hand it out instead of lexing a new one.  */
2791       if (pfile->lookaheads)
2792         {
2793           pfile->lookaheads--;
2794           result = pfile->cur_token++;
2795         }
2796       else
2797         result = _cpp_lex_direct (pfile);
2798
           /* Extra processing for the first token of a line.  */
2799       if (result->flags & BOL)
2800         {
2801           /* Is this a directive.  If _cpp_handle_directive returns
2802              false, it is an assembler #.  */
2803           if (result->type == CPP_HASH
2804               /* 6.10.3 p 11: Directives in a list of macro arguments
2805                  gives undefined behavior.  This implementation
2806                  handles the directive as normal.  */
2807               && pfile->state.parsing_args != 1)
2808             {
2809               if (_cpp_handle_directive (pfile, result->flags & PREV_WHITE))
2810                 {
                       /* A CPP_PADDING directive_result means the
                          directive yielded nothing to return; lex the
                          next token.  Otherwise return directive_result.  */
2811                   if (pfile->directive_result.type == CPP_PADDING)
2812                     continue;
2813                   result = &pfile->directive_result;
2814                 }
2815             }
               /* NOTE(review): presumably directive_result holds the
                  token to deliver while a deferred pragma is active at
                  the start of a line -- confirm against the code that
                  fills directive_result.  */
2816           else if (pfile->state.in_deferred_pragma)
2817             result = &pfile->directive_result;
2818           else if (result->type == CPP_NAME
2819                    && (result->val.node.node->flags & NODE_MODULE)
2820                    && !pfile->state.skipping
2821                    /* Unlike regular directives, we do not deal with
2822                       tokenizing module directives as macro arguments.
2823                       That's not permitted.  */
2824                    && !pfile->state.parsing_args)
2825             {
2826               /* P1857.  Before macro expansion, At start of logical
2827                  line ... */
2828               /* We don't have to consider lookaheads at this point.  */
2829               gcc_checking_assert (!pfile->lookaheads);
2830
2831               cpp_maybe_module_directive (pfile, result);
2832             }
2833
               /* Notify the line_change callback, if one is installed,
                  unless we are skipping.  */
2834           if (pfile->cb.line_change && !pfile->state.skipping)
2835             pfile->cb.line_change (pfile, result, pfile->state.parsing_args);
2836         }
2837
2838       /* We don't skip tokens in directives.  */
2839       if (pfile->state.in_directive || pfile->state.in_deferred_pragma)
2840         break;
2841
2842       /* Outside a directive, invalidate controlling macros.  At file
2843          EOF, _cpp_lex_direct takes care of popping the buffer, so we never
2844          get here and MI optimization works.  */
2845       pfile->mi_valid = false;
2846
           /* While skipping, only CPP_EOF is returned to the caller.  */
2847       if (!pfile->state.skipping || result->type == CPP_EOF)
2848         break;
2849     }
2850
2851   return result;
2852 }
2853
2854 /* Returns true if a fresh line has been loaded.  */
2855 bool
2856 _cpp_get_fresh_line (cpp_reader *pfile)
2857 {
2858   /* We can't get a new line until we leave the current directive.  */
2859   if (pfile->state.in_directive)
2860     return false;
2861
2862   for (;;)
2863     {
2864       cpp_buffer *buffer = pfile->buffer;
2865
2866       if (!buffer->need_line)
2867         return true;
2868
2869       if (buffer->next_line < buffer->rlimit)
2870         {
2871           _cpp_clean_line (pfile);
2872           return true;
2873         }
2874
2875       /* First, get out of parsing arguments state.  */
2876       if (pfile->state.parsing_args)
2877         return false;
2878
2879       /* End of buffer.  Non-empty files should end in a newline.  */
2880       if (buffer->buf != buffer->rlimit
2881           && buffer->next_line > buffer->rlimit
2882           && !buffer->from_stage3)
2883         {
2884           /* Clip to buffer size.  */
2885           buffer->next_line = buffer->rlimit;
2886         }
2887
2888       if (buffer->prev && !buffer->return_at_eof)
2889         _cpp_pop_buffer (pfile);
2890       else
2891         {
2892           /* End of translation.  Do not pop the buffer yet. Increment
2893              line number so that the EOF token is on a line of its own
2894              (_cpp_lex_direct doesn't increment in that case, because
2895              it's hard for it to distinguish this special case). */
2896           CPP_INCREMENT_LINE (pfile, 0);
2897           return false;
2898         }
2899     }
2900 }
2901
/* Lexing helper that relies on variables named `buffer' and `result'
   being in scope where it is used.  If the next unread character is
   CHAR, consume it and set result->type to THEN_TYPE; otherwise leave
   the input position alone and set result->type to ELSE_TYPE.  */
#define IF_NEXT_IS(CHAR, THEN_TYPE, ELSE_TYPE)		\
  do							\
    {							\
      if (*buffer->cur == CHAR)				\
	{						\
	  buffer->cur++;				\
	  result->type = THEN_TYPE;			\
	}						\
      else						\
	result->type = ELSE_TYPE;			\
    }							\
  while (0)
2910
2911 /* Lex a token into pfile->cur_token, which is also incremented, to
2912    get diagnostics pointing to the correct location.
2913
2914    Does not handle issues such as token lookahead, multiple-include
2915    optimization, directives, skipping etc.  This function is only
2916    suitable for use by _cpp_lex_token, and in special cases like
2917    lex_expansion_token which doesn't care for any of these issues.
2918
2919    When meeting a newline, returns CPP_EOF if parsing a directive,
2920    otherwise returns to the start of the token buffer if permissible.
2921    In deferred-pragma mode a newline yields CPP_PRAGMA_EOL instead.
        Returns a pointer to the lexed token, whose src_loc holds its
        location.  */
2922 cpp_token *
2923 _cpp_lex_direct (cpp_reader *pfile)
2924 {
2925   cppchar_t c;
2926   cpp_buffer *buffer;
2927   const unsigned char *comment_start;
2928   bool fallthrough_comment = false;
2929   cpp_token *result = pfile->cur_token++;
2930
       /* Entered initially and again after each newline consumed outside
          deferred-pragma mode.  */
2931  fresh_line:
2932   result->flags = 0;
2933   buffer = pfile->buffer;
2934   if (buffer->need_line)
2935     {
2936       gcc_assert (!pfile->state.in_deferred_pragma);
2937       if (!_cpp_get_fresh_line (pfile))
2938         {
2939           result->type = CPP_EOF;
2940           /* Not a real EOF in a directive or arg parsing -- we refuse
2941              to advance to the next file now, and will once we're out
2942              of those modes.  */
2943           if (!pfile->state.in_directive && !pfile->state.parsing_args)
2944             {
2945               /* Tell the compiler the line number of the EOF token.  */
2946               result->src_loc = pfile->line_table->highest_line;
2947               result->flags = BOL;
2948               /* Now pop the buffer that _cpp_get_fresh_line did not.  */
2949               _cpp_pop_buffer (pfile);
2950             }
2951           return result;
2952         }
           /* Fetching the line switched buffers, so a pending
              fallthrough comment no longer applies.  */
2953       if (buffer != pfile->buffer)
2954         fallthrough_comment = false;
           /* Unless tokens must be kept, rewind to the first slot of
              base_run and lex into that.  */
2955       if (!pfile->keep_tokens)
2956         {
2957           pfile->cur_run = &pfile->base_run;
2958           result = pfile->base_run.base;
2959           pfile->cur_token = result + 1;
2960         }
2961       result->flags = BOL;
2962       if (pfile->state.parsing_args == 2)
2963         result->flags |= PREV_WHITE;
2964     }
2965   buffer = pfile->buffer;
       /* Re-entered after a comment that is not being saved.  */
2966  update_tokens_line:
2967   result->src_loc = pfile->line_table->highest_line;
2968
       /* Re-entered after a run of horizontal whitespace.  */
2969  skipped_white:
2970   if (buffer->cur >= buffer->notes[buffer->cur_note].pos
2971       && !pfile->overlaid_buffer)
2972     {
2973       _cpp_process_line_notes (pfile, false);
2974       result->src_loc = pfile->line_table->highest_line;
2975     }
2976   c = *buffer->cur++;
2977
       /* A forced token location, when set, overrides the position
          computed from the buffer column.  */
2978   if (pfile->forced_token_location)
2979     result->src_loc = pfile->forced_token_location;
2980   else
2981     result->src_loc = linemap_position_for_column (pfile->line_table,
2982                                           CPP_BUF_COLUMN (buffer, buffer->cur));
2983
       /* Dispatch on the first character of the token.  */
2984   switch (c)
2985     {
         /* Horizontal whitespace; NUL is handled here as well.  */
2986     case ' ': case '\t': case '\f': case '\v': case '\0':
2987       result->flags |= PREV_WHITE;
2988       skip_whitespace (pfile, c);
2989       goto skipped_white;
2990
2991     case '\n':
2992       /* Increment the line, unless this is the last line ...  */
2993       if (buffer->cur < buffer->rlimit
2994           /* ... or this is a #include, (where _cpp_stack_file needs to
2995              unwind by one line) ...  */
2996           || (pfile->state.in_directive > 1
2997               /* ... except traditional-cpp increments this elsewhere.  */
2998               && !CPP_OPTION (pfile, traditional)))
2999         CPP_INCREMENT_LINE (pfile, 0);
3000       buffer->need_line = true;
3001       if (pfile->state.in_deferred_pragma)
3002         {
3003           /* Produce the PRAGMA_EOL on this line.  File reading
3004              ensures there is always a \n at end of the buffer, thus
3005              in a deferred pragma we always see CPP_PRAGMA_EOL before
3006              any CPP_EOF.  */
3007           result->type = CPP_PRAGMA_EOL;
3008           result->flags &= ~PREV_WHITE;
3009           pfile->state.in_deferred_pragma = false;
               /* Undo the prevent_expansion increment made when
                  expansion was disallowed for this pragma line.  */
3010           if (!pfile->state.pragma_allow_expansion)
3011             pfile->state.prevent_expansion--;
3012           return result;
3013         }
3014       goto fresh_line;
3015
3016     case '0': case '1': case '2': case '3': case '4':
3017     case '5': case '6': case '7': case '8': case '9':
3018       {
3019         struct normalize_state nst = INITIAL_NORMALIZE_STATE;
3020         result->type = CPP_NUMBER;
3021         lex_number (pfile, &result->val.str, &nst);
3022         warn_about_normalization (pfile, result, &nst);
3023         break;
3024       }
3025
3026     case 'L':
3027     case 'u':
3028     case 'U':
3029     case 'R':
3030       /* 'L', 'u', 'U', 'u8' or 'R' may introduce wide characters,
3031          wide strings or raw strings.  */
3032       if (c == 'L' || CPP_OPTION (pfile, rliterals)
3033           || (c != 'R' && CPP_OPTION (pfile, uliterals)))
3034         {
3035           if ((*buffer->cur == '\'' && c != 'R')
3036               || *buffer->cur == '"'
3037               || (*buffer->cur == 'R'
3038                   && c != 'R'
3039                   && buffer->cur[1] == '"'
3040                   && CPP_OPTION (pfile, rliterals))
3041               || (*buffer->cur == '8'
3042                   && c == 'u'
3043                   && ((buffer->cur[1] == '"' || (buffer->cur[1] == '\''
3044                                 && CPP_OPTION (pfile, utf8_char_literals)))
3045                       || (buffer->cur[1] == 'R' && buffer->cur[2] == '"'
3046                           && CPP_OPTION (pfile, rliterals)))))
3047             {
3048               lex_string (pfile, result, buffer->cur - 1);
3049               break;
3050             }
3051         }
3052       /* Fall through.  */
3053
         /* Identifier start characters.  'L', 'u', 'U' and 'R' are
            missing from this list because they are the case labels
            just above, which fall through to here when no literal
            prefix is recognized.  */
3054     case '_':
3055     case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
3056     case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
3057     case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
3058     case 's': case 't':           case 'v': case 'w': case 'x':
3059     case 'y': case 'z':
3060     case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
3061     case 'G': case 'H': case 'I': case 'J': case 'K':
3062     case 'M': case 'N': case 'O': case 'P': case 'Q':
3063     case 'S': case 'T':           case 'V': case 'W': case 'X':
3064     case 'Y': case 'Z':
3065       result->type = CPP_NAME;
3066       {
3067         struct normalize_state nst = INITIAL_NORMALIZE_STATE;
3068         result->val.node.node = lex_identifier (pfile, buffer->cur - 1, false,
3069                                                 &nst,
3070                                                 &result->val.node.spelling);
3071         warn_about_normalization (pfile, result, &nst);
3072       }
3073
3074       /* Convert named operators to their proper types.  */
3075       if (result->val.node.node->flags & NODE_OPERATOR)
3076         {
3077           result->flags |= NAMED_OP;
3078           result->type = (enum cpp_ttype) result->val.node.node->directive_index;
3079         }
3080
3081       /* Signal FALLTHROUGH comment followed by another token.  */
3082       if (fallthrough_comment)
3083         result->flags |= PREV_FALLTHROUGH;
3084       break;
3085
3086     case '\'':
3087     case '"':
3088       lex_string (pfile, result, buffer->cur - 1);
3089       break;
3090
3091     case '/':
3092       /* A potential block or line comment.  */
           /* C is reloaded with the character after the slash; it is
              what save_comment receives further down.  */
3093       comment_start = buffer->cur;
3094       c = *buffer->cur;
3095
3096       if (c == '*')
3097         {
3098           if (_cpp_skip_block_comment (pfile))
3099             cpp_error (pfile, CPP_DL_ERROR, "unterminated comment");
3100         }
3101       else if (c == '/' && ! CPP_OPTION (pfile, traditional))
3102         {
3103           /* Don't warn for system headers.  */
3104           if (_cpp_in_system_header (pfile))
3105             ;
3106           /* Warn about comments if pedantically GNUC89, and not
3107              in system headers.  */
3108           else if (CPP_OPTION (pfile, lang) == CLK_GNUC89
3109                    && CPP_PEDANTIC (pfile)
3110                    && ! buffer->warned_cplusplus_comments)
3111             {
3112               if (cpp_error (pfile, CPP_DL_PEDWARN,
3113                              "C++ style comments are not allowed in ISO C90"))
3114                 cpp_error (pfile, CPP_DL_NOTE,
3115                            "(this will be reported only once per input file)");
3116               buffer->warned_cplusplus_comments = 1;
3117             }
3118           /* Or if specifically desired via -Wc90-c99-compat.  */
3119           else if (CPP_OPTION (pfile, cpp_warn_c90_c99_compat) > 0
3120                    && ! CPP_OPTION (pfile, cplusplus)
3121                    && ! buffer->warned_cplusplus_comments)
3122             {
3123               if (cpp_error (pfile, CPP_DL_WARNING,
3124                              "C++ style comments are incompatible with C90"))
3125                 cpp_error (pfile, CPP_DL_NOTE,
3126                            "(this will be reported only once per input file)");
3127               buffer->warned_cplusplus_comments = 1;
3128             }
3129           /* In C89/C94, C++ style comments are forbidden.  */
3130           else if ((CPP_OPTION (pfile, lang) == CLK_STDC89
3131                     || CPP_OPTION (pfile, lang) == CLK_STDC94))
3132             {
3133               /* But don't be confused about valid code such as
3134                  - // immediately followed by *,
3135                  - // in a preprocessing directive,
3136                  - // in an #if 0 block.  */
3137               if (buffer->cur[1] == '*'
3138                   || pfile->state.in_directive
3139                   || pfile->state.skipping)
3140                 {
                       /* Only the first slash has been consumed; return
                          it as a division operator.  */
3141                   result->type = CPP_DIV;
3142                   break;
3143                 }
3144               else if (! buffer->warned_cplusplus_comments)
3145                 {
3146                   if (cpp_error (pfile, CPP_DL_ERROR,
3147                                  "C++ style comments are not allowed in "
3148                                  "ISO C90"))
3149                     cpp_error (pfile, CPP_DL_NOTE,
3150                                "(this will be reported only once per input "
3151                                "file)");
3152                   buffer->warned_cplusplus_comments = 1;
3153                 }
3154             }
3155           if (skip_line_comment (pfile) && CPP_OPTION (pfile, warn_comments))
3156             cpp_warning (pfile, CPP_W_COMMENTS, "multi-line comment");
3157         }
3158       else if (c == '=')
3159         {
3160           buffer->cur++;
3161           result->type = CPP_DIV_EQ;
3162           break;
3163         }
3164       else
3165         {
3166           result->type = CPP_DIV;
3167           break;
3168         }
3169
           /* Only reached after a block or line comment was skipped.  */
3170       if (fallthrough_comment_p (pfile, comment_start))
3171         fallthrough_comment = true;
3172
           /* Report the comment text, including its leading slash, to
              the comment callback if one is installed.  */
3173       if (pfile->cb.comment)
3174         {
3175           size_t len = pfile->buffer->cur - comment_start;
3176           pfile->cb.comment (pfile, result->src_loc, comment_start - 1,
3177                              len + 1);
3178         }
3179
           /* A discarded comment counts as whitespace before the next
              token, which is lexed into the same RESULT.  */
3180       if (!pfile->state.save_comments)
3181         {
3182           result->flags |= PREV_WHITE;
3183           goto update_tokens_line;
3184         }
3185
3186       if (fallthrough_comment)
3187         result->flags |= PREV_FALLTHROUGH;
3188
3189       /* Save the comment as a token in its own right.  */
3190       save_comment (pfile, result, comment_start, c);
3191       break;
3192
3193     case '<':
           /* When a header-name is acceptable, let lex_string try first;
              if it leaves the type as CPP_LESS, carry on with ordinary
              '<' handling.  */
3194       if (pfile->state.angled_headers)
3195         {
3196           lex_string (pfile, result, buffer->cur - 1);
3197           if (result->type != CPP_LESS)
3198             break;
3199         }
3200
3201       result->type = CPP_LESS;
3202       if (*buffer->cur == '=')
3203         {
3204           buffer->cur++, result->type = CPP_LESS_EQ;
3205           if (*buffer->cur == '>'
3206               && CPP_OPTION (pfile, cplusplus)
3207               && CPP_OPTION (pfile, lang) >= CLK_GNUCXX20)
3208             buffer->cur++, result->type = CPP_SPACESHIP;
3209         }
3210       else if (*buffer->cur == '<')
3211         {
3212           buffer->cur++;
3213           IF_NEXT_IS ('=', CPP_LSHIFT_EQ, CPP_LSHIFT);
3214         }
3215       else if (CPP_OPTION (pfile, digraphs))
3216         {
3217           if (*buffer->cur == ':')
3218             {
3219               /* C++11 [2.5/3 lex.pptoken], "Otherwise, if the next
3220                  three characters are <:: and the subsequent character
3221                  is neither : nor >, the < is treated as a preprocessor
3222                  token by itself".  */
3223               if (CPP_OPTION (pfile, cplusplus)
3224                   && CPP_OPTION (pfile, lang) != CLK_CXX98
3225                   && CPP_OPTION (pfile, lang) != CLK_GNUCXX
3226                   && buffer->cur[1] == ':'
3227                   && buffer->cur[2] != ':' && buffer->cur[2] != '>')
3228                 break;
3229
3230               buffer->cur++;
3231               result->flags |= DIGRAPH;
3232               result->type = CPP_OPEN_SQUARE;
3233             }
3234           else if (*buffer->cur == '%')
3235             {
3236               buffer->cur++;
3237               result->flags |= DIGRAPH;
3238               result->type = CPP_OPEN_BRACE;
3239             }
3240         }
3241       break;
3242
3243     case '>':
3244       result->type = CPP_GREATER;
3245       if (*buffer->cur == '=')
3246         buffer->cur++, result->type = CPP_GREATER_EQ;
3247       else if (*buffer->cur == '>')
3248         {
3249           buffer->cur++;
3250           IF_NEXT_IS ('=', CPP_RSHIFT_EQ, CPP_RSHIFT);
3251         }
3252       break;
3253
3254     case '%':
3255       result->type = CPP_MOD;
3256       if (*buffer->cur == '=')
3257         buffer->cur++, result->type = CPP_MOD_EQ;
3258       else if (CPP_OPTION (pfile, digraphs))
3259         {
               /* Digraphs: "%:" is '#', "%:%:" is '##', "%>" is '}'.  */
3260           if (*buffer->cur == ':')
3261             {
3262               buffer->cur++;
3263               result->flags |= DIGRAPH;
3264               result->type = CPP_HASH;
3265               if (*buffer->cur == '%' && buffer->cur[1] == ':')
3266                 buffer->cur += 2, result->type = CPP_PASTE, result->val.token_no = 0;
3267             }
3268           else if (*buffer->cur == '>')
3269             {
3270               buffer->cur++;
3271               result->flags |= DIGRAPH;
3272               result->type = CPP_CLOSE_BRACE;
3273             }
3274         }
3275       break;
3276
3277     case '.':
3278       result->type = CPP_DOT;
           /* A dot followed by a digit starts a number.  */
3279       if (ISDIGIT (*buffer->cur))
3280         {
3281           struct normalize_state nst = INITIAL_NORMALIZE_STATE;
3282           result->type = CPP_NUMBER;
3283           lex_number (pfile, &result->val.str, &nst);
3284           warn_about_normalization (pfile, result, &nst);
3285         }
3286       else if (*buffer->cur == '.' && buffer->cur[1] == '.')
3287         buffer->cur += 2, result->type = CPP_ELLIPSIS;
3288       else if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
3289         buffer->cur++, result->type = CPP_DOT_STAR;
3290       break;
3291
3292     case '+':
3293       result->type = CPP_PLUS;
3294       if (*buffer->cur == '+')
3295         buffer->cur++, result->type = CPP_PLUS_PLUS;
3296       else if (*buffer->cur == '=')
3297         buffer->cur++, result->type = CPP_PLUS_EQ;
3298       break;
3299
3300     case '-':
3301       result->type = CPP_MINUS;
3302       if (*buffer->cur == '>')
3303         {
3304           buffer->cur++;
3305           result->type = CPP_DEREF;
3306           if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
3307             buffer->cur++, result->type = CPP_DEREF_STAR;
3308         }
3309       else if (*buffer->cur == '-')
3310         buffer->cur++, result->type = CPP_MINUS_MINUS;
3311       else if (*buffer->cur == '=')
3312         buffer->cur++, result->type = CPP_MINUS_EQ;
3313       break;
3314
3315     case '&':
3316       result->type = CPP_AND;
3317       if (*buffer->cur == '&')
3318         buffer->cur++, result->type = CPP_AND_AND;
3319       else if (*buffer->cur == '=')
3320         buffer->cur++, result->type = CPP_AND_EQ;
3321       break;
3322
3323     case '|':
3324       result->type = CPP_OR;
3325       if (*buffer->cur == '|')
3326         buffer->cur++, result->type = CPP_OR_OR;
3327       else if (*buffer->cur == '=')
3328         buffer->cur++, result->type = CPP_OR_EQ;
3329       break;
3330
3331     case ':':
3332       result->type = CPP_COLON;
3333       if (*buffer->cur == ':' && CPP_OPTION (pfile, scope))
3334         buffer->cur++, result->type = CPP_SCOPE;
3335       else if (*buffer->cur == '>' && CPP_OPTION (pfile, digraphs))
3336         {
3337           buffer->cur++;
3338           result->flags |= DIGRAPH;
3339           result->type = CPP_CLOSE_SQUARE;
3340         }
3341       break;
3342
3343     case '*': IF_NEXT_IS ('=', CPP_MULT_EQ, CPP_MULT); break;
3344     case '=': IF_NEXT_IS ('=', CPP_EQ_EQ, CPP_EQ); break;
3345     case '!': IF_NEXT_IS ('=', CPP_NOT_EQ, CPP_NOT); break;
3346     case '^': IF_NEXT_IS ('=', CPP_XOR_EQ, CPP_XOR); break;
3347     case '#': IF_NEXT_IS ('#', CPP_PASTE, CPP_HASH); result->val.token_no = 0; break;
3348
3349     case '?': result->type = CPP_QUERY; break;
3350     case '~': result->type = CPP_COMPL; break;
3351     case ',': result->type = CPP_COMMA; break;
3352     case '(': result->type = CPP_OPEN_PAREN; break;
3353     case ')': result->type = CPP_CLOSE_PAREN; break;
3354     case '[': result->type = CPP_OPEN_SQUARE; break;
3355     case ']': result->type = CPP_CLOSE_SQUARE; break;
3356     case '{': result->type = CPP_OPEN_BRACE; break;
3357     case '}': result->type = CPP_CLOSE_BRACE; break;
3358     case ';': result->type = CPP_SEMICOLON; break;
3359
3360       /* @ is a punctuator in Objective-C.  */
3361     case '@': result->type = CPP_ATSIGN; break;
3362
3363     default:
3364       {
             /* Un-read the character so BASE points at the start of the
                token.  */
3365         const uchar *base = --buffer->cur;
3366
3367         /* Check for an extended identifier ($ or UCN or UTF-8).  */
3368         struct normalize_state nst = INITIAL_NORMALIZE_STATE;
3369         if (forms_identifier_p (pfile, true, &nst))
3370           {
3371             result->type = CPP_NAME;
3372             result->val.node.node = lex_identifier (pfile, base, true, &nst,
3373                                                     &result->val.node.spelling);
3374             warn_about_normalization (pfile, result, &nst);
3375             break;
3376           }
3377
3378         /* Otherwise this will form a CPP_OTHER token.  Parse valid UTF-8 as a
3379            single token.  */
3380         buffer->cur++;
3381         if (c >= utf8_signifier)
3382           {
3383             const uchar *pstr = base;
3384             cppchar_t s;
3385             if (_cpp_valid_utf8 (pfile, &pstr, buffer->rlimit, 0, NULL, &s))
3386               buffer->cur = pstr;
3387           }
3388         create_literal (pfile, result, base, buffer->cur - base, CPP_OTHER);
3389         break;
3390       }
3391
3392     }
3393
3394   /* Potentially convert the location of the token to a range.  */
3395   if (result->src_loc >= RESERVED_LOCATION_COUNT
3396       && result->type != CPP_EOF)
3397     {
3398       /* Ensure that any line notes are processed, so that we have the
3399          correct physical line/column for the end-point of the token even
3400          when a logical line is split via one or more backslashes.  */
3401       if (buffer->cur >= buffer->notes[buffer->cur_note].pos
3402           && !pfile->overlaid_buffer)
3403         _cpp_process_line_notes (pfile, false);
3404
           /* The range starts at the token's current location and
              finishes at the column of the current buffer position.  */
3405       source_range tok_range;
3406       tok_range.m_start = result->src_loc;
3407       tok_range.m_finish
3408         = linemap_position_for_column (pfile->line_table,
3409                                        CPP_BUF_COLUMN (buffer, buffer->cur));
3410
3411       result->src_loc = COMBINE_LOCATION_DATA (pfile->line_table,
3412                                                result->src_loc,
3413                                                tok_range, NULL);
3414     }
3415
3416   return result;
3417 }
3418
3419 /* An upper bound on the number of bytes needed to spell TOKEN.
3420    Does not include preceding whitespace.  */
3421 unsigned int
3422 cpp_token_len (const cpp_token *token)
3423 {
3424   unsigned int len;
3425
3426   switch (TOKEN_SPELL (token))
3427     {
3428     default:            len = 6;                                break;
3429     case SPELL_LITERAL: len = token->val.str.len;               break;
3430     case SPELL_IDENT:   len = NODE_LEN (token->val.node.node) * 10;     break;
3431     }
3432
3433   return len;
3434 }
3435
/* Decode the single UTF-8 sequence starting at NAME and write it to
   BUFFER as a universal character name of the form \UXXXXXXXX, using
   lowercase hexadecimal digits.  Exactly 10 bytes are written and no
   terminating NUL is added.  Returns the number of bytes consumed
   from NAME, which is the number of leading one bits in its first
   byte.  Aborts if a continuation byte does not have the form
   10xxxxxx.  */

static size_t
utf8_to_ucn (unsigned char *buffer, const unsigned char *name)
{
  static const char hex_digits[] = "0123456789abcdef";
  unsigned lead = *name;
  int seq_len = 0;
  unsigned long code_point;
  int k;
  int shift;

  /* The count of leading one bits in the first byte is the length of
     the whole sequence.  */
  while (lead & 0x80)
    {
      seq_len++;
      lead <<= 1;
    }

  /* Payload bits of the first byte, then six bits from each
     continuation byte.  */
  code_point = *name & (0x7F >> seq_len);
  for (k = 1; k < seq_len; k++)
    {
      const unsigned char cont = name[k];

      code_point = (code_point << 6) | (cont & 0x3F);

      /* Ill-formed UTF-8.  */
      if ((cont & ~0x3F) != 0x80)
	abort ();
    }

  buffer[0] = '\\';
  buffer[1] = 'U';
  buffer += 2;

  /* Eight hex digits, most significant nibble first.  */
  for (shift = 28; shift >= 0; shift -= 4)
    *buffer++ = hex_digits[(code_point >> shift) & 0xF];

  return seq_len;
}
3469
3470 /* Given a token TYPE corresponding to a digraph, return a pointer to
3471    the spelling of the digraph.  */
3472 static const unsigned char *
3473 cpp_digraph2name (enum cpp_ttype type)
3474 {
3475   return digraph_spellings[(int) type - (int) CPP_FIRST_DIGRAPH];
3476 }
3477
3478 /* Write the spelling of an identifier IDENT, using UCNs, to BUFFER.
3479    The buffer must already contain the enough space to hold the
3480    token's spelling.  Returns a pointer to the character after the
3481    last character written.  */
3482 unsigned char *
3483 _cpp_spell_ident_ucns (unsigned char *buffer, cpp_hashnode *ident)
3484 {
3485   size_t i;
3486   const unsigned char *name = NODE_NAME (ident);
3487
3488   for (i = 0; i < NODE_LEN (ident); i++)
3489     if (name[i] & ~0x7F)
3490       {
3491         i += utf8_to_ucn (buffer, name + i) - 1;
3492         buffer += 10;
3493       }
3494     else
3495       *buffer++ = name[i];
3496
3497   return buffer;
3498 }
3499
3500 /* Write the spelling of a token TOKEN to BUFFER.  The buffer must
3501    already contain the enough space to hold the token's spelling.
3502    Returns a pointer to the character after the last character written.
3503    FORSTRING is true if this is to be the spelling after translation
3504    phase 1 (with the original spelling of extended identifiers), false
3505    if extended identifiers should always be written using UCNs (there is
3506    no option for always writing them in the internal UTF-8 form).
3507    FIXME: Would be nice if we didn't need the PFILE argument.  */
3508 unsigned char *
3509 cpp_spell_token (cpp_reader *pfile, const cpp_token *token,
3510                  unsigned char *buffer, bool forstring)
3511 {
3512   switch (TOKEN_SPELL (token))
3513     {
3514     case SPELL_OPERATOR:
3515       {
3516         const unsigned char *spelling;
3517         unsigned char c;
3518
3519         if (token->flags & DIGRAPH)
3520           spelling = cpp_digraph2name (token->type);
3521         else if (token->flags & NAMED_OP)
3522           goto spell_ident;
3523         else
3524           spelling = TOKEN_NAME (token);
3525
3526         while ((c = *spelling++) != '\0')
3527           *buffer++ = c;
3528       }
3529       break;
3530
3531     spell_ident:
3532     case SPELL_IDENT:
3533       if (forstring)
3534         {
3535           memcpy (buffer, NODE_NAME (token->val.node.spelling),
3536                   NODE_LEN (token->val.node.spelling));
3537           buffer += NODE_LEN (token->val.node.spelling);
3538         }
3539       else
3540         buffer = _cpp_spell_ident_ucns (buffer, token->val.node.node);
3541       break;
3542
3543     case SPELL_LITERAL:
3544       memcpy (buffer, token->val.str.text, token->val.str.len);
3545       buffer += token->val.str.len;
3546       break;
3547
3548     case SPELL_NONE:
3549       cpp_error (pfile, CPP_DL_ICE,
3550                  "unspellable token %s", TOKEN_NAME (token));
3551       break;
3552     }
3553
3554   return buffer;
3555 }
3556
3557 /* Returns TOKEN spelt as a null-terminated string.  The string is
3558    freed when the reader is destroyed.  Useful for diagnostics.  */
3559 unsigned char *
3560 cpp_token_as_text (cpp_reader *pfile, const cpp_token *token)
3561 {
3562   unsigned int len = cpp_token_len (token) + 1;
3563   unsigned char *start = _cpp_unaligned_alloc (pfile, len), *end;
3564
3565   end = cpp_spell_token (pfile, token, start, false);
3566   end[0] = '\0';
3567
3568   return start;
3569 }
3570
3571 /* Returns a pointer to a string which spells the token defined by
3572    TYPE and FLAGS.  Used by C front ends, which really should move to
3573    using cpp_token_as_text.  */
3574 const char *
3575 cpp_type2name (enum cpp_ttype type, unsigned char flags)
3576 {
3577   if (flags & DIGRAPH)
3578     return (const char *) cpp_digraph2name (type);
3579   else if (flags & NAMED_OP)
3580     return cpp_named_operator2name (type);
3581
3582   return (const char *) token_spellings[type].name;
3583 }
3584
3585 /* Writes the spelling of token to FP, without any preceding space.
3586    Separated from cpp_spell_token for efficiency - to avoid stdio
3587    double-buffering.  */
3588 void
3589 cpp_output_token (const cpp_token *token, FILE *fp)
3590 {
3591   switch (TOKEN_SPELL (token))
3592     {
3593     case SPELL_OPERATOR:
3594       {
3595         const unsigned char *spelling;
3596         int c;
3597
3598         if (token->flags & DIGRAPH)
3599           spelling = cpp_digraph2name (token->type);
3600         else if (token->flags & NAMED_OP)
3601           goto spell_ident;
3602         else
3603           spelling = TOKEN_NAME (token);
3604
3605         c = *spelling;
3606         do
3607           putc (c, fp);
3608         while ((c = *++spelling) != '\0');
3609       }
3610       break;
3611
3612     spell_ident:
3613     case SPELL_IDENT:
3614       {
3615         size_t i;
3616         const unsigned char * name = NODE_NAME (token->val.node.node);
3617
3618         for (i = 0; i < NODE_LEN (token->val.node.node); i++)
3619           if (name[i] & ~0x7F)
3620             {
3621               unsigned char buffer[10];
3622               i += utf8_to_ucn (buffer, name + i) - 1;
3623               fwrite (buffer, 1, 10, fp);
3624             }
3625           else
3626             fputc (NODE_NAME (token->val.node.node)[i], fp);
3627       }
3628       break;
3629
3630     case SPELL_LITERAL:
3631       if (token->type == CPP_HEADER_NAME)
3632         fputc ('"', fp);
3633       fwrite (token->val.str.text, 1, token->val.str.len, fp);
3634       if (token->type == CPP_HEADER_NAME)
3635         fputc ('"', fp);
3636       break;
3637
3638     case SPELL_NONE:
3639       /* An error, most probably.  */
3640       break;
3641     }
3642 }
3643
3644 /* Compare two tokens.  */
3645 int
3646 _cpp_equiv_tokens (const cpp_token *a, const cpp_token *b)
3647 {
3648   if (a->type == b->type && a->flags == b->flags)
3649     switch (TOKEN_SPELL (a))
3650       {
3651       default:                  /* Keep compiler happy.  */
3652       case SPELL_OPERATOR:
3653         /* token_no is used to track where multiple consecutive ##
3654            tokens were originally located.  */
3655         return (a->type != CPP_PASTE || a->val.token_no == b->val.token_no);
3656       case SPELL_NONE:
3657         return (a->type != CPP_MACRO_ARG
3658                 || (a->val.macro_arg.arg_no == b->val.macro_arg.arg_no
3659                     && a->val.macro_arg.spelling == b->val.macro_arg.spelling));
3660       case SPELL_IDENT:
3661         return (a->val.node.node == b->val.node.node
3662                 && a->val.node.spelling == b->val.node.spelling);
3663       case SPELL_LITERAL:
3664         return (a->val.str.len == b->val.str.len
3665                 && !memcmp (a->val.str.text, b->val.str.text,
3666                             a->val.str.len));
3667       }
3668
3669   return 0;
3670 }
3671
3672 /* Returns nonzero if a space should be inserted to avoid an
3673    accidental token paste for output.  For simplicity, it is
3674    conservative, and occasionally advises a space where one is not
3675    needed, e.g. "." and ".2".  */
3676 int
3677 cpp_avoid_paste (cpp_reader *pfile, const cpp_token *token1,
3678                  const cpp_token *token2)
3679 {
3680   enum cpp_ttype a = token1->type, b = token2->type;
3681   cppchar_t c;
3682
3683   if (token1->flags & NAMED_OP)
3684     a = CPP_NAME;
3685   if (token2->flags & NAMED_OP)
3686     b = CPP_NAME;
3687
3688   c = EOF;
3689   if (token2->flags & DIGRAPH)
3690     c = digraph_spellings[(int) b - (int) CPP_FIRST_DIGRAPH][0];
3691   else if (token_spellings[b].category == SPELL_OPERATOR)
3692     c = token_spellings[b].name[0];
3693
3694   /* Quickly get everything that can paste with an '='.  */
3695   if ((int) a <= (int) CPP_LAST_EQ && c == '=')
3696     return 1;
3697
3698   switch (a)
3699     {
3700     case CPP_GREATER:   return c == '>';
3701     case CPP_LESS:      return c == '<' || c == '%' || c == ':';
3702     case CPP_PLUS:      return c == '+';
3703     case CPP_MINUS:     return c == '-' || c == '>';
3704     case CPP_DIV:       return c == '/' || c == '*'; /* Comments.  */
3705     case CPP_MOD:       return c == ':' || c == '>';
3706     case CPP_AND:       return c == '&';
3707     case CPP_OR:        return c == '|';
3708     case CPP_COLON:     return c == ':' || c == '>';
3709     case CPP_DEREF:     return c == '*';
3710     case CPP_DOT:       return c == '.' || c == '%' || b == CPP_NUMBER;
3711     case CPP_HASH:      return c == '#' || c == '%'; /* Digraph form.  */
3712     case CPP_NAME:      return ((b == CPP_NUMBER
3713                                  && name_p (pfile, &token2->val.str))
3714                                 || b == CPP_NAME
3715                                 || b == CPP_CHAR || b == CPP_STRING); /* L */
3716     case CPP_NUMBER:    return (b == CPP_NUMBER || b == CPP_NAME
3717                                 || c == '.' || c == '+' || c == '-');
3718                                       /* UCNs */
3719     case CPP_OTHER:     return ((token1->val.str.text[0] == '\\'
3720                                  && b == CPP_NAME)
3721                                 || (CPP_OPTION (pfile, objc)
3722                                     && token1->val.str.text[0] == '@'
3723                                     && (b == CPP_NAME || b == CPP_STRING)));
3724     case CPP_LESS_EQ:   return c == '>';
3725     case CPP_STRING:
3726     case CPP_WSTRING:
3727     case CPP_UTF8STRING:
3728     case CPP_STRING16:
3729     case CPP_STRING32:  return (CPP_OPTION (pfile, user_literals)
3730                                 && (b == CPP_NAME
3731                                     || (TOKEN_SPELL (token2) == SPELL_LITERAL
3732                                         && ISIDST (token2->val.str.text[0]))));
3733
3734     default:            break;
3735     }
3736
3737   return 0;
3738 }
3739
3740 /* Output all the remaining tokens on the current line, and a newline
3741    character, to FP.  Leading whitespace is removed.  If there are
3742    macros, special token padding is not performed.  */
3743 void
3744 cpp_output_line (cpp_reader *pfile, FILE *fp)
3745 {
3746   const cpp_token *token;
3747
3748   token = cpp_get_token (pfile);
3749   while (token->type != CPP_EOF)
3750     {
3751       cpp_output_token (token, fp);
3752       token = cpp_get_token (pfile);
3753       if (token->flags & PREV_WHITE)
3754         putc (' ', fp);
3755     }
3756
3757   putc ('\n', fp);
3758 }
3759
3760 /* Return a string representation of all the remaining tokens on the
3761    current line.  The result is allocated using xmalloc and must be
3762    freed by the caller.  */
3763 unsigned char *
3764 cpp_output_line_to_string (cpp_reader *pfile, const unsigned char *dir_name)
3765 {
3766   const cpp_token *token;
3767   unsigned int out = dir_name ? ustrlen (dir_name) : 0;
3768   unsigned int alloced = 120 + out;
3769   unsigned char *result = (unsigned char *) xmalloc (alloced);
3770
3771   /* If DIR_NAME is empty, there are no initial contents.  */
3772   if (dir_name)
3773     {
3774       sprintf ((char *) result, "#%s ", dir_name);
3775       out += 2;
3776     }
3777
3778   token = cpp_get_token (pfile);
3779   while (token->type != CPP_EOF)
3780     {
3781       unsigned char *last;
3782       /* Include room for a possible space and the terminating nul.  */
3783       unsigned int len = cpp_token_len (token) + 2;
3784
3785       if (out + len > alloced)
3786         {
3787           alloced *= 2;
3788           if (out + len > alloced)
3789             alloced = out + len;
3790           result = (unsigned char *) xrealloc (result, alloced);
3791         }
3792
3793       last = cpp_spell_token (pfile, token, &result[out], 0);
3794       out = last - result;
3795
3796       token = cpp_get_token (pfile);
3797       if (token->flags & PREV_WHITE)
3798         result[out++] = ' ';
3799     }
3800
3801   result[out] = '\0';
3802   return result;
3803 }
3804
/* Memory buffers.  Changing these three constants can have a dramatic
   effect on performance.  The values here are reasonable defaults,
   but might be tuned.  If you adjust them, be sure to test across a
   range of uses of cpplib, including heavy nested function-like macro
   expansion.  Also check the change in peak memory usage (NJAMD is a
   good tool for this).

   MIN_BUFF_SIZE is the smallest buffer ever allocated.
   BUFF_SIZE_UPPER_BOUND (MIN_SIZE) is the largest free buffer that
   will be reused for a request of MIN_SIZE bytes.
   EXTENDED_BUFF_SIZE (BUFF, MIN_EXTRA) is the size requested when
   extending BUFF: MIN_EXTRA plus twice the distance from BUFF's cur
   pointer to its limit.  Every macro argument is parenthesized so
   that an argument expression cannot change the grouping of the
   expansion.  */
#define MIN_BUFF_SIZE 8000
#define BUFF_SIZE_UPPER_BOUND(MIN_SIZE) (MIN_BUFF_SIZE + (MIN_SIZE) * 3 / 2)
#define EXTENDED_BUFF_SIZE(BUFF, MIN_EXTRA) \
        ((MIN_EXTRA) + ((BUFF)->limit - (BUFF)->cur) * 2)

#if MIN_BUFF_SIZE > BUFF_SIZE_UPPER_BOUND (0)
  #error BUFF_SIZE_UPPER_BOUND must be at least as large as MIN_BUFF_SIZE!
#endif
3819
3820 /* Create a new allocation buffer.  Place the control block at the end
3821    of the buffer, so that buffer overflows will cause immediate chaos.  */
3822 static _cpp_buff *
3823 new_buff (size_t len)
3824 {
3825   _cpp_buff *result;
3826   unsigned char *base;
3827
3828   if (len < MIN_BUFF_SIZE)
3829     len = MIN_BUFF_SIZE;
3830   len = CPP_ALIGN (len);
3831
3832 #ifdef ENABLE_VALGRIND_ANNOTATIONS
3833   /* Valgrind warns about uses of interior pointers, so put _cpp_buff
3834      struct first.  */
3835   size_t slen = CPP_ALIGN2 (sizeof (_cpp_buff), 2 * DEFAULT_ALIGNMENT);
3836   base = XNEWVEC (unsigned char, len + slen);
3837   result = (_cpp_buff *) base;
3838   base += slen;
3839 #else
3840   base = XNEWVEC (unsigned char, len + sizeof (_cpp_buff));
3841   result = (_cpp_buff *) (base + len);
3842 #endif
3843   result->base = base;
3844   result->cur = base;
3845   result->limit = base + len;
3846   result->next = NULL;
3847   return result;
3848 }
3849
3850 /* Place a chain of unwanted allocation buffers on the free list.  */
3851 void
3852 _cpp_release_buff (cpp_reader *pfile, _cpp_buff *buff)
3853 {
3854   _cpp_buff *end = buff;
3855
3856   while (end->next)
3857     end = end->next;
3858   end->next = pfile->free_buffs;
3859   pfile->free_buffs = buff;
3860 }
3861
3862 /* Return a free buffer of size at least MIN_SIZE.  */
3863 _cpp_buff *
3864 _cpp_get_buff (cpp_reader *pfile, size_t min_size)
3865 {
3866   _cpp_buff *result, **p;
3867
3868   for (p = &pfile->free_buffs;; p = &(*p)->next)
3869     {
3870       size_t size;
3871
3872       if (*p == NULL)
3873         return new_buff (min_size);
3874       result = *p;
3875       size = result->limit - result->base;
3876       /* Return a buffer that's big enough, but don't waste one that's
3877          way too big.  */
3878       if (size >= min_size && size <= BUFF_SIZE_UPPER_BOUND (min_size))
3879         break;
3880     }
3881
3882   *p = result->next;
3883   result->next = NULL;
3884   result->cur = result->base;
3885   return result;
3886 }
3887
3888 /* Creates a new buffer with enough space to hold the uncommitted
3889    remaining bytes of BUFF, and at least MIN_EXTRA more bytes.  Copies
3890    the excess bytes to the new buffer.  Chains the new buffer after
3891    BUFF, and returns the new buffer.  */
3892 _cpp_buff *
3893 _cpp_append_extend_buff (cpp_reader *pfile, _cpp_buff *buff, size_t min_extra)
3894 {
3895   size_t size = EXTENDED_BUFF_SIZE (buff, min_extra);
3896   _cpp_buff *new_buff = _cpp_get_buff (pfile, size);
3897
3898   buff->next = new_buff;
3899   memcpy (new_buff->base, buff->cur, BUFF_ROOM (buff));
3900   return new_buff;
3901 }
3902
3903 /* Creates a new buffer with enough space to hold the uncommitted
3904    remaining bytes of the buffer pointed to by BUFF, and at least
3905    MIN_EXTRA more bytes.  Copies the excess bytes to the new buffer.
3906    Chains the new buffer before the buffer pointed to by BUFF, and
3907    updates the pointer to point to the new buffer.  */
3908 void
3909 _cpp_extend_buff (cpp_reader *pfile, _cpp_buff **pbuff, size_t min_extra)
3910 {
3911   _cpp_buff *new_buff, *old_buff = *pbuff;
3912   size_t size = EXTENDED_BUFF_SIZE (old_buff, min_extra);
3913
3914   new_buff = _cpp_get_buff (pfile, size);
3915   memcpy (new_buff->base, old_buff->cur, BUFF_ROOM (old_buff));
3916   new_buff->next = old_buff;
3917   *pbuff = new_buff;
3918 }
3919
3920 /* Free a chain of buffers starting at BUFF.  */
3921 void
3922 _cpp_free_buff (_cpp_buff *buff)
3923 {
3924   _cpp_buff *next;
3925
3926   for (; buff; buff = next)
3927     {
3928       next = buff->next;
3929 #ifdef ENABLE_VALGRIND_ANNOTATIONS
3930       free (buff);
3931 #else
3932       free (buff->base);
3933 #endif
3934     }
3935 }
3936
3937 /* Allocate permanent, unaligned storage of length LEN.  */
3938 unsigned char *
3939 _cpp_unaligned_alloc (cpp_reader *pfile, size_t len)
3940 {
3941   _cpp_buff *buff = pfile->u_buff;
3942   unsigned char *result = buff->cur;
3943
3944   if (len > (size_t) (buff->limit - result))
3945     {
3946       buff = _cpp_get_buff (pfile, len);
3947       buff->next = pfile->u_buff;
3948       pfile->u_buff = buff;
3949       result = buff->cur;
3950     }
3951
3952   buff->cur = result + len;
3953   return result;
3954 }
3955
3956 /* Allocate permanent, unaligned storage of length LEN from a_buff.
3957    That buffer is used for growing allocations when saving macro
3958    replacement lists in a #define, and when parsing an answer to an
3959    assertion in #assert, #unassert or #if (and therefore possibly
3960    whilst expanding macros).  It therefore must not be used by any
3961    code that they might call: specifically the lexer and the guts of
3962    the macro expander.
3963
3964    All existing other uses clearly fit this restriction: storing
3965    registered pragmas during initialization.  */
3966 unsigned char *
3967 _cpp_aligned_alloc (cpp_reader *pfile, size_t len)
3968 {
3969   _cpp_buff *buff = pfile->a_buff;
3970   unsigned char *result = buff->cur;
3971
3972   if (len > (size_t) (buff->limit - result))
3973     {
3974       buff = _cpp_get_buff (pfile, len);
3975       buff->next = pfile->a_buff;
3976       pfile->a_buff = buff;
3977       result = buff->cur;
3978     }
3979
3980   buff->cur = result + len;
3981   return result;
3982 }
3983
3984 /* Commit or allocate storage from a buffer.  */
3985
3986 void *
3987 _cpp_commit_buff (cpp_reader *pfile, size_t size)
3988 {
3989   void *ptr = BUFF_FRONT (pfile->a_buff);
3990
3991   if (pfile->hash_table->alloc_subobject)
3992     {
3993       void *copy = pfile->hash_table->alloc_subobject (size);
3994       memcpy (copy, ptr, size);
3995       ptr = copy;
3996     }
3997   else
3998     BUFF_FRONT (pfile->a_buff) += size;
3999
4000   return ptr;
4001 }
4002
4003 /* Say which field of TOK is in use.  */
4004
4005 enum cpp_token_fld_kind
4006 cpp_token_val_index (const cpp_token *tok)
4007 {
4008   switch (TOKEN_SPELL (tok))
4009     {
4010     case SPELL_IDENT:
4011       return CPP_TOKEN_FLD_NODE;
4012     case SPELL_LITERAL:
4013       return CPP_TOKEN_FLD_STR;
4014     case SPELL_OPERATOR:
4015       /* Operands which were originally spelled as ident keep around
4016          the node for the exact spelling.  */
4017       if (tok->flags & NAMED_OP)
4018         return CPP_TOKEN_FLD_NODE;
4019       else if (tok->type == CPP_PASTE)
4020         return CPP_TOKEN_FLD_TOKEN_NO;
4021       else
4022         return CPP_TOKEN_FLD_NONE;
4023     case SPELL_NONE:
4024       if (tok->type == CPP_MACRO_ARG)
4025         return CPP_TOKEN_FLD_ARG_NO;
4026       else if (tok->type == CPP_PADDING)
4027         return CPP_TOKEN_FLD_SOURCE;
4028       else if (tok->type == CPP_PRAGMA)
4029         return CPP_TOKEN_FLD_PRAGMA;
4030       /* fall through */
4031     default:
4032       return CPP_TOKEN_FLD_NONE;
4033     }
4034 }
4035
4036 /* All tokens lexed in R after calling this function will be forced to
4037    have their location_t to be P, until
4038    cpp_stop_forcing_token_locations is called for R.  */
4039
4040 void
4041 cpp_force_token_locations (cpp_reader *r, location_t loc)
4042 {
4043   r->forced_token_location = loc;
4044 }
4045
4046 /* Go back to assigning locations naturally for lexed tokens.  */
4047
4048 void
4049 cpp_stop_forcing_token_locations (cpp_reader *r)
4050 {
4051   r->forced_token_location = 0;
4052 }
4053
/* PEEK points at a backslash.  If that backslash escapes an end of
   line (\n, \r or \r\n), return the position just past the line
   ending, repeating for as long as that position again holds a
   backslash that escapes an end of line.  A position at or beyond
   LIMIT is never moved to.  If the backslash does not escape an end
   of line, PEEK is returned unchanged.  */

static const unsigned char *
do_peek_backslash (const unsigned char *peek, const unsigned char *limit)
{
  for (;;)
    {
      const unsigned char *after;

      if (__builtin_expect (peek[1] == '\n', true))
        after = peek + 2;
      else if (__builtin_expect (peek[1] == '\r', false))
        /* A \r may be followed by \n, forming one line ending.  */
        after = peek + (peek[2] == '\n' ? 3 : 2);
      else
        return peek;

      if (!__builtin_expect (after < limit, true))
        return peek;

      peek = after;

      /* The user might be perverse and escape several line endings
         in a row.  */
      if (*peek != '\\')
        return peek;
    }
}
4083
/* Return the position to examine for the character at PEEK: PEEK
   itself, unless it holds a backslash, in which case any escaped end
   of line is looked past as do_peek_backslash does.  */

static const unsigned char *
do_peek_next (const unsigned char *peek, const unsigned char *limit)
{
  return (__builtin_expect (*peek == '\\', false)
          ? do_peek_backslash (peek, limit)
          : peek);
}
4091
/* Step back one character from PEEK without going below BOUND.
   Returns NULL if PEEK is already at BOUND.  Otherwise returns a
   pointer to the character before PEEK, except that when that
   character is a line ending (\n, \r or \r\n) directly preceded by a
   backslash, the escaped line ending is skipped and the search
   continues from the backslash.  */

static const unsigned char *
do_peek_prev (const unsigned char *peek, const unsigned char *bound)
{
  if (peek == bound)
    return NULL;

  unsigned char c = *--peek;

  /* Only a line ending can be part of a backslash-newline pair.  The
     second test must be against carriage return ('\r'), not the
     letter 'r'.  */
  if (__builtin_expect (c == '\n', false)
      || __builtin_expect (c == '\r', false))
    {
      /* Nothing precedes the line ending, so it cannot be escaped.  */
      if (peek == bound)
        return peek;

      /* IX is the offset from PEEK of the character in front of the
         line ending; it moves back one more for a \r\n pair.  */
      int ix = -1;
      if (c == '\n' && peek[ix] == '\r')
        {
          if (peek + ix == bound)
            return peek;
          ix--;
        }

      /* An escaped line ending is not a character in its own right;
         carry on from the backslash.  */
      if (peek[ix] == '\\')
        return do_peek_prev (peek + ix, bound);

      return peek;
    }

  return peek;
}
4120
/* If PEEK[-1] starts the identifier MATCH, scan past the rest of it
   and any trailing white space, and return the resulting position.
   Otherwise return NULL.  The first character of MATCH is not
   compared here; matching starts with MATCH[1] against *PEEK.
   Escaped line endings inside the identifier and in the white space
   are looked past.  */

static const unsigned char *
do_peek_ident (const char *match, const unsigned char *peek,
               const unsigned char *limit)
{
  const char *want;

  for (want = match + 1; *want != '\0'; want++)
    {
      if (*peek != *want)
        {
          /* Retry after looking past a possible escaped line
             ending.  */
          peek = do_peek_next (peek, limit);
          if (*peek != *want)
            return NULL;
        }
      peek++;
    }

  /* Must now not be looking at an identifier char.  */
  peek = do_peek_next (peek, limit);
  if (ISIDNUM (*peek))
    return NULL;

  /* Skip control-line whitespace.  */
  for (;;)
    {
      while (*peek == ' ' || *peek == '\t')
        peek++;

      if (!__builtin_expect (*peek == '\\', false))
        break;

      /* Keep skipping only if the backslash escaped a line ending
         and we moved on to something else.  */
      peek = do_peek_backslash (peek, limit);
      if (*peek == '\\')
        break;
    }

  return peek;
}
4154
4155 /* Are we looking at a module control line starting as PEEK - 1?  */
4156
4157 static bool
4158 do_peek_module (cpp_reader *pfile, unsigned char c,
4159                 const unsigned char *peek, const unsigned char *limit)
4160 {
4161   bool import = false;
4162
4163   if (__builtin_expect (c == 'e', false))
4164     {
4165       if (!((peek[0] == 'x' || peek[0] == '\\')
4166             && (peek = do_peek_ident ("export", peek, limit))))
4167         return false;
4168
4169       /* export, peek for import or module.  No need to peek __import
4170          here.  */
4171       if (peek[0] == 'i')
4172         {
4173           if (!((peek[1] == 'm' || peek[1] == '\\')
4174                 && (peek = do_peek_ident ("import", peek + 1, limit))))
4175             return false;
4176           import = true;
4177         }
4178       else if (peek[0] == 'm')
4179         {
4180           if (!((peek[1] == 'o' || peek[1] == '\\')
4181                 && (peek = do_peek_ident ("module", peek + 1, limit))))
4182             return false;
4183         }
4184       else
4185         return false;
4186     }
4187   else if (__builtin_expect (c == 'i', false))
4188     {
4189       if (!((peek[0] == 'm' || peek[0] == '\\')
4190             && (peek = do_peek_ident ("import", peek, limit))))
4191         return false;
4192       import = true;
4193     }
4194   else if (__builtin_expect (c == '_', false))
4195     {
4196       /* Needed for translated includes.   */
4197       if (!((peek[0] == '_' || peek[0] == '\\')
4198             && (peek = do_peek_ident ("__import", peek, limit))))
4199         return false;
4200       import = true;
4201     }
4202   else if (__builtin_expect (c == 'm', false))
4203     {
4204       if (!((peek[0] == 'o' || peek[0] == '\\')
4205             && (peek = do_peek_ident ("module", peek, limit))))
4206         return false;
4207     }
4208   else
4209     return false;
4210
4211   /* Peek the next character to see if it's good enough.  We'll be at
4212      the first non-whitespace char, including skipping an escaped
4213      newline.  */
4214   /* ... import followed by identifier, ':', '<' or header-name
4215      preprocessing tokens, or module followed by identifier, ':' or
4216      ';' preprocessing tokens.  */
4217   unsigned char p = *peek++;
4218
4219   /* A character literal is ... single quotes, ... optionally preceded
4220      by u8, u, U, or L */
4221   /* A string-literal is a ... double quotes, optionally prefixed by
4222      R, u8, u8R, u, uR, U, UR, L, or LR */
4223   if (p == 'u')
4224     {
4225       peek = do_peek_next (peek, limit);
4226       if (*peek == '8')
4227         {
4228           peek++;
4229           goto peek_u8;
4230         }
4231       goto peek_u;
4232     }
4233   else if (p == 'U' || p == 'L')
4234     {
4235     peek_u8:
4236       peek = do_peek_next (peek, limit);
4237     peek_u:
4238       if (*peek == '\"' || *peek == '\'')
4239         return false;
4240
4241       if (*peek == 'R')
4242         goto peek_R;
4243       /* Identifier. Ok.  */
4244     }
4245   else if (p == 'R')
4246     {
4247     peek_R:
4248       if (CPP_OPTION (pfile, rliterals))
4249         {
4250           peek = do_peek_next (peek, limit);
4251           if (*peek == '\"')
4252             return false;
4253         }
4254       /* Identifier. Ok.  */
4255     }
4256   else if ('Z' - 'A' == 25
4257            ? ((p >= 'A' && p <= 'Z') || (p >= 'a' && p <= 'z') || p == '_')
4258            : ISIDST (p))
4259     {
4260       /* Identifier.  Ok. */
4261     }
4262   else if (p == '<')
4263     {
4264       /* Maybe angle header, ok for import.  Reject
4265          '<=', '<<' digraph:'<:'.  */
4266       if (!import)
4267         return false;
4268       peek = do_peek_next (peek, limit);
4269       if (*peek == '=' || *peek == '<'
4270           || (*peek == ':' && CPP_OPTION (pfile, digraphs)))
4271         return false;
4272     }
4273   else if (p == ';')
4274     {
4275       /* SEMICOLON, ok for module.  */
4276       if (import)
4277         return false;
4278     }
4279   else if (p == '"')
4280     {
4281       /* STRING, ok for import.  */
4282       if (!import)
4283         return false;
4284     }
4285   else if (p == ':')
4286     {
4287       /* Maybe COLON, ok.  Reject '::', digraph:':>'.  */
4288       peek = do_peek_next (peek, limit);
4289       if (*peek == ':' || (*peek == '>' && CPP_OPTION (pfile, digraphs)))
4290         return false;
4291     }
4292   else
4293     /* FIXME: Detect a unicode character, excluding those not
4294        permitted as the initial character. [lex.name]/1.  I presume
4295        we need to check the \[uU] spellings, and directly using
4296        Unicode in say UTF8 form?  Or perhaps we do the phase-1
4297        conversion of UTF8 to universal-character-names?  */
4298     return false;
4299
4300   return true;
4301 }
4302
4303 /* Directives-only scanning.  Somewhat more relaxed than correct
4304    parsing -- some ill-formed programs will not be rejected.  */
4305
4306 void
4307 cpp_directive_only_process (cpp_reader *pfile,
4308                             void *data,
4309                             void (*cb) (cpp_reader *, CPP_DO_task, void *, ...))
4310 {
4311   bool module_p = CPP_OPTION (pfile, module_directives);
4312
4313   do
4314     {
4315     restart:
4316       /* Buffer initialization, but no line cleaning. */
4317       cpp_buffer *buffer = pfile->buffer;
4318       buffer->cur_note = buffer->notes_used = 0;
4319       buffer->cur = buffer->line_base = buffer->next_line;
4320       buffer->need_line = false;
4321       /* Files always end in a newline or carriage return.  We rely on this for
4322          character peeking safety.  */
4323       gcc_assert (buffer->rlimit[0] == '\n' || buffer->rlimit[0] == '\r');
4324
4325       const unsigned char *base = buffer->cur;
4326       unsigned line_count = 0;
4327       const unsigned char *line_start = base;
4328
4329       bool bol = true;
4330       bool raw = false;
4331
4332       const unsigned char *lwm = base;
4333       for (const unsigned char *pos = base, *limit = buffer->rlimit;
4334            pos < limit;)
4335         {
4336           unsigned char c = *pos++;
4337           /* This matches the switch in _cpp_lex_direct.  */
4338           switch (c)
4339             {
4340             case ' ': case '\t': case '\f': case '\v':
4341               /* Whitespace, do nothing.  */
4342               break;
4343
4344             case '\r': /* MAC line ending, or Windows \r\n  */
4345               if (*pos == '\n')
4346                 pos++;
4347               /* FALLTHROUGH */
4348
4349             case '\n':
4350               bol = true;
4351
4352             next_line:
4353               CPP_INCREMENT_LINE (pfile, 0);
4354               line_count++;
4355               line_start = pos;
4356               break;
4357
4358             case '\\':
4359               /* <backslash><newline> is removed, and doesn't undo any
4360                  preceeding escape or whatnot.  */
4361               if (*pos == '\n')
4362                 {
4363                   pos++;
4364                   goto next_line;
4365                 }
4366               else if (*pos == '\r')
4367                 {
4368                   if (pos[1] == '\n')
4369                     pos++;
4370                   pos++;
4371                   goto next_line;
4372                 }
4373               goto dflt;
4374
4375             case '#':
4376               if (bol)
4377                 {
4378                   /* Line directive.  */
4379                   if (pos - 1 > base && !pfile->state.skipping)
4380                     cb (pfile, CPP_DO_print, data,
4381                         line_count, base, pos - 1 - base);
4382
4383                   /* Prep things for directive handling. */
4384                   buffer->next_line = pos;
4385                   buffer->need_line = true;
4386                   bool ok = _cpp_get_fresh_line (pfile);
4387                   gcc_checking_assert (ok);
4388
4389                   /* Ensure proper column numbering for generated
4390                      error messages. */
4391                   buffer->line_base -= pos - line_start;
4392
4393                   _cpp_handle_directive (pfile, line_start + 1 != pos);
4394
4395                   /* Sanitize the line settings.  Duplicate #include's can
4396                      mess things up. */
4397                   // FIXME: Necessary?
4398                   pfile->line_table->highest_location
4399                     = pfile->line_table->highest_line;
4400
4401                   if (!pfile->state.skipping
4402                       && pfile->buffer->next_line < pfile->buffer->rlimit)
4403                     cb (pfile, CPP_DO_location, data,
4404                         pfile->line_table->highest_line);
4405
4406                   goto restart;
4407                 }
4408               goto dflt;
4409
4410             case '/':
4411               {
4412                 const unsigned char *peek = do_peek_next (pos, limit);
4413                 if (!(*peek == '/' || *peek == '*'))
4414                   goto dflt;
4415
4416                 /* Line or block comment  */
4417                 bool is_block = *peek == '*';
4418                 bool star = false;
4419                 bool esc = false;
4420                 location_t sloc
4421                   = linemap_position_for_column (pfile->line_table,
4422                                                  pos - line_start);
4423
4424                 while (pos < limit)
4425                   {
4426                     char c = *pos++;
4427                     switch (c)
4428                       {
4429                       case '\\':
4430                         esc = true;
4431                         break;
4432
4433                       case '\r':
4434                         if (*pos == '\n')
4435                           pos++;
4436                         /* FALLTHROUGH  */
4437
4438                       case '\n':
4439                         {
4440                           CPP_INCREMENT_LINE (pfile, 0);
4441                           line_count++;
4442                           line_start = pos;
4443                           if (!esc && !is_block)
4444                             {
4445                               bol = true;
4446                               goto done_comment;
4447                             }
4448                         }
4449                         if (!esc)
4450                           star = false;
4451                         esc = false;
4452                         break;
4453
4454                       case '*':
4455                         if (pos > peek && !esc)
4456                           star = is_block;
4457                         esc = false;
4458                         break;
4459
4460                       case '/':
4461                         if (star)
4462                           goto done_comment;
4463                         /* FALLTHROUGH  */
4464
4465                       default:
4466                         star = false;
4467                         esc = false;
4468                         break;
4469                       }
4470                   }
4471                 cpp_error_with_line (pfile, CPP_DL_ERROR, sloc, 0,
4472                                      "unterminated comment");
4473               done_comment:
4474                 lwm = pos;
4475                 break;
4476               }
4477
4478             case '\'':
4479               if (!CPP_OPTION (pfile, digit_separators))
4480                 goto delimited_string;
4481
4482               /* Possibly a number punctuator.  */
4483               if (!ISIDNUM (*do_peek_next (pos, limit)))
4484                 goto delimited_string;
4485
4486               goto quote_peek;
4487
4488             case '\"':
4489               if (!CPP_OPTION (pfile, rliterals))
4490                 goto delimited_string;
4491
4492             quote_peek:
4493               {
4494                 /* For ' see if it's a number punctuator
4495                    \.?<digit>(<digit>|<identifier-nondigit>
4496                    |'<digit>|'<nondigit>|[eEpP]<sign>|\.)* */
4497                 /* For " see if it's a raw string
4498                    {U,L,u,u8}R.  This includes CPP_NUMBER detection,
4499                    because that could be 0e+R.  */
4500                 const unsigned char *peek = pos - 1;
4501                 bool quote_first = c == '"';
4502                 bool quote_eight = false;
4503                 bool maybe_number_start = false;
4504                 bool want_number = false;
4505
4506                 while ((peek = do_peek_prev (peek, lwm)))
4507                   {
4508                     unsigned char p = *peek;
4509                     if (quote_first)
4510                       {
4511                         if (!raw)
4512                           {
4513                             if (p != 'R')
4514                               break;
4515                             raw = true;
4516                             continue;
4517                           }
4518
4519                         quote_first = false;
4520                         if (p == 'L' || p == 'U' || p == 'u')
4521                           ;
4522                         else if (p == '8')
4523                           quote_eight = true;
4524                         else
4525                           goto second_raw;
4526                       }
4527                     else if (quote_eight)
4528                       {
4529                         if (p != 'u')
4530                           {
4531                             raw = false;
4532                             break;
4533                           }
4534                         quote_eight = false;
4535                       }
4536                     else if (c == '"')
4537                       {
4538                       second_raw:;
4539                         if (!want_number && ISIDNUM (p))
4540                           {
4541                             raw = false;
4542                             break;
4543                           }
4544                       }
4545
4546                     if (ISDIGIT (p))
4547                       maybe_number_start = true;
4548                     else if (p == '.')
4549                       want_number = true;
4550                     else if (ISIDNUM (p))
4551                       maybe_number_start = false;
4552                     else if (p == '+' || p == '-')
4553                       {
4554                         if (const unsigned char *peek_prev
4555                             = do_peek_prev (peek, lwm))
4556                           {
4557                             p = *peek_prev;
4558                             if (p == 'e' || p == 'E'
4559                                 || p == 'p' || p == 'P')
4560                               {
4561                                 want_number = true;
4562                                 maybe_number_start = false;
4563                               }
4564                             else
4565                               break;
4566                           }
4567                         else
4568                           break;
4569                       }
4570                     else if (p == '\'' || p == '\"')
4571                       {
4572                         /* If this is lwm, this must be the end of a
4573                            previous string.  So this is a trailing
4574                            literal type, (a) if those are allowed,
4575                              and (b) maybe_number_start is false.  Otherwise
4576                              this must be a CPP_NUMBER because we've
4577                              met another ', and we'd have checked that
4578                              in its own right.  */
4579                         if (peek == lwm && CPP_OPTION (pfile, uliterals))
4580                           {
4581                             if  (!maybe_number_start && !want_number)
4582                               /* Must be a literal type.  */
4583                               raw = false;
4584                           }
4585                         else if (p == '\''
4586                                  && CPP_OPTION (pfile, digit_separators))
4587                           maybe_number_start = true;
4588                         break;
4589                       }
4590                     else if (c == '\'')
4591                       break;
4592                     else if (!quote_first && !quote_eight)
4593                       break;
4594                   }
4595
4596                 if (maybe_number_start)
4597                   {
4598                     if (c == '\'')
4599                       /* A CPP NUMBER.  */
4600                       goto dflt;
4601                     raw = false;
4602                   }
4603
4604                 goto delimited_string;
4605               }
4606
4607             delimited_string:
4608               {
4609                 /* (Possibly raw) string or char literal.  */
4610                 unsigned char end = c;
4611                 int delim_len = -1;
4612                 const unsigned char *delim = NULL;
4613                 location_t sloc = linemap_position_for_column (pfile->line_table,
4614                                                                pos - line_start);
4615                 int esc = 0;
4616
4617                 if (raw)
4618                   {
4619                     /* There can be no line breaks in the delimiter.  */
4620                     delim = pos;
4621                     for (delim_len = 0; (c = *pos++) != '('; delim_len++)
4622                       {
4623                         if (delim_len == 16)
4624                           {
4625                             cpp_error_with_line (pfile, CPP_DL_ERROR,
4626                                                  sloc, 0,
4627                                                  "raw string delimiter"
4628                                                  " longer than %d"
4629                                                  " characters",
4630                                                  delim_len);
4631                             raw = false;
4632                             pos = delim;
4633                             break;
4634                           }
4635                         if (strchr (") \\\t\v\f\n", c))
4636                           {
4637                             cpp_error_with_line (pfile, CPP_DL_ERROR,
4638                                                  sloc, 0,
4639                                                  "invalid character '%c'"
4640                                                  " in raw string"
4641                                                  " delimiter", c);
4642                             raw = false;
4643                             pos = delim;
4644                             break;
4645                           }
4646                         if (pos >= limit)
4647                           goto bad_string;
4648                       }
4649                   }
4650
4651                 while (pos < limit)
4652                   {
4653                     char c = *pos++;
4654                     switch (c)
4655                       {
4656                       case '\\':
4657                         if (!raw)
4658                           esc++;
4659                         break;
4660
4661                       case '\r':
4662                         if (*pos == '\n')
4663                           pos++;
4664                         /* FALLTHROUGH  */
4665
4666                       case '\n':
4667                         {
4668                           CPP_INCREMENT_LINE (pfile, 0);
4669                           line_count++;
4670                           line_start = pos;
4671                         }
4672                         if (esc)
4673                           esc--;
4674                         break;
4675
4676                       case ')':
4677                         if (raw
4678                             && pos + delim_len + 1 < limit
4679                             && pos[delim_len] == end
4680                             && !memcmp (delim, pos, delim_len))
4681                           {
4682                             pos += delim_len + 1;
4683                             raw = false;
4684                             goto done_string;
4685                           }
4686                         break;
4687
4688                       default:
4689                         if (!raw && !(esc & 1) && c == end)
4690                           goto done_string;
4691                         esc = 0;
4692                         break;
4693                       }
4694                   }
4695               bad_string:
4696                 cpp_error_with_line (pfile, CPP_DL_ERROR, sloc, 0,
4697                                      "unterminated literal");
4698
4699               done_string:
4700                 raw = false;
4701                 lwm = pos - 1;
4702               }
4703               goto dflt;
4704
4705             case '_':
4706             case 'e':
4707             case 'i':
4708             case 'm':
4709               if (bol && module_p && !pfile->state.skipping
4710                   && do_peek_module (pfile, c, pos, limit))
4711                 {
4712                   /* We've seen the start of a module control line.
4713                      Start up the tokenizer.  */
4714                   pos--; /* Backup over the first character.  */
4715
4716                   /* Backup over whitespace to start of line.  */
4717                   while (pos > line_start
4718                          && (pos[-1] == ' ' || pos[-1] == '\t'))
4719                     pos--;
4720
4721                   if (pos > base)
4722                     cb (pfile, CPP_DO_print, data, line_count, base, pos - base);
4723
4724                   /* Prep things for directive handling. */
4725                   buffer->next_line = pos;
4726                   buffer->need_line = true;
4727
4728                   /* Now get tokens until the PRAGMA_EOL.  */
4729                   do
4730                     {
4731                       location_t spelling;
4732                       const cpp_token *tok
4733                         = cpp_get_token_with_location (pfile, &spelling);
4734
4735                       gcc_assert (pfile->state.in_deferred_pragma
4736                                   || tok->type == CPP_PRAGMA_EOL);
4737                       cb (pfile, CPP_DO_token, data, tok, spelling);
4738                     }
4739                   while (pfile->state.in_deferred_pragma);
4740
4741                   if (pfile->buffer->next_line < pfile->buffer->rlimit)
4742                     cb (pfile, CPP_DO_location, data,
4743                         pfile->line_table->highest_line);
4744
4745                   pfile->mi_valid = false;
4746                   goto restart;
4747                 }
4748               goto dflt;
4749
4750             default:
4751             dflt:
4752               bol = false;
4753               pfile->mi_valid = false;
4754               break;
4755             }
4756         }
4757
4758       if (buffer->rlimit > base && !pfile->state.skipping)
4759         cb (pfile, CPP_DO_print, data, line_count, base, buffer->rlimit - base);
4760
4761       _cpp_pop_buffer (pfile);
4762     }
4763   while (pfile->buffer);
4764 }