libcpp/lex.c

   1 /* CPP Library - lexical analysis.
   2    Copyright (C) 2000-2020 Free Software Foundation, Inc.
   3    Contributed by Per Bothner, 1994-95.
   4    Based on CCCP program by Paul Rubin, June 1986
   5    Adapted to ANSI C, Richard Stallman, Jan 1987
   6    Broken out to separate file, Zack Weinberg, Mar 2000
   7
   8 This program is free software; you can redistribute it and/or modify it
   9 under the terms of the GNU General Public License as published by the
  10 Free Software Foundation; either version 3, or (at your option) any
  11 later version.
  12
  13 This program is distributed in the hope that it will be useful,
  14 but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16 GNU General Public License for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with this program; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "cpplib.h"
  25 #include "internal.h"
  26
/* Spelling category stored for every token type in token_spellings.
   Only the way the categories are stored is visible at this point; the
   code that acts on each category is further down in the file.  */
enum spell_type
{
  SPELL_OPERATOR = 0,   /* Given to every OP() entry of TTYPE_TABLE.  */
  SPELL_IDENT,          /* TK() entries name one of these three ...  */
  SPELL_LITERAL,        /* ... categories through their second ...  */
  SPELL_NONE            /* ... argument (pasted onto "SPELL_").  */
};
  34
/* One row of the token_spellings table: the spelling category of a
   token type together with a string associated with it.  */
struct token_spelling
{
  enum spell_type category;     /* Read back through TOKEN_SPELL.  */
  const unsigned char *name;    /* Read back through TOKEN_NAME.  */
};
  40
/* Spellings of the six digraph tokens.  NOTE(review): the code that
   indexes this table is outside this excerpt; presumably the order has
   to match the index it computes -- confirm before reordering.  */
static const unsigned char *const digraph_spellings[] =
{ UC"%:", UC"%:%:", UC"<:", UC":>", UC"<%", UC"%>" };
  43
/* Expand TTYPE_TABLE into the token_spellings array.  An OP (e, s)
   entry becomes { SPELL_OPERATOR, s }; a TK (e, s) entry becomes
   { SPELL_<s>, "<e>" }, i.e. the category named by S and the enumerator
   name E turned into a string.  The array has N_TTYPES rows and is
   indexed by token type (see TOKEN_SPELL and TOKEN_NAME).  OP and TK
   are only needed for this expansion and are undefined again right
   after it.  */
#define OP(e, s) { SPELL_OPERATOR, UC s  },
#define TK(e, s) { SPELL_ ## s,    UC #e },
static const struct token_spelling token_spellings[N_TTYPES] = { TTYPE_TABLE };
#undef OP
#undef TK
  49
/* Accessors for the token_spellings row that belongs to TOKEN.  TOKEN
   is a pointer to a token; its type member selects the row.
   TOKEN_SPELL yields the spelling category, TOKEN_NAME the string.  */
#define TOKEN_SPELL(token) (token_spellings[(token)->type].category)
#define TOKEN_NAME(token) (token_spellings[(token)->type].name)
  52
/* Forward declarations of helpers that are private to this file, so
   that they can be used before their definitions.  */
static void add_line_note (cpp_buffer *, const uchar *, unsigned int);
static int skip_line_comment (cpp_reader *);
static void skip_whitespace (cpp_reader *, cppchar_t);
static void lex_string (cpp_reader *, cpp_token *, const uchar *);
static void save_comment (cpp_reader *, cpp_token *, const uchar *, cppchar_t);
static void store_comment (cpp_reader *, cpp_token *);
static void create_literal (cpp_reader *, cpp_token *, const uchar *,
                            unsigned int, enum cpp_ttype);
static bool warn_in_comment (cpp_reader *, _cpp_line_note *);
static int name_p (cpp_reader *, const cpp_string *);
static tokenrun *next_tokenrun (tokenrun *);

static _cpp_buff *new_buff (size_t);
  66
  67
  68 /* Utility routine:
  69
  70    Compares, the token TOKEN to the NUL-terminated string STRING.
  71    TOKEN must be a CPP_NAME.  Returns 1 for equal, 0 for unequal.  */
  72 int
  73 cpp_ideq (const cpp_token *token, const char *string)
  74 {
  75   if (token->type != CPP_NAME)
  76     return 0;
  77
  78   return !ustrcmp (NODE_NAME (token->val.node.node), (const uchar *) string);
  79 }
  80
  81 /* Record a note TYPE at byte POS into the current cleaned logical
  82    line.  */
  83 static void
  84 add_line_note (cpp_buffer *buffer, const uchar *pos, unsigned int type)
  85 {
  86   if (buffer->notes_used == buffer->notes_cap)
  87     {
  88       buffer->notes_cap = buffer->notes_cap * 2 + 200;
  89       buffer->notes = XRESIZEVEC (_cpp_line_note, buffer->notes,
  90                                   buffer->notes_cap);
  91     }
  92
  93   buffer->notes[buffer->notes_used].pos = pos;
  94   buffer->notes[buffer->notes_used].type = type;
  95   buffer->notes_used++;
  96 }
  97
  98 \f
  99 /* Fast path to find line special characters using optimized character
 100    scanning algorithms.  Anything complicated falls back to the slow
 101    path below.  Since this loop is very hot it's worth doing these kinds
 102    of optimizations.
 103
 104    One of the paths through the ifdefs should provide
 105
 106      const uchar *search_line_fast (const uchar *s, const uchar *end);
 107
 108    Between S and END, search for \n, \r, \\, ?.  Return a pointer to
 109    the found character.
 110
 111    Note that the last character of the buffer is *always* a newline,
 112    as forced by _cpp_convert_input.  This fact can be used to avoid
 113    explicitly looking for the end of the buffer.  */
 114
/* Configure gives us an ifdef test, so the macro may be left undefined.
   Default it to 0 so that it can also be used as a value, both in #if
   expressions and in ordinary C conditions, as the helpers below do.  */
#ifndef WORDS_BIGENDIAN
#define WORDS_BIGENDIAN 0
#endif
 119
/* We'd like the largest integer that fits into a register.  There's nothing
   in <stdint.h> that gives us that.  For most hosts this is unsigned long,
   but MS decided on an LLP64 model.  Thankfully when building with GCC we
   can get the "real" word size.  */
#ifdef __GNUC__
/* The mode attribute overrides the width of the declared type, making
   word_type an unsigned integer of the machine's word mode.  */
typedef unsigned int word_type __attribute__((__mode__(__word__)));
#else
typedef unsigned long word_type;
#endif

/* The code below is only expecting sizes 4 or 8.
   Die at compile-time if this expectation is violated.  The array
   bound evaluates to 1 when the size is acceptable and to -1, an
   invalid array size, when it is not.  */
typedef char check_word_type_size
  [(sizeof(word_type) == 8 || sizeof(word_type) == 4) * 2 - 1];
 134
 135 /* Return X with the first N bytes forced to values that won't match one
 136    of the interesting characters.  Note that NUL is not interesting.  */
 137
 138 static inline word_type
 139 acc_char_mask_misalign (word_type val, unsigned int n)
 140 {
 141   word_type mask = -1;
 142   if (WORDS_BIGENDIAN)
 143     mask >>= n * 8;
 144   else
 145     mask <<= n * 8;
 146   return val & mask;
 147 }
 148
 149 /* Return X replicated to all byte positions within WORD_TYPE.  */
 150
 151 static inline word_type
 152 acc_char_replicate (uchar x)
 153 {
 154   word_type ret;
 155
 156   ret = (x << 24) | (x << 16) | (x << 8) | x;
 157   if (sizeof(word_type) == 8)
 158     ret = (ret << 16 << 16) | ret;
 159   return ret;
 160 }
 161
 162 /* Return non-zero if some byte of VAL is (probably) C.  */
 163
 164 static inline word_type
 165 acc_char_cmp (word_type val, word_type c)
 166 {
 167 #if defined(__GNUC__) && defined(__alpha__)
 168   /* We can get exact results using a compare-bytes instruction.
 169      Get (val == c) via (0 >= (val ^ c)).  */
 170   return __builtin_alpha_cmpbge (0, val ^ c);
 171 #else
 172   word_type magic = 0x7efefefeU;
 173   if (sizeof(word_type) == 8)
 174     magic = (magic << 16 << 16) | 0xfefefefeU;
 175   magic |= 1;
 176
 177   val ^= c;
 178   return ((val + magic) ^ ~val) & ~magic;
 179 #endif
 180 }
 181
 182 /* Given the result of acc_char_cmp is non-zero, return the index of
 183    the found character.  If this was a false positive, return -1.  */
 184
 185 static inline int
 186 acc_char_index (word_type cmp ATTRIBUTE_UNUSED,
 187                 word_type val ATTRIBUTE_UNUSED)
 188 {
 189 #if defined(__GNUC__) && defined(__alpha__) && !WORDS_BIGENDIAN
 190   /* The cmpbge instruction sets *bits* of the result corresponding to
 191      matches in the bytes with no false positives.  */
 192   return __builtin_ctzl (cmp);
 193 #else
 194   unsigned int i;
 195
 196   /* ??? It would be nice to force unrolling here,
 197      and have all of these constants folded.  */
 198   for (i = 0; i < sizeof(word_type); ++i)
 199     {
 200       uchar c;
 201       if (WORDS_BIGENDIAN)
 202         c = (val >> (sizeof(word_type) - i - 1) * 8) & 0xff;
 203       else
 204         c = (val >> i * 8) & 0xff;
 205
 206       if (c == '\n' || c == '\r' || c == '\\' || c == '?')
 207         return i;
 208     }
 209
 210   return -1;
 211 #endif
 212 }
 213
 214 /* A version of the fast scanner using bit fiddling techniques.
 215
 216    For 32-bit words, one would normally perform 16 comparisons and
 217    16 branches.  With this algorithm one performs 24 arithmetic
 218    operations and one branch.  Whether this is faster with a 32-bit
 219    word size is going to be somewhat system dependent.
 220
 221    For 64-bit words, we eliminate twice the number of comparisons
 222    and branches without increasing the number of arithmetic operations.
 223    It's almost certainly going to be a win with 64-bit word size.  */
 224
 225 static const uchar * search_line_acc_char (const uchar *, const uchar *)
 226   ATTRIBUTE_UNUSED;
 227
 228 static const uchar *
 229 search_line_acc_char (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 230 {
 231   const word_type repl_nl = acc_char_replicate ('\n');
 232   const word_type repl_cr = acc_char_replicate ('\r');
 233   const word_type repl_bs = acc_char_replicate ('\\');
 234   const word_type repl_qm = acc_char_replicate ('?');
 235
 236   unsigned int misalign;
 237   const word_type *p;
 238   word_type val, t;
 239
 240   /* Align the buffer.  Mask out any bytes from before the beginning.  */
 241   p = (word_type *)((uintptr_t)s & -sizeof(word_type));
 242   val = *p;
 243   misalign = (uintptr_t)s & (sizeof(word_type) - 1);
 244   if (misalign)
 245     val = acc_char_mask_misalign (val, misalign);
 246
 247   /* Main loop.  */
 248   while (1)
 249     {
 250       t  = acc_char_cmp (val, repl_nl);
 251       t |= acc_char_cmp (val, repl_cr);
 252       t |= acc_char_cmp (val, repl_bs);
 253       t |= acc_char_cmp (val, repl_qm);
 254
 255       if (__builtin_expect (t != 0, 0))
 256         {
 257           int i = acc_char_index (t, val);
 258           if (i >= 0)
 259             return (const uchar *)p + i;
 260         }
 261
 262       val = *++p;
 263     }
 264 }
 265
 266 /* Disable on Solaris 2/x86 until the following problem can be properly
 267    autoconfed:
 268
 269    The Solaris 10+ assembler tags objects with the instruction set
 270    extensions used, so SSE4.2 executables cannot run on machines that
 271    don't support that extension.  */
 272
 273 #if (GCC_VERSION >= 4005) && (__GNUC__ >= 5 || !defined(__PIC__)) && (defined(__i386__) || defined(__x86_64__)) && !(defined(__sun__) && defined(__svr4__))
 274
/* Replicated character data to be shared between implementations.
   Recall that outside of a context with vector support we can't
   define compatible vector types, therefore these are all defined
   in terms of raw characters.

   Row 0 is '\n', row 1 is '\r', row 2 is '\\' and row 3 is '?'.  The
   scanners below index the rows by these numbers and reinterpret the
   first 8 or all 16 bytes of a row as a vector; the array is 16-byte
   aligned for that purpose.  */
static const char repl_chars[4][16] __attribute__((aligned(16))) = {
  { '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
    '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n' },
  { '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
    '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r' },
  { '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
    '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\' },
  { '?', '?', '?', '?', '?', '?', '?', '?',
    '?', '?', '?', '?', '?', '?', '?', '?' },
};
 289
/* A version of the fast scanner using MMX vectorized byte compare insns.

   Returns a pointer to the first \n, \r, \\ or ? at or after S.  END is
   not consulted; the loop only terminates on a match.

   This uses the PMOVMSKB instruction which was introduced with "MMX2",
   which was packaged into SSE1; it is also present in the AMD MMX
   extension.  Mark the function as using "sse" so that we emit a real
   "emms" instruction, rather than the 3dNOW "femms" instruction.  */

static const uchar *
#ifndef __SSE__
__attribute__((__target__("sse")))
#endif
search_line_mmx (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
{
  typedef char v8qi __attribute__ ((__vector_size__ (8)));
  typedef int __m64 __attribute__ ((__vector_size__ (8), __may_alias__));

  /* The first 8 bytes of each repl_chars row, viewed as a vector.  */
  const v8qi repl_nl = *(const v8qi *)repl_chars[0];
  const v8qi repl_cr = *(const v8qi *)repl_chars[1];
  const v8qi repl_bs = *(const v8qi *)repl_chars[2];
  const v8qi repl_qm = *(const v8qi *)repl_chars[3];

  unsigned int misalign, found, mask;
  const v8qi *p;
  v8qi data, t, c;

  /* Align the source pointer.  While MMX doesn't generate unaligned data
     faults, this allows us to safely scan to the end of the buffer without
     reading beyond the end of the last page.  */
  misalign = (uintptr_t)s & 7;
  p = (const v8qi *)((uintptr_t)s & -8);
  data = *p;

  /* Create a mask for the bytes that are valid within the first
     8-byte block.  The Idea here is that the AND with the mask
     within the loop is "free", since we need some AND or TEST
     insn in order to set the flags for the branch anyway.  */
  mask = -1u << misalign;

  /* Main loop processing 8 bytes at a time.  The first block has
     already been loaded, so jump past the load at the top of the loop.  */
  goto start;
  do
    {
      data = *++p;
      /* Every byte of a later block is valid.  */
      mask = -1;

    start:
      /* OR together the byte-wise equality results for the four
         characters, then collect one bit per byte into FOUND.  */
      t = __builtin_ia32_pcmpeqb(data, repl_nl);
      c = __builtin_ia32_pcmpeqb(data, repl_cr);
      t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
      c = __builtin_ia32_pcmpeqb(data, repl_bs);
      t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
      c = __builtin_ia32_pcmpeqb(data, repl_qm);
      t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
      found = __builtin_ia32_pmovmskb (t);
      found &= mask;
    }
  while (!found);

  /* Leave MMX state before returning to ordinary code.  */
  __builtin_ia32_emms ();

  /* FOUND contains 1 in bits for which we matched a relevant
     character.  Conversion to the byte index is trivial.  */
  found = __builtin_ctz(found);
  return (const uchar *)p + found;
}
 355
/* A version of the fast scanner using SSE2 vectorized byte compare insns.

   Returns a pointer to the first \n, \r, \\ or ? at or after S.  END is
   not consulted; the loop only terminates on a match.  */

static const uchar *
#ifndef __SSE2__
__attribute__((__target__("sse2")))
#endif
search_line_sse2 (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
{
  typedef char v16qi __attribute__ ((__vector_size__ (16)));

  /* Each 16-byte row of repl_chars, viewed as a vector.  */
  const v16qi repl_nl = *(const v16qi *)repl_chars[0];
  const v16qi repl_cr = *(const v16qi *)repl_chars[1];
  const v16qi repl_bs = *(const v16qi *)repl_chars[2];
  const v16qi repl_qm = *(const v16qi *)repl_chars[3];

  unsigned int misalign, found, mask;
  const v16qi *p;
  v16qi data, t;

  /* Align the source pointer.  */
  misalign = (uintptr_t)s & 15;
  p = (const v16qi *)((uintptr_t)s & -16);
  data = *p;

  /* Create a mask for the bytes that are valid within the first
     16-byte block.  The Idea here is that the AND with the mask
     within the loop is "free", since we need some AND or TEST
     insn in order to set the flags for the branch anyway.  */
  mask = -1u << misalign;

  /* Main loop processing 16 bytes at a time.  The first block has
     already been loaded, so jump past the load at the top of the loop.  */
  goto start;
  do
    {
      data = *++p;
      /* Every byte of a later block is valid.  */
      mask = -1;

    start:
      /* OR together the byte-wise equality results for the four
         characters, then collect one bit per byte into FOUND.  */
      t  = __builtin_ia32_pcmpeqb128(data, repl_nl);
      t |= __builtin_ia32_pcmpeqb128(data, repl_cr);
      t |= __builtin_ia32_pcmpeqb128(data, repl_bs);
      t |= __builtin_ia32_pcmpeqb128(data, repl_qm);
      found = __builtin_ia32_pmovmskb128 (t);
      found &= mask;
    }
  while (!found);

  /* FOUND contains 1 in bits for which we matched a relevant
     character.  Conversion to the byte index is trivial.  */
  found = __builtin_ctz(found);
  return (const uchar *)p + found;
}
 408
#ifdef HAVE_SSE4
/* A version of the fast scanner using SSE 4.2 vectorized string insns.

   Returns a pointer to the first \n, \r, \\ or ? at or after S.  END is
   only used to decide whether an unaligned 16-byte read at S is safe.  */

static const uchar *
#ifndef __SSE4_2__
__attribute__((__target__("sse4.2")))
#endif
search_line_sse42 (const uchar *s, const uchar *end)
{
  typedef char v16qi __attribute__ ((__vector_size__ (16)));
  /* The set of characters to look for; only the first four elements
     are used, as the length operand given to PCMPESTRI is 4.  */
  static const v16qi search = { '\n', '\r', '?', '\\' };

  uintptr_t si = (uintptr_t)s;
  uintptr_t index;

  /* Check for unaligned input.  */
  if (si & 15)
    {
      v16qi sv;

      if (__builtin_expect (end - s < 16, 0)
          && __builtin_expect ((si & 0xfff) > 0xff0, 0))
        {
          /* There are less than 16 bytes left in the buffer, and less
             than 16 bytes left on the page.  Reading 16 bytes at this
             point might generate a spurious page fault.  Defer to the
             SSE2 implementation, which already handles alignment.  */
          return search_line_sse2 (s, end);
        }

      /* ??? The builtin doesn't understand that the PCMPESTRI read from
         memory need not be aligned.  */
      sv = __builtin_ia32_loaddqu ((const char *) s);
      index = __builtin_ia32_pcmpestri128 (search, 4, sv, 16, 0);

      /* An index below 16 is the position of a match within SV.  */
      if (__builtin_expect (index < 16, 0))
        goto found;

      /* Advance the pointer to an aligned address.  We will re-scan a
         few bytes, but we no longer need care for reading past the
         end of a page, since we're guaranteed a match.  */
      s = (const uchar *)((si + 15) & -16);
    }

  /* Main loop, processing 16 bytes at a time.  */
#ifdef __GCC_ASM_FLAG_OUTPUTS__
  while (1)
    {
      char f;

      /* By using inline assembly instead of the builtin,
         we can use the result, as well as the flags set.  F receives
         the carry flag; the loop ends once it is set.  */
      __asm ("%vpcmpestri\t$0, %2, %3"
             : "=c"(index), "=@ccc"(f)
             : "m"(*s), "x"(search), "a"(4), "d"(16));
      if (f)
        break;

      s += 16;
    }
#else
  /* The assembly loop advances S by 16 before each compare, so step
     back one block first.  */
  s -= 16;
  /* By doing the whole loop in inline assembly,
     we can make proper use of the flags set.  The loop repeats for as
     long as the carry flag stays clear.  */
  __asm (      ".balign 16\n"
        "0:     add $16, %1\n"
        "       %vpcmpestri\t$0, (%1), %2\n"
        "       jnc 0b"
        : "=&c"(index), "+r"(s)
        : "x"(search), "a"(4), "d"(16));
#endif

 found:
  return s + index;
}

#else
/* Work around out-dated assemblers without sse4 support.  Uses of
   search_line_sse42 then resolve to the SSE2 scanner.  */
#define search_line_sse42 search_line_sse2
#endif
 489
 490 /* Check the CPU capabilities.  */
 491
 492 #include "../gcc/config/i386/cpuid.h"
 493
/* Signature shared by all line scanners, and the pointer through which
   the scanner selected by init_vectorized_lexer is called.  */
typedef const uchar * (*search_line_fast_type) (const uchar *, const uchar *);
static search_line_fast_type search_line_fast;
 496
 497 #define HAVE_init_vectorized_lexer 1
 498 static inline void
 499 init_vectorized_lexer (void)
 500 {
 501   unsigned dummy, ecx = 0, edx = 0;
 502   search_line_fast_type impl = search_line_acc_char;
 503   int minimum = 0;
 504
 505 #if defined(__SSE4_2__)
 506   minimum = 3;
 507 #elif defined(__SSE2__)
 508   minimum = 2;
 509 #elif defined(__SSE__)
 510   minimum = 1;
 511 #endif
 512
 513   if (minimum == 3)
 514     impl = search_line_sse42;
 515   else if (__get_cpuid (1, &dummy, &dummy, &ecx, &edx) || minimum == 2)
 516     {
 517       if (minimum == 3 || (ecx & bit_SSE4_2))
 518         impl = search_line_sse42;
 519       else if (minimum == 2 || (edx & bit_SSE2))
 520         impl = search_line_sse2;
 521       else if (minimum == 1 || (edx & bit_SSE))
 522         impl = search_line_mmx;
 523     }
 524   else if (__get_cpuid (0x80000001, &dummy, &dummy, &dummy, &edx))
 525     {
 526       if (minimum == 1
 527           || (edx & (bit_MMXEXT | bit_CMOV)) == (bit_MMXEXT | bit_CMOV))
 528         impl = search_line_mmx;
 529     }
 530
 531   search_line_fast = impl;
 532 }
 533
 534 #elif (GCC_VERSION >= 4005) && defined(_ARCH_PWR8) && defined(__ALTIVEC__)
 535
/* A version of the fast scanner using AltiVec vectorized byte compares
   and VSX unaligned loads (when VSX is available).  This is otherwise
   the same as the AltiVec version.

   Returns a pointer to the first \n, \r, \\ or ? at or after S.  END is
   not consulted; the loop only terminates on a match.  */

ATTRIBUTE_NO_SANITIZE_UNDEFINED
static const uchar *
search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
{
  typedef __attribute__((altivec(vector))) unsigned char vc;

  const vc repl_nl = {
    '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
    '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
  };
  const vc repl_cr = {
    '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
    '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
  };
  const vc repl_bs = {
    '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
    '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
  };
  const vc repl_qm = {
    '?', '?', '?', '?', '?', '?', '?', '?',
    '?', '?', '?', '?', '?', '?', '?', '?',
  };
  const vc zero = { 0 };

  vc data, t;

  /* Main loop processing 16 bytes at a time.  The load starts at S
     itself; no alignment step and no masking of leading bytes is done
     in this variant.  */
  do
    {
      vc m_nl, m_cr, m_bs, m_qm;

      data = __builtin_vec_vsx_ld (0, s);
      s += 16;

      m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
      m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
      m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
      m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
      t = (m_nl | m_cr) | (m_bs | m_qm);

      /* T now contains 0xff in bytes for which we matched one of the relevant
         characters.  We want to exit the loop if any byte in T is non-zero.
         Below is the expansion of vec_any_ne(t, zero).  */
    }
  while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));

  /* Restore s to point to the 16 bytes we just processed.  */
  s -= 16;

  {
    /* Number of unsigned longs that make up one vector.  */
#define N  (sizeof(vc) / sizeof(long))

    union {
      vc v;
      /* Statically assert that N is 2 or 4.  */
      unsigned long l[(N == 2 || N == 4) ? N : -1];
    } u;
    unsigned long l, i = 0;

    u.v = t;

    /* Find the first word of T that is non-zero.  S is advanced past
       every all-zero word that gets skipped.  */
    switch (N)
      {
      case 4:
        l = u.l[i++];
        if (l != 0)
          break;
        s += sizeof(unsigned long);
        l = u.l[i++];
        if (l != 0)
          break;
        s += sizeof(unsigned long);
        /* FALLTHRU */
      case 2:
        l = u.l[i++];
        if (l != 0)
          break;
        s += sizeof(unsigned long);
        l = u.l[i];
      }

    /* L now contains 0xff in bytes for which we matched one of the
       relevant characters.  We can find the byte index by finding
       its bit index and dividing by 8.  The first byte in memory is
       the most significant one on big-endian and the least significant
       one on little-endian, hence the two ways of counting.  */
#ifdef __BIG_ENDIAN__
    l = __builtin_clzl(l) >> 3;
#else
    l = __builtin_ctzl(l) >> 3;
#endif
    return s + l;

#undef N
  }
}
 635
 636 #elif (GCC_VERSION >= 4005) && defined(__ALTIVEC__) && defined (__BIG_ENDIAN__)
 637
/* A version of the fast scanner using AltiVec vectorized byte compares.
   This cannot be used for little endian because vec_lvsl/lvsr are
   deprecated for little endian and the code won't work properly.

   Returns a pointer to the first \n, \r, \\ or ? at or after S.  END is
   not consulted; the loop only terminates on a match.  */
/* ??? Unfortunately, attribute(target("altivec")) is not yet supported,
   so we can't compile this function without -maltivec on the command line
   (or implied by some other switch).  */

static const uchar *
search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
{
  typedef __attribute__((altivec(vector))) unsigned char vc;

  const vc repl_nl = {
    '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
    '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
  };
  const vc repl_cr = {
    '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
    '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
  };
  const vc repl_bs = {
    '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
    '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
  };
  const vc repl_qm = {
    '?', '?', '?', '?', '?', '?', '?', '?',
    '?', '?', '?', '?', '?', '?', '?', '?',
  };
  const vc ones = {
    -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1,
  };
  const vc zero = { 0 };

  vc data, mask, t;

  /* Altivec loads automatically mask addresses with -16.  This lets us
     issue the first load as early as possible.  */
  data = __builtin_vec_ld(0, (const vc *)s);

  /* Discard bytes before the beginning of the buffer.  Do this by
     beginning with all ones and shifting in zeros according to the
     mis-alignment.  The LVSR instruction pulls the exact shift we
     want from the address.  */
  mask = __builtin_vec_lvsr(0, s);
  mask = __builtin_vec_perm(zero, ones, mask);
  data &= mask;

  /* While altivec loads mask addresses, we still need to align S so
     that the offset we compute at the end is correct.  */
  s = (const uchar *)((uintptr_t)s & -16);

  /* Main loop processing 16 bytes at a time.  The first block has
     already been loaded and masked, so jump past the load at the top
     of the loop.  */
  goto start;
  do
    {
      vc m_nl, m_cr, m_bs, m_qm;

      s += 16;
      data = __builtin_vec_ld(0, (const vc *)s);

    start:
      m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
      m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
      m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
      m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
      t = (m_nl | m_cr) | (m_bs | m_qm);

      /* T now contains 0xff in bytes for which we matched one of the relevant
         characters.  We want to exit the loop if any byte in T is non-zero.
         Below is the expansion of vec_any_ne(t, zero).  */
    }
  while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));

  {
    /* Number of unsigned longs that make up one vector.  */
#define N  (sizeof(vc) / sizeof(long))

    union {
      vc v;
      /* Statically assert that N is 2 or 4.  */
      unsigned long l[(N == 2 || N == 4) ? N : -1];
    } u;
    unsigned long l, i = 0;

    u.v = t;

    /* Find the first word of T that is non-zero.  S is advanced past
       every all-zero word that gets skipped.  */
    switch (N)
      {
      case 4:
        l = u.l[i++];
        if (l != 0)
          break;
        s += sizeof(unsigned long);
        l = u.l[i++];
        if (l != 0)
          break;
        s += sizeof(unsigned long);
        /* FALLTHROUGH */
      case 2:
        l = u.l[i++];
        if (l != 0)
          break;
        s += sizeof(unsigned long);
        l = u.l[i];
      }

    /* L now contains 0xff in bytes for which we matched one of the
       relevant characters.  We can find the byte index by finding
       its bit index and dividing by 8.  This variant is big-endian
       only, so the first byte in memory is the most significant one
       and leading zeros are counted.  */
    l = __builtin_clzl(l) >> 3;
    return s + l;

#undef N
  }
}
 754
 755 #elif defined (__ARM_NEON) && defined (__ARM_64BIT_STATE)
 756 #include "arm_neon.h"
 757
/* This doesn't have to be the exact page size, but no system may use
   a size smaller than this.  ARMv8 requires a minimum page size of
   4k.  The impact of being conservative here is a small number of
   cases will take the slightly slower entry path into the main
   loop.  */

#define AARCH64_MIN_PAGE_SIZE 4096

/* A version of the fast scanner using AArch64 Advanced SIMD byte
   compares.  Returns a pointer to the first \n, \r, \\ or ? at or
   after S.  END is not consulted; the loop only terminates on a
   match.  */
static const uchar *
search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
{
  const uint8x16_t repl_nl = vdupq_n_u8 ('\n');
  const uint8x16_t repl_cr = vdupq_n_u8 ('\r');
  const uint8x16_t repl_bs = vdupq_n_u8 ('\\');
  const uint8x16_t repl_qm = vdupq_n_u8 ('?');
  /* ANDed with a compare result to leave a distinct single bit for
     each byte within each 64-bit half of the vector.  */
  const uint8x16_t xmask = (uint8x16_t) vdupq_n_u64 (0x8040201008040201ULL);

  /* Per-lane shift counts that move the 16-bit lanes of one half up by
     eight bits; which half depends on the endianness.  */
#ifdef __ARM_BIG_ENDIAN
  const int16x8_t shift = {8, 8, 8, 8, 0, 0, 0, 0};
#else
  const int16x8_t shift = {0, 0, 0, 0, 8, 8, 8, 8};
#endif

  unsigned int found;
  const uint8_t *p;
  uint8x16_t data;
  uint8x16_t t;
  uint16x8_t m;
  uint8x16_t u, v, w;

  /* Align the source pointer.  */
  p = (const uint8_t *)((uintptr_t)s & -16);

  /* Assuming random string start positions, with a 4k page size we'll take
     the slow path about 0.37% of the time.  */
  if (__builtin_expect ((AARCH64_MIN_PAGE_SIZE
                         - (((uintptr_t) s) & (AARCH64_MIN_PAGE_SIZE - 1)))
                        < 16, 0))
    {
      /* Slow path: the string starts near a possible page boundary.
         Load the aligned block that contains S and ignore, through
         MASK, the matches that lie before S.  */
      uint32_t misalign, mask;

      misalign = (uintptr_t)s & 15;
      mask = (-1u << misalign) & 0xffff;
      data = vld1q_u8 (p);
      t = vceqq_u8 (data, repl_nl);
      u = vceqq_u8 (data, repl_cr);
      v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
      w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
      t = vorrq_u8 (v, w);
      /* Reduce the byte-wise result to one bit per byte in FOUND.  */
      t = vandq_u8 (t, xmask);
      m = vpaddlq_u8 (t);
      m = vshlq_u16 (m, shift);
      found = vaddvq_u16 (m);
      found &= mask;
      if (found)
        return (const uchar*)p + __builtin_ctz (found);
    }
  else
    {
      /* Fast path: load 16 bytes starting at S itself.  On a match,
         P is still the rounded-down address, which the code at DONE
         takes into account.  */
      data = vld1q_u8 ((const uint8_t *) s);
      t = vceqq_u8 (data, repl_nl);
      u = vceqq_u8 (data, repl_cr);
      v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
      w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
      t = vorrq_u8 (v, w);
      if (__builtin_expect (vpaddd_u64 ((uint64x2_t)t) != 0, 0))
        goto done;
    }

  /* Main loop over aligned 16-byte blocks, starting with the block
     that follows P.  */
  do
    {
      p += 16;
      data = vld1q_u8 (p);
      t = vceqq_u8 (data, repl_nl);
      u = vceqq_u8 (data, repl_cr);
      v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
      w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
      t = vorrq_u8 (v, w);
    } while (!vpaddd_u64 ((uint64x2_t)t));

done:
  /* Now that we've found the terminating substring, work out precisely where
     we need to stop.  */
  t = vandq_u8 (t, xmask);
  m = vpaddlq_u8 (t);
  m = vshlq_u16 (m, shift);
  found = vaddvq_u16 (m);
  /* If P is still below S, the match came from the unaligned load at
     S, so the bit index is relative to S rather than to P.  */
  return (((((uintptr_t) p) < (uintptr_t) s) ? s : (const uchar *)p)
          + __builtin_ctz (found));
}
 849
 850 #elif defined (__ARM_NEON)
 851 #include "arm_neon.h"
 852
/* A version of the fast scanner using NEON byte compares.  Returns a
   pointer to the first \n, \r, \\ or ? at or after S.  END is not
   consulted; the loop only terminates on a match.  */
static const uchar *
search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
{
  const uint8x16_t repl_nl = vdupq_n_u8 ('\n');
  const uint8x16_t repl_cr = vdupq_n_u8 ('\r');
  const uint8x16_t repl_bs = vdupq_n_u8 ('\\');
  const uint8x16_t repl_qm = vdupq_n_u8 ('?');
  /* ANDed with a compare result to leave a distinct single bit for
     each byte within each 64-bit half of the vector.  */
  const uint8x16_t xmask = (uint8x16_t) vdupq_n_u64 (0x8040201008040201ULL);

  unsigned int misalign, found, mask;
  const uint8_t *p;
  uint8x16_t data;

  /* Align the source pointer.  */
  misalign = (uintptr_t)s & 15;
  p = (const uint8_t *)((uintptr_t)s & -16);
  data = vld1q_u8 (p);

  /* Create a mask for the bytes that are valid within the first
     16-byte block.  The Idea here is that the AND with the mask
     within the loop is "free", since we need some AND or TEST
     insn in order to set the flags for the branch anyway.  */
  mask = (-1u << misalign) & 0xffff;

  /* Main loop, processing 16 bytes at a time.  The first block has
     already been loaded, so jump past the load at the top of the loop.  */
  goto start;

  do
    {
      uint8x8_t l;
      uint16x4_t m;
      uint32x2_t n;
      uint8x16_t t, u, v, w;

      p += 16;
      data = vld1q_u8 (p);
      /* Every byte of a later block is valid.  */
      mask = 0xffff;

    start:
      t = vceqq_u8 (data, repl_nl);
      u = vceqq_u8 (data, repl_cr);
      v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
      w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
      t = vandq_u8 (vorrq_u8 (v, w), xmask);
      /* Three rounds of pairwise addition fold the per-byte bits into
         two 32-bit lanes, one for each half of the vector.  */
      l = vpadd_u8 (vget_low_u8 (t), vget_high_u8 (t));
      m = vpaddl_u8 (l);
      n = vpaddl_u16 (m);

      /* Viewing N as one 64-bit value, OR it with itself shifted right
         by 24 and keep the low 32 bits; this puts the bits of the
         second lane directly above the eight bits of the first.  */
      found = vget_lane_u32 ((uint32x2_t) vorr_u64 ((uint64x1_t) n,
              vshr_n_u64 ((uint64x1_t) n, 24)), 0);
      found &= mask;
    }
  while (!found);

  /* FOUND contains 1 in bits for which we matched a relevant
     character.  Conversion to the byte index is trivial.  */
  found = __builtin_ctz (found);
  return (const uchar *)p + found;
}
 912
 913 #else
 914
/* None of the vectorized implementations above was selected, so we
   only have one accelerated alternative.  Make search_line_fast a
   plain alias for search_line_acc_char; use a direct call so that
   we encourage inlining.  */

#define search_line_fast  search_line_acc_char
 919
 920 #endif
 921
/* Initialize the lexer if needed.  The only work done here is a call
   to init_vectorized_lexer, and only when HAVE_init_vectorized_lexer
   is defined; otherwise the function does nothing.  */

void
_cpp_init_lexer (void)
{
#ifdef HAVE_init_vectorized_lexer
  init_vectorized_lexer ();
#endif
}
 931
/* Returns with a logical line that contains no escaped newlines or
   trigraphs.  This is a time-critical inner loop.

   The line is taken from PFILE->buffer starting at buffer->next_line.
   On return buffer->cur and buffer->line_base point at the start of
   the line, a '\n' has been stored at the end of the cleaned text,
   buffer->next_line is one past the last source character consumed,
   and the line-note counters have been reset and then extended through
   add_line_note with one note per trigraph or escaped newline that was
   seen, followed by a sentinel note.  */
void
_cpp_clean_line (cpp_reader *pfile)
{
  cpp_buffer *buffer;
  const uchar *s;
  uchar c, *d, *p;

  /* Start a fresh line: forget the previous line's notes and move the
     cursor and line base to where the previous line ended.  */
  buffer = pfile->buffer;
  buffer->cur_note = buffer->notes_used = 0;
  buffer->cur = buffer->line_base = buffer->next_line;
  buffer->need_line = false;
  s = buffer->next_line;

  if (!buffer->from_stage3)
    {
      /* Most recent backslash seen on the fast path, if any.  */
      const uchar *pbackslash = NULL;

      /* Fast path.  This is the common case of an un-escaped line with
         no trigraphs.  The primary win here is by not writing any
         data back to memory until we have to.  */
      while (1)
        {
          /* Perform an optimized search for \n, \r, \\, ?.  */
          s = search_line_fast (s, buffer->rlimit);

          c = *s;
          if (c == '\\')
            {
              /* Record the location of the backslash and continue.  */
              pbackslash = s++;
            }
          else if (__builtin_expect (c == '?', 0))
            {
              if (__builtin_expect (s[1] == '?', false)
                   && _cpp_trigraph_map[s[2]])
                {
                  /* Have a trigraph.  We may or may not have to convert
                     it.  Add a line note regardless, for -Wtrigraphs.  */
                  add_line_note (buffer, s, s[2]);
                  if (CPP_OPTION (pfile, trigraphs))
                    {
                      /* We do, and that means we have to switch to the
                         slow path.  The replacement character is
                         written over the first '?' and S is left on the
                         last character of the trigraph.  */
                      d = (uchar *) s;
                      *d = _cpp_trigraph_map[s[2]];
                      s += 2;
                      goto slow_path;
                    }
                }
              /* Not a trigraph.  Continue on fast-path.  */
              s++;
            }
          else
            break;
        }

      /* This must be \r or \n.  We're either done, or we'll be forced
         to write back to the buffer and continue on the slow path.  */
      d = (uchar *) s;

      if (__builtin_expect (s == buffer->rlimit, false))
        goto done;

      /* DOS line ending? */
      if (__builtin_expect (c == '\r', false) && s[1] == '\n')
        {
          s++;
          if (s == buffer->rlimit)
            goto done;
        }

      /* Without any backslash on the line there cannot be an escaped
         newline.  */
      if (__builtin_expect (pbackslash == NULL, true))
        goto done;

      /* Check for escaped newline: walk back over trailing horizontal
         whitespace and see whether it is preceded by the last
         backslash we recorded.  */
      p = d;
      while (is_nvspace (p[-1]))
        p--;
      if (p - 1 != pbackslash)
        goto done;

      /* Have an escaped newline; process it and proceed to
         the slow path.  D is backed up so that the next character
         copied lands on top of the backslash, and next_line now marks
         the lower bound for the backward scans done below.  */
      add_line_note (buffer, p - 1, p != d ? ' ' : '\\');
      d = p - 2;
      buffer->next_line = p - 1;

    slow_path:
      /* From here on S is the read position and D the write position;
         each character read is copied down to D.  */
      while (1)
        {
          c = *++s;
          *++d = c;

          if (c == '\n' || c == '\r')
            {
              /* Handle DOS line endings.  */
              if (c == '\r' && s != buffer->rlimit && s[1] == '\n')
                s++;
              if (s == buffer->rlimit)
                break;

              /* Escaped?  */
              p = d;
              while (p != buffer->next_line && is_nvspace (p[-1]))
                p--;
              if (p == buffer->next_line || p[-1] != '\\')
                break;

              /* Another escaped newline: note it and splice the
                 following text onto this line.  */
              add_line_note (buffer, p - 1, p != d ? ' ': '\\');
              d = p - 2;
              buffer->next_line = p - 1;
            }
          else if (c == '?' && s[1] == '?' && _cpp_trigraph_map[s[2]])
            {
              /* Add a note regardless, for the benefit of -Wtrigraphs.  */
              add_line_note (buffer, d, s[2]);
              if (CPP_OPTION (pfile, trigraphs))
                {
                  *d = _cpp_trigraph_map[s[2]];
                  s += 2;
                }
            }
        }
    }
  else
    {
      /* from_stage3 buffers: only locate the end of the line; no
         trigraph or backslash processing is done in this branch.  */
      while (*s != '\n' && *s != '\r')
        s++;
      d = (uchar *) s;

      /* Handle DOS line endings.  */
      if (*s == '\r' && s != buffer->rlimit && s[1] == '\n')
        s++;
    }

 done:
  /* Terminate the cleaned line where the write position ended up.  */
  *d = '\n';
  /* A sentinel note that should never be processed.  */
  add_line_note (buffer, d + 1, '\n');
  buffer->next_line = s + 1;
}
1075
1076 /* Return true if the trigraph indicated by NOTE should be warned
1077    about in a comment.  */
1078 static bool
1079 warn_in_comment (cpp_reader *pfile, _cpp_line_note *note)
1080 {
1081   const uchar *p;
1082
1083   /* Within comments we don't warn about trigraphs, unless the
1084      trigraph forms an escaped newline, as that may change
1085      behavior.  */
1086   if (note->type != '/')
1087     return false;
1088
1089   /* If -trigraphs, then this was an escaped newline iff the next note
1090      is coincident.  */
1091   if (CPP_OPTION (pfile, trigraphs))
1092     return note[1].pos == note->pos;
1093
1094   /* Otherwise, see if this forms an escaped newline.  */
1095   p = note->pos + 3;
1096   while (is_nvspace (*p))
1097     p++;
1098
1099   /* There might have been escaped newlines between the trigraph and the
1100      newline we found.  Hence the position test.  */
1101   return (*p == '\n' && p < note[1].pos);
1102 }
1103
1104 /* Process the notes created by add_line_note as far as the current
1105    location.  */
1106 void
1107 _cpp_process_line_notes (cpp_reader *pfile, int in_comment)
1108 {
1109   cpp_buffer *buffer = pfile->buffer;
1110
1111   for (;;)
1112     {
1113       _cpp_line_note *note = &buffer->notes[buffer->cur_note];
1114       unsigned int col;
1115
1116       if (note->pos > buffer->cur)
1117         break;
1118
1119       buffer->cur_note++;
1120       col = CPP_BUF_COLUMN (buffer, note->pos + 1);
1121
1122       if (note->type == '\\' || note->type == ' ')
1123         {
1124           if (note->type == ' ' && !in_comment)
1125             cpp_error_with_line (pfile, CPP_DL_WARNING, pfile->line_table->highest_line, col,
1126                                  "backslash and newline separated by space");
1127
1128           if (buffer->next_line > buffer->rlimit)
1129             {
1130               cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line, col,
1131                                    "backslash-newline at end of file");
1132               /* Prevent "no newline at end of file" warning.  */
1133               buffer->next_line = buffer->rlimit;
1134             }
1135
1136           buffer->line_base = note->pos;
1137           CPP_INCREMENT_LINE (pfile, 0);
1138         }
1139       else if (_cpp_trigraph_map[note->type])
1140         {
1141           if (CPP_OPTION (pfile, warn_trigraphs)
1142               && (!in_comment || warn_in_comment (pfile, note)))
1143             {
1144               if (CPP_OPTION (pfile, trigraphs))
1145                 cpp_warning_with_line (pfile, CPP_W_TRIGRAPHS,
1146                                        pfile->line_table->highest_line, col,
1147                                        "trigraph ??%c converted to %c",
1148                                        note->type,
1149                                        (int) _cpp_trigraph_map[note->type]);
1150               else
1151                 {
1152                   cpp_warning_with_line
1153                     (pfile, CPP_W_TRIGRAPHS,
1154                      pfile->line_table->highest_line, col,
1155                      "trigraph ??%c ignored, use -trigraphs to enable",
1156                      note->type);
1157                 }
1158             }
1159         }
1160       else if (note->type == 0)
1161         /* Already processed in lex_raw_string.  */;
1162       else
1163         abort ();
1164     }
1165 }
1166
1167 /* Skip a C-style block comment.  We find the end of the comment by
1168    seeing if an asterisk is before every '/' we encounter.  Returns
1169    nonzero if comment terminated by EOF, zero otherwise.
1170
1171    Buffer->cur points to the initial asterisk of the comment.  */
1172 bool
1173 _cpp_skip_block_comment (cpp_reader *pfile)
1174 {
1175   cpp_buffer *buffer = pfile->buffer;
1176   const uchar *cur = buffer->cur;
1177   uchar c;
1178
1179   cur++;
1180   if (*cur == '/')
1181     cur++;
1182
1183   for (;;)
1184     {
1185       /* People like decorating comments with '*', so check for '/'
1186          instead for efficiency.  */
1187       c = *cur++;
1188
1189       if (c == '/')
1190         {
1191           if (cur[-2] == '*')
1192             break;
1193
1194           /* Warn about potential nested comments, but not if the '/'
1195              comes immediately before the true comment delimiter.
1196              Don't bother to get it right across escaped newlines.  */
1197           if (CPP_OPTION (pfile, warn_comments)
1198               && cur[0] == '*' && cur[1] != '/')
1199             {
1200               buffer->cur = cur;
1201               cpp_warning_with_line (pfile, CPP_W_COMMENTS,
1202                                      pfile->line_table->highest_line,
1203                                      CPP_BUF_COL (buffer),
1204                                      "\"/*\" within comment");
1205             }
1206         }
1207       else if (c == '\n')
1208         {
1209           unsigned int cols;
1210           buffer->cur = cur - 1;
1211           _cpp_process_line_notes (pfile, true);
1212           if (buffer->next_line >= buffer->rlimit)
1213             return true;
1214           _cpp_clean_line (pfile);
1215
1216           cols = buffer->next_line - buffer->line_base;
1217           CPP_INCREMENT_LINE (pfile, cols);
1218
1219           cur = buffer->cur;
1220         }
1221     }
1222
1223   buffer->cur = cur;
1224   _cpp_process_line_notes (pfile, true);
1225   return false;
1226 }
1227
1228 /* Skip a C++ line comment, leaving buffer->cur pointing to the
1229    terminating newline.  Handles escaped newlines.  Returns nonzero
1230    if a multiline comment.  */
1231 static int
1232 skip_line_comment (cpp_reader *pfile)
1233 {
1234   cpp_buffer *buffer = pfile->buffer;
1235   location_t orig_line = pfile->line_table->highest_line;
1236
1237   while (*buffer->cur != '\n')
1238     buffer->cur++;
1239
1240   _cpp_process_line_notes (pfile, true);
1241   return orig_line != pfile->line_table->highest_line;
1242 }
1243
1244 /* Skips whitespace, saving the next non-whitespace character.  */
1245 static void
1246 skip_whitespace (cpp_reader *pfile, cppchar_t c)
1247 {
1248   cpp_buffer *buffer = pfile->buffer;
1249   bool saw_NUL = false;
1250
1251   do
1252     {
1253       /* Horizontal space always OK.  */
1254       if (c == ' ' || c == '\t')
1255         ;
1256       /* Just \f \v or \0 left.  */
1257       else if (c == '\0')
1258         saw_NUL = true;
1259       else if (pfile->state.in_directive && CPP_PEDANTIC (pfile))
1260         cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line,
1261                              CPP_BUF_COL (buffer),
1262                              "%s in preprocessing directive",
1263                              c == '\f' ? "form feed" : "vertical tab");
1264
1265       c = *buffer->cur++;
1266     }
1267   /* We only want non-vertical space, i.e. ' ' \t \f \v \0.  */
1268   while (is_nvspace (c));
1269
1270   if (saw_NUL)
1271     cpp_error (pfile, CPP_DL_WARNING, "null character(s) ignored");
1272
1273   buffer->cur--;
1274 }
1275
1276 /* See if the characters of a number token are valid in a name (no
1277    '.', '+' or '-').  */
1278 static int
1279 name_p (cpp_reader *pfile, const cpp_string *string)
1280 {
1281   unsigned int i;
1282
1283   for (i = 0; i < string->len; i++)
1284     if (!is_idchar (string->text[i]))
1285       return 0;
1286
1287   return 1;
1288 }
1289
1290 /* After parsing an identifier or other sequence, produce a warning about
1291    sequences not in NFC/NFKC.  */
1292 static void
1293 warn_about_normalization (cpp_reader *pfile,
1294                           const cpp_token *token,
1295                           const struct normalize_state *s)
1296 {
1297   if (CPP_OPTION (pfile, warn_normalize) < NORMALIZE_STATE_RESULT (s)
1298       && !pfile->state.skipping)
1299     {
1300       /* Make sure that the token is printed using UCNs, even
1301          if we'd otherwise happily print UTF-8.  */
1302       unsigned char *buf = XNEWVEC (unsigned char, cpp_token_len (token));
1303       size_t sz;
1304
1305       sz = cpp_spell_token (pfile, token, buf, false) - buf;
1306       if (NORMALIZE_STATE_RESULT (s) == normalized_C)
1307         cpp_warning_with_line (pfile, CPP_W_NORMALIZE, token->src_loc, 0,
1308                                "`%.*s' is not in NFKC", (int) sz, buf);
1309       else
1310         cpp_warning_with_line (pfile, CPP_W_NORMALIZE, token->src_loc, 0,
1311                                "`%.*s' is not in NFC", (int) sz, buf);
1312       free (buf);
1313     }
1314 }
1315
/* Smallest byte value that forms_identifier_p treats as the possible
   start of a UTF-8 encoded extended character.  */
static const cppchar_t utf8_signifier = 0xC0;
1317
1318 /* Returns TRUE if the sequence starting at buffer->cur is valid in
1319    an identifier.  FIRST is TRUE if this starts an identifier.  */
1320 static bool
1321 forms_identifier_p (cpp_reader *pfile, int first,
1322                     struct normalize_state *state)
1323 {
1324   cpp_buffer *buffer = pfile->buffer;
1325
1326   if (*buffer->cur == '$')
1327     {
1328       if (!CPP_OPTION (pfile, dollars_in_ident))
1329         return false;
1330
1331       buffer->cur++;
1332       if (CPP_OPTION (pfile, warn_dollars) && !pfile->state.skipping)
1333         {
1334           CPP_OPTION (pfile, warn_dollars) = 0;
1335           cpp_error (pfile, CPP_DL_PEDWARN, "'$' in identifier or number");
1336         }
1337
1338       return true;
1339     }
1340
1341   /* Is this a syntactically valid UCN or a valid UTF-8 char?  */
1342   if (CPP_OPTION (pfile, extended_identifiers))
1343     {
1344       cppchar_t s;
1345       if (*buffer->cur >= utf8_signifier)
1346         {
1347           if (_cpp_valid_utf8 (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
1348                                state, &s))
1349             return true;
1350         }
1351       else if (*buffer->cur == '\\'
1352                && (buffer->cur[1] == 'u' || buffer->cur[1] == 'U'))
1353         {
1354           buffer->cur += 2;
1355           if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
1356                               state, &s, NULL, NULL))
1357             return true;
1358           buffer->cur -= 2;
1359         }
1360     }
1361
1362   return false;
1363 }
1364
1365 /* Helper function to issue error about improper __VA_OPT__ use.  */
1366 static void
1367 maybe_va_opt_error (cpp_reader *pfile)
1368 {
1369   if (CPP_PEDANTIC (pfile) && !CPP_OPTION (pfile, va_opt))
1370     {
1371       /* __VA_OPT__ should not be accepted at all, but allow it in
1372          system headers.  */
1373       if (!cpp_in_system_header (pfile))
1374         cpp_error (pfile, CPP_DL_PEDWARN,
1375                    "__VA_OPT__ is not available until C++20");
1376     }
1377   else if (!pfile->state.va_args_ok)
1378     {
1379       /* __VA_OPT__ should only appear in the replacement list of a
1380          variadic macro.  */
1381       cpp_error (pfile, CPP_DL_PEDWARN,
1382                  "__VA_OPT__ can only appear in the expansion"
1383                  " of a C++20 variadic macro");
1384     }
1385 }
1386
1387 /* Helper function to get the cpp_hashnode of the identifier BASE.  */
1388 static cpp_hashnode *
1389 lex_identifier_intern (cpp_reader *pfile, const uchar *base)
1390 {
1391   cpp_hashnode *result;
1392   const uchar *cur;
1393   unsigned int len;
1394   unsigned int hash = HT_HASHSTEP (0, *base);
1395
1396   cur = base + 1;
1397   while (ISIDNUM (*cur))
1398     {
1399       hash = HT_HASHSTEP (hash, *cur);
1400       cur++;
1401     }
1402   len = cur - base;
1403   hash = HT_HASHFINISH (hash, len);
1404   result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
1405                                               base, len, hash, HT_ALLOC));
1406
1407   /* Rarely, identifiers require diagnostics when lexed.  */
1408   if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
1409                         && !pfile->state.skipping, 0))
1410     {
1411       /* It is allowed to poison the same identifier twice.  */
1412       if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
1413         cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
1414                    NODE_NAME (result));
1415
1416       /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
1417          replacement list of a variadic macro.  */
1418       if (result == pfile->spec_nodes.n__VA_ARGS__
1419           && !pfile->state.va_args_ok)
1420         {
1421           if (CPP_OPTION (pfile, cplusplus))
1422             cpp_error (pfile, CPP_DL_PEDWARN,
1423                        "__VA_ARGS__ can only appear in the expansion"
1424                        " of a C++11 variadic macro");
1425           else
1426             cpp_error (pfile, CPP_DL_PEDWARN,
1427                        "__VA_ARGS__ can only appear in the expansion"
1428                        " of a C99 variadic macro");
1429         }
1430
1431       if (result == pfile->spec_nodes.n__VA_OPT__)
1432         maybe_va_opt_error (pfile);
1433
1434       /* For -Wc++-compat, warn about use of C++ named operators.  */
1435       if (result->flags & NODE_WARN_OPERATOR)
1436         cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
1437                      "identifier \"%s\" is a special operator name in C++",
1438                      NODE_NAME (result));
1439     }
1440
1441   return result;
1442 }
1443
1444 /* Get the cpp_hashnode of an identifier specified by NAME in
1445    the current cpp_reader object.  If none is found, NULL is returned.  */
1446 cpp_hashnode *
1447 _cpp_lex_identifier (cpp_reader *pfile, const char *name)
1448 {
1449   cpp_hashnode *result;
1450   result = lex_identifier_intern (pfile, (uchar *) name);
1451   return result;
1452 }
1453
1454 /* Lex an identifier starting at BUFFER->CUR - 1.  */
1455 static cpp_hashnode *
1456 lex_identifier (cpp_reader *pfile, const uchar *base, bool starts_ucn,
1457                 struct normalize_state *nst, cpp_hashnode **spelling)
1458 {
1459   cpp_hashnode *result;
1460   const uchar *cur;
1461   unsigned int len;
1462   unsigned int hash = HT_HASHSTEP (0, *base);
1463
1464   cur = pfile->buffer->cur;
1465   if (! starts_ucn)
1466     {
1467       while (ISIDNUM (*cur))
1468         {
1469           hash = HT_HASHSTEP (hash, *cur);
1470           cur++;
1471         }
1472       NORMALIZE_STATE_UPDATE_IDNUM (nst, *(cur - 1));
1473     }
1474   pfile->buffer->cur = cur;
1475   if (starts_ucn || forms_identifier_p (pfile, false, nst))
1476     {
1477       /* Slower version for identifiers containing UCNs
1478          or extended chars (including $).  */
1479       do {
1480         while (ISIDNUM (*pfile->buffer->cur))
1481           {
1482             NORMALIZE_STATE_UPDATE_IDNUM (nst, *pfile->buffer->cur);
1483             pfile->buffer->cur++;
1484           }
1485       } while (forms_identifier_p (pfile, false, nst));
1486       result = _cpp_interpret_identifier (pfile, base,
1487                                           pfile->buffer->cur - base);
1488       *spelling = cpp_lookup (pfile, base, pfile->buffer->cur - base);
1489     }
1490   else
1491     {
1492       len = cur - base;
1493       hash = HT_HASHFINISH (hash, len);
1494
1495       result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
1496                                                   base, len, hash, HT_ALLOC));
1497       *spelling = result;
1498     }
1499
1500   /* Rarely, identifiers require diagnostics when lexed.  */
1501   if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
1502                         && !pfile->state.skipping, 0))
1503     {
1504       /* It is allowed to poison the same identifier twice.  */
1505       if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
1506         cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
1507                    NODE_NAME (result));
1508
1509       /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
1510          replacement list of a variadic macro.  */
1511       if (result == pfile->spec_nodes.n__VA_ARGS__
1512           && !pfile->state.va_args_ok)
1513         {
1514           if (CPP_OPTION (pfile, cplusplus))
1515             cpp_error (pfile, CPP_DL_PEDWARN,
1516                        "__VA_ARGS__ can only appear in the expansion"
1517                        " of a C++11 variadic macro");
1518           else
1519             cpp_error (pfile, CPP_DL_PEDWARN,
1520                        "__VA_ARGS__ can only appear in the expansion"
1521                        " of a C99 variadic macro");
1522         }
1523
1524       /* __VA_OPT__ should only appear in the replacement list of a
1525          variadic macro.  */
1526       if (result == pfile->spec_nodes.n__VA_OPT__)
1527         maybe_va_opt_error (pfile);
1528
1529       /* For -Wc++-compat, warn about use of C++ named operators.  */
1530       if (result->flags & NODE_WARN_OPERATOR)
1531         cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
1532                      "identifier \"%s\" is a special operator name in C++",
1533                      NODE_NAME (result));
1534     }
1535
1536   return result;
1537 }
1538
1539 /* Lex a number to NUMBER starting at BUFFER->CUR - 1.  */
1540 static void
1541 lex_number (cpp_reader *pfile, cpp_string *number,
1542             struct normalize_state *nst)
1543 {
1544   const uchar *cur;
1545   const uchar *base;
1546   uchar *dest;
1547
1548   base = pfile->buffer->cur - 1;
1549   do
1550     {
1551       cur = pfile->buffer->cur;
1552
1553       /* N.B. ISIDNUM does not include $.  */
1554       while (ISIDNUM (*cur) || *cur == '.' || DIGIT_SEP (*cur)
1555              || VALID_SIGN (*cur, cur[-1]))
1556         {
1557           NORMALIZE_STATE_UPDATE_IDNUM (nst, *cur);
1558           cur++;
1559         }
1560       /* A number can't end with a digit separator.  */
1561       while (cur > pfile->buffer->cur && DIGIT_SEP (cur[-1]))
1562         --cur;
1563
1564       pfile->buffer->cur = cur;
1565     }
1566   while (forms_identifier_p (pfile, false, nst));
1567
1568   number->len = cur - base;
1569   dest = _cpp_unaligned_alloc (pfile, number->len + 1);
1570   memcpy (dest, base, number->len);
1571   dest[number->len] = '\0';
1572   number->text = dest;
1573 }
1574
1575 /* Create a token of type TYPE with a literal spelling.  */
1576 static void
1577 create_literal (cpp_reader *pfile, cpp_token *token, const uchar *base,
1578                 unsigned int len, enum cpp_ttype type)
1579 {
1580   uchar *dest = _cpp_unaligned_alloc (pfile, len + 1);
1581
1582   memcpy (dest, base, len);
1583   dest[len] = '\0';
1584   token->type = type;
1585   token->val.str.len = len;
1586   token->val.str.text = dest;
1587 }
1588
1589 /* A pair of raw buffer pointers.  The currently open one is [1], the
1590    first one is [0].  Used for string literal lexing.  */
1591 struct lit_accum {
1592   _cpp_buff *first;
1593   _cpp_buff *last;
1594   const uchar *rpos;
1595   size_t accum;
1596
1597   lit_accum ()
1598     : first (NULL), last (NULL), rpos (0), accum (0)
1599   {
1600   }
1601
1602   void append (cpp_reader *, const uchar *, size_t);
1603
1604   void read_begin (cpp_reader *);
1605   bool reading_p () const
1606   {
1607     return rpos != NULL;
1608   }
1609   char read_char ()
1610   {
1611     char c = *rpos++;
1612     if (rpos == BUFF_FRONT (last))
1613       rpos = NULL;
1614     return c;
1615   }
1616 };
1617
1618 /* Subroutine of lex_raw_string: Append LEN chars from BASE to the buffer
1619    sequence from *FIRST_BUFF_P to LAST_BUFF_P.  */
1620
1621 void
1622 lit_accum::append (cpp_reader *pfile, const uchar *base, size_t len)
1623 {
1624   if (!last)
1625     /* Starting.  */
1626     first = last = _cpp_get_buff (pfile, len);
1627   else if (len > BUFF_ROOM (last))
1628     {
1629       /* There is insufficient room in the buffer.  Copy what we can,
1630          and then either extend or create a new one.  */
1631       size_t room = BUFF_ROOM (last);
1632       memcpy (BUFF_FRONT (last), base, room);
1633       BUFF_FRONT (last) += room;
1634       base += room;
1635       len -= room;
1636       accum += room;
1637
1638       gcc_checking_assert (!rpos);
1639
1640       last = _cpp_append_extend_buff (pfile, last, len);
1641     }
1642
1643   memcpy (BUFF_FRONT (last), base, len);
1644   BUFF_FRONT (last) += len;
1645   accum += len;
1646 }
1647
1648 void
1649 lit_accum::read_begin (cpp_reader *pfile)
1650 {
1651   /* We never accumulate more than 4 chars to read.  */
1652   if (BUFF_ROOM (last) < 4)
1653
1654     last = _cpp_append_extend_buff (pfile, last, 4);
1655   rpos = BUFF_FRONT (last);
1656 }
1657
1658 /* Returns true if a macro has been defined.
1659    This might not work if compile with -save-temps,
1660    or preprocess separately from compilation.  */
1661
1662 static bool
1663 is_macro(cpp_reader *pfile, const uchar *base)
1664 {
1665   const uchar *cur = base;
1666   if (! ISIDST (*cur))
1667     return false;
1668   unsigned int hash = HT_HASHSTEP (0, *cur);
1669   ++cur;
1670   while (ISIDNUM (*cur))
1671     {
1672       hash = HT_HASHSTEP (hash, *cur);
1673       ++cur;
1674     }
1675   hash = HT_HASHFINISH (hash, cur - base);
1676
1677   cpp_hashnode *result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
1678                                         base, cur - base, hash, HT_NO_INSERT));
1679
1680   return result && cpp_macro_p (result);
1681 }
1682
1683 /* Returns true if a literal suffix does not have the expected form
1684    and is defined as a macro.  */
1685
1686 static bool
1687 is_macro_not_literal_suffix(cpp_reader *pfile, const uchar *base)
1688 {
1689   /* User-defined literals outside of namespace std must start with a single
1690      underscore, so assume anything of that form really is a UDL suffix.
1691      We don't need to worry about UDLs defined inside namespace std because
1692      their names are reserved, so cannot be used as macro names in valid
1693      programs.  */
1694   if (base[0] == '_' && base[1] != '_')
1695     return false;
1696   return is_macro (pfile, base);
1697 }
1698
1699 /* Lexes a raw string.  The stored string contains the spelling,
1700    including double quotes, delimiter string, '(' and ')', any leading
1701    'L', 'u', 'U' or 'u8' and 'R' modifier.  The created token contains
1702    the type of the literal, or CPP_OTHER if it was not properly
1703    terminated.
1704
1705    BASE is the start of the token.  Updates pfile->buffer->cur to just
1706    after the lexed string.
1707
1708    The spelling is NUL-terminated, but it is not guaranteed that this
1709    is the first NUL since embedded NULs are preserved.  */
1710
1711 static void
1712 lex_raw_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
1713 {
1714   const uchar *pos = base;
1715
1716   /* 'tis a pity this information isn't passed down from the lexer's
1717      initial categorization of the token.  */
1718   enum cpp_ttype type = CPP_STRING;
1719
1720   if (*pos == 'L')
1721     {
1722       type = CPP_WSTRING;
1723       pos++;
1724     }
1725   else if (*pos == 'U')
1726     {
1727       type = CPP_STRING32;
1728       pos++;
1729     }
1730   else if (*pos == 'u')
1731     {
1732       if (pos[1] == '8')
1733         {
1734           type = CPP_UTF8STRING;
1735           pos++;
1736         }
1737       else
1738         type = CPP_STRING16;
1739       pos++;
1740     }
1741
1742   gcc_checking_assert (pos[0] == 'R' && pos[1] == '"');
1743   pos += 2;
1744
1745   _cpp_line_note *note = &pfile->buffer->notes[pfile->buffer->cur_note];
1746
1747   /* Skip notes before the ".  */
1748   while (note->pos < pos)
1749     ++note;
1750
1751   lit_accum accum;
1752
1753   uchar prefix[17];
1754   unsigned prefix_len = 0;
1755   enum Phase
1756   {
1757    PHASE_PREFIX = -2,
1758    PHASE_NONE = -1,
1759    PHASE_SUFFIX = 0
1760   } phase = PHASE_PREFIX;
1761
1762   for (;;)
1763     {
1764       gcc_checking_assert (note->pos >= pos);
1765
1766       /* Undo any escaped newlines and trigraphs.  */
1767       if (!accum.reading_p () && note->pos == pos)
1768         switch (note->type)
1769           {
1770           case '\\':
1771           case ' ':
1772             /* Restore backslash followed by newline.  */
1773             accum.append (pfile, base, pos - base);
1774             base = pos;
1775             accum.read_begin (pfile);
1776             accum.append (pfile, UC"\\", 1);
1777
1778           after_backslash:
1779             if (note->type == ' ')
1780               /* GNU backslash whitespace newline extension.  FIXME
1781                  could be any sequence of non-vertical space.  When we
1782                  can properly restore any such sequence, we should
1783                  mark this note as handled so _cpp_process_line_notes
1784                  doesn't warn.  */
1785               accum.append (pfile, UC" ", 1);
1786
1787             accum.append (pfile, UC"\n", 1);
1788             note++;
1789             break;
1790
1791           case '\n':
1792             /* This can happen for ??/<NEWLINE> when trigraphs are not
1793                being interpretted.  */
1794             gcc_checking_assert (!CPP_OPTION (pfile, trigraphs));
1795             note->type = 0;
1796             note++;
1797             break;
1798
1799           default:
1800             gcc_checking_assert (_cpp_trigraph_map[note->type]);
1801
1802             /* Don't warn about this trigraph in
1803                _cpp_process_line_notes, since trigraphs show up as
1804                trigraphs in raw strings.  */
1805             uchar type = note->type;
1806             note->type = 0;
1807
1808             if (CPP_OPTION (pfile, trigraphs))
1809               {
1810                 accum.append (pfile, base, pos - base);
1811                 base = pos;
1812                 accum.read_begin (pfile);
1813                 accum.append (pfile, UC"??", 2);
1814                 accum.append (pfile, &type, 1);
1815
1816                 /* ??/ followed by newline gets two line notes, one for
1817                    the trigraph and one for the backslash/newline.  */
1818                 if (type == '/' && note[1].pos == pos)
1819                   {
1820                     note++;
1821                     gcc_assert (note->type == '\\' || note->type == ' ');
1822                     goto after_backslash;
1823                   }
1824                 /* Skip the replacement character.  */
1825                 base = ++pos;
1826               }
1827
1828             note++;
1829             break;
1830           }
1831
1832       /* Now get a char to process.  Either from an expanded note, or
1833          from the line buffer.  */
1834       bool read_note = accum.reading_p ();
1835       char c = read_note ? accum.read_char () : *pos++;
1836
1837       if (phase == PHASE_PREFIX)
1838         {
1839           if (c == '(')
1840             {
1841               /* Done.  */
1842               phase = PHASE_NONE;
1843               prefix[prefix_len++] = '"';
1844             }
1845           else if (prefix_len < 16
1846                    /* Prefix chars are any of the basic character set,
1847                       [lex.charset] except for '
1848                       ()\\\t\v\f\n'. Optimized for a contiguous
1849                       alphabet.  */
1850                    /* Unlike a switch, this collapses down to one or
1851                       two shift and bitmask operations on an ASCII
1852                       system, with an outlier or two.   */
1853                    && (('Z' - 'A' == 25
1854                         ? ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'))
1855                         : ISIDST (c))
1856                        || (c >= '0' && c <= '9')
1857                        || c == '_' || c == '{' || c == '}'
1858                        || c == '[' || c == ']' || c == '#'
1859                        || c == '<' || c == '>' || c == '%'
1860                        || c == ':' || c == ';' || c == '.' || c == '?'
1861                        || c == '*' || c == '+' || c == '-' || c == '/'
1862                        || c == '^' || c == '&' || c == '|' || c == '~'
1863                        || c == '!' || c == '=' || c == ','
1864                        || c == '"' || c == '\''))
1865             prefix[prefix_len++] = c;
1866           else
1867             {
1868               /* Something is wrong.  */
1869               int col = CPP_BUF_COLUMN (pfile->buffer, pos) + read_note;
1870               if (prefix_len == 16)
1871                 cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
1872                                      col, "raw string delimiter longer "
1873                                      "than 16 characters");
1874               else if (c == '\n')
1875                 cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
1876                                      col, "invalid new-line in raw "
1877                                      "string delimiter");
1878               else
1879                 cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
1880                                      col, "invalid character '%c' in "
1881                                      "raw string delimiter", c);
1882               type = CPP_OTHER;
1883               phase = PHASE_NONE;
1884               /* Continue until we get a close quote, that's probably
1885                  the best failure mode.  */
1886               prefix_len = 0;
1887             }
1888           if (c != '\n')
1889             continue;
1890         }
1891
1892       if (phase != PHASE_NONE)
1893         {
1894           if (prefix[phase] != c)
1895             phase = PHASE_NONE;
1896           else if (unsigned (phase + 1) == prefix_len)
1897             break;
1898           else
1899             {
1900               phase = Phase (phase + 1);
1901               continue;
1902             }
1903         }
1904
1905       if (!prefix_len && c == '"')
1906         /* Failure mode lexing.  */
1907         goto out;
1908       else if (prefix_len && c == ')')
1909         phase = PHASE_SUFFIX;
1910       else if (!read_note && c == '\n')
1911         {
1912           pos--;
1913           pfile->buffer->cur = pos;
1914           if (pfile->state.in_directive
1915               || (pfile->state.parsing_args
1916                   && pfile->buffer->next_line >= pfile->buffer->rlimit))
1917             {
1918               cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, 0,
1919                                    "unterminated raw string");
1920               type = CPP_OTHER;
1921               goto out;
1922             }
1923
1924           accum.append (pfile, base, pos - base + 1);
1925           _cpp_process_line_notes (pfile, false);
1926
1927           if (pfile->buffer->next_line < pfile->buffer->rlimit)
1928             CPP_INCREMENT_LINE (pfile, 0);
1929           pfile->buffer->need_line = true;
1930
1931           if (!_cpp_get_fresh_line (pfile))
1932             {
1933               /* We ran out of file and failed to get a line.  */
1934               location_t src_loc = token->src_loc;
1935               token->type = CPP_EOF;
1936               /* Tell the compiler the line number of the EOF token.  */
1937               token->src_loc = pfile->line_table->highest_line;
1938               token->flags = BOL;
1939               if (accum.first)
1940                 _cpp_release_buff (pfile, accum.first);
1941               cpp_error_with_line (pfile, CPP_DL_ERROR, src_loc, 0,
1942                                    "unterminated raw string");
1943               /* Now pop the buffer that _cpp_get_fresh_line did not.  */
1944               _cpp_pop_buffer (pfile);
1945               return;
1946             }
1947
1948           pos = base = pfile->buffer->cur;
1949           note = &pfile->buffer->notes[pfile->buffer->cur_note];
1950         }
1951     }
1952
1953   if (CPP_OPTION (pfile, user_literals))
1954     {
1955       /* If a string format macro, say from inttypes.h, is placed touching
1956          a string literal it could be parsed as a C++11 user-defined string
1957          literal thus breaking the program.  */
1958       if (is_macro_not_literal_suffix (pfile, pos))
1959         {
1960           /* Raise a warning, but do not consume subsequent tokens.  */
1961           if (CPP_OPTION (pfile, warn_literal_suffix) && !pfile->state.skipping)
1962             cpp_warning_with_line (pfile, CPP_W_LITERAL_SUFFIX,
1963                                    token->src_loc, 0,
1964                                    "invalid suffix on literal; C++11 requires "
1965                                    "a space between literal and string macro");
1966         }
1967       /* Grab user defined literal suffix.  */
1968       else if (ISIDST (*pos))
1969         {
1970           type = cpp_userdef_string_add_type (type);
1971           ++pos;
1972
1973           while (ISIDNUM (*pos))
1974             ++pos;
1975         }
1976     }
1977
1978  out:
1979   pfile->buffer->cur = pos;
1980   if (!accum.accum)
1981     create_literal (pfile, token, base, pos - base, type);
1982   else
1983     {
1984       size_t extra_len = pos - base;
1985       uchar *dest = _cpp_unaligned_alloc (pfile, accum.accum + extra_len + 1);
1986
1987       token->type = type;
1988       token->val.str.len = accum.accum + extra_len;
1989       token->val.str.text = dest;
1990       for (_cpp_buff *buf = accum.first; buf; buf = buf->next)
1991         {
1992           size_t len = BUFF_FRONT (buf) - buf->base;
1993           memcpy (dest, buf->base, len);
1994           dest += len;
1995         }
1996       _cpp_release_buff (pfile, accum.first);
1997       memcpy (dest, base, extra_len);
1998       dest[extra_len] = '\0';
1999     }
2000 }
2001
2002 /* Lexes a string, character constant, or angle-bracketed header file
2003    name.  The stored string contains the spelling, including opening
2004    quote and any leading 'L', 'u', 'U' or 'u8' and optional
2005    'R' modifier.  It returns the type of the literal, or CPP_OTHER
2006    if it was not properly terminated, or CPP_LESS for an unterminated
2007    header name which must be relexed as normal tokens.
2008
2009    The spelling is NUL-terminated, but it is not guaranteed that this
2010    is the first NUL since embedded NULs are preserved.  */
2011 static void
2012 lex_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
2013 {
2014   bool saw_NUL = false;
2015   const uchar *cur;
2016   cppchar_t terminator;
2017   enum cpp_ttype type;
2018
2019   cur = base;
2020   terminator = *cur++;
2021   if (terminator == 'L' || terminator == 'U')
2022     terminator = *cur++;
2023   else if (terminator == 'u')
2024     {
2025       terminator = *cur++;
2026       if (terminator == '8')
2027         terminator = *cur++;
2028     }
2029   if (terminator == 'R')
2030     {
2031       lex_raw_string (pfile, token, base);
2032       return;
2033     }
2034   if (terminator == '"')
2035     type = (*base == 'L' ? CPP_WSTRING :
2036             *base == 'U' ? CPP_STRING32 :
2037             *base == 'u' ? (base[1] == '8' ? CPP_UTF8STRING : CPP_STRING16)
2038                          : CPP_STRING);
2039   else if (terminator == '\'')
2040     type = (*base == 'L' ? CPP_WCHAR :
2041             *base == 'U' ? CPP_CHAR32 :
2042             *base == 'u' ? (base[1] == '8' ? CPP_UTF8CHAR : CPP_CHAR16)
2043                          : CPP_CHAR);
2044   else
2045     terminator = '>', type = CPP_HEADER_NAME;
2046
2047   for (;;)
2048     {
2049       cppchar_t c = *cur++;
2050
2051       /* In #include-style directives, terminators are not escapable.  */
2052       if (c == '\\' && !pfile->state.angled_headers && *cur != '\n')
2053         cur++;
2054       else if (c == terminator)
2055         break;
2056       else if (c == '\n')
2057         {
2058           cur--;
2059           /* Unmatched quotes always yield undefined behavior, but
2060              greedy lexing means that what appears to be an unterminated
2061              header name may actually be a legitimate sequence of tokens.  */
2062           if (terminator == '>')
2063             {
2064               token->type = CPP_LESS;
2065               return;
2066             }
2067           type = CPP_OTHER;
2068           break;
2069         }
2070       else if (c == '\0')
2071         saw_NUL = true;
2072     }
2073
2074   if (saw_NUL && !pfile->state.skipping)
2075     cpp_error (pfile, CPP_DL_WARNING,
2076                "null character(s) preserved in literal");
2077
2078   if (type == CPP_OTHER && CPP_OPTION (pfile, lang) != CLK_ASM)
2079     cpp_error (pfile, CPP_DL_PEDWARN, "missing terminating %c character",
2080                (int) terminator);
2081
2082   if (CPP_OPTION (pfile, user_literals))
2083     {
2084       /* If a string format macro, say from inttypes.h, is placed touching
2085          a string literal it could be parsed as a C++11 user-defined string
2086          literal thus breaking the program.  */
2087       if (is_macro_not_literal_suffix (pfile, cur))
2088         {
2089           /* Raise a warning, but do not consume subsequent tokens.  */
2090           if (CPP_OPTION (pfile, warn_literal_suffix) && !pfile->state.skipping)
2091             cpp_warning_with_line (pfile, CPP_W_LITERAL_SUFFIX,
2092                                    token->src_loc, 0,
2093                                    "invalid suffix on literal; C++11 requires "
2094                                    "a space between literal and string macro");
2095         }
2096       /* Grab user defined literal suffix.  */
2097       else if (ISIDST (*cur))
2098         {
2099           type = cpp_userdef_char_add_type (type);
2100           type = cpp_userdef_string_add_type (type);
2101           ++cur;
2102
2103           while (ISIDNUM (*cur))
2104             ++cur;
2105         }
2106     }
2107   else if (CPP_OPTION (pfile, cpp_warn_cxx11_compat)
2108            && is_macro (pfile, cur)
2109            && !pfile->state.skipping)
2110     cpp_warning_with_line (pfile, CPP_W_CXX11_COMPAT,
2111                            token->src_loc, 0, "C++11 requires a space "
2112                            "between string literal and macro");
2113
2114   pfile->buffer->cur = cur;
2115   create_literal (pfile, token, base, cur - base, type);
2116 }
2117
2118 /* Return the comment table. The client may not make any assumption
2119    about the ordering of the table.  */
2120 cpp_comment_table *
2121 cpp_get_comments (cpp_reader *pfile)
2122 {
2123   return &pfile->comments;
2124 }
2125
2126 /* Append a comment to the end of the comment table. */
2127 static void
2128 store_comment (cpp_reader *pfile, cpp_token *token)
2129 {
2130   int len;
2131
2132   if (pfile->comments.allocated == 0)
2133     {
2134       pfile->comments.allocated = 256;
2135       pfile->comments.entries = (cpp_comment *) xmalloc
2136         (pfile->comments.allocated * sizeof (cpp_comment));
2137     }
2138
2139   if (pfile->comments.count == pfile->comments.allocated)
2140     {
2141       pfile->comments.allocated *= 2;
2142       pfile->comments.entries = (cpp_comment *) xrealloc
2143         (pfile->comments.entries,
2144          pfile->comments.allocated * sizeof (cpp_comment));
2145     }
2146
2147   len = token->val.str.len;
2148
2149   /* Copy comment. Note, token may not be NULL terminated. */
2150   pfile->comments.entries[pfile->comments.count].comment =
2151     (char *) xmalloc (sizeof (char) * (len + 1));
2152   memcpy (pfile->comments.entries[pfile->comments.count].comment,
2153           token->val.str.text, len);
2154   pfile->comments.entries[pfile->comments.count].comment[len] = '\0';
2155
2156   /* Set source location. */
2157   pfile->comments.entries[pfile->comments.count].sloc = token->src_loc;
2158
2159   /* Increment the count of entries in the comment table. */
2160   pfile->comments.count++;
2161 }
2162
2163 /* The stored comment includes the comment start and any terminator.  */
2164 static void
2165 save_comment (cpp_reader *pfile, cpp_token *token, const unsigned char *from,
2166               cppchar_t type)
2167 {
2168   unsigned char *buffer;
2169   unsigned int len, clen, i;
2170
2171   len = pfile->buffer->cur - from + 1; /* + 1 for the initial '/'.  */
2172
2173   /* C++ comments probably (not definitely) have moved past a new
2174      line, which we don't want to save in the comment.  */
2175   if (is_vspace (pfile->buffer->cur[-1]))
2176     len--;
2177
2178   /* If we are currently in a directive or in argument parsing, then
2179      we need to store all C++ comments as C comments internally, and
2180      so we need to allocate a little extra space in that case.
2181
2182      Note that the only time we encounter a directive here is
2183      when we are saving comments in a "#define".  */
2184   clen = ((pfile->state.in_directive || pfile->state.parsing_args)
2185           && type == '/') ? len + 2 : len;
2186
2187   buffer = _cpp_unaligned_alloc (pfile, clen);
2188
2189   token->type = CPP_COMMENT;
2190   token->val.str.len = clen;
2191   token->val.str.text = buffer;
2192
2193   buffer[0] = '/';
2194   memcpy (buffer + 1, from, len - 1);
2195
2196   /* Finish conversion to a C comment, if necessary.  */
2197   if ((pfile->state.in_directive || pfile->state.parsing_args) && type == '/')
2198     {
2199       buffer[1] = '*';
2200       buffer[clen - 2] = '*';
2201       buffer[clen - 1] = '/';
2202       /* As there can be in a C++ comments illegal sequences for C comments
2203          we need to filter them out.  */
2204       for (i = 2; i < (clen - 2); i++)
2205         if (buffer[i] == '/' && (buffer[i - 1] == '*' || buffer[i + 1] == '*'))
2206           buffer[i] = '|';
2207     }
2208
2209   /* Finally store this comment for use by clients of libcpp. */
2210   store_comment (pfile, token);
2211 }
2212
2213 /* Returns true if comment at COMMENT_START is a recognized FALLTHROUGH
2214    comment.

   COMMENT_START is tested against '*' to tell a C block comment from a
   C++ line comment, and matching begins at the character after it.
   pfile->buffer->cur is used as the upper bound in the length checks.

   What is recognized depends on the cpp_warn_implicit_fallthrough
   option:
     0, 5 and any other value  no comment is recognized;
     1  every comment is recognized;
     2  the comment merely has to contain a fallthrough-like phrase;
     3, 4  the whole comment must match one of the forms listed below,
        level 4 accepting fewer spellings than level 3.  */
2215
2216 static bool
2217 fallthrough_comment_p (cpp_reader *pfile, const unsigned char *comment_start)
2218 {
     /* Skip the character at COMMENT_START itself.  */
2219   const unsigned char *from = comment_start + 1;
2220
2221   switch (CPP_OPTION (pfile, cpp_warn_implicit_fallthrough))
2222     {
2223       /* For both -Wimplicit-fallthrough=0 and -Wimplicit-fallthrough=5 we
2224          don't recognize any comments.  The latter only checks attributes,
2225          the former doesn't warn.  */
2226     case 0:
2227     default:
2228       return false;
2229       /* -Wimplicit-fallthrough=1 considers any comment, no matter what
2230          content it has.  */
2231     case 1:
2232       return true;
2233     case 2:
2234       /* -Wimplicit-fallthrough=2 looks for (case insensitive)
2235          .*falls?[ \t-]*thr(u|ough).* regex.  */
2236       for (; (size_t) (pfile->buffer->cur - from) >= sizeof "fallthru" - 1;
2237            from++)
2238         {
2239           /* Is there anything like strpbrk with upper boundary, or
2240              memchr looking for 2 characters rather than just one?  */
2241           if (from[0] != 'f' && from[0] != 'F')
2242             continue;
2243           if (from[1] != 'a' && from[1] != 'A')
2244             continue;
2245           if (from[2] != 'l' && from[2] != 'L')
2246             continue;
2247           if (from[3] != 'l' && from[3] != 'L')
2248             continue;
               /* FROM itself is advanced from here on, so after a failed
                  partial match the scan resumes past the text already
                  consumed rather than at the following character.  */
2249           from += sizeof "fall" - 1;
2250           if (from[0] == 's' || from[0] == 'S')
2251             from++;
2252           while (*from == ' ' || *from == '\t' || *from == '-')
2253             from++;
2254           if (from[0] != 't' && from[0] != 'T')
2255             continue;
2256           if (from[1] != 'h' && from[1] != 'H')
2257             continue;
2258           if (from[2] != 'r' && from[2] != 'R')
2259             continue;
2260           if (from[3] == 'u' || from[3] == 'U')
2261             return true;
2262           if (from[3] != 'o' && from[3] != 'O')
2263             continue;
2264           if (from[4] != 'u' && from[4] != 'U')
2265             continue;
2266           if (from[5] != 'g' && from[5] != 'G')
2267             continue;
2268           if (from[6] != 'h' && from[6] != 'H')
2269             continue;
2270           return true;
2271         }
2272       return false;
         /* Levels 3 and 4 match the whole comment; that is done by the
            code following the switch.  */
2273     case 3:
2274     case 4:
2275       break;
2276     }
2277
2278   /* Whole comment contents:
2279      -fallthrough
2280      @fallthrough@
2281    */
2282   if (*from == '-' || *from == '@')
2283     {
2284       size_t len = sizeof "fallthrough" - 1;
2285       if ((size_t) (pfile->buffer->cur - from - 1) < len)
2286         return false;
2287       if (memcmp (from + 1, "fallthrough", len))
2288         return false;
2289       if (*from == '@')
2290         {
               /* The '@' form also needs its closing '@'.  */
2291           if (from[len + 1] != '@')
2292             return false;
2293           len++;
2294         }
2295       from += 1 + len;
2296     }
2297   /* Whole comment contents (regex):
2298      lint -fallthrough[ \t]*
2299    */
2300   else if (*from == 'l')
2301     {
2302       size_t len = sizeof "int -fallthrough" - 1;
2303       if ((size_t) (pfile->buffer->cur - from - 1) < len)
2304         return false;
2305       if (memcmp (from + 1, "int -fallthrough", len))
2306         return false;
2307       from += 1 + len;
2308       while (*from == ' ' || *from == '\t')
2309         from++;
2310     }
2311   /* Whole comment contents (regex):
2312      [ \t]*FALLTHR(U|OUGH)[ \t]*
2313    */
2314   else if (CPP_OPTION (pfile, cpp_warn_implicit_fallthrough) == 4)
2315     {
2316       while (*from == ' ' || *from == '\t')
2317         from++;
2318       if ((size_t) (pfile->buffer->cur - from)  < sizeof "FALLTHRU" - 1)
2319         return false;
2320       if (memcmp (from, "FALLTHR", sizeof "FALLTHR" - 1))
2321         return false;
2322       from += sizeof "FALLTHR" - 1;
2323       if (*from == 'U')
2324         from++;
2325       else if ((size_t) (pfile->buffer->cur - from)  < sizeof "OUGH" - 1)
2326         return false;
2327       else if (memcmp (from, "OUGH", sizeof "OUGH" - 1))
2328         return false;
2329       else
2330         from += sizeof "OUGH" - 1;
2331       while (*from == ' ' || *from == '\t')
2332         from++;
2333     }
2334   /* Whole comment contents (regex):
2335      [ \t.!]*(ELSE,? |INTENTIONAL(LY)? )?FALL(S | |-)?THR(OUGH|U)[ \t.!]*(-[^\n\r]*)?
2336      [ \t.!]*(Else,? |Intentional(ly)? )?Fall((s | |-)[Tt]|t)hr(ough|u)[ \t.!]*(-[^\n\r]*)?
2337      [ \t.!]*([Ee]lse,? |[Ii]ntentional(ly)? )?fall(s | |-)?thr(ough|u)[ \t.!]*(-[^\n\r]*)?
2338    */
2339   else
2340     {
2341       while (*from == ' ' || *from == '\t' || *from == '.' || *from == '!')
2342         from++;
           /* F tracks the first letter of the word being matched, and
              ALL_UPPER records that an all-capitals spelling has been
              seen, so that the case of later words can be checked
              against the three variants above.  */
2343       unsigned char f = *from;
2344       bool all_upper = false;
2345       if (f == 'E' || f == 'e')
2346         {
2347           if ((size_t) (pfile->buffer->cur - from)
2348               < sizeof "else fallthru" - 1)
2349             return false;
2350           if (f == 'E' && memcmp (from + 1, "LSE", sizeof "LSE" - 1) == 0)
2351             all_upper = true;
2352           else if (memcmp (from + 1, "lse", sizeof "lse" - 1))
2353             return false;
2354           from += sizeof "else" - 1;
2355           if (*from == ',')
2356             from++;
2357           if (*from != ' ')
2358             return false;
2359           from++;
2360           if (all_upper && *from == 'f')
2361             return false;
2362           if (f == 'e' && *from == 'F')
2363             return false;
2364           f = *from;
2365         }
2366       else if (f == 'I' || f == 'i')
2367         {
2368           if ((size_t) (pfile->buffer->cur - from)
2369               < sizeof "intentional fallthru" - 1)
2370             return false;
2371           if (f == 'I' && memcmp (from + 1, "NTENTIONAL",
2372                                   sizeof "NTENTIONAL" - 1) == 0)
2373             all_upper = true;
2374           else if (memcmp (from + 1, "ntentional",
2375                            sizeof "ntentional" - 1))
2376             return false;
2377           from += sizeof "intentional" - 1;
2378           if (*from == ' ')
2379             {
2380               from++;
2381               if (all_upper && *from == 'f')
2382                 return false;
2383             }
2384           else if (all_upper)
2385             {
2386               if (memcmp (from, "LY F", sizeof "LY F" - 1))
2387                 return false;
2388               from += sizeof "LY " - 1;
2389             }
2390           else
2391             {
2392               if (memcmp (from, "ly ", sizeof "ly " - 1))
2393                 return false;
2394               from += sizeof "ly " - 1;
2395             }
2396           if (f == 'i' && *from == 'F')
2397             return false;
2398           f = *from;
2399         }
2400       if (f != 'F' && f != 'f')
2401         return false;
2402       if ((size_t) (pfile->buffer->cur - from) < sizeof "fallthru" - 1)
2403         return false;
2404       if (f == 'F' && memcmp (from + 1, "ALL", sizeof "ALL" - 1) == 0)
2405         all_upper = true;
2406       else if (all_upper)
2407         return false;
2408       else if (memcmp (from + 1, "all", sizeof "all" - 1))
2409         return false;
2410       from += sizeof "fall" - 1;
2411       if (*from == (all_upper ? 'S' : 's') && from[1] == ' ')
2412         from += 2;
2413       else if (*from == ' ' || *from == '-')
2414         from++;
2415       else if (*from != (all_upper ? 'T' : 't'))
2416         return false;
2417       if ((f == 'f' || *from != 'T') && (all_upper || *from != 't'))
2418         return false;
2419       if ((size_t) (pfile->buffer->cur - from) < sizeof "thru" - 1)
2420         return false;
2421       if (memcmp (from + 1, all_upper ? "HRU" : "hru", sizeof "hru" - 1))
2422         {
2423           if ((size_t) (pfile->buffer->cur - from) < sizeof "through" - 1)
2424             return false;
2425           if (memcmp (from + 1, all_upper ? "HROUGH" : "hrough",
2426                       sizeof "hrough" - 1))
2427             return false;
2428           from += sizeof "through" - 1;
2429         }
2430       else
2431         from += sizeof "thru" - 1;
2432       while (*from == ' ' || *from == '\t' || *from == '.' || *from == '!')
2433         from++;
           /* A '-' after the keyword introduces free-form trailing text.
              Skip it: in a block comment up to a '*' that is followed by
              '/', otherwise up to the end of the line.  Either scan also
              stops at a NUL, '\n' or '\r'.  */
2434       if (*from == '-')
2435         {
2436           from++;
2437           if (*comment_start == '*')
2438             {
2439               do
2440                 {
2441                   while (*from && *from != '*'
2442                          && *from != '\n' && *from != '\r')
2443                     from++;
2444                   if (*from != '*' || from[1] == '/')
2445                     break;
2446                   from++;
2447                 }
2448               while (1);
2449             }
2450           else
2451             while (*from && *from != '\n' && *from != '\r')
2452               from++;
2453         }
2454     }
     /* Whatever form was matched, FROM must now sit exactly on the
        comment terminator; any other text left over means no match.  */
2455   /* C block comment.  */
2456   if (*comment_start == '*')
2457     {
2458       if (*from != '*' || from[1] != '/')
2459         return false;
2460     }
2461   /* C++ line comment.  */
2462   else if (*from != '\n')
2463     return false;
2464
2465   return true;
2466 }
2467
2468 /* Allocate COUNT tokens for RUN.  */
2469 void
2470 _cpp_init_tokenrun (tokenrun *run, unsigned int count)
2471 {
2472   run->base = XNEWVEC (cpp_token, count);
2473   run->limit = run->base + count;
2474   run->next = NULL;
2475 }
2476
2477 /* Returns the next tokenrun, or creates one if there is none.  */
2478 static tokenrun *
2479 next_tokenrun (tokenrun *run)
2480 {
2481   if (run->next == NULL)
2482     {
2483       run->next = XNEW (tokenrun);
2484       run->next->prev = run;
2485       _cpp_init_tokenrun (run->next, 250);
2486     }
2487
2488   return run->next;
2489 }
2490
2491 /* Return the number of not yet processed token in a given
2492    context.  */
2493 int
2494 _cpp_remaining_tokens_num_in_context (cpp_context *context)
2495 {
2496   if (context->tokens_kind == TOKENS_KIND_DIRECT)
2497     return (LAST (context).token - FIRST (context).token);
2498   else if (context->tokens_kind == TOKENS_KIND_INDIRECT
2499            || context->tokens_kind == TOKENS_KIND_EXTENDED)
2500     return (LAST (context).ptoken - FIRST (context).ptoken);
2501   else
2502       abort ();
2503 }
2504
2505 /* Returns the token present at index INDEX in a given context.  If
2506    INDEX is zero, the next token to be processed is returned.  */
2507 static const cpp_token*
2508 _cpp_token_from_context_at (cpp_context *context, int index)
2509 {
2510   if (context->tokens_kind == TOKENS_KIND_DIRECT)
2511     return &(FIRST (context).token[index]);
2512   else if (context->tokens_kind == TOKENS_KIND_INDIRECT
2513            || context->tokens_kind == TOKENS_KIND_EXTENDED)
2514     return FIRST (context).ptoken[index];
2515  else
2516    abort ();
2517 }
2518
2519 /* Look ahead in the input stream.  */
2520 const cpp_token *
2521 cpp_peek_token (cpp_reader *pfile, int index)
2522 {
2523   cpp_context *context = pfile->context;
2524   const cpp_token *peektok;
2525   int count;
2526
2527   /* First, scan through any pending cpp_context objects.  */
2528   while (context->prev)
2529     {
2530       ptrdiff_t sz = _cpp_remaining_tokens_num_in_context (context);
2531
2532       if (index < (int) sz)
2533         return _cpp_token_from_context_at (context, index);
2534       index -= (int) sz;
2535       context = context->prev;
2536     }
2537
2538   /* We will have to read some new tokens after all (and do so
2539      without invalidating preceding tokens).  */
2540   count = index;
2541   pfile->keep_tokens++;
2542
2543   /* For peeked tokens temporarily disable line_change reporting,
2544      until the tokens are parsed for real.  */
2545   void (*line_change) (cpp_reader *, const cpp_token *, int)
2546     = pfile->cb.line_change;
2547   pfile->cb.line_change = NULL;
2548
2549   do
2550     {
2551       peektok = _cpp_lex_token (pfile);
2552       if (peektok->type == CPP_EOF)
2553         {
2554           index--;
2555           break;
2556         }
2557     }
2558   while (index--);
2559
2560   _cpp_backup_tokens_direct (pfile, count - index);
2561   pfile->keep_tokens--;
2562   pfile->cb.line_change = line_change;
2563
2564   return peektok;
2565 }
2566
2567 /* Allocate a single token that is invalidated at the same time as the
2568    rest of the tokens on the line.  Has its line and col set to the
2569    same as the last lexed token, so that diagnostics appear in the
2570    right place.  */
2571 cpp_token *
2572 _cpp_temp_token (cpp_reader *pfile)
2573 {
2574   cpp_token *old, *result;
2575   ptrdiff_t sz = pfile->cur_run->limit - pfile->cur_token;
2576   ptrdiff_t la = (ptrdiff_t) pfile->lookaheads;
2577
2578   old = pfile->cur_token - 1;
2579   /* Any pre-existing lookaheads must not be clobbered.  */
2580   if (la)
2581     {
2582       if (sz <= la)
2583         {
2584           tokenrun *next = next_tokenrun (pfile->cur_run);
2585
2586           if (sz < la)
2587             memmove (next->base + 1, next->base,
2588                      (la - sz) * sizeof (cpp_token));
2589
2590           next->base[0] = pfile->cur_run->limit[-1];
2591         }
2592
2593       if (sz > 1)
2594         memmove (pfile->cur_token + 1, pfile->cur_token,
2595                  MIN (la, sz - 1) * sizeof (cpp_token));
2596     }
2597
2598   if (!sz && pfile->cur_token == pfile->cur_run->limit)
2599     {
2600       pfile->cur_run = next_tokenrun (pfile->cur_run);
2601       pfile->cur_token = pfile->cur_run->base;
2602     }
2603
2604   result = pfile->cur_token++;
2605   result->src_loc = old->src_loc;
2606   return result;
2607 }
2608
2609 /* Lex a token into RESULT (external interface).  Takes care of issues
2610    like directive handling, token lookahead, multiple include
2611    optimization and skipping.  */
2612 const cpp_token *
2613 _cpp_lex_token (cpp_reader *pfile)
2614 {
2615   cpp_token *result;
2616
2617   for (;;)
2618     {
2619       if (pfile->cur_token == pfile->cur_run->limit)
2620         {
2621           pfile->cur_run = next_tokenrun (pfile->cur_run);
2622           pfile->cur_token = pfile->cur_run->base;
2623         }
2624       /* We assume that the current token is somewhere in the current
2625          run.  */
2626       if (pfile->cur_token < pfile->cur_run->base
2627           || pfile->cur_token >= pfile->cur_run->limit)
2628         abort ();
2629
2630       if (pfile->lookaheads)
2631         {
2632           pfile->lookaheads--;
2633           result = pfile->cur_token++;
2634         }
2635       else
2636         result = _cpp_lex_direct (pfile);
2637
2638       if (result->flags & BOL)
2639         {
2640           /* Is this a directive.  If _cpp_handle_directive returns
2641              false, it is an assembler #.  */
2642           if (result->type == CPP_HASH
2643               /* 6.10.3 p 11: Directives in a list of macro arguments
2644                  gives undefined behavior.  This implementation
2645                  handles the directive as normal.  */
2646               && pfile->state.parsing_args != 1)
2647             {
2648               if (_cpp_handle_directive (pfile, result->flags & PREV_WHITE))
2649                 {
2650                   if (pfile->directive_result.type == CPP_PADDING)
2651                     continue;
2652                   result = &pfile->directive_result;
2653                 }
2654             }
2655           else if (pfile->state.in_deferred_pragma)
2656             result = &pfile->directive_result;
2657
2658           if (pfile->cb.line_change && !pfile->state.skipping)
2659             pfile->cb.line_change (pfile, result, pfile->state.parsing_args);
2660         }
2661
2662       /* We don't skip tokens in directives.  */
2663       if (pfile->state.in_directive || pfile->state.in_deferred_pragma)
2664         break;
2665
2666       /* Outside a directive, invalidate controlling macros.  At file
2667          EOF, _cpp_lex_direct takes care of popping the buffer, so we never
2668          get here and MI optimization works.  */
2669       pfile->mi_valid = false;
2670
2671       if (!pfile->state.skipping || result->type == CPP_EOF)
2672         break;
2673     }
2674
2675   return result;
2676 }
2677
2678 /* Returns true if a fresh line has been loaded.  */
2679 bool
2680 _cpp_get_fresh_line (cpp_reader *pfile)
2681 {
2682   /* We can't get a new line until we leave the current directive.  */
2683   if (pfile->state.in_directive)
2684     return false;
2685
2686   for (;;)
2687     {
2688       cpp_buffer *buffer = pfile->buffer;
2689
2690       if (!buffer->need_line)
2691         return true;
2692
2693       if (buffer->next_line < buffer->rlimit)
2694         {
2695           _cpp_clean_line (pfile);
2696           return true;
2697         }
2698
2699       /* First, get out of parsing arguments state.  */
2700       if (pfile->state.parsing_args)
2701         return false;
2702
2703       /* End of buffer.  Non-empty files should end in a newline.  */
2704       if (buffer->buf != buffer->rlimit
2705           && buffer->next_line > buffer->rlimit
2706           && !buffer->from_stage3)
2707         {
2708           /* Clip to buffer size.  */
2709           buffer->next_line = buffer->rlimit;
2710         }
2711
2712       if (buffer->prev && !buffer->return_at_eof)
2713         _cpp_pop_buffer (pfile);
2714       else
2715         {
2716           /* End of translation.  Do not pop the buffer yet. Increment
2717              line number so that the EOF token is on a line of its own
2718              (_cpp_lex_direct doesn't increment in that case, because
2719              it's hard for it to distinguish this special case). */
2720           CPP_INCREMENT_LINE (pfile, 0);
2721           return false;
2722         }
2723     }
2724 }
2725
/* Helper for two-character operators.  Expects `buffer' (whose `cur'
   member points at the next unread character) and `result' (the token
   being built) to be in scope at the point of use.  If the next
   character equals CHAR it is consumed and the token type becomes
   THEN_TYPE; otherwise nothing is consumed and the type becomes
   ELSE_TYPE.  Every argument is parenthesized so that an argument
   containing operators is still evaluated as a single expression.  */
#define IF_NEXT_IS(CHAR, THEN_TYPE, ELSE_TYPE)          \
  do                                                    \
    {                                                   \
      if (*buffer->cur == (CHAR))                       \
        {                                               \
          buffer->cur++;                                \
          result->type = (THEN_TYPE);                   \
        }                                               \
      else                                              \
        result->type = (ELSE_TYPE);                     \
    }                                                   \
  while (0)
2734
/* Lex a token into pfile->cur_token, which is also incremented, to
   get diagnostics pointing to the correct location.

   Does not handle issues such as token lookahead, multiple-include
   optimization, directives, skipping etc.  This function is only
   suitable for use by _cpp_lex_token, and in special cases like
   lex_expansion_token which doesn't care for any of these issues.

   When meeting a newline, returns CPP_EOF if parsing a directive,
   otherwise returns to the start of the token buffer if permissible.
   When a new line is needed while a deferred pragma is being lexed,
   a CPP_PRAGMA_EOL token is returned instead.

   Returns a pointer to the lexed token.  Its src_loc is set to the
   location of the token and, where possible, widened to a range
   covering the whole token.  */
cpp_token *
_cpp_lex_direct (cpp_reader *pfile)
{
  cppchar_t c;
  cpp_buffer *buffer;
  const unsigned char *comment_start;
  /* Becomes true once a skipped comment satisfied
     fallthrough_comment_p; the next identifier token is then flagged
     PREV_FALLTHROUGH.  */
  bool fallthrough_comment = false;
  /* Claim the next token slot.  */
  cpp_token *result = pfile->cur_token++;

  /* Entered initially and again after every newline.  */
 fresh_line:
  result->flags = 0;
  buffer = pfile->buffer;
  if (buffer->need_line)
    {
      if (pfile->state.in_deferred_pragma)
        {
          /* The line holding a deferred pragma is finished: report that
             with a CPP_PRAGMA_EOL token and leave deferred-pragma
             state.  */
          result->type = CPP_PRAGMA_EOL;
          pfile->state.in_deferred_pragma = false;
          if (!pfile->state.pragma_allow_expansion)
            pfile->state.prevent_expansion--;
          return result;
        }
      if (!_cpp_get_fresh_line (pfile))
        {
          /* No further line could be obtained.  */
          result->type = CPP_EOF;
          if (!pfile->state.in_directive)
            {
              /* Tell the compiler the line number of the EOF token.  */
              result->src_loc = pfile->line_table->highest_line;
              result->flags = BOL;
              /* Now pop the buffer that _cpp_get_fresh_line did not.  */
              _cpp_pop_buffer (pfile);
            }
          return result;
        }
      /* _cpp_get_fresh_line may have switched to a different buffer; a
         fallthrough comment seen in the old one no longer applies.  */
      if (buffer != pfile->buffer)
        fallthrough_comment = false;
      if (!pfile->keep_tokens)
        {
          /* Earlier tokens need not be kept: restart at the first slot
             of the base token run.  */
          pfile->cur_run = &pfile->base_run;
          result = pfile->base_run.base;
          pfile->cur_token = result + 1;
        }
      /* This token is the first one on its line.  */
      result->flags = BOL;
      if (pfile->state.parsing_args == 2)
        result->flags |= PREV_WHITE;
    }
  buffer = pfile->buffer;
  /* Re-entered after a comment has been skipped.  */
 update_tokens_line:
  result->src_loc = pfile->line_table->highest_line;

  /* Re-entered after horizontal whitespace has been skipped.  */
 skipped_white:
  if (buffer->cur >= buffer->notes[buffer->cur_note].pos
      && !pfile->overlaid_buffer)
    {
      _cpp_process_line_notes (pfile, false);
      result->src_loc = pfile->line_table->highest_line;
    }
  /* Fetch the character that starts the token; buffer->cur now points
     just past it.  */
  c = *buffer->cur++;

  if (pfile->forced_token_location)
    result->src_loc = pfile->forced_token_location;
  else
    result->src_loc = linemap_position_for_column (pfile->line_table,
                                          CPP_BUF_COLUMN (buffer, buffer->cur));

  switch (c)
    {
    case ' ': case '\t': case '\f': case '\v': case '\0':
      result->flags |= PREV_WHITE;
      skip_whitespace (pfile, c);
      goto skipped_white;

    case '\n':
      /* Increment the line, unless this is the last line ...  */
      if (buffer->cur < buffer->rlimit
          /* ... or this is a #include, (where _cpp_stack_file needs to
             unwind by one line) ...  */
          || (pfile->state.in_directive > 1
              /* ... except traditional-cpp increments this elsewhere.  */
              && !CPP_OPTION (pfile, traditional)))
        CPP_INCREMENT_LINE (pfile, 0);
      buffer->need_line = true;
      goto fresh_line;

    case '0': case '1': case '2': case '3': case '4':
    case '5': case '6': case '7': case '8': case '9':
      {
        struct normalize_state nst = INITIAL_NORMALIZE_STATE;
        result->type = CPP_NUMBER;
        lex_number (pfile, &result->val.str, &nst);
        warn_about_normalization (pfile, result, &nst);
        break;
      }

    case 'L':
    case 'u':
    case 'U':
    case 'R':
      /* 'L', 'u', 'U', 'u8' or 'R' may introduce wide characters,
         wide strings or raw strings.  */
      if (c == 'L' || CPP_OPTION (pfile, rliterals)
          || (c != 'R' && CPP_OPTION (pfile, uliterals)))
        {
          /* Accepted continuations: a quote character directly after
             the prefix (a single quote only when the prefix is not
             'R'), an 'R' followed by a double quote, or, after 'u',
             an '8' followed by a quote or by 'R' and a double quote.  */
          if ((*buffer->cur == '\'' && c != 'R')
              || *buffer->cur == '"'
              || (*buffer->cur == 'R'
                  && c != 'R'
                  && buffer->cur[1] == '"'
                  && CPP_OPTION (pfile, rliterals))
              || (*buffer->cur == '8'
                  && c == 'u'
                  && ((buffer->cur[1] == '"' || (buffer->cur[1] == '\''
                                && CPP_OPTION (pfile, utf8_char_literals)))
                      || (buffer->cur[1] == 'R' && buffer->cur[2] == '"'
                          && CPP_OPTION (pfile, rliterals)))))
            {
              lex_string (pfile, result, buffer->cur - 1);
              break;
            }
        }
      /* Fall through.  */

      /* The letters 'L', 'R', 'U' and 'u' are missing from the list
         below because they are handled by the case above, which falls
         through to here when no literal prefix is recognized.  */
    case '_':
    case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
    case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
    case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
    case 's': case 't':           case 'v': case 'w': case 'x':
    case 'y': case 'z':
    case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
    case 'G': case 'H': case 'I': case 'J': case 'K':
    case 'M': case 'N': case 'O': case 'P': case 'Q':
    case 'S': case 'T':           case 'V': case 'W': case 'X':
    case 'Y': case 'Z':
      result->type = CPP_NAME;
      {
        struct normalize_state nst = INITIAL_NORMALIZE_STATE;
        result->val.node.node = lex_identifier (pfile, buffer->cur - 1, false,
                                                &nst,
                                                &result->val.node.spelling);
        warn_about_normalization (pfile, result, &nst);
      }

      /* Convert named operators to their proper types.  */
      if (result->val.node.node->flags & NODE_OPERATOR)
        {
          result->flags |= NAMED_OP;
          result->type = (enum cpp_ttype) result->val.node.node->directive_index;
        }

      /* Signal FALLTHROUGH comment followed by another token.  */
      if (fallthrough_comment)
        result->flags |= PREV_FALLTHROUGH;
      break;

    case '\'':
    case '"':
      lex_string (pfile, result, buffer->cur - 1);
      break;

    case '/':
      /* A potential block or line comment.  */
      comment_start = buffer->cur;
      c = *buffer->cur;

      if (c == '*')
        {
          if (_cpp_skip_block_comment (pfile))
            cpp_error (pfile, CPP_DL_ERROR, "unterminated comment");
        }
      else if (c == '/' && ! CPP_OPTION (pfile, traditional))
        {
          /* Don't warn for system headers.  */
          if (cpp_in_system_header (pfile))
            ;
          /* Warn about comments if pedantically GNUC89, and not
             in system headers.  */
          else if (CPP_OPTION (pfile, lang) == CLK_GNUC89
                   && CPP_PEDANTIC (pfile)
                   && ! buffer->warned_cplusplus_comments)
            {
              if (cpp_error (pfile, CPP_DL_PEDWARN,
                             "C++ style comments are not allowed in ISO C90"))
                cpp_error (pfile, CPP_DL_NOTE,
                           "(this will be reported only once per input file)");
              buffer->warned_cplusplus_comments = 1;
            }
          /* Or if specifically desired via -Wc90-c99-compat.  */
          else if (CPP_OPTION (pfile, cpp_warn_c90_c99_compat) > 0
                   && ! CPP_OPTION (pfile, cplusplus)
                   && ! buffer->warned_cplusplus_comments)
            {
              if (cpp_error (pfile, CPP_DL_WARNING,
                             "C++ style comments are incompatible with C90"))
                cpp_error (pfile, CPP_DL_NOTE,
                           "(this will be reported only once per input file)");
              buffer->warned_cplusplus_comments = 1;
            }
          /* In C89/C94, C++ style comments are forbidden.  */
          else if ((CPP_OPTION (pfile, lang) == CLK_STDC89
                    || CPP_OPTION (pfile, lang) == CLK_STDC94))
            {
              /* But don't be confused about valid code such as
                 - // immediately followed by *,
                 - // in a preprocessing directive,
                 - // in an #if 0 block.  */
              if (buffer->cur[1] == '*'
                  || pfile->state.in_directive
                  || pfile->state.skipping)
                {
                  /* Treat the first '/' as a division operator; the
                     second one has not been consumed.  */
                  result->type = CPP_DIV;
                  break;
                }
              else if (! buffer->warned_cplusplus_comments)
                {
                  if (cpp_error (pfile, CPP_DL_ERROR,
                                 "C++ style comments are not allowed in "
                                 "ISO C90"))
                    cpp_error (pfile, CPP_DL_NOTE,
                               "(this will be reported only once per input "
                               "file)");
                  buffer->warned_cplusplus_comments = 1;
                }
            }
          if (skip_line_comment (pfile) && CPP_OPTION (pfile, warn_comments))
            cpp_warning (pfile, CPP_W_COMMENTS, "multi-line comment");
        }
      else if (c == '=')
        {
          buffer->cur++;
          result->type = CPP_DIV_EQ;
          break;
        }
      else
        {
          result->type = CPP_DIV;
          break;
        }

      /* Only reached after a block or line comment has been skipped.  */
      if (fallthrough_comment_p (pfile, comment_start))
        fallthrough_comment = true;

      if (pfile->cb.comment)
        {
          /* comment_start points one past the leading '/', hence the
             adjustments by one below.  */
          size_t len = pfile->buffer->cur - comment_start;
          pfile->cb.comment (pfile, result->src_loc, comment_start - 1,
                             len + 1);
        }

      if (!pfile->state.save_comments)
        {
          /* The comment is dropped; it counts as whitespace before the
             next token.  */
          result->flags |= PREV_WHITE;
          goto update_tokens_line;
        }

      if (fallthrough_comment)
        result->flags |= PREV_FALLTHROUGH;

      /* Save the comment as a token in its own right.  */
      save_comment (pfile, result, comment_start, c);
      break;

    case '<':
      if (pfile->state.angled_headers)
        {
          /* Try to lex a <header-name>; unless lex_string left the
             type as CPP_LESS, the token is complete.  */
          lex_string (pfile, result, buffer->cur - 1);
          if (result->type != CPP_LESS)
            break;
        }

      result->type = CPP_LESS;
      if (*buffer->cur == '=')
        {
          buffer->cur++, result->type = CPP_LESS_EQ;
          if (*buffer->cur == '>'
              && CPP_OPTION (pfile, cplusplus)
              && CPP_OPTION (pfile, lang) >= CLK_GNUCXX20)
            buffer->cur++, result->type = CPP_SPACESHIP;
        }
      else if (*buffer->cur == '<')
        {
          buffer->cur++;
          IF_NEXT_IS ('=', CPP_LSHIFT_EQ, CPP_LSHIFT);
        }
      else if (CPP_OPTION (pfile, digraphs))
        {
          if (*buffer->cur == ':')
            {
              /* C++11 [2.5/3 lex.pptoken], "Otherwise, if the next
                 three characters are <:: and the subsequent character
                 is neither : nor >, the < is treated as a preprocessor
                 token by itself".  */
              if (CPP_OPTION (pfile, cplusplus)
                  && CPP_OPTION (pfile, lang) != CLK_CXX98
                  && CPP_OPTION (pfile, lang) != CLK_GNUCXX
                  && buffer->cur[1] == ':'
                  && buffer->cur[2] != ':' && buffer->cur[2] != '>')
                break;

              buffer->cur++;
              result->flags |= DIGRAPH;
              result->type = CPP_OPEN_SQUARE;
            }
          else if (*buffer->cur == '%')
            {
              buffer->cur++;
              result->flags |= DIGRAPH;
              result->type = CPP_OPEN_BRACE;
            }
        }
      break;

    case '>':
      result->type = CPP_GREATER;
      if (*buffer->cur == '=')
        buffer->cur++, result->type = CPP_GREATER_EQ;
      else if (*buffer->cur == '>')
        {
          buffer->cur++;
          IF_NEXT_IS ('=', CPP_RSHIFT_EQ, CPP_RSHIFT);
        }
      break;

    case '%':
      result->type = CPP_MOD;
      if (*buffer->cur == '=')
        buffer->cur++, result->type = CPP_MOD_EQ;
      else if (CPP_OPTION (pfile, digraphs))
        {
          if (*buffer->cur == ':')
            {
              /* "%:" is the digraph form of '#', and "%:%:" that of
                 "##".  */
              buffer->cur++;
              result->flags |= DIGRAPH;
              result->type = CPP_HASH;
              if (*buffer->cur == '%' && buffer->cur[1] == ':')
                buffer->cur += 2, result->type = CPP_PASTE, result->val.token_no = 0;
            }
          else if (*buffer->cur == '>')
            {
              buffer->cur++;
              result->flags |= DIGRAPH;
              result->type = CPP_CLOSE_BRACE;
            }
        }
      break;

    case '.':
      result->type = CPP_DOT;
      if (ISDIGIT (*buffer->cur))
        {
          /* A '.' directly followed by a digit starts a number.  */
          struct normalize_state nst = INITIAL_NORMALIZE_STATE;
          result->type = CPP_NUMBER;
          lex_number (pfile, &result->val.str, &nst);
          warn_about_normalization (pfile, result, &nst);
        }
      else if (*buffer->cur == '.' && buffer->cur[1] == '.')
        buffer->cur += 2, result->type = CPP_ELLIPSIS;
      else if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
        buffer->cur++, result->type = CPP_DOT_STAR;
      break;

    case '+':
      result->type = CPP_PLUS;
      if (*buffer->cur == '+')
        buffer->cur++, result->type = CPP_PLUS_PLUS;
      else if (*buffer->cur == '=')
        buffer->cur++, result->type = CPP_PLUS_EQ;
      break;

    case '-':
      result->type = CPP_MINUS;
      if (*buffer->cur == '>')
        {
          buffer->cur++;
          result->type = CPP_DEREF;
          if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
            buffer->cur++, result->type = CPP_DEREF_STAR;
        }
      else if (*buffer->cur == '-')
        buffer->cur++, result->type = CPP_MINUS_MINUS;
      else if (*buffer->cur == '=')
        buffer->cur++, result->type = CPP_MINUS_EQ;
      break;

    case '&':
      result->type = CPP_AND;
      if (*buffer->cur == '&')
        buffer->cur++, result->type = CPP_AND_AND;
      else if (*buffer->cur == '=')
        buffer->cur++, result->type = CPP_AND_EQ;
      break;

    case '|':
      result->type = CPP_OR;
      if (*buffer->cur == '|')
        buffer->cur++, result->type = CPP_OR_OR;
      else if (*buffer->cur == '=')
        buffer->cur++, result->type = CPP_OR_EQ;
      break;

    case ':':
      result->type = CPP_COLON;
      if (*buffer->cur == ':' && CPP_OPTION (pfile, scope))
        buffer->cur++, result->type = CPP_SCOPE;
      else if (*buffer->cur == '>' && CPP_OPTION (pfile, digraphs))
        {
          buffer->cur++;
          result->flags |= DIGRAPH;
          result->type = CPP_CLOSE_SQUARE;
        }
      break;

    case '*': IF_NEXT_IS ('=', CPP_MULT_EQ, CPP_MULT); break;
    case '=': IF_NEXT_IS ('=', CPP_EQ_EQ, CPP_EQ); break;
    case '!': IF_NEXT_IS ('=', CPP_NOT_EQ, CPP_NOT); break;
    case '^': IF_NEXT_IS ('=', CPP_XOR_EQ, CPP_XOR); break;
    case '#': IF_NEXT_IS ('#', CPP_PASTE, CPP_HASH); result->val.token_no = 0; break;

    case '?': result->type = CPP_QUERY; break;
    case '~': result->type = CPP_COMPL; break;
    case ',': result->type = CPP_COMMA; break;
    case '(': result->type = CPP_OPEN_PAREN; break;
    case ')': result->type = CPP_CLOSE_PAREN; break;
    case '[': result->type = CPP_OPEN_SQUARE; break;
    case ']': result->type = CPP_CLOSE_SQUARE; break;
    case '{': result->type = CPP_OPEN_BRACE; break;
    case '}': result->type = CPP_CLOSE_BRACE; break;
    case ';': result->type = CPP_SEMICOLON; break;

      /* @ is a punctuator in Objective-C.  */
    case '@': result->type = CPP_ATSIGN; break;

    default:
      {
        /* Step back so that BASE and buffer->cur both point at the
           character just read.  */
        const uchar *base = --buffer->cur;

        /* Check for an extended identifier ($ or UCN or UTF-8).  */
        struct normalize_state nst = INITIAL_NORMALIZE_STATE;
        if (forms_identifier_p (pfile, true, &nst))
          {
            result->type = CPP_NAME;
            result->val.node.node = lex_identifier (pfile, base, true, &nst,
                                                    &result->val.node.spelling);
            warn_about_normalization (pfile, result, &nst);
            break;
          }

        /* Otherwise this will form a CPP_OTHER token.  Parse valid UTF-8 as a
           single token.  */
        buffer->cur++;
        if (c >= utf8_signifier)
          {
            const uchar *pstr = base;
            cppchar_t s;
            if (_cpp_valid_utf8 (pfile, &pstr, buffer->rlimit, 0, NULL, &s))
              buffer->cur = pstr;
          }
        create_literal (pfile, result, base, buffer->cur - base, CPP_OTHER);
        break;
      }

    }

  /* Potentially convert the location of the token to a range.  */
  if (result->src_loc >= RESERVED_LOCATION_COUNT
      && result->type != CPP_EOF)
    {
      /* Ensure that any line notes are processed, so that we have the
         correct physical line/column for the end-point of the token even
         when a logical line is split via one or more backslashes.  */
      if (buffer->cur >= buffer->notes[buffer->cur_note].pos
          && !pfile->overlaid_buffer)
        _cpp_process_line_notes (pfile, false);

      /* The range starts at the location recorded above and finishes
         at the column of the current read position.  */
      source_range tok_range;
      tok_range.m_start = result->src_loc;
      tok_range.m_finish
        = linemap_position_for_column (pfile->line_table,
                                       CPP_BUF_COLUMN (buffer, buffer->cur));

      result->src_loc = COMBINE_LOCATION_DATA (pfile->line_table,
                                               result->src_loc,
                                               tok_range, NULL);
    }

  return result;
}
3233
3234 /* An upper bound on the number of bytes needed to spell TOKEN.
3235    Does not include preceding whitespace.  */
3236 unsigned int
3237 cpp_token_len (const cpp_token *token)
3238 {
3239   unsigned int len;
3240
3241   switch (TOKEN_SPELL (token))
3242     {
3243     default:            len = 6;                                break;
3244     case SPELL_LITERAL: len = token->val.str.len;               break;
3245     case SPELL_IDENT:   len = NODE_LEN (token->val.node.node) * 10;     break;
3246     }
3247
3248   return len;
3249 }
3250
/* Decode the UTF-8 sequence starting at NAME and store the matching
   escape, a backslash and 'U' followed by eight lower-case hexadecimal
   digits, in BUFFER.  Exactly ten bytes are written and no terminator
   is added.  Returns the number of bytes of NAME that made up the
   sequence, as given by the count of leading one bits in the first
   byte.  Aborts if a byte after the first is not a UTF-8 continuation
   byte.  */

static size_t
utf8_to_ucn (unsigned char *buffer, const unsigned char *name)
{
  static const char hex_digits[] = "0123456789abcdef";
  unsigned int lead = *name;
  unsigned long code_point;
  int seq_len = 0;
  int idx;
  int shift;

  /* The number of leading one bits in the first byte gives the length
     of the whole sequence.  */
  while (lead & 0x80)
    {
      seq_len++;
      lead <<= 1;
    }

  /* The bits of the first byte below the length marker are payload.  */
  code_point = name[0] & (0x7F >> seq_len);

  /* Each continuation byte contributes six more bits.  */
  for (idx = 1; idx < seq_len; idx++)
    {
      unsigned char trail = name[idx];

      /* Ill-formed UTF-8.  */
      if ((trail & 0xC0) != 0x80)
        abort ();

      code_point = (code_point << 6) | (trail & 0x3F);
    }

  buffer[0] = '\\';
  buffer[1] = 'U';
  for (shift = 28, idx = 2; shift >= 0; shift -= 4, idx++)
    buffer[idx] = hex_digits[(code_point >> shift) & 0xF];

  return seq_len;
}
3284
3285 /* Given a token TYPE corresponding to a digraph, return a pointer to
3286    the spelling of the digraph.  */
3287 static const unsigned char *
3288 cpp_digraph2name (enum cpp_ttype type)
3289 {
3290   return digraph_spellings[(int) type - (int) CPP_FIRST_DIGRAPH];
3291 }
3292
3293 /* Write the spelling of an identifier IDENT, using UCNs, to BUFFER.
3294    The buffer must already contain the enough space to hold the
3295    token's spelling.  Returns a pointer to the character after the
3296    last character written.  */
3297 unsigned char *
3298 _cpp_spell_ident_ucns (unsigned char *buffer, cpp_hashnode *ident)
3299 {
3300   size_t i;
3301   const unsigned char *name = NODE_NAME (ident);
3302
3303   for (i = 0; i < NODE_LEN (ident); i++)
3304     if (name[i] & ~0x7F)
3305       {
3306         i += utf8_to_ucn (buffer, name + i) - 1;
3307         buffer += 10;
3308       }
3309     else
3310       *buffer++ = name[i];
3311
3312   return buffer;
3313 }
3314
3315 /* Write the spelling of a token TOKEN to BUFFER.  The buffer must
3316    already contain the enough space to hold the token's spelling.
3317    Returns a pointer to the character after the last character written.
3318    FORSTRING is true if this is to be the spelling after translation
3319    phase 1 (with the original spelling of extended identifiers), false
3320    if extended identifiers should always be written using UCNs (there is
3321    no option for always writing them in the internal UTF-8 form).
3322    FIXME: Would be nice if we didn't need the PFILE argument.  */
3323 unsigned char *
3324 cpp_spell_token (cpp_reader *pfile, const cpp_token *token,
3325                  unsigned char *buffer, bool forstring)
3326 {
3327   switch (TOKEN_SPELL (token))
3328     {
3329     case SPELL_OPERATOR:
3330       {
3331         const unsigned char *spelling;
3332         unsigned char c;
3333
3334         if (token->flags & DIGRAPH)
3335           spelling = cpp_digraph2name (token->type);
3336         else if (token->flags & NAMED_OP)
3337           goto spell_ident;
3338         else
3339           spelling = TOKEN_NAME (token);
3340
3341         while ((c = *spelling++) != '\0')
3342           *buffer++ = c;
3343       }
3344       break;
3345
3346     spell_ident:
3347     case SPELL_IDENT:
3348       if (forstring)
3349         {
3350           memcpy (buffer, NODE_NAME (token->val.node.spelling),
3351                   NODE_LEN (token->val.node.spelling));
3352           buffer += NODE_LEN (token->val.node.spelling);
3353         }
3354       else
3355         buffer = _cpp_spell_ident_ucns (buffer, token->val.node.node);
3356       break;
3357
3358     case SPELL_LITERAL:
3359       memcpy (buffer, token->val.str.text, token->val.str.len);
3360       buffer += token->val.str.len;
3361       break;
3362
3363     case SPELL_NONE:
3364       cpp_error (pfile, CPP_DL_ICE,
3365                  "unspellable token %s", TOKEN_NAME (token));
3366       break;
3367     }
3368
3369   return buffer;
3370 }
3371
3372 /* Returns TOKEN spelt as a null-terminated string.  The string is
3373    freed when the reader is destroyed.  Useful for diagnostics.  */
3374 unsigned char *
3375 cpp_token_as_text (cpp_reader *pfile, const cpp_token *token)
3376 {
3377   unsigned int len = cpp_token_len (token) + 1;
3378   unsigned char *start = _cpp_unaligned_alloc (pfile, len), *end;
3379
3380   end = cpp_spell_token (pfile, token, start, false);
3381   end[0] = '\0';
3382
3383   return start;
3384 }
3385
3386 /* Returns a pointer to a string which spells the token defined by
3387    TYPE and FLAGS.  Used by C front ends, which really should move to
3388    using cpp_token_as_text.  */
3389 const char *
3390 cpp_type2name (enum cpp_ttype type, unsigned char flags)
3391 {
3392   if (flags & DIGRAPH)
3393     return (const char *) cpp_digraph2name (type);
3394   else if (flags & NAMED_OP)
3395     return cpp_named_operator2name (type);
3396
3397   return (const char *) token_spellings[type].name;
3398 }
3399
3400 /* Writes the spelling of token to FP, without any preceding space.
3401    Separated from cpp_spell_token for efficiency - to avoid stdio
3402    double-buffering.  */
3403 void
3404 cpp_output_token (const cpp_token *token, FILE *fp)
3405 {
3406   switch (TOKEN_SPELL (token))
3407     {
3408     case SPELL_OPERATOR:
3409       {
3410         const unsigned char *spelling;
3411         int c;
3412
3413         if (token->flags & DIGRAPH)
3414           spelling = cpp_digraph2name (token->type);
3415         else if (token->flags & NAMED_OP)
3416           goto spell_ident;
3417         else
3418           spelling = TOKEN_NAME (token);
3419
3420         c = *spelling;
3421         do
3422           putc (c, fp);
3423         while ((c = *++spelling) != '\0');
3424       }
3425       break;
3426
3427     spell_ident:
3428     case SPELL_IDENT:
3429       {
3430         size_t i;
3431         const unsigned char * name = NODE_NAME (token->val.node.node);
3432
3433         for (i = 0; i < NODE_LEN (token->val.node.node); i++)
3434           if (name[i] & ~0x7F)
3435             {
3436               unsigned char buffer[10];
3437               i += utf8_to_ucn (buffer, name + i) - 1;
3438               fwrite (buffer, 1, 10, fp);
3439             }
3440           else
3441             fputc (NODE_NAME (token->val.node.node)[i], fp);
3442       }
3443       break;
3444
3445     case SPELL_LITERAL:
3446       fwrite (token->val.str.text, 1, token->val.str.len, fp);
3447       break;
3448
3449     case SPELL_NONE:
3450       /* An error, most probably.  */
3451       break;
3452     }
3453 }
3454
3455 /* Compare two tokens.  */
3456 int
3457 _cpp_equiv_tokens (const cpp_token *a, const cpp_token *b)
3458 {
3459   if (a->type == b->type && a->flags == b->flags)
3460     switch (TOKEN_SPELL (a))
3461       {
3462       default:                  /* Keep compiler happy.  */
3463       case SPELL_OPERATOR:
3464         /* token_no is used to track where multiple consecutive ##
3465            tokens were originally located.  */
3466         return (a->type != CPP_PASTE || a->val.token_no == b->val.token_no);
3467       case SPELL_NONE:
3468         return (a->type != CPP_MACRO_ARG
3469                 || (a->val.macro_arg.arg_no == b->val.macro_arg.arg_no
3470                     && a->val.macro_arg.spelling == b->val.macro_arg.spelling));
3471       case SPELL_IDENT:
3472         return (a->val.node.node == b->val.node.node
3473                 && a->val.node.spelling == b->val.node.spelling);
3474       case SPELL_LITERAL:
3475         return (a->val.str.len == b->val.str.len
3476                 && !memcmp (a->val.str.text, b->val.str.text,
3477                             a->val.str.len));
3478       }
3479
3480   return 0;
3481 }
3482
3483 /* Returns nonzero if a space should be inserted to avoid an
3484    accidental token paste for output.  For simplicity, it is
3485    conservative, and occasionally advises a space where one is not
3486    needed, e.g. "." and ".2".  */
3487 int
3488 cpp_avoid_paste (cpp_reader *pfile, const cpp_token *token1,
3489                  const cpp_token *token2)
3490 {
3491   enum cpp_ttype a = token1->type, b = token2->type;
3492   cppchar_t c;
3493
3494   if (token1->flags & NAMED_OP)
3495     a = CPP_NAME;
3496   if (token2->flags & NAMED_OP)
3497     b = CPP_NAME;
3498
3499   c = EOF;
3500   if (token2->flags & DIGRAPH)
3501     c = digraph_spellings[(int) b - (int) CPP_FIRST_DIGRAPH][0];
3502   else if (token_spellings[b].category == SPELL_OPERATOR)
3503     c = token_spellings[b].name[0];
3504
3505   /* Quickly get everything that can paste with an '='.  */
3506   if ((int) a <= (int) CPP_LAST_EQ && c == '=')
3507     return 1;
3508
3509   switch (a)
3510     {
3511     case CPP_GREATER:   return c == '>';
3512     case CPP_LESS:      return c == '<' || c == '%' || c == ':';
3513     case CPP_PLUS:      return c == '+';
3514     case CPP_MINUS:     return c == '-' || c == '>';
3515     case CPP_DIV:       return c == '/' || c == '*'; /* Comments.  */
3516     case CPP_MOD:       return c == ':' || c == '>';
3517     case CPP_AND:       return c == '&';
3518     case CPP_OR:        return c == '|';
3519     case CPP_COLON:     return c == ':' || c == '>';
3520     case CPP_DEREF:     return c == '*';
3521     case CPP_DOT:       return c == '.' || c == '%' || b == CPP_NUMBER;
3522     case CPP_HASH:      return c == '#' || c == '%'; /* Digraph form.  */
3523     case CPP_NAME:      return ((b == CPP_NUMBER
3524                                  && name_p (pfile, &token2->val.str))
3525                                 || b == CPP_NAME
3526                                 || b == CPP_CHAR || b == CPP_STRING); /* L */
3527     case CPP_NUMBER:    return (b == CPP_NUMBER || b == CPP_NAME
3528                                 || c == '.' || c == '+' || c == '-');
3529                                       /* UCNs */
3530     case CPP_OTHER:     return ((token1->val.str.text[0] == '\\'
3531                                  && b == CPP_NAME)
3532                                 || (CPP_OPTION (pfile, objc)
3533                                     && token1->val.str.text[0] == '@'
3534                                     && (b == CPP_NAME || b == CPP_STRING)));
3535     case CPP_LESS_EQ:   return c == '>';
3536     case CPP_STRING:
3537     case CPP_WSTRING:
3538     case CPP_UTF8STRING:
3539     case CPP_STRING16:
3540     case CPP_STRING32:  return (CPP_OPTION (pfile, user_literals)
3541                                 && (b == CPP_NAME
3542                                     || (TOKEN_SPELL (token2) == SPELL_LITERAL
3543                                         && ISIDST (token2->val.str.text[0]))));
3544
3545     default:            break;
3546     }
3547
3548   return 0;
3549 }
3550
3551 /* Output all the remaining tokens on the current line, and a newline
3552    character, to FP.  Leading whitespace is removed.  If there are
3553    macros, special token padding is not performed.  */
3554 void
3555 cpp_output_line (cpp_reader *pfile, FILE *fp)
3556 {
3557   const cpp_token *token;
3558
3559   token = cpp_get_token (pfile);
3560   while (token->type != CPP_EOF)
3561     {
3562       cpp_output_token (token, fp);
3563       token = cpp_get_token (pfile);
3564       if (token->flags & PREV_WHITE)
3565         putc (' ', fp);
3566     }
3567
3568   putc ('\n', fp);
3569 }
3570
3571 /* Return a string representation of all the remaining tokens on the
3572    current line.  The result is allocated using xmalloc and must be
3573    freed by the caller.  */
3574 unsigned char *
3575 cpp_output_line_to_string (cpp_reader *pfile, const unsigned char *dir_name)
3576 {
3577   const cpp_token *token;
3578   unsigned int out = dir_name ? ustrlen (dir_name) : 0;
3579   unsigned int alloced = 120 + out;
3580   unsigned char *result = (unsigned char *) xmalloc (alloced);
3581
3582   /* If DIR_NAME is empty, there are no initial contents.  */
3583   if (dir_name)
3584     {
3585       sprintf ((char *) result, "#%s ", dir_name);
3586       out += 2;
3587     }
3588
3589   token = cpp_get_token (pfile);
3590   while (token->type != CPP_EOF)
3591     {
3592       unsigned char *last;
3593       /* Include room for a possible space and the terminating nul.  */
3594       unsigned int len = cpp_token_len (token) + 2;
3595
3596       if (out + len > alloced)
3597         {
3598           alloced *= 2;
3599           if (out + len > alloced)
3600             alloced = out + len;
3601           result = (unsigned char *) xrealloc (result, alloced);
3602         }
3603
3604       last = cpp_spell_token (pfile, token, &result[out], 0);
3605       out = last - result;
3606
3607       token = cpp_get_token (pfile);
3608       if (token->flags & PREV_WHITE)
3609         result[out++] = ' ';
3610     }
3611
3612   result[out] = '\0';
3613   return result;
3614 }
3615
/* Memory buffers.  Changing these three constants can have a dramatic
   effect on performance.  The values here are reasonable defaults,
   but might be tuned.  If you adjust them, be sure to test across a
   range of uses of cpplib, including heavy nested function-like macro
   expansion.  Also check the change in peak memory usage (NJAMD is a
   good tool for this).

   MIN_BUFF_SIZE is the smallest usable size new_buff will allocate.
   BUFF_SIZE_UPPER_BOUND (MIN_SIZE) is the largest free buffer that
   _cpp_get_buff will hand out for a request of MIN_SIZE bytes.
   EXTENDED_BUFF_SIZE (BUFF, MIN_EXTRA) is the size requested when
   extending BUFF: MIN_EXTRA plus twice the room left in BUFF.  Every
   macro argument is parenthesized so that arbitrary expressions can
   be passed safely.  */
#define MIN_BUFF_SIZE 8000
#define BUFF_SIZE_UPPER_BOUND(MIN_SIZE) (MIN_BUFF_SIZE + (MIN_SIZE) * 3 / 2)
#define EXTENDED_BUFF_SIZE(BUFF, MIN_EXTRA) \
        ((MIN_EXTRA) + ((BUFF)->limit - (BUFF)->cur) * 2)

#if MIN_BUFF_SIZE > BUFF_SIZE_UPPER_BOUND (0)
  #error BUFF_SIZE_UPPER_BOUND must be at least as large as MIN_BUFF_SIZE!
#endif
3630
3631 /* Create a new allocation buffer.  Place the control block at the end
3632    of the buffer, so that buffer overflows will cause immediate chaos.  */
3633 static _cpp_buff *
3634 new_buff (size_t len)
3635 {
3636   _cpp_buff *result;
3637   unsigned char *base;
3638
3639   if (len < MIN_BUFF_SIZE)
3640     len = MIN_BUFF_SIZE;
3641   len = CPP_ALIGN (len);
3642
3643 #ifdef ENABLE_VALGRIND_ANNOTATIONS
3644   /* Valgrind warns about uses of interior pointers, so put _cpp_buff
3645      struct first.  */
3646   size_t slen = CPP_ALIGN2 (sizeof (_cpp_buff), 2 * DEFAULT_ALIGNMENT);
3647   base = XNEWVEC (unsigned char, len + slen);
3648   result = (_cpp_buff *) base;
3649   base += slen;
3650 #else
3651   base = XNEWVEC (unsigned char, len + sizeof (_cpp_buff));
3652   result = (_cpp_buff *) (base + len);
3653 #endif
3654   result->base = base;
3655   result->cur = base;
3656   result->limit = base + len;
3657   result->next = NULL;
3658   return result;
3659 }
3660
3661 /* Place a chain of unwanted allocation buffers on the free list.  */
3662 void
3663 _cpp_release_buff (cpp_reader *pfile, _cpp_buff *buff)
3664 {
3665   _cpp_buff *end = buff;
3666
3667   while (end->next)
3668     end = end->next;
3669   end->next = pfile->free_buffs;
3670   pfile->free_buffs = buff;
3671 }
3672
3673 /* Return a free buffer of size at least MIN_SIZE.  */
3674 _cpp_buff *
3675 _cpp_get_buff (cpp_reader *pfile, size_t min_size)
3676 {
3677   _cpp_buff *result, **p;
3678
3679   for (p = &pfile->free_buffs;; p = &(*p)->next)
3680     {
3681       size_t size;
3682
3683       if (*p == NULL)
3684         return new_buff (min_size);
3685       result = *p;
3686       size = result->limit - result->base;
3687       /* Return a buffer that's big enough, but don't waste one that's
3688          way too big.  */
3689       if (size >= min_size && size <= BUFF_SIZE_UPPER_BOUND (min_size))
3690         break;
3691     }
3692
3693   *p = result->next;
3694   result->next = NULL;
3695   result->cur = result->base;
3696   return result;
3697 }
3698
3699 /* Creates a new buffer with enough space to hold the uncommitted
3700    remaining bytes of BUFF, and at least MIN_EXTRA more bytes.  Copies
3701    the excess bytes to the new buffer.  Chains the new buffer after
3702    BUFF, and returns the new buffer.  */
3703 _cpp_buff *
3704 _cpp_append_extend_buff (cpp_reader *pfile, _cpp_buff *buff, size_t min_extra)
3705 {
3706   size_t size = EXTENDED_BUFF_SIZE (buff, min_extra);
3707   _cpp_buff *new_buff = _cpp_get_buff (pfile, size);
3708
3709   buff->next = new_buff;
3710   memcpy (new_buff->base, buff->cur, BUFF_ROOM (buff));
3711   return new_buff;
3712 }
3713
3714 /* Creates a new buffer with enough space to hold the uncommitted
3715    remaining bytes of the buffer pointed to by BUFF, and at least
3716    MIN_EXTRA more bytes.  Copies the excess bytes to the new buffer.
3717    Chains the new buffer before the buffer pointed to by BUFF, and
3718    updates the pointer to point to the new buffer.  */
3719 void
3720 _cpp_extend_buff (cpp_reader *pfile, _cpp_buff **pbuff, size_t min_extra)
3721 {
3722   _cpp_buff *new_buff, *old_buff = *pbuff;
3723   size_t size = EXTENDED_BUFF_SIZE (old_buff, min_extra);
3724
3725   new_buff = _cpp_get_buff (pfile, size);
3726   memcpy (new_buff->base, old_buff->cur, BUFF_ROOM (old_buff));
3727   new_buff->next = old_buff;
3728   *pbuff = new_buff;
3729 }
3730
3731 /* Free a chain of buffers starting at BUFF.  */
3732 void
3733 _cpp_free_buff (_cpp_buff *buff)
3734 {
3735   _cpp_buff *next;
3736
3737   for (; buff; buff = next)
3738     {
3739       next = buff->next;
3740 #ifdef ENABLE_VALGRIND_ANNOTATIONS
3741       free (buff);
3742 #else
3743       free (buff->base);
3744 #endif
3745     }
3746 }
3747
3748 /* Allocate permanent, unaligned storage of length LEN.  */
3749 unsigned char *
3750 _cpp_unaligned_alloc (cpp_reader *pfile, size_t len)
3751 {
3752   _cpp_buff *buff = pfile->u_buff;
3753   unsigned char *result = buff->cur;
3754
3755   if (len > (size_t) (buff->limit - result))
3756     {
3757       buff = _cpp_get_buff (pfile, len);
3758       buff->next = pfile->u_buff;
3759       pfile->u_buff = buff;
3760       result = buff->cur;
3761     }
3762
3763   buff->cur = result + len;
3764   return result;
3765 }
3766
3767 /* Allocate permanent, unaligned storage of length LEN from a_buff.
3768    That buffer is used for growing allocations when saving macro
3769    replacement lists in a #define, and when parsing an answer to an
3770    assertion in #assert, #unassert or #if (and therefore possibly
3771    whilst expanding macros).  It therefore must not be used by any
3772    code that they might call: specifically the lexer and the guts of
3773    the macro expander.
3774
3775    All existing other uses clearly fit this restriction: storing
3776    registered pragmas during initialization.  */
3777 unsigned char *
3778 _cpp_aligned_alloc (cpp_reader *pfile, size_t len)
3779 {
3780   _cpp_buff *buff = pfile->a_buff;
3781   unsigned char *result = buff->cur;
3782
3783   if (len > (size_t) (buff->limit - result))
3784     {
3785       buff = _cpp_get_buff (pfile, len);
3786       buff->next = pfile->a_buff;
3787       pfile->a_buff = buff;
3788       result = buff->cur;
3789     }
3790
3791   buff->cur = result + len;
3792   return result;
3793 }
3794
3795 /* Commit or allocate storage from a buffer.  */
3796
3797 void *
3798 _cpp_commit_buff (cpp_reader *pfile, size_t size)
3799 {
3800   void *ptr = BUFF_FRONT (pfile->a_buff);
3801
3802   if (pfile->hash_table->alloc_subobject)
3803     {
3804       void *copy = pfile->hash_table->alloc_subobject (size);
3805       memcpy (copy, ptr, size);
3806       ptr = copy;
3807     }
3808   else
3809     BUFF_FRONT (pfile->a_buff) += size;
3810
3811   return ptr;
3812 }
3813
3814 /* Say which field of TOK is in use.  */
3815
3816 enum cpp_token_fld_kind
3817 cpp_token_val_index (const cpp_token *tok)
3818 {
3819   switch (TOKEN_SPELL (tok))
3820     {
3821     case SPELL_IDENT:
3822       return CPP_TOKEN_FLD_NODE;
3823     case SPELL_LITERAL:
3824       return CPP_TOKEN_FLD_STR;
3825     case SPELL_OPERATOR:
3826       /* Operands which were originally spelled as ident keep around
3827          the node for the exact spelling.  */
3828       if (tok->flags & NAMED_OP)
3829         return CPP_TOKEN_FLD_NODE;
3830       else if (tok->type == CPP_PASTE)
3831         return CPP_TOKEN_FLD_TOKEN_NO;
3832       else
3833         return CPP_TOKEN_FLD_NONE;
3834     case SPELL_NONE:
3835       if (tok->type == CPP_MACRO_ARG)
3836         return CPP_TOKEN_FLD_ARG_NO;
3837       else if (tok->type == CPP_PADDING)
3838         return CPP_TOKEN_FLD_SOURCE;
3839       else if (tok->type == CPP_PRAGMA)
3840         return CPP_TOKEN_FLD_PRAGMA;
3841       /* fall through */
3842     default:
3843       return CPP_TOKEN_FLD_NONE;
3844     }
3845 }
3846
3847 /* All tokens lexed in R after calling this function will be forced to
3848    have their location_t to be P, until
3849    cpp_stop_forcing_token_locations is called for R.  */
3850
3851 void
3852 cpp_force_token_locations (cpp_reader *r, location_t loc)
3853 {
3854   r->forced_token_location = loc;
3855 }
3856
3857 /* Go back to assigning locations naturally for lexed tokens.  */
3858
3859 void
3860 cpp_stop_forcing_token_locations (cpp_reader *r)
3861 {
3862   r->forced_token_location = 0;
3863 }
3864
/* PEEK points at a backslash.  While that backslash escapes an end of
   line ("\n", "\r" or "\r\n"), step over the backslash and the line
   ending, repeating if the next character is again a backslash.  A
   step that would land at or beyond LIMIT is not taken.  Returns the
   resulting position, which is PEEK itself when nothing was
   skipped.  */

static const unsigned char *
do_peek_backslash (const unsigned char *peek, const unsigned char *limit)
{
  for (;;)
    {
      int skip;

      if (__builtin_expect (peek[1] == '\n', true))
        skip = 2;
      else if (__builtin_expect (peek[1] == '\r', false))
        /* A "\r\n" pair counts as a single line ending.  */
        skip = peek[2] == '\n' ? 3 : 2;
      else
        /* Not an escaped line ending.  */
        return peek;

      const unsigned char *after = peek + skip;
      if (!__builtin_expect (after < limit, true))
        return peek;

      peek = after;
      /* The user might be perverse and chain several of these.  */
      if (*peek != '\\')
        return peek;
    }
}
3894
/* Return the position of the character to examine at PEEK: PEEK
   itself, unless it holds a backslash, in which case
   do_peek_backslash decides how far to look past it (bounded by
   LIMIT).  */
static const unsigned char *
do_peek_next (const unsigned char *peek, const unsigned char *limit)
{
  return (__builtin_expect (*peek == '\\', false)
          ? do_peek_backslash (peek, limit)
          : peek);
}
3902
/* Step back one character from PEEK without going before BOUND.
   Returns NULL if PEEK is already at BOUND.  Otherwise returns a
   pointer to the preceding character, except that when that character
   is a line ending ("\n", "\r" or "\r\n") immediately preceded by a
   backslash, the escaped line ending is stepped over and the search
   continues from the backslash (so the result is the character before
   the backslash, or NULL if the backslash sits at BOUND).  */
static const unsigned char *
do_peek_prev (const unsigned char *peek, const unsigned char *bound)
{
  if (peek == bound)
    return NULL;

  unsigned char c = *--peek;
  /* Only a line ending can be the tail of an escaped newline.  The
     carriage return must be tested as '\r'; testing the letter 'r'
     would both miss backslash-CR and mistake a backslash followed by
     the letter r for one.  */
  if (__builtin_expect (c == '\n', false)
      || __builtin_expect (c == '\r', false))
    {
      if (peek == bound)
        return peek;

      /* Offset from PEEK of the character that would have to be the
         escaping backslash.  */
      int ix = -1;
      if (c == '\n' && peek[ix] == '\r')
        {
          /* "\r\n" pair: the backslash, if any, is one further back.  */
          if (peek + ix == bound)
            return peek;
          ix--;
        }

      if (peek[ix] == '\\')
        return do_peek_prev (peek + ix, bound);

      return peek;
    }
  else
    return peek;
}
3931
3932 /* Directives-only scanning.  Somewhat more relaxed than correct
3933    parsing -- some ill-formed programs will not be rejected.  */
3934
3935 void
3936 cpp_directive_only_process (cpp_reader *pfile,
3937                             void *data,
3938                             void (*cb) (cpp_reader *, CPP_DO_task, void *, ...))
3939 {
3940   do
3941     {
3942     restart:
3943       /* Buffer initialization, but no line cleaning. */
3944       cpp_buffer *buffer = pfile->buffer;
3945       buffer->cur_note = buffer->notes_used = 0;
3946       buffer->cur = buffer->line_base = buffer->next_line;
3947       buffer->need_line = false;
3948       /* Files always end in a newline.  We rely on this for
3949          character peeking safety.  */
3950       gcc_assert (buffer->rlimit[-1] == '\n');
3951
3952       const unsigned char *base = buffer->cur;
3953       unsigned line_count = 0;
3954       const unsigned char *line_start = base;
3955
3956       bool bol = true;
3957       bool raw = false;
3958
3959       const unsigned char *lwm = base;
3960       for (const unsigned char *pos = base, *limit = buffer->rlimit;
3961            pos < limit;)
3962         {
3963           unsigned char c = *pos++;
3964           /* This matches the switch in _cpp_lex_direct.  */
3965           switch (c)
3966             {
3967             case ' ': case '\t': case '\f': case '\v':
3968               /* Whitespace, do nothing.  */
3969               break;
3970
3971             case '\r': /* MAC line ending, or Windows \r\n  */
3972               if (*pos == '\n')
3973                 pos++;
3974               /* FALLTHROUGH */
3975
3976             case '\n':
3977               bol = true;
3978
3979             next_line:
3980               CPP_INCREMENT_LINE (pfile, 0);
3981               line_count++;
3982               line_start = pos;
3983               break;
3984
3985             case '\\':
3986               /* <backslash><newline> is removed, and doesn't undo any
3987                  preceeding escape or whatnot.  */
3988               if (*pos == '\n')
3989                 {
3990                   pos++;
3991                   goto next_line;
3992                 }
3993               else if (*pos == '\r')
3994                 {
3995                   if (pos[1] == '\n')
3996                     pos++;
3997                   pos++;
3998                   goto next_line;
3999                 }
4000               goto dflt;
4001
4002             case '#':
4003               if (bol)
4004                 {
4005                   /* Line directive.  */
4006                   if (pos - 1 > base && !pfile->state.skipping)
4007                     cb (pfile, CPP_DO_print, data,
4008                         line_count, base, pos - 1 - base);
4009
4010                   /* Prep things for directive handling. */
4011                   buffer->next_line = pos;
4012                   buffer->need_line = true;
4013                   bool ok = _cpp_get_fresh_line (pfile);
4014                   gcc_checking_assert (ok);
4015
4016                   /* Ensure proper column numbering for generated
4017                      error messages. */
4018                   buffer->line_base -= pos - line_start;
4019
4020                   _cpp_handle_directive (pfile, line_start + 1 != pos);
4021
4022                   /* Sanitize the line settings.  Duplicate #include's can
4023                      mess things up. */
4024                   // FIXME: Necessary?
4025                   pfile->line_table->highest_location
4026                     = pfile->line_table->highest_line;
4027
4028                   if (!pfile->state.skipping
4029                       && pfile->buffer->next_line < pfile->buffer->rlimit)
4030                     cb (pfile, CPP_DO_location, data,
4031                         pfile->line_table->highest_line);
4032
4033                   goto restart;
4034                 }
4035               goto dflt;
4036
4037             case '/':
4038               {
4039                 const unsigned char *peek = do_peek_next (pos, limit);
4040                 if (!(*peek == '/' || *peek == '*'))
4041                   goto dflt;
4042
4043                 /* Line or block comment  */
4044                 bool is_block = *peek == '*';
4045                 bool star = false;
4046                 bool esc = false;
4047                 location_t sloc
4048                   = linemap_position_for_column (pfile->line_table,
4049                                                  pos - line_start);
4050
4051                 while (pos < limit)
4052                   {
4053                     char c = *pos++;
4054                     switch (c)
4055                       {
4056                       case '\\':
4057                         esc = true;
4058                         break;
4059
4060                       case '\r':
4061                         if (*pos == '\n')
4062                           pos++;
4063                         /* FALLTHROUGH  */
4064
4065                       case '\n':
4066                         {
4067                           CPP_INCREMENT_LINE (pfile, 0);
4068                           line_count++;
4069                           line_start = pos;
4070                           if (!esc && !is_block)
4071                             {
4072                               bol = true;
4073                               goto done_comment;
4074                             }
4075                         }
4076                         if (!esc)
4077                           star = false;
4078                         esc = false;
4079                         break;
4080
4081                       case '*':
4082                         if (pos > peek && !esc)
4083                           star = is_block;
4084                         esc = false;
4085                         break;
4086
4087                       case '/':
4088                         if (star)
4089                           goto done_comment;
4090                         /* FALLTHROUGH  */
4091
4092                       default:
4093                         star = false;
4094                         esc = false;
4095                         break;
4096                       }
4097                   }
4098                 cpp_error_with_line (pfile, CPP_DL_ERROR, sloc, 0,
4099                                      "unterminated comment");
4100               done_comment:
4101                 lwm = pos;
4102                 break;
4103               }
4104
4105             case '\'':
4106               if (!CPP_OPTION (pfile, digit_separators))
4107                 goto delimited_string;
4108
4109               /* Possibly a number punctuator.  */
4110               if (!ISIDNUM (*do_peek_next (pos, limit)))
4111                 goto delimited_string;
4112
4113               goto quote_peek;
4114
4115             case '\"':
4116               if (!CPP_OPTION (pfile, rliterals))
4117                 goto delimited_string;
4118
4119             quote_peek:
4120               {
4121                 /* For ' see if it's a number punctuator
4122                    \.?<digit>(<digit>|<identifier-nondigit>
4123                    |'<digit>|'<nondigit>|[eEpP]<sign>|\.)* */
4124                 /* For " see if it's a raw string
4125                    {U,L,u,u8}R.  This includes CPP_NUMBER detection,
4126                    because that could be 0e+R.  */
4127                 const unsigned char *peek = pos - 1;
4128                 bool quote_first = c == '"';
4129                 bool quote_eight = false;
4130                 bool maybe_number_start = false;
4131                 bool want_number = false;
4132
4133                 while ((peek = do_peek_prev (peek, lwm)))
4134                   {
4135                     unsigned char p = *peek;
4136                     if (quote_first)
4137                       {
4138                         if (!raw)
4139                           {
4140                             if (p != 'R')
4141                               break;
4142                             raw = true;
4143                             continue;
4144                           }
4145
4146                         quote_first = false;
4147                         if (p == 'L' || p == 'U' || p == 'u')
4148                           ;
4149                         else if (p == '8')
4150                           quote_eight = true;
4151                         else
4152                           goto second_raw;
4153                       }
4154                     else if (quote_eight)
4155                       {
4156                         if (p != 'u')
4157                           {
4158                             raw = false;
4159                             break;
4160                           }
4161                         quote_eight = false;
4162                       }
4163                     else if (c == '"')
4164                       {
4165                       second_raw:;
4166                         if (!want_number && ISIDNUM (p))
4167                           {
4168                             raw = false;
4169                             break;
4170                           }
4171                       }
4172
4173                     if (ISDIGIT (p))
4174                       maybe_number_start = true;
4175                     else if (p == '.')
4176                       want_number = true;
4177                     else if (ISIDNUM (p))
4178                       maybe_number_start = false;
4179                     else if (p == '+' || p == '-')
4180                       {
4181                         if (const unsigned char *peek_prev
4182                             = do_peek_prev (peek, lwm))
4183                           {
4184                             p = *peek_prev;
4185                             if (p == 'e' || p == 'E'
4186                                 || p == 'p' || p == 'P')
4187                               {
4188                                 want_number = true;
4189                                 maybe_number_start = false;
4190                               }
4191                             else
4192                               break;
4193                           }
4194                         else
4195                           break;
4196                       }
4197                     else if (p == '\'' || p == '\"')
4198                       {
4199                         /* If this is lwm, this must be the end of a
4200                            previous string.  So this is a trailing
4201                            literal type, (a) if those are allowed,
4202                              and (b) maybe_start is false.  Otherwise
4203                              this must be a CPP_NUMBER because we've
4204                              met another ', and we'd have checked that
4205                              in its own right.  */
4206                         if (peek == lwm && CPP_OPTION (pfile, uliterals))
4207                           {
4208                             if  (!maybe_number_start && !want_number)
4209                               /* Must be a literal type.  */
4210                               raw = false;
4211                           }
4212                         else if (p == '\''
4213                                  && CPP_OPTION (pfile, digit_separators))
4214                           maybe_number_start = true;
4215                         break;
4216                       }
4217                     else if (c == '\'')
4218                       break;
4219                     else if (!quote_first && !quote_eight)
4220                       break;
4221                   }
4222
4223                 if (maybe_number_start)
4224                   {
4225                     if (c == '\'')
4226                       /* A CPP NUMBER.  */
4227                       goto dflt;
4228                     raw = false;
4229                   }
4230
4231                 goto delimited_string;
4232               }
4233
4234             delimited_string:
4235               {
4236                 /* (Possibly raw) string or char literal.  */
4237                 unsigned char end = c;
4238                 int delim_len = -1;
4239                 const unsigned char *delim = NULL;
4240                 location_t sloc = linemap_position_for_column (pfile->line_table,
4241                                                                pos - line_start);
4242                 int esc = 0;
4243
4244                 if (raw)
4245                   {
4246                     /* There can be no line breaks in the delimiter.  */
4247                     delim = pos;
4248                     for (delim_len = 0; (c = *pos++) != '('; delim_len++)
4249                       {
4250                         if (delim_len == 16)
4251                           {
4252                             cpp_error_with_line (pfile, CPP_DL_ERROR,
4253                                                  sloc, 0,
4254                                                  "raw string delimiter"
4255                                                  " longer than %d"
4256                                                  " characters",
4257                                                  delim_len);
4258                             raw = false;
4259                             pos = delim;
4260                             break;
4261                           }
4262                         if (strchr (") \\\t\v\f\n", c))
4263                           {
4264                             cpp_error_with_line (pfile, CPP_DL_ERROR,
4265                                                  sloc, 0,
4266                                                  "invalid character '%c'"
4267                                                  " in raw string"
4268                                                  " delimiter", c);
4269                             raw = false;
4270                             pos = delim;
4271                             break;
4272                           }
4273                         if (pos >= limit)
4274                           goto bad_string;
4275                       }
4276                   }
4277
4278                 while (pos < limit)
4279                   {
4280                     char c = *pos++;
4281                     switch (c)
4282                       {
4283                       case '\\':
4284                         if (!raw)
4285                           esc++;
4286                         break;
4287
4288                       case '\r':
4289                         if (*pos == '\n')
4290                           pos++;
4291                         /* FALLTHROUGH  */
4292
4293                       case '\n':
4294                         {
4295                           CPP_INCREMENT_LINE (pfile, 0);
4296                           line_count++;
4297                           line_start = pos;
4298                         }
4299                         if (esc)
4300                           esc--;
4301                         break;
4302
4303                       case ')':
4304                         if (raw
4305                             && pos + delim_len + 1 < limit
4306                             && pos[delim_len] == end
4307                             && !memcmp (delim, pos, delim_len))
4308                           {
4309                             pos += delim_len + 1;
4310                             raw = false;
4311                             goto done_string;
4312                           }
4313                         break;
4314
4315                       default:
4316                         if (!raw && !(esc & 1) && c == end)
4317                           goto done_string;
4318                         esc = 0;
4319                         break;
4320                       }
4321                   }
4322               bad_string:
4323                 cpp_error_with_line (pfile, CPP_DL_ERROR, sloc, 0,
4324                                      "unterminated literal");
4325
4326               done_string:
4327                 raw = false;
4328                 lwm = pos - 1;
4329               }
4330               goto dflt;
4331
4332             default:
4333             dflt:
4334               bol = false;
4335               pfile->mi_valid = false;
4336               break;
4337             }
4338         }
4339
4340       if (buffer->rlimit > base && !pfile->state.skipping)
4341         cb (pfile, CPP_DO_print, data, line_count, base, buffer->rlimit - base);
4342
4343       _cpp_pop_buffer (pfile);
4344     }
4345   while (pfile->buffer);
4346 }