libcpp/lex.c

   1 /* CPP Library - lexical analysis.
   2    Copyright (C) 2000-2019 Free Software Foundation, Inc.
   3    Contributed by Per Bothner, 1994-95.
   4    Based on CCCP program by Paul Rubin, June 1986
   5    Adapted to ANSI C, Richard Stallman, Jan 1987
   6    Broken out to separate file, Zack Weinberg, Mar 2000
   7
   8 This program is free software; you can redistribute it and/or modify it
   9 under the terms of the GNU General Public License as published by the
  10 Free Software Foundation; either version 3, or (at your option) any
  11 later version.
  12
  13 This program is distributed in the hope that it will be useful,
  14 but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16 GNU General Public License for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with this program; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "cpplib.h"
  25 #include "internal.h"
  26
   27 enum spell_type
   28 {
        /* Spelling category recorded for each token type in the
           token_spellings table.  OP() entries of TTYPE_TABLE always get
           SPELL_OPERATOR; TK() entries name their category explicitly (the
           TK macro pastes its second argument onto "SPELL_").
           NOTE(review): the exact meaning of IDENT/LITERAL/NONE is decided
           by the spelling code, which is not visible in this part of the
           file.  */
   29   SPELL_OPERATOR = 0,
   30   SPELL_IDENT,
   31   SPELL_LITERAL,
   32   SPELL_NONE
   33 };
  34
   35 struct token_spelling
   36 {
        /* One row of the token_spellings table.  CATEGORY is the spelling
           category of the token type.  NAME is the operator text for OP()
           entries and the stringified enumerator name for TK() entries.  */
   37   enum spell_type category;
   38   const unsigned char *name;
   39 };
  40
   41 static const unsigned char *const digraph_spellings[] =
      /* The six digraph spellings.  NOTE(review): the element order is
         presumably relied upon by the code that indexes this array; that
         code is not visible here, so do not reorder without checking.  */
   42 { UC"%:", UC"%:%:", UC"<:", UC":>", UC"<%", UC"%>" };
  43
   44 #define OP(e, s) { SPELL_OPERATOR, UC s  },
   45 #define TK(e, s) { SPELL_ ## s,    UC #e },
      /* Build the per-token-type spelling table by expanding TTYPE_TABLE
         with the temporary OP/TK definitions above: OP rows store the
         operator text S, TK rows store the stringified enumerator E and
         the category SPELL_<S>.  The helper macros are undefined again
         right after the table.  */
   46 static const struct token_spelling token_spellings[N_TTYPES] = { TTYPE_TABLE };
   47 #undef OP
   48 #undef TK
   49
      /* Accessors: look up the spelling category / name of TOKEN by
         indexing token_spellings with the token's type.  */
   50 #define TOKEN_SPELL(token) (token_spellings[(token)->type].category)
   51 #define TOKEN_NAME(token) (token_spellings[(token)->type].name)
  52
   53 static void add_line_note (cpp_buffer *, const uchar *, unsigned int);
      /* This group consists of forward declarations of file-local (static)
         helpers, so that they can be referenced before their
         definitions.  */
   54 static int skip_line_comment (cpp_reader *);
   55 static void skip_whitespace (cpp_reader *, cppchar_t);
   56 static void lex_string (cpp_reader *, cpp_token *, const uchar *);
   57 static void save_comment (cpp_reader *, cpp_token *, const uchar *, cppchar_t);
   58 static void store_comment (cpp_reader *, cpp_token *);
   59 static void create_literal (cpp_reader *, cpp_token *, const uchar *,
   60                             unsigned int, enum cpp_ttype);
   61 static bool warn_in_comment (cpp_reader *, _cpp_line_note *);
   62 static int name_p (cpp_reader *, const cpp_string *);
   63 static tokenrun *next_tokenrun (tokenrun *);
   64
   65 static _cpp_buff *new_buff (size_t);
  66
  67
  68 /* Utility routine:
  69
  70    Compares, the token TOKEN to the NUL-terminated string STRING.
  71    TOKEN must be a CPP_NAME.  Returns 1 for equal, 0 for unequal.  */
  72 int
  73 cpp_ideq (const cpp_token *token, const char *string)
  74 {
  75   if (token->type != CPP_NAME)
  76     return 0;
  77
  78   return !ustrcmp (NODE_NAME (token->val.node.node), (const uchar *) string);
  79 }
  80
  81 /* Record a note TYPE at byte POS into the current cleaned logical
  82    line.  */
  83 static void
  84 add_line_note (cpp_buffer *buffer, const uchar *pos, unsigned int type)
  85 {
  86   if (buffer->notes_used == buffer->notes_cap)
  87     {
  88       buffer->notes_cap = buffer->notes_cap * 2 + 200;
  89       buffer->notes = XRESIZEVEC (_cpp_line_note, buffer->notes,
  90                                   buffer->notes_cap);
  91     }
  92
  93   buffer->notes[buffer->notes_used].pos = pos;
  94   buffer->notes[buffer->notes_used].type = type;
  95   buffer->notes_used++;
  96 }
  97
  98 \f
  99 /* Fast path to find line special characters using optimized character
 100    scanning algorithms.  Anything complicated falls back to the slow
 101    path below.  Since this loop is very hot it's worth doing these kinds
 102    of optimizations.
 103
 104    One of the paths through the ifdefs should provide
 105
 106      const uchar *search_line_fast (const uchar *s, const uchar *end);
 107
 108    Between S and END, search for \n, \r, \\, ?.  Return a pointer to
 109    the found character.
 110
 111    Note that the last character of the buffer is *always* a newline,
 112    as forced by _cpp_convert_input.  This fact can be used to avoid
 113    explicitly looking for the end of the buffer.  */
 114
  115 /* Configure gives us an ifdef test.  Give the macro a 0 value when it
         is not defined so that it can also be tested with an ordinary
         "if (WORDS_BIGENDIAN)" in the helpers below.  */
  116 #ifndef WORDS_BIGENDIAN
  117 #define WORDS_BIGENDIAN 0
  118 #endif
  119
  120 /* We'd like the largest integer that fits into a register.  There's nothing
  121    in <stdint.h> that gives us that.  For most hosts this is unsigned long,
  122    but MS decided on an LLP64 model.  Thankfully when building with GCC we
  123    can get the "real" word size.  */
  124 #ifdef __GNUC__
  125 typedef unsigned int word_type __attribute__((__mode__(__word__)));
  126 #else
  127 typedef unsigned long word_type;
  128 #endif
  129
  130 /* The code below is only expecting sizes 4 or 8.
  131    Die at compile-time if this expectation is violated: the array
         length below evaluates to 1 when the size is acceptable and to -1
         (an invalid array length) when it is not.  */
  132 typedef char check_word_type_size
  133   [(sizeof(word_type) == 8 || sizeof(word_type) == 4) * 2 - 1];
 134
 135 /* Return X with the first N bytes forced to values that won't match one
 136    of the interesting characters.  Note that NUL is not interesting.  */
 137
 138 static inline word_type
 139 acc_char_mask_misalign (word_type val, unsigned int n)
 140 {
 141   word_type mask = -1;
 142   if (WORDS_BIGENDIAN)
 143     mask >>= n * 8;
 144   else
 145     mask <<= n * 8;
 146   return val & mask;
 147 }
 148
 149 /* Return X replicated to all byte positions within WORD_TYPE.  */
 150
 151 static inline word_type
 152 acc_char_replicate (uchar x)
 153 {
 154   word_type ret;
 155
 156   ret = (x << 24) | (x << 16) | (x << 8) | x;
 157   if (sizeof(word_type) == 8)
 158     ret = (ret << 16 << 16) | ret;
 159   return ret;
 160 }
 161
 162 /* Return non-zero if some byte of VAL is (probably) C.  */
 163
 164 static inline word_type
 165 acc_char_cmp (word_type val, word_type c)
 166 {
 167 #if defined(__GNUC__) && defined(__alpha__)
 168   /* We can get exact results using a compare-bytes instruction.
 169      Get (val == c) via (0 >= (val ^ c)).  */
 170   return __builtin_alpha_cmpbge (0, val ^ c);
 171 #else
 172   word_type magic = 0x7efefefeU;
 173   if (sizeof(word_type) == 8)
 174     magic = (magic << 16 << 16) | 0xfefefefeU;
 175   magic |= 1;
 176
 177   val ^= c;
 178   return ((val + magic) ^ ~val) & ~magic;
 179 #endif
 180 }
 181
 182 /* Given the result of acc_char_cmp is non-zero, return the index of
 183    the found character.  If this was a false positive, return -1.  */
 184
 185 static inline int
 186 acc_char_index (word_type cmp ATTRIBUTE_UNUSED,
 187                 word_type val ATTRIBUTE_UNUSED)
 188 {
 189 #if defined(__GNUC__) && defined(__alpha__) && !WORDS_BIGENDIAN
 190   /* The cmpbge instruction sets *bits* of the result corresponding to
 191      matches in the bytes with no false positives.  */
 192   return __builtin_ctzl (cmp);
 193 #else
 194   unsigned int i;
 195
 196   /* ??? It would be nice to force unrolling here,
 197      and have all of these constants folded.  */
 198   for (i = 0; i < sizeof(word_type); ++i)
 199     {
 200       uchar c;
 201       if (WORDS_BIGENDIAN)
 202         c = (val >> (sizeof(word_type) - i - 1) * 8) & 0xff;
 203       else
 204         c = (val >> i * 8) & 0xff;
 205
 206       if (c == '\n' || c == '\r' || c == '\\' || c == '?')
 207         return i;
 208     }
 209
 210   return -1;
 211 #endif
 212 }
 213
 214 /* A version of the fast scanner using bit fiddling techniques.
 215
 216    For 32-bit words, one would normally perform 16 comparisons and
 217    16 branches.  With this algorithm one performs 24 arithmetic
 218    operations and one branch.  Whether this is faster with a 32-bit
 219    word size is going to be somewhat system dependent.
 220
 221    For 64-bit words, we eliminate twice the number of comparisons
 222    and branches without increasing the number of arithmetic operations.
 223    It's almost certainly going to be a win with 64-bit word size.  */
 224
 225 static const uchar * search_line_acc_char (const uchar *, const uchar *)
 226   ATTRIBUTE_UNUSED;
 227
 228 static const uchar *
 229 search_line_acc_char (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 230 {
 231   const word_type repl_nl = acc_char_replicate ('\n');
 232   const word_type repl_cr = acc_char_replicate ('\r');
 233   const word_type repl_bs = acc_char_replicate ('\\');
 234   const word_type repl_qm = acc_char_replicate ('?');
 235
 236   unsigned int misalign;
 237   const word_type *p;
 238   word_type val, t;
 239
 240   /* Align the buffer.  Mask out any bytes from before the beginning.  */
 241   p = (word_type *)((uintptr_t)s & -sizeof(word_type));
 242   val = *p;
 243   misalign = (uintptr_t)s & (sizeof(word_type) - 1);
 244   if (misalign)
 245     val = acc_char_mask_misalign (val, misalign);
 246
 247   /* Main loop.  */
 248   while (1)
 249     {
 250       t  = acc_char_cmp (val, repl_nl);
 251       t |= acc_char_cmp (val, repl_cr);
 252       t |= acc_char_cmp (val, repl_bs);
 253       t |= acc_char_cmp (val, repl_qm);
 254
 255       if (__builtin_expect (t != 0, 0))
 256         {
 257           int i = acc_char_index (t, val);
 258           if (i >= 0)
 259             return (const uchar *)p + i;
 260         }
 261
 262       val = *++p;
 263     }
 264 }
 265
 266 /* Disable on Solaris 2/x86 until the following problem can be properly
 267    autoconfed:
 268
 269    The Solaris 10+ assembler tags objects with the instruction set
 270    extensions used, so SSE4.2 executables cannot run on machines that
 271    don't support that extension.  */
 272
 273 #if (GCC_VERSION >= 4005) && (__GNUC__ >= 5 || !defined(__PIC__)) && (defined(__i386__) || defined(__x86_64__)) && !(defined(__sun__) && defined(__svr4__))
 274
  275 /* Replicated character data to be shared between implementations.
  276    Recall that outside of a context with vector support we can't
  277    define compatible vector types, therefore these are all defined
  278    in terms of raw characters.

         Row 0 holds sixteen '\n', row 1 sixteen '\r', row 2 sixteen '\\'
         and row 3 sixteen '?'.  The table is 16-byte aligned; the MMX
         scanner reads the first 8 bytes of a row as a vector and the SSE2
         scanner reads all 16.  */
  279 static const char repl_chars[4][16] __attribute__((aligned(16))) = {
  280   { '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
  281     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n' },
  282   { '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
  283     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r' },
  284   { '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
  285     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\' },
  286   { '?', '?', '?', '?', '?', '?', '?', '?',
  287     '?', '?', '?', '?', '?', '?', '?', '?' },
  288 };
 289
 290 /* A version of the fast scanner using MMX vectorized byte compare insns.
 291
 292    This uses the PMOVMSKB instruction which was introduced with "MMX2",
 293    which was packaged into SSE1; it is also present in the AMD MMX
 294    extension.  Mark the function as using "sse" so that we emit a real
 295    "emms" instruction, rather than the 3dNOW "femms" instruction.  */
 296
 297 static const uchar *
 298 #ifndef __SSE__
 299 __attribute__((__target__("sse")))
 300 #endif
 301 search_line_mmx (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 302 {
 303   typedef char v8qi __attribute__ ((__vector_size__ (8)));
 304   typedef int __m64 __attribute__ ((__vector_size__ (8), __may_alias__));
 305
 306   const v8qi repl_nl = *(const v8qi *)repl_chars[0];
 307   const v8qi repl_cr = *(const v8qi *)repl_chars[1];
 308   const v8qi repl_bs = *(const v8qi *)repl_chars[2];
 309   const v8qi repl_qm = *(const v8qi *)repl_chars[3];
 310
 311   unsigned int misalign, found, mask;
 312   const v8qi *p;
 313   v8qi data, t, c;
 314
 315   /* Align the source pointer.  While MMX doesn't generate unaligned data
 316      faults, this allows us to safely scan to the end of the buffer without
 317      reading beyond the end of the last page.  */
 318   misalign = (uintptr_t)s & 7;
 319   p = (const v8qi *)((uintptr_t)s & -8);
 320   data = *p;
 321
 322   /* Create a mask for the bytes that are valid within the first
 323      16-byte block.  The Idea here is that the AND with the mask
 324      within the loop is "free", since we need some AND or TEST
 325      insn in order to set the flags for the branch anyway.  */
 326   mask = -1u << misalign;
 327
 328   /* Main loop processing 8 bytes at a time.  */
 329   goto start;
 330   do
 331     {
 332       data = *++p;
 333       mask = -1;
 334
 335     start:
 336       t = __builtin_ia32_pcmpeqb(data, repl_nl);
 337       c = __builtin_ia32_pcmpeqb(data, repl_cr);
 338       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
 339       c = __builtin_ia32_pcmpeqb(data, repl_bs);
 340       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
 341       c = __builtin_ia32_pcmpeqb(data, repl_qm);
 342       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
 343       found = __builtin_ia32_pmovmskb (t);
 344       found &= mask;
 345     }
 346   while (!found);
 347
 348   __builtin_ia32_emms ();
 349
 350   /* FOUND contains 1 in bits for which we matched a relevant
 351      character.  Conversion to the byte index is trivial.  */
 352   found = __builtin_ctz(found);
 353   return (const uchar *)p + found;
 354 }
 355
 356 /* A version of the fast scanner using SSE2 vectorized byte compare insns.  */
 357
 358 static const uchar *
 359 #ifndef __SSE2__
 360 __attribute__((__target__("sse2")))
 361 #endif
 362 search_line_sse2 (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 363 {
 364   typedef char v16qi __attribute__ ((__vector_size__ (16)));
 365
 366   const v16qi repl_nl = *(const v16qi *)repl_chars[0];
 367   const v16qi repl_cr = *(const v16qi *)repl_chars[1];
 368   const v16qi repl_bs = *(const v16qi *)repl_chars[2];
 369   const v16qi repl_qm = *(const v16qi *)repl_chars[3];
 370
 371   unsigned int misalign, found, mask;
 372   const v16qi *p;
 373   v16qi data, t;
 374
 375   /* Align the source pointer.  */
 376   misalign = (uintptr_t)s & 15;
 377   p = (const v16qi *)((uintptr_t)s & -16);
 378   data = *p;
 379
 380   /* Create a mask for the bytes that are valid within the first
 381      16-byte block.  The Idea here is that the AND with the mask
 382      within the loop is "free", since we need some AND or TEST
 383      insn in order to set the flags for the branch anyway.  */
 384   mask = -1u << misalign;
 385
 386   /* Main loop processing 16 bytes at a time.  */
 387   goto start;
 388   do
 389     {
 390       data = *++p;
 391       mask = -1;
 392
 393     start:
 394       t  = __builtin_ia32_pcmpeqb128(data, repl_nl);
 395       t |= __builtin_ia32_pcmpeqb128(data, repl_cr);
 396       t |= __builtin_ia32_pcmpeqb128(data, repl_bs);
 397       t |= __builtin_ia32_pcmpeqb128(data, repl_qm);
 398       found = __builtin_ia32_pmovmskb128 (t);
 399       found &= mask;
 400     }
 401   while (!found);
 402
 403   /* FOUND contains 1 in bits for which we matched a relevant
 404      character.  Conversion to the byte index is trivial.  */
 405   found = __builtin_ctz(found);
 406   return (const uchar *)p + found;
 407 }
 408
  409 #ifdef HAVE_SSE4
  410 /* A version of the fast scanner using SSE 4.2 vectorized string insns.

         Returns a pointer to the first '\n', '\r', '?' or '\\' at or after
         S.  END is only used to decide whether an initial unaligned 16-byte
         read is safe.  When HAVE_SSE4 is not defined, search_line_sse42 is
         simply an alias for search_line_sse2 (see the #else branch).  */
  411
  412 static const uchar *
  413 #ifndef __SSE4_2__
  414 __attribute__((__target__("sse4.2")))
  415 #endif
  416 search_line_sse42 (const uchar *s, const uchar *end)
  417 {
  418   typedef char v16qi __attribute__ ((__vector_size__ (16)));
        /* The set of characters to look for.  Only the first four elements
           are meaningful; the explicit length 4 is passed to PCMPESTRI.  */
  419   static const v16qi search = { '\n', '\r', '?', '\\' };
  420
  421   uintptr_t si = (uintptr_t)s;
  422   uintptr_t index;
  423
  424   /* Check for unaligned input.  */
  425   if (si & 15)
  426     {
  427       v16qi sv;
  428
  429       if (__builtin_expect (end - s < 16, 0)
  430           && __builtin_expect ((si & 0xfff) > 0xff0, 0))
  431         {
  432           /* There are fewer than 16 bytes left in the buffer, and fewer
  433              than 16 bytes left on the page.  Reading 16 bytes at this
  434              point might generate a spurious page fault.  Defer to the
  435              SSE2 implementation, which already handles alignment.  */
  436           return search_line_sse2 (s, end);
  437         }
  438
  439       /* ??? The builtin doesn't understand that the PCMPESTRI read from
  440          memory need not be aligned.  */
  441       sv = __builtin_ia32_loaddqu ((const char *) s);
  442       index = __builtin_ia32_pcmpestri128 (search, 4, sv, 16, 0);
  443
            /* An index below 16 is the offset of a match within SV.  */
  444       if (__builtin_expect (index < 16, 0))
  445         goto found;
  446
  447       /* Advance the pointer to an aligned address.  We will re-scan a
  448          few bytes, but we no longer need to care about reading past the
  449          end of a page, since we're guaranteed a match.  */
  450       s = (const uchar *)((si + 15) & -16);
  451     }
  452
  453   /* Main loop, processing 16 bytes at a time.  */
  454 #ifdef __GCC_ASM_FLAG_OUTPUTS__
  455   while (1)
  456     {
  457       char f;
  458
  459       /* By using inline assembly instead of the builtin,
  460          we can use the result, as well as the flags set.  F receives
               the carry flag, which the loop treats as "a match was found";
               INDEX receives the match offset from %ecx.  */
  461       __asm ("%vpcmpestri\t$0, %2, %3"
  462              : "=c"(index), "=@ccc"(f)
  463              : "m"(*s), "x"(search), "a"(4), "d"(16));
  464       if (f)
  465         break;
  466
  467       s += 16;
  468     }
  469 #else
        /* Pre-decrement S because the assembly loop below adds 16 before
           each comparison.  */
  470   s -= 16;
  471   /* By doing the whole loop in inline assembly,
  472      we can make proper use of the flags set.  The loop repeats while
           the carry flag is clear (jnc), i.e. until a block matches.  */
  473   __asm (      ".balign 16\n"
  474         "0:     add $16, %1\n"
  475         "       %vpcmpestri\t$0, (%1), %2\n"
  476         "       jnc 0b"
  477         : "=&c"(index), "+r"(s)
  478         : "x"(search), "a"(4), "d"(16));
  479 #endif
  480
  481  found:
        /* S addresses the 16-byte block that matched and INDEX is the
           offset of the match within it.  */
  482   return s + index;
  483 }
  484
  485 #else
  486 /* Work around out-dated assemblers without sse4 support.  */
  487 #define search_line_sse42 search_line_sse2
  488 #endif
 489
 490 /* Check the CPU capabilities.  */
 491
 492 #include "../gcc/config/i386/cpuid.h"
 493
  494 typedef const uchar * (*search_line_fast_type) (const uchar *, const uchar *);
      /* The scanner implementation selected at run time by
         init_vectorized_lexer.  It is not set until that function has
         run.  */
  495 static search_line_fast_type search_line_fast;
  496
  497 #define HAVE_init_vectorized_lexer 1
      /* Choose the best line scanner for the host CPU and store it in
         search_line_fast.

         MINIMUM is the instruction-set level guaranteed by the compiler
         flags this file was built with (3 = SSE4.2, 2 = SSE2, 1 = SSE,
         0 = none); a scanner at that level is used without needing a
         CPUID feature bit.  Otherwise the CPUID feature bits decide, and
         the word-at-a-time scanner search_line_acc_char is the fallback
         when nothing better is available.  */
  498 static inline void
  499 init_vectorized_lexer (void)
  500 {
  501   unsigned dummy, ecx = 0, edx = 0;
  502   search_line_fast_type impl = search_line_acc_char;
  503   int minimum = 0;
  504
  505 #if defined(__SSE4_2__)
  506   minimum = 3;
  507 #elif defined(__SSE2__)
  508   minimum = 2;
  509 #elif defined(__SSE__)
  510   minimum = 1;
  511 #endif
  512
  513   if (minimum == 3)
  514     impl = search_line_sse42;
        /* CPUID leaf 1 is queried first; when the query fails ECX and EDX
           keep their initial value 0, and this branch is still entered if
           SSE2 is guaranteed at compile time.  */
  515   else if (__get_cpuid (1, &dummy, &dummy, &ecx, &edx) || minimum == 2)
  516     {
            /* MINIMUM == 3 was already handled above, so only the ECX bit
               can select SSE4.2 here.  */
  517       if (minimum == 3 || (ecx & bit_SSE4_2))
  518         impl = search_line_sse42;
  519       else if (minimum == 2 || (edx & bit_SSE2))
  520         impl = search_line_sse2;
  521       else if (minimum == 1 || (edx & bit_SSE))
  522         impl = search_line_mmx;
  523     }
        /* Leaf 1 was unavailable: consult the extended leaf 0x80000001,
           where both the MMXEXT and CMOV bits must be set to enable the
           MMX scanner.  */
  524   else if (__get_cpuid (0x80000001, &dummy, &dummy, &dummy, &edx))
  525     {
  526       if (minimum == 1
  527           || (edx & (bit_MMXEXT | bit_CMOV)) == (bit_MMXEXT | bit_CMOV))
  528         impl = search_line_mmx;
  529     }
  530
  531   search_line_fast = impl;
  532 }
 533
 534 #elif defined(_ARCH_PWR8) && defined(__ALTIVEC__)
 535
  536 /* A version of the fast scanner using AltiVec vectorized byte compares
  537    and VSX unaligned loads (when VSX is available).  This is otherwise
  538    the same as the pre-GCC 5 version.

         Returns a pointer to the first '\n', '\r', '\\' or '?' at or after
         S; END is not consulted.  Because the loads are unaligned, scanning
         starts exactly at S and no leading bytes need to be masked.  */
  539
  540 ATTRIBUTE_NO_SANITIZE_UNDEFINED
  541 static const uchar *
  542 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
  543 {
  544   typedef __attribute__((altivec(vector))) unsigned char vc;
  545
  546   const vc repl_nl = {
  547     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
  548     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
  549   };
  550   const vc repl_cr = {
  551     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
  552     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
  553   };
  554   const vc repl_bs = {
  555     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
  556     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
  557   };
  558   const vc repl_qm = {
  559     '?', '?', '?', '?', '?', '?', '?', '?',
  560     '?', '?', '?', '?', '?', '?', '?', '?',
  561   };
  562   const vc zero = { 0 };
  563
  564   vc data, t;
  565
  566   /* Main loop processing 16 bytes at a time.  */
  567   do
  568     {
  569       vc m_nl, m_cr, m_bs, m_qm;
  570
  571       data = __builtin_vec_vsx_ld (0, s);
  572       s += 16;
  573
  574       m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
  575       m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
  576       m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
  577       m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
  578       t = (m_nl | m_cr) | (m_bs | m_qm);
  579
  580       /* T now contains 0xff in bytes for which we matched one of the relevant
  581          characters.  We want to exit the loop if any byte in T is non-zero.
  582          Below is the expansion of vec_any_ne(t, zero).  */
  583     }
  584   while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));
  585
  586   /* Restore s to point to the 16 bytes we just processed.  */
  587   s -= 16;
  588
  589   {
  590 #define N  (sizeof(vc) / sizeof(long))
  591
          /* View the match vector T as N machine words so that the first
             non-zero word can be located with scalar code.  */
  592     union {
  593       vc v;
  594       /* Statically assert that N is 2 or 4.  */
  595       unsigned long l[(N == 2 || N == 4) ? N : -1];
  596     } u;
  597     unsigned long l, i = 0;
  598
  599     u.v = t;
  600
  601     /* Find the first word of T that is non-zero, advancing S past each
             all-zero word that is skipped.  */
  602     switch (N)
  603       {
  604       case 4:
  605         l = u.l[i++];
  606         if (l != 0)
  607           break;
  608         s += sizeof(unsigned long);
  609         l = u.l[i++];
  610         if (l != 0)
  611           break;
  612         s += sizeof(unsigned long);
  613         /* FALLTHRU */
  614       case 2:
  615         l = u.l[i++];
  616         if (l != 0)
  617           break;
  618         s += sizeof(unsigned long);
  619         l = u.l[i];
  620       }
  621
  622     /* L now contains 0xff in bytes for which we matched one of the
  623        relevant characters.  We can find the byte index by finding
  624        its bit index and dividing by 8.  The first byte in memory is
             the most significant one on big-endian hosts (count leading
             zeros) and the least significant one otherwise (count trailing
             zeros).  */
  625 #ifdef __BIG_ENDIAN__
  626     l = __builtin_clzl(l) >> 3;
  627 #else
  628     l = __builtin_ctzl(l) >> 3;
  629 #endif
  630     return s + l;
  631
  632 #undef N
  633   }
  634 }
 635
 636 #elif (GCC_VERSION >= 4005) && defined(__ALTIVEC__) && defined (__BIG_ENDIAN__)
 637
  638 /* A version of the fast scanner using AltiVec vectorized byte compares.
  639    This cannot be used for little endian because vec_lvsl/lvsr are
  640    deprecated for little endian and the code won't work properly.

         Returns a pointer to the first '\n', '\r', '\\' or '?' at or after
         S; END is not consulted.  */
  641 /* ??? Unfortunately, attribute(target("altivec")) is not yet supported,
  642    so we can't compile this function without -maltivec on the command line
  643    (or implied by some other switch).  */
  644
  645 static const uchar *
  646 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
  647 {
  648   typedef __attribute__((altivec(vector))) unsigned char vc;
  649
  650   const vc repl_nl = {
  651     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
  652     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
  653   };
  654   const vc repl_cr = {
  655     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
  656     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
  657   };
  658   const vc repl_bs = {
  659     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
  660     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
  661   };
  662   const vc repl_qm = {
  663     '?', '?', '?', '?', '?', '?', '?', '?',
  664     '?', '?', '?', '?', '?', '?', '?', '?',
  665   };
  666   const vc ones = {
  667     -1, -1, -1, -1, -1, -1, -1, -1,
  668     -1, -1, -1, -1, -1, -1, -1, -1,
  669   };
  670   const vc zero = { 0 };
  671
  672   vc data, mask, t;
  673
  674   /* Altivec loads automatically mask addresses with -16.  This lets us
  675      issue the first load as early as possible.  */
  676   data = __builtin_vec_ld(0, (const vc *)s);
  677
  678   /* Discard bytes before the beginning of the buffer.  Do this by
  679      beginning with all ones and shifting in zeros according to the
  680      mis-alignment.  The LVSR instruction pulls the exact shift we
  681      want from the address.  */
  682   mask = __builtin_vec_lvsr(0, s);
  683   mask = __builtin_vec_perm(zero, ones, mask);
  684   data &= mask;
  685
  686   /* While altivec loads mask addresses, we still need to align S so
  687      that the offset we compute at the end is correct.  */
  688   s = (const uchar *)((uintptr_t)s & -16);
  689
  690   /* Main loop processing 16 bytes at a time.  The first iteration is
           entered at START because its (masked) data is already loaded.  */
  691   goto start;
  692   do
  693     {
  694       vc m_nl, m_cr, m_bs, m_qm;
  695
  696       s += 16;
  697       data = __builtin_vec_ld(0, (const vc *)s);
  698
  699     start:
  700       m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
  701       m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
  702       m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
  703       m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
  704       t = (m_nl | m_cr) | (m_bs | m_qm);
  705
  706       /* T now contains 0xff in bytes for which we matched one of the relevant
  707          characters.  We want to exit the loop if any byte in T is non-zero.
  708          Below is the expansion of vec_any_ne(t, zero).  */
  709     }
  710   while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));
  711
  712   {
  713 #define N  (sizeof(vc) / sizeof(long))
  714
          /* View the match vector T as N machine words so that the first
             non-zero word can be located with scalar code.  */
  715     union {
  716       vc v;
  717       /* Statically assert that N is 2 or 4.  */
  718       unsigned long l[(N == 2 || N == 4) ? N : -1];
  719     } u;
  720     unsigned long l, i = 0;
  721
  722     u.v = t;
  723
  724     /* Find the first word of T that is non-zero, advancing S past each
             all-zero word that is skipped.  */
  725     switch (N)
  726       {
  727       case 4:
  728         l = u.l[i++];
  729         if (l != 0)
  730           break;
  731         s += sizeof(unsigned long);
  732         l = u.l[i++];
  733         if (l != 0)
  734           break;
  735         s += sizeof(unsigned long);
  736         /* FALLTHROUGH */
  737       case 2:
  738         l = u.l[i++];
  739         if (l != 0)
  740           break;
  741         s += sizeof(unsigned long);
  742         l = u.l[i];
  743       }
  744
  745     /* L now contains 0xff in bytes for which we matched one of the
  746        relevant characters.  We can find the byte index by finding
  747        its bit index and dividing by 8.  This variant is big-endian
             only, so the first byte in memory is the most significant one
             and leading zeros are counted.  */
  748     l = __builtin_clzl(l) >> 3;
  749     return s + l;
  750
  751 #undef N
  752   }
  753 }
 754
 755 #elif defined (__ARM_NEON) && defined (__ARM_64BIT_STATE)
 756 #include "arm_neon.h"
 757
  758 /* This doesn't have to be the exact page size, but no system may use
  759    a size smaller than this.  ARMv8 requires a minimum page size of
  760    4k.  The impact of being conservative here is that a small number of
  761    cases will take the slightly slower entry path into the main
  762    loop.  */
  763
  764 #define AARCH64_MIN_PAGE_SIZE 4096
  765
      /* AArch64 Advanced SIMD version of the fast scanner.  Returns a
         pointer to the first '\n', '\r', '\\' or '?' at or after S; END is
         not consulted.

         The first 16 bytes are normally read with an unaligned load starting
         at S.  Only when S lies within the last 15 bytes of a minimum-size
         page is an aligned load with masking of the leading bytes used
         instead, so that the read cannot cross into the next page.  All
         later loads are aligned.  */
  766 static const uchar *
  767 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
  768 {
  769   const uint8x16_t repl_nl = vdupq_n_u8 ('\n');
  770   const uint8x16_t repl_cr = vdupq_n_u8 ('\r');
  771   const uint8x16_t repl_bs = vdupq_n_u8 ('\\');
  772   const uint8x16_t repl_qm = vdupq_n_u8 ('?');
        /* XMASK and SHIFT are used to turn the per-byte match vector into
           the scalar FOUND, whose lowest set bit is then used as the byte
           index of the first match.  */
  773   const uint8x16_t xmask = (uint8x16_t) vdupq_n_u64 (0x8040201008040201ULL);
  774
  775 #ifdef __ARM_BIG_ENDIAN
  776   const int16x8_t shift = {8, 8, 8, 8, 0, 0, 0, 0};
  777 #else
  778   const int16x8_t shift = {0, 0, 0, 0, 8, 8, 8, 8};
  779 #endif
  780
  781   unsigned int found;
  782   const uint8_t *p;
  783   uint8x16_t data;
  784   uint8x16_t t;
  785   uint16x8_t m;
  786   uint8x16_t u, v, w;
  787
  788   /* Align the source pointer.  */
  789   p = (const uint8_t *)((uintptr_t)s & -16);
  790
  791   /* Assuming random string start positions, with a 4k page size we'll take
  792      the slow path about 0.37% of the time.  */
  793   if (__builtin_expect ((AARCH64_MIN_PAGE_SIZE
  794                          - (((uintptr_t) s) & (AARCH64_MIN_PAGE_SIZE - 1)))
  795                         < 16, 0))
  796     {
  797       /* Slow path: the string starts near a possible page boundary.
               Load the aligned block containing S and ignore matches in the
               bytes that precede S.  */
  798       uint32_t misalign, mask;
  799
  800       misalign = (uintptr_t)s & 15;
  801       mask = (-1u << misalign) & 0xffff;
  802       data = vld1q_u8 (p);
  803       t = vceqq_u8 (data, repl_nl);
  804       u = vceqq_u8 (data, repl_cr);
  805       v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
  806       w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
  807       t = vorrq_u8 (v, w);
  808       t = vandq_u8 (t, xmask);
  809       m = vpaddlq_u8 (t);
  810       m = vshlq_u16 (m, shift);
  811       found = vaddvq_u16 (m);
  812       found &= mask;
  813       if (found)
  814         return (const uchar*)p + __builtin_ctz (found);
  815     }
  816   else
  817     {
            /* Fast path: unaligned load of the 16 bytes starting at S.  On
               a match P is still the rounded-down address, which the code
               at DONE uses to tell this case apart.  */
  818       data = vld1q_u8 ((const uint8_t *) s);
  819       t = vceqq_u8 (data, repl_nl);
  820       u = vceqq_u8 (data, repl_cr);
  821       v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
  822       w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
  823       t = vorrq_u8 (v, w);
  824       if (__builtin_expect (vpaddd_u64 ((uint64x2_t)t) != 0, 0))
  825         goto done;
  826     }
  827
        /* Main loop: aligned 16-byte blocks, starting with the block after
           the one containing S.  */
  828   do
  829     {
  830       p += 16;
  831       data = vld1q_u8 (p);
  832       t = vceqq_u8 (data, repl_nl);
  833       u = vceqq_u8 (data, repl_cr);
  834       v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
  835       w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
  836       t = vorrq_u8 (v, w);
  837     } while (!vpaddd_u64 ((uint64x2_t)t));
  838
  839 done:
  840   /* Now that we've found the terminating substring, work out precisely where
  841      we need to stop.  When P is below S the match came from the unaligned
           first load, so the offset is relative to S; otherwise it is
           relative to the aligned block at P.  */
  842   t = vandq_u8 (t, xmask);
  843   m = vpaddlq_u8 (t);
  844   m = vshlq_u16 (m, shift);
  845   found = vaddvq_u16 (m);
  846   return (((((uintptr_t) p) < (uintptr_t) s) ? s : (const uchar *)p)
  847           + __builtin_ctz (found));
  848 }
 849
 850 #elif defined (__ARM_NEON)
 851 #include "arm_neon.h"
 852
 853 static const uchar *
 854 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 855 {
       /* ARM NEON variant.  Returns a pointer to the first '\n', '\r',
          '\\' or '?' found at or after S.  END is never consulted: the
          loop below stops only once one of those characters is seen.  */
 856   const uint8x16_t repl_nl = vdupq_n_u8 ('\n');
 857   const uint8x16_t repl_cr = vdupq_n_u8 ('\r');
 858   const uint8x16_t repl_bs = vdupq_n_u8 ('\\');
 859   const uint8x16_t repl_qm = vdupq_n_u8 ('?');
       /* A different single bit for each byte of an 8-byte half.  ANDing
          the all-ones/all-zeros comparison results with this lets the
          adds in the loop pack them into a bitmask.  */
 860   const uint8x16_t xmask = (uint8x16_t) vdupq_n_u64 (0x8040201008040201ULL);
 861
 862   unsigned int misalign, found, mask;
 863   const uint8_t *p;
 864   uint8x16_t data;
 865
 866   /* Align the source pointer.  The load below therefore starts at the
          16-byte boundary at or before S and may include bytes that
          precede S.  */
 867   misalign = (uintptr_t)s & 15;
 868   p = (const uint8_t *)((uintptr_t)s & -16);
 869   data = vld1q_u8 (p);
 870
 871   /* Create a mask for the bytes that are valid within the first
 872      16-byte block.  The idea here is that the AND with the mask
 873      within the loop is "free", since we need some AND or TEST
 874      insn in order to set the flags for the branch anyway.
          Bits below MISALIGN are cleared, so matches in bytes that
          precede S are ignored.  */
 875   mask = (-1u << misalign) & 0xffff;
 876
 877   /* Main loop, processing 16 bytes at a time.  Enter in the middle so
          that the block already loaded above is tested with the partial
          MASK; later iterations load the next block and use a full
          mask.  */
 878   goto start;
 879
 880   do
 881     {
 882       uint8x8_t l;
 883       uint16x4_t m;
 884       uint32x2_t n;
 885       uint8x16_t t, u, v, w;
 886
 887       p += 16;
 888       data = vld1q_u8 (p);
 889       mask = 0xffff;
 890
 891     start:
           /* Compare every byte against each of the four characters and
              OR the results together, keeping only each byte's own bit
              from XMASK.  */
 892       t = vceqq_u8 (data, repl_nl);
 893       u = vceqq_u8 (data, repl_cr);
 894       v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
 895       w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
 896       t = vandq_u8 (vorrq_u8 (v, w), xmask);
           /* Successive pairwise adds fold those bits together.  After
              the shift/OR below, FOUND holds one bit per byte of the
              block, which is what the __builtin_ctz after the loop
              relies on.  */
 897       l = vpadd_u8 (vget_low_u8 (t), vget_high_u8 (t));
 898       m = vpaddl_u8 (l);
 899       n = vpaddl_u16 (m);
 900
 901       found = vget_lane_u32 ((uint32x2_t) vorr_u64 ((uint64x1_t) n,
 902               vshr_n_u64 ((uint64x1_t) n, 24)), 0);
 903       found &= mask;
 904     }
 905   while (!found);
 906
 907   /* FOUND contains 1 in bits for which we matched a relevant
 908      character.  Conversion to the byte index is trivial.  */
 909   found = __builtin_ctz (found);
 910   return (const uchar *)p + found;
 911 }
 912
 913 #else
 914
 915 /* We only have one accelerated alternative.  Use a direct call so that
 916    we encourage inlining.  */
 917
 918 #define search_line_fast  search_line_acc_char
 919
 920 #endif
 921
 922 /* Initialize the lexer if needed.  When HAVE_init_vectorized_lexer
        is defined this calls init_vectorized_lexer; otherwise the
        function body is empty.  */
 923
 924 void
 925 _cpp_init_lexer (void)
 926 {
 927 #ifdef HAVE_init_vectorized_lexer
 928   init_vectorized_lexer ();
 929 #endif
 930 }
 931
 932 /* Returns with a logical line that contains no escaped newlines or
 933    trigraphs.  This is a time-critical inner loop.

        The line starts at BUFFER->next_line; BUFFER->cur and
        BUFFER->line_base are set to that position and the note list is
        reset.  Trigraphs and backslash-newline sequences met while
        scanning are recorded with add_line_note.  On return a '\n' has
        been stored at the (possibly compacted) end of the line, a
        sentinel note has been added just past it, and BUFFER->next_line
        is one past the terminator at which the scan stopped.  */
 934 void
 935 _cpp_clean_line (cpp_reader *pfile)
 936 {
 937   cpp_buffer *buffer;
       /* S is the read cursor.  D is the write cursor, used once the line
          has to be rewritten in place.  */
 938   const uchar *s;
 939   uchar c, *d, *p;
 940
 941   buffer = pfile->buffer;
 942   buffer->cur_note = buffer->notes_used = 0;
 943   buffer->cur = buffer->line_base = buffer->next_line;
 944   buffer->need_line = false;
 945   s = buffer->next_line;
 946
 947   if (!buffer->from_stage3)
 948     {
           /* Position of the most recent backslash seen on the fast path,
              or NULL if there was none.  */
 949       const uchar *pbackslash = NULL;
 950
 951       /* Fast path.  This is the common case of an un-escaped line with
 952          no trigraphs.  The primary win here is by not writing any
 953          data back to memory until we have to.  */
 954       while (1)
 955         {
 956           /* Perform an optimized search for \n, \r, \\, ?.  */
 957           s = search_line_fast (s, buffer->rlimit);
 958
 959           c = *s;
 960           if (c == '\\')
 961             {
 962               /* Record the location of the backslash and continue.  */
 963               pbackslash = s++;
 964             }
 965           else if (__builtin_expect (c == '?', 0))
 966             {
 967               if (__builtin_expect (s[1] == '?', false)
 968                    && _cpp_trigraph_map[s[2]])
 969                 {
 970                   /* Have a trigraph.  We may or may not have to convert
 971                      it.  Add a line note regardless, for -Wtrigraphs.  */
 972                   add_line_note (buffer, s, s[2]);
 973                   if (CPP_OPTION (pfile, trigraphs))
 974                     {
 975                       /* We do, and that means we have to switch to the
 976                          slow path.  Overwrite the first '?' with the
                              replacement and leave S on the trigraph's
                              third character, so the slow path's
                              pre-increments resume right after it.  */
 977                       d = (uchar *) s;
 978                       *d = _cpp_trigraph_map[s[2]];
 979                       s += 2;
 980                       goto slow_path;
 981                     }
 982                 }
 983               /* Not a trigraph, or one that is not being converted.
                      Continue on fast-path.  */
 984               s++;
 985             }
 986           else
 987             break;
 988         }
 989
 990       /* This must be \r or \n.  We're either done, or we'll be forced
 991          to write back to the buffer and continue on the slow path.  */
 992       d = (uchar *) s;
 993
 994       if (__builtin_expect (s == buffer->rlimit, false))
 995         goto done;
 996
 997       /* DOS line ending? */
 998       if (__builtin_expect (c == '\r', false) && s[1] == '\n')
 999         {
1000           s++;
1001           if (s == buffer->rlimit)
1002             goto done;
1003         }
1004
1005       if (__builtin_expect (pbackslash == NULL, true))
1006         goto done;
1007
1008       /* Check for escaped newline.  Walk back from the line end over
              any horizontal space; the newline is escaped only if what
              precedes that space is the last backslash recorded.  */
1009       p = d;
1010       while (is_nvspace (p[-1]))
1011         p--;
1012       if (p - 1 != pbackslash)
1013         goto done;
1014
1015       /* Have an escaped newline; process it and proceed to
1016          the slow path.  The note type is ' ' when space separated
              the backslash from the newline and '\\' otherwise.  */
1017       add_line_note (buffer, p - 1, p != d ? ' ' : '\\');
           /* Back D up so that the slow path's first store (*++d) lands
              on the backslash.  next_line is used as the lower limit of
              the slow path's backward scan; it is reassigned at DONE.  */
1018       d = p - 2;
1019       buffer->next_line = p - 1;
1020
1021     slow_path:
           /* Copy the rest of the logical line down one character at a
              time, dropping each escaped newline and, when enabled,
              replacing each trigraph.  */
1022       while (1)
1023         {
1024           c = *++s;
1025           *++d = c;
1026
1027           if (c == '\n' || c == '\r')
1028             {
1029               /* Handle DOS line endings.  */
1030               if (c == '\r' && s != buffer->rlimit && s[1] == '\n')
1031                 s++;
1032               if (s == buffer->rlimit)
1033                 break;
1034
1035               /* Escaped?  */
1036               p = d;
1037               while (p != buffer->next_line && is_nvspace (p[-1]))
1038                 p--;
1039               if (p == buffer->next_line || p[-1] != '\\')
1040                 break;
1041
1042               add_line_note (buffer, p - 1, p != d ? ' ': '\\');
1043               d = p - 2;
1044               buffer->next_line = p - 1;
1045             }
1046           else if (c == '?' && s[1] == '?' && _cpp_trigraph_map[s[2]])
1047             {
1048               /* Add a note regardless, for the benefit of -Wtrigraphs.  */
1049               add_line_note (buffer, d, s[2]);
1050               if (CPP_OPTION (pfile, trigraphs))
1051                 {
1052                   *d = _cpp_trigraph_map[s[2]];
1053                   s += 2;
1054                 }
1055             }
1056         }
1057     }
1058   else
1059     {
           /* from_stage3 input gets no trigraph or escaped-newline
              handling here: just locate the end of the line.  */
1060       while (*s != '\n' && *s != '\r')
1061         s++;
1062       d = (uchar *) s;
1063
1064       /* Handle DOS line endings.  */
1065       if (*s == '\r' && s != buffer->rlimit && s[1] == '\n')
1066         s++;
1067     }
1068
1069  done:
       /* Terminate the cleaned line at the write cursor.  */
1070   *d = '\n';
1071   /* A sentinel note that should never be processed.  */
1072   add_line_note (buffer, d + 1, '\n');
1073   buffer->next_line = s + 1;
1074 }
1075
1076 /* Return true if the trigraph indicated by NOTE should be warned
1077    about in a comment.  */
1078 static bool
1079 warn_in_comment (cpp_reader *pfile, _cpp_line_note *note)
1080 {
1081   const uchar *p;
1082
1083   /* Within comments we don't warn about trigraphs, unless the
1084      trigraph forms an escaped newline, as that may change
1085      behavior.  */
1086   if (note->type != '/')
1087     return false;
1088
1089   /* If -trigraphs, then this was an escaped newline iff the next note
1090      is coincident.  */
1091   if (CPP_OPTION (pfile, trigraphs))
1092     return note[1].pos == note->pos;
1093
1094   /* Otherwise, see if this forms an escaped newline.  */
1095   p = note->pos + 3;
1096   while (is_nvspace (*p))
1097     p++;
1098
1099   /* There might have been escaped newlines between the trigraph and the
1100      newline we found.  Hence the position test.  */
1101   return (*p == '\n' && p < note[1].pos);
1102 }
1103
1104 /* Process the notes created by add_line_note as far as the current
1105    location.  */
1106 void
1107 _cpp_process_line_notes (cpp_reader *pfile, int in_comment)
1108 {
1109   cpp_buffer *buffer = pfile->buffer;
1110
1111   for (;;)
1112     {
1113       _cpp_line_note *note = &buffer->notes[buffer->cur_note];
1114       unsigned int col;
1115
1116       if (note->pos > buffer->cur)
1117         break;
1118
1119       buffer->cur_note++;
1120       col = CPP_BUF_COLUMN (buffer, note->pos + 1);
1121
1122       if (note->type == '\\' || note->type == ' ')
1123         {
1124           if (note->type == ' ' && !in_comment)
1125             cpp_error_with_line (pfile, CPP_DL_WARNING, pfile->line_table->highest_line, col,
1126                                  "backslash and newline separated by space");
1127
1128           if (buffer->next_line > buffer->rlimit)
1129             {
1130               cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line, col,
1131                                    "backslash-newline at end of file");
1132               /* Prevent "no newline at end of file" warning.  */
1133               buffer->next_line = buffer->rlimit;
1134             }
1135
1136           buffer->line_base = note->pos;
1137           CPP_INCREMENT_LINE (pfile, 0);
1138         }
1139       else if (_cpp_trigraph_map[note->type])
1140         {
1141           if (CPP_OPTION (pfile, warn_trigraphs)
1142               && (!in_comment || warn_in_comment (pfile, note)))
1143             {
1144               if (CPP_OPTION (pfile, trigraphs))
1145                 cpp_warning_with_line (pfile, CPP_W_TRIGRAPHS,
1146                                        pfile->line_table->highest_line, col,
1147                                        "trigraph ??%c converted to %c",
1148                                        note->type,
1149                                        (int) _cpp_trigraph_map[note->type]);
1150               else
1151                 {
1152                   cpp_warning_with_line
1153                     (pfile, CPP_W_TRIGRAPHS,
1154                      pfile->line_table->highest_line, col,
1155                      "trigraph ??%c ignored, use -trigraphs to enable",
1156                      note->type);
1157                 }
1158             }
1159         }
1160       else if (note->type == 0)
1161         /* Already processed in lex_raw_string.  */;
1162       else
1163         abort ();
1164     }
1165 }
1166
1167 /* Skip a C-style block comment.  We find the end of the comment by
1168    seeing if an asterisk is before every '/' we encounter.  Returns
1169    nonzero if comment terminated by EOF, zero otherwise.
1170
1171    Buffer->cur points to the initial asterisk of the comment.  */
1172 bool
1173 _cpp_skip_block_comment (cpp_reader *pfile)
1174 {
1175   cpp_buffer *buffer = pfile->buffer;
1176   const uchar *cur = buffer->cur;
1177   uchar c;
1178
1179   cur++;
1180   if (*cur == '/')
1181     cur++;
1182
1183   for (;;)
1184     {
1185       /* People like decorating comments with '*', so check for '/'
1186          instead for efficiency.  */
1187       c = *cur++;
1188
1189       if (c == '/')
1190         {
1191           if (cur[-2] == '*')
1192             break;
1193
1194           /* Warn about potential nested comments, but not if the '/'
1195              comes immediately before the true comment delimiter.
1196              Don't bother to get it right across escaped newlines.  */
1197           if (CPP_OPTION (pfile, warn_comments)
1198               && cur[0] == '*' && cur[1] != '/')
1199             {
1200               buffer->cur = cur;
1201               cpp_warning_with_line (pfile, CPP_W_COMMENTS,
1202                                      pfile->line_table->highest_line,
1203                                      CPP_BUF_COL (buffer),
1204                                      "\"/*\" within comment");
1205             }
1206         }
1207       else if (c == '\n')
1208         {
1209           unsigned int cols;
1210           buffer->cur = cur - 1;
1211           _cpp_process_line_notes (pfile, true);
1212           if (buffer->next_line >= buffer->rlimit)
1213             return true;
1214           _cpp_clean_line (pfile);
1215
1216           cols = buffer->next_line - buffer->line_base;
1217           CPP_INCREMENT_LINE (pfile, cols);
1218
1219           cur = buffer->cur;
1220         }
1221     }
1222
1223   buffer->cur = cur;
1224   _cpp_process_line_notes (pfile, true);
1225   return false;
1226 }
1227
1228 /* Skip a C++ line comment, leaving buffer->cur pointing to the
1229    terminating newline.  Handles escaped newlines.  Returns nonzero
1230    if a multiline comment.  */
1231 static int
1232 skip_line_comment (cpp_reader *pfile)
1233 {
1234   cpp_buffer *buffer = pfile->buffer;
1235   location_t orig_line = pfile->line_table->highest_line;
1236
1237   while (*buffer->cur != '\n')
1238     buffer->cur++;
1239
1240   _cpp_process_line_notes (pfile, true);
1241   return orig_line != pfile->line_table->highest_line;
1242 }
1243
1244 /* Skips whitespace, saving the next non-whitespace character.  */
1245 static void
1246 skip_whitespace (cpp_reader *pfile, cppchar_t c)
1247 {
1248   cpp_buffer *buffer = pfile->buffer;
1249   bool saw_NUL = false;
1250
1251   do
1252     {
1253       /* Horizontal space always OK.  */
1254       if (c == ' ' || c == '\t')
1255         ;
1256       /* Just \f \v or \0 left.  */
1257       else if (c == '\0')
1258         saw_NUL = true;
1259       else if (pfile->state.in_directive && CPP_PEDANTIC (pfile))
1260         cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line,
1261                              CPP_BUF_COL (buffer),
1262                              "%s in preprocessing directive",
1263                              c == '\f' ? "form feed" : "vertical tab");
1264
1265       c = *buffer->cur++;
1266     }
1267   /* We only want non-vertical space, i.e. ' ' \t \f \v \0.  */
1268   while (is_nvspace (c));
1269
1270   if (saw_NUL)
1271     cpp_error (pfile, CPP_DL_WARNING, "null character(s) ignored");
1272
1273   buffer->cur--;
1274 }
1275
1276 /* See if the characters of a number token are valid in a name (no
1277    '.', '+' or '-').  */
1278 static int
1279 name_p (cpp_reader *pfile, const cpp_string *string)
1280 {
1281   unsigned int i;
1282
1283   for (i = 0; i < string->len; i++)
1284     if (!is_idchar (string->text[i]))
1285       return 0;
1286
1287   return 1;
1288 }
1289
1290 /* After parsing an identifier or other sequence, produce a warning about
1291    sequences not in NFC/NFKC.  */
1292 static void
1293 warn_about_normalization (cpp_reader *pfile,
1294                           const cpp_token *token,
1295                           const struct normalize_state *s)
1296 {
1297   if (CPP_OPTION (pfile, warn_normalize) < NORMALIZE_STATE_RESULT (s)
1298       && !pfile->state.skipping)
1299     {
1300       /* Make sure that the token is printed using UCNs, even
1301          if we'd otherwise happily print UTF-8.  */
1302       unsigned char *buf = XNEWVEC (unsigned char, cpp_token_len (token));
1303       size_t sz;
1304
1305       sz = cpp_spell_token (pfile, token, buf, false) - buf;
1306       if (NORMALIZE_STATE_RESULT (s) == normalized_C)
1307         cpp_warning_with_line (pfile, CPP_W_NORMALIZE, token->src_loc, 0,
1308                                "`%.*s' is not in NFKC", (int) sz, buf);
1309       else
1310         cpp_warning_with_line (pfile, CPP_W_NORMALIZE, token->src_loc, 0,
1311                                "`%.*s' is not in NFC", (int) sz, buf);
1312       free (buf);
1313     }
1314 }
1315
1316 static const cppchar_t utf8_signifier = 0xC0;
1317
1318 /* Returns TRUE if the sequence starting at buffer->cur is valid in
1319    an identifier.  FIRST is TRUE if this starts an identifier.  */
1320 static bool
1321 forms_identifier_p (cpp_reader *pfile, int first,
1322                     struct normalize_state *state)
1323 {
1324   cpp_buffer *buffer = pfile->buffer;
1325
1326   if (*buffer->cur == '$')
1327     {
1328       if (!CPP_OPTION (pfile, dollars_in_ident))
1329         return false;
1330
1331       buffer->cur++;
1332       if (CPP_OPTION (pfile, warn_dollars) && !pfile->state.skipping)
1333         {
1334           CPP_OPTION (pfile, warn_dollars) = 0;
1335           cpp_error (pfile, CPP_DL_PEDWARN, "'$' in identifier or number");
1336         }
1337
1338       return true;
1339     }
1340
1341   /* Is this a syntactically valid UCN or a valid UTF-8 char?  */
1342   if (CPP_OPTION (pfile, extended_identifiers))
1343     {
1344       cppchar_t s;
1345       if (*buffer->cur >= utf8_signifier)
1346         {
1347           if (_cpp_valid_utf8 (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
1348                                state, &s))
1349             return true;
1350         }
1351       else if (*buffer->cur == '\\'
1352                && (buffer->cur[1] == 'u' || buffer->cur[1] == 'U'))
1353         {
1354           buffer->cur += 2;
1355           if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
1356                               state, &s, NULL, NULL))
1357             return true;
1358           buffer->cur -= 2;
1359         }
1360     }
1361
1362   return false;
1363 }
1364
1365 /* Helper function to issue error about improper __VA_OPT__ use.  */
1366 static void
1367 maybe_va_opt_error (cpp_reader *pfile)
1368 {
1369   if (CPP_PEDANTIC (pfile) && !CPP_OPTION (pfile, va_opt))
1370     {
1371       /* __VA_OPT__ should not be accepted at all, but allow it in
1372          system headers.  */
1373       if (!cpp_in_system_header (pfile))
1374         cpp_error (pfile, CPP_DL_PEDWARN,
1375                    "__VA_OPT__ is not available until C++2a");
1376     }
1377   else if (!pfile->state.va_args_ok)
1378     {
1379       /* __VA_OPT__ should only appear in the replacement list of a
1380          variadic macro.  */
1381       cpp_error (pfile, CPP_DL_PEDWARN,
1382                  "__VA_OPT__ can only appear in the expansion"
1383                  " of a C++2a variadic macro");
1384     }
1385 }
1386
1387 /* Helper function to get the cpp_hashnode of the identifier BASE.  */
1388 static cpp_hashnode *
1389 lex_identifier_intern (cpp_reader *pfile, const uchar *base)
1390 {
1391   cpp_hashnode *result;
1392   const uchar *cur;
1393   unsigned int len;
1394   unsigned int hash = HT_HASHSTEP (0, *base);
1395
1396   cur = base + 1;
1397   while (ISIDNUM (*cur))
1398     {
1399       hash = HT_HASHSTEP (hash, *cur);
1400       cur++;
1401     }
1402   len = cur - base;
1403   hash = HT_HASHFINISH (hash, len);
1404   result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
1405                                               base, len, hash, HT_ALLOC));
1406
1407   /* Rarely, identifiers require diagnostics when lexed.  */
1408   if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
1409                         && !pfile->state.skipping, 0))
1410     {
1411       /* It is allowed to poison the same identifier twice.  */
1412       if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
1413         cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
1414                    NODE_NAME (result));
1415
1416       /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
1417          replacement list of a variadic macro.  */
1418       if (result == pfile->spec_nodes.n__VA_ARGS__
1419           && !pfile->state.va_args_ok)
1420         {
1421           if (CPP_OPTION (pfile, cplusplus))
1422             cpp_error (pfile, CPP_DL_PEDWARN,
1423                        "__VA_ARGS__ can only appear in the expansion"
1424                        " of a C++11 variadic macro");
1425           else
1426             cpp_error (pfile, CPP_DL_PEDWARN,
1427                        "__VA_ARGS__ can only appear in the expansion"
1428                        " of a C99 variadic macro");
1429         }
1430
1431       if (result == pfile->spec_nodes.n__VA_OPT__)
1432         maybe_va_opt_error (pfile);
1433
1434       /* For -Wc++-compat, warn about use of C++ named operators.  */
1435       if (result->flags & NODE_WARN_OPERATOR)
1436         cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
1437                      "identifier \"%s\" is a special operator name in C++",
1438                      NODE_NAME (result));
1439     }
1440
1441   return result;
1442 }
1443
1444 /* Get the cpp_hashnode of an identifier specified by NAME in
1445    the current cpp_reader object.  If none is found, NULL is returned.  */
1446 cpp_hashnode *
1447 _cpp_lex_identifier (cpp_reader *pfile, const char *name)
1448 {
1449   cpp_hashnode *result;
1450   result = lex_identifier_intern (pfile, (uchar *) name);
1451   return result;
1452 }
1453
1454 /* Lex an identifier starting at BUFFER->CUR - 1.  */
1455 static cpp_hashnode *
1456 lex_identifier (cpp_reader *pfile, const uchar *base, bool starts_ucn,
1457                 struct normalize_state *nst, cpp_hashnode **spelling)
1458 {
1459   cpp_hashnode *result;
1460   const uchar *cur;
1461   unsigned int len;
1462   unsigned int hash = HT_HASHSTEP (0, *base);
1463
1464   cur = pfile->buffer->cur;
1465   if (! starts_ucn)
1466     {
1467       while (ISIDNUM (*cur))
1468         {
1469           hash = HT_HASHSTEP (hash, *cur);
1470           cur++;
1471         }
1472       NORMALIZE_STATE_UPDATE_IDNUM (nst, *(cur - 1));
1473     }
1474   pfile->buffer->cur = cur;
1475   if (starts_ucn || forms_identifier_p (pfile, false, nst))
1476     {
1477       /* Slower version for identifiers containing UCNs
1478          or extended chars (including $).  */
1479       do {
1480         while (ISIDNUM (*pfile->buffer->cur))
1481           {
1482             NORMALIZE_STATE_UPDATE_IDNUM (nst, *pfile->buffer->cur);
1483             pfile->buffer->cur++;
1484           }
1485       } while (forms_identifier_p (pfile, false, nst));
1486       result = _cpp_interpret_identifier (pfile, base,
1487                                           pfile->buffer->cur - base);
1488       *spelling = cpp_lookup (pfile, base, pfile->buffer->cur - base);
1489     }
1490   else
1491     {
1492       len = cur - base;
1493       hash = HT_HASHFINISH (hash, len);
1494
1495       result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
1496                                                   base, len, hash, HT_ALLOC));
1497       *spelling = result;
1498     }
1499
1500   /* Rarely, identifiers require diagnostics when lexed.  */
1501   if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
1502                         && !pfile->state.skipping, 0))
1503     {
1504       /* It is allowed to poison the same identifier twice.  */
1505       if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
1506         cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
1507                    NODE_NAME (result));
1508
1509       /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
1510          replacement list of a variadic macro.  */
1511       if (result == pfile->spec_nodes.n__VA_ARGS__
1512           && !pfile->state.va_args_ok)
1513         {
1514           if (CPP_OPTION (pfile, cplusplus))
1515             cpp_error (pfile, CPP_DL_PEDWARN,
1516                        "__VA_ARGS__ can only appear in the expansion"
1517                        " of a C++11 variadic macro");
1518           else
1519             cpp_error (pfile, CPP_DL_PEDWARN,
1520                        "__VA_ARGS__ can only appear in the expansion"
1521                        " of a C99 variadic macro");
1522         }
1523
1524       /* __VA_OPT__ should only appear in the replacement list of a
1525          variadic macro.  */
1526       if (result == pfile->spec_nodes.n__VA_OPT__)
1527         maybe_va_opt_error (pfile);
1528
1529       /* For -Wc++-compat, warn about use of C++ named operators.  */
1530       if (result->flags & NODE_WARN_OPERATOR)
1531         cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
1532                      "identifier \"%s\" is a special operator name in C++",
1533                      NODE_NAME (result));
1534     }
1535
1536   return result;
1537 }
1538
1539 /* Lex a number to NUMBER starting at BUFFER->CUR - 1.  */
1540 static void
1541 lex_number (cpp_reader *pfile, cpp_string *number,
1542             struct normalize_state *nst)
1543 {
1544   const uchar *cur;
1545   const uchar *base;
1546   uchar *dest;
1547
1548   base = pfile->buffer->cur - 1;
1549   do
1550     {
1551       cur = pfile->buffer->cur;
1552
1553       /* N.B. ISIDNUM does not include $.  */
1554       while (ISIDNUM (*cur) || *cur == '.' || DIGIT_SEP (*cur)
1555              || VALID_SIGN (*cur, cur[-1]))
1556         {
1557           NORMALIZE_STATE_UPDATE_IDNUM (nst, *cur);
1558           cur++;
1559         }
1560       /* A number can't end with a digit separator.  */
1561       while (cur > pfile->buffer->cur && DIGIT_SEP (cur[-1]))
1562         --cur;
1563
1564       pfile->buffer->cur = cur;
1565     }
1566   while (forms_identifier_p (pfile, false, nst));
1567
1568   number->len = cur - base;
1569   dest = _cpp_unaligned_alloc (pfile, number->len + 1);
1570   memcpy (dest, base, number->len);
1571   dest[number->len] = '\0';
1572   number->text = dest;
1573 }
1574
1575 /* Create a token of type TYPE with a literal spelling.  */
1576 static void
1577 create_literal (cpp_reader *pfile, cpp_token *token, const uchar *base,
1578                 unsigned int len, enum cpp_ttype type)
1579 {
1580   uchar *dest = _cpp_unaligned_alloc (pfile, len + 1);
1581
1582   memcpy (dest, base, len);
1583   dest[len] = '\0';
1584   token->type = type;
1585   token->val.str.len = len;
1586   token->val.str.text = dest;
1587 }
1588
1589 /* Subroutine of lex_raw_string: Append LEN chars from BASE to the buffer
1590    sequence from *FIRST_BUFF_P to LAST_BUFF_P.  */
1591
1592 static void
1593 bufring_append (cpp_reader *pfile, const uchar *base, size_t len,
1594                 _cpp_buff **first_buff_p, _cpp_buff **last_buff_p)
1595 {
1596   _cpp_buff *first_buff = *first_buff_p;
1597   _cpp_buff *last_buff = *last_buff_p;
1598
1599   if (first_buff == NULL)
1600     first_buff = last_buff = _cpp_get_buff (pfile, len);
1601   else if (len > BUFF_ROOM (last_buff))
1602     {
1603       size_t room = BUFF_ROOM (last_buff);
1604       memcpy (BUFF_FRONT (last_buff), base, room);
1605       BUFF_FRONT (last_buff) += room;
1606       base += room;
1607       len -= room;
1608       last_buff = _cpp_append_extend_buff (pfile, last_buff, len);
1609     }
1610
1611   memcpy (BUFF_FRONT (last_buff), base, len);
1612   BUFF_FRONT (last_buff) += len;
1613
1614   *first_buff_p = first_buff;
1615   *last_buff_p = last_buff;
1616 }
1617
1618
1619 /* Returns true if a macro has been defined.
1620    This might not work if compile with -save-temps,
1621    or preprocess separately from compilation.  */
1622
1623 static bool
1624 is_macro(cpp_reader *pfile, const uchar *base)
1625 {
1626   const uchar *cur = base;
1627   if (! ISIDST (*cur))
1628     return false;
1629   unsigned int hash = HT_HASHSTEP (0, *cur);
1630   ++cur;
1631   while (ISIDNUM (*cur))
1632     {
1633       hash = HT_HASHSTEP (hash, *cur);
1634       ++cur;
1635     }
1636   hash = HT_HASHFINISH (hash, cur - base);
1637
1638   cpp_hashnode *result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
1639                                         base, cur - base, hash, HT_NO_INSERT));
1640
1641   return result && cpp_macro_p (result);
1642 }
1643
1644 /* Returns true if a literal suffix does not have the expected form
1645    and is defined as a macro.  */
1646
1647 static bool
1648 is_macro_not_literal_suffix(cpp_reader *pfile, const uchar *base)
1649 {
1650   /* User-defined literals outside of namespace std must start with a single
1651      underscore, so assume anything of that form really is a UDL suffix.
1652      We don't need to worry about UDLs defined inside namespace std because
1653      their names are reserved, so cannot be used as macro names in valid
1654      programs.  */
1655   if (base[0] == '_' && base[1] != '_')
1656     return false;
1657   return is_macro (pfile, base);
1658 }
1659
1660 /* Lexes a raw string.  The stored string contains the spelling, including
1661    double quotes, delimiter string, '(' and ')', any leading
1662    'L', 'u', 'U' or 'u8' and 'R' modifier.  It returns the type of the
1663    literal, or CPP_OTHER if it was not properly terminated.
1664
1665    The spelling is NUL-terminated, but it is not guaranteed that this
1666    is the first NUL since embedded NULs are preserved.  */
1667
1668 static void
1669 lex_raw_string (cpp_reader *pfile, cpp_token *token, const uchar *base,
1670                 const uchar *cur)
1671 {
1672   uchar raw_prefix[17];
1673   uchar temp_buffer[18];
1674   const uchar *orig_base;
1675   unsigned int raw_prefix_len = 0, raw_suffix_len = 0;
1676   enum raw_str_phase { RAW_STR_PREFIX, RAW_STR, RAW_STR_SUFFIX };
1677   raw_str_phase phase = RAW_STR_PREFIX;
1678   enum cpp_ttype type;
1679   size_t total_len = 0;
1680   /* Index into temp_buffer during phases other than RAW_STR,
1681      during RAW_STR phase 17 to tell BUF_APPEND that nothing should
1682      be appended to temp_buffer.  */
1683   size_t temp_buffer_len = 0;
1684   _cpp_buff *first_buff = NULL, *last_buff = NULL;
1685   size_t raw_prefix_start;
1686   _cpp_line_note *note = &pfile->buffer->notes[pfile->buffer->cur_note];
1687
1688   type = (*base == 'L' ? CPP_WSTRING :
1689           *base == 'U' ? CPP_STRING32 :
1690           *base == 'u' ? (base[1] == '8' ? CPP_UTF8STRING : CPP_STRING16)
1691           : CPP_STRING);
1692
1693 #define BUF_APPEND(STR,LEN)                                     \
1694       do {                                                      \
1695         bufring_append (pfile, (const uchar *)(STR), (LEN),     \
1696                         &first_buff, &last_buff);               \
1697         total_len += (LEN);                                     \
1698         if (__builtin_expect (temp_buffer_len < 17, 0)          \
1699             && (const uchar *)(STR) != base                     \
1700             && (LEN) <= 2)                                      \
1701           {                                                     \
1702             memcpy (temp_buffer + temp_buffer_len,              \
1703                     (const uchar *)(STR), (LEN));               \
1704             temp_buffer_len += (LEN);                           \
1705           }                                                     \
1706       } while (0)
1707
1708   orig_base = base;
1709   ++cur;
1710   raw_prefix_start = cur - base;
1711   for (;;)
1712     {
1713       cppchar_t c;
1714
1715       /* If we previously performed any trigraph or line splicing
1716          transformations, undo them in between the opening and closing
1717          double quote.  */
1718       while (note->pos < cur)
1719         ++note;
1720       for (; note->pos == cur; ++note)
1721         {
1722           switch (note->type)
1723             {
1724             case '\\':
1725             case ' ':
1726               /* Restore backslash followed by newline.  */
1727               BUF_APPEND (base, cur - base);
1728               base = cur;
1729               BUF_APPEND ("\\", 1);
1730             after_backslash:
1731               if (note->type == ' ')
1732                 {
1733                   /* GNU backslash whitespace newline extension.  FIXME
1734                      could be any sequence of non-vertical space.  When we
1735                      can properly restore any such sequence, we should mark
1736                      this note as handled so _cpp_process_line_notes
1737                      doesn't warn.  */
1738                   BUF_APPEND (" ", 1);
1739                 }
1740
1741               BUF_APPEND ("\n", 1);
1742               break;
1743
1744             case 0:
1745               /* Already handled.  */
1746               break;
1747
1748             default:
1749               if (_cpp_trigraph_map[note->type])
1750                 {
1751                   /* Don't warn about this trigraph in
1752                      _cpp_process_line_notes, since trigraphs show up as
1753                      trigraphs in raw strings.  */
1754                   uchar type = note->type;
1755                   note->type = 0;
1756
1757                   if (!CPP_OPTION (pfile, trigraphs))
1758                     /* If we didn't convert the trigraph in the first
1759                        place, don't do anything now either.  */
1760                     break;
1761
1762                   BUF_APPEND (base, cur - base);
1763                   base = cur;
1764                   BUF_APPEND ("??", 2);
1765
1766                   /* ??/ followed by newline gets two line notes, one for
1767                      the trigraph and one for the backslash/newline.  */
1768                   if (type == '/' && note[1].pos == cur)
1769                     {
1770                       if (note[1].type != '\\'
1771                           && note[1].type != ' ')
1772                         abort ();
1773                       BUF_APPEND ("/", 1);
1774                       ++note;
1775                       goto after_backslash;
1776                     }
1777                   else
1778                     {
1779                       /* Skip the replacement character.  */
1780                       base = ++cur;
1781                       BUF_APPEND (&type, 1);
1782                       c = type;
1783                       goto check_c;
1784                     }
1785                 }
1786               else
1787                 abort ();
1788               break;
1789             }
1790         }
1791       c = *cur++;
1792       if (__builtin_expect (temp_buffer_len < 17, 0))
1793         temp_buffer[temp_buffer_len++] = c;
1794
1795      check_c:
1796       if (phase == RAW_STR_PREFIX)
1797         {
1798           while (raw_prefix_len < temp_buffer_len)
1799             {
1800               raw_prefix[raw_prefix_len] = temp_buffer[raw_prefix_len];
1801               switch (raw_prefix[raw_prefix_len])
1802                 {
1803                 case ' ': case '(': case ')': case '\\': case '\t':
1804                 case '\v': case '\f': case '\n': default:
1805                   break;
1806                 /* Basic source charset except the above chars.  */
1807                 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
1808                 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
1809                 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
1810                 case 's': case 't': case 'u': case 'v': case 'w': case 'x':
1811                 case 'y': case 'z':
1812                 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
1813                 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
1814                 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
1815                 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
1816                 case 'Y': case 'Z':
1817                 case '0': case '1': case '2': case '3': case '4': case '5':
1818                 case '6': case '7': case '8': case '9':
1819                 case '_': case '{': case '}': case '#': case '[': case ']':
1820                 case '<': case '>': case '%': case ':': case ';': case '.':
1821                 case '?': case '*': case '+': case '-': case '/': case '^':
1822                 case '&': case '|': case '~': case '!': case '=': case ',':
1823                 case '"': case '\'':
1824                   if (raw_prefix_len < 16)
1825                     {
1826                       raw_prefix_len++;
1827                       continue;
1828                     }
1829                   break;
1830                 }
1831
1832               if (raw_prefix[raw_prefix_len] != '(')
1833                 {
1834                   int col = CPP_BUF_COLUMN (pfile->buffer, cur) + 1;
1835                   if (raw_prefix_len == 16)
1836                     cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
1837                                          col, "raw string delimiter longer "
1838                                               "than 16 characters");
1839                   else if (raw_prefix[raw_prefix_len] == '\n')
1840                     cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
1841                                          col, "invalid new-line in raw "
1842                                               "string delimiter");
1843                   else
1844                     cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
1845                                          col, "invalid character '%c' in "
1846                                               "raw string delimiter",
1847                                          (int) raw_prefix[raw_prefix_len]);
1848                   pfile->buffer->cur = orig_base + raw_prefix_start - 1;
1849                   create_literal (pfile, token, orig_base,
1850                                   raw_prefix_start - 1, CPP_OTHER);
1851                   if (first_buff)
1852                     _cpp_release_buff (pfile, first_buff);
1853                   return;
1854                 }
1855               raw_prefix[raw_prefix_len] = '"';
1856               phase = RAW_STR;
1857               /* Nothing should be appended to temp_buffer during
1858                  RAW_STR phase.  */
1859               temp_buffer_len = 17;
1860               break;
1861             }
1862           continue;
1863         }
1864       else if (phase == RAW_STR_SUFFIX)
1865         {
1866           while (raw_suffix_len <= raw_prefix_len
1867                  && raw_suffix_len < temp_buffer_len
1868                  && temp_buffer[raw_suffix_len] == raw_prefix[raw_suffix_len])
1869             raw_suffix_len++;
1870           if (raw_suffix_len > raw_prefix_len)
1871             break;
1872           if (raw_suffix_len == temp_buffer_len)
1873             continue;
1874           phase = RAW_STR;
1875           /* Nothing should be appended to temp_buffer during
1876              RAW_STR phase.  */
1877           temp_buffer_len = 17;
1878         }
1879       if (c == ')')
1880         {
1881           phase = RAW_STR_SUFFIX;
1882           raw_suffix_len = 0;
1883           temp_buffer_len = 0;
1884         }
1885       else if (c == '\n')
1886         {
1887           if (pfile->state.in_directive
1888               || (pfile->state.parsing_args
1889                   && pfile->buffer->next_line >= pfile->buffer->rlimit))
1890             {
1891               cur--;
1892               type = CPP_OTHER;
1893               cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, 0,
1894                                    "unterminated raw string");
1895               break;
1896             }
1897
1898           BUF_APPEND (base, cur - base);
1899
1900           if (pfile->buffer->cur < pfile->buffer->rlimit)
1901             CPP_INCREMENT_LINE (pfile, 0);
1902           pfile->buffer->need_line = true;
1903
1904           pfile->buffer->cur = cur-1;
1905           _cpp_process_line_notes (pfile, false);
1906           if (!_cpp_get_fresh_line (pfile))
1907             {
1908               location_t src_loc = token->src_loc;
1909               token->type = CPP_EOF;
1910               /* Tell the compiler the line number of the EOF token.  */
1911               token->src_loc = pfile->line_table->highest_line;
1912               token->flags = BOL;
1913               if (first_buff != NULL)
1914                 _cpp_release_buff (pfile, first_buff);
1915               cpp_error_with_line (pfile, CPP_DL_ERROR, src_loc, 0,
1916                                    "unterminated raw string");
1917               return;
1918             }
1919
1920           cur = base = pfile->buffer->cur;
1921           note = &pfile->buffer->notes[pfile->buffer->cur_note];
1922         }
1923     }
1924
1925   if (CPP_OPTION (pfile, user_literals))
1926     {
1927       /* If a string format macro, say from inttypes.h, is placed touching
1928          a string literal it could be parsed as a C++11 user-defined string
1929          literal thus breaking the program.  */
1930       if (is_macro_not_literal_suffix (pfile, cur))
1931         {
1932           /* Raise a warning, but do not consume subsequent tokens.  */
1933           if (CPP_OPTION (pfile, warn_literal_suffix) && !pfile->state.skipping)
1934             cpp_warning_with_line (pfile, CPP_W_LITERAL_SUFFIX,
1935                                    token->src_loc, 0,
1936                                    "invalid suffix on literal; C++11 requires "
1937                                    "a space between literal and string macro");
1938         }
1939       /* Grab user defined literal suffix.  */
1940       else if (ISIDST (*cur))
1941         {
1942           type = cpp_userdef_string_add_type (type);
1943           ++cur;
1944
1945           while (ISIDNUM (*cur))
1946             ++cur;
1947         }
1948     }
1949
1950   pfile->buffer->cur = cur;
1951   if (first_buff == NULL)
1952     create_literal (pfile, token, base, cur - base, type);
1953   else
1954     {
1955       uchar *dest = _cpp_unaligned_alloc (pfile, total_len + (cur - base) + 1);
1956
1957       token->type = type;
1958       token->val.str.len = total_len + (cur - base);
1959       token->val.str.text = dest;
1960       last_buff = first_buff;
1961       while (last_buff != NULL)
1962         {
1963           memcpy (dest, last_buff->base,
1964                   BUFF_FRONT (last_buff) - last_buff->base);
1965           dest += BUFF_FRONT (last_buff) - last_buff->base;
1966           last_buff = last_buff->next;
1967         }
1968       _cpp_release_buff (pfile, first_buff);
1969       memcpy (dest, base, cur - base);
1970       dest[cur - base] = '\0';
1971     }
1972 }
1973
1974 /* Lex a string literal, a character constant or an angle-bracketed
1975    header name whose spelling starts at BASE.  The stored spelling
1976    includes the opening quote and any leading 'L', 'u', 'U' or 'u8';
1977    when an 'R' modifier follows, the work is passed to lex_raw_string.
1978    The literal is created with its own type, or with CPP_OTHER if it
1979    was not properly terminated; an unterminated header name instead
1980    sets TOKEN's type to CPP_LESS so it is relexed as normal tokens.
1981    The spelling is NUL-terminated, but embedded NULs are preserved, so
1982    that NUL is not guaranteed to be the first one.  */
1983 static void
1984 lex_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
1985 {
1986   const uchar *scan = base;
1987   cppchar_t delim = *scan++;
1988   enum cpp_ttype type;
1989   bool has_nul = false;
1990
1991   /* Step over an encoding prefix: L, U, u or u8.  */
1992   if (delim == 'L' || delim == 'U')
1993     delim = *scan++;
1994   else if (delim == 'u')
1995     {
1996       delim = *scan++;
1997       if (delim == '8')
1998         delim = *scan++;
1999     }
2000
2001   if (delim == 'R')
2002     {
2003       lex_raw_string (pfile, token, base, scan);
2004       return;
2005     }
2006
2007   /* Pick the token type from the opening quote and the prefix.  */
2008   if (delim == '"')
2009     type = (*base == 'L' ? CPP_WSTRING
2010             : *base == 'U' ? CPP_STRING32
2011             : *base != 'u' ? CPP_STRING
2012             : base[1] == '8' ? CPP_UTF8STRING : CPP_STRING16);
2013   else if (delim == '\'')
2014     type = (*base == 'L' ? CPP_WCHAR
2015             : *base == 'U' ? CPP_CHAR32
2016             : *base != 'u' ? CPP_CHAR
2017             : base[1] == '8' ? CPP_UTF8CHAR : CPP_CHAR16);
2018   else
2019     {
2020       type = CPP_HEADER_NAME;
2021       delim = '>';
2022     }
2023
2024   for (;;)
2025     {
2026       cppchar_t c = *scan++;
2027
2028       /* In #include-style directives, terminators are not escapable.  */
2029       if (c == '\\' && !pfile->state.angled_headers && *scan != '\n')
2030         {
2031           scan++;
2032           continue;
2033         }
2034       if (c == delim)
2035         break;
2036       if (c == '\0')
2037         has_nul = true;
2038       if (c != '\n')
2039         continue;
2040
2041       /* The line ended first.  Unmatched quotes always yield undefined
2042          behavior, but greedy lexing means that an apparently unterminated
2043          header name may really be a legitimate sequence of tokens.  */
2044       scan--;
2045       if (delim == '>')
2046         {
2047           token->type = CPP_LESS;
2048           return;
2049         }
2050       type = CPP_OTHER;
2051       break;
2052     }
2053
2054   if (has_nul && !pfile->state.skipping)
2055     cpp_error (pfile, CPP_DL_WARNING,
2056                "null character(s) preserved in literal");
2057   if (type == CPP_OTHER && CPP_OPTION (pfile, lang) != CLK_ASM)
2058     cpp_error (pfile, CPP_DL_PEDWARN, "missing terminating %c character",
2059                (int) delim);
2060
2061   /* A macro touching the literal, say a string format macro from
2062      inttypes.h, could be parsed as a C++11 user-defined literal suffix.  */
2063   bool udl = CPP_OPTION (pfile, user_literals);
2064   if (udl && is_macro_not_literal_suffix (pfile, scan))
2065     {
2066       /* Raise a warning, but do not consume subsequent tokens.  */
2067       if (CPP_OPTION (pfile, warn_literal_suffix) && !pfile->state.skipping)
2068         cpp_warning_with_line (pfile, CPP_W_LITERAL_SUFFIX, token->src_loc, 0,
2069                                "invalid suffix on literal; C++11 requires "
2070                                "a space between literal and string macro");
2071     }
2072   else if (udl && ISIDST (*scan))
2073     {
2074       /* Grab user defined literal suffix.  */
2075       type = cpp_userdef_string_add_type (cpp_userdef_char_add_type (type));
2076       do
2077         ++scan;
2078       while (ISIDNUM (*scan));
2079     }
2080   else if (!udl && CPP_OPTION (pfile, cpp_warn_cxx11_compat)
2081            && is_macro (pfile, scan) && !pfile->state.skipping)
2082     cpp_warning_with_line (pfile, CPP_W_CXX11_COMPAT,
2083                            token->src_loc, 0, "C++11 requires a space "
2084                            "between string literal and macro");
2085
2086   pfile->buffer->cur = scan;
2087   create_literal (pfile, token, base, scan - base, type);
2088 }
2089
2090 /* Return PFILE's comment table; clients must not assume any ordering.  */
2091 cpp_comment_table *
2092 cpp_get_comments (cpp_reader *pfile)
2093 {
2094   cpp_comment_table *table = &pfile->comments;
2095   return table;
2096 }
2097
2098 /* Append a copy of TOKEN's comment text, together with its source
2099    location, to the end of PFILE's comment table.  The table starts
2100    at 256 entries and doubles whenever it is full.  */
2101 static void
2102 store_comment (cpp_reader *pfile, cpp_token *token)
2103 {
2104   cpp_comment_table *table = &pfile->comments;
2105   cpp_comment *entry;
2106   int len = token->val.str.len;
2107
2108   /* Nothing allocated yet: create the table.  */
2109   if (table->allocated == 0)
2110     {
2111       table->allocated = 256;
2112       table->entries
2113         = (cpp_comment *) xmalloc (table->allocated * sizeof (cpp_comment));
2114     }
2115
2116   /* Table full: make room for twice as many entries.  */
2117   if (table->count == table->allocated)
2118     {
2119       table->allocated *= 2;
2120       table->entries
2121         = (cpp_comment *) xrealloc (table->entries,
2122                                     table->allocated * sizeof (cpp_comment));
2123     }
2124
2125   /* Copy the comment; the token text may not be NUL-terminated, so the
2126      terminator is added explicitly.  */
2127   entry = &table->entries[table->count];
2128   entry->comment = (char *) xmalloc (sizeof (char) * (len + 1));
2129   memcpy (entry->comment, token->val.str.text, len);
2130   entry->comment[len] = '\0';
2131   entry->sloc = token->src_loc;
2132   table->count++;
2133 }
2134
2135 /* Store in TOKEN, as a CPP_COMMENT, the comment whose text after the
2136    initial '/' starts at FROM and runs to the buffer's current position.
2137    The stored comment includes the comment start and any terminator.
2138    TYPE is '/' for a C++ comment.  */
2139 static void
2140 save_comment (cpp_reader *pfile, cpp_token *token, const unsigned char *from,
2141               cppchar_t type)
2142 {
2143   unsigned char *text;
2144   unsigned int len, stored_len, pos;
2145   bool as_c_comment;
2146
2147   len = pfile->buffer->cur - from + 1; /* + 1 for the initial '/'.  */
2148   /* C++ comments probably (not definitely) have moved past a new
2149      line, which we don't want to save in the comment.  */
2150   if (is_vspace (pfile->buffer->cur[-1]))
2151     len--;
2152
2153   /* In a directive or in argument parsing, C++ comments are stored as
2154      C comments, which needs two more characters.  The only directive
2155      that gets here is a "#define" whose comments are being saved.  */
2156   as_c_comment = ((pfile->state.in_directive || pfile->state.parsing_args)
2157                   && type == '/');
2158   stored_len = as_c_comment ? len + 2 : len;
2159
2160   text = _cpp_unaligned_alloc (pfile, stored_len);
2161   text[0] = '/';
2162   memcpy (text + 1, from, len - 1);
2163
2164   token->type = CPP_COMMENT;
2165   token->val.str.len = stored_len;
2166   token->val.str.text = text;
2167
2168   if (as_c_comment)
2169     {
2170       /* Finish the conversion with the new opener and terminator.  */
2171       text[1] = '*';
2172       text[stored_len - 2] = '*';
2173       text[stored_len - 1] = '/';
2174       /* Neutralize sequences that are illegal inside a C comment.  */
2175       for (pos = 2; pos < stored_len - 2; pos++)
2176         if (text[pos] == '/'
2177             && (text[pos - 1] == '*' || text[pos + 1] == '*'))
2178           text[pos] = '|';
2179     }
2180
2181   /* Finally store this comment for use by clients of libcpp. */
2182   store_comment (pfile, token);
2183 }
2184
2185 /* Returns true if comment at COMMENT_START is a recognized FALLTHROUGH
2186    comment at PFILE's -Wimplicit-fallthrough level.  The comment is taken
2187    to end at buffer->cur; '*' at COMMENT_START means a C block comment.  */
2188 static bool
2189 fallthrough_comment_p (cpp_reader *pfile, const unsigned char *comment_start)
2190 {
2191   const unsigned char *from = comment_start + 1;
2192
2193   switch (CPP_OPTION (pfile, cpp_warn_implicit_fallthrough))
2194     {
2195       /* For both -Wimplicit-fallthrough=0 and -Wimplicit-fallthrough=5 we
2196          don't recognize any comments.  The latter only checks attributes,
2197          the former doesn't warn.  */
2198     case 0:
2199     default:
2200       return false;
2201       /* -Wimplicit-fallthrough=1 considers any comment, no matter what
2202          content it has.  */
2203     case 1:
2204       return true;
2205     case 2:
2206       /* -Wimplicit-fallthrough=2 looks for (case insensitive)
2207          .*falls?[ \t-]*thr(u|ough).* regex, trying each position in turn.  */
2208       for (; (size_t) (pfile->buffer->cur - from) >= sizeof "fallthru" - 1;
2209            from++)
2210         {
2211           /* Is there anything like strpbrk with upper boundary, or memchr
2212              for 2 characters?  Match via P so FROM only steps by one.  */
2213           if (from[0] != 'f' && from[0] != 'F')
2214             continue;
2215           if (from[1] != 'a' && from[1] != 'A')
2216             continue;
2217           if (from[2] != 'l' && from[2] != 'L')
2218             continue;
2219           if (from[3] != 'l' && from[3] != 'L')
2220             continue;
2221           const unsigned char *p = from + sizeof "fall" - 1;
2222           if (p[0] == 's' || p[0] == 'S')
2223             p++;
2224           while (*p == ' ' || *p == '\t' || *p == '-')
2225             p++;
2226           if (p[0] != 't' && p[0] != 'T')
2227             continue;
2228           if (p[1] != 'h' && p[1] != 'H')
2229             continue;
2230           if (p[2] != 'r' && p[2] != 'R')
2231             continue;
2232           if (p[3] == 'u' || p[3] == 'U')
2233             return true;
2234           if (p[3] != 'o' && p[3] != 'O')
2235             continue;
2236           if (p[4] != 'u' && p[4] != 'U')
2237             continue;
2238           if (p[5] != 'g' && p[5] != 'G')
2239             continue;
2240           if (p[6] != 'h' && p[6] != 'H')
2241             continue;
2242           return true;
2243         }
2244       return false;
2245     case 3:
2246     case 4:
2247       break;
2248     }
2249
2250   /* Whole comment contents:
2251      -fallthrough
2252      @fallthrough@
2253    */
2254   if (*from == '-' || *from == '@')
2255     {
2256       size_t len = sizeof "fallthrough" - 1;
2257       if ((size_t) (pfile->buffer->cur - from - 1) < len)
2258         return false;
2259       if (memcmp (from + 1, "fallthrough", len))
2260         return false;
2261       if (*from == '@')
2262         {
2263           if (from[len + 1] != '@')
2264             return false;
2265           len++;
2266         }
2267       from += 1 + len;
2268     }
2269   /* Whole comment contents (regex):
2270      lint -fallthrough[ \t]*
2271    */
2272   else if (*from == 'l')
2273     {
2274       size_t len = sizeof "int -fallthrough" - 1;
2275       if ((size_t) (pfile->buffer->cur - from - 1) < len)
2276         return false;
2277       if (memcmp (from + 1, "int -fallthrough", len))
2278         return false;
2279       from += 1 + len;
2280       while (*from == ' ' || *from == '\t')
2281         from++;
2282     }
2283   /* Whole comment contents (regex):
2284      [ \t]*FALLTHR(U|OUGH)[ \t]*
2285    */
2286   else if (CPP_OPTION (pfile, cpp_warn_implicit_fallthrough) == 4)
2287     {
2288       while (*from == ' ' || *from == '\t')
2289         from++;
2290       if ((size_t) (pfile->buffer->cur - from)  < sizeof "FALLTHRU" - 1)
2291         return false;
2292       if (memcmp (from, "FALLTHR", sizeof "FALLTHR" - 1))
2293         return false;
2294       from += sizeof "FALLTHR" - 1;
2295       if (*from == 'U')
2296         from++;
2297       else if ((size_t) (pfile->buffer->cur - from)  < sizeof "OUGH" - 1)
2298         return false;
2299       else if (memcmp (from, "OUGH", sizeof "OUGH" - 1))
2300         return false;
2301       else
2302         from += sizeof "OUGH" - 1;
2303       while (*from == ' ' || *from == '\t')
2304         from++;
2305     }
2306   /* Whole comment contents (regex):
2307      [ \t.!]*(ELSE,? |INTENTIONAL(LY)? )?FALL(S | |-)?THR(OUGH|U)[ \t.!]*(-[^\n\r]*)?
2308      [ \t.!]*(Else,? |Intentional(ly)? )?Fall((s | |-)[Tt]|t)hr(ough|u)[ \t.!]*(-[^\n\r]*)?
2309      [ \t.!]*([Ee]lse,? |[Ii]ntentional(ly)? )?fall(s | |-)?thr(ough|u)[ \t.!]*(-[^\n\r]*)?
2310    */
2311   else
2312     {
2313       while (*from == ' ' || *from == '\t' || *from == '.' || *from == '!')
2314         from++;
2315       unsigned char f = *from;
2316       bool all_upper = false;
2317       if (f == 'E' || f == 'e')
2318         {
2319           if ((size_t) (pfile->buffer->cur - from)
2320               < sizeof "else fallthru" - 1)
2321             return false;
2322           if (f == 'E' && memcmp (from + 1, "LSE", sizeof "LSE" - 1) == 0)
2323             all_upper = true;
2324           else if (memcmp (from + 1, "lse", sizeof "lse" - 1))
2325             return false;
2326           from += sizeof "else" - 1;
2327           if (*from == ',')
2328             from++;
2329           if (*from != ' ')
2330             return false;
2331           from++;
2332           if (all_upper && *from == 'f')
2333             return false;
2334           if (f == 'e' && *from == 'F')
2335             return false;
2336           f = *from;
2337         }
2338       else if (f == 'I' || f == 'i')
2339         {
2340           if ((size_t) (pfile->buffer->cur - from)
2341               < sizeof "intentional fallthru" - 1)
2342             return false;
2343           if (f == 'I' && memcmp (from + 1, "NTENTIONAL",
2344                                   sizeof "NTENTIONAL" - 1) == 0)
2345             all_upper = true;
2346           else if (memcmp (from + 1, "ntentional",
2347                            sizeof "ntentional" - 1))
2348             return false;
2349           from += sizeof "intentional" - 1;
2350           if (*from == ' ')
2351             {
2352               from++;
2353               if (all_upper && *from == 'f')
2354                 return false;
2355             }
2356           else if (all_upper)
2357             {
2358               if (memcmp (from, "LY F", sizeof "LY F" - 1))
2359                 return false;
2360               from += sizeof "LY " - 1;
2361             }
2362           else
2363             {
2364               if (memcmp (from, "ly ", sizeof "ly " - 1))
2365                 return false;
2366               from += sizeof "ly " - 1;
2367             }
2368           if (f == 'i' && *from == 'F')
2369             return false;
2370           f = *from;
2371         }
2372       if (f != 'F' && f != 'f')
2373         return false;
2374       if ((size_t) (pfile->buffer->cur - from) < sizeof "fallthru" - 1)
2375         return false;
2376       if (f == 'F' && memcmp (from + 1, "ALL", sizeof "ALL" - 1) == 0)
2377         all_upper = true;
2378       else if (all_upper)
2379         return false;
2380       else if (memcmp (from + 1, "all", sizeof "all" - 1))
2381         return false;
2382       from += sizeof "fall" - 1;
2383       if (*from == (all_upper ? 'S' : 's') && from[1] == ' ')
2384         from += 2;
2385       else if (*from == ' ' || *from == '-')
2386         from++;
2387       else if (*from != (all_upper ? 'T' : 't'))
2388         return false;
2389       if ((f == 'f' || *from != 'T') && (all_upper || *from != 't'))
2390         return false;
2391       if ((size_t) (pfile->buffer->cur - from) < sizeof "thru" - 1)
2392         return false;
2393       if (memcmp (from + 1, all_upper ? "HRU" : "hru", sizeof "hru" - 1))
2394         {
2395           if ((size_t) (pfile->buffer->cur - from) < sizeof "through" - 1)
2396             return false;
2397           if (memcmp (from + 1, all_upper ? "HROUGH" : "hrough",
2398                       sizeof "hrough" - 1))
2399             return false;
2400           from += sizeof "through" - 1;
2401         }
2402       else
2403         from += sizeof "thru" - 1;
2404       while (*from == ' ' || *from == '\t' || *from == '.' || *from == '!')
2405         from++;
2406       if (*from == '-')
2407         {
2408           from++;
2409           if (*comment_start == '*')
2410             {
2411               do
2412                 {
2413                   while (*from && *from != '*'
2414                          && *from != '\n' && *from != '\r')
2415                     from++;
2416                   if (*from != '*' || from[1] == '/')
2417                     break;
2418                   from++;
2419                 }
2420               while (1);
2421             }
2422           else
2423             while (*from && *from != '\n' && *from != '\r')
2424               from++;
2425         }
2426     }
2427   /* C block comment: only the closing delimiter may follow.  */
2428   if (*comment_start == '*')
2429     {
2430       if (*from != '*' || from[1] != '/')
2431         return false;
2432     }
2433   /* C++ line comment: only the newline may follow.  */
2434   else if (*from != '\n')
2435     return false;
2436
2437   return true;
2438 }
2439
2440 /* Give RUN storage for COUNT tokens; RUN gets no successor.  */
2441 void
2442 _cpp_init_tokenrun (tokenrun *run, unsigned int count)
2443 {
2444   run->next = NULL;
2445   run->base = XNEWVEC (cpp_token, count);
2446   run->limit = &run->base[count];
2447 }
2448
2449 /* Returns the run after RUN, first creating it (250 tokens) if needed.  */
2450 static tokenrun *
2451 next_tokenrun (tokenrun *run)
2452 {
2453   tokenrun *succ = run->next;
2454
2455   if (succ != NULL)
2456     return succ;
2457   succ = run->next = XNEW (tokenrun);
2458   succ->prev = run;
2459   _cpp_init_tokenrun (succ, 250);
2460   return succ;
2461 }
2462
2463 /* Return how many tokens of CONTEXT are still to be processed; abort
2464    if CONTEXT's token kind is not a known one.  */
2465 int
2466 _cpp_remaining_tokens_num_in_context (cpp_context *context)
2467 {
2468   if (context->tokens_kind == TOKENS_KIND_DIRECT)
2469     return (LAST (context).token - FIRST (context).token);
2470
2471   if (context->tokens_kind != TOKENS_KIND_INDIRECT
2472       && context->tokens_kind != TOKENS_KIND_EXTENDED)
2473     abort ();
2474   return (LAST (context).ptoken - FIRST (context).ptoken);
2475 }
2476
2477 /* Return the token INDEX places ahead in CONTEXT, where index zero is
2478    the next token to be processed.  Aborts on an unknown token kind.  */
2479 static const cpp_token*
2480 _cpp_token_from_context_at (cpp_context *context, int index)
2481 {
2482   if (context->tokens_kind == TOKENS_KIND_DIRECT)
2483     return FIRST (context).token + index;
2484
2485   if (context->tokens_kind != TOKENS_KIND_INDIRECT
2486       && context->tokens_kind != TOKENS_KIND_EXTENDED)
2487     abort ();
2488   return FIRST (context).ptoken[index];
2489 }
2490
2491 /* Look ahead in the input stream: return the token INDEX places
2492    beyond the next one (zero is the next token itself).  Tokens lexed
2493    to get there are backed up again; at end of input CPP_EOF is returned.  */
2494 const cpp_token *
2495 cpp_peek_token (cpp_reader *pfile, int index)
2496 {
2497   cpp_context *ctx;
2498   const cpp_token *tok;
2499   int wanted;
2500
2501   /* First, scan through any pending cpp_context objects.  */
2502   for (ctx = pfile->context; ctx->prev; ctx = ctx->prev)
2503     {
2504       int avail = _cpp_remaining_tokens_num_in_context (ctx);
2505       if (index < avail)
2506         return _cpp_token_from_context_at (ctx, index);
2507       index -= avail;
2508     }
2509
2510   /* We will have to read some new tokens after all (and do so
2511      without invalidating preceding tokens).  */
2512   wanted = index;
2513   pfile->keep_tokens++;
2514
2515   /* For peeked tokens temporarily disable line_change reporting,
2516      until the tokens are parsed for real.  */
2517   void (*saved_cb) (cpp_reader *, const cpp_token *, int)
2518     = pfile->cb.line_change;
2519   pfile->cb.line_change = NULL;
2520
2521   /* INDEX drops by one for every token lexed, the EOF token included,
2522      so WANTED - INDEX below is the number of tokens to back up.  */
2523   for (;;)
2524     {
2525       int left = index--;
2526
2527       tok = _cpp_lex_token (pfile);
2528       if (left == 0 || tok->type == CPP_EOF)
2529         break;
2530     }
2531
2532   /* Push back what was just lexed and undo the temporary changes.  */
2533   _cpp_backup_tokens_direct (pfile, wanted - index);
2534   pfile->keep_tokens--;
2535   pfile->cb.line_change = saved_cb;
2536   return tok;
2537 }
2538
2539 /* Allocate a single token that is invalidated at the same time as the
2540    rest of the tokens on the line.  Its src_loc is copied from the
2541    token before cur_token, so that diagnostics appear in the right
2542    place.  Pending lookahead tokens are moved up to make room.  */
2543 cpp_token *
2544 _cpp_temp_token (cpp_reader *pfile)
2545 {
2546   cpp_token *prev_tok = pfile->cur_token - 1;
2547   cpp_token *slot;
2548   ptrdiff_t room = pfile->cur_run->limit - pfile->cur_token;
2549   ptrdiff_t pending = (ptrdiff_t) pfile->lookaheads;
2550
2551   /* Any pre-existing lookaheads must not be clobbered.  */
2552   if (pending != 0)
2553     {
2554       if (room <= pending)
2555         {
2556           tokenrun *succ = next_tokenrun (pfile->cur_run);
2557
2558           if (room < pending)
2559             memmove (succ->base + 1, succ->base,
2560                      (pending - room) * sizeof (cpp_token));
2561
2562           succ->base[0] = pfile->cur_run->limit[-1];
2563         }
2564
2565       if (room > 1)
2566         memmove (pfile->cur_token + 1, pfile->cur_token,
2567                  MIN (pending, room - 1) * sizeof (cpp_token));
2568     }
2569
2570   if (room == 0 && pfile->cur_token == pfile->cur_run->limit)
2571     {
2572       pfile->cur_run = next_tokenrun (pfile->cur_run);
2573       pfile->cur_token = pfile->cur_run->base;
2574     }
2575
2576   slot = pfile->cur_token++;
2577   slot->src_loc = prev_tok->src_loc;
2578   return slot;
2579 }
2580
2581 /* Lex and return the next token (external interface).  Takes care of
2582    issues like directive handling, token lookahead, multiple include
2583    optimization and skipping.  */
2584 const cpp_token *
2585 _cpp_lex_token (cpp_reader *pfile)
2586 {
2587   cpp_token *tok;
2588
2589   for (;;)
2590     {
2591       /* Move on to the next token run once this one is used up.  */
2592       if (pfile->cur_token == pfile->cur_run->limit)
2593         {
2594           pfile->cur_run = next_tokenrun (pfile->cur_run);
2595           pfile->cur_token = pfile->cur_run->base;
2596         }
2597       /* The current token must be somewhere in the current run.  */
2598       if (!(pfile->cur_run->base <= pfile->cur_token
2599             && pfile->cur_token < pfile->cur_run->limit))
2600         abort ();
2601
2602       /* Use up a pending lookahead token if there is one.  */
2603       if (!pfile->lookaheads)
2604         tok = _cpp_lex_direct (pfile);
2605       else
2606         {
2607           pfile->lookaheads--;
2608           tok = pfile->cur_token++;
2609         }
2610
2611       if (tok->flags & BOL)
2612         {
2613           /* Is this a directive?  If _cpp_handle_directive returns
2614              false, it is an assembler #.  6.10.3 p 11: Directives in
2615              a list of macro arguments gives undefined behavior; this
2616              implementation handles the directive as normal.  */
2617           bool hash_directive = (tok->type == CPP_HASH
2618                                  && pfile->state.parsing_args != 1);
2619
2620           if (!hash_directive)
2621             {
2622               if (pfile->state.in_deferred_pragma)
2623                 tok = &pfile->directive_result;
2624             }
2625           else if (_cpp_handle_directive (pfile, tok->flags & PREV_WHITE))
2626             {
2627               if (pfile->directive_result.type == CPP_PADDING)
2628                 continue;
2629               tok = &pfile->directive_result;
2630             }
2631
2632           if (pfile->cb.line_change && !pfile->state.skipping)
2633             pfile->cb.line_change (pfile, tok, pfile->state.parsing_args);
2634         }
2635
2636       /* We don't skip tokens in directives.  */
2637       if (pfile->state.in_directive || pfile->state.in_deferred_pragma)
2638         return tok;
2639
2640       /* Outside a directive, invalidate controlling macros.  At file
2641          EOF, _cpp_lex_direct takes care of popping the buffer, so we never
2642          get here and MI optimization works.  */
2643       pfile->mi_valid = false;
2644
2645       if (!pfile->state.skipping || tok->type == CPP_EOF)
2646         return tok;
2647     }
2648 }
2649
2650 /* Returns true if a fresh line has been loaded.  */
2651 bool
2652 _cpp_get_fresh_line (cpp_reader *pfile)
2653 {
2654   int return_at_eof;
2655
2656   /* We can't get a new line until we leave the current directive.  */
2657   if (pfile->state.in_directive)
2658     return false;
2659
2660   for (;;)
2661     {
2662       cpp_buffer *buffer = pfile->buffer;
2663
2664       if (!buffer->need_line)
2665         return true;
2666
2667       if (buffer->next_line < buffer->rlimit)
2668         {
2669           _cpp_clean_line (pfile);
2670           return true;
2671         }
2672
2673       /* First, get out of parsing arguments state.  */
2674       if (pfile->state.parsing_args)
2675         return false;
2676
2677       /* End of buffer.  Non-empty files should end in a newline.  */
2678       if (buffer->buf != buffer->rlimit
2679           && buffer->next_line > buffer->rlimit
2680           && !buffer->from_stage3)
2681         {
2682           /* Clip to buffer size.  */
2683           buffer->next_line = buffer->rlimit;
2684         }
2685
2686       return_at_eof = buffer->return_at_eof;
2687       _cpp_pop_buffer (pfile);
2688       if (pfile->buffer == NULL || return_at_eof)
2689         return false;
2690     }
2691 }
2692
/* Two-way choice on the next input character: when it equals CHAR,
   consume it and set result->type to THEN_TYPE; otherwise leave the
   input untouched and set result->type to ELSE_TYPE.  Relies on
   `buffer' and `result' being in scope where the macro is used.  */
#define IF_NEXT_IS(CHAR, THEN_TYPE, ELSE_TYPE)          \
  do                                                    \
    {                                                   \
      if (*buffer->cur == CHAR)                         \
        {                                               \
          buffer->cur++;                                \
          result->type = THEN_TYPE;                     \
        }                                               \
      else                                              \
        result->type = ELSE_TYPE;                       \
    }                                                   \
  while (0)
2701
2702 /* Lex a token into pfile->cur_token, which is also incremented, to
2703    get diagnostics pointing to the correct location.
2704
2705    Does not handle issues such as token lookahead, multiple-include
2706    optimization, directives, skipping etc.  This function is only
2707    suitable for use by _cpp_lex_token, and in special cases like
2708    lex_expansion_token which doesn't care for any of these issues.
2709
2710    When meeting a newline, returns CPP_EOF if parsing a directive,
2711    otherwise returns to the start of the token buffer if permissible.
2712    Returns a pointer to the lexed token.
        That pointer is normally the slot taken from pfile->cur_token on
        entry, but it is reset to the first slot of pfile->base_run when
        a fresh line is read and pfile->keep_tokens is clear.  */
2713 cpp_token *
2714 _cpp_lex_direct (cpp_reader *pfile)
2715 {
2716   cppchar_t c;
2717   cpp_buffer *buffer;
2718   const unsigned char *comment_start;
2719   bool fallthrough_comment = false;
2720   cpp_token *result = pfile->cur_token++;
2721
      /* Entry point, and restart point after each newline: any flags
         gathered for the token so far are discarded.  */
2722  fresh_line:
2723   result->flags = 0;
2724   buffer = pfile->buffer;
2725   if (buffer->need_line)
2726     {
           /* The end of the line closes a deferred pragma.  */
2727       if (pfile->state.in_deferred_pragma)
2728         {
2729           result->type = CPP_PRAGMA_EOL;
2730           pfile->state.in_deferred_pragma = false;
2731           if (!pfile->state.pragma_allow_expansion)
2732             pfile->state.prevent_expansion--;
2733           return result;
2734         }
           /* No further line is available: hand back CPP_EOF.  */
2735       if (!_cpp_get_fresh_line (pfile))
2736         {
2737           result->type = CPP_EOF;
2738           if (!pfile->state.in_directive)
2739             {
2740               /* Tell the compiler the line number of the EOF token.  */
2741               result->src_loc = pfile->line_table->highest_line;
2742               result->flags = BOL;
2743             }
2744           return result;
2745         }
2746       if (buffer != pfile->buffer)
2747         fallthrough_comment = false;
           /* When keep_tokens is clear, restart at the first slot of
              the base run.  */
2748       if (!pfile->keep_tokens)
2749         {
2750           pfile->cur_run = &pfile->base_run;
2751           result = pfile->base_run.base;
2752           pfile->cur_token = result + 1;
2753         }
2754       result->flags = BOL;
2755       if (pfile->state.parsing_args == 2)
2756         result->flags |= PREV_WHITE;
2757     }
2758   buffer = pfile->buffer;
      /* Re-entered after a discarded comment.  */
2759  update_tokens_line:
2760   result->src_loc = pfile->line_table->highest_line;
2761
      /* Re-entered after a run of horizontal whitespace.  */
2762  skipped_white:
2763   if (buffer->cur >= buffer->notes[buffer->cur_note].pos
2764       && !pfile->overlaid_buffer)
2765     {
2766       _cpp_process_line_notes (pfile, false);
2767       result->src_loc = pfile->line_table->highest_line;
2768     }
2769   c = *buffer->cur++;
2770
       /* A forced location, when set, overrides the column-derived one.  */
2771   if (pfile->forced_token_location)
2772     result->src_loc = pfile->forced_token_location;
2773   else
2774     result->src_loc = linemap_position_for_column (pfile->line_table,
2775                                           CPP_BUF_COLUMN (buffer, buffer->cur));
2776
2777   switch (c)
2778     {
2779     case ' ': case '\t': case '\f': case '\v': case '\0':
2780       result->flags |= PREV_WHITE;
2781       skip_whitespace (pfile, c);
2782       goto skipped_white;
2783
2784     case '\n':
2785       /* Increment the line, unless this is the last line ...  */
2786       if (buffer->cur < buffer->rlimit
2787           /* ... or this is a #include, (where _cpp_stack_file needs to
2788              unwind by one line) ...  */
2789           || (pfile->state.in_directive > 1
2790               /* ... except traditional-cpp increments this elsewhere.  */
2791               && !CPP_OPTION (pfile, traditional)))
2792         CPP_INCREMENT_LINE (pfile, 0);
2793       buffer->need_line = true;
2794       goto fresh_line;
2795
2796     case '0': case '1': case '2': case '3': case '4':
2797     case '5': case '6': case '7': case '8': case '9':
2798       {
2799         struct normalize_state nst = INITIAL_NORMALIZE_STATE;
2800         result->type = CPP_NUMBER;
2801         lex_number (pfile, &result->val.str, &nst);
2802         warn_about_normalization (pfile, result, &nst);
2803         break;
2804       }
2805
2806     case 'L':
2807     case 'u':
2808     case 'U':
2809     case 'R':
2810       /* 'L', 'u', 'U', 'u8' or 'R' may introduce wide characters,
2811          wide strings or raw strings.  */
2812       if (c == 'L' || CPP_OPTION (pfile, rliterals)
2813           || (c != 'R' && CPP_OPTION (pfile, uliterals)))
2814         {
2815           if ((*buffer->cur == '\'' && c != 'R')
2816               || *buffer->cur == '"'
2817               || (*buffer->cur == 'R'
2818                   && c != 'R'
2819                   && buffer->cur[1] == '"'
2820                   && CPP_OPTION (pfile, rliterals))
2821               || (*buffer->cur == '8'
2822                   && c == 'u'
2823                   && ((buffer->cur[1] == '"' || (buffer->cur[1] == '\''
2824                                 && CPP_OPTION (pfile, utf8_char_literals)))
2825                       || (buffer->cur[1] == 'R' && buffer->cur[2] == '"'
2826                           && CPP_OPTION (pfile, rliterals)))))
2827             {
2828               lex_string (pfile, result, buffer->cur - 1);
2829               break;
2830             }
2831         }
2832       /* Fall through.  */
2833
2834     case '_':
2835     case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
2836     case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
2837     case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
2838     case 's': case 't':           case 'v': case 'w': case 'x':
2839     case 'y': case 'z':
2840     case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
2841     case 'G': case 'H': case 'I': case 'J': case 'K':
2842     case 'M': case 'N': case 'O': case 'P': case 'Q':
2843     case 'S': case 'T':           case 'V': case 'W': case 'X':
2844     case 'Y': case 'Z':
2845       result->type = CPP_NAME;
2846       {
2847         struct normalize_state nst = INITIAL_NORMALIZE_STATE;
2848         result->val.node.node = lex_identifier (pfile, buffer->cur - 1, false,
2849                                                 &nst,
2850                                                 &result->val.node.spelling);
2851         warn_about_normalization (pfile, result, &nst);
2852       }
2853
2854       /* Convert named operators to their proper types.  */
2855       if (result->val.node.node->flags & NODE_OPERATOR)
2856         {
2857           result->flags |= NAMED_OP;
2858           result->type = (enum cpp_ttype) result->val.node.node->directive_index;
2859         }
2860
2861       /* Signal FALLTHROUGH comment followed by another token.  */
2862       if (fallthrough_comment)
2863         result->flags |= PREV_FALLTHROUGH;
2864       break;
2865
2866     case '\'':
2867     case '"':
2868       lex_string (pfile, result, buffer->cur - 1);
2869       break;
2870
2871     case '/':
2872       /* A potential block or line comment.  */
2873       comment_start = buffer->cur;
2874       c = *buffer->cur;
2875
2876       if (c == '*')
2877         {
2878           if (_cpp_skip_block_comment (pfile))
2879             cpp_error (pfile, CPP_DL_ERROR, "unterminated comment");
2880         }
2881       else if (c == '/' && ! CPP_OPTION (pfile, traditional))
2882         {
2883           /* Don't warn for system headers.  */
2884           if (cpp_in_system_header (pfile))
2885             ;
2886           /* Warn about comments if pedantically GNUC89, and not
2887              in system headers.  */
2888           else if (CPP_OPTION (pfile, lang) == CLK_GNUC89
2889                    && CPP_PEDANTIC (pfile)
2890                    && ! buffer->warned_cplusplus_comments)
2891             {
2892               if (cpp_error (pfile, CPP_DL_PEDWARN,
2893                              "C++ style comments are not allowed in ISO C90"))
2894                 cpp_error (pfile, CPP_DL_NOTE,
2895                            "(this will be reported only once per input file)");
2896               buffer->warned_cplusplus_comments = 1;
2897             }
2898           /* Or if specifically desired via -Wc90-c99-compat.  */
2899           else if (CPP_OPTION (pfile, cpp_warn_c90_c99_compat) > 0
2900                    && ! CPP_OPTION (pfile, cplusplus)
2901                    && ! buffer->warned_cplusplus_comments)
2902             {
2903               if (cpp_error (pfile, CPP_DL_WARNING,
2904                              "C++ style comments are incompatible with C90"))
2905                 cpp_error (pfile, CPP_DL_NOTE,
2906                            "(this will be reported only once per input file)");
2907               buffer->warned_cplusplus_comments = 1;
2908             }
2909           /* In C89/C94, C++ style comments are forbidden.  */
2910           else if ((CPP_OPTION (pfile, lang) == CLK_STDC89
2911                     || CPP_OPTION (pfile, lang) == CLK_STDC94))
2912             {
2913               /* But don't be confused about valid code such as
2914                  - // immediately followed by *,
2915                  - // in a preprocessing directive,
2916                  - // in an #if 0 block.  */
2917               if (buffer->cur[1] == '*'
2918                   || pfile->state.in_directive
2919                   || pfile->state.skipping)
2920                 {
2921                   result->type = CPP_DIV;
2922                   break;
2923                 }
2924               else if (! buffer->warned_cplusplus_comments)
2925                 {
2926                   if (cpp_error (pfile, CPP_DL_ERROR,
2927                                  "C++ style comments are not allowed in "
2928                                  "ISO C90"))
2929                     cpp_error (pfile, CPP_DL_NOTE,
2930                                "(this will be reported only once per input "
2931                                "file)");
2932                   buffer->warned_cplusplus_comments = 1;
2933                 }
2934             }
2935           if (skip_line_comment (pfile) && CPP_OPTION (pfile, warn_comments))
2936             cpp_warning (pfile, CPP_W_COMMENTS, "multi-line comment");
2937         }
2938       else if (c == '=')
2939         {
2940           buffer->cur++;
2941           result->type = CPP_DIV_EQ;
2942           break;
2943         }
2944       else
2945         {
2946           result->type = CPP_DIV;
2947           break;
2948         }
2949
           /* Only the two comment branches above reach this point; the
              operator branches have already left the switch.  */
2950       if (fallthrough_comment_p (pfile, comment_start))
2951         fallthrough_comment = true;
2952
           /* Report the comment, including its leading '/', to the
              client callback.  */
2953       if (pfile->cb.comment)
2954         {
2955           size_t len = pfile->buffer->cur - comment_start;
2956           pfile->cb.comment (pfile, result->src_loc, comment_start - 1,
2957                              len + 1);
2958         }
2959
           /* Comments are not being saved: count this one as whitespace
              and lex the token that follows it.  */
2960       if (!pfile->state.save_comments)
2961         {
2962           result->flags |= PREV_WHITE;
2963           goto update_tokens_line;
2964         }
2965
2966       if (fallthrough_comment)
2967         result->flags |= PREV_FALLTHROUGH;
2968
2969       /* Save the comment as a token in its own right.  */
2970       save_comment (pfile, result, comment_start, c);
2971       break;
2972
2973     case '<':
           /* NOTE(review): lex_string presumably leaves the type as
              CPP_LESS when no header-name could be formed, in which
              case '<' is lexed as an operator below -- confirm against
              lex_string.  */
2974       if (pfile->state.angled_headers)
2975         {
2976           lex_string (pfile, result, buffer->cur - 1);
2977           if (result->type != CPP_LESS)
2978             break;
2979         }
2980
2981       result->type = CPP_LESS;
2982       if (*buffer->cur == '=')
2983         buffer->cur++, result->type = CPP_LESS_EQ;
2984       else if (*buffer->cur == '<')
2985         {
2986           buffer->cur++;
2987           IF_NEXT_IS ('=', CPP_LSHIFT_EQ, CPP_LSHIFT);
2988         }
2989       else if (CPP_OPTION (pfile, digraphs))
2990         {
2991           if (*buffer->cur == ':')
2992             {
2993               /* C++11 [2.5/3 lex.pptoken], "Otherwise, if the next
2994                  three characters are <:: and the subsequent character
2995                  is neither : nor >, the < is treated as a preprocessor
2996                  token by itself".  */
2997               if (CPP_OPTION (pfile, cplusplus)
2998                   && CPP_OPTION (pfile, lang) != CLK_CXX98
2999                   && CPP_OPTION (pfile, lang) != CLK_GNUCXX
3000                   && buffer->cur[1] == ':'
3001                   && buffer->cur[2] != ':' && buffer->cur[2] != '>')
3002                 break;
3003
3004               buffer->cur++;
3005               result->flags |= DIGRAPH;
3006               result->type = CPP_OPEN_SQUARE;
3007             }
3008           else if (*buffer->cur == '%')
3009             {
3010               buffer->cur++;
3011               result->flags |= DIGRAPH;
3012               result->type = CPP_OPEN_BRACE;
3013             }
3014         }
3015       break;
3016
3017     case '>':
3018       result->type = CPP_GREATER;
3019       if (*buffer->cur == '=')
3020         buffer->cur++, result->type = CPP_GREATER_EQ;
3021       else if (*buffer->cur == '>')
3022         {
3023           buffer->cur++;
3024           IF_NEXT_IS ('=', CPP_RSHIFT_EQ, CPP_RSHIFT);
3025         }
3026       break;
3027
3028     case '%':
3029       result->type = CPP_MOD;
3030       if (*buffer->cur == '=')
3031         buffer->cur++, result->type = CPP_MOD_EQ;
3032       else if (CPP_OPTION (pfile, digraphs))
3033         {
3034           if (*buffer->cur == ':')
3035             {
3036               buffer->cur++;
3037               result->flags |= DIGRAPH;
3038               result->type = CPP_HASH;
3039               if (*buffer->cur == '%' && buffer->cur[1] == ':')
3040                 buffer->cur += 2, result->type = CPP_PASTE, result->val.token_no = 0;
3041             }
3042           else if (*buffer->cur == '>')
3043             {
3044               buffer->cur++;
3045               result->flags |= DIGRAPH;
3046               result->type = CPP_CLOSE_BRACE;
3047             }
3048         }
3049       break;
3050
3051     case '.':
3052       result->type = CPP_DOT;
3053       if (ISDIGIT (*buffer->cur))
3054         {
3055           struct normalize_state nst = INITIAL_NORMALIZE_STATE;
3056           result->type = CPP_NUMBER;
3057           lex_number (pfile, &result->val.str, &nst);
3058           warn_about_normalization (pfile, result, &nst);
3059         }
3060       else if (*buffer->cur == '.' && buffer->cur[1] == '.')
3061         buffer->cur += 2, result->type = CPP_ELLIPSIS;
3062       else if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
3063         buffer->cur++, result->type = CPP_DOT_STAR;
3064       break;
3065
3066     case '+':
3067       result->type = CPP_PLUS;
3068       if (*buffer->cur == '+')
3069         buffer->cur++, result->type = CPP_PLUS_PLUS;
3070       else if (*buffer->cur == '=')
3071         buffer->cur++, result->type = CPP_PLUS_EQ;
3072       break;
3073
3074     case '-':
3075       result->type = CPP_MINUS;
3076       if (*buffer->cur == '>')
3077         {
3078           buffer->cur++;
3079           result->type = CPP_DEREF;
3080           if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
3081             buffer->cur++, result->type = CPP_DEREF_STAR;
3082         }
3083       else if (*buffer->cur == '-')
3084         buffer->cur++, result->type = CPP_MINUS_MINUS;
3085       else if (*buffer->cur == '=')
3086         buffer->cur++, result->type = CPP_MINUS_EQ;
3087       break;
3088
3089     case '&':
3090       result->type = CPP_AND;
3091       if (*buffer->cur == '&')
3092         buffer->cur++, result->type = CPP_AND_AND;
3093       else if (*buffer->cur == '=')
3094         buffer->cur++, result->type = CPP_AND_EQ;
3095       break;
3096
3097     case '|':
3098       result->type = CPP_OR;
3099       if (*buffer->cur == '|')
3100         buffer->cur++, result->type = CPP_OR_OR;
3101       else if (*buffer->cur == '=')
3102         buffer->cur++, result->type = CPP_OR_EQ;
3103       break;
3104
3105     case ':':
3106       result->type = CPP_COLON;
3107       if (*buffer->cur == ':' && CPP_OPTION (pfile, scope))
3108         buffer->cur++, result->type = CPP_SCOPE;
3109       else if (*buffer->cur == '>' && CPP_OPTION (pfile, digraphs))
3110         {
3111           buffer->cur++;
3112           result->flags |= DIGRAPH;
3113           result->type = CPP_CLOSE_SQUARE;
3114         }
3115       break;
3116
3117     case '*': IF_NEXT_IS ('=', CPP_MULT_EQ, CPP_MULT); break;
3118     case '=': IF_NEXT_IS ('=', CPP_EQ_EQ, CPP_EQ); break;
3119     case '!': IF_NEXT_IS ('=', CPP_NOT_EQ, CPP_NOT); break;
3120     case '^': IF_NEXT_IS ('=', CPP_XOR_EQ, CPP_XOR); break;
3121     case '#': IF_NEXT_IS ('#', CPP_PASTE, CPP_HASH); result->val.token_no = 0; break;
3122
3123     case '?': result->type = CPP_QUERY; break;
3124     case '~': result->type = CPP_COMPL; break;
3125     case ',': result->type = CPP_COMMA; break;
3126     case '(': result->type = CPP_OPEN_PAREN; break;
3127     case ')': result->type = CPP_CLOSE_PAREN; break;
3128     case '[': result->type = CPP_OPEN_SQUARE; break;
3129     case ']': result->type = CPP_CLOSE_SQUARE; break;
3130     case '{': result->type = CPP_OPEN_BRACE; break;
3131     case '}': result->type = CPP_CLOSE_BRACE; break;
3132     case ';': result->type = CPP_SEMICOLON; break;
3133
3134       /* @ is a punctuator in Objective-C.  */
3135     case '@': result->type = CPP_ATSIGN; break;
3136
3137     default:
3138       {
             /* Step back so that BASE addresses the character just read.  */
3139         const uchar *base = --buffer->cur;
3140
3141         /* Check for an extended identifier ($ or UCN or UTF-8).  */
3142         struct normalize_state nst = INITIAL_NORMALIZE_STATE;
3143         if (forms_identifier_p (pfile, true, &nst))
3144           {
3145             result->type = CPP_NAME;
3146             result->val.node.node = lex_identifier (pfile, base, true, &nst,
3147                                                     &result->val.node.spelling);
3148             warn_about_normalization (pfile, result, &nst);
3149             break;
3150           }
3151
3152         /* Otherwise this will form a CPP_OTHER token.  Parse valid UTF-8 as a
3153            single token.  */
3154         buffer->cur++;
3155         if (c >= utf8_signifier)
3156           {
3157             const uchar *pstr = base;
3158             cppchar_t s;
3159             if (_cpp_valid_utf8 (pfile, &pstr, buffer->rlimit, 0, NULL, &s))
3160               buffer->cur = pstr;
3161           }
3162         create_literal (pfile, result, base, buffer->cur - base, CPP_OTHER);
3163         break;
3164       }
3165
3166     }
3167
3168   /* Potentially convert the location of the token to a range.  */
3169   if (result->src_loc >= RESERVED_LOCATION_COUNT
3170       && result->type != CPP_EOF)
3171     {
3172       /* Ensure that any line notes are processed, so that we have the
3173          correct physical line/column for the end-point of the token even
3174          when a logical line is split via one or more backslashes.  */
3175       if (buffer->cur >= buffer->notes[buffer->cur_note].pos
3176           && !pfile->overlaid_buffer)
3177         _cpp_process_line_notes (pfile, false);
3178
3179       source_range tok_range;
3180       tok_range.m_start = result->src_loc;
3181       tok_range.m_finish
3182         = linemap_position_for_column (pfile->line_table,
3183                                        CPP_BUF_COLUMN (buffer, buffer->cur));
3184
3185       result->src_loc = COMBINE_LOCATION_DATA (pfile->line_table,
3186                                                result->src_loc,
3187                                                tok_range, NULL);
3188     }
3189
3190   return result;
3191 }
3192
3193 /* An upper bound on the number of bytes needed to spell TOKEN.
3194    Does not include preceding whitespace.  */
3195 unsigned int
3196 cpp_token_len (const cpp_token *token)
3197 {
3198   unsigned int len;
3199
3200   switch (TOKEN_SPELL (token))
3201     {
3202     default:            len = 6;                                break;
3203     case SPELL_LITERAL: len = token->val.str.len;               break;
3204     case SPELL_IDENT:   len = NODE_LEN (token->val.node.node) * 10;     break;
3205     }
3206
3207   return len;
3208 }
3209
/* Decode the UTF-8 sequence starting at NAME and store the matching
   \UXXXXXXXX escape (lower-case hex digits) in BUFFER.  Exactly 10
   bytes are written to BUFFER and no terminating NUL is added.
   Returns the number of bytes consumed from NAME.  Aborts when a
   continuation byte is not of the form 10xxxxxx.  */

static size_t
utf8_to_ucn (unsigned char *buffer, const unsigned char *name)
{
  static const char hex_digits[] = "0123456789abcdef";
  unsigned int lead = *name;
  unsigned long code_point;
  int seq_len = 0;
  int k;

  /* The count of leading one bits in the first byte is the length of
     the whole sequence.  */
  while (lead & 0x80)
    {
      seq_len++;
      lead <<= 1;
    }

  /* The remaining low bits of the first byte are payload; each
     continuation byte contributes six more.  */
  code_point = *name & (0x7F >> seq_len);
  for (k = 1; k < seq_len; k++)
    {
      name++;
      code_point = (code_point << 6) | (*name & 0x3F);

      /* Ill-formed UTF-8.  */
      if ((*name & ~0x3F) != 0x80)
        abort ();
    }

  buffer[0] = '\\';
  buffer[1] = 'U';
  for (k = 0; k < 8; k++)
    buffer[2 + k] = hex_digits[(code_point >> (28 - 4 * k)) & 0xF];

  return seq_len;
}
3243
3244 /* Given a token TYPE corresponding to a digraph, return a pointer to
3245    the spelling of the digraph.  */
3246 static const unsigned char *
3247 cpp_digraph2name (enum cpp_ttype type)
3248 {
3249   return digraph_spellings[(int) type - (int) CPP_FIRST_DIGRAPH];
3250 }
3251
3252 /* Write the spelling of an identifier IDENT, using UCNs, to BUFFER.
3253    The buffer must already contain the enough space to hold the
3254    token's spelling.  Returns a pointer to the character after the
3255    last character written.  */
3256 unsigned char *
3257 _cpp_spell_ident_ucns (unsigned char *buffer, cpp_hashnode *ident)
3258 {
3259   size_t i;
3260   const unsigned char *name = NODE_NAME (ident);
3261
3262   for (i = 0; i < NODE_LEN (ident); i++)
3263     if (name[i] & ~0x7F)
3264       {
3265         i += utf8_to_ucn (buffer, name + i) - 1;
3266         buffer += 10;
3267       }
3268     else
3269       *buffer++ = name[i];
3270
3271   return buffer;
3272 }
3273
3274 /* Write the spelling of a token TOKEN to BUFFER.  The buffer must
3275    already contain the enough space to hold the token's spelling.
3276    Returns a pointer to the character after the last character written.
3277    FORSTRING is true if this is to be the spelling after translation
3278    phase 1 (with the original spelling of extended identifiers), false
3279    if extended identifiers should always be written using UCNs (there is
3280    no option for always writing them in the internal UTF-8 form).
3281    FIXME: Would be nice if we didn't need the PFILE argument.  */
3282 unsigned char *
3283 cpp_spell_token (cpp_reader *pfile, const cpp_token *token,
3284                  unsigned char *buffer, bool forstring)
3285 {
3286   switch (TOKEN_SPELL (token))
3287     {
3288     case SPELL_OPERATOR:
3289       {
3290         const unsigned char *spelling;
3291         unsigned char c;
3292
3293         if (token->flags & DIGRAPH)
3294           spelling = cpp_digraph2name (token->type);
3295         else if (token->flags & NAMED_OP)
3296           goto spell_ident;
3297         else
3298           spelling = TOKEN_NAME (token);
3299
3300         while ((c = *spelling++) != '\0')
3301           *buffer++ = c;
3302       }
3303       break;
3304
3305     spell_ident:
3306     case SPELL_IDENT:
3307       if (forstring)
3308         {
3309           memcpy (buffer, NODE_NAME (token->val.node.spelling),
3310                   NODE_LEN (token->val.node.spelling));
3311           buffer += NODE_LEN (token->val.node.spelling);
3312         }
3313       else
3314         buffer = _cpp_spell_ident_ucns (buffer, token->val.node.node);
3315       break;
3316
3317     case SPELL_LITERAL:
3318       memcpy (buffer, token->val.str.text, token->val.str.len);
3319       buffer += token->val.str.len;
3320       break;
3321
3322     case SPELL_NONE:
3323       cpp_error (pfile, CPP_DL_ICE,
3324                  "unspellable token %s", TOKEN_NAME (token));
3325       break;
3326     }
3327
3328   return buffer;
3329 }
3330
3331 /* Returns TOKEN spelt as a null-terminated string.  The string is
3332    freed when the reader is destroyed.  Useful for diagnostics.  */
3333 unsigned char *
3334 cpp_token_as_text (cpp_reader *pfile, const cpp_token *token)
3335 {
3336   unsigned int len = cpp_token_len (token) + 1;
3337   unsigned char *start = _cpp_unaligned_alloc (pfile, len), *end;
3338
3339   end = cpp_spell_token (pfile, token, start, false);
3340   end[0] = '\0';
3341
3342   return start;
3343 }
3344
3345 /* Returns a pointer to a string which spells the token defined by
3346    TYPE and FLAGS.  Used by C front ends, which really should move to
3347    using cpp_token_as_text.  */
3348 const char *
3349 cpp_type2name (enum cpp_ttype type, unsigned char flags)
3350 {
3351   if (flags & DIGRAPH)
3352     return (const char *) cpp_digraph2name (type);
3353   else if (flags & NAMED_OP)
3354     return cpp_named_operator2name (type);
3355
3356   return (const char *) token_spellings[type].name;
3357 }
3358
3359 /* Writes the spelling of token to FP, without any preceding space.
3360    Separated from cpp_spell_token for efficiency - to avoid stdio
3361    double-buffering.  */
3362 void
3363 cpp_output_token (const cpp_token *token, FILE *fp)
3364 {
3365   switch (TOKEN_SPELL (token))
3366     {
3367     case SPELL_OPERATOR:
3368       {
3369         const unsigned char *spelling;
3370         int c;
3371
3372         if (token->flags & DIGRAPH)
3373           spelling = cpp_digraph2name (token->type);
3374         else if (token->flags & NAMED_OP)
3375           goto spell_ident;
3376         else
3377           spelling = TOKEN_NAME (token);
3378
3379         c = *spelling;
3380         do
3381           putc (c, fp);
3382         while ((c = *++spelling) != '\0');
3383       }
3384       break;
3385
3386     spell_ident:
3387     case SPELL_IDENT:
3388       {
3389         size_t i;
3390         const unsigned char * name = NODE_NAME (token->val.node.node);
3391
3392         for (i = 0; i < NODE_LEN (token->val.node.node); i++)
3393           if (name[i] & ~0x7F)
3394             {
3395               unsigned char buffer[10];
3396               i += utf8_to_ucn (buffer, name + i) - 1;
3397               fwrite (buffer, 1, 10, fp);
3398             }
3399           else
3400             fputc (NODE_NAME (token->val.node.node)[i], fp);
3401       }
3402       break;
3403
3404     case SPELL_LITERAL:
3405       fwrite (token->val.str.text, 1, token->val.str.len, fp);
3406       break;
3407
3408     case SPELL_NONE:
3409       /* An error, most probably.  */
3410       break;
3411     }
3412 }
3413
3414 /* Compare two tokens.  */
3415 int
3416 _cpp_equiv_tokens (const cpp_token *a, const cpp_token *b)
3417 {
3418   if (a->type == b->type && a->flags == b->flags)
3419     switch (TOKEN_SPELL (a))
3420       {
3421       default:                  /* Keep compiler happy.  */
3422       case SPELL_OPERATOR:
3423         /* token_no is used to track where multiple consecutive ##
3424            tokens were originally located.  */
3425         return (a->type != CPP_PASTE || a->val.token_no == b->val.token_no);
3426       case SPELL_NONE:
3427         return (a->type != CPP_MACRO_ARG
3428                 || (a->val.macro_arg.arg_no == b->val.macro_arg.arg_no
3429                     && a->val.macro_arg.spelling == b->val.macro_arg.spelling));
3430       case SPELL_IDENT:
3431         return (a->val.node.node == b->val.node.node
3432                 && a->val.node.spelling == b->val.node.spelling);
3433       case SPELL_LITERAL:
3434         return (a->val.str.len == b->val.str.len
3435                 && !memcmp (a->val.str.text, b->val.str.text,
3436                             a->val.str.len));
3437       }
3438
3439   return 0;
3440 }
3441
3442 /* Returns nonzero if a space should be inserted to avoid an
3443    accidental token paste for output.  For simplicity, it is
3444    conservative, and occasionally advises a space where one is not
3445    needed, e.g. "." and ".2".  */
3446 int
3447 cpp_avoid_paste (cpp_reader *pfile, const cpp_token *token1,
3448                  const cpp_token *token2)
3449 {
3450   enum cpp_ttype a = token1->type, b = token2->type;
3451   cppchar_t c;
3452
3453   if (token1->flags & NAMED_OP)
3454     a = CPP_NAME;
3455   if (token2->flags & NAMED_OP)
3456     b = CPP_NAME;
3457
3458   c = EOF;
3459   if (token2->flags & DIGRAPH)
3460     c = digraph_spellings[(int) b - (int) CPP_FIRST_DIGRAPH][0];
3461   else if (token_spellings[b].category == SPELL_OPERATOR)
3462     c = token_spellings[b].name[0];
3463
3464   /* Quickly get everything that can paste with an '='.  */
3465   if ((int) a <= (int) CPP_LAST_EQ && c == '=')
3466     return 1;
3467
3468   switch (a)
3469     {
3470     case CPP_GREATER:   return c == '>';
3471     case CPP_LESS:      return c == '<' || c == '%' || c == ':';
3472     case CPP_PLUS:      return c == '+';
3473     case CPP_MINUS:     return c == '-' || c == '>';
3474     case CPP_DIV:       return c == '/' || c == '*'; /* Comments.  */
3475     case CPP_MOD:       return c == ':' || c == '>';
3476     case CPP_AND:       return c == '&';
3477     case CPP_OR:        return c == '|';
3478     case CPP_COLON:     return c == ':' || c == '>';
3479     case CPP_DEREF:     return c == '*';
3480     case CPP_DOT:       return c == '.' || c == '%' || b == CPP_NUMBER;
3481     case CPP_HASH:      return c == '#' || c == '%'; /* Digraph form.  */
3482     case CPP_NAME:      return ((b == CPP_NUMBER
3483                                  && name_p (pfile, &token2->val.str))
3484                                 || b == CPP_NAME
3485                                 || b == CPP_CHAR || b == CPP_STRING); /* L */
3486     case CPP_NUMBER:    return (b == CPP_NUMBER || b == CPP_NAME
3487                                 || c == '.' || c == '+' || c == '-');
3488                                       /* UCNs */
3489     case CPP_OTHER:     return ((token1->val.str.text[0] == '\\'
3490                                  && b == CPP_NAME)
3491                                 || (CPP_OPTION (pfile, objc)
3492                                     && token1->val.str.text[0] == '@'
3493                                     && (b == CPP_NAME || b == CPP_STRING)));
3494     case CPP_STRING:
3495     case CPP_WSTRING:
3496     case CPP_UTF8STRING:
3497     case CPP_STRING16:
3498     case CPP_STRING32:  return (CPP_OPTION (pfile, user_literals)
3499                                 && (b == CPP_NAME
3500                                     || (TOKEN_SPELL (token2) == SPELL_LITERAL
3501                                         && ISIDST (token2->val.str.text[0]))));
3502
3503     default:            break;
3504     }
3505
3506   return 0;
3507 }
3508
3509 /* Output all the remaining tokens on the current line, and a newline
3510    character, to FP.  Leading whitespace is removed.  If there are
3511    macros, special token padding is not performed.  */
3512 void
3513 cpp_output_line (cpp_reader *pfile, FILE *fp)
3514 {
3515   const cpp_token *token;
3516
3517   token = cpp_get_token (pfile);
3518   while (token->type != CPP_EOF)
3519     {
3520       cpp_output_token (token, fp);
3521       token = cpp_get_token (pfile);
3522       if (token->flags & PREV_WHITE)
3523         putc (' ', fp);
3524     }
3525
3526   putc ('\n', fp);
3527 }
3528
3529 /* Return a string representation of all the remaining tokens on the
3530    current line.  The result is allocated using xmalloc and must be
3531    freed by the caller.  */
3532 unsigned char *
3533 cpp_output_line_to_string (cpp_reader *pfile, const unsigned char *dir_name)
3534 {
3535   const cpp_token *token;
3536   unsigned int out = dir_name ? ustrlen (dir_name) : 0;
3537   unsigned int alloced = 120 + out;
3538   unsigned char *result = (unsigned char *) xmalloc (alloced);
3539
3540   /* If DIR_NAME is empty, there are no initial contents.  */
3541   if (dir_name)
3542     {
3543       sprintf ((char *) result, "#%s ", dir_name);
3544       out += 2;
3545     }
3546
3547   token = cpp_get_token (pfile);
3548   while (token->type != CPP_EOF)
3549     {
3550       unsigned char *last;
3551       /* Include room for a possible space and the terminating nul.  */
3552       unsigned int len = cpp_token_len (token) + 2;
3553
3554       if (out + len > alloced)
3555         {
3556           alloced *= 2;
3557           if (out + len > alloced)
3558             alloced = out + len;
3559           result = (unsigned char *) xrealloc (result, alloced);
3560         }
3561
3562       last = cpp_spell_token (pfile, token, &result[out], 0);
3563       out = last - result;
3564
3565       token = cpp_get_token (pfile);
3566       if (token->flags & PREV_WHITE)
3567         result[out++] = ' ';
3568     }
3569
3570   result[out] = '\0';
3571   return result;
3572 }
3573
/* Memory buffers.  Changing these three definitions can have a
   dramatic effect on performance.  The values here are reasonable
   defaults, but might be tuned.  If you adjust them, be sure to test
   across a range of uses of cpplib, including heavy nested
   function-like macro expansion.  Also check the change in peak
   memory usage (NJAMD is a good tool for this).  */

/* Smallest payload size, in bytes, of a newly created buffer.  */
#define MIN_BUFF_SIZE 8000

/* Largest buffer size considered an acceptable match for a request of
   MIN_SIZE bytes when reusing a buffer from the free list.  */
#define BUFF_SIZE_UPPER_BOUND(MIN_SIZE) (MIN_BUFF_SIZE + (MIN_SIZE) * 3 / 2)

/* Size to request when extending BUFF: MIN_EXTRA bytes plus twice the
   room left between BUFF's cur and limit pointers.  MIN_EXTRA is
   parenthesized so that an argument containing a lower-precedence
   operator (e.g. a conditional expression) expands correctly.  */
#define EXTENDED_BUFF_SIZE(BUFF, MIN_EXTRA) \
        ((MIN_EXTRA) + ((BUFF)->limit - (BUFF)->cur) * 2)

#if MIN_BUFF_SIZE > BUFF_SIZE_UPPER_BOUND (0)
  #error BUFF_SIZE_UPPER_BOUND must be at least as large as MIN_BUFF_SIZE!
#endif
3588
3589 /* Create a new allocation buffer.  Place the control block at the end
3590    of the buffer, so that buffer overflows will cause immediate chaos.  */
3591 static _cpp_buff *
3592 new_buff (size_t len)
3593 {
3594   _cpp_buff *result;
3595   unsigned char *base;
3596
3597   if (len < MIN_BUFF_SIZE)
3598     len = MIN_BUFF_SIZE;
3599   len = CPP_ALIGN (len);
3600
3601 #ifdef ENABLE_VALGRIND_ANNOTATIONS
3602   /* Valgrind warns about uses of interior pointers, so put _cpp_buff
3603      struct first.  */
3604   size_t slen = CPP_ALIGN2 (sizeof (_cpp_buff), 2 * DEFAULT_ALIGNMENT);
3605   base = XNEWVEC (unsigned char, len + slen);
3606   result = (_cpp_buff *) base;
3607   base += slen;
3608 #else
3609   base = XNEWVEC (unsigned char, len + sizeof (_cpp_buff));
3610   result = (_cpp_buff *) (base + len);
3611 #endif
3612   result->base = base;
3613   result->cur = base;
3614   result->limit = base + len;
3615   result->next = NULL;
3616   return result;
3617 }
3618
3619 /* Place a chain of unwanted allocation buffers on the free list.  */
3620 void
3621 _cpp_release_buff (cpp_reader *pfile, _cpp_buff *buff)
3622 {
3623   _cpp_buff *end = buff;
3624
3625   while (end->next)
3626     end = end->next;
3627   end->next = pfile->free_buffs;
3628   pfile->free_buffs = buff;
3629 }
3630
3631 /* Return a free buffer of size at least MIN_SIZE.  */
3632 _cpp_buff *
3633 _cpp_get_buff (cpp_reader *pfile, size_t min_size)
3634 {
3635   _cpp_buff *result, **p;
3636
3637   for (p = &pfile->free_buffs;; p = &(*p)->next)
3638     {
3639       size_t size;
3640
3641       if (*p == NULL)
3642         return new_buff (min_size);
3643       result = *p;
3644       size = result->limit - result->base;
3645       /* Return a buffer that's big enough, but don't waste one that's
3646          way too big.  */
3647       if (size >= min_size && size <= BUFF_SIZE_UPPER_BOUND (min_size))
3648         break;
3649     }
3650
3651   *p = result->next;
3652   result->next = NULL;
3653   result->cur = result->base;
3654   return result;
3655 }
3656
3657 /* Creates a new buffer with enough space to hold the uncommitted
3658    remaining bytes of BUFF, and at least MIN_EXTRA more bytes.  Copies
3659    the excess bytes to the new buffer.  Chains the new buffer after
3660    BUFF, and returns the new buffer.  */
3661 _cpp_buff *
3662 _cpp_append_extend_buff (cpp_reader *pfile, _cpp_buff *buff, size_t min_extra)
3663 {
3664   size_t size = EXTENDED_BUFF_SIZE (buff, min_extra);
3665   _cpp_buff *new_buff = _cpp_get_buff (pfile, size);
3666
3667   buff->next = new_buff;
3668   memcpy (new_buff->base, buff->cur, BUFF_ROOM (buff));
3669   return new_buff;
3670 }
3671
3672 /* Creates a new buffer with enough space to hold the uncommitted
3673    remaining bytes of the buffer pointed to by BUFF, and at least
3674    MIN_EXTRA more bytes.  Copies the excess bytes to the new buffer.
3675    Chains the new buffer before the buffer pointed to by BUFF, and
3676    updates the pointer to point to the new buffer.  */
3677 void
3678 _cpp_extend_buff (cpp_reader *pfile, _cpp_buff **pbuff, size_t min_extra)
3679 {
3680   _cpp_buff *new_buff, *old_buff = *pbuff;
3681   size_t size = EXTENDED_BUFF_SIZE (old_buff, min_extra);
3682
3683   new_buff = _cpp_get_buff (pfile, size);
3684   memcpy (new_buff->base, old_buff->cur, BUFF_ROOM (old_buff));
3685   new_buff->next = old_buff;
3686   *pbuff = new_buff;
3687 }
3688
3689 /* Free a chain of buffers starting at BUFF.  */
3690 void
3691 _cpp_free_buff (_cpp_buff *buff)
3692 {
3693   _cpp_buff *next;
3694
3695   for (; buff; buff = next)
3696     {
3697       next = buff->next;
3698 #ifdef ENABLE_VALGRIND_ANNOTATIONS
3699       free (buff);
3700 #else
3701       free (buff->base);
3702 #endif
3703     }
3704 }
3705
3706 /* Allocate permanent, unaligned storage of length LEN.  */
3707 unsigned char *
3708 _cpp_unaligned_alloc (cpp_reader *pfile, size_t len)
3709 {
3710   _cpp_buff *buff = pfile->u_buff;
3711   unsigned char *result = buff->cur;
3712
3713   if (len > (size_t) (buff->limit - result))
3714     {
3715       buff = _cpp_get_buff (pfile, len);
3716       buff->next = pfile->u_buff;
3717       pfile->u_buff = buff;
3718       result = buff->cur;
3719     }
3720
3721   buff->cur = result + len;
3722   return result;
3723 }
3724
3725 /* Allocate permanent, unaligned storage of length LEN from a_buff.
3726    That buffer is used for growing allocations when saving macro
3727    replacement lists in a #define, and when parsing an answer to an
3728    assertion in #assert, #unassert or #if (and therefore possibly
3729    whilst expanding macros).  It therefore must not be used by any
3730    code that they might call: specifically the lexer and the guts of
3731    the macro expander.
3732
3733    All existing other uses clearly fit this restriction: storing
3734    registered pragmas during initialization.  */
3735 unsigned char *
3736 _cpp_aligned_alloc (cpp_reader *pfile, size_t len)
3737 {
3738   _cpp_buff *buff = pfile->a_buff;
3739   unsigned char *result = buff->cur;
3740
3741   if (len > (size_t) (buff->limit - result))
3742     {
3743       buff = _cpp_get_buff (pfile, len);
3744       buff->next = pfile->a_buff;
3745       pfile->a_buff = buff;
3746       result = buff->cur;
3747     }
3748
3749   buff->cur = result + len;
3750   return result;
3751 }
3752
3753 /* Commit or allocate storage from a buffer.  */
3754
3755 void *
3756 _cpp_commit_buff (cpp_reader *pfile, size_t size)
3757 {
3758   void *ptr = BUFF_FRONT (pfile->a_buff);
3759
3760   if (pfile->hash_table->alloc_subobject)
3761     {
3762       void *copy = pfile->hash_table->alloc_subobject (size);
3763       memcpy (copy, ptr, size);
3764       ptr = copy;
3765     }
3766   else
3767     BUFF_FRONT (pfile->a_buff) += size;
3768
3769   return ptr;
3770 }
3771
3772 /* Say which field of TOK is in use.  */
3773
3774 enum cpp_token_fld_kind
3775 cpp_token_val_index (const cpp_token *tok)
3776 {
3777   switch (TOKEN_SPELL (tok))
3778     {
3779     case SPELL_IDENT:
3780       return CPP_TOKEN_FLD_NODE;
3781     case SPELL_LITERAL:
3782       return CPP_TOKEN_FLD_STR;
3783     case SPELL_OPERATOR:
3784       /* Operands which were originally spelled as ident keep around
3785          the node for the exact spelling.  */
3786       if (tok->flags & NAMED_OP)
3787         return CPP_TOKEN_FLD_NODE;
3788       else if (tok->type == CPP_PASTE)
3789         return CPP_TOKEN_FLD_TOKEN_NO;
3790       else
3791         return CPP_TOKEN_FLD_NONE;
3792     case SPELL_NONE:
3793       if (tok->type == CPP_MACRO_ARG)
3794         return CPP_TOKEN_FLD_ARG_NO;
3795       else if (tok->type == CPP_PADDING)
3796         return CPP_TOKEN_FLD_SOURCE;
3797       else if (tok->type == CPP_PRAGMA)
3798         return CPP_TOKEN_FLD_PRAGMA;
3799       /* fall through */
3800     default:
3801       return CPP_TOKEN_FLD_NONE;
3802     }
3803 }
3804
3805 /* All tokens lexed in R after calling this function will be forced to
3806    have their location_t to be P, until
3807    cpp_stop_forcing_token_locations is called for R.  */
3808
3809 void
3810 cpp_force_token_locations (cpp_reader *r, location_t loc)
3811 {
3812   r->forced_token_location = loc;
3813 }
3814
3815 /* Go back to assigning locations naturally for lexed tokens.  */
3816
3817 void
3818 cpp_stop_forcing_token_locations (cpp_reader *r)
3819 {
3820   r->forced_token_location = 0;
3821 }