libcpp/lex.c

   1 /* CPP Library - lexical analysis.
   2    Copyright (C) 2000-2017 Free Software Foundation, Inc.
   3    Contributed by Per Bothner, 1994-95.
   4    Based on CCCP program by Paul Rubin, June 1986
   5    Adapted to ANSI C, Richard Stallman, Jan 1987
   6    Broken out to separate file, Zack Weinberg, Mar 2000
   7
   8 This program is free software; you can redistribute it and/or modify it
   9 under the terms of the GNU General Public License as published by the
  10 Free Software Foundation; either version 3, or (at your option) any
  11 later version.
  12
  13 This program is distributed in the hope that it will be useful,
  14 but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16 GNU General Public License for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with this program; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "cpplib.h"
  25 #include "internal.h"
  26
  27 enum spell_type
  28 {
  29   SPELL_OPERATOR = 0,
  30   SPELL_IDENT,
  31   SPELL_LITERAL,
  32   SPELL_NONE
  33 };
  34
     /* One entry of the token_spellings table: the spelling category of a
        token type together with a NUL-terminated string naming it.  */
  35 struct token_spelling
  36 {
  37   enum spell_type category;   /* How tokens of this type are spelled.  */
  38   const unsigned char *name;  /* String supplied by the OP()/TK() entry.  */
  39 };
  40
     /* Spellings of the six digraphs.  NOTE(review): the array is presumably
        indexed by token type relative to the first digraph token type;
        confirm against the users of this table.  */
  41 static const unsigned char *const digraph_spellings[] =
  42 { UC"%:", UC"%:%:", UC"<:", UC":>", UC"<%", UC"%>" };
  43
     /* Expand TTYPE_TABLE into one token_spelling initializer per token type.
        OP (e, s) yields a SPELL_OPERATOR entry named by the string S.
        TK (e, s) yields an entry of category SPELL_<S> named by the
        enumerator E turned into a string.  Both helper macros exist only
        for this expansion and are undefined again right after it.  */
  44 #define OP(e, s) { SPELL_OPERATOR, UC s  },
  45 #define TK(e, s) { SPELL_ ## s,    UC #e },
  46 static const struct token_spelling token_spellings[N_TTYPES] = { TTYPE_TABLE };
  47 #undef OP
  48 #undef TK
  49
     /* Look up the spelling category and the name of TOKEN, using its type
        as the index into token_spellings.  */
  50 #define TOKEN_SPELL(token) (token_spellings[(token)->type].category)
  51 #define TOKEN_NAME(token) (token_spellings[(token)->type].name)
  52
     /* Prototypes for file-local (static) functions.  */
  53 static void add_line_note (cpp_buffer *, const uchar *, unsigned int);
  54 static int skip_line_comment (cpp_reader *);
  55 static void skip_whitespace (cpp_reader *, cppchar_t);
  56 static void lex_string (cpp_reader *, cpp_token *, const uchar *);
  57 static void save_comment (cpp_reader *, cpp_token *, const uchar *, cppchar_t);
  58 static void store_comment (cpp_reader *, cpp_token *);
  59 static void create_literal (cpp_reader *, cpp_token *, const uchar *,
  60                             unsigned int, enum cpp_ttype);
  61 static bool warn_in_comment (cpp_reader *, _cpp_line_note *);
  62 static int name_p (cpp_reader *, const cpp_string *);
  63 static tokenrun *next_tokenrun (tokenrun *);
  64
  65 static _cpp_buff *new_buff (size_t);
  66
  67
  68 /* Utility routine:
  69
  70    Compares, the token TOKEN to the NUL-terminated string STRING.
  71    TOKEN must be a CPP_NAME.  Returns 1 for equal, 0 for unequal.  */
  72 int
  73 cpp_ideq (const cpp_token *token, const char *string)
  74 {
  75   if (token->type != CPP_NAME)
  76     return 0;
  77
  78   return !ustrcmp (NODE_NAME (token->val.node.node), (const uchar *) string);
  79 }
  80
  81 /* Record a note TYPE at byte POS into the current cleaned logical
  82    line.  */
  83 static void
  84 add_line_note (cpp_buffer *buffer, const uchar *pos, unsigned int type)
  85 {
  86   if (buffer->notes_used == buffer->notes_cap)
  87     {
  88       buffer->notes_cap = buffer->notes_cap * 2 + 200;
  89       buffer->notes = XRESIZEVEC (_cpp_line_note, buffer->notes,
  90                                   buffer->notes_cap);
  91     }
  92
  93   buffer->notes[buffer->notes_used].pos = pos;
  94   buffer->notes[buffer->notes_used].type = type;
  95   buffer->notes_used++;
  96 }
  97
  98 \f
  99 /* Fast path to find line special characters using optimized character
 100    scanning algorithms.  Anything complicated falls back to the slow
 101    path below.  Since this loop is very hot it's worth doing these kinds
 102    of optimizations.
 103
 104    One of the paths through the ifdefs should provide
 105
 106      const uchar *search_line_fast (const uchar *s, const uchar *end);
 107
 108    Between S and END, search for \n, \r, \\, ?.  Return a pointer to
 109    the found character.
 110
 111    Note that the last character of the buffer is *always* a newline,
 112    as forced by _cpp_convert_input.  This fact can be used to avoid
 113    explicitly looking for the end of the buffer.  */
 114
 115 /* Configure gives us an ifdef test.  Supply a value of 0 when the macro
        is absent so that it can be used in ordinary `if' conditions below.  */
 116 #ifndef WORDS_BIGENDIAN
 117 #define WORDS_BIGENDIAN 0
 118 #endif
 119
 120 /* We'd like the largest integer that fits into a register.  There's nothing
 121    in <stdint.h> that gives us that.  For most hosts this is unsigned long,
 122    but MS decided on an LLP64 model.  Thankfully when building with GCC we
 123    can get the "real" word size.  */
 124 #ifdef __GNUC__
 125 typedef unsigned int word_type __attribute__((__mode__(__word__)));
 126 #else
 127 typedef unsigned long word_type;
 128 #endif
 129
 130 /* The code below is only expecting sizes 4 or 8.
 131    Die at compile-time if this expectation is violated.  The array size
        evaluates to 1 for an accepted size and to -1, which is invalid,
        for any other.  */
 132 typedef char check_word_type_size
 133   [(sizeof(word_type) == 8 || sizeof(word_type) == 4) * 2 - 1];
 134
 135 /* Return X with the first N bytes forced to values that won't match one
 136    of the interesting characters.  Note that NUL is not interesting.  */
 137
 138 static inline word_type
 139 acc_char_mask_misalign (word_type val, unsigned int n)
 140 {
 141   word_type mask = -1;
 142   if (WORDS_BIGENDIAN)
 143     mask >>= n * 8;
 144   else
 145     mask <<= n * 8;
 146   return val & mask;
 147 }
 148
 149 /* Return X replicated to all byte positions within WORD_TYPE.  */
 150
 151 static inline word_type
 152 acc_char_replicate (uchar x)
 153 {
 154   word_type ret;
 155
 156   ret = (x << 24) | (x << 16) | (x << 8) | x;
 157   if (sizeof(word_type) == 8)
 158     ret = (ret << 16 << 16) | ret;
 159   return ret;
 160 }
 161
 162 /* Return non-zero if some byte of VAL is (probably) C.  */
 163
 164 static inline word_type
 165 acc_char_cmp (word_type val, word_type c)
 166 {
 167 #if defined(__GNUC__) && defined(__alpha__)
 168   /* We can get exact results using a compare-bytes instruction.
 169      Get (val == c) via (0 >= (val ^ c)).  */
 170   return __builtin_alpha_cmpbge (0, val ^ c);
 171 #else
 172   word_type magic = 0x7efefefeU;
 173   if (sizeof(word_type) == 8)
 174     magic = (magic << 16 << 16) | 0xfefefefeU;
 175   magic |= 1;
 176
 177   val ^= c;
 178   return ((val + magic) ^ ~val) & ~magic;
 179 #endif
 180 }
 181
 182 /* Given the result of acc_char_cmp is non-zero, return the index of
 183    the found character.  If this was a false positive, return -1.  */
 184
 185 static inline int
 186 acc_char_index (word_type cmp ATTRIBUTE_UNUSED,
 187                 word_type val ATTRIBUTE_UNUSED)
 188 {
 189 #if defined(__GNUC__) && defined(__alpha__) && !WORDS_BIGENDIAN
 190   /* The cmpbge instruction sets *bits* of the result corresponding to
 191      matches in the bytes with no false positives.  */
 192   return __builtin_ctzl (cmp);
 193 #else
 194   unsigned int i;
 195
 196   /* ??? It would be nice to force unrolling here,
 197      and have all of these constants folded.  */
 198   for (i = 0; i < sizeof(word_type); ++i)
 199     {
 200       uchar c;
 201       if (WORDS_BIGENDIAN)
 202         c = (val >> (sizeof(word_type) - i - 1) * 8) & 0xff;
 203       else
 204         c = (val >> i * 8) & 0xff;
 205
 206       if (c == '\n' || c == '\r' || c == '\\' || c == '?')
 207         return i;
 208     }
 209
 210   return -1;
 211 #endif
 212 }
 213
 214 /* A version of the fast scanner using bit fiddling techniques.
 215
 216    For 32-bit words, one would normally perform 16 comparisons and
 217    16 branches.  With this algorithm one performs 24 arithmetic
 218    operations and one branch.  Whether this is faster with a 32-bit
 219    word size is going to be somewhat system dependent.
 220
 221    For 64-bit words, we eliminate twice the number of comparisons
 222    and branches without increasing the number of arithmetic operations.
 223    It's almost certainly going to be a win with 64-bit word size.  */
 224
 225 static const uchar * search_line_acc_char (const uchar *, const uchar *)
 226   ATTRIBUTE_UNUSED;
 227
 228 static const uchar *
 229 search_line_acc_char (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 230 {
 231   const word_type repl_nl = acc_char_replicate ('\n');
 232   const word_type repl_cr = acc_char_replicate ('\r');
 233   const word_type repl_bs = acc_char_replicate ('\\');
 234   const word_type repl_qm = acc_char_replicate ('?');
 235
 236   unsigned int misalign;
 237   const word_type *p;
 238   word_type val, t;
 239
 240   /* Align the buffer.  Mask out any bytes from before the beginning.  */
 241   p = (word_type *)((uintptr_t)s & -sizeof(word_type));
 242   val = *p;
 243   misalign = (uintptr_t)s & (sizeof(word_type) - 1);
 244   if (misalign)
 245     val = acc_char_mask_misalign (val, misalign);
 246
 247   /* Main loop.  */
 248   while (1)
 249     {
 250       t  = acc_char_cmp (val, repl_nl);
 251       t |= acc_char_cmp (val, repl_cr);
 252       t |= acc_char_cmp (val, repl_bs);
 253       t |= acc_char_cmp (val, repl_qm);
 254
 255       if (__builtin_expect (t != 0, 0))
 256         {
 257           int i = acc_char_index (t, val);
 258           if (i >= 0)
 259             return (const uchar *)p + i;
 260         }
 261
 262       val = *++p;
 263     }
 264 }
 265
 266 /* Disable on Solaris 2/x86 until the following problem can be properly
 267    autoconfed:
 268
 269    The Solaris 10+ assembler tags objects with the instruction set
 270    extensions used, so SSE4.2 executables cannot run on machines that
 271    don't support that extension.  */
 272
 273 #if (GCC_VERSION >= 4005) && (__GNUC__ >= 5 || !defined(__PIC__)) && (defined(__i386__) || defined(__x86_64__)) && !(defined(__sun__) && defined(__svr4__))
 274
 275 /* Replicated character data to be shared between implementations.
 276    Recall that outside of a context with vector support we can't
 277    define compatible vector types, therefore these are all defined
 278    in terms of raw characters.

        Rows 0 to 3 hold newline, carriage return, backslash and question
        mark respectively.  The MMX and SSE2 scanners pick the rows up by
        these indices and read them through vector-typed pointers.  */
 279 static const char repl_chars[4][16] __attribute__((aligned(16))) = {
 280   { '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
 281     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n' },
 282   { '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
 283     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r' },
 284   { '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
 285     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\' },
 286   { '?', '?', '?', '?', '?', '?', '?', '?',
 287     '?', '?', '?', '?', '?', '?', '?', '?' },
 288 };
 289
 290 /* A version of the fast scanner using MMX vectorized byte compare insns.
 291
 292    This uses the PMOVMSKB instruction which was introduced with "MMX2",
 293    which was packaged into SSE1; it is also present in the AMD MMX
 294    extension.  Mark the function as using "sse" so that we emit a real
 295    "emms" instruction, rather than the 3dNOW "femms" instruction.

        S is the first byte to examine; END is unused.  Returns a pointer
        to the first '\n', '\r', '\\' or '?' at or after S.  The loop has
        no end-of-buffer test: it stops only when one of those characters
        is found.  */
 296
 297 static const uchar *
 298 #ifndef __SSE__
 299 __attribute__((__target__("sse")))
 300 #endif
 301 search_line_mmx (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 302 {
 303   typedef char v8qi __attribute__ ((__vector_size__ (8)));
 304   typedef int __m64 __attribute__ ((__vector_size__ (8), __may_alias__));
 305
       /* Each constant is the first 8 bytes of the matching repl_chars row.  */
 306   const v8qi repl_nl = *(const v8qi *)repl_chars[0];
 307   const v8qi repl_cr = *(const v8qi *)repl_chars[1];
 308   const v8qi repl_bs = *(const v8qi *)repl_chars[2];
 309   const v8qi repl_qm = *(const v8qi *)repl_chars[3];
 310
 311   unsigned int misalign, found, mask;
 312   const v8qi *p;
 313   v8qi data, t, c;
 314
 315   /* Align the source pointer.  While MMX doesn't generate unaligned data
 316      faults, this allows us to safely scan to the end of the buffer without
 317      reading beyond the end of the last page.  */
 318   misalign = (uintptr_t)s & 7;
 319   p = (const v8qi *)((uintptr_t)s & -8);
 320   data = *p;
 321
 322   /* Create a mask for the bytes that are valid within the first
 323      8-byte block.  The Idea here is that the AND with the mask
 324      within the loop is "free", since we need some AND or TEST
 325      insn in order to set the flags for the branch anyway.  */
 326   mask = -1u << misalign;
 327
 328   /* Main loop processing 8 bytes at a time.  Enter it in the middle:
          the first block is already loaded and must be tested with the
          partial MASK.  Later iterations load the next block and accept
          matches in every byte.  */
 329   goto start;
 330   do
 331     {
 332       data = *++p;
 333       mask = -1;
 334
 335     start:
           /* OR together the per-byte compare results for the four
              characters, then collapse them to one bit per byte.  */
 336       t = __builtin_ia32_pcmpeqb(data, repl_nl);
 337       c = __builtin_ia32_pcmpeqb(data, repl_cr);
 338       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
 339       c = __builtin_ia32_pcmpeqb(data, repl_bs);
 340       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
 341       c = __builtin_ia32_pcmpeqb(data, repl_qm);
 342       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
 343       found = __builtin_ia32_pmovmskb (t);
 344       found &= mask;
 345     }
 346   while (!found);
 347
 348   __builtin_ia32_emms ();
 349
 350   /* FOUND contains 1 in bits for which we matched a relevant
 351      character.  Conversion to the byte index is trivial.  */
 352   found = __builtin_ctz(found);
 353   return (const uchar *)p + found;
 354 }
 355
 356 /* A version of the fast scanner using SSE2 vectorized byte compare insns.

        S is the first byte to examine; END is unused.  Returns a pointer
        to the first '\n', '\r', '\\' or '?' at or after S.  The loop has
        no end-of-buffer test: it stops only when one of those characters
        is found.  */
 357
 358 static const uchar *
 359 #ifndef __SSE2__
 360 __attribute__((__target__("sse2")))
 361 #endif
 362 search_line_sse2 (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 363 {
 364   typedef char v16qi __attribute__ ((__vector_size__ (16)));
 365
       /* Each constant is one full 16-byte row of repl_chars.  */
 366   const v16qi repl_nl = *(const v16qi *)repl_chars[0];
 367   const v16qi repl_cr = *(const v16qi *)repl_chars[1];
 368   const v16qi repl_bs = *(const v16qi *)repl_chars[2];
 369   const v16qi repl_qm = *(const v16qi *)repl_chars[3];
 370
 371   unsigned int misalign, found, mask;
 372   const v16qi *p;
 373   v16qi data, t;
 374
 375   /* Align the source pointer.  The first load therefore starts at the
          16-byte boundary at or below S.  */
 376   misalign = (uintptr_t)s & 15;
 377   p = (const v16qi *)((uintptr_t)s & -16);
 378   data = *p;
 379
 380   /* Create a mask for the bytes that are valid within the first
 381      16-byte block.  The Idea here is that the AND with the mask
 382      within the loop is "free", since we need some AND or TEST
 383      insn in order to set the flags for the branch anyway.  */
 384   mask = -1u << misalign;
 385
 386   /* Main loop processing 16 bytes at a time.  Enter it in the middle:
          the first block is already loaded and must be tested with the
          partial MASK.  Later iterations load the next block and accept
          matches in every byte.  */
 387   goto start;
 388   do
 389     {
 390       data = *++p;
 391       mask = -1;
 392
 393     start:
 394       t  = __builtin_ia32_pcmpeqb128(data, repl_nl);
 395       t |= __builtin_ia32_pcmpeqb128(data, repl_cr);
 396       t |= __builtin_ia32_pcmpeqb128(data, repl_bs);
 397       t |= __builtin_ia32_pcmpeqb128(data, repl_qm);
 398       found = __builtin_ia32_pmovmskb128 (t);
 399       found &= mask;
 400     }
 401   while (!found);
 402
 403   /* FOUND contains 1 in bits for which we matched a relevant
 404      character.  Conversion to the byte index is trivial.  */
 405   found = __builtin_ctz(found);
 406   return (const uchar *)p + found;
 407 }
 408
 409 #ifdef HAVE_SSE4
 410 /* A version of the fast scanner using SSE 4.2 vectorized string insns.

        S is the first byte to examine and END is the end of the buffer; END
        is consulted only to decide whether an unaligned 16-byte read at S
        is safe.  Returns a pointer to the first '\n', '\r', '\\' or '?' at
        or after S.  The main loop has no end-of-buffer test.  */
 411
 412 static const uchar *
 413 #ifndef __SSE4_2__
 414 __attribute__((__target__("sse4.2")))
 415 #endif
 416 search_line_sse42 (const uchar *s, const uchar *end)
 417 {
 418   typedef char v16qi __attribute__ ((__vector_size__ (16)));
       /* The four characters searched for; the remaining elements are zero
          and are excluded by the length operand of 4 passed below.  */
 419   static const v16qi search = { '\n', '\r', '?', '\\' };
 420
 421   uintptr_t si = (uintptr_t)s;
 422   uintptr_t index;
 423
 424   /* Check for unaligned input.  */
 425   if (si & 15)
 426     {
 427       v16qi sv;
 428
 429       if (__builtin_expect (end - s < 16, 0)
 430           && __builtin_expect ((si & 0xfff) > 0xff0, 0))
 431         {
 432           /* There are less than 16 bytes left in the buffer, and less
 433              than 16 bytes left on the page.  Reading 16 bytes at this
 434              point might generate a spurious page fault.  Defer to the
 435              SSE2 implementation, which already handles alignment.  */
 436           return search_line_sse2 (s, end);
 437         }
 438
 439       /* ??? The builtin doesn't understand that the PCMPESTRI read from
 440          memory need not be aligned.  */
 441       sv = __builtin_ia32_loaddqu ((const char *) s);
 442       index = __builtin_ia32_pcmpestri128 (search, 4, sv, 16, 0);
 443
           /* An index below 16 is treated as a match within these 16 bytes.  */
 444       if (__builtin_expect (index < 16, 0))
 445         goto found;
 446
 447       /* Advance the pointer to an aligned address.  We will re-scan a
 448          few bytes, but we no longer need care for reading past the
 449          end of a page, since we're guaranteed a match.  */
 450       s = (const uchar *)((si + 15) & -16);
 451     }
 452
 453   /* Main loop, processing 16 bytes at a time.  */
 454 #ifdef __GCC_ASM_FLAG_OUTPUTS__
 455   while (1)
 456     {
 457       char f;
 458
 459       /* By using inline assembly instead of the builtin,
 460          we can use the result, as well as the flags set.  */
 461       __asm ("%vpcmpestri\t$0, %2, %3"
 462              : "=c"(index), "=@ccc"(f)
 463              : "m"(*s), "x"(search), "a"(4), "d"(16));
           /* F receives the carry flag; non-zero ends the search.  */
 464       if (f)
 465         break;
 466
 467       s += 16;
 468     }
 469 #else
       /* The assembly loop adds 16 to S before each compare, so start one
          block early.  */
 470   s -= 16;
 471   /* By doing the whole loop in inline assembly,
 472      we can make proper use of the flags set.  */
 473   __asm (      ".balign 16\n"
 474         "0:     add $16, %1\n"
 475         "       %vpcmpestri\t$0, (%1), %2\n"
 476         "       jnc 0b"
 477         : "=&c"(index), "+r"(s)
 478         : "x"(search), "a"(4), "d"(16));
 479 #endif
 480
 481  found:
       /* INDEX is the offset of the match within the block starting at S.  */
 482   return s + index;
 483 }
 484
 485 #else
 486 /* Work around out-dated assemblers without sse4 support.  */
 487 #define search_line_sse42 search_line_sse2
 488 #endif
 489
 490 /* Check the CPU capabilities.  */
 491
 492 #include "../gcc/config/i386/cpuid.h"
 493
 494 typedef const uchar * (*search_line_fast_type) (const uchar *, const uchar *);
 495 static search_line_fast_type search_line_fast;
 496
 497 #define HAVE_init_vectorized_lexer 1
 498 static inline void
 499 init_vectorized_lexer (void)
 500 {
 501   unsigned dummy, ecx = 0, edx = 0;
 502   search_line_fast_type impl = search_line_acc_char;
 503   int minimum = 0;
 504
 505 #if defined(__SSE4_2__)
 506   minimum = 3;
 507 #elif defined(__SSE2__)
 508   minimum = 2;
 509 #elif defined(__SSE__)
 510   minimum = 1;
 511 #endif
 512
 513   if (minimum == 3)
 514     impl = search_line_sse42;
 515   else if (__get_cpuid (1, &dummy, &dummy, &ecx, &edx) || minimum == 2)
 516     {
 517       if (minimum == 3 || (ecx & bit_SSE4_2))
 518         impl = search_line_sse42;
 519       else if (minimum == 2 || (edx & bit_SSE2))
 520         impl = search_line_sse2;
 521       else if (minimum == 1 || (edx & bit_SSE))
 522         impl = search_line_mmx;
 523     }
 524   else if (__get_cpuid (0x80000001, &dummy, &dummy, &dummy, &edx))
 525     {
 526       if (minimum == 1
 527           || (edx & (bit_MMXEXT | bit_CMOV)) == (bit_MMXEXT | bit_CMOV))
 528         impl = search_line_mmx;
 529     }
 530
 531   search_line_fast = impl;
 532 }
 533
 534 #elif defined(_ARCH_PWR8) && defined(__ALTIVEC__)
 535
 536 /* A version of the fast scanner using AltiVec vectorized byte compares
 537    and VSX unaligned loads (when VSX is available).  This is otherwise
 538    the same as the pre-GCC 5 version.

        S is the first byte to examine; END is unused.  Returns a pointer
        to the first '\n', '\r', '\\' or '?' at or after S.  Blocks are
        loaded directly from S without an alignment step, and the loop has
        no end-of-buffer test.  */
 539
 540 ATTRIBUTE_NO_SANITIZE_UNDEFINED
 541 static const uchar *
 542 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 543 {
 544   typedef __attribute__((altivec(vector))) unsigned char vc;
 545
 546   const vc repl_nl = {
 547     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
 548     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
 549   };
 550   const vc repl_cr = {
 551     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
 552     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
 553   };
 554   const vc repl_bs = {
 555     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
 556     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
 557   };
 558   const vc repl_qm = {
 559     '?', '?', '?', '?', '?', '?', '?', '?',
 560     '?', '?', '?', '?', '?', '?', '?', '?',
 561   };
 562   const vc zero = { 0 };
 563
 564   vc data, t;
 565
 566   /* Main loop processing 16 bytes at a time.  */
 567   do
 568     {
 569       vc m_nl, m_cr, m_bs, m_qm;
 570
 571       data = *((const vc *)s);
 572       s += 16;
 573
 574       m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
 575       m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
 576       m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
 577       m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
 578       t = (m_nl | m_cr) | (m_bs | m_qm);
 579
 580       /* T now contains 0xff in bytes for which we matched one of the relevant
 581          characters.  We want to exit the loop if any byte in T is non-zero.
 582          Below is the expansion of vec_any_ne(t, zero).  */
 583     }
 584   while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));
 585
 586   /* Restore s to point to the 16 bytes we just processed.  */
 587   s -= 16;
 588
 589   {
         /* Number of unsigned longs that make up one vector.  */
 590 #define N  (sizeof(vc) / sizeof(long))
 591
 592     union {
 593       vc v;
 594       /* Statically assert that N is 2 or 4.  */
 595       unsigned long l[(N == 2 || N == 4) ? N : -1];
 596     } u;
 597     unsigned long l, i = 0;
 598
 599     u.v = t;
 600
 601     /* Find the first word of T that is non-zero.  S is advanced past
            every all-zero word that is skipped.  */
 602     switch (N)
 603       {
 604       case 4:
 605         l = u.l[i++];
 606         if (l != 0)
 607           break;
 608         s += sizeof(unsigned long);
 609         l = u.l[i++];
 610         if (l != 0)
 611           break;
 612         s += sizeof(unsigned long);
 613         /* FALLTHRU */
 614       case 2:
 615         l = u.l[i++];
 616         if (l != 0)
 617           break;
 618         s += sizeof(unsigned long);
 619         l = u.l[i];
 620       }
 621
 622     /* L now contains 0xff in bytes for which we matched one of the
 623        relevant characters.  We can find the byte index by finding
 624        its bit index and dividing by 8.  The first byte in memory is
            the most significant one on big-endian, hence the leading-zero
            count there and the trailing-zero count otherwise.  */
 625 #ifdef __BIG_ENDIAN__
 626     l = __builtin_clzl(l) >> 3;
 627 #else
 628     l = __builtin_ctzl(l) >> 3;
 629 #endif
 630     return s + l;
 631
 632 #undef N
 633   }
 634 }
 635
 636 #elif (GCC_VERSION >= 4005) && defined(__ALTIVEC__) && defined (__BIG_ENDIAN__)
 637
 638 /* A version of the fast scanner using AltiVec vectorized byte compares.
 639    This cannot be used for little endian because vec_lvsl/lvsr are
 640    deprecated for little endian and the code won't work properly.

        S is the first byte to examine; END is unused.  Returns a pointer
        to the first '\n', '\r', '\\' or '?' at or after S.  The loop has
        no end-of-buffer test.  */
 641 /* ??? Unfortunately, attribute(target("altivec")) is not yet supported,
 642    so we can't compile this function without -maltivec on the command line
 643    (or implied by some other switch).  */
 644
 645 static const uchar *
 646 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 647 {
 648   typedef __attribute__((altivec(vector))) unsigned char vc;
 649
 650   const vc repl_nl = {
 651     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
 652     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
 653   };
 654   const vc repl_cr = {
 655     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
 656     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
 657   };
 658   const vc repl_bs = {
 659     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
 660     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
 661   };
 662   const vc repl_qm = {
 663     '?', '?', '?', '?', '?', '?', '?', '?',
 664     '?', '?', '?', '?', '?', '?', '?', '?',
 665   };
 666   const vc ones = {
 667     -1, -1, -1, -1, -1, -1, -1, -1,
 668     -1, -1, -1, -1, -1, -1, -1, -1,
 669   };
 670   const vc zero = { 0 };
 671
 672   vc data, mask, t;
 673
 674   /* Altivec loads automatically mask addresses with -16.  This lets us
 675      issue the first load as early as possible.  */
 676   data = __builtin_vec_ld(0, (const vc *)s);
 677
 678   /* Discard bytes before the beginning of the buffer.  Do this by
 679      beginning with all ones and shifting in zeros according to the
 680      mis-alignment.  The LVSR instruction pulls the exact shift we
 681      want from the address.  */
 682   mask = __builtin_vec_lvsr(0, s);
 683   mask = __builtin_vec_perm(zero, ones, mask);
 684   data &= mask;
 685
 686   /* While altivec loads mask addresses, we still need to align S so
 687      that the offset we compute at the end is correct.  */
 688   s = (const uchar *)((uintptr_t)s & -16);
 689
 690   /* Main loop processing 16 bytes at a time.  Enter it in the middle,
          because the first block has already been loaded and masked.  */
 691   goto start;
 692   do
 693     {
 694       vc m_nl, m_cr, m_bs, m_qm;
 695
 696       s += 16;
 697       data = __builtin_vec_ld(0, (const vc *)s);
 698
 699     start:
 700       m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
 701       m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
 702       m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
 703       m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
 704       t = (m_nl | m_cr) | (m_bs | m_qm);
 705
 706       /* T now contains 0xff in bytes for which we matched one of the relevant
 707          characters.  We want to exit the loop if any byte in T is non-zero.
 708          Below is the expansion of vec_any_ne(t, zero).  */
 709     }
 710   while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));
 711
 712   {
         /* Number of unsigned longs that make up one vector.  */
 713 #define N  (sizeof(vc) / sizeof(long))
 714
 715     union {
 716       vc v;
 717       /* Statically assert that N is 2 or 4.  */
 718       unsigned long l[(N == 2 || N == 4) ? N : -1];
 719     } u;
 720     unsigned long l, i = 0;
 721
 722     u.v = t;
 723
 724     /* Find the first word of T that is non-zero.  S is advanced past
            every all-zero word that is skipped.  */
 725     switch (N)
 726       {
 727       case 4:
 728         l = u.l[i++];
 729         if (l != 0)
 730           break;
 731         s += sizeof(unsigned long);
 732         l = u.l[i++];
 733         if (l != 0)
 734           break;
 735         s += sizeof(unsigned long);
 736         /* FALLTHROUGH */
 737       case 2:
 738         l = u.l[i++];
 739         if (l != 0)
 740           break;
 741         s += sizeof(unsigned long);
 742         l = u.l[i];
 743       }
 744
 745     /* L now contains 0xff in bytes for which we matched one of the
 746        relevant characters.  We can find the byte index by finding
 747        its bit index and dividing by 8.  This version is big-endian
            only, so the first byte in memory is the most significant one
            and the leading-zero count is the one to use.  */
 748     l = __builtin_clzl(l) >> 3;
 749     return s + l;
 750
 751 #undef N
 752   }
 753 }
 754
 755 #elif defined (__ARM_NEON) && defined (__ARM_64BIT_STATE)
 756 #include "arm_neon.h"
 757
 758 /* This doesn't have to be the exact page size, but no system may use
 759    a size smaller than this.  ARMv8 requires a minimum page size of
 760    4k.  The impact of being conservative here is a small number of
 761    cases will take the slightly slower entry path into the main
 762    loop.  */
 763
 764 #define AARCH64_MIN_PAGE_SIZE 4096
 765
     /* A version of the fast scanner using AArch64 Advanced SIMD byte
        compares.  S is the first byte to examine; END is unused.  Returns a
        pointer to the first '\n', '\r', '\\' or '?' at or after S.  The
        main loop has no end-of-buffer test.  */
 766 static const uchar *
 767 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 768 {
 769   const uint8x16_t repl_nl = vdupq_n_u8 ('\n');
 770   const uint8x16_t repl_cr = vdupq_n_u8 ('\r');
 771   const uint8x16_t repl_bs = vdupq_n_u8 ('\\');
 772   const uint8x16_t repl_qm = vdupq_n_u8 ('?');
       /* Gives byte K of each 8-byte half the single bit 1 << K.  */
 773   const uint8x16_t xmask = (uint8x16_t) vdupq_n_u64 (0x8040201008040201ULL);
 774
       /* Per-lane shifts that move the bits of one 8-byte half up by 8, so
          that both halves combine into one 16-bit mask.  */
 775 #ifdef __AARCH64EB
 776   const int16x8_t shift = {8, 8, 8, 8, 0, 0, 0, 0};
 777 #else
 778   const int16x8_t shift = {0, 0, 0, 0, 8, 8, 8, 8};
 779 #endif
 780
 781   unsigned int found;
 782   const uint8_t *p;
 783   uint8x16_t data;
 784   uint8x16_t t;
 785   uint16x8_t m;
 786   uint8x16_t u, v, w;
 787
 788   /* Align the source pointer.  */
 789   p = (const uint8_t *)((uintptr_t)s & -16);
 790
 791   /* Assuming random string start positions, with a 4k page size we'll take
 792      the slow path about 0.37% of the time.  */
 793   if (__builtin_expect ((AARCH64_MIN_PAGE_SIZE
 794                          - (((uintptr_t) s) & (AARCH64_MIN_PAGE_SIZE - 1)))
 795                         < 16, 0))
 796     {
 797       /* Slow path: the string starts near a possible page boundary.
              Load the aligned block that contains S and ignore matches in
              the bytes that precede S.  */
 798       uint32_t misalign, mask;
 799
 800       misalign = (uintptr_t)s & 15;
 801       mask = (-1u << misalign) & 0xffff;
 802       data = vld1q_u8 (p);
 803       t = vceqq_u8 (data, repl_nl);
 804       u = vceqq_u8 (data, repl_cr);
 805       v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
 806       w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
 807       t = vorrq_u8 (v, w);
 808       t = vandq_u8 (t, xmask);
 809       m = vpaddlq_u8 (t);
 810       m = vshlq_u16 (m, shift);
 811       found = vaddvq_u16 (m);
 812       found &= mask;
 813       if (found)
 814         return (const uchar*)p + __builtin_ctz (found);
 815     }
 816   else
 817     {
           /* Fast path: 16 bytes starting at S cannot cross a page
              boundary, so load them unaligned.  */
 818       data = vld1q_u8 ((const uint8_t *) s);
 819       t = vceqq_u8 (data, repl_nl);
 820       u = vceqq_u8 (data, repl_cr);
 821       v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
 822       w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
 823       t = vorrq_u8 (v, w);
 824       if (__builtin_expect (vpaddd_u64 ((uint64x2_t)t) != 0, 0))
 825         goto done;
 826     }
 827
       /* Main loop: aligned 16-byte blocks, beginning with the block after
          the one P points at.  */
 828   do
 829     {
 830       p += 16;
 831       data = vld1q_u8 (p);
 832       t = vceqq_u8 (data, repl_nl);
 833       u = vceqq_u8 (data, repl_cr);
 834       v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
 835       w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
 836       t = vorrq_u8 (v, w);
 837     } while (!vpaddd_u64 ((uint64x2_t)t));
 838
 839 done:
 840   /* Now that we've found the terminating substring, work out precisely where
 841      we need to stop.  */
 842   t = vandq_u8 (t, xmask);
 843   m = vpaddlq_u8 (t);
 844   m = vshlq_u16 (m, shift);
 845   found = vaddvq_u16 (m);
       /* P is still below S only when the match came from the unaligned
          load at S; in that case the offset is relative to S.  */
 846   return (((((uintptr_t) p) < (uintptr_t) s) ? s : (const uchar *)p)
 847           + __builtin_ctz (found));
 848 }
 849
 850 #elif defined (__ARM_NEON)
 851 #include "arm_neon.h"
 852
     /* A version of the fast scanner using ARM NEON byte compares.  S is
        the first byte to examine; END is unused.  Returns a pointer to the
        first '\n', '\r', '\\' or '?' at or after S.  The loop has no
        end-of-buffer test.  */
 853 static const uchar *
 854 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 855 {
 856   const uint8x16_t repl_nl = vdupq_n_u8 ('\n');
 857   const uint8x16_t repl_cr = vdupq_n_u8 ('\r');
 858   const uint8x16_t repl_bs = vdupq_n_u8 ('\\');
 859   const uint8x16_t repl_qm = vdupq_n_u8 ('?');
       /* Gives byte K of each 8-byte half the single bit 1 << K.  */
 860   const uint8x16_t xmask = (uint8x16_t) vdupq_n_u64 (0x8040201008040201ULL);
 861
 862   unsigned int misalign, found, mask;
 863   const uint8_t *p;
 864   uint8x16_t data;
 865
 866   /* Align the source pointer.  */
 867   misalign = (uintptr_t)s & 15;
 868   p = (const uint8_t *)((uintptr_t)s & -16);
 869   data = vld1q_u8 (p);
 870
 871   /* Create a mask for the bytes that are valid within the first
 872      16-byte block.  The Idea here is that the AND with the mask
 873      within the loop is "free", since we need some AND or TEST
 874      insn in order to set the flags for the branch anyway.  */
 875   mask = (-1u << misalign) & 0xffff;
 876
 877   /* Main loop, processing 16 bytes at a time.  Enter it in the middle:
          the first block is already loaded and must be tested with the
          partial MASK.  */
 878   goto start;
 879
 880   do
 881     {
 882       uint8x8_t l;
 883       uint16x4_t m;
 884       uint32x2_t n;
 885       uint8x16_t t, u, v, w;
 886
 887       p += 16;
 888       data = vld1q_u8 (p);
 889       mask = 0xffff;
 890
 891     start:
 892       t = vceqq_u8 (data, repl_nl);
 893       u = vceqq_u8 (data, repl_cr);
 894       v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
 895       w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
 896       t = vandq_u8 (vorrq_u8 (v, w), xmask);
           /* Three rounds of pairwise addition fold the per-byte bits of
              each 8-byte half into one 8-bit mask per 32-bit lane of N.  */
 897       l = vpadd_u8 (vget_low_u8 (t), vget_high_u8 (t));
 898       m = vpaddl_u8 (l);
 899       n = vpaddl_u16 (m);
 900
           /* Bring the upper half's mask down next to the lower one, so
              that the low 16 bits of FOUND hold one bit per byte.  */
 901       found = vget_lane_u32 ((uint32x2_t) vorr_u64 ((uint64x1_t) n,
 902               vshr_n_u64 ((uint64x1_t) n, 24)), 0);
 903       found &= mask;
 904     }
 905   while (!found);
 906
 907   /* FOUND contains 1 in bits for which we matched a relevant
 908      character.  Conversion to the byte index is trivial.  */
 909   found = __builtin_ctz (found);
 910   return (const uchar *)p + found;
 911 }
 912
 913 #else
 914
 915 /* We only have one accelerated alternative.  Use a direct call so that
 916    we encourage inlining.  */
 917
 918 #define search_line_fast  search_line_acc_char
 919
 920 #endif
 921
 922 /* Initialize the lexer if needed.  */
 923
 924 void
 925 _cpp_init_lexer (void)
 926 {
 927 #ifdef HAVE_init_vectorized_lexer
 928   init_vectorized_lexer ();
 929 #endif
 930 }
 931
 932 /* Returns with a logical line that contains no escaped newlines or
 933    trigraphs.  This is a time-critical inner loop.  */
 934 void
 935 _cpp_clean_line (cpp_reader *pfile)
 936 {
 937   cpp_buffer *buffer;
 938   const uchar *s;
 939   uchar c, *d, *p;
 940
 941   buffer = pfile->buffer;
 942   buffer->cur_note = buffer->notes_used = 0;
 943   buffer->cur = buffer->line_base = buffer->next_line;
 944   buffer->need_line = false;
 945   s = buffer->next_line;
 946
 947   if (!buffer->from_stage3)
 948     {
 949       const uchar *pbackslash = NULL;
 950
 951       /* Fast path.  This is the common case of an un-escaped line with
 952          no trigraphs.  The primary win here is by not writing any
 953          data back to memory until we have to.  */
 954       while (1)
 955         {
 956           /* Perform an optimized search for \n, \r, \\, ?.  */
 957           s = search_line_fast (s, buffer->rlimit);
 958
 959           c = *s;
 960           if (c == '\\')
 961             {
 962               /* Record the location of the backslash and continue.  */
 963               pbackslash = s++;
 964             }
 965           else if (__builtin_expect (c == '?', 0))
 966             {
 967               if (__builtin_expect (s[1] == '?', false)
 968                    && _cpp_trigraph_map[s[2]])
 969                 {
 970                   /* Have a trigraph.  We may or may not have to convert
 971                      it.  Add a line note regardless, for -Wtrigraphs.  */
 972                   add_line_note (buffer, s, s[2]);
 973                   if (CPP_OPTION (pfile, trigraphs))
 974                     {
 975                       /* We do, and that means we have to switch to the
 976                          slow path.  */
 977                       d = (uchar *) s;
 978                       *d = _cpp_trigraph_map[s[2]];
 979                       s += 2;
 980                       goto slow_path;
 981                     }
 982                 }
 983               /* Not a trigraph.  Continue on fast-path.  */
 984               s++;
 985             }
 986           else
 987             break;
 988         }
 989
 990       /* This must be \r or \n.  We're either done, or we'll be forced
 991          to write back to the buffer and continue on the slow path.  */
 992       d = (uchar *) s;
 993
 994       if (__builtin_expect (s == buffer->rlimit, false))
 995         goto done;
 996
 997       /* DOS line ending? */
 998       if (__builtin_expect (c == '\r', false) && s[1] == '\n')
 999         {
1000           s++;
1001           if (s == buffer->rlimit)
1002             goto done;
1003         }
1004
1005       if (__builtin_expect (pbackslash == NULL, true))
1006         goto done;
1007
1008       /* Check for escaped newline.  */
1009       p = d;
1010       while (is_nvspace (p[-1]))
1011         p--;
1012       if (p - 1 != pbackslash)
1013         goto done;
1014
1015       /* Have an escaped newline; process it and proceed to
1016          the slow path.  */
1017       add_line_note (buffer, p - 1, p != d ? ' ' : '\\');
1018       d = p - 2;
1019       buffer->next_line = p - 1;
1020
1021     slow_path:
1022       while (1)
1023         {
1024           c = *++s;
1025           *++d = c;
1026
1027           if (c == '\n' || c == '\r')
1028             {
1029               /* Handle DOS line endings.  */
1030               if (c == '\r' && s != buffer->rlimit && s[1] == '\n')
1031                 s++;
1032               if (s == buffer->rlimit)
1033                 break;
1034
1035               /* Escaped?  */
1036               p = d;
1037               while (p != buffer->next_line && is_nvspace (p[-1]))
1038                 p--;
1039               if (p == buffer->next_line || p[-1] != '\\')
1040                 break;
1041
1042               add_line_note (buffer, p - 1, p != d ? ' ': '\\');
1043               d = p - 2;
1044               buffer->next_line = p - 1;
1045             }
1046           else if (c == '?' && s[1] == '?' && _cpp_trigraph_map[s[2]])
1047             {
1048               /* Add a note regardless, for the benefit of -Wtrigraphs.  */
1049               add_line_note (buffer, d, s[2]);
1050               if (CPP_OPTION (pfile, trigraphs))
1051                 {
1052                   *d = _cpp_trigraph_map[s[2]];
1053                   s += 2;
1054                 }
1055             }
1056         }
1057     }
1058   else
1059     {
1060       while (*s != '\n' && *s != '\r')
1061         s++;
1062       d = (uchar *) s;
1063
1064       /* Handle DOS line endings.  */
1065       if (*s == '\r' && s != buffer->rlimit && s[1] == '\n')
1066         s++;
1067     }
1068
1069  done:
1070   *d = '\n';
1071   /* A sentinel note that should never be processed.  */
1072   add_line_note (buffer, d + 1, '\n');
1073   buffer->next_line = s + 1;
1074 }
1075
1076 /* Return true if the trigraph indicated by NOTE should be warned
1077    about in a comment.  */
1078 static bool
1079 warn_in_comment (cpp_reader *pfile, _cpp_line_note *note)
1080 {
1081   const uchar *p;
1082
1083   /* Within comments we don't warn about trigraphs, unless the
1084      trigraph forms an escaped newline, as that may change
1085      behavior.  */
1086   if (note->type != '/')
1087     return false;
1088
1089   /* If -trigraphs, then this was an escaped newline iff the next note
1090      is coincident.  */
1091   if (CPP_OPTION (pfile, trigraphs))
1092     return note[1].pos == note->pos;
1093
1094   /* Otherwise, see if this forms an escaped newline.  */
1095   p = note->pos + 3;
1096   while (is_nvspace (*p))
1097     p++;
1098
1099   /* There might have been escaped newlines between the trigraph and the
1100      newline we found.  Hence the position test.  */
1101   return (*p == '\n' && p < note[1].pos);
1102 }
1103
1104 /* Process the notes created by add_line_note as far as the current
1105    location.  */
1106 void
1107 _cpp_process_line_notes (cpp_reader *pfile, int in_comment)
1108 {
1109   cpp_buffer *buffer = pfile->buffer;
1110
1111   for (;;)
1112     {
1113       _cpp_line_note *note = &buffer->notes[buffer->cur_note];
1114       unsigned int col;
1115
1116       if (note->pos > buffer->cur)
1117         break;
1118
1119       buffer->cur_note++;
1120       col = CPP_BUF_COLUMN (buffer, note->pos + 1);
1121
1122       if (note->type == '\\' || note->type == ' ')
1123         {
1124           if (note->type == ' ' && !in_comment)
1125             cpp_error_with_line (pfile, CPP_DL_WARNING, pfile->line_table->highest_line, col,
1126                                  "backslash and newline separated by space");
1127
1128           if (buffer->next_line > buffer->rlimit)
1129             {
1130               cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line, col,
1131                                    "backslash-newline at end of file");
1132               /* Prevent "no newline at end of file" warning.  */
1133               buffer->next_line = buffer->rlimit;
1134             }
1135
1136           buffer->line_base = note->pos;
1137           CPP_INCREMENT_LINE (pfile, 0);
1138         }
1139       else if (_cpp_trigraph_map[note->type])
1140         {
1141           if (CPP_OPTION (pfile, warn_trigraphs)
1142               && (!in_comment || warn_in_comment (pfile, note)))
1143             {
1144               if (CPP_OPTION (pfile, trigraphs))
1145                 cpp_warning_with_line (pfile, CPP_W_TRIGRAPHS,
1146                                        pfile->line_table->highest_line, col,
1147                                        "trigraph ??%c converted to %c",
1148                                        note->type,
1149                                        (int) _cpp_trigraph_map[note->type]);
1150               else
1151                 {
1152                   cpp_warning_with_line
1153                     (pfile, CPP_W_TRIGRAPHS,
1154                      pfile->line_table->highest_line, col,
1155                      "trigraph ??%c ignored, use -trigraphs to enable",
1156                      note->type);
1157                 }
1158             }
1159         }
1160       else if (note->type == 0)
1161         /* Already processed in lex_raw_string.  */;
1162       else
1163         abort ();
1164     }
1165 }
1166
1167 /* Skip a C-style block comment.  We find the end of the comment by
1168    seeing if an asterisk is before every '/' we encounter.  Returns
1169    nonzero if comment terminated by EOF, zero otherwise.
1170
1171    Buffer->cur points to the initial asterisk of the comment.  */
1172 bool
1173 _cpp_skip_block_comment (cpp_reader *pfile)
1174 {
1175   cpp_buffer *buffer = pfile->buffer;
1176   const uchar *cur = buffer->cur;
1177   uchar c;
1178
1179   cur++;
1180   if (*cur == '/')
1181     cur++;
1182
1183   for (;;)
1184     {
1185       /* People like decorating comments with '*', so check for '/'
1186          instead for efficiency.  */
1187       c = *cur++;
1188
1189       if (c == '/')
1190         {
1191           if (cur[-2] == '*')
1192             break;
1193
1194           /* Warn about potential nested comments, but not if the '/'
1195              comes immediately before the true comment delimiter.
1196              Don't bother to get it right across escaped newlines.  */
1197           if (CPP_OPTION (pfile, warn_comments)
1198               && cur[0] == '*' && cur[1] != '/')
1199             {
1200               buffer->cur = cur;
1201               cpp_warning_with_line (pfile, CPP_W_COMMENTS,
1202                                      pfile->line_table->highest_line,
1203                                      CPP_BUF_COL (buffer),
1204                                      "\"/*\" within comment");
1205             }
1206         }
1207       else if (c == '\n')
1208         {
1209           unsigned int cols;
1210           buffer->cur = cur - 1;
1211           _cpp_process_line_notes (pfile, true);
1212           if (buffer->next_line >= buffer->rlimit)
1213             return true;
1214           _cpp_clean_line (pfile);
1215
1216           cols = buffer->next_line - buffer->line_base;
1217           CPP_INCREMENT_LINE (pfile, cols);
1218
1219           cur = buffer->cur;
1220         }
1221     }
1222
1223   buffer->cur = cur;
1224   _cpp_process_line_notes (pfile, true);
1225   return false;
1226 }
1227
1228 /* Skip a C++ line comment, leaving buffer->cur pointing to the
1229    terminating newline.  Handles escaped newlines.  Returns nonzero
1230    if a multiline comment.  */
1231 static int
1232 skip_line_comment (cpp_reader *pfile)
1233 {
1234   cpp_buffer *buffer = pfile->buffer;
1235   source_location orig_line = pfile->line_table->highest_line;
1236
1237   while (*buffer->cur != '\n')
1238     buffer->cur++;
1239
1240   _cpp_process_line_notes (pfile, true);
1241   return orig_line != pfile->line_table->highest_line;
1242 }
1243
1244 /* Skips whitespace, saving the next non-whitespace character.  */
1245 static void
1246 skip_whitespace (cpp_reader *pfile, cppchar_t c)
1247 {
1248   cpp_buffer *buffer = pfile->buffer;
1249   bool saw_NUL = false;
1250
1251   do
1252     {
1253       /* Horizontal space always OK.  */
1254       if (c == ' ' || c == '\t')
1255         ;
1256       /* Just \f \v or \0 left.  */
1257       else if (c == '\0')
1258         saw_NUL = true;
1259       else if (pfile->state.in_directive && CPP_PEDANTIC (pfile))
1260         cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line,
1261                              CPP_BUF_COL (buffer),
1262                              "%s in preprocessing directive",
1263                              c == '\f' ? "form feed" : "vertical tab");
1264
1265       c = *buffer->cur++;
1266     }
1267   /* We only want non-vertical space, i.e. ' ' \t \f \v \0.  */
1268   while (is_nvspace (c));
1269
1270   if (saw_NUL)
1271     cpp_error (pfile, CPP_DL_WARNING, "null character(s) ignored");
1272
1273   buffer->cur--;
1274 }
1275
1276 /* See if the characters of a number token are valid in a name (no
1277    '.', '+' or '-').  */
1278 static int
1279 name_p (cpp_reader *pfile, const cpp_string *string)
1280 {
1281   unsigned int i;
1282
1283   for (i = 0; i < string->len; i++)
1284     if (!is_idchar (string->text[i]))
1285       return 0;
1286
1287   return 1;
1288 }
1289
1290 /* After parsing an identifier or other sequence, produce a warning about
1291    sequences not in NFC/NFKC.  */
1292 static void
1293 warn_about_normalization (cpp_reader *pfile,
1294                           const cpp_token *token,
1295                           const struct normalize_state *s)
1296 {
1297   if (CPP_OPTION (pfile, warn_normalize) < NORMALIZE_STATE_RESULT (s)
1298       && !pfile->state.skipping)
1299     {
1300       /* Make sure that the token is printed using UCNs, even
1301          if we'd otherwise happily print UTF-8.  */
1302       unsigned char *buf = XNEWVEC (unsigned char, cpp_token_len (token));
1303       size_t sz;
1304
1305       sz = cpp_spell_token (pfile, token, buf, false) - buf;
1306       if (NORMALIZE_STATE_RESULT (s) == normalized_C)
1307         cpp_warning_with_line (pfile, CPP_W_NORMALIZE, token->src_loc, 0,
1308                                "`%.*s' is not in NFKC", (int) sz, buf);
1309       else
1310         cpp_warning_with_line (pfile, CPP_W_NORMALIZE, token->src_loc, 0,
1311                                "`%.*s' is not in NFC", (int) sz, buf);
1312       free (buf);
1313     }
1314 }
1315
1316 /* Returns TRUE if the sequence starting at buffer->cur is invalid in
1317    an identifier.  FIRST is TRUE if this starts an identifier.  */
1318 static bool
1319 forms_identifier_p (cpp_reader *pfile, int first,
1320                     struct normalize_state *state)
1321 {
1322   cpp_buffer *buffer = pfile->buffer;
1323
1324   if (*buffer->cur == '$')
1325     {
1326       if (!CPP_OPTION (pfile, dollars_in_ident))
1327         return false;
1328
1329       buffer->cur++;
1330       if (CPP_OPTION (pfile, warn_dollars) && !pfile->state.skipping)
1331         {
1332           CPP_OPTION (pfile, warn_dollars) = 0;
1333           cpp_error (pfile, CPP_DL_PEDWARN, "'$' in identifier or number");
1334         }
1335
1336       return true;
1337     }
1338
1339   /* Is this a syntactically valid UCN?  */
1340   if (CPP_OPTION (pfile, extended_identifiers)
1341       && *buffer->cur == '\\'
1342       && (buffer->cur[1] == 'u' || buffer->cur[1] == 'U'))
1343     {
1344       cppchar_t s;
1345       buffer->cur += 2;
1346       if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
1347                           state, &s, NULL, NULL))
1348         return true;
1349       buffer->cur -= 2;
1350     }
1351
1352   return false;
1353 }
1354
1355 /* Helper function to issue error about improper __VA_OPT__ use.  */
1356 static void
1357 maybe_va_opt_error (cpp_reader *pfile)
1358 {
1359   if (CPP_PEDANTIC (pfile) && !CPP_OPTION (pfile, va_opt))
1360     {
1361       /* __VA_OPT__ should not be accepted at all, but allow it in
1362          system headers.  */
1363       if (!cpp_in_system_header (pfile))
1364         cpp_error (pfile, CPP_DL_PEDWARN,
1365                    "__VA_OPT__ is not available until C++2a");
1366     }
1367   else if (!pfile->state.va_args_ok)
1368     {
1369       /* __VA_OPT__ should only appear in the replacement list of a
1370          variadic macro.  */
1371       cpp_error (pfile, CPP_DL_PEDWARN,
1372                  "__VA_OPT__ can only appear in the expansion"
1373                  " of a C++2a variadic macro");
1374     }
1375 }
1376
1377 /* Helper function to get the cpp_hashnode of the identifier BASE.  */
1378 static cpp_hashnode *
1379 lex_identifier_intern (cpp_reader *pfile, const uchar *base)
1380 {
1381   cpp_hashnode *result;
1382   const uchar *cur;
1383   unsigned int len;
1384   unsigned int hash = HT_HASHSTEP (0, *base);
1385
1386   cur = base + 1;
1387   while (ISIDNUM (*cur))
1388     {
1389       hash = HT_HASHSTEP (hash, *cur);
1390       cur++;
1391     }
1392   len = cur - base;
1393   hash = HT_HASHFINISH (hash, len);
1394   result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
1395                                               base, len, hash, HT_ALLOC));
1396
1397   /* Rarely, identifiers require diagnostics when lexed.  */
1398   if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
1399                         && !pfile->state.skipping, 0))
1400     {
1401       /* It is allowed to poison the same identifier twice.  */
1402       if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
1403         cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
1404                    NODE_NAME (result));
1405
1406       /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
1407          replacement list of a variadic macro.  */
1408       if (result == pfile->spec_nodes.n__VA_ARGS__
1409           && !pfile->state.va_args_ok)
1410         {
1411           if (CPP_OPTION (pfile, cplusplus))
1412             cpp_error (pfile, CPP_DL_PEDWARN,
1413                        "__VA_ARGS__ can only appear in the expansion"
1414                        " of a C++11 variadic macro");
1415           else
1416             cpp_error (pfile, CPP_DL_PEDWARN,
1417                        "__VA_ARGS__ can only appear in the expansion"
1418                        " of a C99 variadic macro");
1419         }
1420
1421       if (result == pfile->spec_nodes.n__VA_OPT__)
1422         maybe_va_opt_error (pfile);
1423
1424       /* For -Wc++-compat, warn about use of C++ named operators.  */
1425       if (result->flags & NODE_WARN_OPERATOR)
1426         cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
1427                      "identifier \"%s\" is a special operator name in C++",
1428                      NODE_NAME (result));
1429     }
1430
1431   return result;
1432 }
1433
1434 /* Get the cpp_hashnode of an identifier specified by NAME in
1435    the current cpp_reader object.  If none is found, NULL is returned.  */
1436 cpp_hashnode *
1437 _cpp_lex_identifier (cpp_reader *pfile, const char *name)
1438 {
1439   cpp_hashnode *result;
1440   result = lex_identifier_intern (pfile, (uchar *) name);
1441   return result;
1442 }
1443
1444 /* Lex an identifier starting at BUFFER->CUR - 1.  */
1445 static cpp_hashnode *
1446 lex_identifier (cpp_reader *pfile, const uchar *base, bool starts_ucn,
1447                 struct normalize_state *nst, cpp_hashnode **spelling)
1448 {
1449   cpp_hashnode *result;
1450   const uchar *cur;
1451   unsigned int len;
1452   unsigned int hash = HT_HASHSTEP (0, *base);
1453
1454   cur = pfile->buffer->cur;
1455   if (! starts_ucn)
1456     {
1457       while (ISIDNUM (*cur))
1458         {
1459           hash = HT_HASHSTEP (hash, *cur);
1460           cur++;
1461         }
1462       NORMALIZE_STATE_UPDATE_IDNUM (nst, *(cur - 1));
1463     }
1464   pfile->buffer->cur = cur;
1465   if (starts_ucn || forms_identifier_p (pfile, false, nst))
1466     {
1467       /* Slower version for identifiers containing UCNs (or $).  */
1468       do {
1469         while (ISIDNUM (*pfile->buffer->cur))
1470           {
1471             NORMALIZE_STATE_UPDATE_IDNUM (nst, *pfile->buffer->cur);
1472             pfile->buffer->cur++;
1473           }
1474       } while (forms_identifier_p (pfile, false, nst));
1475       result = _cpp_interpret_identifier (pfile, base,
1476                                           pfile->buffer->cur - base);
1477       *spelling = cpp_lookup (pfile, base, pfile->buffer->cur - base);
1478     }
1479   else
1480     {
1481       len = cur - base;
1482       hash = HT_HASHFINISH (hash, len);
1483
1484       result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
1485                                                   base, len, hash, HT_ALLOC));
1486       *spelling = result;
1487     }
1488
1489   /* Rarely, identifiers require diagnostics when lexed.  */
1490   if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
1491                         && !pfile->state.skipping, 0))
1492     {
1493       /* It is allowed to poison the same identifier twice.  */
1494       if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
1495         cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
1496                    NODE_NAME (result));
1497
1498       /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
1499          replacement list of a variadic macro.  */
1500       if (result == pfile->spec_nodes.n__VA_ARGS__
1501           && !pfile->state.va_args_ok)
1502         {
1503           if (CPP_OPTION (pfile, cplusplus))
1504             cpp_error (pfile, CPP_DL_PEDWARN,
1505                        "__VA_ARGS__ can only appear in the expansion"
1506                        " of a C++11 variadic macro");
1507           else
1508             cpp_error (pfile, CPP_DL_PEDWARN,
1509                        "__VA_ARGS__ can only appear in the expansion"
1510                        " of a C99 variadic macro");
1511         }
1512
1513       /* __VA_OPT__ should only appear in the replacement list of a
1514          variadic macro.  */
1515       if (result == pfile->spec_nodes.n__VA_OPT__)
1516         maybe_va_opt_error (pfile);
1517
1518       /* For -Wc++-compat, warn about use of C++ named operators.  */
1519       if (result->flags & NODE_WARN_OPERATOR)
1520         cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
1521                      "identifier \"%s\" is a special operator name in C++",
1522                      NODE_NAME (result));
1523     }
1524
1525   return result;
1526 }
1527
1528 /* Lex a number to NUMBER starting at BUFFER->CUR - 1.  */
1529 static void
1530 lex_number (cpp_reader *pfile, cpp_string *number,
1531             struct normalize_state *nst)
1532 {
1533   const uchar *cur;
1534   const uchar *base;
1535   uchar *dest;
1536
1537   base = pfile->buffer->cur - 1;
1538   do
1539     {
1540       cur = pfile->buffer->cur;
1541
1542       /* N.B. ISIDNUM does not include $.  */
1543       while (ISIDNUM (*cur) || *cur == '.' || DIGIT_SEP (*cur)
1544              || VALID_SIGN (*cur, cur[-1]))
1545         {
1546           NORMALIZE_STATE_UPDATE_IDNUM (nst, *cur);
1547           cur++;
1548         }
1549       /* A number can't end with a digit separator.  */
1550       while (cur > pfile->buffer->cur && DIGIT_SEP (cur[-1]))
1551         --cur;
1552
1553       pfile->buffer->cur = cur;
1554     }
1555   while (forms_identifier_p (pfile, false, nst));
1556
1557   number->len = cur - base;
1558   dest = _cpp_unaligned_alloc (pfile, number->len + 1);
1559   memcpy (dest, base, number->len);
1560   dest[number->len] = '\0';
1561   number->text = dest;
1562 }
1563
1564 /* Create a token of type TYPE with a literal spelling.  */
1565 static void
1566 create_literal (cpp_reader *pfile, cpp_token *token, const uchar *base,
1567                 unsigned int len, enum cpp_ttype type)
1568 {
1569   uchar *dest = _cpp_unaligned_alloc (pfile, len + 1);
1570
1571   memcpy (dest, base, len);
1572   dest[len] = '\0';
1573   token->type = type;
1574   token->val.str.len = len;
1575   token->val.str.text = dest;
1576 }
1577
1578 /* Subroutine of lex_raw_string: Append LEN chars from BASE to the buffer
1579    sequence from *FIRST_BUFF_P to LAST_BUFF_P.  */
1580
1581 static void
1582 bufring_append (cpp_reader *pfile, const uchar *base, size_t len,
1583                 _cpp_buff **first_buff_p, _cpp_buff **last_buff_p)
1584 {
1585   _cpp_buff *first_buff = *first_buff_p;
1586   _cpp_buff *last_buff = *last_buff_p;
1587
1588   if (first_buff == NULL)
1589     first_buff = last_buff = _cpp_get_buff (pfile, len);
1590   else if (len > BUFF_ROOM (last_buff))
1591     {
1592       size_t room = BUFF_ROOM (last_buff);
1593       memcpy (BUFF_FRONT (last_buff), base, room);
1594       BUFF_FRONT (last_buff) += room;
1595       base += room;
1596       len -= room;
1597       last_buff = _cpp_append_extend_buff (pfile, last_buff, len);
1598     }
1599
1600   memcpy (BUFF_FRONT (last_buff), base, len);
1601   BUFF_FRONT (last_buff) += len;
1602
1603   *first_buff_p = first_buff;
1604   *last_buff_p = last_buff;
1605 }
1606
1607
1608 /* Returns true if a macro has been defined.
1609    This might not work if compile with -save-temps,
1610    or preprocess separately from compilation.  */
1611
1612 static bool
1613 is_macro(cpp_reader *pfile, const uchar *base)
1614 {
1615   const uchar *cur = base;
1616   if (! ISIDST (*cur))
1617     return false;
1618   unsigned int hash = HT_HASHSTEP (0, *cur);
1619   ++cur;
1620   while (ISIDNUM (*cur))
1621     {
1622       hash = HT_HASHSTEP (hash, *cur);
1623       ++cur;
1624     }
1625   hash = HT_HASHFINISH (hash, cur - base);
1626
1627   cpp_hashnode *result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
1628                                         base, cur - base, hash, HT_NO_INSERT));
1629
1630   return !result ? false : (result->type == NT_MACRO);
1631 }
1632
1633
1634 /* Lexes a raw string.  The stored string contains the spelling, including
1635    double quotes, delimiter string, '(' and ')', any leading
1636    'L', 'u', 'U' or 'u8' and 'R' modifier.  It returns the type of the
1637    literal, or CPP_OTHER if it was not properly terminated.
1638
1639    The spelling is NUL-terminated, but it is not guaranteed that this
1640    is the first NUL since embedded NULs are preserved.  */
1641
1642 static void
1643 lex_raw_string (cpp_reader *pfile, cpp_token *token, const uchar *base,
1644                 const uchar *cur)
1645 {
1646   uchar raw_prefix[17];
1647   uchar temp_buffer[18];
1648   const uchar *orig_base;
1649   unsigned int raw_prefix_len = 0, raw_suffix_len = 0;
1650   enum raw_str_phase { RAW_STR_PREFIX, RAW_STR, RAW_STR_SUFFIX };
1651   raw_str_phase phase = RAW_STR_PREFIX;
1652   enum cpp_ttype type;
1653   size_t total_len = 0;
1654   /* Index into temp_buffer during phases other than RAW_STR,
1655      during RAW_STR phase 17 to tell BUF_APPEND that nothing should
1656      be appended to temp_buffer.  */
1657   size_t temp_buffer_len = 0;
1658   _cpp_buff *first_buff = NULL, *last_buff = NULL;
1659   size_t raw_prefix_start;
1660   _cpp_line_note *note = &pfile->buffer->notes[pfile->buffer->cur_note];
1661
1662   type = (*base == 'L' ? CPP_WSTRING :
1663           *base == 'U' ? CPP_STRING32 :
1664           *base == 'u' ? (base[1] == '8' ? CPP_UTF8STRING : CPP_STRING16)
1665           : CPP_STRING);
1666
1667 #define BUF_APPEND(STR,LEN)                                     \
1668       do {                                                      \
1669         bufring_append (pfile, (const uchar *)(STR), (LEN),     \
1670                         &first_buff, &last_buff);               \
1671         total_len += (LEN);                                     \
1672         if (__builtin_expect (temp_buffer_len < 17, 0)          \
1673             && (const uchar *)(STR) != base                     \
1674             && (LEN) <= 2)                                      \
1675           {                                                     \
1676             memcpy (temp_buffer + temp_buffer_len,              \
1677                     (const uchar *)(STR), (LEN));               \
1678             temp_buffer_len += (LEN);                           \
1679           }                                                     \
1680       } while (0)
1681
1682   orig_base = base;
1683   ++cur;
1684   raw_prefix_start = cur - base;
1685   for (;;)
1686     {
1687       cppchar_t c;
1688
1689       /* If we previously performed any trigraph or line splicing
1690          transformations, undo them in between the opening and closing
1691          double quote.  */
1692       while (note->pos < cur)
1693         ++note;
1694       for (; note->pos == cur; ++note)
1695         {
1696           switch (note->type)
1697             {
1698             case '\\':
1699             case ' ':
1700               /* Restore backslash followed by newline.  */
1701               BUF_APPEND (base, cur - base);
1702               base = cur;
1703               BUF_APPEND ("\\", 1);
1704             after_backslash:
1705               if (note->type == ' ')
1706                 {
1707                   /* GNU backslash whitespace newline extension.  FIXME
1708                      could be any sequence of non-vertical space.  When we
1709                      can properly restore any such sequence, we should mark
1710                      this note as handled so _cpp_process_line_notes
1711                      doesn't warn.  */
1712                   BUF_APPEND (" ", 1);
1713                 }
1714
1715               BUF_APPEND ("\n", 1);
1716               break;
1717
1718             case 0:
1719               /* Already handled.  */
1720               break;
1721
1722             default:
1723               if (_cpp_trigraph_map[note->type])
1724                 {
1725                   /* Don't warn about this trigraph in
1726                      _cpp_process_line_notes, since trigraphs show up as
1727                      trigraphs in raw strings.  */
1728                   uchar type = note->type;
1729                   note->type = 0;
1730
1731                   if (!CPP_OPTION (pfile, trigraphs))
1732                     /* If we didn't convert the trigraph in the first
1733                        place, don't do anything now either.  */
1734                     break;
1735
1736                   BUF_APPEND (base, cur - base);
1737                   base = cur;
1738                   BUF_APPEND ("??", 2);
1739
1740                   /* ??/ followed by newline gets two line notes, one for
1741                      the trigraph and one for the backslash/newline.  */
1742                   if (type == '/' && note[1].pos == cur)
1743                     {
1744                       if (note[1].type != '\\'
1745                           && note[1].type != ' ')
1746                         abort ();
1747                       BUF_APPEND ("/", 1);
1748                       ++note;
1749                       goto after_backslash;
1750                     }
1751                   else
1752                     {
1753                       /* Skip the replacement character.  */
1754                       base = ++cur;
1755                       BUF_APPEND (&type, 1);
1756                       c = type;
1757                       goto check_c;
1758                     }
1759                 }
1760               else
1761                 abort ();
1762               break;
1763             }
1764         }
1765       c = *cur++;
1766       if (__builtin_expect (temp_buffer_len < 17, 0))
1767         temp_buffer[temp_buffer_len++] = c;
1768
1769      check_c:
1770       if (phase == RAW_STR_PREFIX)
1771         {
1772           while (raw_prefix_len < temp_buffer_len)
1773             {
1774               raw_prefix[raw_prefix_len] = temp_buffer[raw_prefix_len];
1775               switch (raw_prefix[raw_prefix_len])
1776                 {
1777                 case ' ': case '(': case ')': case '\\': case '\t':
1778                 case '\v': case '\f': case '\n': default:
1779                   break;
1780                 /* Basic source charset except the above chars.  */
1781                 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
1782                 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
1783                 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
1784                 case 's': case 't': case 'u': case 'v': case 'w': case 'x':
1785                 case 'y': case 'z':
1786                 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
1787                 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
1788                 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
1789                 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
1790                 case 'Y': case 'Z':
1791                 case '0': case '1': case '2': case '3': case '4': case '5':
1792                 case '6': case '7': case '8': case '9':
1793                 case '_': case '{': case '}': case '#': case '[': case ']':
1794                 case '<': case '>': case '%': case ':': case ';': case '.':
1795                 case '?': case '*': case '+': case '-': case '/': case '^':
1796                 case '&': case '|': case '~': case '!': case '=': case ',':
1797                 case '"': case '\'':
1798                   if (raw_prefix_len < 16)
1799                     {
1800                       raw_prefix_len++;
1801                       continue;
1802                     }
1803                   break;
1804                 }
1805
1806               if (raw_prefix[raw_prefix_len] != '(')
1807                 {
1808                   int col = CPP_BUF_COLUMN (pfile->buffer, cur) + 1;
1809                   if (raw_prefix_len == 16)
1810                     cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
1811                                          col, "raw string delimiter longer "
1812                                               "than 16 characters");
1813                   else if (raw_prefix[raw_prefix_len] == '\n')
1814                     cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
1815                                          col, "invalid new-line in raw "
1816                                               "string delimiter");
1817                   else
1818                     cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
1819                                          col, "invalid character '%c' in "
1820                                               "raw string delimiter",
1821                                          (int) raw_prefix[raw_prefix_len]);
1822                   pfile->buffer->cur = orig_base + raw_prefix_start - 1;
1823                   create_literal (pfile, token, orig_base,
1824                                   raw_prefix_start - 1, CPP_OTHER);
1825                   if (first_buff)
1826                     _cpp_release_buff (pfile, first_buff);
1827                   return;
1828                 }
1829               raw_prefix[raw_prefix_len] = '"';
1830               phase = RAW_STR;
1831               /* Nothing should be appended to temp_buffer during
1832                  RAW_STR phase.  */
1833               temp_buffer_len = 17;
1834               break;
1835             }
1836           continue;
1837         }
1838       else if (phase == RAW_STR_SUFFIX)
1839         {
1840           while (raw_suffix_len <= raw_prefix_len
1841                  && raw_suffix_len < temp_buffer_len
1842                  && temp_buffer[raw_suffix_len] == raw_prefix[raw_suffix_len])
1843             raw_suffix_len++;
1844           if (raw_suffix_len > raw_prefix_len)
1845             break;
1846           if (raw_suffix_len == temp_buffer_len)
1847             continue;
1848           phase = RAW_STR;
1849           /* Nothing should be appended to temp_buffer during
1850              RAW_STR phase.  */
1851           temp_buffer_len = 17;
1852         }
1853       if (c == ')')
1854         {
1855           phase = RAW_STR_SUFFIX;
1856           raw_suffix_len = 0;
1857           temp_buffer_len = 0;
1858         }
1859       else if (c == '\n')
1860         {
1861           if (pfile->state.in_directive
1862               || (pfile->state.parsing_args
1863                   && pfile->buffer->next_line >= pfile->buffer->rlimit))
1864             {
1865               cur--;
1866               type = CPP_OTHER;
1867               cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, 0,
1868                                    "unterminated raw string");
1869               break;
1870             }
1871
1872           BUF_APPEND (base, cur - base);
1873
1874           if (pfile->buffer->cur < pfile->buffer->rlimit)
1875             CPP_INCREMENT_LINE (pfile, 0);
1876           pfile->buffer->need_line = true;
1877
1878           pfile->buffer->cur = cur-1;
1879           _cpp_process_line_notes (pfile, false);
1880           if (!_cpp_get_fresh_line (pfile))
1881             {
1882               source_location src_loc = token->src_loc;
1883               token->type = CPP_EOF;
1884               /* Tell the compiler the line number of the EOF token.  */
1885               token->src_loc = pfile->line_table->highest_line;
1886               token->flags = BOL;
1887               if (first_buff != NULL)
1888                 _cpp_release_buff (pfile, first_buff);
1889               cpp_error_with_line (pfile, CPP_DL_ERROR, src_loc, 0,
1890                                    "unterminated raw string");
1891               return;
1892             }
1893
1894           cur = base = pfile->buffer->cur;
1895           note = &pfile->buffer->notes[pfile->buffer->cur_note];
1896         }
1897     }
1898
1899   if (CPP_OPTION (pfile, user_literals))
1900     {
1901       /* If a string format macro, say from inttypes.h, is placed touching
1902          a string literal it could be parsed as a C++11 user-defined string
1903          literal thus breaking the program.
1904          Try to identify macros with is_macro. A warning is issued.
1905          The macro name should not start with '_' for this warning. */
1906       if ((*cur != '_') && is_macro (pfile, cur))
1907         {
1908           /* Raise a warning, but do not consume subsequent tokens.  */
1909           if (CPP_OPTION (pfile, warn_literal_suffix) && !pfile->state.skipping)
1910             cpp_warning_with_line (pfile, CPP_W_LITERAL_SUFFIX,
1911                                    token->src_loc, 0,
1912                                    "invalid suffix on literal; C++11 requires "
1913                                    "a space between literal and string macro");
1914         }
1915       /* Grab user defined literal suffix.  */
1916       else if (ISIDST (*cur))
1917         {
1918           type = cpp_userdef_string_add_type (type);
1919           ++cur;
1920
1921           while (ISIDNUM (*cur))
1922             ++cur;
1923         }
1924     }
1925
1926   pfile->buffer->cur = cur;
1927   if (first_buff == NULL)
1928     create_literal (pfile, token, base, cur - base, type);
1929   else
1930     {
1931       uchar *dest = _cpp_unaligned_alloc (pfile, total_len + (cur - base) + 1);
1932
1933       token->type = type;
1934       token->val.str.len = total_len + (cur - base);
1935       token->val.str.text = dest;
1936       last_buff = first_buff;
1937       while (last_buff != NULL)
1938         {
1939           memcpy (dest, last_buff->base,
1940                   BUFF_FRONT (last_buff) - last_buff->base);
1941           dest += BUFF_FRONT (last_buff) - last_buff->base;
1942           last_buff = last_buff->next;
1943         }
1944       _cpp_release_buff (pfile, first_buff);
1945       memcpy (dest, base, cur - base);
1946       dest[cur - base] = '\0';
1947     }
1948 }
1949
1950 /* Lexes a string, character constant, or angle-bracketed header file
1951    name.  The stored string contains the spelling, including opening
1952    quote and any leading 'L', 'u', 'U' or 'u8' and optional
1953    'R' modifier.  It returns the type of the literal, or CPP_OTHER
1954    if it was not properly terminated, or CPP_LESS for an unterminated
1955    header name which must be relexed as normal tokens.
1956
1957    The spelling is NUL-terminated, but it is not guaranteed that this
1958    is the first NUL since embedded NULs are preserved.  */
1959 static void
1960 lex_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
1961 {
1962   bool saw_NUL = false;
1963   const uchar *cur;
1964   cppchar_t terminator;
1965   enum cpp_ttype type;
1966
1967   cur = base;
1968   terminator = *cur++;
1969   if (terminator == 'L' || terminator == 'U')
1970     terminator = *cur++;
1971   else if (terminator == 'u')
1972     {
1973       terminator = *cur++;
1974       if (terminator == '8')
1975         terminator = *cur++;
1976     }
1977   if (terminator == 'R')
1978     {
1979       lex_raw_string (pfile, token, base, cur);
1980       return;
1981     }
1982   if (terminator == '"')
1983     type = (*base == 'L' ? CPP_WSTRING :
1984             *base == 'U' ? CPP_STRING32 :
1985             *base == 'u' ? (base[1] == '8' ? CPP_UTF8STRING : CPP_STRING16)
1986                          : CPP_STRING);
1987   else if (terminator == '\'')
1988     type = (*base == 'L' ? CPP_WCHAR :
1989             *base == 'U' ? CPP_CHAR32 :
1990             *base == 'u' ? (base[1] == '8' ? CPP_UTF8CHAR : CPP_CHAR16)
1991                          : CPP_CHAR);
1992   else
1993     terminator = '>', type = CPP_HEADER_NAME;
1994
1995   for (;;)
1996     {
1997       cppchar_t c = *cur++;
1998
1999       /* In #include-style directives, terminators are not escapable.  */
2000       if (c == '\\' && !pfile->state.angled_headers && *cur != '\n')
2001         cur++;
2002       else if (c == terminator)
2003         break;
2004       else if (c == '\n')
2005         {
2006           cur--;
2007           /* Unmatched quotes always yield undefined behavior, but
2008              greedy lexing means that what appears to be an unterminated
2009              header name may actually be a legitimate sequence of tokens.  */
2010           if (terminator == '>')
2011             {
2012               token->type = CPP_LESS;
2013               return;
2014             }
2015           type = CPP_OTHER;
2016           break;
2017         }
2018       else if (c == '\0')
2019         saw_NUL = true;
2020     }
2021
2022   if (saw_NUL && !pfile->state.skipping)
2023     cpp_error (pfile, CPP_DL_WARNING,
2024                "null character(s) preserved in literal");
2025
2026   if (type == CPP_OTHER && CPP_OPTION (pfile, lang) != CLK_ASM)
2027     cpp_error (pfile, CPP_DL_PEDWARN, "missing terminating %c character",
2028                (int) terminator);
2029
2030   if (CPP_OPTION (pfile, user_literals))
2031     {
2032       /* If a string format macro, say from inttypes.h, is placed touching
2033          a string literal it could be parsed as a C++11 user-defined string
2034          literal thus breaking the program.
2035          Try to identify macros with is_macro. A warning is issued.
2036          The macro name should not start with '_' for this warning. */
2037       if ((*cur != '_') && is_macro (pfile, cur))
2038         {
2039           /* Raise a warning, but do not consume subsequent tokens.  */
2040           if (CPP_OPTION (pfile, warn_literal_suffix) && !pfile->state.skipping)
2041             cpp_warning_with_line (pfile, CPP_W_LITERAL_SUFFIX,
2042                                    token->src_loc, 0,
2043                                    "invalid suffix on literal; C++11 requires "
2044                                    "a space between literal and string macro");
2045         }
2046       /* Grab user defined literal suffix.  */
2047       else if (ISIDST (*cur))
2048         {
2049           type = cpp_userdef_char_add_type (type);
2050           type = cpp_userdef_string_add_type (type);
2051           ++cur;
2052
2053           while (ISIDNUM (*cur))
2054             ++cur;
2055         }
2056     }
2057   else if (CPP_OPTION (pfile, cpp_warn_cxx11_compat)
2058            && is_macro (pfile, cur)
2059            && !pfile->state.skipping)
2060     cpp_warning_with_line (pfile, CPP_W_CXX11_COMPAT,
2061                            token->src_loc, 0, "C++11 requires a space "
2062                            "between string literal and macro");
2063
2064   pfile->buffer->cur = cur;
2065   create_literal (pfile, token, base, cur - base, type);
2066 }
2067
2068 /* Return the comment table. The client may not make any assumption
2069    about the ordering of the table.  */
2070 cpp_comment_table *
2071 cpp_get_comments (cpp_reader *pfile)
2072 {
2073   return &pfile->comments;
2074 }
2075
2076 /* Append a comment to the end of the comment table. */
2077 static void
2078 store_comment (cpp_reader *pfile, cpp_token *token)
2079 {
2080   int len;
2081
2082   if (pfile->comments.allocated == 0)
2083     {
2084       pfile->comments.allocated = 256;
2085       pfile->comments.entries = (cpp_comment *) xmalloc
2086         (pfile->comments.allocated * sizeof (cpp_comment));
2087     }
2088
2089   if (pfile->comments.count == pfile->comments.allocated)
2090     {
2091       pfile->comments.allocated *= 2;
2092       pfile->comments.entries = (cpp_comment *) xrealloc
2093         (pfile->comments.entries,
2094          pfile->comments.allocated * sizeof (cpp_comment));
2095     }
2096
2097   len = token->val.str.len;
2098
2099   /* Copy comment. Note, token may not be NULL terminated. */
2100   pfile->comments.entries[pfile->comments.count].comment =
2101     (char *) xmalloc (sizeof (char) * (len + 1));
2102   memcpy (pfile->comments.entries[pfile->comments.count].comment,
2103           token->val.str.text, len);
2104   pfile->comments.entries[pfile->comments.count].comment[len] = '\0';
2105
2106   /* Set source location. */
2107   pfile->comments.entries[pfile->comments.count].sloc = token->src_loc;
2108
2109   /* Increment the count of entries in the comment table. */
2110   pfile->comments.count++;
2111 }
2112
2113 /* The stored comment includes the comment start and any terminator.  */
2114 static void
2115 save_comment (cpp_reader *pfile, cpp_token *token, const unsigned char *from,
2116               cppchar_t type)
2117 {
2118   unsigned char *buffer;
2119   unsigned int len, clen, i;
2120
2121   len = pfile->buffer->cur - from + 1; /* + 1 for the initial '/'.  */
2122
2123   /* C++ comments probably (not definitely) have moved past a new
2124      line, which we don't want to save in the comment.  */
2125   if (is_vspace (pfile->buffer->cur[-1]))
2126     len--;
2127
2128   /* If we are currently in a directive or in argument parsing, then
2129      we need to store all C++ comments as C comments internally, and
2130      so we need to allocate a little extra space in that case.
2131
2132      Note that the only time we encounter a directive here is
2133      when we are saving comments in a "#define".  */
2134   clen = ((pfile->state.in_directive || pfile->state.parsing_args)
2135           && type == '/') ? len + 2 : len;
2136
2137   buffer = _cpp_unaligned_alloc (pfile, clen);
2138
2139   token->type = CPP_COMMENT;
2140   token->val.str.len = clen;
2141   token->val.str.text = buffer;
2142
2143   buffer[0] = '/';
2144   memcpy (buffer + 1, from, len - 1);
2145
2146   /* Finish conversion to a C comment, if necessary.  */
2147   if ((pfile->state.in_directive || pfile->state.parsing_args) && type == '/')
2148     {
2149       buffer[1] = '*';
2150       buffer[clen - 2] = '*';
2151       buffer[clen - 1] = '/';
2152       /* As there can be in a C++ comments illegal sequences for C comments
2153          we need to filter them out.  */
2154       for (i = 2; i < (clen - 2); i++)
2155         if (buffer[i] == '/' && (buffer[i - 1] == '*' || buffer[i + 1] == '*'))
2156           buffer[i] = '|';
2157     }
2158
2159   /* Finally store this comment for use by clients of libcpp. */
2160   store_comment (pfile, token);
2161 }
2162
2163 /* Returns true if comment at COMMENT_START is a recognized FALLTHROUGH
2164    comment.  */
2165
2166 static bool
2167 fallthrough_comment_p (cpp_reader *pfile, const unsigned char *comment_start)
2168 {
2169   const unsigned char *from = comment_start + 1;
2170
2171   switch (CPP_OPTION (pfile, cpp_warn_implicit_fallthrough))
2172     {
2173       /* For both -Wimplicit-fallthrough=0 and -Wimplicit-fallthrough=5 we
2174          don't recognize any comments.  The latter only checks attributes,
2175          the former doesn't warn.  */
2176     case 0:
2177     default:
2178       return false;
2179       /* -Wimplicit-fallthrough=1 considers any comment, no matter what
2180          content it has.  */
2181     case 1:
2182       return true;
2183     case 2:
2184       /* -Wimplicit-fallthrough=2 looks for (case insensitive)
2185          .*falls?[ \t-]*thr(u|ough).* regex.  */
2186       for (; (size_t) (pfile->buffer->cur - from) >= sizeof "fallthru" - 1;
2187            from++)
2188         {
2189           /* Is there anything like strpbrk with upper boundary, or
2190              memchr looking for 2 characters rather than just one?  */
2191           if (from[0] != 'f' && from[0] != 'F')
2192             continue;
2193           if (from[1] != 'a' && from[1] != 'A')
2194             continue;
2195           if (from[2] != 'l' && from[2] != 'L')
2196             continue;
2197           if (from[3] != 'l' && from[3] != 'L')
2198             continue;
2199           from += sizeof "fall" - 1;
2200           if (from[0] == 's' || from[0] == 'S')
2201             from++;
2202           while (*from == ' ' || *from == '\t' || *from == '-')
2203             from++;
2204           if (from[0] != 't' && from[0] != 'T')
2205             continue;
2206           if (from[1] != 'h' && from[1] != 'H')
2207             continue;
2208           if (from[2] != 'r' && from[2] != 'R')
2209             continue;
2210           if (from[3] == 'u' || from[3] == 'U')
2211             return true;
2212           if (from[3] != 'o' && from[3] != 'O')
2213             continue;
2214           if (from[4] != 'u' && from[4] != 'U')
2215             continue;
2216           if (from[5] != 'g' && from[5] != 'G')
2217             continue;
2218           if (from[6] != 'h' && from[6] != 'H')
2219             continue;
2220           return true;
2221         }
2222       return false;
2223     case 3:
2224     case 4:
2225       break;
2226     }
2227
2228   /* Whole comment contents:
2229      -fallthrough
2230      @fallthrough@
2231    */
2232   if (*from == '-' || *from == '@')
2233     {
2234       size_t len = sizeof "fallthrough" - 1;
2235       if ((size_t) (pfile->buffer->cur - from - 1) < len)
2236         return false;
2237       if (memcmp (from + 1, "fallthrough", len))
2238         return false;
2239       if (*from == '@')
2240         {
2241           if (from[len + 1] != '@')
2242             return false;
2243           len++;
2244         }
2245       from += 1 + len;
2246     }
2247   /* Whole comment contents (regex):
2248      lint -fallthrough[ \t]*
2249    */
2250   else if (*from == 'l')
2251     {
2252       size_t len = sizeof "int -fallthrough" - 1;
2253       if ((size_t) (pfile->buffer->cur - from - 1) < len)
2254         return false;
2255       if (memcmp (from + 1, "int -fallthrough", len))
2256         return false;
2257       from += 1 + len;
2258       while (*from == ' ' || *from == '\t')
2259         from++;
2260     }
2261   /* Whole comment contents (regex):
2262      [ \t]*FALLTHR(U|OUGH)[ \t]*
2263    */
2264   else if (CPP_OPTION (pfile, cpp_warn_implicit_fallthrough) == 4)
2265     {
2266       while (*from == ' ' || *from == '\t')
2267         from++;
2268       if ((size_t) (pfile->buffer->cur - from)  < sizeof "FALLTHRU" - 1)
2269         return false;
2270       if (memcmp (from, "FALLTHR", sizeof "FALLTHR" - 1))
2271         return false;
2272       from += sizeof "FALLTHR" - 1;
2273       if (*from == 'U')
2274         from++;
2275       else if ((size_t) (pfile->buffer->cur - from)  < sizeof "OUGH" - 1)
2276         return false;
2277       else if (memcmp (from, "OUGH", sizeof "OUGH" - 1))
2278         return false;
2279       else
2280         from += sizeof "OUGH" - 1;
2281       while (*from == ' ' || *from == '\t')
2282         from++;
2283     }
2284   /* Whole comment contents (regex):
2285      [ \t.!]*(ELSE,? |INTENTIONAL(LY)? )?FALL(S | |-)?THR(OUGH|U)[ \t.!]*(-[^\n\r]*)?
2286      [ \t.!]*(Else,? |Intentional(ly)? )?Fall((s | |-)[Tt]|t)hr(ough|u)[ \t.!]*(-[^\n\r]*)?
2287      [ \t.!]*([Ee]lse,? |[Ii]ntentional(ly)? )?fall(s | |-)?thr(ough|u)[ \t.!]*(-[^\n\r]*)?
2288    */
2289   else
2290     {
2291       while (*from == ' ' || *from == '\t' || *from == '.' || *from == '!')
2292         from++;
2293       unsigned char f = *from;
2294       bool all_upper = false;
2295       if (f == 'E' || f == 'e')
2296         {
2297           if ((size_t) (pfile->buffer->cur - from)
2298               < sizeof "else fallthru" - 1)
2299             return false;
2300           if (f == 'E' && memcmp (from + 1, "LSE", sizeof "LSE" - 1) == 0)
2301             all_upper = true;
2302           else if (memcmp (from + 1, "lse", sizeof "lse" - 1))
2303             return false;
2304           from += sizeof "else" - 1;
2305           if (*from == ',')
2306             from++;
2307           if (*from != ' ')
2308             return false;
2309           from++;
2310           if (all_upper && *from == 'f')
2311             return false;
2312           if (f == 'e' && *from == 'F')
2313             return false;
2314           f = *from;
2315         }
2316       else if (f == 'I' || f == 'i')
2317         {
2318           if ((size_t) (pfile->buffer->cur - from)
2319               < sizeof "intentional fallthru" - 1)
2320             return false;
2321           if (f == 'I' && memcmp (from + 1, "NTENTIONAL",
2322                                   sizeof "NTENTIONAL" - 1) == 0)
2323             all_upper = true;
2324           else if (memcmp (from + 1, "ntentional",
2325                            sizeof "ntentional" - 1))
2326             return false;
2327           from += sizeof "intentional" - 1;
2328           if (*from == ' ')
2329             {
2330               from++;
2331               if (all_upper && *from == 'f')
2332                 return false;
2333             }
2334           else if (all_upper)
2335             {
2336               if (memcmp (from, "LY F", sizeof "LY F" - 1))
2337                 return false;
2338               from += sizeof "LY " - 1;
2339             }
2340           else
2341             {
2342               if (memcmp (from, "ly ", sizeof "ly " - 1))
2343                 return false;
2344               from += sizeof "ly " - 1;
2345             }
2346           if (f == 'i' && *from == 'F')
2347             return false;
2348           f = *from;
2349         }
2350       if (f != 'F' && f != 'f')
2351         return false;
2352       if ((size_t) (pfile->buffer->cur - from) < sizeof "fallthru" - 1)
2353         return false;
2354       if (f == 'F' && memcmp (from + 1, "ALL", sizeof "ALL" - 1) == 0)
2355         all_upper = true;
2356       else if (all_upper)
2357         return false;
2358       else if (memcmp (from + 1, "all", sizeof "all" - 1))
2359         return false;
2360       from += sizeof "fall" - 1;
2361       if (*from == (all_upper ? 'S' : 's') && from[1] == ' ')
2362         from += 2;
2363       else if (*from == ' ' || *from == '-')
2364         from++;
2365       else if (*from != (all_upper ? 'T' : 't'))
2366         return false;
2367       if ((f == 'f' || *from != 'T') && (all_upper || *from != 't'))
2368         return false;
2369       if ((size_t) (pfile->buffer->cur - from) < sizeof "thru" - 1)
2370         return false;
2371       if (memcmp (from + 1, all_upper ? "HRU" : "hru", sizeof "hru" - 1))
2372         {
2373           if ((size_t) (pfile->buffer->cur - from) < sizeof "through" - 1)
2374             return false;
2375           if (memcmp (from + 1, all_upper ? "HROUGH" : "hrough",
2376                       sizeof "hrough" - 1))
2377             return false;
2378           from += sizeof "through" - 1;
2379         }
2380       else
2381         from += sizeof "thru" - 1;
2382       while (*from == ' ' || *from == '\t' || *from == '.' || *from == '!')
2383         from++;
2384       if (*from == '-')
2385         {
2386           from++;
2387           if (*comment_start == '*')
2388             {
2389               do
2390                 {
2391                   while (*from && *from != '*'
2392                          && *from != '\n' && *from != '\r')
2393                     from++;
2394                   if (*from != '*' || from[1] == '/')
2395                     break;
2396                   from++;
2397                 }
2398               while (1);
2399             }
2400           else
2401             while (*from && *from != '\n' && *from != '\r')
2402               from++;
2403         }
2404     }
2405   /* C block comment.  */
2406   if (*comment_start == '*')
2407     {
2408       if (*from != '*' || from[1] != '/')
2409         return false;
2410     }
2411   /* C++ line comment.  */
2412   else if (*from != '\n')
2413     return false;
2414
2415   return true;
2416 }
2417
2418 /* Allocate COUNT tokens for RUN.  */
2419 void
2420 _cpp_init_tokenrun (tokenrun *run, unsigned int count)
2421 {
2422   run->base = XNEWVEC (cpp_token, count);
2423   run->limit = run->base + count;
2424   run->next = NULL;
2425 }
2426
2427 /* Returns the next tokenrun, or creates one if there is none.  */
2428 static tokenrun *
2429 next_tokenrun (tokenrun *run)
2430 {
2431   if (run->next == NULL)
2432     {
2433       run->next = XNEW (tokenrun);
2434       run->next->prev = run;
2435       _cpp_init_tokenrun (run->next, 250);
2436     }
2437
2438   return run->next;
2439 }
2440
2441 /* Return the number of not yet processed token in a given
2442    context.  */
2443 int
2444 _cpp_remaining_tokens_num_in_context (cpp_context *context)
2445 {
2446   if (context->tokens_kind == TOKENS_KIND_DIRECT)
2447     return (LAST (context).token - FIRST (context).token);
2448   else if (context->tokens_kind == TOKENS_KIND_INDIRECT
2449            || context->tokens_kind == TOKENS_KIND_EXTENDED)
2450     return (LAST (context).ptoken - FIRST (context).ptoken);
2451   else
2452       abort ();
2453 }
2454
2455 /* Returns the token present at index INDEX in a given context.  If
2456    INDEX is zero, the next token to be processed is returned.  */
2457 static const cpp_token*
2458 _cpp_token_from_context_at (cpp_context *context, int index)
2459 {
2460   if (context->tokens_kind == TOKENS_KIND_DIRECT)
2461     return &(FIRST (context).token[index]);
2462   else if (context->tokens_kind == TOKENS_KIND_INDIRECT
2463            || context->tokens_kind == TOKENS_KIND_EXTENDED)
2464     return FIRST (context).ptoken[index];
2465  else
2466    abort ();
2467 }
2468
2469 /* Look ahead in the input stream.  */
2470 const cpp_token *
2471 cpp_peek_token (cpp_reader *pfile, int index)
2472 {
2473   cpp_context *context = pfile->context;
2474   const cpp_token *peektok;
2475   int count;
2476
2477   /* First, scan through any pending cpp_context objects.  */
2478   while (context->prev)
2479     {
2480       ptrdiff_t sz = _cpp_remaining_tokens_num_in_context (context);
2481
2482       if (index < (int) sz)
2483         return _cpp_token_from_context_at (context, index);
2484       index -= (int) sz;
2485       context = context->prev;
2486     }
2487
2488   /* We will have to read some new tokens after all (and do so
2489      without invalidating preceding tokens).  */
2490   count = index;
2491   pfile->keep_tokens++;
2492
2493   /* For peeked tokens temporarily disable line_change reporting,
2494      until the tokens are parsed for real.  */
2495   void (*line_change) (cpp_reader *, const cpp_token *, int)
2496     = pfile->cb.line_change;
2497   pfile->cb.line_change = NULL;
2498
2499   do
2500     {
2501       peektok = _cpp_lex_token (pfile);
2502       if (peektok->type == CPP_EOF)
2503         {
2504           index--;
2505           break;
2506         }
2507     }
2508   while (index--);
2509
2510   _cpp_backup_tokens_direct (pfile, count - index);
2511   pfile->keep_tokens--;
2512   pfile->cb.line_change = line_change;
2513
2514   return peektok;
2515 }
2516
2517 /* Allocate a single token that is invalidated at the same time as the
2518    rest of the tokens on the line.  Has its line and col set to the
2519    same as the last lexed token, so that diagnostics appear in the
2520    right place.  */
2521 cpp_token *
2522 _cpp_temp_token (cpp_reader *pfile)
2523 {
2524   cpp_token *old, *result;
2525   ptrdiff_t sz = pfile->cur_run->limit - pfile->cur_token;
2526   ptrdiff_t la = (ptrdiff_t) pfile->lookaheads;
2527
2528   old = pfile->cur_token - 1;
2529   /* Any pre-existing lookaheads must not be clobbered.  */
2530   if (la)
2531     {
2532       if (sz <= la)
2533         {
2534           tokenrun *next = next_tokenrun (pfile->cur_run);
2535
2536           if (sz < la)
2537             memmove (next->base + 1, next->base,
2538                      (la - sz) * sizeof (cpp_token));
2539
2540           next->base[0] = pfile->cur_run->limit[-1];
2541         }
2542
2543       if (sz > 1)
2544         memmove (pfile->cur_token + 1, pfile->cur_token,
2545                  MIN (la, sz - 1) * sizeof (cpp_token));
2546     }
2547
2548   if (!sz && pfile->cur_token == pfile->cur_run->limit)
2549     {
2550       pfile->cur_run = next_tokenrun (pfile->cur_run);
2551       pfile->cur_token = pfile->cur_run->base;
2552     }
2553
2554   result = pfile->cur_token++;
2555   result->src_loc = old->src_loc;
2556   return result;
2557 }
2558
2559 /* Lex and return the next token (external interface).  Takes care of
2560    issues like directive handling, token lookahead, multiple include
2561    optimization and skipping.  The pointer returned is either a slot
        of the current token run or &pfile->directive_result.  */
2562 const cpp_token *
2563 _cpp_lex_token (cpp_reader *pfile)
2564 {
2565   cpp_token *result;
2566
2567   for (;;)
2568     {
           /* The current run is used up: carry on in the next one.  */
2569       if (pfile->cur_token == pfile->cur_run->limit)
2570         {
2571           pfile->cur_run = next_tokenrun (pfile->cur_run);
2572           pfile->cur_token = pfile->cur_run->base;
2573         }
2574       /* We assume that the current token is somewhere in the current
2575          run.  */
2576       if (pfile->cur_token < pfile->cur_run->base
2577           || pfile->cur_token >= pfile->cur_run->limit)
2578         abort ();
2579
           /* While pfile->lookaheads is nonzero, hand out the token that
              is already stored at cur_token instead of lexing a new
              one.  */
2580       if (pfile->lookaheads)
2581         {
2582           pfile->lookaheads--;
2583           result = pfile->cur_token++;
2584         }
2585       else
2586         result = _cpp_lex_direct (pfile);
2587
2588       if (result->flags & BOL)
2589         {
2590           /* Is this a directive?  If _cpp_handle_directive returns
2591              false, it is an assembler #.  */
2592           if (result->type == CPP_HASH
2593               /* 6.10.3 p 11: Directives in a list of macro arguments
2594                  gives undefined behavior.  This implementation
2595                  handles the directive as normal.  */
2596               && pfile->state.parsing_args != 1)
2597             {
2598               if (_cpp_handle_directive (pfile, result->flags & PREV_WHITE))
2599                 {
                       /* A CPP_PADDING directive result is not returned;
                          go round the loop again for the next token.  */
2600                   if (pfile->directive_result.type == CPP_PADDING)
2601                     continue;
2602                   result = &pfile->directive_result;
2603                 }
2604             }
               /* At the start of a line while in a deferred pragma, the
                  directive result is returned in place of the lexed
                  token.  */
2605           else if (pfile->state.in_deferred_pragma)
2606             result = &pfile->directive_result;
2607
               /* Tell the client about the first token of a line, unless
                  we are skipping.  */
2608           if (pfile->cb.line_change && !pfile->state.skipping)
2609             pfile->cb.line_change (pfile, result, pfile->state.parsing_args);
2610         }
2611
2612       /* We don't skip tokens in directives.  */
2613       if (pfile->state.in_directive || pfile->state.in_deferred_pragma)
2614         break;
2615
2616       /* Outside a directive, invalidate controlling macros.  At file
2617          EOF, _cpp_lex_direct takes care of popping the buffer, so we never
2618          get here and MI optimization works.  */
2619       pfile->mi_valid = false;
2620
           /* When skipping, every token except CPP_EOF is discarded by
              looping again.  */
2621       if (!pfile->state.skipping || result->type == CPP_EOF)
2622         break;
2623     }
2624
2625   return result;
2626 }
2627
2628 /* Returns true if a fresh line has been loaded.  */
2629 bool
2630 _cpp_get_fresh_line (cpp_reader *pfile)
2631 {
2632   int return_at_eof;
2633
2634   /* We can't get a new line until we leave the current directive.  */
2635   if (pfile->state.in_directive)
2636     return false;
2637
2638   for (;;)
2639     {
2640       cpp_buffer *buffer = pfile->buffer;
2641
2642       if (!buffer->need_line)
2643         return true;
2644
2645       if (buffer->next_line < buffer->rlimit)
2646         {
2647           _cpp_clean_line (pfile);
2648           return true;
2649         }
2650
2651       /* First, get out of parsing arguments state.  */
2652       if (pfile->state.parsing_args)
2653         return false;
2654
2655       /* End of buffer.  Non-empty files should end in a newline.  */
2656       if (buffer->buf != buffer->rlimit
2657           && buffer->next_line > buffer->rlimit
2658           && !buffer->from_stage3)
2659         {
2660           /* Clip to buffer size.  */
2661           buffer->next_line = buffer->rlimit;
2662         }
2663
2664       return_at_eof = buffer->return_at_eof;
2665       _cpp_pop_buffer (pfile);
2666       if (pfile->buffer == NULL || return_at_eof)
2667         return false;
2668     }
2669 }
2670
/* Helper for the operator cases of _cpp_lex_direct; expects `buffer' and
   `result' to be in scope.  If the next input character is CHAR, consume
   it and give the token type THEN_TYPE; otherwise leave the input alone
   and give the token type ELSE_TYPE.  */
#define IF_NEXT_IS(CHAR, THEN_TYPE, ELSE_TYPE)          \
  do                                                    \
    {                                                   \
      if (*buffer->cur == CHAR)                         \
        {                                               \
          buffer->cur++;                                \
          result->type = THEN_TYPE;                     \
        }                                               \
      else                                              \
        result->type = ELSE_TYPE;                       \
    }                                                   \
  while (0)
2679
2680 /* Lex a token into pfile->cur_token, which is also incremented, to
2681    get diagnostics pointing to the correct location.
2682
2683    Does not handle issues such as token lookahead, multiple-include
2684    optimization, directives, skipping etc.  This function is only
2685    suitable for use by _cpp_lex_token, and in special cases like
2686    lex_expansion_token which doesn't care for any of these issues.
2687
2688    When meeting a newline, returns CPP_EOF if parsing a directive,
2689    otherwise returns to the start of the token buffer if permissible.
2690    Returns a pointer to the lexed token.  */
2691 cpp_token *
2692 _cpp_lex_direct (cpp_reader *pfile)
2693 {
2694   cppchar_t c;
2695   cpp_buffer *buffer;
2696   const unsigned char *comment_start;
2697   bool fallthrough_comment = false;
2698   cpp_token *result = pfile->cur_token++;
2699
       /* Entered initially, and again each time a newline has been
          consumed.  */
2700  fresh_line:
2701   result->flags = 0;
2702   buffer = pfile->buffer;
2703   if (buffer->need_line)
2704     {
           /* The line holding a deferred pragma has ended: return
              CPP_PRAGMA_EOL and leave deferred-pragma state,
              decrementing prevent_expansion unless
              pragma_allow_expansion is set.  */
2705       if (pfile->state.in_deferred_pragma)
2706         {
2707           result->type = CPP_PRAGMA_EOL;
2708           pfile->state.in_deferred_pragma = false;
2709           if (!pfile->state.pragma_allow_expansion)
2710             pfile->state.prevent_expansion--;
2711           return result;
2712         }
           /* No further line can be had: the token is CPP_EOF.  */
2713       if (!_cpp_get_fresh_line (pfile))
2714         {
2715           result->type = CPP_EOF;
2716           if (!pfile->state.in_directive)
2717             {
2718               /* Tell the compiler the line number of the EOF token.  */
2719               result->src_loc = pfile->line_table->highest_line;
2720               result->flags = BOL;
2721             }
2722           return result;
2723         }
           /* Fetching the line switched buffers, so forget any
              fallthrough comment seen before the switch.  */
2724       if (buffer != pfile->buffer)
2725         fallthrough_comment = false;
           /* Unless tokens must be kept, start again at the beginning of
              the base token run.  */
2726       if (!pfile->keep_tokens)
2727         {
2728           pfile->cur_run = &pfile->base_run;
2729           result = pfile->base_run.base;
2730           pfile->cur_token = result + 1;
2731         }
2732       result->flags = BOL;
2733       if (pfile->state.parsing_args == 2)
2734         result->flags |= PREV_WHITE;
2735     }
2736   buffer = pfile->buffer;
       /* Re-entered after a comment that is not saved as a token.  */
2737  update_tokens_line:
2738   result->src_loc = pfile->line_table->highest_line;
2739
       /* Re-entered after horizontal whitespace has been skipped.  */
2740  skipped_white:
2741   if (buffer->cur >= buffer->notes[buffer->cur_note].pos
2742       && !pfile->overlaid_buffer)
2743     {
2744       _cpp_process_line_notes (pfile, false);
2745       result->src_loc = pfile->line_table->highest_line;
2746     }
2747   c = *buffer->cur++;
2748
       /* A forced location, when one is set, overrides the position
          computed from the column of the character just read.  */
2749   if (pfile->forced_token_location_p)
2750     result->src_loc = *pfile->forced_token_location_p;
2751   else
2752     result->src_loc = linemap_position_for_column (pfile->line_table,
2753                                           CPP_BUF_COLUMN (buffer, buffer->cur));
2754
2755   switch (c)
2756     {
2757     case ' ': case '\t': case '\f': case '\v': case '\0':
2758       result->flags |= PREV_WHITE;
2759       skip_whitespace (pfile, c);
2760       goto skipped_white;
2761
2762     case '\n':
           /* The line counter is not advanced for a newline that is the
              last character of the buffer.  */
2763       if (buffer->cur < buffer->rlimit)
2764         CPP_INCREMENT_LINE (pfile, 0);
2765       buffer->need_line = true;
2766       goto fresh_line;
2767
2768     case '0': case '1': case '2': case '3': case '4':
2769     case '5': case '6': case '7': case '8': case '9':
2770       {
2771         struct normalize_state nst = INITIAL_NORMALIZE_STATE;
2772         result->type = CPP_NUMBER;
2773         lex_number (pfile, &result->val.str, &nst);
2774         warn_about_normalization (pfile, result, &nst);
2775         break;
2776       }
2777
2778     case 'L':
2779     case 'u':
2780     case 'U':
2781     case 'R':
2782       /* 'L', 'u', 'U', 'u8' or 'R' may introduce wide characters,
2783          wide strings or raw strings.  */
2784       if (c == 'L' || CPP_OPTION (pfile, rliterals)
2785           || (c != 'R' && CPP_OPTION (pfile, uliterals)))
2786         {
2787           if ((*buffer->cur == '\'' && c != 'R')
2788               || *buffer->cur == '"'
2789               || (*buffer->cur == 'R'
2790                   && c != 'R'
2791                   && buffer->cur[1] == '"'
2792                   && CPP_OPTION (pfile, rliterals))
2793               || (*buffer->cur == '8'
2794                   && c == 'u'
2795                   && ((buffer->cur[1] == '"' || (buffer->cur[1] == '\''
2796                                 && CPP_OPTION (pfile, utf8_char_literals)))
2797                       || (buffer->cur[1] == 'R' && buffer->cur[2] == '"'
2798                           && CPP_OPTION (pfile, rliterals)))))
2799             {
2800               lex_string (pfile, result, buffer->cur - 1);
2801               break;
2802             }
2803         }
2804       /* Fall through.  */
2805
         /* 'L', 'R', 'u' and 'U' are absent from the lists below because
            they are handled above and fall through to here when they do
            not introduce a literal.  */
2806     case '_':
2807     case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
2808     case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
2809     case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
2810     case 's': case 't':           case 'v': case 'w': case 'x':
2811     case 'y': case 'z':
2812     case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
2813     case 'G': case 'H': case 'I': case 'J': case 'K':
2814     case 'M': case 'N': case 'O': case 'P': case 'Q':
2815     case 'S': case 'T':           case 'V': case 'W': case 'X':
2816     case 'Y': case 'Z':
2817       result->type = CPP_NAME;
2818       {
2819         struct normalize_state nst = INITIAL_NORMALIZE_STATE;
2820         result->val.node.node = lex_identifier (pfile, buffer->cur - 1, false,
2821                                                 &nst,
2822                                                 &result->val.node.spelling);
2823         warn_about_normalization (pfile, result, &nst);
2824       }
2825
2826       /* Convert named operators to their proper types.  */
2827       if (result->val.node.node->flags & NODE_OPERATOR)
2828         {
2829           result->flags |= NAMED_OP;
2830           result->type = (enum cpp_ttype) result->val.node.node->directive_index;
2831         }
2832
2833       /* Signal FALLTHROUGH comment followed by another token.  */
2834       if (fallthrough_comment)
2835         result->flags |= PREV_FALLTHROUGH;
2836       break;
2837
2838     case '\'':
2839     case '"':
2840       lex_string (pfile, result, buffer->cur - 1);
2841       break;
2842
2843     case '/':
2844       /* A potential block or line comment.  */
2845       comment_start = buffer->cur;
2846       c = *buffer->cur;
2847
2848       if (c == '*')
2849         {
2850           if (_cpp_skip_block_comment (pfile))
2851             cpp_error (pfile, CPP_DL_ERROR, "unterminated comment");
2852         }
2853       else if (c == '/' && ! CPP_OPTION (pfile, traditional))
2854         {
2855           /* Don't warn for system headers.  */
2856           if (cpp_in_system_header (pfile))
2857             ;
2858           /* Warn about comments if pedantically GNUC89, and not
2859              in system headers.  */
2860           else if (CPP_OPTION (pfile, lang) == CLK_GNUC89
2861                    && CPP_PEDANTIC (pfile)
2862                    && ! buffer->warned_cplusplus_comments)
2863             {
2864               cpp_error (pfile, CPP_DL_PEDWARN,
2865                          "C++ style comments are not allowed in ISO C90");
2866               cpp_error (pfile, CPP_DL_PEDWARN,
2867                          "(this will be reported only once per input file)");
2868               buffer->warned_cplusplus_comments = 1;
2869             }
2870           /* Or if specifically desired via -Wc90-c99-compat.  */
2871           else if (CPP_OPTION (pfile, cpp_warn_c90_c99_compat) > 0
2872                    && ! CPP_OPTION (pfile, cplusplus)
2873                    && ! buffer->warned_cplusplus_comments)
2874             {
2875               cpp_error (pfile, CPP_DL_WARNING,
2876                          "C++ style comments are incompatible with C90");
2877               cpp_error (pfile, CPP_DL_WARNING,
2878                          "(this will be reported only once per input file)");
2879               buffer->warned_cplusplus_comments = 1;
2880             }
2881           /* In C89/C94, C++ style comments are forbidden.  */
2882           else if ((CPP_OPTION (pfile, lang) == CLK_STDC89
2883                     || CPP_OPTION (pfile, lang) == CLK_STDC94))
2884             {
2885               /* But don't be confused about valid code such as
2886                  - // immediately followed by *,
2887                  - // in a preprocessing directive,
2888                  - // in an #if 0 block.
                      In those cases the first '/' is returned as a plain
                      CPP_DIV and the second is left in the input.  */
2889               if (buffer->cur[1] == '*'
2890                   || pfile->state.in_directive
2891                   || pfile->state.skipping)
2892                 {
2893                   result->type = CPP_DIV;
2894                   break;
2895                 }
2896               else if (! buffer->warned_cplusplus_comments)
2897                 {
2898                   cpp_error (pfile, CPP_DL_ERROR,
2899                              "C++ style comments are not allowed in ISO C90");
2900                   cpp_error (pfile, CPP_DL_ERROR,
2901                              "(this will be reported only once per input "
2902                              "file)");
2903                   buffer->warned_cplusplus_comments = 1;
2904                 }
2905             }
2906           if (skip_line_comment (pfile) && CPP_OPTION (pfile, warn_comments))
2907             cpp_warning (pfile, CPP_W_COMMENTS, "multi-line comment");
2908         }
2909       else if (c == '=')
2910         {
2911           buffer->cur++;
2912           result->type = CPP_DIV_EQ;
2913           break;
2914         }
2915       else
2916         {
2917           result->type = CPP_DIV;
2918           break;
2919         }
2920
           /* Only the two comment branches get this far; the operator
              branches above have already left the switch.  */
2921       if (fallthrough_comment_p (pfile, comment_start))
2922         fallthrough_comment = true;
2923
2924       if (pfile->cb.comment)
2925         {
               /* COMMENT_START points just past the leading '/', hence
                  the adjustments by one so that the callback is given
                  the whole comment.  */
2926           size_t len = pfile->buffer->cur - comment_start;
2927           pfile->cb.comment (pfile, result->src_loc, comment_start - 1,
2928                              len + 1);
2929         }
2930
           /* A comment that is not being saved counts as whitespace
              before whatever token comes next.  */
2931       if (!pfile->state.save_comments)
2932         {
2933           result->flags |= PREV_WHITE;
2934           goto update_tokens_line;
2935         }
2936
2937       if (fallthrough_comment)
2938         result->flags |= PREV_FALLTHROUGH;
2939
2940       /* Save the comment as a token in its own right.  */
2941       save_comment (pfile, result, comment_start, c);
2942       break;
2943
2944     case '<':
           /* When angled headers are being looked for, let lex_string
              try first; if it leaves the type as CPP_LESS, lex '<' as
              an operator after all.  */
2945       if (pfile->state.angled_headers)
2946         {
2947           lex_string (pfile, result, buffer->cur - 1);
2948           if (result->type != CPP_LESS)
2949             break;
2950         }
2951
2952       result->type = CPP_LESS;
2953       if (*buffer->cur == '=')
2954         buffer->cur++, result->type = CPP_LESS_EQ;
2955       else if (*buffer->cur == '<')
2956         {
2957           buffer->cur++;
2958           IF_NEXT_IS ('=', CPP_LSHIFT_EQ, CPP_LSHIFT);
2959         }
2960       else if (CPP_OPTION (pfile, digraphs))
2961         {
2962           if (*buffer->cur == ':')
2963             {
2964               /* C++11 [2.5/3 lex.pptoken], "Otherwise, if the next
2965                  three characters are <:: and the subsequent character
2966                  is neither : nor >, the < is treated as a preprocessor
2967                  token by itself".  */
2968               if (CPP_OPTION (pfile, cplusplus)
2969                   && CPP_OPTION (pfile, lang) != CLK_CXX98
2970                   && CPP_OPTION (pfile, lang) != CLK_GNUCXX
2971                   && buffer->cur[1] == ':'
2972                   && buffer->cur[2] != ':' && buffer->cur[2] != '>')
2973                 break;
2974
                   /* "<:" is the digraph for '['.  */
2975               buffer->cur++;
2976               result->flags |= DIGRAPH;
2977               result->type = CPP_OPEN_SQUARE;
2978             }
2979           else if (*buffer->cur == '%')
2980             {
                   /* "<%" is the digraph for '{'.  */
2981               buffer->cur++;
2982               result->flags |= DIGRAPH;
2983               result->type = CPP_OPEN_BRACE;
2984             }
2985         }
2986       break;
2987
2988     case '>':
2989       result->type = CPP_GREATER;
2990       if (*buffer->cur == '=')
2991         buffer->cur++, result->type = CPP_GREATER_EQ;
2992       else if (*buffer->cur == '>')
2993         {
2994           buffer->cur++;
2995           IF_NEXT_IS ('=', CPP_RSHIFT_EQ, CPP_RSHIFT);
2996         }
2997       break;
2998
2999     case '%':
3000       result->type = CPP_MOD;
3001       if (*buffer->cur == '=')
3002         buffer->cur++, result->type = CPP_MOD_EQ;
3003       else if (CPP_OPTION (pfile, digraphs))
3004         {
3005           if (*buffer->cur == ':')
3006             {
                   /* "%:" is the digraph for '#', and "%:%:" the one for
                      "##".  */
3007               buffer->cur++;
3008               result->flags |= DIGRAPH;
3009               result->type = CPP_HASH;
3010               if (*buffer->cur == '%' && buffer->cur[1] == ':')
3011                 buffer->cur += 2, result->type = CPP_PASTE, result->val.token_no = 0;
3012             }
3013           else if (*buffer->cur == '>')
3014             {
                   /* "%>" is the digraph for '}'.  */
3015               buffer->cur++;
3016               result->flags |= DIGRAPH;
3017               result->type = CPP_CLOSE_BRACE;
3018             }
3019         }
3020       break;
3021
3022     case '.':
3023       result->type = CPP_DOT;
           /* A '.' followed by a digit begins a number.  */
3024       if (ISDIGIT (*buffer->cur))
3025         {
3026           struct normalize_state nst = INITIAL_NORMALIZE_STATE;
3027           result->type = CPP_NUMBER;
3028           lex_number (pfile, &result->val.str, &nst);
3029           warn_about_normalization (pfile, result, &nst);
3030         }
3031       else if (*buffer->cur == '.' && buffer->cur[1] == '.')
3032         buffer->cur += 2, result->type = CPP_ELLIPSIS;
3033       else if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
3034         buffer->cur++, result->type = CPP_DOT_STAR;
3035       break;
3036
3037     case '+':
3038       result->type = CPP_PLUS;
3039       if (*buffer->cur == '+')
3040         buffer->cur++, result->type = CPP_PLUS_PLUS;
3041       else if (*buffer->cur == '=')
3042         buffer->cur++, result->type = CPP_PLUS_EQ;
3043       break;
3044
3045     case '-':
3046       result->type = CPP_MINUS;
3047       if (*buffer->cur == '>')
3048         {
3049           buffer->cur++;
3050           result->type = CPP_DEREF;
3051           if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
3052             buffer->cur++, result->type = CPP_DEREF_STAR;
3053         }
3054       else if (*buffer->cur == '-')
3055         buffer->cur++, result->type = CPP_MINUS_MINUS;
3056       else if (*buffer->cur == '=')
3057         buffer->cur++, result->type = CPP_MINUS_EQ;
3058       break;
3059
3060     case '&':
3061       result->type = CPP_AND;
3062       if (*buffer->cur == '&')
3063         buffer->cur++, result->type = CPP_AND_AND;
3064       else if (*buffer->cur == '=')
3065         buffer->cur++, result->type = CPP_AND_EQ;
3066       break;
3067
3068     case '|':
3069       result->type = CPP_OR;
3070       if (*buffer->cur == '|')
3071         buffer->cur++, result->type = CPP_OR_OR;
3072       else if (*buffer->cur == '=')
3073         buffer->cur++, result->type = CPP_OR_EQ;
3074       break;
3075
3076     case ':':
3077       result->type = CPP_COLON;
3078       if (*buffer->cur == ':' && CPP_OPTION (pfile, cplusplus))
3079         buffer->cur++, result->type = CPP_SCOPE;
3080       else if (*buffer->cur == '>' && CPP_OPTION (pfile, digraphs))
3081         {
               /* ":>" is the digraph for ']'.  */
3082           buffer->cur++;
3083           result->flags |= DIGRAPH;
3084           result->type = CPP_CLOSE_SQUARE;
3085         }
3086       break;
3087
3088     case '*': IF_NEXT_IS ('=', CPP_MULT_EQ, CPP_MULT); break;
3089     case '=': IF_NEXT_IS ('=', CPP_EQ_EQ, CPP_EQ); break;
3090     case '!': IF_NEXT_IS ('=', CPP_NOT_EQ, CPP_NOT); break;
3091     case '^': IF_NEXT_IS ('=', CPP_XOR_EQ, CPP_XOR); break;
3092     case '#': IF_NEXT_IS ('#', CPP_PASTE, CPP_HASH); result->val.token_no = 0; break;
3093
3094     case '?': result->type = CPP_QUERY; break;
3095     case '~': result->type = CPP_COMPL; break;
3096     case ',': result->type = CPP_COMMA; break;
3097     case '(': result->type = CPP_OPEN_PAREN; break;
3098     case ')': result->type = CPP_CLOSE_PAREN; break;
3099     case '[': result->type = CPP_OPEN_SQUARE; break;
3100     case ']': result->type = CPP_CLOSE_SQUARE; break;
3101     case '{': result->type = CPP_OPEN_BRACE; break;
3102     case '}': result->type = CPP_CLOSE_BRACE; break;
3103     case ';': result->type = CPP_SEMICOLON; break;
3104
3105       /* @ is a punctuator in Objective-C.  */
3106     case '@': result->type = CPP_ATSIGN; break;
3107
3108     case '$':
3109     case '\\':
3110       {
             /* Step back onto the character so that forms_identifier_p
                examines it; if no identifier starts here, step forward
                again and treat it as a stray character below.  */
3111         const uchar *base = --buffer->cur;
3112         struct normalize_state nst = INITIAL_NORMALIZE_STATE;
3113
3114         if (forms_identifier_p (pfile, true, &nst))
3115           {
3116             result->type = CPP_NAME;
3117             result->val.node.node = lex_identifier (pfile, base, true, &nst,
3118                                                     &result->val.node.spelling);
3119             warn_about_normalization (pfile, result, &nst);
3120             break;
3121           }
3122         buffer->cur++;
3123       }
3124       /* FALLTHRU */
3125
3126     default:
           /* Any other character becomes a one-character CPP_OTHER
              token.  */
3127       create_literal (pfile, result, buffer->cur - 1, 1, CPP_OTHER);
3128       break;
3129     }
3130
3131   /* Potentially convert the location of the token to a range.  */
3132   if (result->src_loc >= RESERVED_LOCATION_COUNT
3133       && result->type != CPP_EOF)
3134     {
3135       /* Ensure that any line notes are processed, so that we have the
3136          correct physical line/column for the end-point of the token even
3137          when a logical line is split via one or more backslashes.  */
3138       if (buffer->cur >= buffer->notes[buffer->cur_note].pos
3139           && !pfile->overlaid_buffer)
3140         _cpp_process_line_notes (pfile, false);
3141
           /* The range starts at the location recorded above and
              finishes at the current buffer column.  */
3142       source_range tok_range;
3143       tok_range.m_start = result->src_loc;
3144       tok_range.m_finish
3145         = linemap_position_for_column (pfile->line_table,
3146                                        CPP_BUF_COLUMN (buffer, buffer->cur));
3147
3148       result->src_loc = COMBINE_LOCATION_DATA (pfile->line_table,
3149                                                result->src_loc,
3150                                                tok_range, NULL);
3151     }
3152
3153   return result;
3154 }
3155
3156 /* An upper bound on the number of bytes needed to spell TOKEN.
3157    Does not include preceding whitespace.  */
3158 unsigned int
3159 cpp_token_len (const cpp_token *token)
3160 {
3161   unsigned int len;
3162
3163   switch (TOKEN_SPELL (token))
3164     {
3165     default:            len = 6;                                break;
3166     case SPELL_LITERAL: len = token->val.str.len;               break;
3167     case SPELL_IDENT:   len = NODE_LEN (token->val.node.node) * 10;     break;
3168     }
3169
3170   return len;
3171 }
3172
/* Parse one UTF-8 sequence starting at NAME and write it to BUFFER as a
   \UXXXXXXXX escape with lower-case hex digits.  Exactly 10 bytes are
   always written to BUFFER; no terminating NUL is added.  Returns the
   number of bytes of NAME that were consumed (1 to 6).

   A plain ASCII byte is treated as a one-byte sequence, so the return
   value is never zero and callers that advance by it always make
   progress.  Ill-formed input aborts: a first byte that is itself a
   continuation byte (10xxxxxx), a first byte that announces more than
   six bytes (0xFE, 0xFF), or a following byte that is not of the form
   10xxxxxx.  */

static size_t
utf8_to_ucn (unsigned char *buffer, const unsigned char *name)
{
  static const char hex_digits[] = "0123456789abcdef";
  unsigned int lead_ones = 0;
  size_t seq_len;
  size_t k;
  int shift;
  unsigned t;
  unsigned long utf32;

  /* The number of leading 1 bits in the first byte is the length of
     the sequence; none at all means a single ASCII byte.  */
  for (t = *name; t & 0x80; t <<= 1)
    lead_ones++;

  /* Ill-formed UTF-8: a stray continuation byte, or an over-long lead
     whose payload would not fit in the eight hex digits written.  */
  if (lead_ones == 1 || lead_ones > 6)
    abort ();

  /* The payload bits of the first byte are those below its length
     marker.  */
  utf32 = *name & (0x7F >> lead_ones);
  seq_len = lead_ones ? lead_ones : 1;

  for (k = 1; k < seq_len; k++)
    {
      const unsigned char cont = name[k];

      /* Ill-formed UTF-8.  */
      if ((cont & 0xC0) != 0x80)
        abort ();

      utf32 = (utf32 << 6) | (cont & 0x3F);
    }

  *buffer++ = '\\';
  *buffer++ = 'U';
  /* Eight hex digits, most significant nibble first.  */
  for (shift = 28; shift >= 0; shift -= 4)
    *buffer++ = hex_digits[(utf32 >> shift) & 0xF];

  return seq_len;
}
3206
3207 /* Given a token TYPE corresponding to a digraph, return a pointer to
3208    the spelling of the digraph.  */
3209 static const unsigned char *
3210 cpp_digraph2name (enum cpp_ttype type)
3211 {
3212   return digraph_spellings[(int) type - (int) CPP_FIRST_DIGRAPH];
3213 }
3214
3215 /* Write the spelling of an identifier IDENT, using UCNs, to BUFFER.
3216    The buffer must already contain the enough space to hold the
3217    token's spelling.  Returns a pointer to the character after the
3218    last character written.  */
3219 unsigned char *
3220 _cpp_spell_ident_ucns (unsigned char *buffer, cpp_hashnode *ident)
3221 {
3222   size_t i;
3223   const unsigned char *name = NODE_NAME (ident);
3224
3225   for (i = 0; i < NODE_LEN (ident); i++)
3226     if (name[i] & ~0x7F)
3227       {
3228         i += utf8_to_ucn (buffer, name + i) - 1;
3229         buffer += 10;
3230       }
3231     else
3232       *buffer++ = name[i];
3233
3234   return buffer;
3235 }
3236
3237 /* Write the spelling of a token TOKEN to BUFFER.  The buffer must
3238    already contain the enough space to hold the token's spelling.
3239    Returns a pointer to the character after the last character written.
3240    FORSTRING is true if this is to be the spelling after translation
3241    phase 1 (with the original spelling of extended identifiers), false
3242    if extended identifiers should always be written using UCNs (there is
3243    no option for always writing them in the internal UTF-8 form).
3244    FIXME: Would be nice if we didn't need the PFILE argument.  */
3245 unsigned char *
3246 cpp_spell_token (cpp_reader *pfile, const cpp_token *token,
3247                  unsigned char *buffer, bool forstring)
3248 {
3249   switch (TOKEN_SPELL (token))
3250     {
3251     case SPELL_OPERATOR:
3252       {
3253         const unsigned char *spelling;
3254         unsigned char c;
3255
3256         if (token->flags & DIGRAPH)
3257           spelling = cpp_digraph2name (token->type);
3258         else if (token->flags & NAMED_OP)
3259           goto spell_ident;
3260         else
3261           spelling = TOKEN_NAME (token);
3262
3263         while ((c = *spelling++) != '\0')
3264           *buffer++ = c;
3265       }
3266       break;
3267
3268     spell_ident:
3269     case SPELL_IDENT:
3270       if (forstring)
3271         {
3272           memcpy (buffer, NODE_NAME (token->val.node.spelling),
3273                   NODE_LEN (token->val.node.spelling));
3274           buffer += NODE_LEN (token->val.node.spelling);
3275         }
3276       else
3277         buffer = _cpp_spell_ident_ucns (buffer, token->val.node.node);
3278       break;
3279
3280     case SPELL_LITERAL:
3281       memcpy (buffer, token->val.str.text, token->val.str.len);
3282       buffer += token->val.str.len;
3283       break;
3284
3285     case SPELL_NONE:
3286       cpp_error (pfile, CPP_DL_ICE,
3287                  "unspellable token %s", TOKEN_NAME (token));
3288       break;
3289     }
3290
3291   return buffer;
3292 }
3293
3294 /* Returns TOKEN spelt as a null-terminated string.  The string is
3295    freed when the reader is destroyed.  Useful for diagnostics.  */
3296 unsigned char *
3297 cpp_token_as_text (cpp_reader *pfile, const cpp_token *token)
3298 {
3299   unsigned int len = cpp_token_len (token) + 1;
3300   unsigned char *start = _cpp_unaligned_alloc (pfile, len), *end;
3301
3302   end = cpp_spell_token (pfile, token, start, false);
3303   end[0] = '\0';
3304
3305   return start;
3306 }
3307
3308 /* Returns a pointer to a string which spells the token defined by
3309    TYPE and FLAGS.  Used by C front ends, which really should move to
3310    using cpp_token_as_text.  */
3311 const char *
3312 cpp_type2name (enum cpp_ttype type, unsigned char flags)
3313 {
3314   if (flags & DIGRAPH)
3315     return (const char *) cpp_digraph2name (type);
3316   else if (flags & NAMED_OP)
3317     return cpp_named_operator2name (type);
3318
3319   return (const char *) token_spellings[type].name;
3320 }
3321
3322 /* Writes the spelling of token to FP, without any preceding space.
3323    Separated from cpp_spell_token for efficiency - to avoid stdio
3324    double-buffering.  */
3325 void
3326 cpp_output_token (const cpp_token *token, FILE *fp)
3327 {
3328   switch (TOKEN_SPELL (token))
3329     {
3330     case SPELL_OPERATOR:
3331       {
3332         const unsigned char *spelling;
3333         int c;
3334
3335         if (token->flags & DIGRAPH)
3336           spelling = cpp_digraph2name (token->type);
3337         else if (token->flags & NAMED_OP)
3338           goto spell_ident;
3339         else
3340           spelling = TOKEN_NAME (token);
3341
3342         c = *spelling;
3343         do
3344           putc (c, fp);
3345         while ((c = *++spelling) != '\0');
3346       }
3347       break;
3348
3349     spell_ident:
3350     case SPELL_IDENT:
3351       {
3352         size_t i;
3353         const unsigned char * name = NODE_NAME (token->val.node.node);
3354
3355         for (i = 0; i < NODE_LEN (token->val.node.node); i++)
3356           if (name[i] & ~0x7F)
3357             {
3358               unsigned char buffer[10];
3359               i += utf8_to_ucn (buffer, name + i) - 1;
3360               fwrite (buffer, 1, 10, fp);
3361             }
3362           else
3363             fputc (NODE_NAME (token->val.node.node)[i], fp);
3364       }
3365       break;
3366
3367     case SPELL_LITERAL:
3368       fwrite (token->val.str.text, 1, token->val.str.len, fp);
3369       break;
3370
3371     case SPELL_NONE:
3372       /* An error, most probably.  */
3373       break;
3374     }
3375 }
3376
3377 /* Compare two tokens.  */
3378 int
3379 _cpp_equiv_tokens (const cpp_token *a, const cpp_token *b)
3380 {
3381   if (a->type == b->type && a->flags == b->flags)
3382     switch (TOKEN_SPELL (a))
3383       {
3384       default:                  /* Keep compiler happy.  */
3385       case SPELL_OPERATOR:
3386         /* token_no is used to track where multiple consecutive ##
3387            tokens were originally located.  */
3388         return (a->type != CPP_PASTE || a->val.token_no == b->val.token_no);
3389       case SPELL_NONE:
3390         return (a->type != CPP_MACRO_ARG
3391                 || (a->val.macro_arg.arg_no == b->val.macro_arg.arg_no
3392                     && a->val.macro_arg.spelling == b->val.macro_arg.spelling));
3393       case SPELL_IDENT:
3394         return (a->val.node.node == b->val.node.node
3395                 && a->val.node.spelling == b->val.node.spelling);
3396       case SPELL_LITERAL:
3397         return (a->val.str.len == b->val.str.len
3398                 && !memcmp (a->val.str.text, b->val.str.text,
3399                             a->val.str.len));
3400       }
3401
3402   return 0;
3403 }
3404
3405 /* Returns nonzero if a space should be inserted to avoid an
3406    accidental token paste for output.  For simplicity, it is
3407    conservative, and occasionally advises a space where one is not
3408    needed, e.g. "." and ".2".  TOKEN1 is the left-hand token and
        TOKEN2 the token that would follow it; the decision is made from
        the type of TOKEN1 and from the type or first spelling character
        of TOKEN2.  */
3409 int
3410 cpp_avoid_paste (cpp_reader *pfile, const cpp_token *token1,
3411                  const cpp_token *token2)
3412 {
3413   enum cpp_ttype a = token1->type, b = token2->type;
3414   cppchar_t c;
3415
       /* Named operators are spelled like identifiers, so treat them as
          names.  */
3416   if (token1->flags & NAMED_OP)
3417     a = CPP_NAME;
3418   if (token2->flags & NAMED_OP)
3419     b = CPP_NAME;
3420
       /* C is the first character of TOKEN2's spelling when TOKEN2 is a
          digraph or an operator, and EOF otherwise.  */
3421   c = EOF;
3422   if (token2->flags & DIGRAPH)
3423     c = digraph_spellings[(int) b - (int) CPP_FIRST_DIGRAPH][0];
3424   else if (token_spellings[b].category == SPELL_OPERATOR)
3425     c = token_spellings[b].name[0];
3426
3427   /* Quickly get everything that can paste with an '='.  */
3428   if ((int) a <= (int) CPP_LAST_EQ && c == '=')
3429     return 1;
3430
3431   switch (a)
3432     {
3433     case CPP_GREATER:   return c == '>';
3434     case CPP_LESS:      return c == '<' || c == '%' || c == ':';
3435     case CPP_PLUS:      return c == '+';
3436     case CPP_MINUS:     return c == '-' || c == '>';
3437     case CPP_DIV:       return c == '/' || c == '*'; /* Comments.  */
3438     case CPP_MOD:       return c == ':' || c == '>';
3439     case CPP_AND:       return c == '&';
3440     case CPP_OR:        return c == '|';
3441     case CPP_COLON:     return c == ':' || c == '>';
3442     case CPP_DEREF:     return c == '*';
3443     case CPP_DOT:       return c == '.' || c == '%' || b == CPP_NUMBER;
3444     case CPP_HASH:      return c == '#' || c == '%'; /* Digraph form.  */
3445     case CPP_NAME:      return ((b == CPP_NUMBER
3446                                  && name_p (pfile, &token2->val.str))
3447                                 || b == CPP_NAME
3448                                 || b == CPP_CHAR || b == CPP_STRING); /* L */
3449     case CPP_NUMBER:    return (b == CPP_NUMBER || b == CPP_NAME
3450                                 || c == '.' || c == '+' || c == '-');
3451                                       /* UCNs */
3452     case CPP_OTHER:     return ((token1->val.str.text[0] == '\\'
3453                                  && b == CPP_NAME)
3454                                 || (CPP_OPTION (pfile, objc)
3455                                     && token1->val.str.text[0] == '@'
3456                                     && (b == CPP_NAME || b == CPP_STRING)));
         /* With user-defined literals enabled, a string followed by a
            name, or by a literal whose text starts with an identifier
            character, must be kept apart.  */
3457     case CPP_STRING:
3458     case CPP_WSTRING:
3459     case CPP_UTF8STRING:
3460     case CPP_STRING16:
3461     case CPP_STRING32:  return (CPP_OPTION (pfile, user_literals)
3462                                 && (b == CPP_NAME
3463                                     || (TOKEN_SPELL (token2) == SPELL_LITERAL
3464                                         && ISIDST (token2->val.str.text[0]))));
3465
3466     default:            break;
3467     }
3468
3469   return 0;
3470 }
3471
3472 /* Output all the remaining tokens on the current line, and a newline
3473    character, to FP.  Leading whitespace is removed.  If there are
3474    macros, special token padding is not performed.  */
3475 void
3476 cpp_output_line (cpp_reader *pfile, FILE *fp)
3477 {
3478   const cpp_token *token;
3479
3480   token = cpp_get_token (pfile);
3481   while (token->type != CPP_EOF)
3482     {
3483       cpp_output_token (token, fp);
3484       token = cpp_get_token (pfile);
3485       if (token->flags & PREV_WHITE)
3486         putc (' ', fp);
3487     }
3488
3489   putc ('\n', fp);
3490 }
3491
3492 /* Return a string representation of all the remaining tokens on the
3493    current line.  The result is allocated using xmalloc and must be
3494    freed by the caller.  */
3495 unsigned char *
3496 cpp_output_line_to_string (cpp_reader *pfile, const unsigned char *dir_name)
3497 {
3498   const cpp_token *token;
3499   unsigned int out = dir_name ? ustrlen (dir_name) : 0;
3500   unsigned int alloced = 120 + out;
3501   unsigned char *result = (unsigned char *) xmalloc (alloced);
3502
3503   /* If DIR_NAME is empty, there are no initial contents.  */
3504   if (dir_name)
3505     {
3506       sprintf ((char *) result, "#%s ", dir_name);
3507       out += 2;
3508     }
3509
3510   token = cpp_get_token (pfile);
3511   while (token->type != CPP_EOF)
3512     {
3513       unsigned char *last;
3514       /* Include room for a possible space and the terminating nul.  */
3515       unsigned int len = cpp_token_len (token) + 2;
3516
3517       if (out + len > alloced)
3518         {
3519           alloced *= 2;
3520           if (out + len > alloced)
3521             alloced = out + len;
3522           result = (unsigned char *) xrealloc (result, alloced);
3523         }
3524
3525       last = cpp_spell_token (pfile, token, &result[out], 0);
3526       out = last - result;
3527
3528       token = cpp_get_token (pfile);
3529       if (token->flags & PREV_WHITE)
3530         result[out++] = ' ';
3531     }
3532
3533   result[out] = '\0';
3534   return result;
3535 }
3536
/* Memory buffers.  Changing these three constants can have a dramatic
   effect on performance.  The values here are reasonable defaults,
   but might be tuned.  If you adjust them, be sure to test across a
   range of uses of cpplib, including heavy nested function-like macro
   expansion.  Also check the change in peak memory usage (NJAMD is a
   good tool for this).

   MIN_BUFF_SIZE is the smallest data area new_buff will create.
   BUFF_SIZE_UPPER_BOUND (MIN_SIZE) is the largest free-list buffer
   _cpp_get_buff will hand out for a request of MIN_SIZE bytes.
   EXTENDED_BUFF_SIZE (BUFF, MIN_EXTRA) is the size requested when
   extending BUFF: MIN_EXTRA plus twice the bytes between BUFF's cur
   and limit.  Every macro argument is parenthesized so that an
   expression argument (e.g. a conditional or bitwise expression)
   expands with the intended precedence.  BUFF is evaluated twice.  */
#define MIN_BUFF_SIZE 8000
#define BUFF_SIZE_UPPER_BOUND(MIN_SIZE) (MIN_BUFF_SIZE + (MIN_SIZE) * 3 / 2)
#define EXTENDED_BUFF_SIZE(BUFF, MIN_EXTRA) \
	((MIN_EXTRA) + ((BUFF)->limit - (BUFF)->cur) * 2)

/* A request for zero bytes must still be satisfiable by a minimum-size
   buffer, otherwise such buffers could never be reused.  */
#if MIN_BUFF_SIZE > BUFF_SIZE_UPPER_BOUND (0)
  #error BUFF_SIZE_UPPER_BOUND must be at least as large as MIN_BUFF_SIZE!
#endif
3551
3552 /* Create a new allocation buffer.  Place the control block at the end
3553    of the buffer, so that buffer overflows will cause immediate chaos.  */
3554 static _cpp_buff *
3555 new_buff (size_t len)
3556 {
3557   _cpp_buff *result;
3558   unsigned char *base;
3559
3560   if (len < MIN_BUFF_SIZE)
3561     len = MIN_BUFF_SIZE;
3562   len = CPP_ALIGN (len);
3563
3564 #ifdef ENABLE_VALGRIND_ANNOTATIONS
3565   /* Valgrind warns about uses of interior pointers, so put _cpp_buff
3566      struct first.  */
3567   size_t slen = CPP_ALIGN2 (sizeof (_cpp_buff), 2 * DEFAULT_ALIGNMENT);
3568   base = XNEWVEC (unsigned char, len + slen);
3569   result = (_cpp_buff *) base;
3570   base += slen;
3571 #else
3572   base = XNEWVEC (unsigned char, len + sizeof (_cpp_buff));
3573   result = (_cpp_buff *) (base + len);
3574 #endif
3575   result->base = base;
3576   result->cur = base;
3577   result->limit = base + len;
3578   result->next = NULL;
3579   return result;
3580 }
3581
3582 /* Place a chain of unwanted allocation buffers on the free list.  */
3583 void
3584 _cpp_release_buff (cpp_reader *pfile, _cpp_buff *buff)
3585 {
3586   _cpp_buff *end = buff;
3587
3588   while (end->next)
3589     end = end->next;
3590   end->next = pfile->free_buffs;
3591   pfile->free_buffs = buff;
3592 }
3593
3594 /* Return a free buffer of size at least MIN_SIZE.  */
3595 _cpp_buff *
3596 _cpp_get_buff (cpp_reader *pfile, size_t min_size)
3597 {
3598   _cpp_buff *result, **p;
3599
3600   for (p = &pfile->free_buffs;; p = &(*p)->next)
3601     {
3602       size_t size;
3603
3604       if (*p == NULL)
3605         return new_buff (min_size);
3606       result = *p;
3607       size = result->limit - result->base;
3608       /* Return a buffer that's big enough, but don't waste one that's
3609          way too big.  */
3610       if (size >= min_size && size <= BUFF_SIZE_UPPER_BOUND (min_size))
3611         break;
3612     }
3613
3614   *p = result->next;
3615   result->next = NULL;
3616   result->cur = result->base;
3617   return result;
3618 }
3619
3620 /* Creates a new buffer with enough space to hold the uncommitted
3621    remaining bytes of BUFF, and at least MIN_EXTRA more bytes.  Copies
3622    the excess bytes to the new buffer.  Chains the new buffer after
3623    BUFF, and returns the new buffer.  */
3624 _cpp_buff *
3625 _cpp_append_extend_buff (cpp_reader *pfile, _cpp_buff *buff, size_t min_extra)
3626 {
3627   size_t size = EXTENDED_BUFF_SIZE (buff, min_extra);
3628   _cpp_buff *new_buff = _cpp_get_buff (pfile, size);
3629
3630   buff->next = new_buff;
3631   memcpy (new_buff->base, buff->cur, BUFF_ROOM (buff));
3632   return new_buff;
3633 }
3634
3635 /* Creates a new buffer with enough space to hold the uncommitted
3636    remaining bytes of the buffer pointed to by BUFF, and at least
3637    MIN_EXTRA more bytes.  Copies the excess bytes to the new buffer.
3638    Chains the new buffer before the buffer pointed to by BUFF, and
3639    updates the pointer to point to the new buffer.  */
3640 void
3641 _cpp_extend_buff (cpp_reader *pfile, _cpp_buff **pbuff, size_t min_extra)
3642 {
3643   _cpp_buff *new_buff, *old_buff = *pbuff;
3644   size_t size = EXTENDED_BUFF_SIZE (old_buff, min_extra);
3645
3646   new_buff = _cpp_get_buff (pfile, size);
3647   memcpy (new_buff->base, old_buff->cur, BUFF_ROOM (old_buff));
3648   new_buff->next = old_buff;
3649   *pbuff = new_buff;
3650 }
3651
3652 /* Free a chain of buffers starting at BUFF.  */
3653 void
3654 _cpp_free_buff (_cpp_buff *buff)
3655 {
3656   _cpp_buff *next;
3657
3658   for (; buff; buff = next)
3659     {
3660       next = buff->next;
3661 #ifdef ENABLE_VALGRIND_ANNOTATIONS
3662       free (buff);
3663 #else
3664       free (buff->base);
3665 #endif
3666     }
3667 }
3668
3669 /* Allocate permanent, unaligned storage of length LEN.  */
3670 unsigned char *
3671 _cpp_unaligned_alloc (cpp_reader *pfile, size_t len)
3672 {
3673   _cpp_buff *buff = pfile->u_buff;
3674   unsigned char *result = buff->cur;
3675
3676   if (len > (size_t) (buff->limit - result))
3677     {
3678       buff = _cpp_get_buff (pfile, len);
3679       buff->next = pfile->u_buff;
3680       pfile->u_buff = buff;
3681       result = buff->cur;
3682     }
3683
3684   buff->cur = result + len;
3685   return result;
3686 }
3687
3688 /* Allocate permanent, unaligned storage of length LEN from a_buff.
3689    That buffer is used for growing allocations when saving macro
3690    replacement lists in a #define, and when parsing an answer to an
3691    assertion in #assert, #unassert or #if (and therefore possibly
3692    whilst expanding macros).  It therefore must not be used by any
3693    code that they might call: specifically the lexer and the guts of
3694    the macro expander.
3695
3696    All existing other uses clearly fit this restriction: storing
3697    registered pragmas during initialization.  */
3698 unsigned char *
3699 _cpp_aligned_alloc (cpp_reader *pfile, size_t len)
3700 {
3701   _cpp_buff *buff = pfile->a_buff;
3702   unsigned char *result = buff->cur;
3703
3704   if (len > (size_t) (buff->limit - result))
3705     {
3706       buff = _cpp_get_buff (pfile, len);
3707       buff->next = pfile->a_buff;
3708       pfile->a_buff = buff;
3709       result = buff->cur;
3710     }
3711
3712   buff->cur = result + len;
3713   return result;
3714 }
3715
3716 /* Say which field of TOK is in use.  */
3717
3718 enum cpp_token_fld_kind
3719 cpp_token_val_index (const cpp_token *tok)
3720 {
3721   switch (TOKEN_SPELL (tok))
3722     {
3723     case SPELL_IDENT:
3724       return CPP_TOKEN_FLD_NODE;
3725     case SPELL_LITERAL:
3726       return CPP_TOKEN_FLD_STR;
3727     case SPELL_OPERATOR:
3728       if (tok->type == CPP_PASTE)
3729         return CPP_TOKEN_FLD_TOKEN_NO;
3730       else
3731         return CPP_TOKEN_FLD_NONE;
3732     case SPELL_NONE:
3733       if (tok->type == CPP_MACRO_ARG)
3734         return CPP_TOKEN_FLD_ARG_NO;
3735       else if (tok->type == CPP_PADDING)
3736         return CPP_TOKEN_FLD_SOURCE;
3737       else if (tok->type == CPP_PRAGMA)
3738         return CPP_TOKEN_FLD_PRAGMA;
3739       /* fall through */
3740     default:
3741       return CPP_TOKEN_FLD_NONE;
3742     }
3743 }
3744
3745 /* All tokens lexed in R after calling this function will be forced to have
3746    their source_location the same as the location referenced by P, until
3747    cpp_stop_forcing_token_locations is called for R.  */
3748
3749 void
3750 cpp_force_token_locations (cpp_reader *r, source_location *p)
3751 {
3752   r->forced_token_location_p = p;
3753 }
3754
3755 /* Go back to assigning locations naturally for lexed tokens.  */
3756
3757 void
3758 cpp_stop_forcing_token_locations (cpp_reader *r)
3759 {
3760   r->forced_token_location_p = NULL;
3761 }