libcpp/lex.c

   1 /* CPP Library - lexical analysis.
   2    Copyright (C) 2000-2017 Free Software Foundation, Inc.
   3    Contributed by Per Bothner, 1994-95.
   4    Based on CCCP program by Paul Rubin, June 1986
   5    Adapted to ANSI C, Richard Stallman, Jan 1987
   6    Broken out to separate file, Zack Weinberg, Mar 2000
   7
   8 This program is free software; you can redistribute it and/or modify it
   9 under the terms of the GNU General Public License as published by the
  10 Free Software Foundation; either version 3, or (at your option) any
  11 later version.
  12
  13 This program is distributed in the hope that it will be useful,
  14 but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16 GNU General Public License for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with this program; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "cpplib.h"
  25 #include "internal.h"
  26
/* Spelling category of a token type.  One of these values is stored for
   every token type in the token_spellings table and is read back with
   TOKEN_SPELL.  */
enum spell_type
{
  SPELL_OPERATOR = 0,   /* Used by OP() table entries, which carry a
                           fixed spelling string.  */
  SPELL_IDENT,
  SPELL_LITERAL,
  SPELL_NONE
};
  34
/* One entry of the token_spellings table, describing a single token
   type.  */
struct token_spelling
{
  /* How tokens of this type are spelled.  */
  enum spell_type category;
  /* For OP() entries the operator's spelling; for TK() entries the
     stringified name of the token type enumerator.  */
  const unsigned char *name;
};
  40
/* Spellings of the digraphs.  NOTE(review): the indexing scheme is not
   visible in this part of the file; confirm against the users of this
   table before reordering.  */
static const unsigned char *const digraph_spellings[] =
{ UC"%:", UC"%:%:", UC"<:", UC":>", UC"<%", UC"%>" };

/* Expand TTYPE_TABLE into one token_spelling initializer per token type.
   OP entries record their spelling string S; TK entries record the
   spelling category SPELL_<S> and the stringified enumerator name E.  */
#define OP(e, s) { SPELL_OPERATOR, UC s  },
#define TK(e, s) { SPELL_ ## s,    UC #e },
static const struct token_spelling token_spellings[N_TTYPES] = { TTYPE_TABLE };
#undef OP
#undef TK

/* Accessors for the table entry selected by TOKEN's type.  TOKEN is a
   pointer to a token.  */
#define TOKEN_SPELL(token) (token_spellings[(token)->type].category)
#define TOKEN_NAME(token) (token_spellings[(token)->type].name)
  52
/* Forward declarations of helpers that are local to this file.  */
static void add_line_note (cpp_buffer *, const uchar *, unsigned int);
static int skip_line_comment (cpp_reader *);
static void skip_whitespace (cpp_reader *, cppchar_t);
static void lex_string (cpp_reader *, cpp_token *, const uchar *);
static void save_comment (cpp_reader *, cpp_token *, const uchar *, cppchar_t);
static void store_comment (cpp_reader *, cpp_token *);
static void create_literal (cpp_reader *, cpp_token *, const uchar *,
                            unsigned int, enum cpp_ttype);
static bool warn_in_comment (cpp_reader *, _cpp_line_note *);
static int name_p (cpp_reader *, const cpp_string *);
static tokenrun *next_tokenrun (tokenrun *);

static _cpp_buff *new_buff (size_t);
  66
  67
  68 /* Utility routine:
  69
  70    Compares, the token TOKEN to the NUL-terminated string STRING.
  71    TOKEN must be a CPP_NAME.  Returns 1 for equal, 0 for unequal.  */
  72 int
  73 cpp_ideq (const cpp_token *token, const char *string)
  74 {
  75   if (token->type != CPP_NAME)
  76     return 0;
  77
  78   return !ustrcmp (NODE_NAME (token->val.node.node), (const uchar *) string);
  79 }
  80
  81 /* Record a note TYPE at byte POS into the current cleaned logical
  82    line.  */
  83 static void
  84 add_line_note (cpp_buffer *buffer, const uchar *pos, unsigned int type)
  85 {
  86   if (buffer->notes_used == buffer->notes_cap)
  87     {
  88       buffer->notes_cap = buffer->notes_cap * 2 + 200;
  89       buffer->notes = XRESIZEVEC (_cpp_line_note, buffer->notes,
  90                                   buffer->notes_cap);
  91     }
  92
  93   buffer->notes[buffer->notes_used].pos = pos;
  94   buffer->notes[buffer->notes_used].type = type;
  95   buffer->notes_used++;
  96 }
  97
  98 \f
  99 /* Fast path to find line special characters using optimized character
 100    scanning algorithms.  Anything complicated falls back to the slow
 101    path below.  Since this loop is very hot it's worth doing these kinds
 102    of optimizations.
 103
 104    One of the paths through the ifdefs should provide
 105
 106      const uchar *search_line_fast (const uchar *s, const uchar *end);
 107
 108    Between S and END, search for \n, \r, \\, ?.  Return a pointer to
 109    the found character.
 110
 111    Note that the last character of the buffer is *always* a newline,
 112    as forced by _cpp_convert_input.  This fact can be used to avoid
 113    explicitly looking for the end of the buffer.  */
 114
/* Configure gives us an ifdef test.  Give the macro a value of 0 when it
   is not defined, so that the code below can test it in ordinary `if'
   statements and #if expressions.  */
#ifndef WORDS_BIGENDIAN
#define WORDS_BIGENDIAN 0
#endif

/* We'd like the largest integer that fits into a register.  There's nothing
   in <stdint.h> that gives us that.  For most hosts this is unsigned long,
   but MS decided on an LLP64 model.  Thankfully when building with GCC we
   can get the "real" word size.  */
#ifdef __GNUC__
typedef unsigned int word_type __attribute__((__mode__(__word__)));
#else
typedef unsigned long word_type;
#endif

/* The code below is only expecting sizes 4 or 8.
   Die at compile-time if this expectation is violated: the array size
   below evaluates to 1 when the size is acceptable and to -1, which is
   not a valid array size, otherwise.  */
typedef char check_word_type_size
  [(sizeof(word_type) == 8 || sizeof(word_type) == 4) * 2 - 1];
 134
 135 /* Return X with the first N bytes forced to values that won't match one
 136    of the interesting characters.  Note that NUL is not interesting.  */
 137
 138 static inline word_type
 139 acc_char_mask_misalign (word_type val, unsigned int n)
 140 {
 141   word_type mask = -1;
 142   if (WORDS_BIGENDIAN)
 143     mask >>= n * 8;
 144   else
 145     mask <<= n * 8;
 146   return val & mask;
 147 }
 148
 149 /* Return X replicated to all byte positions within WORD_TYPE.  */
 150
 151 static inline word_type
 152 acc_char_replicate (uchar x)
 153 {
 154   word_type ret;
 155
 156   ret = (x << 24) | (x << 16) | (x << 8) | x;
 157   if (sizeof(word_type) == 8)
 158     ret = (ret << 16 << 16) | ret;
 159   return ret;
 160 }
 161
 162 /* Return non-zero if some byte of VAL is (probably) C.  */
 163
 164 static inline word_type
 165 acc_char_cmp (word_type val, word_type c)
 166 {
 167 #if defined(__GNUC__) && defined(__alpha__)
 168   /* We can get exact results using a compare-bytes instruction.
 169      Get (val == c) via (0 >= (val ^ c)).  */
 170   return __builtin_alpha_cmpbge (0, val ^ c);
 171 #else
 172   word_type magic = 0x7efefefeU;
 173   if (sizeof(word_type) == 8)
 174     magic = (magic << 16 << 16) | 0xfefefefeU;
 175   magic |= 1;
 176
 177   val ^= c;
 178   return ((val + magic) ^ ~val) & ~magic;
 179 #endif
 180 }
 181
 182 /* Given the result of acc_char_cmp is non-zero, return the index of
 183    the found character.  If this was a false positive, return -1.  */
 184
 185 static inline int
 186 acc_char_index (word_type cmp ATTRIBUTE_UNUSED,
 187                 word_type val ATTRIBUTE_UNUSED)
 188 {
 189 #if defined(__GNUC__) && defined(__alpha__) && !WORDS_BIGENDIAN
 190   /* The cmpbge instruction sets *bits* of the result corresponding to
 191      matches in the bytes with no false positives.  */
 192   return __builtin_ctzl (cmp);
 193 #else
 194   unsigned int i;
 195
 196   /* ??? It would be nice to force unrolling here,
 197      and have all of these constants folded.  */
 198   for (i = 0; i < sizeof(word_type); ++i)
 199     {
 200       uchar c;
 201       if (WORDS_BIGENDIAN)
 202         c = (val >> (sizeof(word_type) - i - 1) * 8) & 0xff;
 203       else
 204         c = (val >> i * 8) & 0xff;
 205
 206       if (c == '\n' || c == '\r' || c == '\\' || c == '?')
 207         return i;
 208     }
 209
 210   return -1;
 211 #endif
 212 }
 213
 214 /* A version of the fast scanner using bit fiddling techniques.
 215
 216    For 32-bit words, one would normally perform 16 comparisons and
 217    16 branches.  With this algorithm one performs 24 arithmetic
 218    operations and one branch.  Whether this is faster with a 32-bit
 219    word size is going to be somewhat system dependent.
 220
 221    For 64-bit words, we eliminate twice the number of comparisons
 222    and branches without increasing the number of arithmetic operations.
 223    It's almost certainly going to be a win with 64-bit word size.  */
 224
 225 static const uchar * search_line_acc_char (const uchar *, const uchar *)
 226   ATTRIBUTE_UNUSED;
 227
 228 static const uchar *
 229 search_line_acc_char (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 230 {
 231   const word_type repl_nl = acc_char_replicate ('\n');
 232   const word_type repl_cr = acc_char_replicate ('\r');
 233   const word_type repl_bs = acc_char_replicate ('\\');
 234   const word_type repl_qm = acc_char_replicate ('?');
 235
 236   unsigned int misalign;
 237   const word_type *p;
 238   word_type val, t;
 239
 240   /* Align the buffer.  Mask out any bytes from before the beginning.  */
 241   p = (word_type *)((uintptr_t)s & -sizeof(word_type));
 242   val = *p;
 243   misalign = (uintptr_t)s & (sizeof(word_type) - 1);
 244   if (misalign)
 245     val = acc_char_mask_misalign (val, misalign);
 246
 247   /* Main loop.  */
 248   while (1)
 249     {
 250       t  = acc_char_cmp (val, repl_nl);
 251       t |= acc_char_cmp (val, repl_cr);
 252       t |= acc_char_cmp (val, repl_bs);
 253       t |= acc_char_cmp (val, repl_qm);
 254
 255       if (__builtin_expect (t != 0, 0))
 256         {
 257           int i = acc_char_index (t, val);
 258           if (i >= 0)
 259             return (const uchar *)p + i;
 260         }
 261
 262       val = *++p;
 263     }
 264 }
 265
 266 /* Disable on Solaris 2/x86 until the following problem can be properly
 267    autoconfed:
 268
 269    The Solaris 10+ assembler tags objects with the instruction set
 270    extensions used, so SSE4.2 executables cannot run on machines that
 271    don't support that extension.  */
 272
 273 #if (GCC_VERSION >= 4005) && (__GNUC__ >= 5 || !defined(__PIC__)) && (defined(__i386__) || defined(__x86_64__)) && !(defined(__sun__) && defined(__svr4__))
 274
/* Replicated character data to be shared between implementations.
   Recall that outside of a context with vector support we can't
   define compatible vector types, therefore these are all defined
   in terms of raw characters.

   Row 0 holds '\n', row 1 '\r', row 2 '\\' and row 3 '?'.  Each row is
   16 bytes long and the array is 16-byte aligned; the users below cast
   a row to an 8-byte or a 16-byte vector type.  */
static const char repl_chars[4][16] __attribute__((aligned(16))) = {
  { '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
    '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n' },
  { '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
    '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r' },
  { '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
    '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\' },
  { '?', '?', '?', '?', '?', '?', '?', '?',
    '?', '?', '?', '?', '?', '?', '?', '?' },
};
 289
/* A version of the fast scanner using MMX vectorized byte compare insns.

   This uses the PMOVMSKB instruction which was introduced with "MMX2",
   which was packaged into SSE1; it is also present in the AMD MMX
   extension.  Mark the function as using "sse" so that we emit a real
   "emms" instruction, rather than the 3dNOW "femms" instruction.

   Starting at S, return a pointer to the first \n, \r, \\ or ?.  END is
   unused; the loop only terminates when a match is found.  */

static const uchar *
#ifndef __SSE__
__attribute__((__target__("sse")))
#endif
search_line_mmx (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
{
  typedef char v8qi __attribute__ ((__vector_size__ (8)));
  typedef int __m64 __attribute__ ((__vector_size__ (8), __may_alias__));

  /* Only the first 8 bytes of each 16-byte row of repl_chars are read.  */
  const v8qi repl_nl = *(const v8qi *)repl_chars[0];
  const v8qi repl_cr = *(const v8qi *)repl_chars[1];
  const v8qi repl_bs = *(const v8qi *)repl_chars[2];
  const v8qi repl_qm = *(const v8qi *)repl_chars[3];

  unsigned int misalign, found, mask;
  const v8qi *p;
  v8qi data, t, c;

  /* Align the source pointer.  While MMX doesn't generate unaligned data
     faults, this allows us to safely scan to the end of the buffer without
     reading beyond the end of the last page.  */
  misalign = (uintptr_t)s & 7;
  p = (const v8qi *)((uintptr_t)s & -8);
  data = *p;

  /* Create a mask for the bytes that are valid within the first
     8-byte block.  The Idea here is that the AND with the mask
     within the loop is "free", since we need some AND or TEST
     insn in order to set the flags for the branch anyway.  */
  mask = -1u << misalign;

  /* Main loop processing 8 bytes at a time.  The first block has already
     been loaded and has its own mask, so jump past the load; every
     later block is loaded at the top of the loop and uses an all-ones
     mask.  */
  goto start;
  do
    {
      data = *++p;
      mask = -1;

    start:
      /* Compare against each interesting character and OR the four
         results together.  */
      t = __builtin_ia32_pcmpeqb(data, repl_nl);
      c = __builtin_ia32_pcmpeqb(data, repl_cr);
      t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
      c = __builtin_ia32_pcmpeqb(data, repl_bs);
      t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
      c = __builtin_ia32_pcmpeqb(data, repl_qm);
      t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
      /* Reduce the byte-wise result to one bit per byte and drop the
         bits for bytes that precede S.  */
      found = __builtin_ia32_pmovmskb (t);
      found &= mask;
    }
  while (!found);

  __builtin_ia32_emms ();

  /* FOUND contains 1 in bits for which we matched a relevant
     character.  Conversion to the byte index is trivial.  */
  found = __builtin_ctz(found);
  return (const uchar *)p + found;
}
 355
/* A version of the fast scanner using SSE2 vectorized byte compare insns.

   Starting at S, return a pointer to the first \n, \r, \\ or ?.  END is
   unused; the loop only terminates when a match is found.  */

static const uchar *
#ifndef __SSE2__
__attribute__((__target__("sse2")))
#endif
search_line_sse2 (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
{
  typedef char v16qi __attribute__ ((__vector_size__ (16)));

  const v16qi repl_nl = *(const v16qi *)repl_chars[0];
  const v16qi repl_cr = *(const v16qi *)repl_chars[1];
  const v16qi repl_bs = *(const v16qi *)repl_chars[2];
  const v16qi repl_qm = *(const v16qi *)repl_chars[3];

  unsigned int misalign, found, mask;
  const v16qi *p;
  v16qi data, t;

  /* Align the source pointer.  The first load therefore starts at or
     before S, and MISALIGN counts the bytes that precede S.  */
  misalign = (uintptr_t)s & 15;
  p = (const v16qi *)((uintptr_t)s & -16);
  data = *p;

  /* Create a mask for the bytes that are valid within the first
     16-byte block.  The Idea here is that the AND with the mask
     within the loop is "free", since we need some AND or TEST
     insn in order to set the flags for the branch anyway.  */
  mask = -1u << misalign;

  /* Main loop processing 16 bytes at a time.  The first block has
     already been loaded and has its own mask, so jump past the load;
     every later block uses an all-ones mask.  */
  goto start;
  do
    {
      data = *++p;
      mask = -1;

    start:
      t  = __builtin_ia32_pcmpeqb128(data, repl_nl);
      t |= __builtin_ia32_pcmpeqb128(data, repl_cr);
      t |= __builtin_ia32_pcmpeqb128(data, repl_bs);
      t |= __builtin_ia32_pcmpeqb128(data, repl_qm);
      /* Reduce the byte-wise result to one bit per byte and drop the
         bits for bytes that precede S.  */
      found = __builtin_ia32_pmovmskb128 (t);
      found &= mask;
    }
  while (!found);

  /* FOUND contains 1 in bits for which we matched a relevant
     character.  Conversion to the byte index is trivial.  */
  found = __builtin_ctz(found);
  return (const uchar *)p + found;
}
 408
#ifdef HAVE_SSE4
/* A version of the fast scanner using SSE 4.2 vectorized string insns.

   Starting at S, return a pointer to the first \n, \r, \\ or ?.  END is
   only used to decide whether an unaligned 16-byte read at S is safe.  */

static const uchar *
#ifndef __SSE4_2__
__attribute__((__target__("sse4.2")))
#endif
search_line_sse42 (const uchar *s, const uchar *end)
{
  typedef char v16qi __attribute__ ((__vector_size__ (16)));
  /* The set of characters to look for; the explicit length 4 passed to
     the instruction below covers exactly these four entries.  */
  static const v16qi search = { '\n', '\r', '?', '\\' };

  uintptr_t si = (uintptr_t)s;
  uintptr_t index;

  /* Check for unaligned input.  */
  if (si & 15)
    {
      v16qi sv;

      if (__builtin_expect (end - s < 16, 0)
          && __builtin_expect ((si & 0xfff) > 0xff0, 0))
        {
          /* There are fewer than 16 bytes left in the buffer, and fewer
             than 16 bytes left on the page.  Reading 16 bytes at this
             point might generate a spurious page fault.  Defer to the
             SSE2 implementation, which already handles alignment.  */
          return search_line_sse2 (s, end);
        }

      /* ??? The builtin doesn't understand that the PCMPESTRI read from
         memory need not be aligned.  */
      sv = __builtin_ia32_loaddqu ((const char *) s);
      index = __builtin_ia32_pcmpestri128 (search, 4, sv, 16, 0);

      /* An index below 16 is treated as a match within these 16 bytes.  */
      if (__builtin_expect (index < 16, 0))
        goto found;

      /* Advance the pointer to an aligned address.  We will re-scan a
         few bytes, but we no longer need care for reading past the
         end of a page, since we're guaranteed a match.  */
      s = (const uchar *)((si + 15) & -16);
    }

  /* Main loop, processing 16 bytes at a time.  */
#ifdef __GCC_ASM_FLAG_OUTPUTS__
  while (1)
    {
      char f;

      /* By using inline assembly instead of the builtin,
         we can use the result, as well as the flags set.
         F receives the carry flag; the loop stops once it is set.  */
      __asm ("%vpcmpestri\t$0, %2, %3"
             : "=c"(index), "=@ccc"(f)
             : "m"(*s), "x"(search), "a"(4), "d"(16));
      if (f)
        break;

      s += 16;
    }
#else
  /* Compensate for the add at the top of the assembly loop.  */
  s -= 16;
  /* By doing the whole loop in inline assembly,
     we can make proper use of the flags set.  The loop repeats while
     the carry flag is clear.  */
  __asm (      ".balign 16\n"
        "0:     add $16, %1\n"
        "       %vpcmpestri\t$0, (%1), %2\n"
        "       jnc 0b"
        : "=&c"(index), "+r"(s)
        : "x"(search), "a"(4), "d"(16));
#endif

 found:
  /* INDEX is the offset of the match within the 16 bytes at S.  */
  return s + index;
}

#else
/* Work around out-dated assemblers without sse4 support.  */
#define search_line_sse42 search_line_sse2
#endif
 489
 490 /* Check the CPU capabilities.  */
 491
 492 #include "../gcc/config/i386/cpuid.h"
 493
 494 typedef const uchar * (*search_line_fast_type) (const uchar *, const uchar *);
 495 static search_line_fast_type search_line_fast;
 496
 497 #define HAVE_init_vectorized_lexer 1
 498 static inline void
 499 init_vectorized_lexer (void)
 500 {
 501   unsigned dummy, ecx = 0, edx = 0;
 502   search_line_fast_type impl = search_line_acc_char;
 503   int minimum = 0;
 504
 505 #if defined(__SSE4_2__)
 506   minimum = 3;
 507 #elif defined(__SSE2__)
 508   minimum = 2;
 509 #elif defined(__SSE__)
 510   minimum = 1;
 511 #endif
 512
 513   if (minimum == 3)
 514     impl = search_line_sse42;
 515   else if (__get_cpuid (1, &dummy, &dummy, &ecx, &edx) || minimum == 2)
 516     {
 517       if (minimum == 3 || (ecx & bit_SSE4_2))
 518         impl = search_line_sse42;
 519       else if (minimum == 2 || (edx & bit_SSE2))
 520         impl = search_line_sse2;
 521       else if (minimum == 1 || (edx & bit_SSE))
 522         impl = search_line_mmx;
 523     }
 524   else if (__get_cpuid (0x80000001, &dummy, &dummy, &dummy, &edx))
 525     {
 526       if (minimum == 1
 527           || (edx & (bit_MMXEXT | bit_CMOV)) == (bit_MMXEXT | bit_CMOV))
 528         impl = search_line_mmx;
 529     }
 530
 531   search_line_fast = impl;
 532 }
 533
 534 #elif defined(_ARCH_PWR8) && defined(__ALTIVEC__)
 535
/* A version of the fast scanner using AltiVec vectorized byte compares
   and VSX unaligned loads (when VSX is available).  This is otherwise
   the same as the pre-GCC 5 version.

   Starting at S, return a pointer to the first \n, \r, \\ or ?.  END is
   unused; the loop only terminates when a match is found.  */

ATTRIBUTE_NO_SANITIZE_UNDEFINED
static const uchar *
search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
{
  typedef __attribute__((altivec(vector))) unsigned char vc;

  const vc repl_nl = {
    '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
    '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
  };
  const vc repl_cr = {
    '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
    '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
  };
  const vc repl_bs = {
    '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
    '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
  };
  const vc repl_qm = {
    '?', '?', '?', '?', '?', '?', '?', '?',
    '?', '?', '?', '?', '?', '?', '?', '?',
  };
  const vc zero = { 0 };

  vc data, t;

  /* Main loop processing 16 bytes at a time.  */
  do
    {
      vc m_nl, m_cr, m_bs, m_qm;

      /* Load directly from S; this path neither aligns the pointer nor
         masks any leading bytes.  */
      data = *((const vc *)s);
      s += 16;

      m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
      m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
      m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
      m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
      t = (m_nl | m_cr) | (m_bs | m_qm);

      /* T now contains 0xff in bytes for which we matched one of the relevant
         characters.  We want to exit the loop if any byte in T is non-zero.
         Below is the expansion of vec_any_ne(t, zero).  */
    }
  while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));

  /* Restore s to point to the 16 bytes we just processed.  */
  s -= 16;

  {
    /* Number of unsigned long words that make up one vector.  */
#define N  (sizeof(vc) / sizeof(long))

    union {
      vc v;
      /* Statically assert that N is 2 or 4.  */
      unsigned long l[(N == 2 || N == 4) ? N : -1];
    } u;
    unsigned long l, i = 0;

    u.v = t;

    /* Find the first word of T that is non-zero.  S is advanced by one
       word for every all-zero word that is skipped.  The last word is
       not tested: the loop above only exits when some byte of T is
       non-zero.  */
    switch (N)
      {
      case 4:
        l = u.l[i++];
        if (l != 0)
          break;
        s += sizeof(unsigned long);
        l = u.l[i++];
        if (l != 0)
          break;
        s += sizeof(unsigned long);
        /* FALLTHRU */
      case 2:
        l = u.l[i++];
        if (l != 0)
          break;
        s += sizeof(unsigned long);
        l = u.l[i];
      }

    /* L now contains 0xff in bytes for which we matched one of the
       relevant characters.  We can find the byte index by finding
       its bit index and dividing by 8.  */
#ifdef __BIG_ENDIAN__
    l = __builtin_clzl(l) >> 3;
#else
    l = __builtin_ctzl(l) >> 3;
#endif
    return s + l;

#undef N
  }
}
 635
 636 #elif (GCC_VERSION >= 4005) && defined(__ALTIVEC__) && defined (__BIG_ENDIAN__)
 637
/* A version of the fast scanner using AltiVec vectorized byte compares.
   This cannot be used for little endian because vec_lvsl/lvsr are
   deprecated for little endian and the code won't work properly.

   Starting at S, return a pointer to the first \n, \r, \\ or ?.  END is
   unused; the loop only terminates when a match is found.  */
/* ??? Unfortunately, attribute(target("altivec")) is not yet supported,
   so we can't compile this function without -maltivec on the command line
   (or implied by some other switch).  */

static const uchar *
search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
{
  typedef __attribute__((altivec(vector))) unsigned char vc;

  const vc repl_nl = {
    '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
    '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
  };
  const vc repl_cr = {
    '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
    '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
  };
  const vc repl_bs = {
    '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
    '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
  };
  const vc repl_qm = {
    '?', '?', '?', '?', '?', '?', '?', '?',
    '?', '?', '?', '?', '?', '?', '?', '?',
  };
  const vc ones = {
    -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1,
  };
  const vc zero = { 0 };

  vc data, mask, t;

  /* Altivec loads automatically mask addresses with -16.  This lets us
     issue the first load as early as possible.  */
  data = __builtin_vec_ld(0, (const vc *)s);

  /* Discard bytes before the beginning of the buffer.  Do this by
     beginning with all ones and shifting in zeros according to the
     mis-alignment.  The LVSR instruction pulls the exact shift we
     want from the address.  */
  mask = __builtin_vec_lvsr(0, s);
  mask = __builtin_vec_perm(zero, ones, mask);
  data &= mask;

  /* While altivec loads mask addresses, we still need to align S so
     that the offset we compute at the end is correct.  */
  s = (const uchar *)((uintptr_t)s & -16);

  /* Main loop processing 16 bytes at a time.  The first block has
     already been loaded and masked, so jump past the load.  */
  goto start;
  do
    {
      vc m_nl, m_cr, m_bs, m_qm;

      s += 16;
      data = __builtin_vec_ld(0, (const vc *)s);

    start:
      m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
      m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
      m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
      m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
      t = (m_nl | m_cr) | (m_bs | m_qm);

      /* T now contains 0xff in bytes for which we matched one of the relevant
         characters.  We want to exit the loop if any byte in T is non-zero.
         Below is the expansion of vec_any_ne(t, zero).  */
    }
  while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));

  {
    /* Number of unsigned long words that make up one vector.  */
#define N  (sizeof(vc) / sizeof(long))

    union {
      vc v;
      /* Statically assert that N is 2 or 4.  */
      unsigned long l[(N == 2 || N == 4) ? N : -1];
    } u;
    unsigned long l, i = 0;

    u.v = t;

    /* Find the first word of T that is non-zero.  S is advanced by one
       word for every all-zero word that is skipped.  The last word is
       not tested: the loop above only exits when some byte of T is
       non-zero.  */
    switch (N)
      {
      case 4:
        l = u.l[i++];
        if (l != 0)
          break;
        s += sizeof(unsigned long);
        l = u.l[i++];
        if (l != 0)
          break;
        s += sizeof(unsigned long);
        /* FALLTHROUGH */
      case 2:
        l = u.l[i++];
        if (l != 0)
          break;
        s += sizeof(unsigned long);
        l = u.l[i];
      }

    /* L now contains 0xff in bytes for which we matched one of the
       relevant characters.  We can find the byte index by finding
       its bit index and dividing by 8.  This branch is only built for
       big-endian hosts, hence the count of leading zeros.  */
    l = __builtin_clzl(l) >> 3;
    return s + l;

#undef N
  }
}
 754
 755 #elif defined (__ARM_NEON) && defined (__ARM_64BIT_STATE)
 756 #include "arm_neon.h"
 757
 758 /* This doesn't have to be the exact page size, but no system may use
 759    a size smaller than this.  ARMv8 requires a minimum page size of
 760    4k.  The impact of being conservative here is a small number of
 761    cases will take the slightly slower entry path into the main
 762    loop.  */
 763
 764 #define AARCH64_MIN_PAGE_SIZE 4096
 765
 766 static const uchar *
 767 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 768 {
 769   const uint8x16_t repl_nl = vdupq_n_u8 ('\n');
 770   const uint8x16_t repl_cr = vdupq_n_u8 ('\r');
 771   const uint8x16_t repl_bs = vdupq_n_u8 ('\\');
 772   const uint8x16_t repl_qm = vdupq_n_u8 ('?');
 773   const uint8x16_t xmask = (uint8x16_t) vdupq_n_u64 (0x8040201008040201ULL);
 774
 775 #ifdef __AARCH64EB
 776   const int16x8_t shift = {8, 8, 8, 8, 0, 0, 0, 0};
 777 #else
 778   const int16x8_t shift = {0, 0, 0, 0, 8, 8, 8, 8};
 779 #endif
 780
 781   unsigned int found;
 782   const uint8_t *p;
 783   uint8x16_t data;
 784   uint8x16_t t;
 785   uint16x8_t m;
 786   uint8x16_t u, v, w;
 787
 788   /* Align the source pointer.  */
 789   p = (const uint8_t *)((uintptr_t)s & -16);
 790
 791   /* Assuming random string start positions, with a 4k page size we'll take
 792      the slow path about 0.37% of the time.  */
 793   if (__builtin_expect ((AARCH64_MIN_PAGE_SIZE
 794                          - (((uintptr_t) s) & (AARCH64_MIN_PAGE_SIZE - 1)))
 795                         < 16, 0))
 796     {
 797       /* Slow path: the string starts near a possible page boundary.  */
 798       uint32_t misalign, mask;
 799
 800       misalign = (uintptr_t)s & 15;
 801       mask = (-1u << misalign) & 0xffff;
 802       data = vld1q_u8 (p);
 803       t = vceqq_u8 (data, repl_nl);
 804       u = vceqq_u8 (data, repl_cr);
 805       v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
 806       w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
 807       t = vorrq_u8 (v, w);
 808       t = vandq_u8 (t, xmask);
 809       m = vpaddlq_u8 (t);
 810       m = vshlq_u16 (m, shift);
 811       found = vaddvq_u16 (m);
 812       found &= mask;
 813       if (found)
 814         return (const uchar*)p + __builtin_ctz (found);
 815     }
 816   else
 817     {
 818       data = vld1q_u8 ((const uint8_t *) s);
 819       t = vceqq_u8 (data, repl_nl);
 820       u = vceqq_u8 (data, repl_cr);
 821       v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
 822       w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
 823       t = vorrq_u8 (v, w);
 824       if (__builtin_expect (vpaddd_u64 ((uint64x2_t)t), 0))
 825         goto done;
 826     }
 827
 828   do
 829     {
 830       p += 16;
 831       data = vld1q_u8 (p);
 832       t = vceqq_u8 (data, repl_nl);
 833       u = vceqq_u8 (data, repl_cr);
 834       v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
 835       w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
 836       t = vorrq_u8 (v, w);
 837     } while (!vpaddd_u64 ((uint64x2_t)t));
 838
 839 done:
 840   /* Now that we've found the terminating substring, work out precisely where
 841      we need to stop.  */
 842   t = vandq_u8 (t, xmask);
 843   m = vpaddlq_u8 (t);
 844   m = vshlq_u16 (m, shift);
 845   found = vaddvq_u16 (m);
 846   return (((((uintptr_t) p) < (uintptr_t) s) ? s : (const uchar *)p)
 847           + __builtin_ctz (found));
 848 }
 849
 850 #elif defined (__ARM_NEON)
 851 #include "arm_neon.h"
 852
/* A version of the fast scanner using NEON byte compares.

   Starting at S, return a pointer to the first \n, \r, \\ or ?.  END is
   unused; the loop only terminates when a match is found.  */
static const uchar *
search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
{
  const uint8x16_t repl_nl = vdupq_n_u8 ('\n');
  const uint8x16_t repl_cr = vdupq_n_u8 ('\r');
  const uint8x16_t repl_bs = vdupq_n_u8 ('\\');
  const uint8x16_t repl_qm = vdupq_n_u8 ('?');
  /* Gives every byte of an 8-byte half its own bit, so that the
     byte-wise compare result can be folded into a bit mask.  */
  const uint8x16_t xmask = (uint8x16_t) vdupq_n_u64 (0x8040201008040201ULL);

  unsigned int misalign, found, mask;
  const uint8_t *p;
  uint8x16_t data;

  /* Align the source pointer.  */
  misalign = (uintptr_t)s & 15;
  p = (const uint8_t *)((uintptr_t)s & -16);
  data = vld1q_u8 (p);

  /* Create a mask for the bytes that are valid within the first
     16-byte block.  The Idea here is that the AND with the mask
     within the loop is "free", since we need some AND or TEST
     insn in order to set the flags for the branch anyway.  */
  mask = (-1u << misalign) & 0xffff;

  /* Main loop, processing 16 bytes at a time.  The first block has
     already been loaded and has its own mask, so jump past the load.  */
  goto start;

  do
    {
      uint8x8_t l;
      uint16x4_t m;
      uint32x2_t n;
      uint8x16_t t, u, v, w;

      p += 16;
      data = vld1q_u8 (p);
      mask = 0xffff;

    start:
      t = vceqq_u8 (data, repl_nl);
      u = vceqq_u8 (data, repl_cr);
      v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
      w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
      t = vandq_u8 (vorrq_u8 (v, w), xmask);
      /* Fold the per-byte bits with pairwise additions until each
         8-byte half has been reduced to one 8-bit mask.  */
      l = vpadd_u8 (vget_low_u8 (t), vget_high_u8 (t));
      m = vpaddl_u8 (l);
      n = vpaddl_u16 (m);

      /* Combine the two 8-bit masks into the low 16 bits of FOUND.  */
      found = vget_lane_u32 ((uint32x2_t) vorr_u64 ((uint64x1_t) n,
              vshr_n_u64 ((uint64x1_t) n, 24)), 0);
      found &= mask;
    }
  while (!found);

  /* FOUND contains 1 in bits for which we matched a relevant
     character.  Conversion to the byte index is trivial.  */
  found = __builtin_ctz (found);
  return (const uchar *)p + found;
}
 912
 913 #else
 914
/* We only have one accelerated alternative.  Use a direct call so that
   we encourage inlining.  */
 917
 918 #define search_line_fast  search_line_acc_char
 919
 920 #endif
 921
 922 /* Initialize the lexer if needed.  */
 923
 924 void
 925 _cpp_init_lexer (void)
 926 {
 927 #ifdef HAVE_init_vectorized_lexer
 928   init_vectorized_lexer ();
 929 #endif
 930 }
 931
/* Returns with a logical line that contains no escaped newlines or
   trigraphs.  This is a time-critical inner loop.

   The line starts at buffer->next_line.  On entry the note list is
   reset and buffer->cur / buffer->line_base are set to the start of
   the line.  On exit the cleaned line is terminated by a '\n' written
   at the end of the cleaned text, a sentinel line note has been added
   just past it, and buffer->next_line points one past the last source
   character consumed.

   Within the function S is the read cursor and D the write cursor.
   As long as nothing has to be rewritten (the "fast path") only S
   moves and the buffer is not modified.  Once a trigraph is converted
   or a backslash-newline is spliced, the "slow path" copies each
   character from S to D.

   Buffers marked from_stage3 skip all trigraph and splice processing;
   only the end of line is located.  */
void
_cpp_clean_line (cpp_reader *pfile)
{
  cpp_buffer *buffer;
  const uchar *s;
  uchar c, *d, *p;

  buffer = pfile->buffer;
  /* Discard the notes of the previous line and start a fresh one.  */
  buffer->cur_note = buffer->notes_used = 0;
  buffer->cur = buffer->line_base = buffer->next_line;
  buffer->need_line = false;
  s = buffer->next_line;

  if (!buffer->from_stage3)
    {
      /* Location of the most recent backslash seen on the fast path,
         or NULL if there has been none.  */
      const uchar *pbackslash = NULL;

      /* Fast path.  This is the common case of an un-escaped line with
         no trigraphs.  The primary win here is by not writing any
         data back to memory until we have to.  */
      while (1)
        {
          /* Perform an optimized search for \n, \r, \\, ?.  */
          s = search_line_fast (s, buffer->rlimit);

          c = *s;
          if (c == '\\')
            {
              /* Record the location of the backslash and continue.  */
              pbackslash = s++;
            }
          else if (__builtin_expect (c == '?', 0))
            {
              if (__builtin_expect (s[1] == '?', false)
                   && _cpp_trigraph_map[s[2]])
                {
                  /* Have a trigraph.  We may or may not have to convert
                     it.  Add a line note regardless, for -Wtrigraphs.  */
                  add_line_note (buffer, s, s[2]);
                  if (CPP_OPTION (pfile, trigraphs))
                    {
                      /* We do, and that means we have to switch to the
                         slow path.  The replacement character is stored
                         over the first '?'; S is advanced to the third
                         character of the trigraph so that the slow
                         path's pre-increment resumes just after it.  */
                      d = (uchar *) s;
                      *d = _cpp_trigraph_map[s[2]];
                      s += 2;
                      goto slow_path;
                    }
                }
              /* Not a trigraph.  Continue on fast-path.  */
              s++;
            }
          else
            break;
        }

      /* This must be \r or \n.  We're either done, or we'll be forced
         to write back to the buffer and continue on the slow path.  */
      d = (uchar *) s;

      /* Stopping exactly at rlimit means there is nothing further to
         examine.  NOTE(review): presumably a terminating newline is
         stored at rlimit by the buffer setup code -- confirm.  */
      if (__builtin_expect (s == buffer->rlimit, false))
        goto done;

      /* DOS line ending? */
      if (__builtin_expect (c == '\r', false) && s[1] == '\n')
        {
          s++;
          if (s == buffer->rlimit)
            goto done;
        }

      /* Without any backslash on the line the newline cannot be
         escaped.  */
      if (__builtin_expect (pbackslash == NULL, true))
        goto done;

      /* Check for escaped newline.  Step back over any non-vertical
         whitespace preceding the newline; the newline is escaped only
         if the character before that whitespace is the last backslash
         recorded above.  */
      p = d;
      while (is_nvspace (p[-1]))
        p--;
      if (p - 1 != pbackslash)
        goto done;

      /* Have an escaped newline; process it and proceed to
         the slow path.  The note type records whether whitespace
         separated the backslash from the newline (' ') or not ('\\').
         D is set two before P because the slow path pre-increments it,
         so the next character copied lands on the backslash.
         buffer->next_line is used from here on as the lower bound for
         the backward whitespace scans below; it receives its final
         value at DONE.  */
      add_line_note (buffer, p - 1, p != d ? ' ' : '\\');
      d = p - 2;
      buffer->next_line = p - 1;

    slow_path:
      while (1)
        {
          /* Copy the next source character to the output position.  */
          c = *++s;
          *++d = c;

          if (c == '\n' || c == '\r')
            {
              /* Handle DOS line endings.  */
              if (c == '\r' && s != buffer->rlimit && s[1] == '\n')
                s++;
              if (s == buffer->rlimit)
                break;

              /* Escaped?  Scan back over whitespace in the cleaned
                 text, but never past the position of the previous
                 splice.  */
              p = d;
              while (p != buffer->next_line && is_nvspace (p[-1]))
                p--;
              if (p == buffer->next_line || p[-1] != '\\')
                break;

              /* Splice: later output overwrites the backslash and
                 anything after it.  */
              add_line_note (buffer, p - 1, p != d ? ' ': '\\');
              d = p - 2;
              buffer->next_line = p - 1;
            }
          else if (c == '?' && s[1] == '?' && _cpp_trigraph_map[s[2]])
            {
              /* Add a note regardless, for the benefit of -Wtrigraphs.  */
              add_line_note (buffer, d, s[2]);
              if (CPP_OPTION (pfile, trigraphs))
                {
                  /* Replace the '?' just copied with the converted
                     character and skip the rest of the trigraph.  */
                  *d = _cpp_trigraph_map[s[2]];
                  s += 2;
                }
            }
        }
    }
  else
    {
      /* No trigraph or splice processing for this buffer; just find
         the end of the line.  */
      while (*s != '\n' && *s != '\r')
        s++;
      d = (uchar *) s;

      /* Handle DOS line endings.  */
      if (*s == '\r' && s != buffer->rlimit && s[1] == '\n')
        s++;
    }

 done:
  /* Terminate the cleaned line; this also turns a '\r' into '\n'.  */
  *d = '\n';
  /* A sentinel note that should never be processed.  */
  add_line_note (buffer, d + 1, '\n');
  buffer->next_line = s + 1;
}
1075
1076 /* Return true if the trigraph indicated by NOTE should be warned
1077    about in a comment.  */
1078 static bool
1079 warn_in_comment (cpp_reader *pfile, _cpp_line_note *note)
1080 {
1081   const uchar *p;
1082
1083   /* Within comments we don't warn about trigraphs, unless the
1084      trigraph forms an escaped newline, as that may change
1085      behavior.  */
1086   if (note->type != '/')
1087     return false;
1088
1089   /* If -trigraphs, then this was an escaped newline iff the next note
1090      is coincident.  */
1091   if (CPP_OPTION (pfile, trigraphs))
1092     return note[1].pos == note->pos;
1093
1094   /* Otherwise, see if this forms an escaped newline.  */
1095   p = note->pos + 3;
1096   while (is_nvspace (*p))
1097     p++;
1098
1099   /* There might have been escaped newlines between the trigraph and the
1100      newline we found.  Hence the position test.  */
1101   return (*p == '\n' && p < note[1].pos);
1102 }
1103
1104 /* Process the notes created by add_line_note as far as the current
1105    location.  */
1106 void
1107 _cpp_process_line_notes (cpp_reader *pfile, int in_comment)
1108 {
1109   cpp_buffer *buffer = pfile->buffer;
1110
1111   for (;;)
1112     {
1113       _cpp_line_note *note = &buffer->notes[buffer->cur_note];
1114       unsigned int col;
1115
1116       if (note->pos > buffer->cur)
1117         break;
1118
1119       buffer->cur_note++;
1120       col = CPP_BUF_COLUMN (buffer, note->pos + 1);
1121
1122       if (note->type == '\\' || note->type == ' ')
1123         {
1124           if (note->type == ' ' && !in_comment)
1125             cpp_error_with_line (pfile, CPP_DL_WARNING, pfile->line_table->highest_line, col,
1126                                  "backslash and newline separated by space");
1127
1128           if (buffer->next_line > buffer->rlimit)
1129             {
1130               cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line, col,
1131                                    "backslash-newline at end of file");
1132               /* Prevent "no newline at end of file" warning.  */
1133               buffer->next_line = buffer->rlimit;
1134             }
1135
1136           buffer->line_base = note->pos;
1137           CPP_INCREMENT_LINE (pfile, 0);
1138         }
1139       else if (_cpp_trigraph_map[note->type])
1140         {
1141           if (CPP_OPTION (pfile, warn_trigraphs)
1142               && (!in_comment || warn_in_comment (pfile, note)))
1143             {
1144               if (CPP_OPTION (pfile, trigraphs))
1145                 cpp_warning_with_line (pfile, CPP_W_TRIGRAPHS,
1146                                        pfile->line_table->highest_line, col,
1147                                        "trigraph ??%c converted to %c",
1148                                        note->type,
1149                                        (int) _cpp_trigraph_map[note->type]);
1150               else
1151                 {
1152                   cpp_warning_with_line
1153                     (pfile, CPP_W_TRIGRAPHS,
1154                      pfile->line_table->highest_line, col,
1155                      "trigraph ??%c ignored, use -trigraphs to enable",
1156                      note->type);
1157                 }
1158             }
1159         }
1160       else if (note->type == 0)
1161         /* Already processed in lex_raw_string.  */;
1162       else
1163         abort ();
1164     }
1165 }
1166
1167 /* Skip a C-style block comment.  We find the end of the comment by
1168    seeing if an asterisk is before every '/' we encounter.  Returns
1169    nonzero if comment terminated by EOF, zero otherwise.
1170
1171    Buffer->cur points to the initial asterisk of the comment.  */
1172 bool
1173 _cpp_skip_block_comment (cpp_reader *pfile)
1174 {
1175   cpp_buffer *buffer = pfile->buffer;
1176   const uchar *cur = buffer->cur;
1177   uchar c;
1178
1179   cur++;
1180   if (*cur == '/')
1181     cur++;
1182
1183   for (;;)
1184     {
1185       /* People like decorating comments with '*', so check for '/'
1186          instead for efficiency.  */
1187       c = *cur++;
1188
1189       if (c == '/')
1190         {
1191           if (cur[-2] == '*')
1192             break;
1193
1194           /* Warn about potential nested comments, but not if the '/'
1195              comes immediately before the true comment delimiter.
1196              Don't bother to get it right across escaped newlines.  */
1197           if (CPP_OPTION (pfile, warn_comments)
1198               && cur[0] == '*' && cur[1] != '/')
1199             {
1200               buffer->cur = cur;
1201               cpp_warning_with_line (pfile, CPP_W_COMMENTS,
1202                                      pfile->line_table->highest_line,
1203                                      CPP_BUF_COL (buffer),
1204                                      "\"/*\" within comment");
1205             }
1206         }
1207       else if (c == '\n')
1208         {
1209           unsigned int cols;
1210           buffer->cur = cur - 1;
1211           _cpp_process_line_notes (pfile, true);
1212           if (buffer->next_line >= buffer->rlimit)
1213             return true;
1214           _cpp_clean_line (pfile);
1215
1216           cols = buffer->next_line - buffer->line_base;
1217           CPP_INCREMENT_LINE (pfile, cols);
1218
1219           cur = buffer->cur;
1220         }
1221     }
1222
1223   buffer->cur = cur;
1224   _cpp_process_line_notes (pfile, true);
1225   return false;
1226 }
1227
1228 /* Skip a C++ line comment, leaving buffer->cur pointing to the
1229    terminating newline.  Handles escaped newlines.  Returns nonzero
1230    if a multiline comment.  */
1231 static int
1232 skip_line_comment (cpp_reader *pfile)
1233 {
1234   cpp_buffer *buffer = pfile->buffer;
1235   source_location orig_line = pfile->line_table->highest_line;
1236
1237   while (*buffer->cur != '\n')
1238     buffer->cur++;
1239
1240   _cpp_process_line_notes (pfile, true);
1241   return orig_line != pfile->line_table->highest_line;
1242 }
1243
1244 /* Skips whitespace, saving the next non-whitespace character.  */
1245 static void
1246 skip_whitespace (cpp_reader *pfile, cppchar_t c)
1247 {
1248   cpp_buffer *buffer = pfile->buffer;
1249   bool saw_NUL = false;
1250
1251   do
1252     {
1253       /* Horizontal space always OK.  */
1254       if (c == ' ' || c == '\t')
1255         ;
1256       /* Just \f \v or \0 left.  */
1257       else if (c == '\0')
1258         saw_NUL = true;
1259       else if (pfile->state.in_directive && CPP_PEDANTIC (pfile))
1260         cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line,
1261                              CPP_BUF_COL (buffer),
1262                              "%s in preprocessing directive",
1263                              c == '\f' ? "form feed" : "vertical tab");
1264
1265       c = *buffer->cur++;
1266     }
1267   /* We only want non-vertical space, i.e. ' ' \t \f \v \0.  */
1268   while (is_nvspace (c));
1269
1270   if (saw_NUL)
1271     cpp_error (pfile, CPP_DL_WARNING, "null character(s) ignored");
1272
1273   buffer->cur--;
1274 }
1275
1276 /* See if the characters of a number token are valid in a name (no
1277    '.', '+' or '-').  */
1278 static int
1279 name_p (cpp_reader *pfile, const cpp_string *string)
1280 {
1281   unsigned int i;
1282
1283   for (i = 0; i < string->len; i++)
1284     if (!is_idchar (string->text[i]))
1285       return 0;
1286
1287   return 1;
1288 }
1289
1290 /* After parsing an identifier or other sequence, produce a warning about
1291    sequences not in NFC/NFKC.  */
1292 static void
1293 warn_about_normalization (cpp_reader *pfile,
1294                           const cpp_token *token,
1295                           const struct normalize_state *s)
1296 {
1297   if (CPP_OPTION (pfile, warn_normalize) < NORMALIZE_STATE_RESULT (s)
1298       && !pfile->state.skipping)
1299     {
1300       /* Make sure that the token is printed using UCNs, even
1301          if we'd otherwise happily print UTF-8.  */
1302       unsigned char *buf = XNEWVEC (unsigned char, cpp_token_len (token));
1303       size_t sz;
1304
1305       sz = cpp_spell_token (pfile, token, buf, false) - buf;
1306       if (NORMALIZE_STATE_RESULT (s) == normalized_C)
1307         cpp_warning_with_line (pfile, CPP_W_NORMALIZE, token->src_loc, 0,
1308                                "`%.*s' is not in NFKC", (int) sz, buf);
1309       else
1310         cpp_warning_with_line (pfile, CPP_W_NORMALIZE, token->src_loc, 0,
1311                                "`%.*s' is not in NFC", (int) sz, buf);
1312       free (buf);
1313     }
1314 }
1315
1316 /* Returns TRUE if the sequence starting at buffer->cur is invalid in
1317    an identifier.  FIRST is TRUE if this starts an identifier.  */
1318 static bool
1319 forms_identifier_p (cpp_reader *pfile, int first,
1320                     struct normalize_state *state)
1321 {
1322   cpp_buffer *buffer = pfile->buffer;
1323
1324   if (*buffer->cur == '$')
1325     {
1326       if (!CPP_OPTION (pfile, dollars_in_ident))
1327         return false;
1328
1329       buffer->cur++;
1330       if (CPP_OPTION (pfile, warn_dollars) && !pfile->state.skipping)
1331         {
1332           CPP_OPTION (pfile, warn_dollars) = 0;
1333           cpp_error (pfile, CPP_DL_PEDWARN, "'$' in identifier or number");
1334         }
1335
1336       return true;
1337     }
1338
1339   /* Is this a syntactically valid UCN?  */
1340   if (CPP_OPTION (pfile, extended_identifiers)
1341       && *buffer->cur == '\\'
1342       && (buffer->cur[1] == 'u' || buffer->cur[1] == 'U'))
1343     {
1344       cppchar_t s;
1345       buffer->cur += 2;
1346       if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
1347                           state, &s, NULL, NULL))
1348         return true;
1349       buffer->cur -= 2;
1350     }
1351
1352   return false;
1353 }
1354
1355 /* Helper function to get the cpp_hashnode of the identifier BASE.  */
1356 static cpp_hashnode *
1357 lex_identifier_intern (cpp_reader *pfile, const uchar *base)
1358 {
1359   cpp_hashnode *result;
1360   const uchar *cur;
1361   unsigned int len;
1362   unsigned int hash = HT_HASHSTEP (0, *base);
1363
1364   cur = base + 1;
1365   while (ISIDNUM (*cur))
1366     {
1367       hash = HT_HASHSTEP (hash, *cur);
1368       cur++;
1369     }
1370   len = cur - base;
1371   hash = HT_HASHFINISH (hash, len);
1372   result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
1373                                               base, len, hash, HT_ALLOC));
1374
1375   /* Rarely, identifiers require diagnostics when lexed.  */
1376   if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
1377                         && !pfile->state.skipping, 0))
1378     {
1379       /* It is allowed to poison the same identifier twice.  */
1380       if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
1381         cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
1382                    NODE_NAME (result));
1383
1384       /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
1385          replacement list of a variadic macro.  */
1386       if (result == pfile->spec_nodes.n__VA_ARGS__
1387           && !pfile->state.va_args_ok)
1388         {
1389           if (CPP_OPTION (pfile, cplusplus))
1390             cpp_error (pfile, CPP_DL_PEDWARN,
1391                        "__VA_ARGS__ can only appear in the expansion"
1392                        " of a C++11 variadic macro");
1393           else
1394             cpp_error (pfile, CPP_DL_PEDWARN,
1395                        "__VA_ARGS__ can only appear in the expansion"
1396                        " of a C99 variadic macro");
1397         }
1398
1399       /* For -Wc++-compat, warn about use of C++ named operators.  */
1400       if (result->flags & NODE_WARN_OPERATOR)
1401         cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
1402                      "identifier \"%s\" is a special operator name in C++",
1403                      NODE_NAME (result));
1404     }
1405
1406   return result;
1407 }
1408
1409 /* Get the cpp_hashnode of an identifier specified by NAME in
1410    the current cpp_reader object.  If none is found, NULL is returned.  */
1411 cpp_hashnode *
1412 _cpp_lex_identifier (cpp_reader *pfile, const char *name)
1413 {
1414   cpp_hashnode *result;
1415   result = lex_identifier_intern (pfile, (uchar *) name);
1416   return result;
1417 }
1418
1419 /* Lex an identifier starting at BUFFER->CUR - 1.  */
1420 static cpp_hashnode *
1421 lex_identifier (cpp_reader *pfile, const uchar *base, bool starts_ucn,
1422                 struct normalize_state *nst, cpp_hashnode **spelling)
1423 {
1424   cpp_hashnode *result;
1425   const uchar *cur;
1426   unsigned int len;
1427   unsigned int hash = HT_HASHSTEP (0, *base);
1428
1429   cur = pfile->buffer->cur;
1430   if (! starts_ucn)
1431     {
1432       while (ISIDNUM (*cur))
1433         {
1434           hash = HT_HASHSTEP (hash, *cur);
1435           cur++;
1436         }
1437       NORMALIZE_STATE_UPDATE_IDNUM (nst, *(cur - 1));
1438     }
1439   pfile->buffer->cur = cur;
1440   if (starts_ucn || forms_identifier_p (pfile, false, nst))
1441     {
1442       /* Slower version for identifiers containing UCNs (or $).  */
1443       do {
1444         while (ISIDNUM (*pfile->buffer->cur))
1445           {
1446             NORMALIZE_STATE_UPDATE_IDNUM (nst, *pfile->buffer->cur);
1447             pfile->buffer->cur++;
1448           }
1449       } while (forms_identifier_p (pfile, false, nst));
1450       result = _cpp_interpret_identifier (pfile, base,
1451                                           pfile->buffer->cur - base);
1452       *spelling = cpp_lookup (pfile, base, pfile->buffer->cur - base);
1453     }
1454   else
1455     {
1456       len = cur - base;
1457       hash = HT_HASHFINISH (hash, len);
1458
1459       result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
1460                                                   base, len, hash, HT_ALLOC));
1461       *spelling = result;
1462     }
1463
1464   /* Rarely, identifiers require diagnostics when lexed.  */
1465   if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
1466                         && !pfile->state.skipping, 0))
1467     {
1468       /* It is allowed to poison the same identifier twice.  */
1469       if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
1470         cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
1471                    NODE_NAME (result));
1472
1473       /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
1474          replacement list of a variadic macro.  */
1475       if (result == pfile->spec_nodes.n__VA_ARGS__
1476           && !pfile->state.va_args_ok)
1477         {
1478           if (CPP_OPTION (pfile, cplusplus))
1479             cpp_error (pfile, CPP_DL_PEDWARN,
1480                        "__VA_ARGS__ can only appear in the expansion"
1481                        " of a C++11 variadic macro");
1482           else
1483             cpp_error (pfile, CPP_DL_PEDWARN,
1484                        "__VA_ARGS__ can only appear in the expansion"
1485                        " of a C99 variadic macro");
1486         }
1487
1488       /* For -Wc++-compat, warn about use of C++ named operators.  */
1489       if (result->flags & NODE_WARN_OPERATOR)
1490         cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
1491                      "identifier \"%s\" is a special operator name in C++",
1492                      NODE_NAME (result));
1493     }
1494
1495   return result;
1496 }
1497
1498 /* Lex a number to NUMBER starting at BUFFER->CUR - 1.  */
1499 static void
1500 lex_number (cpp_reader *pfile, cpp_string *number,
1501             struct normalize_state *nst)
1502 {
1503   const uchar *cur;
1504   const uchar *base;
1505   uchar *dest;
1506
1507   base = pfile->buffer->cur - 1;
1508   do
1509     {
1510       cur = pfile->buffer->cur;
1511
1512       /* N.B. ISIDNUM does not include $.  */
1513       while (ISIDNUM (*cur) || *cur == '.' || DIGIT_SEP (*cur)
1514              || VALID_SIGN (*cur, cur[-1]))
1515         {
1516           NORMALIZE_STATE_UPDATE_IDNUM (nst, *cur);
1517           cur++;
1518         }
1519       /* A number can't end with a digit separator.  */
1520       while (cur > pfile->buffer->cur && DIGIT_SEP (cur[-1]))
1521         --cur;
1522
1523       pfile->buffer->cur = cur;
1524     }
1525   while (forms_identifier_p (pfile, false, nst));
1526
1527   number->len = cur - base;
1528   dest = _cpp_unaligned_alloc (pfile, number->len + 1);
1529   memcpy (dest, base, number->len);
1530   dest[number->len] = '\0';
1531   number->text = dest;
1532 }
1533
1534 /* Create a token of type TYPE with a literal spelling.  */
1535 static void
1536 create_literal (cpp_reader *pfile, cpp_token *token, const uchar *base,
1537                 unsigned int len, enum cpp_ttype type)
1538 {
1539   uchar *dest = _cpp_unaligned_alloc (pfile, len + 1);
1540
1541   memcpy (dest, base, len);
1542   dest[len] = '\0';
1543   token->type = type;
1544   token->val.str.len = len;
1545   token->val.str.text = dest;
1546 }
1547
1548 /* Subroutine of lex_raw_string: Append LEN chars from BASE to the buffer
1549    sequence from *FIRST_BUFF_P to LAST_BUFF_P.  */
1550
1551 static void
1552 bufring_append (cpp_reader *pfile, const uchar *base, size_t len,
1553                 _cpp_buff **first_buff_p, _cpp_buff **last_buff_p)
1554 {
1555   _cpp_buff *first_buff = *first_buff_p;
1556   _cpp_buff *last_buff = *last_buff_p;
1557
1558   if (first_buff == NULL)
1559     first_buff = last_buff = _cpp_get_buff (pfile, len);
1560   else if (len > BUFF_ROOM (last_buff))
1561     {
1562       size_t room = BUFF_ROOM (last_buff);
1563       memcpy (BUFF_FRONT (last_buff), base, room);
1564       BUFF_FRONT (last_buff) += room;
1565       base += room;
1566       len -= room;
1567       last_buff = _cpp_append_extend_buff (pfile, last_buff, len);
1568     }
1569
1570   memcpy (BUFF_FRONT (last_buff), base, len);
1571   BUFF_FRONT (last_buff) += len;
1572
1573   *first_buff_p = first_buff;
1574   *last_buff_p = last_buff;
1575 }
1576
1577
1578 /* Returns true if a macro has been defined.
1579    This might not work if compile with -save-temps,
1580    or preprocess separately from compilation.  */
1581
1582 static bool
1583 is_macro(cpp_reader *pfile, const uchar *base)
1584 {
1585   const uchar *cur = base;
1586   if (! ISIDST (*cur))
1587     return false;
1588   unsigned int hash = HT_HASHSTEP (0, *cur);
1589   ++cur;
1590   while (ISIDNUM (*cur))
1591     {
1592       hash = HT_HASHSTEP (hash, *cur);
1593       ++cur;
1594     }
1595   hash = HT_HASHFINISH (hash, cur - base);
1596
1597   cpp_hashnode *result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
1598                                         base, cur - base, hash, HT_NO_INSERT));
1599
1600   return !result ? false : (result->type == NT_MACRO);
1601 }
1602
1603
1604 /* Lexes a raw string.  The stored string contains the spelling, including
1605    double quotes, delimiter string, '(' and ')', any leading
1606    'L', 'u', 'U' or 'u8' and 'R' modifier.  It returns the type of the
1607    literal, or CPP_OTHER if it was not properly terminated.
1608
1609    The spelling is NUL-terminated, but it is not guaranteed that this
1610    is the first NUL since embedded NULs are preserved.  */
1611
1612 static void
1613 lex_raw_string (cpp_reader *pfile, cpp_token *token, const uchar *base,
1614                 const uchar *cur)
1615 {
1616   uchar raw_prefix[17];
1617   uchar temp_buffer[18];
1618   const uchar *orig_base;
1619   unsigned int raw_prefix_len = 0, raw_suffix_len = 0;
1620   enum raw_str_phase { RAW_STR_PREFIX, RAW_STR, RAW_STR_SUFFIX };
1621   raw_str_phase phase = RAW_STR_PREFIX;
1622   enum cpp_ttype type;
1623   size_t total_len = 0;
1624   /* Index into temp_buffer during phases other than RAW_STR,
1625      during RAW_STR phase 17 to tell BUF_APPEND that nothing should
1626      be appended to temp_buffer.  */
1627   size_t temp_buffer_len = 0;
1628   _cpp_buff *first_buff = NULL, *last_buff = NULL;
1629   size_t raw_prefix_start;
1630   _cpp_line_note *note = &pfile->buffer->notes[pfile->buffer->cur_note];
1631
1632   type = (*base == 'L' ? CPP_WSTRING :
1633           *base == 'U' ? CPP_STRING32 :
1634           *base == 'u' ? (base[1] == '8' ? CPP_UTF8STRING : CPP_STRING16)
1635           : CPP_STRING);
1636
1637 #define BUF_APPEND(STR,LEN)                                     \
1638       do {                                                      \
1639         bufring_append (pfile, (const uchar *)(STR), (LEN),     \
1640                         &first_buff, &last_buff);               \
1641         total_len += (LEN);                                     \
1642         if (__builtin_expect (temp_buffer_len < 17, 0)          \
1643             && (const uchar *)(STR) != base                     \
1644             && (LEN) <= 2)                                      \
1645           {                                                     \
1646             memcpy (temp_buffer + temp_buffer_len,              \
1647                     (const uchar *)(STR), (LEN));               \
1648             temp_buffer_len += (LEN);                           \
1649           }                                                     \
1650       } while (0);
1651
1652   orig_base = base;
1653   ++cur;
1654   raw_prefix_start = cur - base;
1655   for (;;)
1656     {
1657       cppchar_t c;
1658
1659       /* If we previously performed any trigraph or line splicing
1660          transformations, undo them in between the opening and closing
1661          double quote.  */
1662       while (note->pos < cur)
1663         ++note;
1664       for (; note->pos == cur; ++note)
1665         {
1666           switch (note->type)
1667             {
1668             case '\\':
1669             case ' ':
1670               /* Restore backslash followed by newline.  */
1671               BUF_APPEND (base, cur - base);
1672               base = cur;
1673               BUF_APPEND ("\\", 1);
1674             after_backslash:
1675               if (note->type == ' ')
1676                 {
1677                   /* GNU backslash whitespace newline extension.  FIXME
1678                      could be any sequence of non-vertical space.  When we
1679                      can properly restore any such sequence, we should mark
1680                      this note as handled so _cpp_process_line_notes
1681                      doesn't warn.  */
1682                   BUF_APPEND (" ", 1);
1683                 }
1684
1685               BUF_APPEND ("\n", 1);
1686               break;
1687
1688             case 0:
1689               /* Already handled.  */
1690               break;
1691
1692             default:
1693               if (_cpp_trigraph_map[note->type])
1694                 {
1695                   /* Don't warn about this trigraph in
1696                      _cpp_process_line_notes, since trigraphs show up as
1697                      trigraphs in raw strings.  */
1698                   uchar type = note->type;
1699                   note->type = 0;
1700
1701                   if (!CPP_OPTION (pfile, trigraphs))
1702                     /* If we didn't convert the trigraph in the first
1703                        place, don't do anything now either.  */
1704                     break;
1705
1706                   BUF_APPEND (base, cur - base);
1707                   base = cur;
1708                   BUF_APPEND ("??", 2);
1709
1710                   /* ??/ followed by newline gets two line notes, one for
1711                      the trigraph and one for the backslash/newline.  */
1712                   if (type == '/' && note[1].pos == cur)
1713                     {
1714                       if (note[1].type != '\\'
1715                           && note[1].type != ' ')
1716                         abort ();
1717                       BUF_APPEND ("/", 1);
1718                       ++note;
1719                       goto after_backslash;
1720                     }
1721                   else
1722                     {
1723                       /* Skip the replacement character.  */
1724                       base = ++cur;
1725                       BUF_APPEND (&type, 1);
1726                       c = type;
1727                       goto check_c;
1728                     }
1729                 }
1730               else
1731                 abort ();
1732               break;
1733             }
1734         }
1735       c = *cur++;
1736       if (__builtin_expect (temp_buffer_len < 17, 0))
1737         temp_buffer[temp_buffer_len++] = c;
1738
1739      check_c:
1740       if (phase == RAW_STR_PREFIX)
1741         {
1742           while (raw_prefix_len < temp_buffer_len)
1743             {
1744               raw_prefix[raw_prefix_len] = temp_buffer[raw_prefix_len];
1745               switch (raw_prefix[raw_prefix_len])
1746                 {
1747                 case ' ': case '(': case ')': case '\\': case '\t':
1748                 case '\v': case '\f': case '\n': default:
1749                   break;
1750                 /* Basic source charset except the above chars.  */
1751                 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
1752                 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
1753                 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
1754                 case 's': case 't': case 'u': case 'v': case 'w': case 'x':
1755                 case 'y': case 'z':
1756                 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
1757                 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
1758                 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
1759                 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
1760                 case 'Y': case 'Z':
1761                 case '0': case '1': case '2': case '3': case '4': case '5':
1762                 case '6': case '7': case '8': case '9':
1763                 case '_': case '{': case '}': case '#': case '[': case ']':
1764                 case '<': case '>': case '%': case ':': case ';': case '.':
1765                 case '?': case '*': case '+': case '-': case '/': case '^':
1766                 case '&': case '|': case '~': case '!': case '=': case ',':
1767                 case '"': case '\'':
1768                   if (raw_prefix_len < 16)
1769                     {
1770                       raw_prefix_len++;
1771                       continue;
1772                     }
1773                   break;
1774                 }
1775
1776               if (raw_prefix[raw_prefix_len] != '(')
1777                 {
1778                   int col = CPP_BUF_COLUMN (pfile->buffer, cur) + 1;
1779                   if (raw_prefix_len == 16)
1780                     cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
1781                                          col, "raw string delimiter longer "
1782                                               "than 16 characters");
1783                   else if (raw_prefix[raw_prefix_len] == '\n')
1784                     cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
1785                                          col, "invalid new-line in raw "
1786                                               "string delimiter");
1787                   else
1788                     cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
1789                                          col, "invalid character '%c' in "
1790                                               "raw string delimiter",
1791                                          (int) raw_prefix[raw_prefix_len]);
1792                   pfile->buffer->cur = orig_base + raw_prefix_start - 1;
1793                   create_literal (pfile, token, orig_base,
1794                                   raw_prefix_start - 1, CPP_OTHER);
1795                   if (first_buff)
1796                     _cpp_release_buff (pfile, first_buff);
1797                   return;
1798                 }
1799               raw_prefix[raw_prefix_len] = '"';
1800               phase = RAW_STR;
1801               /* Nothing should be appended to temp_buffer during
1802                  RAW_STR phase.  */
1803               temp_buffer_len = 17;
1804               break;
1805             }
1806           continue;
1807         }
1808       else if (phase == RAW_STR_SUFFIX)
1809         {
1810           while (raw_suffix_len <= raw_prefix_len
1811                  && raw_suffix_len < temp_buffer_len
1812                  && temp_buffer[raw_suffix_len] == raw_prefix[raw_suffix_len])
1813             raw_suffix_len++;
1814           if (raw_suffix_len > raw_prefix_len)
1815             break;
1816           if (raw_suffix_len == temp_buffer_len)
1817             continue;
1818           phase = RAW_STR;
1819           /* Nothing should be appended to temp_buffer during
1820              RAW_STR phase.  */
1821           temp_buffer_len = 17;
1822         }
1823       if (c == ')')
1824         {
1825           phase = RAW_STR_SUFFIX;
1826           raw_suffix_len = 0;
1827           temp_buffer_len = 0;
1828         }
1829       else if (c == '\n')
1830         {
1831           if (pfile->state.in_directive
1832               || (pfile->state.parsing_args
1833                   && pfile->buffer->next_line >= pfile->buffer->rlimit))
1834             {
1835               cur--;
1836               type = CPP_OTHER;
1837               cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, 0,
1838                                    "unterminated raw string");
1839               break;
1840             }
1841
1842           BUF_APPEND (base, cur - base);
1843
1844           if (pfile->buffer->cur < pfile->buffer->rlimit)
1845             CPP_INCREMENT_LINE (pfile, 0);
1846           pfile->buffer->need_line = true;
1847
1848           pfile->buffer->cur = cur-1;
1849           _cpp_process_line_notes (pfile, false);
1850           if (!_cpp_get_fresh_line (pfile))
1851             {
1852               source_location src_loc = token->src_loc;
1853               token->type = CPP_EOF;
1854               /* Tell the compiler the line number of the EOF token.  */
1855               token->src_loc = pfile->line_table->highest_line;
1856               token->flags = BOL;
1857               if (first_buff != NULL)
1858                 _cpp_release_buff (pfile, first_buff);
1859               cpp_error_with_line (pfile, CPP_DL_ERROR, src_loc, 0,
1860                                    "unterminated raw string");
1861               return;
1862             }
1863
1864           cur = base = pfile->buffer->cur;
1865           note = &pfile->buffer->notes[pfile->buffer->cur_note];
1866         }
1867     }
1868
1869   if (CPP_OPTION (pfile, user_literals))
1870     {
1871       /* If a string format macro, say from inttypes.h, is placed touching
1872          a string literal it could be parsed as a C++11 user-defined string
1873          literal thus breaking the program.
1874          Try to identify macros with is_macro. A warning is issued. */
1875       if (is_macro (pfile, cur))
1876         {
1877           /* Raise a warning, but do not consume subsequent tokens.  */
1878           if (CPP_OPTION (pfile, warn_literal_suffix) && !pfile->state.skipping)
1879             cpp_warning_with_line (pfile, CPP_W_LITERAL_SUFFIX,
1880                                    token->src_loc, 0,
1881                                    "invalid suffix on literal; C++11 requires "
1882                                    "a space between literal and string macro");
1883         }
1884       /* Grab user defined literal suffix.  */
1885       else if (ISIDST (*cur))
1886         {
1887           type = cpp_userdef_string_add_type (type);
1888           ++cur;
1889
1890           while (ISIDNUM (*cur))
1891             ++cur;
1892         }
1893     }
1894
1895   pfile->buffer->cur = cur;
1896   if (first_buff == NULL)
1897     create_literal (pfile, token, base, cur - base, type);
1898   else
1899     {
1900       uchar *dest = _cpp_unaligned_alloc (pfile, total_len + (cur - base) + 1);
1901
1902       token->type = type;
1903       token->val.str.len = total_len + (cur - base);
1904       token->val.str.text = dest;
1905       last_buff = first_buff;
1906       while (last_buff != NULL)
1907         {
1908           memcpy (dest, last_buff->base,
1909                   BUFF_FRONT (last_buff) - last_buff->base);
1910           dest += BUFF_FRONT (last_buff) - last_buff->base;
1911           last_buff = last_buff->next;
1912         }
1913       _cpp_release_buff (pfile, first_buff);
1914       memcpy (dest, base, cur - base);
1915       dest[cur - base] = '\0';
1916     }
1917 }
1918
1919 /* Lexes a string, character constant, or angle-bracketed header file
1920    name.  The stored string contains the spelling, including opening
1921    quote and any leading 'L', 'u', 'U' or 'u8' and optional
1922    'R' modifier.  It returns the type of the literal, or CPP_OTHER
1923    if it was not properly terminated, or CPP_LESS for an unterminated
1924    header name which must be relexed as normal tokens.
1925
1926    The spelling is NUL-terminated, but it is not guaranteed that this
1927    is the first NUL since embedded NULs are preserved.  */
1928 static void
1929 lex_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
1930 {
1931   bool saw_NUL = false;
1932   const uchar *cur;
1933   cppchar_t terminator;
1934   enum cpp_ttype type;
1935
1936   cur = base;
1937   terminator = *cur++;
1938   if (terminator == 'L' || terminator == 'U')
1939     terminator = *cur++;
1940   else if (terminator == 'u')
1941     {
1942       terminator = *cur++;
1943       if (terminator == '8')
1944         terminator = *cur++;
1945     }
1946   if (terminator == 'R')
1947     {
1948       lex_raw_string (pfile, token, base, cur);
1949       return;
1950     }
1951   if (terminator == '"')
1952     type = (*base == 'L' ? CPP_WSTRING :
1953             *base == 'U' ? CPP_STRING32 :
1954             *base == 'u' ? (base[1] == '8' ? CPP_UTF8STRING : CPP_STRING16)
1955                          : CPP_STRING);
1956   else if (terminator == '\'')
1957     type = (*base == 'L' ? CPP_WCHAR :
1958             *base == 'U' ? CPP_CHAR32 :
1959             *base == 'u' ? (base[1] == '8' ? CPP_UTF8CHAR : CPP_CHAR16)
1960                          : CPP_CHAR);
1961   else
1962     terminator = '>', type = CPP_HEADER_NAME;
1963
1964   for (;;)
1965     {
1966       cppchar_t c = *cur++;
1967
1968       /* In #include-style directives, terminators are not escapable.  */
1969       if (c == '\\' && !pfile->state.angled_headers && *cur != '\n')
1970         cur++;
1971       else if (c == terminator)
1972         break;
1973       else if (c == '\n')
1974         {
1975           cur--;
1976           /* Unmatched quotes always yield undefined behavior, but
1977              greedy lexing means that what appears to be an unterminated
1978              header name may actually be a legitimate sequence of tokens.  */
1979           if (terminator == '>')
1980             {
1981               token->type = CPP_LESS;
1982               return;
1983             }
1984           type = CPP_OTHER;
1985           break;
1986         }
1987       else if (c == '\0')
1988         saw_NUL = true;
1989     }
1990
1991   if (saw_NUL && !pfile->state.skipping)
1992     cpp_error (pfile, CPP_DL_WARNING,
1993                "null character(s) preserved in literal");
1994
1995   if (type == CPP_OTHER && CPP_OPTION (pfile, lang) != CLK_ASM)
1996     cpp_error (pfile, CPP_DL_PEDWARN, "missing terminating %c character",
1997                (int) terminator);
1998
1999   if (CPP_OPTION (pfile, user_literals))
2000     {
2001       /* If a string format macro, say from inttypes.h, is placed touching
2002          a string literal it could be parsed as a C++11 user-defined string
2003          literal thus breaking the program.
2004          Try to identify macros with is_macro. A warning is issued. */
2005       if (is_macro (pfile, cur))
2006         {
2007           /* Raise a warning, but do not consume subsequent tokens.  */
2008           if (CPP_OPTION (pfile, warn_literal_suffix) && !pfile->state.skipping)
2009             cpp_warning_with_line (pfile, CPP_W_LITERAL_SUFFIX,
2010                                    token->src_loc, 0,
2011                                    "invalid suffix on literal; C++11 requires "
2012                                    "a space between literal and string macro");
2013         }
2014       /* Grab user defined literal suffix.  */
2015       else if (ISIDST (*cur))
2016         {
2017           type = cpp_userdef_char_add_type (type);
2018           type = cpp_userdef_string_add_type (type);
2019           ++cur;
2020
2021           while (ISIDNUM (*cur))
2022             ++cur;
2023         }
2024     }
2025   else if (CPP_OPTION (pfile, cpp_warn_cxx11_compat)
2026            && is_macro (pfile, cur)
2027            && !pfile->state.skipping)
2028     cpp_warning_with_line (pfile, CPP_W_CXX11_COMPAT,
2029                            token->src_loc, 0, "C++11 requires a space "
2030                            "between string literal and macro");
2031
2032   pfile->buffer->cur = cur;
2033   create_literal (pfile, token, base, cur - base, type);
2034 }
2035
2036 /* Return the comment table. The client may not make any assumption
2037    about the ordering of the table.  */
2038 cpp_comment_table *
2039 cpp_get_comments (cpp_reader *pfile)
2040 {
2041   return &pfile->comments;
2042 }
2043
2044 /* Append a comment to the end of the comment table. */
2045 static void
2046 store_comment (cpp_reader *pfile, cpp_token *token)
2047 {
2048   int len;
2049
2050   if (pfile->comments.allocated == 0)
2051     {
2052       pfile->comments.allocated = 256;
2053       pfile->comments.entries = (cpp_comment *) xmalloc
2054         (pfile->comments.allocated * sizeof (cpp_comment));
2055     }
2056
2057   if (pfile->comments.count == pfile->comments.allocated)
2058     {
2059       pfile->comments.allocated *= 2;
2060       pfile->comments.entries = (cpp_comment *) xrealloc
2061         (pfile->comments.entries,
2062          pfile->comments.allocated * sizeof (cpp_comment));
2063     }
2064
2065   len = token->val.str.len;
2066
2067   /* Copy comment. Note, token may not be NULL terminated. */
2068   pfile->comments.entries[pfile->comments.count].comment =
2069     (char *) xmalloc (sizeof (char) * (len + 1));
2070   memcpy (pfile->comments.entries[pfile->comments.count].comment,
2071           token->val.str.text, len);
2072   pfile->comments.entries[pfile->comments.count].comment[len] = '\0';
2073
2074   /* Set source location. */
2075   pfile->comments.entries[pfile->comments.count].sloc = token->src_loc;
2076
2077   /* Increment the count of entries in the comment table. */
2078   pfile->comments.count++;
2079 }
2080
2081 /* The stored comment includes the comment start and any terminator.  */
2082 static void
2083 save_comment (cpp_reader *pfile, cpp_token *token, const unsigned char *from,
2084               cppchar_t type)
2085 {
2086   unsigned char *buffer;
2087   unsigned int len, clen, i;
2088
2089   len = pfile->buffer->cur - from + 1; /* + 1 for the initial '/'.  */
2090
2091   /* C++ comments probably (not definitely) have moved past a new
2092      line, which we don't want to save in the comment.  */
2093   if (is_vspace (pfile->buffer->cur[-1]))
2094     len--;
2095
2096   /* If we are currently in a directive or in argument parsing, then
2097      we need to store all C++ comments as C comments internally, and
2098      so we need to allocate a little extra space in that case.
2099
2100      Note that the only time we encounter a directive here is
2101      when we are saving comments in a "#define".  */
2102   clen = ((pfile->state.in_directive || pfile->state.parsing_args)
2103           && type == '/') ? len + 2 : len;
2104
2105   buffer = _cpp_unaligned_alloc (pfile, clen);
2106
2107   token->type = CPP_COMMENT;
2108   token->val.str.len = clen;
2109   token->val.str.text = buffer;
2110
2111   buffer[0] = '/';
2112   memcpy (buffer + 1, from, len - 1);
2113
2114   /* Finish conversion to a C comment, if necessary.  */
2115   if ((pfile->state.in_directive || pfile->state.parsing_args) && type == '/')
2116     {
2117       buffer[1] = '*';
2118       buffer[clen - 2] = '*';
2119       buffer[clen - 1] = '/';
2120       /* As there can be in a C++ comments illegal sequences for C comments
2121          we need to filter them out.  */
2122       for (i = 2; i < (clen - 2); i++)
2123         if (buffer[i] == '/' && (buffer[i - 1] == '*' || buffer[i + 1] == '*'))
2124           buffer[i] = '|';
2125     }
2126
2127   /* Finally store this comment for use by clients of libcpp. */
2128   store_comment (pfile, token);
2129 }
2130
2131 /* Returns true if comment at COMMENT_START is a recognized FALLTHROUGH
2132    comment.
        *COMMENT_START is compared against '*' below to tell a C block
        comment from a C++ line comment, and matching begins at the
        character after it.  PFILE->buffer->cur is the upper bound used
        by every length check (presumably it points just past the end of
        the comment -- confirm against the caller).

        The value of the cpp_warn_implicit_fallthrough option selects how
        strict the match is: 0 and any value above 4 accept nothing, 1
        accepts every comment, 2 searches the comment for a
        case-insensitive "fallthru"-like substring, and 3 and 4 require
        the whole comment to match one of the patterns listed in the
        body.  */
2133
2134 static bool
2135 fallthrough_comment_p (cpp_reader *pfile, const unsigned char *comment_start)
2136 {
       /* Scanning position; starts right after *COMMENT_START.  */
2137   const unsigned char *from = comment_start + 1;
2138
2139   switch (CPP_OPTION (pfile, cpp_warn_implicit_fallthrough))
2140     {
2141       /* For both -Wimplicit-fallthrough=0 and -Wimplicit-fallthrough=5 we
2142          don't recognize any comments.  The latter only checks attributes,
2143          the former doesn't warn.  */
2144     case 0:
2145     default:
2146       return false;
2147       /* -Wimplicit-fallthrough=1 considers any comment, no matter what
2148          content it has.  */
2149     case 1:
2150       return true;
2151     case 2:
2152       /* -Wimplicit-fallthrough=2 looks for (case insensitive)
2153          .*falls?[ \t-]*thr(u|ough).* regex.  */
           /* Note that FROM is advanced inside an iteration once "fall"
              has matched; a later `continue' still runs the loop's
              from++, so the search resumes after the text already
              consumed rather than at the next start position.  */
2154       for (; (size_t) (pfile->buffer->cur - from) >= sizeof "fallthru" - 1;
2155            from++)
2156         {
2157           /* Is there anything like strpbrk with upper boundary, or
2158              memchr looking for 2 characters rather than just one?  */
2159           if (from[0] != 'f' && from[0] != 'F')
2160             continue;
2161           if (from[1] != 'a' && from[1] != 'A')
2162             continue;
2163           if (from[2] != 'l' && from[2] != 'L')
2164             continue;
2165           if (from[3] != 'l' && from[3] != 'L')
2166             continue;
2167           from += sizeof "fall" - 1;
               /* Optional 's', then any run of blanks, tabs and dashes.  */
2168           if (from[0] == 's' || from[0] == 'S')
2169             from++;
2170           while (*from == ' ' || *from == '\t' || *from == '-')
2171             from++;
2172           if (from[0] != 't' && from[0] != 'T')
2173             continue;
2174           if (from[1] != 'h' && from[1] != 'H')
2175             continue;
2176           if (from[2] != 'r' && from[2] != 'R')
2177             continue;
               /* "thru" is enough; otherwise insist on "through".  */
2178           if (from[3] == 'u' || from[3] == 'U')
2179             return true;
2180           if (from[3] != 'o' && from[3] != 'O')
2181             continue;
2182           if (from[4] != 'u' && from[4] != 'U')
2183             continue;
2184           if (from[5] != 'g' && from[5] != 'G')
2185             continue;
2186           if (from[6] != 'h' && from[6] != 'H')
2187             continue;
2188           return true;
2189         }
2190       return false;
2191     case 3:
2192     case 4:
2193       break;
2194     }
2195
       /* Only levels 3 and 4 get here.  Each branch below matches one
          family of spellings and leaves FROM just past the matched text;
          the check at the end of the function then requires that nothing
          but the comment terminator follows.  */
2196   /* Whole comment contents:
2197      -fallthrough
2198      @fallthrough@
2199    */
2200   if (*from == '-' || *from == '@')
2201     {
2202       size_t len = sizeof "fallthrough" - 1;
2203       if ((size_t) (pfile->buffer->cur - from - 1) < len)
2204         return false;
2205       if (memcmp (from + 1, "fallthrough", len))
2206         return false;
2207       if (*from == '@')
2208         {
               /* The '@' form needs a closing '@' too; count it in LEN
                  so that it is skipped below.  */
2209           if (from[len + 1] != '@')
2210             return false;
2211           len++;
2212         }
2213       from += 1 + len;
2214     }
2215   /* Whole comment contents (regex):
2216      lint -fallthrough[ \t]*
2217    */
2218   else if (*from == 'l')
2219     {
2220       size_t len = sizeof "int -fallthrough" - 1;
2221       if ((size_t) (pfile->buffer->cur - from - 1) < len)
2222         return false;
2223       if (memcmp (from + 1, "int -fallthrough", len))
2224         return false;
2225       from += 1 + len;
2226       while (*from == ' ' || *from == '\t')
2227         from++;
2228     }
2229   /* Whole comment contents (regex):
2230      [ \t]*FALLTHR(U|OUGH)[ \t]*
2231    */
2232   else if (CPP_OPTION (pfile, cpp_warn_implicit_fallthrough) == 4)
2233     {
2234       while (*from == ' ' || *from == '\t')
2235         from++;
2236       if ((size_t) (pfile->buffer->cur - from)  < sizeof "FALLTHRU" - 1)
2237         return false;
2238       if (memcmp (from, "FALLTHR", sizeof "FALLTHR" - 1))
2239         return false;
2240       from += sizeof "FALLTHR" - 1;
2241       if (*from == 'U')
2242         from++;
2243       else if ((size_t) (pfile->buffer->cur - from)  < sizeof "OUGH" - 1)
2244         return false;
2245       else if (memcmp (from, "OUGH", sizeof "OUGH" - 1))
2246         return false;
2247       else
2248         from += sizeof "OUGH" - 1;
2249       while (*from == ' ' || *from == '\t')
2250         from++;
2251     }
2252   /* Whole comment contents (regex):
2253      [ \t.!]*(ELSE,? |INTENTIONAL(LY)? )?FALL(S | |-)?THR(OUGH|U)[ \t.!]*(-[^\n\r]*)?
2254      [ \t.!]*(Else,? |Intentional(ly)? )?Fall((s | |-)[Tt]|t)hr(ough|u)[ \t.!]*(-[^\n\r]*)?
2255      [ \t.!]*([Ee]lse,? |[Ii]ntentional(ly)? )?fall(s | |-)?thr(ough|u)[ \t.!]*(-[^\n\r]*)?
2256    */
2257   else
2258     {
2259       while (*from == ' ' || *from == '\t' || *from == '.' || *from == '!')
2260         from++;
           /* F is the first letter of the word being matched.  ALL_UPPER
              is set once an all-capitals word has been seen; from then
              on lower-case spellings are rejected.  */
2261       unsigned char f = *from;
2262       bool all_upper = false;
2263       if (f == 'E' || f == 'e')
2264         {
2265           if ((size_t) (pfile->buffer->cur - from)
2266               < sizeof "else fallthru" - 1)
2267             return false;
2268           if (f == 'E' && memcmp (from + 1, "LSE", sizeof "LSE" - 1) == 0)
2269             all_upper = true;
2270           else if (memcmp (from + 1, "lse", sizeof "lse" - 1))
2271             return false;
2272           from += sizeof "else" - 1;
2273           if (*from == ',')
2274             from++;
2275           if (*from != ' ')
2276             return false;
2277           from++;
               /* Reject mixed capitalization between the prefix word and
                  the start of "fall".  */
2278           if (all_upper && *from == 'f')
2279             return false;
2280           if (f == 'e' && *from == 'F')
2281             return false;
2282           f = *from;
2283         }
2284       else if (f == 'I' || f == 'i')
2285         {
2286           if ((size_t) (pfile->buffer->cur - from)
2287               < sizeof "intentional fallthru" - 1)
2288             return false;
2289           if (f == 'I' && memcmp (from + 1, "NTENTIONAL",
2290                                   sizeof "NTENTIONAL" - 1) == 0)
2291             all_upper = true;
2292           else if (memcmp (from + 1, "ntentional",
2293                            sizeof "ntentional" - 1))
2294             return false;
2295           from += sizeof "intentional" - 1;
2296           if (*from == ' ')
2297             {
2298               from++;
2299               if (all_upper && *from == 'f')
2300                 return false;
2301             }
2302           else if (all_upper)
2303             {
                   /* The comparison includes the following 'F', but only
                      "LY " is consumed.  */
2304               if (memcmp (from, "LY F", sizeof "LY F" - 1))
2305                 return false;
2306               from += sizeof "LY " - 1;
2307             }
2308           else
2309             {
2310               if (memcmp (from, "ly ", sizeof "ly " - 1))
2311                 return false;
2312               from += sizeof "ly " - 1;
2313             }
2314           if (f == 'i' && *from == 'F')
2315             return false;
2316           f = *from;
2317         }
2318       if (f != 'F' && f != 'f')
2319         return false;
2320       if ((size_t) (pfile->buffer->cur - from) < sizeof "fallthru" - 1)
2321         return false;
2322       if (f == 'F' && memcmp (from + 1, "ALL", sizeof "ALL" - 1) == 0)
2323         all_upper = true;
2324       else if (all_upper)
2325         return false;
2326       else if (memcmp (from + 1, "all", sizeof "all" - 1))
2327         return false;
2328       from += sizeof "fall" - 1;
           /* Optional "s ", " " or "-" between "fall" and "thr...".  */
2329       if (*from == (all_upper ? 'S' : 's') && from[1] == ' ')
2330         from += 2;
2331       else if (*from == ' ' || *from == '-')
2332         from++;
2333       else if (*from != (all_upper ? 'T' : 't'))
2334         return false;
           /* Accept 't' unless in all-upper mode, and 'T' only when the
              word started with a capital 'F'.  */
2335       if ((f == 'f' || *from != 'T') && (all_upper || *from != 't'))
2336         return false;
2337       if ((size_t) (pfile->buffer->cur - from) < sizeof "thru" - 1)
2338         return false;
2339       if (memcmp (from + 1, all_upper ? "HRU" : "hru", sizeof "hru" - 1))
2340         {
2341           if ((size_t) (pfile->buffer->cur - from) < sizeof "through" - 1)
2342             return false;
2343           if (memcmp (from + 1, all_upper ? "HROUGH" : "hrough",
2344                       sizeof "hrough" - 1))
2345             return false;
2346           from += sizeof "through" - 1;
2347         }
2348       else
2349         from += sizeof "thru" - 1;
2350       while (*from == ' ' || *from == '\t' || *from == '.' || *from == '!')
2351         from++;
           /* An optional '-' introduces free-form trailing text.  It is
              skipped up to a NUL or line end or, in a block comment, up
              to the closing star-slash; a '*' not followed by '/' is
              part of the text.  */
2352       if (*from == '-')
2353         {
2354           from++;
2355           if (*comment_start == '*')
2356             {
2357               do
2358                 {
2359                   while (*from && *from != '*'
2360                          && *from != '\n' && *from != '\r')
2361                     from++;
2362                   if (*from != '*' || from[1] == '/')
2363                     break;
2364                   from++;
2365                 }
2366               while (1);
2367             }
2368           else
2369             while (*from && *from != '\n' && *from != '\r')
2370               from++;
2371         }
2372     }
       /* Whatever is left at FROM must be exactly the comment
          terminator, otherwise the comment contains extra text.  */
2373   /* C block comment.  */
2374   if (*comment_start == '*')
2375     {
2376       if (*from != '*' || from[1] != '/')
2377         return false;
2378     }
2379   /* C++ line comment.  */
2380   else if (*from != '\n')
2381     return false;
2382
2383   return true;
2384 }
2385
2386 /* Allocate COUNT tokens for RUN.  */
2387 void
2388 _cpp_init_tokenrun (tokenrun *run, unsigned int count)
2389 {
2390   run->base = XNEWVEC (cpp_token, count);
2391   run->limit = run->base + count;
2392   run->next = NULL;
2393 }
2394
2395 /* Returns the next tokenrun, or creates one if there is none.  */
2396 static tokenrun *
2397 next_tokenrun (tokenrun *run)
2398 {
2399   if (run->next == NULL)
2400     {
2401       run->next = XNEW (tokenrun);
2402       run->next->prev = run;
2403       _cpp_init_tokenrun (run->next, 250);
2404     }
2405
2406   return run->next;
2407 }
2408
2409 /* Return the number of not yet processed token in a given
2410    context.  */
2411 int
2412 _cpp_remaining_tokens_num_in_context (cpp_context *context)
2413 {
2414   if (context->tokens_kind == TOKENS_KIND_DIRECT)
2415     return (LAST (context).token - FIRST (context).token);
2416   else if (context->tokens_kind == TOKENS_KIND_INDIRECT
2417            || context->tokens_kind == TOKENS_KIND_EXTENDED)
2418     return (LAST (context).ptoken - FIRST (context).ptoken);
2419   else
2420       abort ();
2421 }
2422
2423 /* Returns the token present at index INDEX in a given context.  If
2424    INDEX is zero, the next token to be processed is returned.  */
2425 static const cpp_token*
2426 _cpp_token_from_context_at (cpp_context *context, int index)
2427 {
2428   if (context->tokens_kind == TOKENS_KIND_DIRECT)
2429     return &(FIRST (context).token[index]);
2430   else if (context->tokens_kind == TOKENS_KIND_INDIRECT
2431            || context->tokens_kind == TOKENS_KIND_EXTENDED)
2432     return FIRST (context).ptoken[index];
2433  else
2434    abort ();
2435 }
2436
2437 /* Look ahead in the input stream.  */
2438 const cpp_token *
2439 cpp_peek_token (cpp_reader *pfile, int index)
2440 {
2441   cpp_context *context = pfile->context;
2442   const cpp_token *peektok;
2443   int count;
2444
2445   /* First, scan through any pending cpp_context objects.  */
2446   while (context->prev)
2447     {
2448       ptrdiff_t sz = _cpp_remaining_tokens_num_in_context (context);
2449
2450       if (index < (int) sz)
2451         return _cpp_token_from_context_at (context, index);
2452       index -= (int) sz;
2453       context = context->prev;
2454     }
2455
2456   /* We will have to read some new tokens after all (and do so
2457      without invalidating preceding tokens).  */
2458   count = index;
2459   pfile->keep_tokens++;
2460
2461   /* For peeked tokens temporarily disable line_change reporting,
2462      until the tokens are parsed for real.  */
2463   void (*line_change) (cpp_reader *, const cpp_token *, int)
2464     = pfile->cb.line_change;
2465   pfile->cb.line_change = NULL;
2466
2467   do
2468     {
2469       peektok = _cpp_lex_token (pfile);
2470       if (peektok->type == CPP_EOF)
2471         {
2472           index--;
2473           break;
2474         }
2475     }
2476   while (index--);
2477
2478   _cpp_backup_tokens_direct (pfile, count - index);
2479   pfile->keep_tokens--;
2480   pfile->cb.line_change = line_change;
2481
2482   return peektok;
2483 }
2484
2485 /* Allocate a single token that is invalidated at the same time as the
2486    rest of the tokens on the line.  Has its line and col set to the
2487    same as the last lexed token, so that diagnostics appear in the
2488    right place.  */
2489 cpp_token *
2490 _cpp_temp_token (cpp_reader *pfile)
2491 {
2492   cpp_token *old, *result;
2493   ptrdiff_t sz = pfile->cur_run->limit - pfile->cur_token;
2494   ptrdiff_t la = (ptrdiff_t) pfile->lookaheads;
2495
2496   old = pfile->cur_token - 1;
2497   /* Any pre-existing lookaheads must not be clobbered.  */
2498   if (la)
2499     {
2500       if (sz <= la)
2501         {
2502           tokenrun *next = next_tokenrun (pfile->cur_run);
2503
2504           if (sz < la)
2505             memmove (next->base + 1, next->base,
2506                      (la - sz) * sizeof (cpp_token));
2507
2508           next->base[0] = pfile->cur_run->limit[-1];
2509         }
2510
2511       if (sz > 1)
2512         memmove (pfile->cur_token + 1, pfile->cur_token,
2513                  MIN (la, sz - 1) * sizeof (cpp_token));
2514     }
2515
2516   if (!sz && pfile->cur_token == pfile->cur_run->limit)
2517     {
2518       pfile->cur_run = next_tokenrun (pfile->cur_run);
2519       pfile->cur_token = pfile->cur_run->base;
2520     }
2521
2522   result = pfile->cur_token++;
2523   result->src_loc = old->src_loc;
2524   return result;
2525 }
2526
2527 /* Lex a token into RESULT (external interface).  Takes care of issues
2528    like directive handling, token lookahead, multiple include
2529    optimization and skipping.  */
2530 const cpp_token *
2531 _cpp_lex_token (cpp_reader *pfile)
2532 {
2533   cpp_token *result;
2534
2535   for (;;)
2536     {
2537       if (pfile->cur_token == pfile->cur_run->limit)
2538         {
2539           pfile->cur_run = next_tokenrun (pfile->cur_run);
2540           pfile->cur_token = pfile->cur_run->base;
2541         }
2542       /* We assume that the current token is somewhere in the current
2543          run.  */
2544       if (pfile->cur_token < pfile->cur_run->base
2545           || pfile->cur_token >= pfile->cur_run->limit)
2546         abort ();
2547
2548       if (pfile->lookaheads)
2549         {
2550           pfile->lookaheads--;
2551           result = pfile->cur_token++;
2552         }
2553       else
2554         result = _cpp_lex_direct (pfile);
2555
2556       if (result->flags & BOL)
2557         {
2558           /* Is this a directive.  If _cpp_handle_directive returns
2559              false, it is an assembler #.  */
2560           if (result->type == CPP_HASH
2561               /* 6.10.3 p 11: Directives in a list of macro arguments
2562                  gives undefined behavior.  This implementation
2563                  handles the directive as normal.  */
2564               && pfile->state.parsing_args != 1)
2565             {
2566               if (_cpp_handle_directive (pfile, result->flags & PREV_WHITE))
2567                 {
2568                   if (pfile->directive_result.type == CPP_PADDING)
2569                     continue;
2570                   result = &pfile->directive_result;
2571                 }
2572             }
2573           else if (pfile->state.in_deferred_pragma)
2574             result = &pfile->directive_result;
2575
2576           if (pfile->cb.line_change && !pfile->state.skipping)
2577             pfile->cb.line_change (pfile, result, pfile->state.parsing_args);
2578         }
2579
2580       /* We don't skip tokens in directives.  */
2581       if (pfile->state.in_directive || pfile->state.in_deferred_pragma)
2582         break;
2583
2584       /* Outside a directive, invalidate controlling macros.  At file
2585          EOF, _cpp_lex_direct takes care of popping the buffer, so we never
2586          get here and MI optimization works.  */
2587       pfile->mi_valid = false;
2588
2589       if (!pfile->state.skipping || result->type == CPP_EOF)
2590         break;
2591     }
2592
2593   return result;
2594 }
2595
2596 /* Returns true if a fresh line has been loaded.  */
2597 bool
2598 _cpp_get_fresh_line (cpp_reader *pfile)
2599 {
2600   int return_at_eof;
2601
2602   /* We can't get a new line until we leave the current directive.  */
2603   if (pfile->state.in_directive)
2604     return false;
2605
2606   for (;;)
2607     {
2608       cpp_buffer *buffer = pfile->buffer;
2609
2610       if (!buffer->need_line)
2611         return true;
2612
2613       if (buffer->next_line < buffer->rlimit)
2614         {
2615           _cpp_clean_line (pfile);
2616           return true;
2617         }
2618
2619       /* First, get out of parsing arguments state.  */
2620       if (pfile->state.parsing_args)
2621         return false;
2622
2623       /* End of buffer.  Non-empty files should end in a newline.  */
2624       if (buffer->buf != buffer->rlimit
2625           && buffer->next_line > buffer->rlimit
2626           && !buffer->from_stage3)
2627         {
2628           /* Clip to buffer size.  */
2629           buffer->next_line = buffer->rlimit;
2630         }
2631
2632       return_at_eof = buffer->return_at_eof;
2633       _cpp_pop_buffer (pfile);
2634       if (pfile->buffer == NULL || return_at_eof)
2635         return false;
2636     }
2637 }
2638
/* Lexing helper: set result->type to THEN_TYPE and consume one
   character if the next unread character (*buffer->cur) equals CHAR;
   otherwise set result->type to ELSE_TYPE and consume nothing.

   The macro refers to variables named `buffer' and `result', which
   must be in scope wherever it is used.  Every argument is
   parenthesized in the expansion so that compound expressions (for
   example a conditional expression) are compared and assigned as a
   whole.  */
#define IF_NEXT_IS(CHAR, THEN_TYPE, ELSE_TYPE)          \
  do                                                    \
    {                                                   \
      result->type = (ELSE_TYPE);                       \
      if (*buffer->cur == (CHAR))                       \
        {                                               \
          buffer->cur++;                                \
          result->type = (THEN_TYPE);                   \
        }                                               \
    }                                                   \
  while (0)
2647
2648 /* Lex a token into pfile->cur_token, which is also incremented, to
2649    get diagnostics pointing to the correct location.
2650
2651    Does not handle issues such as token lookahead, multiple-include
2652    optimization, directives, skipping etc.  This function is only
2653    suitable for use by _cpp_lex_token, and in special cases like
2654    lex_expansion_token which doesn't care for any of these issues.
2655
2656    When meeting a newline, returns CPP_EOF if parsing a directive,
2657    otherwise returns to the start of the token buffer if permissible.
2658    Returns a pointer to the lexed token.

        Whitespace is skipped and recorded as PREV_WHITE on the token that
        follows.  Comments are treated the same way unless
        pfile->state.save_comments is set, in which case the comment is
        returned as a token built by save_comment.

        The token's src_loc is computed when its first character is read,
        or taken from *pfile->forced_token_location_p when that pointer is
        set.  For tokens other than CPP_EOF whose location is at least
        RESERVED_LOCATION_COUNT, it is finally combined with the position
        reached at the end of the token to form a range.  */
2659 cpp_token *
2660 _cpp_lex_direct (cpp_reader *pfile)
2661 {
2662   cppchar_t c;
2663   cpp_buffer *buffer;
2664   const unsigned char *comment_start;
2665   bool fallthrough_comment = false;
       /* Claim the next token slot.  RESULT may be redirected below to the
          base of the first token run when a new line is started.  */
2666   cpp_token *result = pfile->cur_token++;
2667
       /* Entered on the first pass and again after every newline.  */
2668  fresh_line:
2669   result->flags = 0;
2670   buffer = pfile->buffer;
2671   if (buffer->need_line)
2672     {
           /* The end of the line ends a deferred pragma: report it with a
              CPP_PRAGMA_EOL token and leave deferred-pragma state.  */
2673       if (pfile->state.in_deferred_pragma)
2674         {
2675           result->type = CPP_PRAGMA_EOL;
2676           pfile->state.in_deferred_pragma = false;
               /* NOTE(review): presumably undoes an increment made when the
                  pragma was entered -- confirm against the pragma code.  */
2677           if (!pfile->state.pragma_allow_expansion)
2678             pfile->state.prevent_expansion--;
2679           return result;
2680         }
           /* No further line could be loaded: the token is CPP_EOF.  */
2681       if (!_cpp_get_fresh_line (pfile))
2682         {
2683           result->type = CPP_EOF;
2684           if (!pfile->state.in_directive)
2685             {
2686               /* Tell the compiler the line number of the EOF token.  */
2687               result->src_loc = pfile->line_table->highest_line;
2688               result->flags = BOL;
2689             }
2690           return result;
2691         }
           /* Fetching the line switched to a different buffer, so a
              fallthrough comment seen in the old one no longer applies.  */
2692       if (buffer != pfile->buffer)
2693         fallthrough_comment = false;
           /* Unless tokens must be kept, restart token storage at the base
              of the first run for the new line.  */
2694       if (!pfile->keep_tokens)
2695         {
2696           pfile->cur_run = &pfile->base_run;
2697           result = pfile->base_run.base;
2698           pfile->cur_token = result + 1;
2699         }
           /* The token begins a line.  When parsing_args is 2 it is also
              marked as preceded by whitespace.  */
2700       result->flags = BOL;
2701       if (pfile->state.parsing_args == 2)
2702         result->flags |= PREV_WHITE;
2703     }
2704   buffer = pfile->buffer;
       /* Re-entered after a comment has been skipped.  */
2705  update_tokens_line:
2706   result->src_loc = pfile->line_table->highest_line;
2707
       /* Re-entered after a run of whitespace has been skipped.  */
2708  skipped_white:
       /* Process any line notes that have been reached, except while an
          overlaid buffer is in use.  */
2709   if (buffer->cur >= buffer->notes[buffer->cur_note].pos
2710       && !pfile->overlaid_buffer)
2711     {
2712       _cpp_process_line_notes (pfile, false);
2713       result->src_loc = pfile->line_table->highest_line;
2714     }
       /* Read the first character of the token.  From here on buffer->cur
          points one past it, so buffer->cur - 1 is the token's start.  */
2715   c = *buffer->cur++;
2716
       /* A forced location, when set, overrides the computed one.  */
2717   if (pfile->forced_token_location_p)
2718     result->src_loc = *pfile->forced_token_location_p;
2719   else
2720     result->src_loc = linemap_position_for_column (pfile->line_table,
2721                                           CPP_BUF_COLUMN (buffer, buffer->cur));
2722
2723   switch (c)
2724     {
         /* Horizontal whitespace (NUL is handled here as well): skip it
            and note it on the token that follows.  */
2725     case ' ': case '\t': case '\f': case '\v': case '\0':
2726       result->flags |= PREV_WHITE;
2727       skip_whitespace (pfile, c);
2728       goto skipped_white;
2729
         /* End of the current line: request a new one and start over.  The
            line count is only advanced if the buffer has more text.  */
2730     case '\n':
2731       if (buffer->cur < buffer->rlimit)
2732         CPP_INCREMENT_LINE (pfile, 0);
2733       buffer->need_line = true;
2734       goto fresh_line;
2735
2736     case '0': case '1': case '2': case '3': case '4':
2737     case '5': case '6': case '7': case '8': case '9':
2738       {
2739         struct normalize_state nst = INITIAL_NORMALIZE_STATE;
2740         result->type = CPP_NUMBER;
2741         lex_number (pfile, &result->val.str, &nst);
2742         warn_about_normalization (pfile, result, &nst);
2743         break;
2744       }
2745
2746     case 'L':
2747     case 'u':
2748     case 'U':
2749     case 'R':
2750       /* 'L', 'u', 'U', 'u8' or 'R' may introduce wide characters,
2751          wide strings or raw strings.  */
2752       if (c == 'L' || CPP_OPTION (pfile, rliterals)
2753           || (c != 'R' && CPP_OPTION (pfile, uliterals)))
2754         {
               /* Accepted continuations: a character literal (not after
                  'R'), a string literal, a raw string after a non-'R'
                  prefix when raw literals are enabled, or after "u8" a
                  string, a character literal (when utf8_char_literals is
                  enabled) or a raw string (when raw literals are
                  enabled).  */
2755           if ((*buffer->cur == '\'' && c != 'R')
2756               || *buffer->cur == '"'
2757               || (*buffer->cur == 'R'
2758                   && c != 'R'
2759                   && buffer->cur[1] == '"'
2760                   && CPP_OPTION (pfile, rliterals))
2761               || (*buffer->cur == '8'
2762                   && c == 'u'
2763                   && ((buffer->cur[1] == '"' || (buffer->cur[1] == '\''
2764                                 && CPP_OPTION (pfile, utf8_char_literals)))
2765                       || (buffer->cur[1] == 'R' && buffer->cur[2] == '"'
2766                           && CPP_OPTION (pfile, rliterals)))))
2767             {
2768               lex_string (pfile, result, buffer->cur - 1);
2769               break;
2770             }
2771         }
           /* Otherwise the letter simply starts an identifier.  */
2772       /* Fall through.  */
2773
2774     case '_':
2775     case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
2776     case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
2777     case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
2778     case 's': case 't':           case 'v': case 'w': case 'x':
2779     case 'y': case 'z':
2780     case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
2781     case 'G': case 'H': case 'I': case 'J': case 'K':
2782     case 'M': case 'N': case 'O': case 'P': case 'Q':
2783     case 'S': case 'T':           case 'V': case 'W': case 'X':
2784     case 'Y': case 'Z':
2785       result->type = CPP_NAME;
2786       {
2787         struct normalize_state nst = INITIAL_NORMALIZE_STATE;
2788         result->val.node.node = lex_identifier (pfile, buffer->cur - 1, false,
2789                                                 &nst,
2790                                                 &result->val.node.spelling);
2791         warn_about_normalization (pfile, result, &nst);
2792       }
2793
2794       /* Convert named operators to their proper types.  */
2795       if (result->val.node.node->flags & NODE_OPERATOR)
2796         {
2797           result->flags |= NAMED_OP;
2798           result->type = (enum cpp_ttype) result->val.node.node->directive_index;
2799         }
2800
2801       /* Signal FALLTHROUGH comment followed by another token.  */
2802       if (fallthrough_comment)
2803         result->flags |= PREV_FALLTHROUGH;
2804       break;
2805
2806     case '\'':
2807     case '"':
2808       lex_string (pfile, result, buffer->cur - 1);
2809       break;
2810
2811     case '/':
2812       /* A potential block or line comment.  */
2813       comment_start = buffer->cur;
           /* C now holds the character after the slash; it is passed to
              save_comment further down.  */
2814       c = *buffer->cur;
2815
2816       if (c == '*')
2817         {
2818           if (_cpp_skip_block_comment (pfile))
2819             cpp_error (pfile, CPP_DL_ERROR, "unterminated comment");
2820         }
2821       else if (c == '/' && ! CPP_OPTION (pfile, traditional))
2822         {
2823           /* Don't warn for system headers.  */
2824           if (cpp_in_system_header (pfile))
2825             ;
2826           /* Warn about comments if pedantically GNUC89, and not
2827              in system headers.  */
2828           else if (CPP_OPTION (pfile, lang) == CLK_GNUC89
2829                    && CPP_PEDANTIC (pfile)
2830                    && ! buffer->warned_cplusplus_comments)
2831             {
2832               cpp_error (pfile, CPP_DL_PEDWARN,
2833                          "C++ style comments are not allowed in ISO C90");
2834               cpp_error (pfile, CPP_DL_PEDWARN,
2835                          "(this will be reported only once per input file)");
2836               buffer->warned_cplusplus_comments = 1;
2837             }
2838           /* Or if specifically desired via -Wc90-c99-compat.  */
2839           else if (CPP_OPTION (pfile, cpp_warn_c90_c99_compat) > 0
2840                    && ! CPP_OPTION (pfile, cplusplus)
2841                    && ! buffer->warned_cplusplus_comments)
2842             {
2843               cpp_error (pfile, CPP_DL_WARNING,
2844                          "C++ style comments are incompatible with C90");
2845               cpp_error (pfile, CPP_DL_WARNING,
2846                          "(this will be reported only once per input file)");
2847               buffer->warned_cplusplus_comments = 1;
2848             }
2849           /* In C89/C94, C++ style comments are forbidden.  */
2850           else if ((CPP_OPTION (pfile, lang) == CLK_STDC89
2851                     || CPP_OPTION (pfile, lang) == CLK_STDC94))
2852             {
2853               /* But don't be confused about valid code such as
2854                  - // immediately followed by *,
2855                  - // in a preprocessing directive,
2856                  - // in an #if 0 block.  */
2857               if (buffer->cur[1] == '*'
2858                   || pfile->state.in_directive
2859                   || pfile->state.skipping)
2860                 {
                       /* Lex only the first slash, as a division operator;
                          the second slash is left unread.  */
2861                   result->type = CPP_DIV;
2862                   break;
2863                 }
2864               else if (! buffer->warned_cplusplus_comments)
2865                 {
2866                   cpp_error (pfile, CPP_DL_ERROR,
2867                              "C++ style comments are not allowed in ISO C90");
2868                   cpp_error (pfile, CPP_DL_ERROR,
2869                              "(this will be reported only once per input "
2870                              "file)");
2871                   buffer->warned_cplusplus_comments = 1;
2872                 }
2873             }
               /* Skip the line comment; a nonzero result from
                  skip_line_comment triggers the "multi-line comment"
                  warning when warn_comments is enabled.  */
2874           if (skip_line_comment (pfile) && CPP_OPTION (pfile, warn_comments))
2875             cpp_warning (pfile, CPP_W_COMMENTS, "multi-line comment");
2876         }
2877       else if (c == '=')
2878         {
2879           buffer->cur++;
2880           result->type = CPP_DIV_EQ;
2881           break;
2882         }
2883       else
2884         {
2885           result->type = CPP_DIV;
2886           break;
2887         }
2888
           /* Only reached after a comment has been skipped.  */
2889       if (fallthrough_comment_p (pfile, comment_start))
2890         fallthrough_comment = true;
2891
           /* Comments are not being saved: treat the comment as whitespace
              and lex the next token into the same slot.  */
2892       if (!pfile->state.save_comments)
2893         {
2894           result->flags |= PREV_WHITE;
2895           goto update_tokens_line;
2896         }
2897
2898       if (fallthrough_comment)
2899         result->flags |= PREV_FALLTHROUGH;
2900
2901       /* Save the comment as a token in its own right.  */
2902       save_comment (pfile, result, comment_start, c);
2903       break;
2904
2905     case '<':
           /* When angled_headers is set, let lex_string try first; if it
              produced anything other than CPP_LESS, that token is the
              result.  */
2906       if (pfile->state.angled_headers)
2907         {
2908           lex_string (pfile, result, buffer->cur - 1);
2909           if (result->type != CPP_LESS)
2910             break;
2911         }
2912
2913       result->type = CPP_LESS;
2914       if (*buffer->cur == '=')
2915         buffer->cur++, result->type = CPP_LESS_EQ;
2916       else if (*buffer->cur == '<')
2917         {
2918           buffer->cur++;
2919           IF_NEXT_IS ('=', CPP_LSHIFT_EQ, CPP_LSHIFT);
2920         }
2921       else if (CPP_OPTION (pfile, digraphs))
2922         {
2923           if (*buffer->cur == ':')
2924             {
2925               /* C++11 [2.5/3 lex.pptoken], "Otherwise, if the next
2926                  three characters are <:: and the subsequent character
2927                  is neither : nor >, the < is treated as a preprocessor
2928                  token by itself".  */
2929               if (CPP_OPTION (pfile, cplusplus)
2930                   && CPP_OPTION (pfile, lang) != CLK_CXX98
2931                   && CPP_OPTION (pfile, lang) != CLK_GNUCXX
2932                   && buffer->cur[1] == ':'
2933                   && buffer->cur[2] != ':' && buffer->cur[2] != '>')
2934                 break;
2935
                   /* "<:" is the digraph spelling of '['.  */
2936               buffer->cur++;
2937               result->flags |= DIGRAPH;
2938               result->type = CPP_OPEN_SQUARE;
2939             }
2940           else if (*buffer->cur == '%')
2941             {
                   /* "<%" is the digraph spelling of '{'.  */
2942               buffer->cur++;
2943               result->flags |= DIGRAPH;
2944               result->type = CPP_OPEN_BRACE;
2945             }
2946         }
2947       break;
2948
2949     case '>':
2950       result->type = CPP_GREATER;
2951       if (*buffer->cur == '=')
2952         buffer->cur++, result->type = CPP_GREATER_EQ;
2953       else if (*buffer->cur == '>')
2954         {
2955           buffer->cur++;
2956           IF_NEXT_IS ('=', CPP_RSHIFT_EQ, CPP_RSHIFT);
2957         }
2958       break;
2959
2960     case '%':
2961       result->type = CPP_MOD;
2962       if (*buffer->cur == '=')
2963         buffer->cur++, result->type = CPP_MOD_EQ;
2964       else if (CPP_OPTION (pfile, digraphs))
2965         {
2966           if (*buffer->cur == ':')
2967             {
                   /* "%:" is the digraph spelling of '#', and "%:%:" of
                      "##".  */
2968               buffer->cur++;
2969               result->flags |= DIGRAPH;
2970               result->type = CPP_HASH;
2971               if (*buffer->cur == '%' && buffer->cur[1] == ':')
2972                 buffer->cur += 2, result->type = CPP_PASTE, result->val.token_no = 0;
2973             }
2974           else if (*buffer->cur == '>')
2975             {
                   /* "%>" is the digraph spelling of '}'.  */
2976               buffer->cur++;
2977               result->flags |= DIGRAPH;
2978               result->type = CPP_CLOSE_BRACE;
2979             }
2980         }
2981       break;
2982
2983     case '.':
2984       result->type = CPP_DOT;
           /* A dot followed by a digit starts a number.  */
2985       if (ISDIGIT (*buffer->cur))
2986         {
2987           struct normalize_state nst = INITIAL_NORMALIZE_STATE;
2988           result->type = CPP_NUMBER;
2989           lex_number (pfile, &result->val.str, &nst);
2990           warn_about_normalization (pfile, result, &nst);
2991         }
2992       else if (*buffer->cur == '.' && buffer->cur[1] == '.')
2993         buffer->cur += 2, result->type = CPP_ELLIPSIS;
2994       else if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
2995         buffer->cur++, result->type = CPP_DOT_STAR;
2996       break;
2997
2998     case '+':
2999       result->type = CPP_PLUS;
3000       if (*buffer->cur == '+')
3001         buffer->cur++, result->type = CPP_PLUS_PLUS;
3002       else if (*buffer->cur == '=')
3003         buffer->cur++, result->type = CPP_PLUS_EQ;
3004       break;
3005
3006     case '-':
3007       result->type = CPP_MINUS;
3008       if (*buffer->cur == '>')
3009         {
3010           buffer->cur++;
3011           result->type = CPP_DEREF;
3012           if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
3013             buffer->cur++, result->type = CPP_DEREF_STAR;
3014         }
3015       else if (*buffer->cur == '-')
3016         buffer->cur++, result->type = CPP_MINUS_MINUS;
3017       else if (*buffer->cur == '=')
3018         buffer->cur++, result->type = CPP_MINUS_EQ;
3019       break;
3020
3021     case '&':
3022       result->type = CPP_AND;
3023       if (*buffer->cur == '&')
3024         buffer->cur++, result->type = CPP_AND_AND;
3025       else if (*buffer->cur == '=')
3026         buffer->cur++, result->type = CPP_AND_EQ;
3027       break;
3028
3029     case '|':
3030       result->type = CPP_OR;
3031       if (*buffer->cur == '|')
3032         buffer->cur++, result->type = CPP_OR_OR;
3033       else if (*buffer->cur == '=')
3034         buffer->cur++, result->type = CPP_OR_EQ;
3035       break;
3036
3037     case ':':
3038       result->type = CPP_COLON;
3039       if (*buffer->cur == ':' && CPP_OPTION (pfile, cplusplus))
3040         buffer->cur++, result->type = CPP_SCOPE;
3041       else if (*buffer->cur == '>' && CPP_OPTION (pfile, digraphs))
3042         {
               /* ":>" is the digraph spelling of ']'.  */
3043           buffer->cur++;
3044           result->flags |= DIGRAPH;
3045           result->type = CPP_CLOSE_SQUARE;
3046         }
3047       break;
3048
         /* Operators with exactly one possible two-character form.  */
3049     case '*': IF_NEXT_IS ('=', CPP_MULT_EQ, CPP_MULT); break;
3050     case '=': IF_NEXT_IS ('=', CPP_EQ_EQ, CPP_EQ); break;
3051     case '!': IF_NEXT_IS ('=', CPP_NOT_EQ, CPP_NOT); break;
3052     case '^': IF_NEXT_IS ('=', CPP_XOR_EQ, CPP_XOR); break;
3053     case '#': IF_NEXT_IS ('#', CPP_PASTE, CPP_HASH); result->val.token_no = 0; break;
3054
         /* Single-character punctuators.  */
3055     case '?': result->type = CPP_QUERY; break;
3056     case '~': result->type = CPP_COMPL; break;
3057     case ',': result->type = CPP_COMMA; break;
3058     case '(': result->type = CPP_OPEN_PAREN; break;
3059     case ')': result->type = CPP_CLOSE_PAREN; break;
3060     case '[': result->type = CPP_OPEN_SQUARE; break;
3061     case ']': result->type = CPP_CLOSE_SQUARE; break;
3062     case '{': result->type = CPP_OPEN_BRACE; break;
3063     case '}': result->type = CPP_CLOSE_BRACE; break;
3064     case ';': result->type = CPP_SEMICOLON; break;
3065
3066       /* @ is a punctuator in Objective-C.  */
3067     case '@': result->type = CPP_ATSIGN; break;
3068
3069     case '$':
3070     case '\\':
3071       {
             /* Step back onto the character so that forms_identifier_p can
                examine it.  If no identifier starts here, step forward
                again and fall through to the one-character case.  */
3072         const uchar *base = --buffer->cur;
3073         struct normalize_state nst = INITIAL_NORMALIZE_STATE;
3074
3075         if (forms_identifier_p (pfile, true, &nst))
3076           {
3077             result->type = CPP_NAME;
3078             result->val.node.node = lex_identifier (pfile, base, true, &nst,
3079                                                     &result->val.node.spelling);
3080             warn_about_normalization (pfile, result, &nst);
3081             break;
3082           }
3083         buffer->cur++;
3084       }
3085       /* FALLTHRU */
3086
         /* Any other character becomes a one-character CPP_OTHER token.  */
3087     default:
3088       create_literal (pfile, result, buffer->cur - 1, 1, CPP_OTHER);
3089       break;
3090     }
3091
3092   /* Potentially convert the location of the token to a range.  */
3093   if (result->src_loc >= RESERVED_LOCATION_COUNT
3094       && result->type != CPP_EOF)
3095     {
3096       /* Ensure that any line notes are processed, so that we have the
3097          correct physical line/column for the end-point of the token even
3098          when a logical line is split via one or more backslashes.  */
3099       if (buffer->cur >= buffer->notes[buffer->cur_note].pos
3100           && !pfile->overlaid_buffer)
3101         _cpp_process_line_notes (pfile, false);
3102
           /* The range starts at the location recorded above and finishes
              at the position buffer->cur has reached.  */
3103       source_range tok_range;
3104       tok_range.m_start = result->src_loc;
3105       tok_range.m_finish
3106         = linemap_position_for_column (pfile->line_table,
3107                                        CPP_BUF_COLUMN (buffer, buffer->cur));
3108
3109       result->src_loc = COMBINE_LOCATION_DATA (pfile->line_table,
3110                                                result->src_loc,
3111                                                tok_range, NULL);
3112     }
3113
3114   return result;
3115 }
3116
3117 /* An upper bound on the number of bytes needed to spell TOKEN.
3118    Does not include preceding whitespace.  */
3119 unsigned int
3120 cpp_token_len (const cpp_token *token)
3121 {
3122   unsigned int len;
3123
3124   switch (TOKEN_SPELL (token))
3125     {
3126     default:            len = 6;                                break;
3127     case SPELL_LITERAL: len = token->val.str.len;               break;
3128     case SPELL_IDENT:   len = NODE_LEN (token->val.node.node) * 10;     break;
3129     }
3130
3131   return len;
3132 }
3133
/* Decode the UTF-8 sequence that starts at NAME and store it in BUFFER
   as a \UXXXXXXXX escape with lower-case hexadecimal digits.  Exactly
   10 bytes are written to BUFFER; no terminating NUL is added.
   Returns the number of bytes of NAME that made up the sequence.
   Calls abort if a continuation byte is ill-formed.  */

static size_t
utf8_to_ucn (unsigned char *buffer, const unsigned char *name)
{
  static const char hex_digits[] = "0123456789abcdef";
  unsigned lead = *name;
  unsigned long code_point;
  int seq_len = 0;
  int k;
  int shift;

  /* The number of leading one bits in the first byte gives the length
     of the sequence.  */
  while (lead & 0x80)
    {
      seq_len++;
      lead <<= 1;
    }

  /* Start with the payload bits of the first byte.  */
  code_point = *name & (0x7F >> seq_len);

  /* Each continuation byte contributes six more bits.  */
  for (k = 1; k < seq_len; k++)
    {
      name++;
      code_point = (code_point << 6) | (*name & 0x3F);

      /* Ill-formed UTF-8: a continuation byte must be 10xxxxxx.  */
      if ((*name & 0xC0) != 0x80)
        abort ();
    }

  /* Emit "\U" followed by eight hex digits, most significant first.  */
  buffer[0] = '\\';
  buffer[1] = 'U';
  for (k = 2, shift = 28; shift >= 0; k++, shift -= 4)
    buffer[k] = hex_digits[(code_point >> shift) & 0xF];

  return seq_len;
}
3167
3168 /* Given a token TYPE corresponding to a digraph, return a pointer to
3169    the spelling of the digraph.  */
3170 static const unsigned char *
3171 cpp_digraph2name (enum cpp_ttype type)
3172 {
3173   return digraph_spellings[(int) type - (int) CPP_FIRST_DIGRAPH];
3174 }
3175
3176 /* Write the spelling of an identifier IDENT, using UCNs, to BUFFER.
3177    The buffer must already contain the enough space to hold the
3178    token's spelling.  Returns a pointer to the character after the
3179    last character written.  */
3180 unsigned char *
3181 _cpp_spell_ident_ucns (unsigned char *buffer, cpp_hashnode *ident)
3182 {
3183   size_t i;
3184   const unsigned char *name = NODE_NAME (ident);
3185
3186   for (i = 0; i < NODE_LEN (ident); i++)
3187     if (name[i] & ~0x7F)
3188       {
3189         i += utf8_to_ucn (buffer, name + i) - 1;
3190         buffer += 10;
3191       }
3192     else
3193       *buffer++ = name[i];
3194
3195   return buffer;
3196 }
3197
3198 /* Write the spelling of a token TOKEN to BUFFER.  The buffer must
3199    already contain the enough space to hold the token's spelling.
3200    Returns a pointer to the character after the last character written.
3201    FORSTRING is true if this is to be the spelling after translation
3202    phase 1 (with the original spelling of extended identifiers), false
3203    if extended identifiers should always be written using UCNs (there is
3204    no option for always writing them in the internal UTF-8 form).
3205    FIXME: Would be nice if we didn't need the PFILE argument.  */
3206 unsigned char *
3207 cpp_spell_token (cpp_reader *pfile, const cpp_token *token,
3208                  unsigned char *buffer, bool forstring)
3209 {
3210   switch (TOKEN_SPELL (token))
3211     {
3212     case SPELL_OPERATOR:
3213       {
3214         const unsigned char *spelling;
3215         unsigned char c;
3216
3217         if (token->flags & DIGRAPH)
3218           spelling = cpp_digraph2name (token->type);
3219         else if (token->flags & NAMED_OP)
3220           goto spell_ident;
3221         else
3222           spelling = TOKEN_NAME (token);
3223
3224         while ((c = *spelling++) != '\0')
3225           *buffer++ = c;
3226       }
3227       break;
3228
3229     spell_ident:
3230     case SPELL_IDENT:
3231       if (forstring)
3232         {
3233           memcpy (buffer, NODE_NAME (token->val.node.spelling),
3234                   NODE_LEN (token->val.node.spelling));
3235           buffer += NODE_LEN (token->val.node.spelling);
3236         }
3237       else
3238         buffer = _cpp_spell_ident_ucns (buffer, token->val.node.node);
3239       break;
3240
3241     case SPELL_LITERAL:
3242       memcpy (buffer, token->val.str.text, token->val.str.len);
3243       buffer += token->val.str.len;
3244       break;
3245
3246     case SPELL_NONE:
3247       cpp_error (pfile, CPP_DL_ICE,
3248                  "unspellable token %s", TOKEN_NAME (token));
3249       break;
3250     }
3251
3252   return buffer;
3253 }
3254
3255 /* Returns TOKEN spelt as a null-terminated string.  The string is
3256    freed when the reader is destroyed.  Useful for diagnostics.  */
3257 unsigned char *
3258 cpp_token_as_text (cpp_reader *pfile, const cpp_token *token)
3259 {
3260   unsigned int len = cpp_token_len (token) + 1;
3261   unsigned char *start = _cpp_unaligned_alloc (pfile, len), *end;
3262
3263   end = cpp_spell_token (pfile, token, start, false);
3264   end[0] = '\0';
3265
3266   return start;
3267 }
3268
3269 /* Returns a pointer to a string which spells the token defined by
3270    TYPE and FLAGS.  Used by C front ends, which really should move to
3271    using cpp_token_as_text.  */
3272 const char *
3273 cpp_type2name (enum cpp_ttype type, unsigned char flags)
3274 {
3275   if (flags & DIGRAPH)
3276     return (const char *) cpp_digraph2name (type);
3277   else if (flags & NAMED_OP)
3278     return cpp_named_operator2name (type);
3279
3280   return (const char *) token_spellings[type].name;
3281 }
3282
3283 /* Writes the spelling of token to FP, without any preceding space.
3284    Separated from cpp_spell_token for efficiency - to avoid stdio
3285    double-buffering.  */
3286 void
3287 cpp_output_token (const cpp_token *token, FILE *fp)
3288 {
3289   switch (TOKEN_SPELL (token))
3290     {
3291     case SPELL_OPERATOR:
3292       {
3293         const unsigned char *spelling;
3294         int c;
3295
3296         if (token->flags & DIGRAPH)
3297           spelling = cpp_digraph2name (token->type);
3298         else if (token->flags & NAMED_OP)
3299           goto spell_ident;
3300         else
3301           spelling = TOKEN_NAME (token);
3302
3303         c = *spelling;
3304         do
3305           putc (c, fp);
3306         while ((c = *++spelling) != '\0');
3307       }
3308       break;
3309
3310     spell_ident:
3311     case SPELL_IDENT:
3312       {
3313         size_t i;
3314         const unsigned char * name = NODE_NAME (token->val.node.node);
3315
3316         for (i = 0; i < NODE_LEN (token->val.node.node); i++)
3317           if (name[i] & ~0x7F)
3318             {
3319               unsigned char buffer[10];
3320               i += utf8_to_ucn (buffer, name + i) - 1;
3321               fwrite (buffer, 1, 10, fp);
3322             }
3323           else
3324             fputc (NODE_NAME (token->val.node.node)[i], fp);
3325       }
3326       break;
3327
3328     case SPELL_LITERAL:
3329       fwrite (token->val.str.text, 1, token->val.str.len, fp);
3330       break;
3331
3332     case SPELL_NONE:
3333       /* An error, most probably.  */
3334       break;
3335     }
3336 }
3337
3338 /* Compare two tokens.  */
3339 int
3340 _cpp_equiv_tokens (const cpp_token *a, const cpp_token *b)
3341 {
3342   if (a->type == b->type && a->flags == b->flags)
3343     switch (TOKEN_SPELL (a))
3344       {
3345       default:                  /* Keep compiler happy.  */
3346       case SPELL_OPERATOR:
3347         /* token_no is used to track where multiple consecutive ##
3348            tokens were originally located.  */
3349         return (a->type != CPP_PASTE || a->val.token_no == b->val.token_no);
3350       case SPELL_NONE:
3351         return (a->type != CPP_MACRO_ARG
3352                 || (a->val.macro_arg.arg_no == b->val.macro_arg.arg_no
3353                     && a->val.macro_arg.spelling == b->val.macro_arg.spelling));
3354       case SPELL_IDENT:
3355         return (a->val.node.node == b->val.node.node
3356                 && a->val.node.spelling == b->val.node.spelling);
3357       case SPELL_LITERAL:
3358         return (a->val.str.len == b->val.str.len
3359                 && !memcmp (a->val.str.text, b->val.str.text,
3360                             a->val.str.len));
3361       }
3362
3363   return 0;
3364 }
3365
3366 /* Returns nonzero if a space should be inserted to avoid an
3367    accidental token paste for output.  For simplicity, it is
3368    conservative, and occasionally advises a space where one is not
3369    needed, e.g. "." and ".2".  */
3370 int
3371 cpp_avoid_paste (cpp_reader *pfile, const cpp_token *token1,
3372                  const cpp_token *token2)
3373 {
3374   enum cpp_ttype a = token1->type, b = token2->type;
3375   cppchar_t c;
3376
3377   if (token1->flags & NAMED_OP)
3378     a = CPP_NAME;
3379   if (token2->flags & NAMED_OP)
3380     b = CPP_NAME;
3381
3382   c = EOF;
3383   if (token2->flags & DIGRAPH)
3384     c = digraph_spellings[(int) b - (int) CPP_FIRST_DIGRAPH][0];
3385   else if (token_spellings[b].category == SPELL_OPERATOR)
3386     c = token_spellings[b].name[0];
3387
3388   /* Quickly get everything that can paste with an '='.  */
3389   if ((int) a <= (int) CPP_LAST_EQ && c == '=')
3390     return 1;
3391
3392   switch (a)
3393     {
3394     case CPP_GREATER:   return c == '>';
3395     case CPP_LESS:      return c == '<' || c == '%' || c == ':';
3396     case CPP_PLUS:      return c == '+';
3397     case CPP_MINUS:     return c == '-' || c == '>';
3398     case CPP_DIV:       return c == '/' || c == '*'; /* Comments.  */
3399     case CPP_MOD:       return c == ':' || c == '>';
3400     case CPP_AND:       return c == '&';
3401     case CPP_OR:        return c == '|';
3402     case CPP_COLON:     return c == ':' || c == '>';
3403     case CPP_DEREF:     return c == '*';
3404     case CPP_DOT:       return c == '.' || c == '%' || b == CPP_NUMBER;
3405     case CPP_HASH:      return c == '#' || c == '%'; /* Digraph form.  */
3406     case CPP_NAME:      return ((b == CPP_NUMBER
3407                                  && name_p (pfile, &token2->val.str))
3408                                 || b == CPP_NAME
3409                                 || b == CPP_CHAR || b == CPP_STRING); /* L */
3410     case CPP_NUMBER:    return (b == CPP_NUMBER || b == CPP_NAME
3411                                 || c == '.' || c == '+' || c == '-');
3412                                       /* UCNs */
3413     case CPP_OTHER:     return ((token1->val.str.text[0] == '\\'
3414                                  && b == CPP_NAME)
3415                                 || (CPP_OPTION (pfile, objc)
3416                                     && token1->val.str.text[0] == '@'
3417                                     && (b == CPP_NAME || b == CPP_STRING)));
3418     case CPP_STRING:
3419     case CPP_WSTRING:
3420     case CPP_UTF8STRING:
3421     case CPP_STRING16:
3422     case CPP_STRING32:  return (CPP_OPTION (pfile, user_literals)
3423                                 && (b == CPP_NAME
3424                                     || (TOKEN_SPELL (token2) == SPELL_LITERAL
3425                                         && ISIDST (token2->val.str.text[0]))));
3426
3427     default:            break;
3428     }
3429
3430   return 0;
3431 }
3432
3433 /* Output all the remaining tokens on the current line, and a newline
3434    character, to FP.  Leading whitespace is removed.  If there are
3435    macros, special token padding is not performed.  */
3436 void
3437 cpp_output_line (cpp_reader *pfile, FILE *fp)
3438 {
3439   const cpp_token *token;
3440
3441   token = cpp_get_token (pfile);
3442   while (token->type != CPP_EOF)
3443     {
3444       cpp_output_token (token, fp);
3445       token = cpp_get_token (pfile);
3446       if (token->flags & PREV_WHITE)
3447         putc (' ', fp);
3448     }
3449
3450   putc ('\n', fp);
3451 }
3452
3453 /* Return a string representation of all the remaining tokens on the
3454    current line.  The result is allocated using xmalloc and must be
3455    freed by the caller.  */
3456 unsigned char *
3457 cpp_output_line_to_string (cpp_reader *pfile, const unsigned char *dir_name)
3458 {
3459   const cpp_token *token;
3460   unsigned int out = dir_name ? ustrlen (dir_name) : 0;
3461   unsigned int alloced = 120 + out;
3462   unsigned char *result = (unsigned char *) xmalloc (alloced);
3463
3464   /* If DIR_NAME is empty, there are no initial contents.  */
3465   if (dir_name)
3466     {
3467       sprintf ((char *) result, "#%s ", dir_name);
3468       out += 2;
3469     }
3470
3471   token = cpp_get_token (pfile);
3472   while (token->type != CPP_EOF)
3473     {
3474       unsigned char *last;
3475       /* Include room for a possible space and the terminating nul.  */
3476       unsigned int len = cpp_token_len (token) + 2;
3477
3478       if (out + len > alloced)
3479         {
3480           alloced *= 2;
3481           if (out + len > alloced)
3482             alloced = out + len;
3483           result = (unsigned char *) xrealloc (result, alloced);
3484         }
3485
3486       last = cpp_spell_token (pfile, token, &result[out], 0);
3487       out = last - result;
3488
3489       token = cpp_get_token (pfile);
3490       if (token->flags & PREV_WHITE)
3491         result[out++] = ' ';
3492     }
3493
3494   result[out] = '\0';
3495   return result;
3496 }
3497
/* Memory buffers.  Changing these three constants can have a dramatic
   effect on performance.  The values here are reasonable defaults,
   but might be tuned.  If you adjust them, be sure to test across a
   range of uses of cpplib, including heavy nested function-like macro
   expansion.  Also check the change in peak memory usage (NJAMD is a
   good tool for this).

   MIN_BUFF_SIZE is the smallest usable size given to a new buffer.
   BUFF_SIZE_UPPER_BOUND (MIN_SIZE) is the largest free buffer that
   will be reused to satisfy a request for MIN_SIZE bytes.
   EXTENDED_BUFF_SIZE (BUFF, MIN_EXTRA) is the size requested when BUFF
   is extended: MIN_EXTRA plus twice the room still left in BUFF.
   Every macro argument is parenthesized so that an expression argument
   is evaluated as a unit.  */
#define MIN_BUFF_SIZE 8000
#define BUFF_SIZE_UPPER_BOUND(MIN_SIZE) (MIN_BUFF_SIZE + (MIN_SIZE) * 3 / 2)
#define EXTENDED_BUFF_SIZE(BUFF, MIN_EXTRA) \
        ((MIN_EXTRA) + ((BUFF)->limit - (BUFF)->cur) * 2)

#if MIN_BUFF_SIZE > BUFF_SIZE_UPPER_BOUND (0)
  #error BUFF_SIZE_UPPER_BOUND must be at least as large as MIN_BUFF_SIZE!
#endif
3512
3513 /* Create a new allocation buffer.  Place the control block at the end
3514    of the buffer, so that buffer overflows will cause immediate chaos.  */
3515 static _cpp_buff *
3516 new_buff (size_t len)
3517 {
3518   _cpp_buff *result;
3519   unsigned char *base;
3520
3521   if (len < MIN_BUFF_SIZE)
3522     len = MIN_BUFF_SIZE;
3523   len = CPP_ALIGN (len);
3524
3525 #ifdef ENABLE_VALGRIND_ANNOTATIONS
3526   /* Valgrind warns about uses of interior pointers, so put _cpp_buff
3527      struct first.  */
3528   size_t slen = CPP_ALIGN2 (sizeof (_cpp_buff), 2 * DEFAULT_ALIGNMENT);
3529   base = XNEWVEC (unsigned char, len + slen);
3530   result = (_cpp_buff *) base;
3531   base += slen;
3532 #else
3533   base = XNEWVEC (unsigned char, len + sizeof (_cpp_buff));
3534   result = (_cpp_buff *) (base + len);
3535 #endif
3536   result->base = base;
3537   result->cur = base;
3538   result->limit = base + len;
3539   result->next = NULL;
3540   return result;
3541 }
3542
3543 /* Place a chain of unwanted allocation buffers on the free list.  */
3544 void
3545 _cpp_release_buff (cpp_reader *pfile, _cpp_buff *buff)
3546 {
3547   _cpp_buff *end = buff;
3548
3549   while (end->next)
3550     end = end->next;
3551   end->next = pfile->free_buffs;
3552   pfile->free_buffs = buff;
3553 }
3554
3555 /* Return a free buffer of size at least MIN_SIZE.  */
3556 _cpp_buff *
3557 _cpp_get_buff (cpp_reader *pfile, size_t min_size)
3558 {
3559   _cpp_buff *result, **p;
3560
3561   for (p = &pfile->free_buffs;; p = &(*p)->next)
3562     {
3563       size_t size;
3564
3565       if (*p == NULL)
3566         return new_buff (min_size);
3567       result = *p;
3568       size = result->limit - result->base;
3569       /* Return a buffer that's big enough, but don't waste one that's
3570          way too big.  */
3571       if (size >= min_size && size <= BUFF_SIZE_UPPER_BOUND (min_size))
3572         break;
3573     }
3574
3575   *p = result->next;
3576   result->next = NULL;
3577   result->cur = result->base;
3578   return result;
3579 }
3580
3581 /* Creates a new buffer with enough space to hold the uncommitted
3582    remaining bytes of BUFF, and at least MIN_EXTRA more bytes.  Copies
3583    the excess bytes to the new buffer.  Chains the new buffer after
3584    BUFF, and returns the new buffer.  */
3585 _cpp_buff *
3586 _cpp_append_extend_buff (cpp_reader *pfile, _cpp_buff *buff, size_t min_extra)
3587 {
3588   size_t size = EXTENDED_BUFF_SIZE (buff, min_extra);
3589   _cpp_buff *new_buff = _cpp_get_buff (pfile, size);
3590
3591   buff->next = new_buff;
3592   memcpy (new_buff->base, buff->cur, BUFF_ROOM (buff));
3593   return new_buff;
3594 }
3595
3596 /* Creates a new buffer with enough space to hold the uncommitted
3597    remaining bytes of the buffer pointed to by BUFF, and at least
3598    MIN_EXTRA more bytes.  Copies the excess bytes to the new buffer.
3599    Chains the new buffer before the buffer pointed to by BUFF, and
3600    updates the pointer to point to the new buffer.  */
3601 void
3602 _cpp_extend_buff (cpp_reader *pfile, _cpp_buff **pbuff, size_t min_extra)
3603 {
3604   _cpp_buff *new_buff, *old_buff = *pbuff;
3605   size_t size = EXTENDED_BUFF_SIZE (old_buff, min_extra);
3606
3607   new_buff = _cpp_get_buff (pfile, size);
3608   memcpy (new_buff->base, old_buff->cur, BUFF_ROOM (old_buff));
3609   new_buff->next = old_buff;
3610   *pbuff = new_buff;
3611 }
3612
3613 /* Free a chain of buffers starting at BUFF.  */
3614 void
3615 _cpp_free_buff (_cpp_buff *buff)
3616 {
3617   _cpp_buff *next;
3618
3619   for (; buff; buff = next)
3620     {
3621       next = buff->next;
3622 #ifdef ENABLE_VALGRIND_ANNOTATIONS
3623       free (buff);
3624 #else
3625       free (buff->base);
3626 #endif
3627     }
3628 }
3629
3630 /* Allocate permanent, unaligned storage of length LEN.  */
3631 unsigned char *
3632 _cpp_unaligned_alloc (cpp_reader *pfile, size_t len)
3633 {
3634   _cpp_buff *buff = pfile->u_buff;
3635   unsigned char *result = buff->cur;
3636
3637   if (len > (size_t) (buff->limit - result))
3638     {
3639       buff = _cpp_get_buff (pfile, len);
3640       buff->next = pfile->u_buff;
3641       pfile->u_buff = buff;
3642       result = buff->cur;
3643     }
3644
3645   buff->cur = result + len;
3646   return result;
3647 }
3648
3649 /* Allocate permanent, unaligned storage of length LEN from a_buff.
3650    That buffer is used for growing allocations when saving macro
3651    replacement lists in a #define, and when parsing an answer to an
3652    assertion in #assert, #unassert or #if (and therefore possibly
3653    whilst expanding macros).  It therefore must not be used by any
3654    code that they might call: specifically the lexer and the guts of
3655    the macro expander.
3656
3657    All existing other uses clearly fit this restriction: storing
3658    registered pragmas during initialization.  */
3659 unsigned char *
3660 _cpp_aligned_alloc (cpp_reader *pfile, size_t len)
3661 {
3662   _cpp_buff *buff = pfile->a_buff;
3663   unsigned char *result = buff->cur;
3664
3665   if (len > (size_t) (buff->limit - result))
3666     {
3667       buff = _cpp_get_buff (pfile, len);
3668       buff->next = pfile->a_buff;
3669       pfile->a_buff = buff;
3670       result = buff->cur;
3671     }
3672
3673   buff->cur = result + len;
3674   return result;
3675 }
3676
3677 /* Say which field of TOK is in use.  */
3678
3679 enum cpp_token_fld_kind
3680 cpp_token_val_index (const cpp_token *tok)
3681 {
3682   switch (TOKEN_SPELL (tok))
3683     {
3684     case SPELL_IDENT:
3685       return CPP_TOKEN_FLD_NODE;
3686     case SPELL_LITERAL:
3687       return CPP_TOKEN_FLD_STR;
3688     case SPELL_OPERATOR:
3689       if (tok->type == CPP_PASTE)
3690         return CPP_TOKEN_FLD_TOKEN_NO;
3691       else
3692         return CPP_TOKEN_FLD_NONE;
3693     case SPELL_NONE:
3694       if (tok->type == CPP_MACRO_ARG)
3695         return CPP_TOKEN_FLD_ARG_NO;
3696       else if (tok->type == CPP_PADDING)
3697         return CPP_TOKEN_FLD_SOURCE;
3698       else if (tok->type == CPP_PRAGMA)
3699         return CPP_TOKEN_FLD_PRAGMA;
3700       /* fall through */
3701     default:
3702       return CPP_TOKEN_FLD_NONE;
3703     }
3704 }
3705
3706 /* All tokens lexed in R after calling this function will be forced to have
3707    their source_location the same as the location referenced by P, until
3708    cpp_stop_forcing_token_locations is called for R.  */
3709
3710 void
3711 cpp_force_token_locations (cpp_reader *r, source_location *p)
3712 {
3713   r->forced_token_location_p = p;
3714 }
3715
3716 /* Go back to assigning locations naturally for lexed tokens.  */
3717
3718 void
3719 cpp_stop_forcing_token_locations (cpp_reader *r)
3720 {
3721   r->forced_token_location_p = NULL;
3722 }