libcpp/lex.c

   1 /* CPP Library - lexical analysis.
   2    Copyright (C) 2000-2017 Free Software Foundation, Inc.
   3    Contributed by Per Bothner, 1994-95.
   4    Based on CCCP program by Paul Rubin, June 1986
   5    Adapted to ANSI C, Richard Stallman, Jan 1987
   6    Broken out to separate file, Zack Weinberg, Mar 2000
   7
   8 This program is free software; you can redistribute it and/or modify it
   9 under the terms of the GNU General Public License as published by the
  10 Free Software Foundation; either version 3, or (at your option) any
  11 later version.
  12
  13 This program is distributed in the hope that it will be useful,
  14 but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16 GNU General Public License for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with this program; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "cpplib.h"
  25 #include "internal.h"
  26
  27 enum spell_type
  28 {
  29   SPELL_OPERATOR = 0,
  30   SPELL_IDENT,
  31   SPELL_LITERAL,
  32   SPELL_NONE
  33 };
  34
  35 struct token_spelling
  36 {
  37   enum spell_type category;
  38   const unsigned char *name;
  39 };
  40
  41 static const unsigned char *const digraph_spellings[] =
  42 { UC"%:", UC"%:%:", UC"<:", UC":>", UC"<%", UC"%>" };
  43
  44 #define OP(e, s) { SPELL_OPERATOR, UC s  },
  45 #define TK(e, s) { SPELL_ ## s,    UC #e },
  46 static const struct token_spelling token_spellings[N_TTYPES] = { TTYPE_TABLE };
  47 #undef OP
  48 #undef TK
  49
  50 #define TOKEN_SPELL(token) (token_spellings[(token)->type].category)
  51 #define TOKEN_NAME(token) (token_spellings[(token)->type].name)
  52
  53 static void add_line_note (cpp_buffer *, const uchar *, unsigned int);
  54 static int skip_line_comment (cpp_reader *);
  55 static void skip_whitespace (cpp_reader *, cppchar_t);
  56 static void lex_string (cpp_reader *, cpp_token *, const uchar *);
  57 static void save_comment (cpp_reader *, cpp_token *, const uchar *, cppchar_t);
  58 static void store_comment (cpp_reader *, cpp_token *);
  59 static void create_literal (cpp_reader *, cpp_token *, const uchar *,
  60                             unsigned int, enum cpp_ttype);
  61 static bool warn_in_comment (cpp_reader *, _cpp_line_note *);
  62 static int name_p (cpp_reader *, const cpp_string *);
  63 static tokenrun *next_tokenrun (tokenrun *);
  64
  65 static _cpp_buff *new_buff (size_t);
  66
  67
  68 /* Utility routine:
  69
  70    Compares, the token TOKEN to the NUL-terminated string STRING.
  71    TOKEN must be a CPP_NAME.  Returns 1 for equal, 0 for unequal.  */
  72 int
  73 cpp_ideq (const cpp_token *token, const char *string)
  74 {
  75   if (token->type != CPP_NAME)
  76     return 0;
  77
  78   return !ustrcmp (NODE_NAME (token->val.node.node), (const uchar *) string);
  79 }
  80
  81 /* Record a note TYPE at byte POS into the current cleaned logical
  82    line.  */
  83 static void
  84 add_line_note (cpp_buffer *buffer, const uchar *pos, unsigned int type)
  85 {
  86   if (buffer->notes_used == buffer->notes_cap)
  87     {
  88       buffer->notes_cap = buffer->notes_cap * 2 + 200;
  89       buffer->notes = XRESIZEVEC (_cpp_line_note, buffer->notes,
  90                                   buffer->notes_cap);
  91     }
  92
  93   buffer->notes[buffer->notes_used].pos = pos;
  94   buffer->notes[buffer->notes_used].type = type;
  95   buffer->notes_used++;
  96 }
  97
  98 \f
  99 /* Fast path to find line special characters using optimized character
 100    scanning algorithms.  Anything complicated falls back to the slow
 101    path below.  Since this loop is very hot it's worth doing these kinds
 102    of optimizations.
 103
 104    One of the paths through the ifdefs should provide
 105
 106      const uchar *search_line_fast (const uchar *s, const uchar *end);
 107
 108    Between S and END, search for \n, \r, \\, ?.  Return a pointer to
 109    the found character.
 110
 111    Note that the last character of the buffer is *always* a newline,
 112    as forced by _cpp_convert_input.  This fact can be used to avoid
 113    explicitly looking for the end of the buffer.  */
 114
 115 /* Configure gives us an ifdef test.  */
 116 #ifndef WORDS_BIGENDIAN
 117 #define WORDS_BIGENDIAN 0
 118 #endif
 119
 120 /* We'd like the largest integer that fits into a register.  There's nothing
 121    in <stdint.h> that gives us that.  For most hosts this is unsigned long,
 122    but MS decided on an LLP64 model.  Thankfully when building with GCC we
 123    can get the "real" word size.  */
 124 #ifdef __GNUC__
 125 typedef unsigned int word_type __attribute__((__mode__(__word__)));
 126 #else
 127 typedef unsigned long word_type;
 128 #endif
 129
 130 /* The code below is only expecting sizes 4 or 8.
 131    Die at compile-time if this expectation is violated.  */
 132 typedef char check_word_type_size
 133   [(sizeof(word_type) == 8 || sizeof(word_type) == 4) * 2 - 1];
 134
 135 /* Return X with the first N bytes forced to values that won't match one
 136    of the interesting characters.  Note that NUL is not interesting.  */
 137
 138 static inline word_type
 139 acc_char_mask_misalign (word_type val, unsigned int n)
 140 {
 141   word_type mask = -1;
 142   if (WORDS_BIGENDIAN)
 143     mask >>= n * 8;
 144   else
 145     mask <<= n * 8;
 146   return val & mask;
 147 }
 148
 149 /* Return X replicated to all byte positions within WORD_TYPE.  */
 150
 151 static inline word_type
 152 acc_char_replicate (uchar x)
 153 {
 154   word_type ret;
 155
 156   ret = (x << 24) | (x << 16) | (x << 8) | x;
 157   if (sizeof(word_type) == 8)
 158     ret = (ret << 16 << 16) | ret;
 159   return ret;
 160 }
 161
 162 /* Return non-zero if some byte of VAL is (probably) C.  */
 163
 164 static inline word_type
 165 acc_char_cmp (word_type val, word_type c)
 166 {
 167 #if defined(__GNUC__) && defined(__alpha__)
 168   /* We can get exact results using a compare-bytes instruction.
 169      Get (val == c) via (0 >= (val ^ c)).  */
 170   return __builtin_alpha_cmpbge (0, val ^ c);
 171 #else
 172   word_type magic = 0x7efefefeU;
 173   if (sizeof(word_type) == 8)
 174     magic = (magic << 16 << 16) | 0xfefefefeU;
 175   magic |= 1;
 176
 177   val ^= c;
 178   return ((val + magic) ^ ~val) & ~magic;
 179 #endif
 180 }
 181
 182 /* Given the result of acc_char_cmp is non-zero, return the index of
 183    the found character.  If this was a false positive, return -1.  */
 184
 185 static inline int
 186 acc_char_index (word_type cmp ATTRIBUTE_UNUSED,
 187                 word_type val ATTRIBUTE_UNUSED)
 188 {
 189 #if defined(__GNUC__) && defined(__alpha__) && !WORDS_BIGENDIAN
 190   /* The cmpbge instruction sets *bits* of the result corresponding to
 191      matches in the bytes with no false positives.  */
 192   return __builtin_ctzl (cmp);
 193 #else
 194   unsigned int i;
 195
 196   /* ??? It would be nice to force unrolling here,
 197      and have all of these constants folded.  */
 198   for (i = 0; i < sizeof(word_type); ++i)
 199     {
 200       uchar c;
 201       if (WORDS_BIGENDIAN)
 202         c = (val >> (sizeof(word_type) - i - 1) * 8) & 0xff;
 203       else
 204         c = (val >> i * 8) & 0xff;
 205
 206       if (c == '\n' || c == '\r' || c == '\\' || c == '?')
 207         return i;
 208     }
 209
 210   return -1;
 211 #endif
 212 }
 213
 214 /* A version of the fast scanner using bit fiddling techniques.
 215
 216    For 32-bit words, one would normally perform 16 comparisons and
 217    16 branches.  With this algorithm one performs 24 arithmetic
 218    operations and one branch.  Whether this is faster with a 32-bit
 219    word size is going to be somewhat system dependent.
 220
 221    For 64-bit words, we eliminate twice the number of comparisons
 222    and branches without increasing the number of arithmetic operations.
 223    It's almost certainly going to be a win with 64-bit word size.  */
 224
 225 static const uchar * search_line_acc_char (const uchar *, const uchar *)
 226   ATTRIBUTE_UNUSED;
 227
 228 static const uchar *
 229 search_line_acc_char (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 230 {
 231   const word_type repl_nl = acc_char_replicate ('\n');
 232   const word_type repl_cr = acc_char_replicate ('\r');
 233   const word_type repl_bs = acc_char_replicate ('\\');
 234   const word_type repl_qm = acc_char_replicate ('?');
 235
 236   unsigned int misalign;
 237   const word_type *p;
 238   word_type val, t;
 239
 240   /* Align the buffer.  Mask out any bytes from before the beginning.  */
 241   p = (word_type *)((uintptr_t)s & -sizeof(word_type));
 242   val = *p;
 243   misalign = (uintptr_t)s & (sizeof(word_type) - 1);
 244   if (misalign)
 245     val = acc_char_mask_misalign (val, misalign);
 246
 247   /* Main loop.  */
 248   while (1)
 249     {
 250       t  = acc_char_cmp (val, repl_nl);
 251       t |= acc_char_cmp (val, repl_cr);
 252       t |= acc_char_cmp (val, repl_bs);
 253       t |= acc_char_cmp (val, repl_qm);
 254
 255       if (__builtin_expect (t != 0, 0))
 256         {
 257           int i = acc_char_index (t, val);
 258           if (i >= 0)
 259             return (const uchar *)p + i;
 260         }
 261
 262       val = *++p;
 263     }
 264 }
 265
 266 /* Disable on Solaris 2/x86 until the following problem can be properly
 267    autoconfed:
 268
 269    The Solaris 10+ assembler tags objects with the instruction set
 270    extensions used, so SSE4.2 executables cannot run on machines that
 271    don't support that extension.  */
 272
 273 #if (GCC_VERSION >= 4005) && (__GNUC__ >= 5 || !defined(__PIC__)) && (defined(__i386__) || defined(__x86_64__)) && !(defined(__sun__) && defined(__svr4__))
 274
 275 /* Replicated character data to be shared between implementations.
 276    Recall that outside of a context with vector support we can't
 277    define compatible vector types, therefore these are all defined
 278    in terms of raw characters.  */
 279 static const char repl_chars[4][16] __attribute__((aligned(16))) = {
 280   { '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
 281     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n' },
 282   { '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
 283     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r' },
 284   { '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
 285     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\' },
 286   { '?', '?', '?', '?', '?', '?', '?', '?',
 287     '?', '?', '?', '?', '?', '?', '?', '?' },
 288 };
 289
 290 /* A version of the fast scanner using MMX vectorized byte compare insns.
 291
 292    This uses the PMOVMSKB instruction which was introduced with "MMX2",
 293    which was packaged into SSE1; it is also present in the AMD MMX
 294    extension.  Mark the function as using "sse" so that we emit a real
 295    "emms" instruction, rather than the 3dNOW "femms" instruction.  */
 296
 297 static const uchar *
 298 #ifndef __SSE__
 299 __attribute__((__target__("sse")))
 300 #endif
 301 search_line_mmx (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 302 {
 303   typedef char v8qi __attribute__ ((__vector_size__ (8)));
 304   typedef int __m64 __attribute__ ((__vector_size__ (8), __may_alias__));
 305
 306   const v8qi repl_nl = *(const v8qi *)repl_chars[0];
 307   const v8qi repl_cr = *(const v8qi *)repl_chars[1];
 308   const v8qi repl_bs = *(const v8qi *)repl_chars[2];
 309   const v8qi repl_qm = *(const v8qi *)repl_chars[3];
 310
 311   unsigned int misalign, found, mask;
 312   const v8qi *p;
 313   v8qi data, t, c;
 314
 315   /* Align the source pointer.  While MMX doesn't generate unaligned data
 316      faults, this allows us to safely scan to the end of the buffer without
 317      reading beyond the end of the last page.  */
 318   misalign = (uintptr_t)s & 7;
 319   p = (const v8qi *)((uintptr_t)s & -8);
 320   data = *p;
 321
 322   /* Create a mask for the bytes that are valid within the first
 323      16-byte block.  The Idea here is that the AND with the mask
 324      within the loop is "free", since we need some AND or TEST
 325      insn in order to set the flags for the branch anyway.  */
 326   mask = -1u << misalign;
 327
 328   /* Main loop processing 8 bytes at a time.  */
 329   goto start;
 330   do
 331     {
 332       data = *++p;
 333       mask = -1;
 334
 335     start:
 336       t = __builtin_ia32_pcmpeqb(data, repl_nl);
 337       c = __builtin_ia32_pcmpeqb(data, repl_cr);
 338       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
 339       c = __builtin_ia32_pcmpeqb(data, repl_bs);
 340       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
 341       c = __builtin_ia32_pcmpeqb(data, repl_qm);
 342       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
 343       found = __builtin_ia32_pmovmskb (t);
 344       found &= mask;
 345     }
 346   while (!found);
 347
 348   __builtin_ia32_emms ();
 349
 350   /* FOUND contains 1 in bits for which we matched a relevant
 351      character.  Conversion to the byte index is trivial.  */
 352   found = __builtin_ctz(found);
 353   return (const uchar *)p + found;
 354 }
 355
 356 /* A version of the fast scanner using SSE2 vectorized byte compare insns.  */
 357
 358 static const uchar *
 359 #ifndef __SSE2__
 360 __attribute__((__target__("sse2")))
 361 #endif
 362 search_line_sse2 (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 363 {
 364   typedef char v16qi __attribute__ ((__vector_size__ (16)));
 365
 366   const v16qi repl_nl = *(const v16qi *)repl_chars[0];
 367   const v16qi repl_cr = *(const v16qi *)repl_chars[1];
 368   const v16qi repl_bs = *(const v16qi *)repl_chars[2];
 369   const v16qi repl_qm = *(const v16qi *)repl_chars[3];
 370
 371   unsigned int misalign, found, mask;
 372   const v16qi *p;
 373   v16qi data, t;
 374
 375   /* Align the source pointer.  */
 376   misalign = (uintptr_t)s & 15;
 377   p = (const v16qi *)((uintptr_t)s & -16);
 378   data = *p;
 379
 380   /* Create a mask for the bytes that are valid within the first
 381      16-byte block.  The Idea here is that the AND with the mask
 382      within the loop is "free", since we need some AND or TEST
 383      insn in order to set the flags for the branch anyway.  */
 384   mask = -1u << misalign;
 385
 386   /* Main loop processing 16 bytes at a time.  */
 387   goto start;
 388   do
 389     {
 390       data = *++p;
 391       mask = -1;
 392
 393     start:
 394       t  = __builtin_ia32_pcmpeqb128(data, repl_nl);
 395       t |= __builtin_ia32_pcmpeqb128(data, repl_cr);
 396       t |= __builtin_ia32_pcmpeqb128(data, repl_bs);
 397       t |= __builtin_ia32_pcmpeqb128(data, repl_qm);
 398       found = __builtin_ia32_pmovmskb128 (t);
 399       found &= mask;
 400     }
 401   while (!found);
 402
 403   /* FOUND contains 1 in bits for which we matched a relevant
 404      character.  Conversion to the byte index is trivial.  */
 405   found = __builtin_ctz(found);
 406   return (const uchar *)p + found;
 407 }
 408
 409 #ifdef HAVE_SSE4
 410 /* A version of the fast scanner using SSE 4.2 vectorized string insns.  */
 411
 412 static const uchar *
 413 #ifndef __SSE4_2__
 414 __attribute__((__target__("sse4.2")))
 415 #endif
 416 search_line_sse42 (const uchar *s, const uchar *end)
 417 {
 418   typedef char v16qi __attribute__ ((__vector_size__ (16)));
 419   static const v16qi search = { '\n', '\r', '?', '\\' };
 420
 421   uintptr_t si = (uintptr_t)s;
 422   uintptr_t index;
 423
 424   /* Check for unaligned input.  */
 425   if (si & 15)
 426     {
 427       v16qi sv;
 428
 429       if (__builtin_expect (end - s < 16, 0)
 430           && __builtin_expect ((si & 0xfff) > 0xff0, 0))
 431         {
 432           /* There are less than 16 bytes left in the buffer, and less
 433              than 16 bytes left on the page.  Reading 16 bytes at this
 434              point might generate a spurious page fault.  Defer to the
 435              SSE2 implementation, which already handles alignment.  */
 436           return search_line_sse2 (s, end);
 437         }
 438
 439       /* ??? The builtin doesn't understand that the PCMPESTRI read from
 440          memory need not be aligned.  */
 441       sv = __builtin_ia32_loaddqu ((const char *) s);
 442       index = __builtin_ia32_pcmpestri128 (search, 4, sv, 16, 0);
 443
 444       if (__builtin_expect (index < 16, 0))
 445         goto found;
 446
 447       /* Advance the pointer to an aligned address.  We will re-scan a
 448          few bytes, but we no longer need care for reading past the
 449          end of a page, since we're guaranteed a match.  */
 450       s = (const uchar *)((si + 15) & -16);
 451     }
 452
 453   /* Main loop, processing 16 bytes at a time.  */
 454 #ifdef __GCC_ASM_FLAG_OUTPUTS__
 455   while (1)
 456     {
 457       char f;
 458
 459       /* By using inline assembly instead of the builtin,
 460          we can use the result, as well as the flags set.  */
 461       __asm ("%vpcmpestri\t$0, %2, %3"
 462              : "=c"(index), "=@ccc"(f)
 463              : "m"(*s), "x"(search), "a"(4), "d"(16));
 464       if (f)
 465         break;
 466
 467       s += 16;
 468     }
 469 #else
 470   s -= 16;
 471   /* By doing the whole loop in inline assembly,
 472      we can make proper use of the flags set.  */
 473   __asm (      ".balign 16\n"
 474         "0:     add $16, %1\n"
 475         "       %vpcmpestri\t$0, (%1), %2\n"
 476         "       jnc 0b"
 477         : "=&c"(index), "+r"(s)
 478         : "x"(search), "a"(4), "d"(16));
 479 #endif
 480
 481  found:
 482   return s + index;
 483 }
 484
 485 #else
 486 /* Work around out-dated assemblers without sse4 support.  */
 487 #define search_line_sse42 search_line_sse2
 488 #endif
 489
 490 /* Check the CPU capabilities.  */
 491
 492 #include "../gcc/config/i386/cpuid.h"
 493
 494 typedef const uchar * (*search_line_fast_type) (const uchar *, const uchar *);
 495 static search_line_fast_type search_line_fast;
 496
 497 #define HAVE_init_vectorized_lexer 1
 498 static inline void
 499 init_vectorized_lexer (void)
 500 {
 501   unsigned dummy, ecx = 0, edx = 0;
 502   search_line_fast_type impl = search_line_acc_char;
 503   int minimum = 0;
 504
 505 #if defined(__SSE4_2__)
 506   minimum = 3;
 507 #elif defined(__SSE2__)
 508   minimum = 2;
 509 #elif defined(__SSE__)
 510   minimum = 1;
 511 #endif
 512
 513   if (minimum == 3)
 514     impl = search_line_sse42;
 515   else if (__get_cpuid (1, &dummy, &dummy, &ecx, &edx) || minimum == 2)
 516     {
 517       if (minimum == 3 || (ecx & bit_SSE4_2))
 518         impl = search_line_sse42;
 519       else if (minimum == 2 || (edx & bit_SSE2))
 520         impl = search_line_sse2;
 521       else if (minimum == 1 || (edx & bit_SSE))
 522         impl = search_line_mmx;
 523     }
 524   else if (__get_cpuid (0x80000001, &dummy, &dummy, &dummy, &edx))
 525     {
 526       if (minimum == 1
 527           || (edx & (bit_MMXEXT | bit_CMOV)) == (bit_MMXEXT | bit_CMOV))
 528         impl = search_line_mmx;
 529     }
 530
 531   search_line_fast = impl;
 532 }
 533
 534 #elif defined(_ARCH_PWR8) && defined(__ALTIVEC__)
 535
 536 /* A vection of the fast scanner using AltiVec vectorized byte compares
 537    and VSX unaligned loads (when VSX is available).  This is otherwise
 538    the same as the pre-GCC 5 version.  */
 539
 540 ATTRIBUTE_NO_SANITIZE_UNDEFINED
 541 static const uchar *
 542 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 543 {
 544   typedef __attribute__((altivec(vector))) unsigned char vc;
 545
 546   const vc repl_nl = {
 547     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
 548     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
 549   };
 550   const vc repl_cr = {
 551     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
 552     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
 553   };
 554   const vc repl_bs = {
 555     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
 556     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
 557   };
 558   const vc repl_qm = {
 559     '?', '?', '?', '?', '?', '?', '?', '?',
 560     '?', '?', '?', '?', '?', '?', '?', '?',
 561   };
 562   const vc zero = { 0 };
 563
 564   vc data, t;
 565
 566   /* Main loop processing 16 bytes at a time.  */
 567   do
 568     {
 569       vc m_nl, m_cr, m_bs, m_qm;
 570
 571       data = *((const vc *)s);
 572       s += 16;
 573
 574       m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
 575       m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
 576       m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
 577       m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
 578       t = (m_nl | m_cr) | (m_bs | m_qm);
 579
 580       /* T now contains 0xff in bytes for which we matched one of the relevant
 581          characters.  We want to exit the loop if any byte in T is non-zero.
 582          Below is the expansion of vec_any_ne(t, zero).  */
 583     }
 584   while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));
 585
 586   /* Restore s to to point to the 16 bytes we just processed.  */
 587   s -= 16;
 588
 589   {
 590 #define N  (sizeof(vc) / sizeof(long))
 591
 592     union {
 593       vc v;
 594       /* Statically assert that N is 2 or 4.  */
 595       unsigned long l[(N == 2 || N == 4) ? N : -1];
 596     } u;
 597     unsigned long l, i = 0;
 598
 599     u.v = t;
 600
 601     /* Find the first word of T that is non-zero.  */
 602     switch (N)
 603       {
 604       case 4:
 605         l = u.l[i++];
 606         if (l != 0)
 607           break;
 608         s += sizeof(unsigned long);
 609         l = u.l[i++];
 610         if (l != 0)
 611           break;
 612         s += sizeof(unsigned long);
 613         /* FALLTHRU */
 614       case 2:
 615         l = u.l[i++];
 616         if (l != 0)
 617           break;
 618         s += sizeof(unsigned long);
 619         l = u.l[i];
 620       }
 621
 622     /* L now contains 0xff in bytes for which we matched one of the
 623        relevant characters.  We can find the byte index by finding
 624        its bit index and dividing by 8.  */
 625 #ifdef __BIG_ENDIAN__
 626     l = __builtin_clzl(l) >> 3;
 627 #else
 628     l = __builtin_ctzl(l) >> 3;
 629 #endif
 630     return s + l;
 631
 632 #undef N
 633   }
 634 }
 635
 636 #elif (GCC_VERSION >= 4005) && defined(__ALTIVEC__) && defined (__BIG_ENDIAN__)
 637
 638 /* A vection of the fast scanner using AltiVec vectorized byte compares.
 639    This cannot be used for little endian because vec_lvsl/lvsr are
 640    deprecated for little endian and the code won't work properly.  */
 641 /* ??? Unfortunately, attribute(target("altivec")) is not yet supported,
 642    so we can't compile this function without -maltivec on the command line
 643    (or implied by some other switch).  */
 644
 645 static const uchar *
 646 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 647 {
 648   typedef __attribute__((altivec(vector))) unsigned char vc;
 649
 650   const vc repl_nl = {
 651     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
 652     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
 653   };
 654   const vc repl_cr = {
 655     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
 656     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
 657   };
 658   const vc repl_bs = {
 659     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
 660     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
 661   };
 662   const vc repl_qm = {
 663     '?', '?', '?', '?', '?', '?', '?', '?',
 664     '?', '?', '?', '?', '?', '?', '?', '?',
 665   };
 666   const vc ones = {
 667     -1, -1, -1, -1, -1, -1, -1, -1,
 668     -1, -1, -1, -1, -1, -1, -1, -1,
 669   };
 670   const vc zero = { 0 };
 671
 672   vc data, mask, t;
 673
 674   /* Altivec loads automatically mask addresses with -16.  This lets us
 675      issue the first load as early as possible.  */
 676   data = __builtin_vec_ld(0, (const vc *)s);
 677
 678   /* Discard bytes before the beginning of the buffer.  Do this by
 679      beginning with all ones and shifting in zeros according to the
 680      mis-alignment.  The LVSR instruction pulls the exact shift we
 681      want from the address.  */
 682   mask = __builtin_vec_lvsr(0, s);
 683   mask = __builtin_vec_perm(zero, ones, mask);
 684   data &= mask;
 685
 686   /* While altivec loads mask addresses, we still need to align S so
 687      that the offset we compute at the end is correct.  */
 688   s = (const uchar *)((uintptr_t)s & -16);
 689
 690   /* Main loop processing 16 bytes at a time.  */
 691   goto start;
 692   do
 693     {
 694       vc m_nl, m_cr, m_bs, m_qm;
 695
 696       s += 16;
 697       data = __builtin_vec_ld(0, (const vc *)s);
 698
 699     start:
 700       m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
 701       m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
 702       m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
 703       m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
 704       t = (m_nl | m_cr) | (m_bs | m_qm);
 705
 706       /* T now contains 0xff in bytes for which we matched one of the relevant
 707          characters.  We want to exit the loop if any byte in T is non-zero.
 708          Below is the expansion of vec_any_ne(t, zero).  */
 709     }
 710   while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));
 711
 712   {
 713 #define N  (sizeof(vc) / sizeof(long))
 714
 715     union {
 716       vc v;
 717       /* Statically assert that N is 2 or 4.  */
 718       unsigned long l[(N == 2 || N == 4) ? N : -1];
 719     } u;
 720     unsigned long l, i = 0;
 721
 722     u.v = t;
 723
 724     /* Find the first word of T that is non-zero.  */
 725     switch (N)
 726       {
 727       case 4:
 728         l = u.l[i++];
 729         if (l != 0)
 730           break;
 731         s += sizeof(unsigned long);
 732         l = u.l[i++];
 733         if (l != 0)
 734           break;
 735         s += sizeof(unsigned long);
 736         /* FALLTHROUGH */
 737       case 2:
 738         l = u.l[i++];
 739         if (l != 0)
 740           break;
 741         s += sizeof(unsigned long);
 742         l = u.l[i];
 743       }
 744
 745     /* L now contains 0xff in bytes for which we matched one of the
 746        relevant characters.  We can find the byte index by finding
 747        its bit index and dividing by 8.  */
 748     l = __builtin_clzl(l) >> 3;
 749     return s + l;
 750
 751 #undef N
 752   }
 753 }
 754
 755 #elif defined (__ARM_NEON) && defined (__ARM_64BIT_STATE)
 756 #include "arm_neon.h"
 757
 758 /* This doesn't have to be the exact page size, but no system may use
 759    a size smaller than this.  ARMv8 requires a minimum page size of
 760    4k.  The impact of being conservative here is a small number of
 761    cases will take the slightly slower entry path into the main
 762    loop.  */
 763
 764 #define AARCH64_MIN_PAGE_SIZE 4096
 765
 766 static const uchar *
 767 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 768 {
 769   const uint8x16_t repl_nl = vdupq_n_u8 ('\n');
 770   const uint8x16_t repl_cr = vdupq_n_u8 ('\r');
 771   const uint8x16_t repl_bs = vdupq_n_u8 ('\\');
 772   const uint8x16_t repl_qm = vdupq_n_u8 ('?');
 773   const uint8x16_t xmask = (uint8x16_t) vdupq_n_u64 (0x8040201008040201ULL);
 774
 775 #ifdef __AARCH64EB
 776   const int16x8_t shift = {8, 8, 8, 8, 0, 0, 0, 0};
 777 #else
 778   const int16x8_t shift = {0, 0, 0, 0, 8, 8, 8, 8};
 779 #endif
 780
 781   unsigned int found;
 782   const uint8_t *p;
 783   uint8x16_t data;
 784   uint8x16_t t;
 785   uint16x8_t m;
 786   uint8x16_t u, v, w;
 787
 788   /* Align the source pointer.  */
 789   p = (const uint8_t *)((uintptr_t)s & -16);
 790
 791   /* Assuming random string start positions, with a 4k page size we'll take
 792      the slow path about 0.37% of the time.  */
 793   if (__builtin_expect ((AARCH64_MIN_PAGE_SIZE
 794                          - (((uintptr_t) s) & (AARCH64_MIN_PAGE_SIZE - 1)))
 795                         < 16, 0))
 796     {
 797       /* Slow path: the string starts near a possible page boundary.  */
 798       uint32_t misalign, mask;
 799
 800       misalign = (uintptr_t)s & 15;
 801       mask = (-1u << misalign) & 0xffff;
 802       data = vld1q_u8 (p);
 803       t = vceqq_u8 (data, repl_nl);
 804       u = vceqq_u8 (data, repl_cr);
 805       v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
 806       w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
 807       t = vorrq_u8 (v, w);
 808       t = vandq_u8 (t, xmask);
 809       m = vpaddlq_u8 (t);
 810       m = vshlq_u16 (m, shift);
 811       found = vaddvq_u16 (m);
 812       found &= mask;
 813       if (found)
 814         return (const uchar*)p + __builtin_ctz (found);
 815     }
 816   else
 817     {
 818       data = vld1q_u8 ((const uint8_t *) s);
 819       t = vceqq_u8 (data, repl_nl);
 820       u = vceqq_u8 (data, repl_cr);
 821       v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
 822       w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
 823       t = vorrq_u8 (v, w);
 824       if (__builtin_expect (vpaddd_u64 ((uint64x2_t)t) != 0, 0))
 825         goto done;
 826     }
 827
 828   do
 829     {
 830       p += 16;
 831       data = vld1q_u8 (p);
 832       t = vceqq_u8 (data, repl_nl);
 833       u = vceqq_u8 (data, repl_cr);
 834       v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
 835       w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
 836       t = vorrq_u8 (v, w);
 837     } while (!vpaddd_u64 ((uint64x2_t)t));
 838
 839 done:
 840   /* Now that we've found the terminating substring, work out precisely where
 841      we need to stop.  */
 842   t = vandq_u8 (t, xmask);
 843   m = vpaddlq_u8 (t);
 844   m = vshlq_u16 (m, shift);
 845   found = vaddvq_u16 (m);
 846   return (((((uintptr_t) p) < (uintptr_t) s) ? s : (const uchar *)p)
 847           + __builtin_ctz (found));
 848 }
 849
 850 #elif defined (__ARM_NEON)
 851 #include "arm_neon.h"
 852
 853 static const uchar *
 854 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 855 {
       /* ARM NEON variant.  Returns a pointer to the first '\n', '\r',
          '\\' or '?' at or after S, examining 16 bytes per iteration.
          END is not used and nothing here bounds the scan, so the caller
          presumably guarantees that such a character exists -- TODO
          confirm against the code that sets up the buffer.  */
       /* Vectors with every lane holding one of the characters sought.  */
 856   const uint8x16_t repl_nl = vdupq_n_u8 ('\n');
 857   const uint8x16_t repl_cr = vdupq_n_u8 ('\r');
 858   const uint8x16_t repl_bs = vdupq_n_u8 ('\\');
 859   const uint8x16_t repl_qm = vdupq_n_u8 ('?');
       /* Bit weights 0x01 .. 0x80, repeated in each 8-byte half; ANDing a
          compare result with this leaves a distinct bit per byte of a
          half.  */
 860   const uint8x16_t xmask = (uint8x16_t) vdupq_n_u64 (0x8040201008040201ULL);
 861
 862   unsigned int misalign, found, mask;
 863   const uint8_t *p;
 864   uint8x16_t data;
 865
 866   /* Align the source pointer.  The first load starts at the 16-byte
          boundary at or below S, so it may include bytes that precede S;
          MASK below removes them from consideration.  */
 867   misalign = (uintptr_t)s & 15;
 868   p = (const uint8_t *)((uintptr_t)s & -16);
 869   data = vld1q_u8 (p);
 870
 871   /* Create a mask for the bytes that are valid within the first
 872      16-byte block.  The idea here is that the AND with the mask
 873      within the loop is "free", since we need some AND or TEST
 874      insn in order to set the flags for the branch anyway.
          The low MISALIGN bits of the 16-bit mask are cleared.  */
 875   mask = (-1u << misalign) & 0xffff;
 876
 877   /* Main loop, processing 16 bytes at a time.  It is entered in the
          middle so that the first block uses the DATA and MASK prepared
          above.  */
 878   goto start;
 879
 880   do
 881     {
 882       uint8x8_t l;
 883       uint16x4_t m;
 884       uint32x2_t n;
 885       uint8x16_t t, u, v, w;
 886
           /* Every block after the first is taken whole.  */
 887       p += 16;
 888       data = vld1q_u8 (p);
 889       mask = 0xffff;
 890
 891     start:
           /* Compare against the four characters and merge the results,
              then keep one weight bit in each matching byte.  */
 892       t = vceqq_u8 (data, repl_nl);
 893       u = vceqq_u8 (data, repl_cr);
 894       v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
 895       w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
 896       t = vandq_u8 (vorrq_u8 (v, w), xmask);
           /* Three rounds of pairwise addition collapse each 8-byte half
              into a single value holding that half's match bits.  */
 897       l = vpadd_u8 (vget_low_u8 (t), vget_high_u8 (t));
 898       m = vpaddl_u8 (l);
 899       n = vpaddl_u16 (m);
 900
           /* Merge the two halves into one 16-bit bitmap by shifting the
              upper half's value down next to the lower one's; this
              appears to give one bit per byte of the block in address
              order -- NOTE(review): confirm the lane order assumed.  */
 901       found = vget_lane_u32 ((uint32x2_t) vorr_u64 ((uint64x1_t) n,
 902               vshr_n_u64 ((uint64x1_t) n, 24)), 0);
 903       found &= mask;
 904     }
 905   while (!found);
 906
 907   /* FOUND contains 1 in bits for which we matched a relevant
 908      character.  Conversion to the byte index is trivial.  */
 909   found = __builtin_ctz (found);
 910   return (const uchar *)p + found;
 911 }
 912
 913 #else
 914
 915 /* We only have one accelerated alternative.  Use a direct call so that
 916    we encourage inlining.  This is the fallback branch of the
        preceding conditional: search_line_fast is made a plain alias for
        search_line_acc_char instead of being a function of its own.  */
 917
 918 #define search_line_fast  search_line_acc_char
 919
 920 #endif
 921
 922 /* Lexer start-up hook.  Calls init_vectorized_lexer when the build
 923    defines HAVE_init_vectorized_lexer; otherwise there is nothing to do.  */
 924 void
 925 _cpp_init_lexer (void)
 926 {
 927 #if defined (HAVE_init_vectorized_lexer)
 928   init_vectorized_lexer ();
 929 #endif
 930 }
 931
 932 /* Returns with a logical line that contains no escaped newlines or
 933    trigraphs.  This is a time-critical inner loop.
        The line starts at buffer->next_line on entry; buffer->cur and
        buffer->line_base are set to that position.  Escaped newlines and
        trigraphs are recorded with add_line_note, and as soon as text
        has to be rewritten the remainder of the line is copied down in
        place.  On return the cleaned line ends in '\n' and
        buffer->next_line is one past the position where the scan of the
        raw text stopped.  */
 934 void
 935 _cpp_clean_line (cpp_reader *pfile)
 936 {
 937   cpp_buffer *buffer;
 938   const uchar *s;
 939   uchar c, *d, *p;
 940
 941   buffer = pfile->buffer;
       /* Start afresh: the note counters are reset for the new line.  */
 942   buffer->cur_note = buffer->notes_used = 0;
 943   buffer->cur = buffer->line_base = buffer->next_line;
 944   buffer->need_line = false;
 945   s = buffer->next_line;
 946
 947   if (!buffer->from_stage3)
 948     {
           /* Most recent backslash met on the fast path, if any.  */
 949       const uchar *pbackslash = NULL;
 950
 951       /* Fast path.  This is the common case of an un-escaped line with
 952          no trigraphs.  The primary win here is by not writing any
 953          data back to memory until we have to.  */
 954       while (1)
 955         {
 956           /* Perform an optimized search for \n, \r, \\, ?.  */
 957           s = search_line_fast (s, buffer->rlimit);
 958
 959           c = *s;
 960           if (c == '\\')
 961             {
 962               /* Record the location of the backslash and continue.  */
 963               pbackslash = s++;
 964             }
 965           else if (__builtin_expect (c == '?', 0))
 966             {
 967               if (__builtin_expect (s[1] == '?', false)
 968                    && _cpp_trigraph_map[s[2]])
 969                 {
 970                   /* Have a trigraph.  We may or may not have to convert
 971                      it.  Add a line note regardless, for -Wtrigraphs.  */
 972                   add_line_note (buffer, s, s[2]);
 973                   if (CPP_OPTION (pfile, trigraphs))
 974                     {
 975                       /* We do, and that means we have to switch to the
 976                          slow path.  The first '?' is overwritten with
                              the replacement character and S is left on the
                              trigraph's last character, which the slow
                              path's pre-increment then steps over.  */
 977                       d = (uchar *) s;
 978                       *d = _cpp_trigraph_map[s[2]];
 979                       s += 2;
 980                       goto slow_path;
 981                     }
 982                 }
 983               /* Not a trigraph.  Continue on fast-path.  */
 984               s++;
 985             }
 986           else
 987             break;
 988         }
 989
 990       /* This must be \r or \n.  We're either done, or we'll be forced
 991          to write back to the buffer and continue on the slow path.  */
 992       d = (uchar *) s;
 993
           /* Stopped exactly at the end of the buffer.  */
 994       if (__builtin_expect (s == buffer->rlimit, false))
 995         goto done;
 996
 997       /* DOS line ending? */
 998       if (__builtin_expect (c == '\r', false) && s[1] == '\n')
 999         {
1000           s++;
1001           if (s == buffer->rlimit)
1002             goto done;
1003         }
1004
           /* Without a backslash on the line there can be no splice.  */
1005       if (__builtin_expect (pbackslash == NULL, true))
1006         goto done;
1007
1008       /* Check for escaped newline.  Walk back from the line ending over
              any non-vertical space; the splice is real only if the last
              backslash recorded sits immediately before that space.  */
1009       p = d;
1010       while (is_nvspace (p[-1]))
1011         p--;
1012       if (p - 1 != pbackslash)
1013         goto done;
1014
1015       /* Have an escaped newline; process it and proceed to
1016          the slow path.  The note type is ' ' when space separated
              the backslash from the newline, '\\' otherwise.  D is left
              just before the backslash so that the slow path's first
              store overwrites it, and buffer->next_line temporarily
              marks the backslash position, which bounds the backward
              scans below.  */
1017       add_line_note (buffer, p - 1, p != d ? ' ' : '\\');
1018       d = p - 2;
1019       buffer->next_line = p - 1;
1020
1021     slow_path:
           /* S reads and D writes, both by pre-increment, so the text
              after a removed sequence is moved down over it.  */
1022       while (1)
1023         {
1024           c = *++s;
1025           *++d = c;
1026
1027           if (c == '\n' || c == '\r')
1028             {
1029               /* Handle DOS line endings.  */
1030               if (c == '\r' && s != buffer->rlimit && s[1] == '\n')
1031                 s++;
1032               if (s == buffer->rlimit)
1033                 break;
1034
1035               /* Escaped?  */
1036               p = d;
1037               while (p != buffer->next_line && is_nvspace (p[-1]))
1038                 p--;
1039               if (p == buffer->next_line || p[-1] != '\\')
1040                 break;
1041
1042               add_line_note (buffer, p - 1, p != d ? ' ': '\\');
1043               d = p - 2;
1044               buffer->next_line = p - 1;
1045             }
1046           else if (c == '?' && s[1] == '?' && _cpp_trigraph_map[s[2]])
1047             {
1048               /* Add a note regardless, for the benefit of -Wtrigraphs.  */
1049               add_line_note (buffer, d, s[2]);
1050               if (CPP_OPTION (pfile, trigraphs))
1051                 {
                       /* Replace the '?' just copied and skip the other
                          two characters of the trigraph.  */
1052                   *d = _cpp_trigraph_map[s[2]];
1053                   s += 2;
1054                 }
1055             }
1056         }
1057     }
1058   else
1059     {
           /* For from_stage3 buffers no trigraph or splice processing is
              done; only the end of the line is located.  */
1060       while (*s != '\n' && *s != '\r')
1061         s++;
1062       d = (uchar *) s;
1063
1064       /* Handle DOS line endings.  */
1065       if (*s == '\r' && s != buffer->rlimit && s[1] == '\n')
1066         s++;
1067     }
1068
1069  done:
       /* Terminate the cleaned line at the write position.  */
1070   *d = '\n';
1071   /* A sentinel note that should never be processed.  */
1072   add_line_note (buffer, d + 1, '\n');
1073   buffer->next_line = s + 1;
1074 }
1075
1076 /* Decide whether the trigraph recorded in NOTE deserves a warning even
1077    though it lies inside a comment.  */
1078 static bool
1079 warn_in_comment (cpp_reader *pfile, _cpp_line_note *note)
1080 {
1081   const _cpp_line_note *following = note + 1;
1082   const uchar *scan;
1083
1084   /* Only ??/ matters here: it can act as the backslash of an escaped
1085      newline, which may change behavior.  */
1086   if (note->type != '/')
1087     return false;
1088
1089   /* With -trigraphs the splice was already recorded; it shows up as
1090      the next note sitting at this very position.  */
1091   if (CPP_OPTION (pfile, trigraphs))
1092     return following->pos == note->pos;
1093
1094   /* Otherwise step over the three characters of the trigraph and any
1095      non-vertical space that follows it.  */
1096   for (scan = note->pos + 3; is_nvspace (*scan); scan++)
1097     continue;
1098
1099   /* Escaped newlines may lie between the trigraph and the newline just
1100      reached, so it must also come before the next note.  */
1101   return *scan == '\n' && scan < following->pos;
1102 }
1103
1104 /* Consume every line note queued by add_line_note that lies at or
1105    before buffer->cur: account for spliced lines and issue the
1106    diagnostics they call for.  IN_COMMENT is nonzero inside a comment.  */
1107 void
1108 _cpp_process_line_notes (cpp_reader *pfile, int in_comment)
1109 {
1110   cpp_buffer *buf = pfile->buffer;
1111   _cpp_line_note *note;
1112
1113   for (note = &buf->notes[buf->cur_note]; note->pos <= buf->cur;
1114        note = &buf->notes[buf->cur_note])
1115     {
1116       unsigned int col;
1117       buf->cur_note++;
1118       col = CPP_BUF_COLUMN (buf, note->pos + 1);
1119
1120       /* Backslash-newline, possibly with space in between.  */
1121       if (note->type == '\\' || note->type == ' ')
1122         {
1123           if (!in_comment && note->type == ' ')
1124             cpp_error_with_line (pfile, CPP_DL_WARNING,
1125                                  pfile->line_table->highest_line, col,
1126                                  "backslash and newline separated by space");
1127
1128           if (buf->next_line > buf->rlimit)
1129             {
1130               cpp_error_with_line (pfile, CPP_DL_PEDWARN,
1131                                    pfile->line_table->highest_line, col,
1132                                    "backslash-newline at end of file");
1133               /* Prevent "no newline at end of file" warning.  */
1134               buf->next_line = buf->rlimit;
1135             }
1136
1137           buf->line_base = note->pos;
1138           CPP_INCREMENT_LINE (pfile, 0);
1139           continue;
1140         }
1141
1142       if (!_cpp_trigraph_map[note->type])
1143         {
1144           /* Type 0 means lex_raw_string already processed the note.  */
1145           if (note->type != 0)
1146             abort ();
1147           continue;
1148         }
1149
1150       if (!CPP_OPTION (pfile, warn_trigraphs)
1151           || (in_comment && !warn_in_comment (pfile, note)))
1152         continue;
1153
1154       if (CPP_OPTION (pfile, trigraphs))
1155         cpp_warning_with_line (pfile, CPP_W_TRIGRAPHS,
1156                                pfile->line_table->highest_line, col,
1157                                "trigraph ??%c converted to %c",
1158                                note->type, (int) _cpp_trigraph_map[note->type]);
1159       else
1160         cpp_warning_with_line (pfile, CPP_W_TRIGRAPHS,
1161                                pfile->line_table->highest_line, col,
1162                                "trigraph ??%c ignored, use -trigraphs to enable",
1163                                note->type);
1164     }
1165 }
1166
1167 /* Skip a C-style block comment.  We find the end of the comment by
1168    seeing if an asterisk is before every '/' we encounter.  Returns
1169    nonzero if comment terminated by EOF, zero otherwise.
1170
1171    Buffer->cur points to the initial asterisk of the comment.  On a
        normal return buffer->cur is just past the closing '/' and the
        line notes up to there have been processed.  */
1172 bool
1173 _cpp_skip_block_comment (cpp_reader *pfile)
1174 {
1175   cpp_buffer *buffer = pfile->buffer;
1176   const uchar *cur = buffer->cur;
1177   uchar c;
1178
       /* Step past the opening '*'.  If a '/' follows at once (the text
          is "/" "*" "/"), skip it as well, so that the test on cur[-2]
          below does not take it for the end of the comment.  */
1179   cur++;
1180   if (*cur == '/')
1181     cur++;
1182
1183   for (;;)
1184     {
1185       /* People like decorating comments with '*', so check for '/'
1186          instead for efficiency.  */
1187       c = *cur++;
1188
1189       if (c == '/')
1190         {
               /* CUR is already past the '/', so cur[-2] is the character
                  that precedes it.  */
1191           if (cur[-2] == '*')
1192             break;
1193
1194           /* Warn about potential nested comments, but not if the '/'
1195              comes immediately before the true comment delimiter.
1196              Don't bother to get it right across escaped newlines.  */
1197           if (CPP_OPTION (pfile, warn_comments)
1198               && cur[0] == '*' && cur[1] != '/')
1199             {
                   /* CPP_BUF_COL takes the buffer, so buffer->cur is
                      brought up to date first -- presumably it derives
                      the column from it; confirm in internal.h.  */
1200               buffer->cur = cur;
1201               cpp_warning_with_line (pfile, CPP_W_COMMENTS,
1202                                      pfile->line_table->highest_line,
1203                                      CPP_BUF_COL (buffer),
1204                                      "\"/*\" within comment");
1205             }
1206         }
1207       else if (c == '\n')
1208         {
               /* End of the current cleaned line: process its notes, give
                  up if the buffer has no further line, otherwise clean
                  the next line and carry on from its start.  */
1209           unsigned int cols;
1210           buffer->cur = cur - 1;
1211           _cpp_process_line_notes (pfile, true);
1212           if (buffer->next_line >= buffer->rlimit)
1213             return true;
1214           _cpp_clean_line (pfile);
1215
               /* The extent of the raw line just cleaned is handed to
                  CPP_INCREMENT_LINE.  */
1216           cols = buffer->next_line - buffer->line_base;
1217           CPP_INCREMENT_LINE (pfile, cols);
1218
1219           cur = buffer->cur;
1220         }
1221     }
1222
1223   buffer->cur = cur;
1224   _cpp_process_line_notes (pfile, true);
1225   return false;
1226 }
1227
1228 /* Advance buffer->cur to the newline that ends a C++ line comment and
1229    process the line notes up to there.  Returns nonzero if the highest
1230    line changed meanwhile, i.e. the comment spanned several lines.  */
1231 static int
1232 skip_line_comment (cpp_reader *pfile)
1233 {
1234   const source_location line_before = pfile->line_table->highest_line;
1235   const uchar *scan;
1236
1237   for (scan = pfile->buffer->cur; *scan != '\n'; scan++)
1238     continue;
1239   pfile->buffer->cur = scan;
1240   _cpp_process_line_notes (pfile, true);
1241   return pfile->line_table->highest_line != line_before;
1242 }
1243
1244 /* Skip a run of non-vertical whitespace whose first character C was
1245    already read; buffer->cur ends up at the character ending the run.  */
1246 static void
1247 skip_whitespace (cpp_reader *pfile, cppchar_t c)
1248 {
1249   cpp_buffer *buf = pfile->buffer;
1250   bool nul_seen = false;
1251
1252   for (;;)
1253     {
1254       /* Horizontal space is always OK and \0 is only remembered for a
1255          single warning later; anything else here is \f or \v.  */
1256       if (c == '\0')
1257         nul_seen = true;
1258       else if (c != ' ' && c != '\t'
1259                && pfile->state.in_directive && CPP_PEDANTIC (pfile))
1260         cpp_error_with_line (pfile, CPP_DL_PEDWARN,
1261                              pfile->line_table->highest_line, CPP_BUF_COL (buf),
1262                              "%s in preprocessing directive",
1263                              c == '\f' ? "form feed" : "vertical tab");
1264
1265       /* We only want non-vertical space, i.e. ' ' \t \f \v \0.  */
1266       c = *buf->cur++;
1267       if (!is_nvspace (c))
1268         break;
1269     }
1270
1271   if (nul_seen)
1272     cpp_error (pfile, CPP_DL_WARNING, "null character(s) ignored");
1273   buf->cur--;
1274 }
1275
1276 /* Return 1 if every character of the number token STRING is valid in
1277    a name (no '.', '+' or '-'), 0 otherwise.  PFILE is not used.  */
1278 static int
1279 name_p (cpp_reader *pfile, const cpp_string *string)
1280 {
1281   unsigned int pos = 0;
1282
1283   while (pos < string->len && is_idchar (string->text[pos]))
1284     pos++;
1285
1286   /* Reaching the end means no offending character was met.  */
1287   return pos == string->len;
1288 }
1289
1290 /* Warn that TOKEN is not in NFC/NFKC when the warn_normalize option
1291    ranks below the result recorded in S and we are not skipping.  */
1292 static void
1293 warn_about_normalization (cpp_reader *pfile,
1294                           const cpp_token *token,
1295                           const struct normalize_state *s)
1296 {
1297   unsigned char *spelling;
1298   size_t spelling_len;
1299
1300   if (CPP_OPTION (pfile, warn_normalize) >= NORMALIZE_STATE_RESULT (s)
1301       || pfile->state.skipping)
1302     return;
1303
1304   /* Make sure the token is spelled with UCNs, not raw UTF-8.  */
1305   spelling = XNEWVEC (unsigned char, cpp_token_len (token));
1306   spelling_len = cpp_spell_token (pfile, token, spelling, false) - spelling;
1307   if (NORMALIZE_STATE_RESULT (s) != normalized_C)
1308     cpp_warning_with_line (pfile, CPP_W_NORMALIZE, token->src_loc, 0,
1309                            "`%.*s' is not in NFC", (int) spelling_len, spelling);
1310   else
1311     cpp_warning_with_line (pfile, CPP_W_NORMALIZE, token->src_loc, 0,
1312                            "`%.*s' is not in NFKC", (int) spelling_len, spelling);
1313   free (spelling);
1314 }
1315
1316 /* Returns TRUE, having advanced buffer->cur past it, when the '$' or
1317    UCN at buffer->cur is accepted as part of an identifier, and FALSE
1318    otherwise.  FIRST is nonzero if this would start the identifier.  */
1319 static bool
1320 forms_identifier_p (cpp_reader *pfile, int first,
1321                     struct normalize_state *state)
1322 {
1323   cpp_buffer *const buf = pfile->buffer;
1324   cppchar_t ucn_value;
1325
1326   switch (*buf->cur)
1327     {
1328     case '$':
1329       if (!CPP_OPTION (pfile, dollars_in_ident))
1330         return false;
1331       buf->cur++;
1332       /* Clear the flag first so that the pedwarn is not repeated.  */
1333       if (CPP_OPTION (pfile, warn_dollars) && !pfile->state.skipping)
1334         {
1335           CPP_OPTION (pfile, warn_dollars) = 0;
1336           cpp_error (pfile, CPP_DL_PEDWARN, "'$' in identifier or number");
1337         }
1338       return true;
1339     case '\\':
1340       /* Is this a syntactically valid UCN?  */
1341       if (!CPP_OPTION (pfile, extended_identifiers)
1342           || (buf->cur[1] != 'u' && buf->cur[1] != 'U'))
1343         return false;
1344       buf->cur += 2;
1345       if (_cpp_valid_ucn (pfile, &buf->cur, buf->rlimit, 1 + !first,
1346                           state, &ucn_value, NULL, NULL))
1347         return true;
1348       buf->cur -= 2;
1349       return false;
1350     default:
1351       return false;
1352     }
1353 }
1354
1355 /* Helper: hash the identifier that starts at BASE and runs over the
1356    following ISIDNUM characters, look it up in PFILE's hash table
1357    (passing HT_ALLOC), and issue any diagnostics flagged on its node.  */
1358 static cpp_hashnode *
1359 lex_identifier_intern (cpp_reader *pfile, const uchar *base)
1360 {
1361   const uchar *end;
1362   unsigned int hash, len;
1363   cpp_hashnode *node;
1364
1365   /* The first character is hashed without being tested.  */
1366   hash = HT_HASHSTEP (0, *base);
1367   for (end = base + 1; ISIDNUM (*end); end++)
1368     {
1369       hash = HT_HASHSTEP (hash, *end);
1370     }
1371   len = end - base;
1372   hash = HT_HASHFINISH (hash, len);
1373   node = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
1374                                             base, len, hash, HT_ALLOC));
1375
1376   /* Rarely, identifiers require diagnostics when lexed.  */
1377   if (__builtin_expect (!(node->flags & NODE_DIAGNOSTIC)
1378                         || pfile->state.skipping, 1))
1379     return node;
1380
1381   /* It is allowed to poison the same identifier twice.  */
1382   if ((node->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
1383     cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
1384                NODE_NAME (node));
1385
1386   /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
1387      replacement list of a variadic macro.  */
1388   if (node == pfile->spec_nodes.n__VA_ARGS__ && !pfile->state.va_args_ok)
1389     {
1390       if (!CPP_OPTION (pfile, cplusplus))
1391         cpp_error (pfile, CPP_DL_PEDWARN,
1392                    "__VA_ARGS__ can only appear in the expansion"
1393                    " of a C99 variadic macro");
1394       else
1395         cpp_error (pfile, CPP_DL_PEDWARN,
1396                    "__VA_ARGS__ can only appear in the expansion"
1397                    " of a C++11 variadic macro");
1398     }
1399
1400   /* For -Wc++-compat, warn about use of C++ named operators.  */
1401   if (node->flags & NODE_WARN_OPERATOR)
1402     cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
1403                  "identifier \"%s\" is a special operator name in C++",
1404                  NODE_NAME (node));
1405
1406   return node;
1407 }
1408
1409 /* Return the cpp_hashnode for the identifier spelled by NAME in the
1410    reader PFILE.  This only converts the pointer type and forwards to
1411    lex_identifier_intern.  */
1412 cpp_hashnode *
1413 _cpp_lex_identifier (cpp_reader *pfile, const char *name)
1414 {
1415   const uchar *spelling = (uchar *) name;
1416   return lex_identifier_intern (pfile, spelling);
1417 }
1418
1419 /* Lex an identifier starting at BUFFER->CUR - 1.
        BASE points at the identifier's first character.  STARTS_UCN is
        true when the identifier begins with a UCN, in which case the
        fast scan below is skipped.  NST is the normalization state that
        gets updated along the way.  *SPELLING is set to the node for the
        spelling as written, which is the result itself on the fast path.
        Returns the identifier's node after issuing any diagnostics
        flagged on it.  */
1420 static cpp_hashnode *
1421 lex_identifier (cpp_reader *pfile, const uchar *base, bool starts_ucn,
1422                 struct normalize_state *nst, cpp_hashnode **spelling)
1423 {
1424   cpp_hashnode *result;
1425   const uchar *cur;
1426   unsigned int len;
1427   unsigned int hash = HT_HASHSTEP (0, *base);
1428
1429   cur = pfile->buffer->cur;
1430   if (! starts_ucn)
1431     {
           /* Fast scan: hash the plain identifier characters as we go.  */
1432       while (ISIDNUM (*cur))
1433         {
1434           hash = HT_HASHSTEP (hash, *cur);
1435           cur++;
1436         }
           /* Only the last character scanned is fed to NST here.  */
1437       NORMALIZE_STATE_UPDATE_IDNUM (nst, *(cur - 1));
1438     }
       /* Publish the scan position: forms_identifier_p works on
          buffer->cur.  */
1439   pfile->buffer->cur = cur;
1440   if (starts_ucn || forms_identifier_p (pfile, false, nst))
1441     {
1442       /* Slower version for identifiers containing UCNs (or $).  The
              hash computed above is not used on this path.  */
1443       do {
1444         while (ISIDNUM (*pfile->buffer->cur))
1445           {
1446             NORMALIZE_STATE_UPDATE_IDNUM (nst, *pfile->buffer->cur);
1447             pfile->buffer->cur++;
1448           }
1449       } while (forms_identifier_p (pfile, false, nst));
1450       result = _cpp_interpret_identifier (pfile, base,
1451                                           pfile->buffer->cur - base);
1452       *spelling = cpp_lookup (pfile, base, pfile->buffer->cur - base);
1453     }
1454   else
1455     {
1456       len = cur - base;
1457       hash = HT_HASHFINISH (hash, len);
1458
1459       result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
1460                                                   base, len, hash, HT_ALLOC));
1461       *spelling = result;
1462     }
1463
1464   /* Rarely, identifiers require diagnostics when lexed.  */
1465   if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
1466                         && !pfile->state.skipping, 0))
1467     {
1468       /* It is allowed to poison the same identifier twice.  */
1469       if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
1470         cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
1471                    NODE_NAME (result));
1472
1473       /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
1474          replacement list of a variadic macro.  */
1475       if (result == pfile->spec_nodes.n__VA_ARGS__
1476           && !pfile->state.va_args_ok)
1477         {
1478           if (CPP_OPTION (pfile, cplusplus))
1479             cpp_error (pfile, CPP_DL_PEDWARN,
1480                        "__VA_ARGS__ can only appear in the expansion"
1481                        " of a C++11 variadic macro");
1482           else
1483             cpp_error (pfile, CPP_DL_PEDWARN,
1484                        "__VA_ARGS__ can only appear in the expansion"
1485                        " of a C99 variadic macro");
1486         }
1487
1488       /* For -Wc++-compat, warn about use of C++ named operators.  */
1489       if (result->flags & NODE_WARN_OPERATOR)
1490         cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
1491                      "identifier \"%s\" is a special operator name in C++",
1492                      NODE_NAME (result));
1493     }
1494
1495   return result;
1496 }
1497
1498 /* Lex a number whose first character is at BUFFER->CUR - 1 and store
1499    a NUL-terminated copy of its spelling, with its length, in NUMBER.  */
1500 static void
1501 lex_number (cpp_reader *pfile, cpp_string *number,
1502             struct normalize_state *nst)
1503 {
1504   const uchar *const start = pfile->buffer->cur - 1;
1505   const uchar *end;
1506   uchar *copy;
1507
1508   for (;;)
1509     {
1510       /* N.B. ISIDNUM does not include $.  */
1511       for (end = pfile->buffer->cur;
1512            (ISIDNUM (*end) || *end == '.' || DIGIT_SEP (*end)
1513             || VALID_SIGN (*end, end[-1]));
1514            end++)
1515         NORMALIZE_STATE_UPDATE_IDNUM (nst, *end);
1516
1517       /* A number can't end with a digit separator.  */
1518       while (end > pfile->buffer->cur && DIGIT_SEP (end[-1]))
1519         --end;
1520       pfile->buffer->cur = end;
1521
1522       /* Keep going while a '$' or UCN extends the number.  */
1523       if (!forms_identifier_p (pfile, false, nst))
1524         break;
1525     }
1526
1527   number->len = end - start;
1528   copy = _cpp_unaligned_alloc (pfile, number->len + 1);
1529   memcpy (copy, start, number->len);
1530   copy[number->len] = '\0';
1531   number->text = copy;
1532 }
1533
1534 /* Give TOKEN type TYPE and a NUL-terminated copy of LEN bytes at BASE.  */
1535 static void
1536 create_literal (cpp_reader *pfile, cpp_token *token, const uchar *base,
1537                 unsigned int len, enum cpp_ttype type)
1538 {
1539   uchar *copy;
1540   copy = _cpp_unaligned_alloc (pfile, len + 1);
1541   memcpy (copy, base, len);
1542   copy[len] = '\0';
1543   token->val.str.text = copy;
1544   token->val.str.len = len;
1545   token->type = type;
1546 }
1547
1548 /* Subroutine of lex_raw_string: append the LEN bytes at BASE to the
1549    buffer chain running from *FIRST_BUFF_P to *LAST_BUFF_P.  */
1550 static void
1551 bufring_append (cpp_reader *pfile, const uchar *base, size_t len,
1552                 _cpp_buff **first_buff_p, _cpp_buff **last_buff_p)
1553 {
1554   _cpp_buff *head = *first_buff_p;
1555   _cpp_buff *tail = *last_buff_p;
1556
1557   if (head != NULL)
1558     {
1559       const size_t room = BUFF_ROOM (tail);
1560       if (len > room)
1561         {
1562           memcpy (BUFF_FRONT (tail), base, room);
1563           BUFF_FRONT (tail) += room;
1564           base += room;
1565           len -= room;
1566           tail = _cpp_append_extend_buff (pfile, tail, len);
1567         }
1568     }
1569   else
1570     head = tail = _cpp_get_buff (pfile, len);
1571   memcpy (BUFF_FRONT (tail), base, len);
1572   BUFF_FRONT (tail) += len;
1573   *first_buff_p = head;
1574   *last_buff_p = tail;
1575 }
1576
1577
1578 /* Returns true if the identifier starting at BASE currently names a
1579    macro.  This might not work if compiling with -save-temps, or
1580    preprocessing separately from compilation.  */
1581 static bool
1582 is_macro (cpp_reader *pfile, const uchar *base)
1583 {
1584   const uchar *end;
1585   unsigned int hash;
1586   cpp_hashnode *node;
1587
1588   if (! ISIDST (*base))
1589     return false;
1590
1591   hash = HT_HASHSTEP (0, *base);
1592   for (end = base + 1; ISIDNUM (*end); end++)
1593     hash = HT_HASHSTEP (hash, *end);
1594   hash = HT_HASHFINISH (hash, end - base);
1595
1596   /* Look the name up with HT_NO_INSERT; no node means no macro.  */
1597   node = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table, base,
1598                                             end - base, hash,
1599                                             HT_NO_INSERT));
1600   return node != NULL && node->type == NT_MACRO;
1601 }
1602
1603
1604 /* Lexes a raw string.  The stored string contains the spelling, including
1605    double quotes, delimiter string, '(' and ')', any leading
1606    'L', 'u', 'U' or 'u8' and 'R' modifier.  It returns the type of the
1607    literal, or CPP_OTHER if it was not properly terminated.
1608
1609    The spelling is NUL-terminated, but it is not guaranteed that this
1610    is the first NUL since embedded NULs are preserved.  */
1611
1612 static void
1613 lex_raw_string (cpp_reader *pfile, cpp_token *token, const uchar *base,
1614                 const uchar *cur)
1615 {
1616   uchar raw_prefix[17];
1617   uchar temp_buffer[18];
1618   const uchar *orig_base;
1619   unsigned int raw_prefix_len = 0, raw_suffix_len = 0;
1620   enum raw_str_phase { RAW_STR_PREFIX, RAW_STR, RAW_STR_SUFFIX };
1621   raw_str_phase phase = RAW_STR_PREFIX;
1622   enum cpp_ttype type;
1623   size_t total_len = 0;
1624   /* Index into temp_buffer during phases other than RAW_STR,
1625      during RAW_STR phase 17 to tell BUF_APPEND that nothing should
1626      be appended to temp_buffer.  */
1627   size_t temp_buffer_len = 0;
1628   _cpp_buff *first_buff = NULL, *last_buff = NULL;
1629   size_t raw_prefix_start;
1630   _cpp_line_note *note = &pfile->buffer->notes[pfile->buffer->cur_note];
1631
1632   type = (*base == 'L' ? CPP_WSTRING :
1633           *base == 'U' ? CPP_STRING32 :
1634           *base == 'u' ? (base[1] == '8' ? CPP_UTF8STRING : CPP_STRING16)
1635           : CPP_STRING);
1636
1637 #define BUF_APPEND(STR,LEN)                                     \
1638       do {                                                      \
1639         bufring_append (pfile, (const uchar *)(STR), (LEN),     \
1640                         &first_buff, &last_buff);               \
1641         total_len += (LEN);                                     \
1642         if (__builtin_expect (temp_buffer_len < 17, 0)          \
1643             && (const uchar *)(STR) != base                     \
1644             && (LEN) <= 2)                                      \
1645           {                                                     \
1646             memcpy (temp_buffer + temp_buffer_len,              \
1647                     (const uchar *)(STR), (LEN));               \
1648             temp_buffer_len += (LEN);                           \
1649           }                                                     \
1650       } while (0)
1651
1652   orig_base = base;
1653   ++cur;
1654   raw_prefix_start = cur - base;
1655   for (;;)
1656     {
1657       cppchar_t c;
1658
1659       /* If we previously performed any trigraph or line splicing
1660          transformations, undo them in between the opening and closing
1661          double quote.  */
1662       while (note->pos < cur)
1663         ++note;
1664       for (; note->pos == cur; ++note)
1665         {
1666           switch (note->type)
1667             {
1668             case '\\':
1669             case ' ':
1670               /* Restore backslash followed by newline.  */
1671               BUF_APPEND (base, cur - base);
1672               base = cur;
1673               BUF_APPEND ("\\", 1);
1674             after_backslash:
1675               if (note->type == ' ')
1676                 {
1677                   /* GNU backslash whitespace newline extension.  FIXME
1678                      could be any sequence of non-vertical space.  When we
1679                      can properly restore any such sequence, we should mark
1680                      this note as handled so _cpp_process_line_notes
1681                      doesn't warn.  */
1682                   BUF_APPEND (" ", 1);
1683                 }
1684
1685               BUF_APPEND ("\n", 1);
1686               break;
1687
1688             case 0:
1689               /* Already handled.  */
1690               break;
1691
1692             default:
1693               if (_cpp_trigraph_map[note->type])
1694                 {
1695                   /* Don't warn about this trigraph in
1696                      _cpp_process_line_notes, since trigraphs show up as
1697                      trigraphs in raw strings.  */
1698                   uchar type = note->type;
1699                   note->type = 0;
1700
1701                   if (!CPP_OPTION (pfile, trigraphs))
1702                     /* If we didn't convert the trigraph in the first
1703                        place, don't do anything now either.  */
1704                     break;
1705
1706                   BUF_APPEND (base, cur - base);
1707                   base = cur;
1708                   BUF_APPEND ("??", 2);
1709
1710                   /* ??/ followed by newline gets two line notes, one for
1711                      the trigraph and one for the backslash/newline.  */
1712                   if (type == '/' && note[1].pos == cur)
1713                     {
1714                       if (note[1].type != '\\'
1715                           && note[1].type != ' ')
1716                         abort ();
1717                       BUF_APPEND ("/", 1);
1718                       ++note;
1719                       goto after_backslash;
1720                     }
1721                   else
1722                     {
1723                       /* Skip the replacement character.  */
1724                       base = ++cur;
1725                       BUF_APPEND (&type, 1);
1726                       c = type;
1727                       goto check_c;
1728                     }
1729                 }
1730               else
1731                 abort ();
1732               break;
1733             }
1734         }
1735       c = *cur++;
1736       if (__builtin_expect (temp_buffer_len < 17, 0))
1737         temp_buffer[temp_buffer_len++] = c;
1738
1739      check_c:
1740       if (phase == RAW_STR_PREFIX)
1741         {
1742           while (raw_prefix_len < temp_buffer_len)
1743             {
1744               raw_prefix[raw_prefix_len] = temp_buffer[raw_prefix_len];
1745               switch (raw_prefix[raw_prefix_len])
1746                 {
1747                 case ' ': case '(': case ')': case '\\': case '\t':
1748                 case '\v': case '\f': case '\n': default:
1749                   break;
1750                 /* Basic source charset except the above chars.  */
1751                 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
1752                 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
1753                 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
1754                 case 's': case 't': case 'u': case 'v': case 'w': case 'x':
1755                 case 'y': case 'z':
1756                 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
1757                 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
1758                 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
1759                 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
1760                 case 'Y': case 'Z':
1761                 case '0': case '1': case '2': case '3': case '4': case '5':
1762                 case '6': case '7': case '8': case '9':
1763                 case '_': case '{': case '}': case '#': case '[': case ']':
1764                 case '<': case '>': case '%': case ':': case ';': case '.':
1765                 case '?': case '*': case '+': case '-': case '/': case '^':
1766                 case '&': case '|': case '~': case '!': case '=': case ',':
1767                 case '"': case '\'':
1768                   if (raw_prefix_len < 16)
1769                     {
1770                       raw_prefix_len++;
1771                       continue;
1772                     }
1773                   break;
1774                 }
1775
1776               if (raw_prefix[raw_prefix_len] != '(')
1777                 {
1778                   int col = CPP_BUF_COLUMN (pfile->buffer, cur) + 1;
1779                   if (raw_prefix_len == 16)
1780                     cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
1781                                          col, "raw string delimiter longer "
1782                                               "than 16 characters");
1783                   else if (raw_prefix[raw_prefix_len] == '\n')
1784                     cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
1785                                          col, "invalid new-line in raw "
1786                                               "string delimiter");
1787                   else
1788                     cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
1789                                          col, "invalid character '%c' in "
1790                                               "raw string delimiter",
1791                                          (int) raw_prefix[raw_prefix_len]);
1792                   pfile->buffer->cur = orig_base + raw_prefix_start - 1;
1793                   create_literal (pfile, token, orig_base,
1794                                   raw_prefix_start - 1, CPP_OTHER);
1795                   if (first_buff)
1796                     _cpp_release_buff (pfile, first_buff);
1797                   return;
1798                 }
1799               raw_prefix[raw_prefix_len] = '"';
1800               phase = RAW_STR;
1801               /* Nothing should be appended to temp_buffer during
1802                  RAW_STR phase.  */
1803               temp_buffer_len = 17;
1804               break;
1805             }
1806           continue;
1807         }
1808       else if (phase == RAW_STR_SUFFIX)
1809         {
1810           while (raw_suffix_len <= raw_prefix_len
1811                  && raw_suffix_len < temp_buffer_len
1812                  && temp_buffer[raw_suffix_len] == raw_prefix[raw_suffix_len])
1813             raw_suffix_len++;
1814           if (raw_suffix_len > raw_prefix_len)
1815             break;
1816           if (raw_suffix_len == temp_buffer_len)
1817             continue;
1818           phase = RAW_STR;
1819           /* Nothing should be appended to temp_buffer during
1820              RAW_STR phase.  */
1821           temp_buffer_len = 17;
1822         }
1823       if (c == ')')
1824         {
1825           phase = RAW_STR_SUFFIX;
1826           raw_suffix_len = 0;
1827           temp_buffer_len = 0;
1828         }
1829       else if (c == '\n')
1830         {
1831           if (pfile->state.in_directive
1832               || (pfile->state.parsing_args
1833                   && pfile->buffer->next_line >= pfile->buffer->rlimit))
1834             {
1835               cur--;
1836               type = CPP_OTHER;
1837               cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, 0,
1838                                    "unterminated raw string");
1839               break;
1840             }
1841
1842           BUF_APPEND (base, cur - base);
1843
1844           if (pfile->buffer->cur < pfile->buffer->rlimit)
1845             CPP_INCREMENT_LINE (pfile, 0);
1846           pfile->buffer->need_line = true;
1847
1848           pfile->buffer->cur = cur-1;
1849           _cpp_process_line_notes (pfile, false);
1850           if (!_cpp_get_fresh_line (pfile))
1851             {
1852               source_location src_loc = token->src_loc;
1853               token->type = CPP_EOF;
1854               /* Tell the compiler the line number of the EOF token.  */
1855               token->src_loc = pfile->line_table->highest_line;
1856               token->flags = BOL;
1857               if (first_buff != NULL)
1858                 _cpp_release_buff (pfile, first_buff);
1859               cpp_error_with_line (pfile, CPP_DL_ERROR, src_loc, 0,
1860                                    "unterminated raw string");
1861               return;
1862             }
1863
1864           cur = base = pfile->buffer->cur;
1865           note = &pfile->buffer->notes[pfile->buffer->cur_note];
1866         }
1867     }
1868
1869   if (CPP_OPTION (pfile, user_literals))
1870     {
1871       /* If a string format macro, say from inttypes.h, is placed touching
1872          a string literal it could be parsed as a C++11 user-defined string
1873          literal thus breaking the program.
1874          Try to identify macros with is_macro. A warning is issued.
1875          The macro name should not start with '_' for this warning. */
1876       if ((*cur != '_') && is_macro (pfile, cur))
1877         {
1878           /* Raise a warning, but do not consume subsequent tokens.  */
1879           if (CPP_OPTION (pfile, warn_literal_suffix) && !pfile->state.skipping)
1880             cpp_warning_with_line (pfile, CPP_W_LITERAL_SUFFIX,
1881                                    token->src_loc, 0,
1882                                    "invalid suffix on literal; C++11 requires "
1883                                    "a space between literal and string macro");
1884         }
1885       /* Grab user defined literal suffix.  */
1886       else if (ISIDST (*cur))
1887         {
1888           type = cpp_userdef_string_add_type (type);
1889           ++cur;
1890
1891           while (ISIDNUM (*cur))
1892             ++cur;
1893         }
1894     }
1895
1896   pfile->buffer->cur = cur;
1897   if (first_buff == NULL)
1898     create_literal (pfile, token, base, cur - base, type);
1899   else
1900     {
1901       uchar *dest = _cpp_unaligned_alloc (pfile, total_len + (cur - base) + 1);
1902
1903       token->type = type;
1904       token->val.str.len = total_len + (cur - base);
1905       token->val.str.text = dest;
1906       last_buff = first_buff;
1907       while (last_buff != NULL)
1908         {
1909           memcpy (dest, last_buff->base,
1910                   BUFF_FRONT (last_buff) - last_buff->base);
1911           dest += BUFF_FRONT (last_buff) - last_buff->base;
1912           last_buff = last_buff->next;
1913         }
1914       _cpp_release_buff (pfile, first_buff);
1915       memcpy (dest, base, cur - base);
1916       dest[cur - base] = '\0';
1917     }
1918 }
1919
1920 /* Lexes a string, character constant, or angle-bracketed header file
1921    name.  The stored string contains the spelling, including opening
1922    quote and any leading 'L', 'u', 'U' or 'u8' and optional
1923    'R' modifier.  It returns the type of the literal, or CPP_OTHER
1924    if it was not properly terminated, or CPP_LESS for an unterminated
1925    header name which must be relexed as normal tokens.
1926
1927    The spelling is NUL-terminated, but it is not guaranteed that this
1928    is the first NUL since embedded NULs are preserved.  */
1929 static void
1930 lex_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
1931 {
1932   bool saw_NUL = false;
1933   const uchar *cur;
1934   cppchar_t terminator;
1935   enum cpp_ttype type;
1936
1937   cur = base;
1938   terminator = *cur++;
1939   if (terminator == 'L' || terminator == 'U')
1940     terminator = *cur++;
1941   else if (terminator == 'u')
1942     {
1943       terminator = *cur++;
1944       if (terminator == '8')
1945         terminator = *cur++;
1946     }
1947   if (terminator == 'R')
1948     {
1949       lex_raw_string (pfile, token, base, cur);
1950       return;
1951     }
1952   if (terminator == '"')
1953     type = (*base == 'L' ? CPP_WSTRING :
1954             *base == 'U' ? CPP_STRING32 :
1955             *base == 'u' ? (base[1] == '8' ? CPP_UTF8STRING : CPP_STRING16)
1956                          : CPP_STRING);
1957   else if (terminator == '\'')
1958     type = (*base == 'L' ? CPP_WCHAR :
1959             *base == 'U' ? CPP_CHAR32 :
1960             *base == 'u' ? (base[1] == '8' ? CPP_UTF8CHAR : CPP_CHAR16)
1961                          : CPP_CHAR);
1962   else
1963     terminator = '>', type = CPP_HEADER_NAME;
1964
1965   for (;;)
1966     {
1967       cppchar_t c = *cur++;
1968
1969       /* In #include-style directives, terminators are not escapable.  */
1970       if (c == '\\' && !pfile->state.angled_headers && *cur != '\n')
1971         cur++;
1972       else if (c == terminator)
1973         break;
1974       else if (c == '\n')
1975         {
1976           cur--;
1977           /* Unmatched quotes always yield undefined behavior, but
1978              greedy lexing means that what appears to be an unterminated
1979              header name may actually be a legitimate sequence of tokens.  */
1980           if (terminator == '>')
1981             {
1982               token->type = CPP_LESS;
1983               return;
1984             }
1985           type = CPP_OTHER;
1986           break;
1987         }
1988       else if (c == '\0')
1989         saw_NUL = true;
1990     }
1991
1992   if (saw_NUL && !pfile->state.skipping)
1993     cpp_error (pfile, CPP_DL_WARNING,
1994                "null character(s) preserved in literal");
1995
1996   if (type == CPP_OTHER && CPP_OPTION (pfile, lang) != CLK_ASM)
1997     cpp_error (pfile, CPP_DL_PEDWARN, "missing terminating %c character",
1998                (int) terminator);
1999
2000   if (CPP_OPTION (pfile, user_literals))
2001     {
2002       /* If a string format macro, say from inttypes.h, is placed touching
2003          a string literal it could be parsed as a C++11 user-defined string
2004          literal thus breaking the program.
2005          Try to identify macros with is_macro. A warning is issued.
2006          The macro name should not start with '_' for this warning. */
2007       if ((*cur != '_') && is_macro (pfile, cur))
2008         {
2009           /* Raise a warning, but do not consume subsequent tokens.  */
2010           if (CPP_OPTION (pfile, warn_literal_suffix) && !pfile->state.skipping)
2011             cpp_warning_with_line (pfile, CPP_W_LITERAL_SUFFIX,
2012                                    token->src_loc, 0,
2013                                    "invalid suffix on literal; C++11 requires "
2014                                    "a space between literal and string macro");
2015         }
2016       /* Grab user defined literal suffix.  */
2017       else if (ISIDST (*cur))
2018         {
2019           type = cpp_userdef_char_add_type (type);
2020           type = cpp_userdef_string_add_type (type);
2021           ++cur;
2022
2023           while (ISIDNUM (*cur))
2024             ++cur;
2025         }
2026     }
2027   else if (CPP_OPTION (pfile, cpp_warn_cxx11_compat)
2028            && is_macro (pfile, cur)
2029            && !pfile->state.skipping)
2030     cpp_warning_with_line (pfile, CPP_W_CXX11_COMPAT,
2031                            token->src_loc, 0, "C++11 requires a space "
2032                            "between string literal and macro");
2033
2034   pfile->buffer->cur = cur;
2035   create_literal (pfile, token, base, cur - base, type);
2036 }
2037
2038 /* Return the comment table. The client may not make any assumption
2039    about the ordering of the table.  */
2040 cpp_comment_table *
2041 cpp_get_comments (cpp_reader *pfile)
2042 {
2043   return &pfile->comments;
2044 }
2045
2046 /* Append a comment to the end of the comment table. */
2047 static void
2048 store_comment (cpp_reader *pfile, cpp_token *token)
2049 {
2050   int len;
2051
2052   if (pfile->comments.allocated == 0)
2053     {
2054       pfile->comments.allocated = 256;
2055       pfile->comments.entries = (cpp_comment *) xmalloc
2056         (pfile->comments.allocated * sizeof (cpp_comment));
2057     }
2058
2059   if (pfile->comments.count == pfile->comments.allocated)
2060     {
2061       pfile->comments.allocated *= 2;
2062       pfile->comments.entries = (cpp_comment *) xrealloc
2063         (pfile->comments.entries,
2064          pfile->comments.allocated * sizeof (cpp_comment));
2065     }
2066
2067   len = token->val.str.len;
2068
2069   /* Copy comment. Note, token may not be NULL terminated. */
2070   pfile->comments.entries[pfile->comments.count].comment =
2071     (char *) xmalloc (sizeof (char) * (len + 1));
2072   memcpy (pfile->comments.entries[pfile->comments.count].comment,
2073           token->val.str.text, len);
2074   pfile->comments.entries[pfile->comments.count].comment[len] = '\0';
2075
2076   /* Set source location. */
2077   pfile->comments.entries[pfile->comments.count].sloc = token->src_loc;
2078
2079   /* Increment the count of entries in the comment table. */
2080   pfile->comments.count++;
2081 }
2082
2083 /* The stored comment includes the comment start and any terminator.  */
2084 static void
2085 save_comment (cpp_reader *pfile, cpp_token *token, const unsigned char *from,
2086               cppchar_t type)
2087 {
2088   unsigned char *buffer;
2089   unsigned int len, clen, i;
2090
2091   len = pfile->buffer->cur - from + 1; /* + 1 for the initial '/'.  */
2092
2093   /* C++ comments probably (not definitely) have moved past a new
2094      line, which we don't want to save in the comment.  */
2095   if (is_vspace (pfile->buffer->cur[-1]))
2096     len--;
2097
2098   /* If we are currently in a directive or in argument parsing, then
2099      we need to store all C++ comments as C comments internally, and
2100      so we need to allocate a little extra space in that case.
2101
2102      Note that the only time we encounter a directive here is
2103      when we are saving comments in a "#define".  */
2104   clen = ((pfile->state.in_directive || pfile->state.parsing_args)
2105           && type == '/') ? len + 2 : len;
2106
2107   buffer = _cpp_unaligned_alloc (pfile, clen);
2108
2109   token->type = CPP_COMMENT;
2110   token->val.str.len = clen;
2111   token->val.str.text = buffer;
2112
2113   buffer[0] = '/';
2114   memcpy (buffer + 1, from, len - 1);
2115
2116   /* Finish conversion to a C comment, if necessary.  */
2117   if ((pfile->state.in_directive || pfile->state.parsing_args) && type == '/')
2118     {
2119       buffer[1] = '*';
2120       buffer[clen - 2] = '*';
2121       buffer[clen - 1] = '/';
2122       /* As there can be in a C++ comments illegal sequences for C comments
2123          we need to filter them out.  */
2124       for (i = 2; i < (clen - 2); i++)
2125         if (buffer[i] == '/' && (buffer[i - 1] == '*' || buffer[i + 1] == '*'))
2126           buffer[i] = '|';
2127     }
2128
2129   /* Finally store this comment for use by clients of libcpp. */
2130   store_comment (pfile, token);
2131 }
2132
/* Returns true if comment at COMMENT_START is a recognized FALLTHROUGH
   comment.

   *COMMENT_START is '*' for a C block comment; any other value is
   handled as a C++ line comment.  The text that is examined begins
   right after that character, and pfile->buffer->cur serves as the
   upper bound in the length checks.  Which spellings are accepted
   depends on the cpp_warn_implicit_fallthrough option, as described
   case by case below.  */

static bool
fallthrough_comment_p (cpp_reader *pfile, const unsigned char *comment_start)
{
  /* Scan position; starts at the first character after *COMMENT_START.  */
  const unsigned char *from = comment_start + 1;

  switch (CPP_OPTION (pfile, cpp_warn_implicit_fallthrough))
    {
      /* For both -Wimplicit-fallthrough=0 and -Wimplicit-fallthrough=5 we
         don't recognize any comments.  The latter only checks attributes,
         the former doesn't warn.  */
    case 0:
    default:
      return false;
      /* -Wimplicit-fallthrough=1 considers any comment, no matter what
         content it has.  */
    case 1:
      return true;
    case 2:
      /* -Wimplicit-fallthrough=2 looks for (case insensitive)
         .*falls?[ \t-]*thr(u|ough).* regex.  */
      /* Try every start position that still leaves room for the
         shortest spelling, "fallthru".  */
      for (; (size_t) (pfile->buffer->cur - from) >= sizeof "fallthru" - 1;
           from++)
        {
          /* Is there anything like strpbrk with upper boundary, or
             memchr looking for 2 characters rather than just one?  */
          if (from[0] != 'f' && from[0] != 'F')
            continue;
          if (from[1] != 'a' && from[1] != 'A')
            continue;
          if (from[2] != 'l' && from[2] != 'L')
            continue;
          if (from[3] != 'l' && from[3] != 'L')
            continue;
          from += sizeof "fall" - 1;
          /* Optional 's', then any run of blanks, tabs and dashes.  */
          if (from[0] == 's' || from[0] == 'S')
            from++;
          while (*from == ' ' || *from == '\t' || *from == '-')
            from++;
          if (from[0] != 't' && from[0] != 'T')
            continue;
          if (from[1] != 'h' && from[1] != 'H')
            continue;
          if (from[2] != 'r' && from[2] != 'R')
            continue;
          /* "thru" is enough; otherwise insist on the full "through".  */
          if (from[3] == 'u' || from[3] == 'U')
            return true;
          if (from[3] != 'o' && from[3] != 'O')
            continue;
          if (from[4] != 'u' && from[4] != 'U')
            continue;
          if (from[5] != 'g' && from[5] != 'G')
            continue;
          if (from[6] != 'h' && from[6] != 'H')
            continue;
          return true;
        }
      return false;
      /* Levels 3 and 4 share the whole-comment matching code that
         follows the switch.  */
    case 3:
    case 4:
      break;
    }

  /* Whole comment contents:
     -fallthrough
     @fallthrough@
   */
  if (*from == '-' || *from == '@')
    {
      size_t len = sizeof "fallthrough" - 1;
      if ((size_t) (pfile->buffer->cur - from - 1) < len)
        return false;
      if (memcmp (from + 1, "fallthrough", len))
        return false;
      if (*from == '@')
        {
          /* The '@' form needs a matching trailing '@', which is then
             counted as part of the matched text.  */
          if (from[len + 1] != '@')
            return false;
          len++;
        }
      from += 1 + len;
    }
  /* Whole comment contents (regex):
     lint -fallthrough[ \t]*
   */
  else if (*from == 'l')
    {
      size_t len = sizeof "int -fallthrough" - 1;
      if ((size_t) (pfile->buffer->cur - from - 1) < len)
        return false;
      if (memcmp (from + 1, "int -fallthrough", len))
        return false;
      from += 1 + len;
      while (*from == ' ' || *from == '\t')
        from++;
    }
  /* Whole comment contents (regex):
     [ \t]*FALLTHR(U|OUGH)[ \t]*
   */
  else if (CPP_OPTION (pfile, cpp_warn_implicit_fallthrough) == 4)
    {
      while (*from == ' ' || *from == '\t')
        from++;
      if ((size_t) (pfile->buffer->cur - from)  < sizeof "FALLTHRU" - 1)
        return false;
      if (memcmp (from, "FALLTHR", sizeof "FALLTHR" - 1))
        return false;
      from += sizeof "FALLTHR" - 1;
      if (*from == 'U')
        from++;
      else if ((size_t) (pfile->buffer->cur - from)  < sizeof "OUGH" - 1)
        return false;
      else if (memcmp (from, "OUGH", sizeof "OUGH" - 1))
        return false;
      else
        from += sizeof "OUGH" - 1;
      while (*from == ' ' || *from == '\t')
        from++;
    }
  /* Whole comment contents (regex):
     [ \t.!]*(ELSE,? |INTENTIONAL(LY)? )?FALL(S | |-)?THR(OUGH|U)[ \t.!]*(-[^\n\r]*)?
     [ \t.!]*(Else,? |Intentional(ly)? )?Fall((s | |-)[Tt]|t)hr(ough|u)[ \t.!]*(-[^\n\r]*)?
     [ \t.!]*([Ee]lse,? |[Ii]ntentional(ly)? )?fall(s | |-)?thr(ough|u)[ \t.!]*(-[^\n\r]*)?
   */
  else
    {
      while (*from == ' ' || *from == '\t' || *from == '.' || *from == '!')
        from++;
      /* F holds the first letter of the word currently being matched;
         ALL_UPPER is set once a word spelled entirely in capitals has
         been seen.  */
      unsigned char f = *from;
      bool all_upper = false;
      /* Optional leading "else" / "Else" / "ELSE", with optional comma.  */
      if (f == 'E' || f == 'e')
        {
          if ((size_t) (pfile->buffer->cur - from)
              < sizeof "else fallthru" - 1)
            return false;
          if (f == 'E' && memcmp (from + 1, "LSE", sizeof "LSE" - 1) == 0)
            all_upper = true;
          else if (memcmp (from + 1, "lse", sizeof "lse" - 1))
            return false;
          from += sizeof "else" - 1;
          if (*from == ',')
            from++;
          if (*from != ' ')
            return false;
          from++;
          /* Reject case combinations that none of the three regexes
             above allows.  */
          if (all_upper && *from == 'f')
            return false;
          if (f == 'e' && *from == 'F')
            return false;
          f = *from;
        }
      /* Optional leading "intentional" or "intentionally", in the same
         three capitalizations.  */
      else if (f == 'I' || f == 'i')
        {
          if ((size_t) (pfile->buffer->cur - from)
              < sizeof "intentional fallthru" - 1)
            return false;
          if (f == 'I' && memcmp (from + 1, "NTENTIONAL",
                                  sizeof "NTENTIONAL" - 1) == 0)
            all_upper = true;
          else if (memcmp (from + 1, "ntentional",
                           sizeof "ntentional" - 1))
            return false;
          from += sizeof "intentional" - 1;
          if (*from == ' ')
            {
              from++;
              if (all_upper && *from == 'f')
                return false;
            }
          else if (all_upper)
            {
              /* The comparison includes the following 'F', but only
                 "LY " is skipped so that FROM ends up on that 'F'.  */
              if (memcmp (from, "LY F", sizeof "LY F" - 1))
                return false;
              from += sizeof "LY " - 1;
            }
          else
            {
              if (memcmp (from, "ly ", sizeof "ly " - 1))
                return false;
              from += sizeof "ly " - 1;
            }
          if (f == 'i' && *from == 'F')
            return false;
          f = *from;
        }
      /* The mandatory "fall" part.  */
      if (f != 'F' && f != 'f')
        return false;
      if ((size_t) (pfile->buffer->cur - from) < sizeof "fallthru" - 1)
        return false;
      if (f == 'F' && memcmp (from + 1, "ALL", sizeof "ALL" - 1) == 0)
        all_upper = true;
      else if (all_upper)
        return false;
      else if (memcmp (from + 1, "all", sizeof "all" - 1))
        return false;
      from += sizeof "fall" - 1;
      /* Separator between "fall" and "thr...": "s ", a blank, a dash,
         or nothing at all.  */
      if (*from == (all_upper ? 'S' : 's') && from[1] == ' ')
        from += 2;
      else if (*from == ' ' || *from == '-')
        from++;
      else if (*from != (all_upper ? 'T' : 't'))
        return false;
      /* The next letter must be 'T' (not allowed when the word began
         with a lower-case 'f') or 't' (not allowed in the all-capitals
         form).  */
      if ((f == 'f' || *from != 'T') && (all_upper || *from != 't'))
        return false;
      if ((size_t) (pfile->buffer->cur - from) < sizeof "thru" - 1)
        return false;
      if (memcmp (from + 1, all_upper ? "HRU" : "hru", sizeof "hru" - 1))
        {
          if ((size_t) (pfile->buffer->cur - from) < sizeof "through" - 1)
            return false;
          if (memcmp (from + 1, all_upper ? "HROUGH" : "hrough",
                      sizeof "hrough" - 1))
            return false;
          from += sizeof "through" - 1;
        }
      else
        from += sizeof "thru" - 1;
      while (*from == ' ' || *from == '\t' || *from == '.' || *from == '!')
        from++;
      /* Optional trailing "-..." text: skip it up to the end of the line
         or, in a block comment, up to the closing delimiter.  */
      if (*from == '-')
        {
          from++;
          if (*comment_start == '*')
            {
              do
                {
                  while (*from && *from != '*'
                         && *from != '\n' && *from != '\r')
                    from++;
                  /* Stop unless this is a '*' that does not begin the
                     closing delimiter.  */
                  if (*from != '*' || from[1] == '/')
                    break;
                  from++;
                }
              while (1);
            }
          else
            while (*from && *from != '\n' && *from != '\r')
              from++;
        }
    }
  /* The recognized text has to make up the whole comment, so FROM must
     now be at the comment's end.  */
  /* C block comment.  */
  if (*comment_start == '*')
    {
      if (*from != '*' || from[1] != '/')
        return false;
    }
  /* C++ line comment.  */
  else if (*from != '\n')
    return false;

  return true;
}
2387
2388 /* Allocate COUNT tokens for RUN.  */
2389 void
2390 _cpp_init_tokenrun (tokenrun *run, unsigned int count)
2391 {
2392   run->base = XNEWVEC (cpp_token, count);
2393   run->limit = run->base + count;
2394   run->next = NULL;
2395 }
2396
2397 /* Returns the next tokenrun, or creates one if there is none.  */
2398 static tokenrun *
2399 next_tokenrun (tokenrun *run)
2400 {
2401   if (run->next == NULL)
2402     {
2403       run->next = XNEW (tokenrun);
2404       run->next->prev = run;
2405       _cpp_init_tokenrun (run->next, 250);
2406     }
2407
2408   return run->next;
2409 }
2410
2411 /* Return the number of not yet processed token in a given
2412    context.  */
2413 int
2414 _cpp_remaining_tokens_num_in_context (cpp_context *context)
2415 {
2416   if (context->tokens_kind == TOKENS_KIND_DIRECT)
2417     return (LAST (context).token - FIRST (context).token);
2418   else if (context->tokens_kind == TOKENS_KIND_INDIRECT
2419            || context->tokens_kind == TOKENS_KIND_EXTENDED)
2420     return (LAST (context).ptoken - FIRST (context).ptoken);
2421   else
2422       abort ();
2423 }
2424
2425 /* Returns the token present at index INDEX in a given context.  If
2426    INDEX is zero, the next token to be processed is returned.  */
2427 static const cpp_token*
2428 _cpp_token_from_context_at (cpp_context *context, int index)
2429 {
2430   if (context->tokens_kind == TOKENS_KIND_DIRECT)
2431     return &(FIRST (context).token[index]);
2432   else if (context->tokens_kind == TOKENS_KIND_INDIRECT
2433            || context->tokens_kind == TOKENS_KIND_EXTENDED)
2434     return FIRST (context).ptoken[index];
2435  else
2436    abort ();
2437 }
2438
2439 /* Look ahead in the input stream.  */
2440 const cpp_token *
2441 cpp_peek_token (cpp_reader *pfile, int index)
2442 {
2443   cpp_context *context = pfile->context;
2444   const cpp_token *peektok;
2445   int count;
2446
2447   /* First, scan through any pending cpp_context objects.  */
2448   while (context->prev)
2449     {
2450       ptrdiff_t sz = _cpp_remaining_tokens_num_in_context (context);
2451
2452       if (index < (int) sz)
2453         return _cpp_token_from_context_at (context, index);
2454       index -= (int) sz;
2455       context = context->prev;
2456     }
2457
2458   /* We will have to read some new tokens after all (and do so
2459      without invalidating preceding tokens).  */
2460   count = index;
2461   pfile->keep_tokens++;
2462
2463   /* For peeked tokens temporarily disable line_change reporting,
2464      until the tokens are parsed for real.  */
2465   void (*line_change) (cpp_reader *, const cpp_token *, int)
2466     = pfile->cb.line_change;
2467   pfile->cb.line_change = NULL;
2468
2469   do
2470     {
2471       peektok = _cpp_lex_token (pfile);
2472       if (peektok->type == CPP_EOF)
2473         {
2474           index--;
2475           break;
2476         }
2477     }
2478   while (index--);
2479
2480   _cpp_backup_tokens_direct (pfile, count - index);
2481   pfile->keep_tokens--;
2482   pfile->cb.line_change = line_change;
2483
2484   return peektok;
2485 }
2486
2487 /* Allocate a single token that is invalidated at the same time as the
2488    rest of the tokens on the line.  Has its line and col set to the
2489    same as the last lexed token, so that diagnostics appear in the
2490    right place.  */
2491 cpp_token *
2492 _cpp_temp_token (cpp_reader *pfile)
2493 {
2494   cpp_token *old, *result;
2495   ptrdiff_t sz = pfile->cur_run->limit - pfile->cur_token;
2496   ptrdiff_t la = (ptrdiff_t) pfile->lookaheads;
2497
2498   old = pfile->cur_token - 1;
2499   /* Any pre-existing lookaheads must not be clobbered.  */
2500   if (la)
2501     {
2502       if (sz <= la)
2503         {
2504           tokenrun *next = next_tokenrun (pfile->cur_run);
2505
2506           if (sz < la)
2507             memmove (next->base + 1, next->base,
2508                      (la - sz) * sizeof (cpp_token));
2509
2510           next->base[0] = pfile->cur_run->limit[-1];
2511         }
2512
2513       if (sz > 1)
2514         memmove (pfile->cur_token + 1, pfile->cur_token,
2515                  MIN (la, sz - 1) * sizeof (cpp_token));
2516     }
2517
2518   if (!sz && pfile->cur_token == pfile->cur_run->limit)
2519     {
2520       pfile->cur_run = next_tokenrun (pfile->cur_run);
2521       pfile->cur_token = pfile->cur_run->base;
2522     }
2523
2524   result = pfile->cur_token++;
2525   result->src_loc = old->src_loc;
2526   return result;
2527 }
2528
/* Lex a token into RESULT (external interface).  Takes care of issues
   like directive handling, token lookahead, multiple include
   optimization and skipping.

   Returns the token obtained, which is either a slot in the current
   token run or pfile->directive_result.  While pfile->state.skipping
   is set, tokens are discarded until CPP_EOF is reached, except inside
   directives and deferred pragmas.  */
const cpp_token *
_cpp_lex_token (cpp_reader *pfile)
{
  cpp_token *result;

  for (;;)
    {
      /* Current run used up: move on to the next one, creating it if
         necessary.  */
      if (pfile->cur_token == pfile->cur_run->limit)
        {
          pfile->cur_run = next_tokenrun (pfile->cur_run);
          pfile->cur_token = pfile->cur_run->base;
        }
      /* We assume that the current token is somewhere in the current
         run.  */
      if (pfile->cur_token < pfile->cur_run->base
          || pfile->cur_token >= pfile->cur_run->limit)
        abort ();

      /* Tokens already lexed as lookahead are handed out again before
         anything new is lexed.  */
      if (pfile->lookaheads)
        {
          pfile->lookaheads--;
          result = pfile->cur_token++;
        }
      else
        result = _cpp_lex_direct (pfile);

      if (result->flags & BOL)
        {
          /* Is this a directive.  If _cpp_handle_directive returns
             false, it is an assembler #.  */
          if (result->type == CPP_HASH
              /* 6.10.3 p 11: Directives in a list of macro arguments
                 gives undefined behavior.  This implementation
                 handles the directive as normal.  */
              && pfile->state.parsing_args != 1)
            {
              if (_cpp_handle_directive (pfile, result->flags & PREV_WHITE))
                {
                  /* A CPP_PADDING directive result is not passed on to
                     the caller; lex the next token instead.  */
                  if (pfile->directive_result.type == CPP_PADDING)
                    continue;
                  result = &pfile->directive_result;
                }
            }
          else if (pfile->state.in_deferred_pragma)
            result = &pfile->directive_result;

          /* Report the line change to the client, unless skipping.  */
          if (pfile->cb.line_change && !pfile->state.skipping)
            pfile->cb.line_change (pfile, result, pfile->state.parsing_args);
        }

      /* We don't skip tokens in directives.  */
      if (pfile->state.in_directive || pfile->state.in_deferred_pragma)
        break;

      /* Outside a directive, invalidate controlling macros.  At file
         EOF, _cpp_lex_direct takes care of popping the buffer, so we never
         get here and MI optimization works.  */
      pfile->mi_valid = false;

      /* When skipping, keep looping past every token except CPP_EOF.  */
      if (!pfile->state.skipping || result->type == CPP_EOF)
        break;
    }

  return result;
}
2597
2598 /* Returns true if a fresh line has been loaded, or if the current
     buffer does not need one (its need_line flag is clear).

     Returns false, without loading anything, while PFILE is inside a
     directive.  Also returns false when the current buffer is
     exhausted and either macro arguments are being parsed, the
     exhausted buffer had return_at_eof set, or no buffer remains after
     it has been popped.  Otherwise an exhausted buffer is popped with
     _cpp_pop_buffer and the search continues in the buffer that
     pfile->buffer then refers to.  */
2599 bool
2600 _cpp_get_fresh_line (cpp_reader *pfile)
2601 {
2602   int return_at_eof;
2603
2604   /* We can't get a new line until we leave the current directive.  */
2605   if (pfile->state.in_directive)
2606     return false;
2607
2608   for (;;)
2609     {
2610       cpp_buffer *buffer = pfile->buffer;
2611
           /* Nothing to do if this buffer is not asking for a new line.  */
2612       if (!buffer->need_line)
2613         return true;
2614
           /* Unread input remains before rlimit: have _cpp_clean_line set
              up the next line.  */
2615       if (buffer->next_line < buffer->rlimit)
2616         {
2617           _cpp_clean_line (pfile);
2618           return true;
2619         }
2620
2621       /* First, get out of parsing arguments state.  */
2622       if (pfile->state.parsing_args)
2623         return false;
2624
2625       /* End of buffer.  Non-empty files should end in a newline.
              No diagnostic is issued here; next_line is only clipped.  */
2626       if (buffer->buf != buffer->rlimit
2627           && buffer->next_line > buffer->rlimit
2628           && !buffer->from_stage3)
2629         {
2630           /* Clip to buffer size.  */
2631           buffer->next_line = buffer->rlimit;
2632         }
2633
           /* Read the flag before popping; it is consulted after
              _cpp_pop_buffer has run (presumably BUFFER must not be
              touched once it has been popped).  */
2634       return_at_eof = buffer->return_at_eof;
2635       _cpp_pop_buffer (pfile);
2636       if (pfile->buffer == NULL || return_at_eof)
2637         return false;
2638     }
2639 }
2640
     /* Helper for the operator cases of _cpp_lex_direct.  Sets
        result->type to THEN_TYPE and consumes one character if the next
        character in the buffer is CHAR; otherwise sets result->type to
        ELSE_TYPE and consumes nothing.  Expects variables named result
        and buffer to be in scope where it is expanded.  */
2641 #define IF_NEXT_IS(CHAR, THEN_TYPE, ELSE_TYPE)          \
2642   do                                                    \
2643     {                                                   \
2644       result->type = ELSE_TYPE;                         \
2645       if (*buffer->cur == CHAR)                         \
2646         buffer->cur++, result->type = THEN_TYPE;        \
2647     }                                                   \
2648   while (0)
2649
2650 /* Lex a token into pfile->cur_token, which is also incremented, to
2651    get diagnostics pointing to the correct location.
2652
2653    Does not handle issues such as token lookahead, multiple-include
2654    optimization, directives, skipping etc.  This function is only
2655    suitable for use by _cpp_lex_token, and in special cases like
2656    lex_expansion_token which doesn't care for any of these issues.
2657
2658    When meeting a newline, returns CPP_EOF if parsing a directive,
2659    otherwise returns to the start of the token buffer if permissible.
2660    Returns a pointer to the lexed token.

        Horizontal whitespace is skipped and recorded as PREV_WHITE in the
        flags of the returned token.  Comments are skipped the same way
        unless pfile->state.save_comments is set, in which case each
        comment is returned as a token of its own.  When a line ends while
        pfile->state.in_deferred_pragma is set, a CPP_PRAGMA_EOL token is
        returned and that state is cleared.  */
2661 cpp_token *
2662 _cpp_lex_direct (cpp_reader *pfile)
2663 {
2664   cppchar_t c;
2665   cpp_buffer *buffer;
2666   const unsigned char *comment_start;
       /* Set once a comment accepted by fallthrough_comment_p has been
          skipped; a later identifier or saved comment token is then
          flagged PREV_FALLTHROUGH.  */
2667   bool fallthrough_comment = false;
2668   cpp_token *result = pfile->cur_token++;
2669
       /* Entered on the first pass and again after every newline.  */
2670  fresh_line:
2671   result->flags = 0;
2672   buffer = pfile->buffer;
2673   if (buffer->need_line)
2674     {
2675       if (pfile->state.in_deferred_pragma)
2676         {
               /* The line of a deferred pragma has ended: report that and
                  leave deferred-pragma state.  */
2677           result->type = CPP_PRAGMA_EOL;
2678           pfile->state.in_deferred_pragma = false;
2679           if (!pfile->state.pragma_allow_expansion)
2680             pfile->state.prevent_expansion--;
2681           return result;
2682         }
2683       if (!_cpp_get_fresh_line (pfile))
2684         {
2685           result->type = CPP_EOF;
2686           if (!pfile->state.in_directive)
2687             {
2688               /* Tell the compiler the line number of the EOF token.  */
2689               result->src_loc = pfile->line_table->highest_line;
2690               result->flags = BOL;
2691             }
2692           return result;
2693         }
           /* _cpp_get_fresh_line moved to a different buffer; forget any
              pending fallthrough comment.  */
2694       if (buffer != pfile->buffer)
2695         fallthrough_comment = false;
           /* Tokens need not be kept: reuse the base token run from its
              first slot.  */
2696       if (!pfile->keep_tokens)
2697         {
2698           pfile->cur_run = &pfile->base_run;
2699           result = pfile->base_run.base;
2700           pfile->cur_token = result + 1;
2701         }
2702       result->flags = BOL;
2703       if (pfile->state.parsing_args == 2)
2704         result->flags |= PREV_WHITE;
2705     }
       /* Re-read, since the call above may have changed pfile->buffer.  */
2706   buffer = pfile->buffer;
2707  update_tokens_line:
2708   result->src_loc = pfile->line_table->highest_line;
2709
2710  skipped_white:
2711   if (buffer->cur >= buffer->notes[buffer->cur_note].pos
2712       && !pfile->overlaid_buffer)
2713     {
2714       _cpp_process_line_notes (pfile, false);
2715       result->src_loc = pfile->line_table->highest_line;
2716     }
2717   c = *buffer->cur++;
2718
       /* The column is computed from buffer->cur after it has been
          advanced past C.  */
2719   if (pfile->forced_token_location_p)
2720     result->src_loc = *pfile->forced_token_location_p;
2721   else
2722     result->src_loc = linemap_position_for_column (pfile->line_table,
2723                                           CPP_BUF_COLUMN (buffer, buffer->cur));
2724
2725   switch (c)
2726     {
2727     case ' ': case '\t': case '\f': case '\v': case '\0':
2728       result->flags |= PREV_WHITE;
2729       skip_whitespace (pfile, c);
2730       goto skipped_white;
2731
2732     case '\n':
           /* End of line: mark the buffer as needing a new line and start
              over.  The line count is not bumped when the newline is the
              last character before rlimit.  */
2733       if (buffer->cur < buffer->rlimit)
2734         CPP_INCREMENT_LINE (pfile, 0);
2735       buffer->need_line = true;
2736       goto fresh_line;
2737
2738     case '0': case '1': case '2': case '3': case '4':
2739     case '5': case '6': case '7': case '8': case '9':
2740       {
2741         struct normalize_state nst = INITIAL_NORMALIZE_STATE;
2742         result->type = CPP_NUMBER;
2743         lex_number (pfile, &result->val.str, &nst);
2744         warn_about_normalization (pfile, result, &nst);
2745         break;
2746       }
2747
2748     case 'L':
2749     case 'u':
2750     case 'U':
2751     case 'R':
2752       /* 'L', 'u', 'U', 'u8' or 'R' may introduce wide characters,
2753          wide strings or raw strings.  */
2754       if (c == 'L' || CPP_OPTION (pfile, rliterals)
2755           || (c != 'R' && CPP_OPTION (pfile, uliterals)))
2756         {
               /* Accepted after the prefix character C: a character
                  literal (not after R), a string, a raw string (R then a
                  double quote, with rliterals), and after u the forms
                  u8 string, u8 character (with utf8_char_literals) and
                  u8R raw string (with rliterals).  */
2757           if ((*buffer->cur == '\'' && c != 'R')
2758               || *buffer->cur == '"'
2759               || (*buffer->cur == 'R'
2760                   && c != 'R'
2761                   && buffer->cur[1] == '"'
2762                   && CPP_OPTION (pfile, rliterals))
2763               || (*buffer->cur == '8'
2764                   && c == 'u'
2765                   && ((buffer->cur[1] == '"' || (buffer->cur[1] == '\''
2766                                 && CPP_OPTION (pfile, utf8_char_literals)))
2767                       || (buffer->cur[1] == 'R' && buffer->cur[2] == '"'
2768                           && CPP_OPTION (pfile, rliterals)))))
2769             {
2770               lex_string (pfile, result, buffer->cur - 1);
2771               break;
2772             }
2773         }
2774       /* Fall through.  */
2775
           /* L, R, U and u are missing from the list below because they
              have their own case labels above, which fall through to
              here when no literal prefix was recognised.  */
2776     case '_':
2777     case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
2778     case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
2779     case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
2780     case 's': case 't':           case 'v': case 'w': case 'x':
2781     case 'y': case 'z':
2782     case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
2783     case 'G': case 'H': case 'I': case 'J': case 'K':
2784     case 'M': case 'N': case 'O': case 'P': case 'Q':
2785     case 'S': case 'T':           case 'V': case 'W': case 'X':
2786     case 'Y': case 'Z':
2787       result->type = CPP_NAME;
2788       {
2789         struct normalize_state nst = INITIAL_NORMALIZE_STATE;
2790         result->val.node.node = lex_identifier (pfile, buffer->cur - 1, false,
2791                                                 &nst,
2792                                                 &result->val.node.spelling);
2793         warn_about_normalization (pfile, result, &nst);
2794       }
2795
2796       /* Convert named operators to their proper types.  */
2797       if (result->val.node.node->flags & NODE_OPERATOR)
2798         {
2799           result->flags |= NAMED_OP;
2800           result->type = (enum cpp_ttype) result->val.node.node->directive_index;
2801         }
2802
2803       /* Signal FALLTHROUGH comment followed by another token.  */
2804       if (fallthrough_comment)
2805         result->flags |= PREV_FALLTHROUGH;
2806       break;
2807
2808     case '\'':
2809     case '"':
2810       lex_string (pfile, result, buffer->cur - 1);
2811       break;
2812
2813     case '/':
2814       /* A potential block or line comment.  */
2815       comment_start = buffer->cur;
2816       c = *buffer->cur;
2817
2818       if (c == '*')
2819         {
2820           if (_cpp_skip_block_comment (pfile))
2821             cpp_error (pfile, CPP_DL_ERROR, "unterminated comment");
2822         }
2823       else if (c == '/' && ! CPP_OPTION (pfile, traditional))
2824         {
2825           /* Don't warn for system headers.  */
2826           if (cpp_in_system_header (pfile))
2827             ;
2828           /* Warn about comments if pedantically GNUC89, and not
2829              in system headers.  */
2830           else if (CPP_OPTION (pfile, lang) == CLK_GNUC89
2831                    && CPP_PEDANTIC (pfile)
2832                    && ! buffer->warned_cplusplus_comments)
2833             {
2834               cpp_error (pfile, CPP_DL_PEDWARN,
2835                          "C++ style comments are not allowed in ISO C90");
2836               cpp_error (pfile, CPP_DL_PEDWARN,
2837                          "(this will be reported only once per input file)");
2838               buffer->warned_cplusplus_comments = 1;
2839             }
2840           /* Or if specifically desired via -Wc90-c99-compat.  */
2841           else if (CPP_OPTION (pfile, cpp_warn_c90_c99_compat) > 0
2842                    && ! CPP_OPTION (pfile, cplusplus)
2843                    && ! buffer->warned_cplusplus_comments)
2844             {
2845               cpp_error (pfile, CPP_DL_WARNING,
2846                          "C++ style comments are incompatible with C90");
2847               cpp_error (pfile, CPP_DL_WARNING,
2848                          "(this will be reported only once per input file)");
2849               buffer->warned_cplusplus_comments = 1;
2850             }
2851           /* In C89/C94, C++ style comments are forbidden.  */
2852           else if ((CPP_OPTION (pfile, lang) == CLK_STDC89
2853                     || CPP_OPTION (pfile, lang) == CLK_STDC94))
2854             {
2855               /* But don't be confused about valid code such as
2856                  - // immediately followed by *,
2857                  - // in a preprocessing directive,
2858                  - // in an #if 0 block.  */
2859               if (buffer->cur[1] == '*'
2860                   || pfile->state.in_directive
2861                   || pfile->state.skipping)
2862                 {
                       /* Lex only the first slash, as a division
                          operator; the second one is left in the
                          buffer.  */
2863                   result->type = CPP_DIV;
2864                   break;
2865                 }
2866               else if (! buffer->warned_cplusplus_comments)
2867                 {
2868                   cpp_error (pfile, CPP_DL_ERROR,
2869                              "C++ style comments are not allowed in ISO C90");
2870                   cpp_error (pfile, CPP_DL_ERROR,
2871                              "(this will be reported only once per input "
2872                              "file)");
2873                   buffer->warned_cplusplus_comments = 1;
2874                 }
2875             }
2876           if (skip_line_comment (pfile) && CPP_OPTION (pfile, warn_comments))
2877             cpp_warning (pfile, CPP_W_COMMENTS, "multi-line comment");
2878         }
2879       else if (c == '=')
2880         {
2881           buffer->cur++;
2882           result->type = CPP_DIV_EQ;
2883           break;
2884         }
2885       else
2886         {
2887           result->type = CPP_DIV;
2888           break;
2889         }
2890
           /* Only genuine comments reach this point; every operator path
              above has already left the switch.  */
2891       if (fallthrough_comment_p (pfile, comment_start))
2892         fallthrough_comment = true;
2893
2894       if (pfile->cb.comment)
2895         {
               /* COMMENT_START points just past the leading slash, hence
                  the - 1 and + 1 to hand the callback the whole comment.  */
2896           size_t len = pfile->buffer->cur - comment_start;
2897           pfile->cb.comment (pfile, result->src_loc, comment_start - 1,
2898                              len + 1);
2899         }
2900
2901       if (!pfile->state.save_comments)
2902         {
               /* Discard the comment, treat it as whitespace and lex the
                  token that follows it.  */
2903           result->flags |= PREV_WHITE;
2904           goto update_tokens_line;
2905         }
2906
2907       if (fallthrough_comment)
2908         result->flags |= PREV_FALLTHROUGH;
2909
2910       /* Save the comment as a token in its own right.  */
           /* C still holds the second character of the comment opener.  */
2911       save_comment (pfile, result, comment_start, c);
2912       break;
2913
2914     case '<':
2915       if (pfile->state.angled_headers)
2916         {
               /* Try to lex an angle-bracketed header name.  lex_string
                  reports that it did not produce one by leaving the type
                  as CPP_LESS (presumably when no closing bracket is
                  found -- confirm in lex_string); operator lexing then
                  takes over below.  */
2917           lex_string (pfile, result, buffer->cur - 1);
2918           if (result->type != CPP_LESS)
2919             break;
2920         }
2921
2922       result->type = CPP_LESS;
2923       if (*buffer->cur == '=')
2924         buffer->cur++, result->type = CPP_LESS_EQ;
2925       else if (*buffer->cur == '<')
2926         {
2927           buffer->cur++;
2928           IF_NEXT_IS ('=', CPP_LSHIFT_EQ, CPP_LSHIFT);
2929         }
2930       else if (CPP_OPTION (pfile, digraphs))
2931         {
2932           if (*buffer->cur == ':')
2933             {
2934               /* C++11 [2.5/3 lex.pptoken], "Otherwise, if the next
2935                  three characters are <:: and the subsequent character
2936                  is neither : nor >, the < is treated as a preprocessor
2937                  token by itself".  */
2938               if (CPP_OPTION (pfile, cplusplus)
2939                   && CPP_OPTION (pfile, lang) != CLK_CXX98
2940                   && CPP_OPTION (pfile, lang) != CLK_GNUCXX
2941                   && buffer->cur[1] == ':'
2942                   && buffer->cur[2] != ':' && buffer->cur[2] != '>')
2943                 break;
2944
                   /* Digraph spelling of the opening square bracket.  */
2945               buffer->cur++;
2946               result->flags |= DIGRAPH;
2947               result->type = CPP_OPEN_SQUARE;
2948             }
2949           else if (*buffer->cur == '%')
2950             {
                   /* Digraph spelling of the opening brace.  */
2951               buffer->cur++;
2952               result->flags |= DIGRAPH;
2953               result->type = CPP_OPEN_BRACE;
2954             }
2955         }
2956       break;
2957
2958     case '>':
2959       result->type = CPP_GREATER;
2960       if (*buffer->cur == '=')
2961         buffer->cur++, result->type = CPP_GREATER_EQ;
2962       else if (*buffer->cur == '>')
2963         {
2964           buffer->cur++;
2965           IF_NEXT_IS ('=', CPP_RSHIFT_EQ, CPP_RSHIFT);
2966         }
2967       break;
2968
2969     case '%':
2970       result->type = CPP_MOD;
2971       if (*buffer->cur == '=')
2972         buffer->cur++, result->type = CPP_MOD_EQ;
2973       else if (CPP_OPTION (pfile, digraphs))
2974         {
2975           if (*buffer->cur == ':')
2976             {
                   /* Digraph spelling of the hash; when it is immediately
                      repeated the pair forms the digraph spelling of the
                      paste operator.  */
2977               buffer->cur++;
2978               result->flags |= DIGRAPH;
2979               result->type = CPP_HASH;
2980               if (*buffer->cur == '%' && buffer->cur[1] == ':')
2981                 buffer->cur += 2, result->type = CPP_PASTE, result->val.token_no = 0;
2982             }
2983           else if (*buffer->cur == '>')
2984             {
                   /* Digraph spelling of the closing brace.  */
2985               buffer->cur++;
2986               result->flags |= DIGRAPH;
2987               result->type = CPP_CLOSE_BRACE;
2988             }
2989         }
2990       break;
2991
2992     case '.':
2993       result->type = CPP_DOT;
2994       if (ISDIGIT (*buffer->cur))
2995         {
               /* A dot followed by a digit starts a number.  */
2996           struct normalize_state nst = INITIAL_NORMALIZE_STATE;
2997           result->type = CPP_NUMBER;
2998           lex_number (pfile, &result->val.str, &nst);
2999           warn_about_normalization (pfile, result, &nst);
3000         }
3001       else if (*buffer->cur == '.' && buffer->cur[1] == '.')
3002         buffer->cur += 2, result->type = CPP_ELLIPSIS;
3003       else if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
3004         buffer->cur++, result->type = CPP_DOT_STAR;
3005       break;
3006
3007     case '+':
3008       result->type = CPP_PLUS;
3009       if (*buffer->cur == '+')
3010         buffer->cur++, result->type = CPP_PLUS_PLUS;
3011       else if (*buffer->cur == '=')
3012         buffer->cur++, result->type = CPP_PLUS_EQ;
3013       break;
3014
3015     case '-':
3016       result->type = CPP_MINUS;
3017       if (*buffer->cur == '>')
3018         {
3019           buffer->cur++;
3020           result->type = CPP_DEREF;
3021           if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
3022             buffer->cur++, result->type = CPP_DEREF_STAR;
3023         }
3024       else if (*buffer->cur == '-')
3025         buffer->cur++, result->type = CPP_MINUS_MINUS;
3026       else if (*buffer->cur == '=')
3027         buffer->cur++, result->type = CPP_MINUS_EQ;
3028       break;
3029
3030     case '&':
3031       result->type = CPP_AND;
3032       if (*buffer->cur == '&')
3033         buffer->cur++, result->type = CPP_AND_AND;
3034       else if (*buffer->cur == '=')
3035         buffer->cur++, result->type = CPP_AND_EQ;
3036       break;
3037
3038     case '|':
3039       result->type = CPP_OR;
3040       if (*buffer->cur == '|')
3041         buffer->cur++, result->type = CPP_OR_OR;
3042       else if (*buffer->cur == '=')
3043         buffer->cur++, result->type = CPP_OR_EQ;
3044       break;
3045
3046     case ':':
3047       result->type = CPP_COLON;
3048       if (*buffer->cur == ':' && CPP_OPTION (pfile, cplusplus))
3049         buffer->cur++, result->type = CPP_SCOPE;
3050       else if (*buffer->cur == '>' && CPP_OPTION (pfile, digraphs))
3051         {
               /* Digraph spelling of the closing square bracket.  */
3052           buffer->cur++;
3053           result->flags |= DIGRAPH;
3054           result->type = CPP_CLOSE_SQUARE;
3055         }
3056       break;
3057
3058     case '*': IF_NEXT_IS ('=', CPP_MULT_EQ, CPP_MULT); break;
3059     case '=': IF_NEXT_IS ('=', CPP_EQ_EQ, CPP_EQ); break;
3060     case '!': IF_NEXT_IS ('=', CPP_NOT_EQ, CPP_NOT); break;
3061     case '^': IF_NEXT_IS ('=', CPP_XOR_EQ, CPP_XOR); break;
3062     case '#': IF_NEXT_IS ('#', CPP_PASTE, CPP_HASH); result->val.token_no = 0; break;
3063
3064     case '?': result->type = CPP_QUERY; break;
3065     case '~': result->type = CPP_COMPL; break;
3066     case ',': result->type = CPP_COMMA; break;
3067     case '(': result->type = CPP_OPEN_PAREN; break;
3068     case ')': result->type = CPP_CLOSE_PAREN; break;
3069     case '[': result->type = CPP_OPEN_SQUARE; break;
3070     case ']': result->type = CPP_CLOSE_SQUARE; break;
3071     case '{': result->type = CPP_OPEN_BRACE; break;
3072     case '}': result->type = CPP_CLOSE_BRACE; break;
3073     case ';': result->type = CPP_SEMICOLON; break;
3074
3075       /* @ is a punctuator in Objective-C.  */
3076     case '@': result->type = CPP_ATSIGN; break;
3077
3078     case '$':
3079     case '\\':
3080       {
             /* Step back so that BASE and buffer->cur both point at the
                dollar sign or backslash itself while forms_identifier_p
                examines it.  */
3081         const uchar *base = --buffer->cur;
3082         struct normalize_state nst = INITIAL_NORMALIZE_STATE;
3083
3084         if (forms_identifier_p (pfile, true, &nst))
3085           {
3086             result->type = CPP_NAME;
3087             result->val.node.node = lex_identifier (pfile, base, true, &nst,
3088                                                     &result->val.node.spelling);
3089             warn_about_normalization (pfile, result, &nst);
3090             break;
3091           }
             /* Not the start of an identifier: step past the character
                again and let the default case turn it into CPP_OTHER.  */
3092         buffer->cur++;
3093       }
3094       /* FALLTHRU */
3095
3096     default:
           /* Any other character becomes a one-character CPP_OTHER
              token.  */
3097       create_literal (pfile, result, buffer->cur - 1, 1, CPP_OTHER);
3098       break;
3099     }
3100
3101   /* Potentially convert the location of the token to a range.  */
3102   if (result->src_loc >= RESERVED_LOCATION_COUNT
3103       && result->type != CPP_EOF)
3104     {
3105       /* Ensure that any line notes are processed, so that we have the
3106          correct physical line/column for the end-point of the token even
3107          when a logical line is split via one or more backslashes.  */
3108       if (buffer->cur >= buffer->notes[buffer->cur_note].pos
3109           && !pfile->overlaid_buffer)
3110         _cpp_process_line_notes (pfile, false);
3111
           /* The range runs from the location recorded before the switch
              to the column of buffer->cur after the token was lexed.  */
3112       source_range tok_range;
3113       tok_range.m_start = result->src_loc;
3114       tok_range.m_finish
3115         = linemap_position_for_column (pfile->line_table,
3116                                        CPP_BUF_COLUMN (buffer, buffer->cur));
3117
3118       result->src_loc = COMBINE_LOCATION_DATA (pfile->line_table,
3119                                                result->src_loc,
3120                                                tok_range, NULL);
3121     }
3122
3123   return result;
3124 }
3125
3126 /* An upper bound on the number of bytes needed to spell TOKEN.
3127    Does not include preceding whitespace.

        The bound depends on the spelling category of TOKEN's type:
        literals use the stored length of their text, identifiers allow
        ten bytes for every byte of the name, and everything else gets a
        fixed bound of 6.  */
3128 unsigned int
3129 cpp_token_len (const cpp_token *token)
3130 {
3131   unsigned int len;
3132
3133   switch (TOKEN_SPELL (token))
3134     {
         /* Operators and unspellable tokens.  NOTE(review): 6 is
            presumably the length of the longest operator spelling,
            named operators included -- confirm against TTYPE_TABLE.  */
3135     default:            len = 6;                                break;
3136     case SPELL_LITERAL: len = token->val.str.len;               break;
         /* _cpp_spell_ident_ucns writes a ten-byte escape for each
            non-ASCII sequence in the name, so ten bytes per name byte
            is always enough.  */
3137     case SPELL_IDENT:   len = NODE_LEN (token->val.node.node) * 10;     break;
3138     }
3139
3140   return len;
3141 }
3142
3143 /* Parse one UTF-8 sequence starting at NAME and place a \U escape
3144    for it in BUFFER.  Return the number of bytes read out of NAME.
3145    (There are always 10 bytes written to BUFFER, and no terminating
        NUL is added.)

        The sequence length is taken from the number of leading 1 bits of
        the first byte.  Calls abort if one of the following bytes is not
        a UTF-8 continuation byte.  The first byte itself is not
        validated; callers in this file only pass bytes with the top bit
        set.  */
3146
3147 static size_t
3148 utf8_to_ucn (unsigned char *buffer, const unsigned char *name)
3149 {
3150   int j;
3151   int ucn_len = 0;
3152   int ucn_len_c;
3153   unsigned t;
3154   unsigned long utf32;
3155
3156   /* Compute the length of the UTF-8 sequence.  */
3157   for (t = *name; t & 0x80; t <<= 1)
3158     ucn_len++;
3159
       /* Keep only the payload bits of the first byte, i.e. those below
          its run of leading 1 bits.  */
3160   utf32 = *name & (0x7F >> ucn_len);
3161   for (ucn_len_c = 1; ucn_len_c < ucn_len; ucn_len_c++)
3162     {
           /* Each following byte contributes its low six bits.  */
3163       utf32 = (utf32 << 6) | (*++name & 0x3F);
3164
3165       /* Ill-formed UTF-8.  */
3166       if ((*name & ~0x3F) != 0x80)
3167         abort ();
3168     }
3169
       /* Emit a backslash, U, and eight lowercase hexadecimal digits,
          most significant first.  */
3170   *buffer++ = '\\';
3171   *buffer++ = 'U';
3172   for (j = 7; j >= 0; j--)
3173     *buffer++ = "0123456789abcdef"[(utf32 >> (4 * j)) & 0xF];
3174   return ucn_len;
3175 }
3176
3177 /* Given a token TYPE corresponding to a digraph, return a pointer to
3178    the spelling of the digraph.  TYPE is used as an index into
        digraph_spellings relative to CPP_FIRST_DIGRAPH without any range
        check, so it must be one of the token types that has a digraph
        spelling.  */
3179 static const unsigned char *
3180 cpp_digraph2name (enum cpp_ttype type)
3181 {
3182   return digraph_spellings[(int) type - (int) CPP_FIRST_DIGRAPH];
3183 }
3184
3185 /* Write the spelling of an identifier IDENT, using UCNs, to BUFFER.
3186    The buffer must already contain enough space to hold the
3187    token's spelling.  Returns a pointer to the character after the
3188    last character written.  No terminating NUL is written.

        Bytes of the name without the top bit set are copied unchanged;
        every sequence starting with a byte that has the top bit set is
        replaced by the ten-byte escape produced by utf8_to_ucn.  */
3189 unsigned char *
3190 _cpp_spell_ident_ucns (unsigned char *buffer, cpp_hashnode *ident)
3191 {
3192   size_t i;
3193   const unsigned char *name = NODE_NAME (ident);
3194
3195   for (i = 0; i < NODE_LEN (ident); i++)
3196     if (name[i] & ~0x7F)
3197       {
             /* Skip the whole sequence that utf8_to_ucn consumed; the - 1
                makes up for the increment done by the loop itself.  */
3198         i += utf8_to_ucn (buffer, name + i) - 1;
3199         buffer += 10;
3200       }
3201     else
3202       *buffer++ = name[i];
3203
3204   return buffer;
3205 }
3206
3207 /* Write the spelling of a token TOKEN to BUFFER.  The buffer must
3208    already contain enough space to hold the token's spelling
        (cpp_token_len gives an upper bound).
3209    Returns a pointer to the character after the last character written.
        No terminating NUL is written.
3210    FORSTRING is true if this is to be the spelling after translation
3211    phase 1 (with the original spelling of extended identifiers), false
3212    if extended identifiers should always be written using UCNs (there is
3213    no option for always writing them in the internal UTF-8 form).
        For a token whose type cannot be spelt, an internal error is
        reported through PFILE and BUFFER is returned unchanged.
3214    FIXME: Would be nice if we didn't need the PFILE argument.  */
3215 unsigned char *
3216 cpp_spell_token (cpp_reader *pfile, const cpp_token *token,
3217                  unsigned char *buffer, bool forstring)
3218 {
3219   switch (TOKEN_SPELL (token))
3220     {
3221     case SPELL_OPERATOR:
3222       {
3223         const unsigned char *spelling;
3224         unsigned char c;
3225
             /* Digraphs have their own spelling table.  Named operators
                carry a hash node and are spelt like identifiers.  */
3226         if (token->flags & DIGRAPH)
3227           spelling = cpp_digraph2name (token->type);
3228         else if (token->flags & NAMED_OP)
3229           goto spell_ident;
3230         else
3231           spelling = TOKEN_NAME (token);
3232
3233         while ((c = *spelling++) != '\0')
3234           *buffer++ = c;
3235       }
3236       break;
3237
3238     spell_ident:
3239     case SPELL_IDENT:
3240       if (forstring)
3241         {
               /* Copy the identifier as it was originally spelt.  */
3242           memcpy (buffer, NODE_NAME (token->val.node.spelling),
3243                   NODE_LEN (token->val.node.spelling));
3244           buffer += NODE_LEN (token->val.node.spelling);
3245         }
3246       else
3247         buffer = _cpp_spell_ident_ucns (buffer, token->val.node.node);
3248       break;
3249
3250     case SPELL_LITERAL:
3251       memcpy (buffer, token->val.str.text, token->val.str.len);
3252       buffer += token->val.str.len;
3253       break;
3254
3255     case SPELL_NONE:
3256       cpp_error (pfile, CPP_DL_ICE,
3257                  "unspellable token %s", TOKEN_NAME (token));
3258       break;
3259     }
3260
3261   return buffer;
3262 }
3263
3264 /* Returns TOKEN spelt as a null-terminated string.  The string is
3265    freed when the reader is destroyed.  Useful for diagnostics.
        The storage comes from _cpp_unaligned_alloc and is sized from
        cpp_token_len.  Extended identifiers are written using UCNs,
        since cpp_spell_token is called with FORSTRING false.  */
3266 unsigned char *
3267 cpp_token_as_text (cpp_reader *pfile, const cpp_token *token)
3268 {
       /* One extra byte for the terminating NUL.  */
3269   unsigned int len = cpp_token_len (token) + 1;
3270   unsigned char *start = _cpp_unaligned_alloc (pfile, len), *end;
3271
3272   end = cpp_spell_token (pfile, token, start, false);
3273   end[0] = '\0';
3274
3275   return start;
3276 }
3277
3278 /* Returns a pointer to a string which spells the token defined by
3279    TYPE and FLAGS.  Used by C front ends, which really should move to
3280    using cpp_token_as_text.

        If FLAGS has DIGRAPH set the digraph spelling of TYPE is returned;
        otherwise, if it has NAMED_OP set, the result of
        cpp_named_operator2name; otherwise the name recorded for TYPE in
        token_spellings.  The returned pointer is not a fresh allocation
        in the first and last cases.  */
3281 const char *
3282 cpp_type2name (enum cpp_ttype type, unsigned char flags)
3283 {
3284   if (flags & DIGRAPH)
3285     return (const char *) cpp_digraph2name (type);
3286   else if (flags & NAMED_OP)
3287     return cpp_named_operator2name (type);
3288
3289   return (const char *) token_spellings[type].name;
3290 }
3291
3292 /* Writes the spelling of token to FP, without any preceding space.
3293    Separated from cpp_spell_token for efficiency - to avoid stdio
3294    double-buffering.

        Identifiers and named operators are always written with UCNs in
        place of non-ASCII sequences.  Nothing is written for a token
        whose type has no spelling.  Return values of the stdio calls are
        not checked.  */
3295 void
3296 cpp_output_token (const cpp_token *token, FILE *fp)
3297 {
3298   switch (TOKEN_SPELL (token))
3299     {
3300     case SPELL_OPERATOR:
3301       {
3302         const unsigned char *spelling;
3303         int c;
3304
3305         if (token->flags & DIGRAPH)
3306           spelling = cpp_digraph2name (token->type);
3307         else if (token->flags & NAMED_OP)
3308           goto spell_ident;
3309         else
3310           spelling = TOKEN_NAME (token);
3311
             /* The first character is written unconditionally, so the
                spelling is assumed to be non-empty.  */
3312         c = *spelling;
3313         do
3314           putc (c, fp);
3315         while ((c = *++spelling) != '\0');
3316       }
3317       break;
3318
3319     spell_ident:
3320     case SPELL_IDENT:
3321       {
3322         size_t i;
3323         const unsigned char * name = NODE_NAME (token->val.node.node);
3324
3325         for (i = 0; i < NODE_LEN (token->val.node.node); i++)
3326           if (name[i] & ~0x7F)
3327             {
                   /* Replace the multi-byte sequence by its ten-byte
                      escape and skip the bytes utf8_to_ucn consumed; the
                      - 1 makes up for the increment done by the loop.  */
3328               unsigned char buffer[10];
3329               i += utf8_to_ucn (buffer, name + i) - 1;
3330               fwrite (buffer, 1, 10, fp);
3331             }
3332           else
3333             fputc (NODE_NAME (token->val.node.node)[i], fp);
3334       }
3335       break;
3336
3337     case SPELL_LITERAL:
3338       fwrite (token->val.str.text, 1, token->val.str.len, fp);
3339       break;
3340
3341     case SPELL_NONE:
3342       /* An error, most probably.  */
3343       break;
3344     }
3345 }
3346
3347 /* Compare two tokens.  Returns nonzero if A and B have the same type,
        the same flags and the same value, and 0 otherwise.  Which member
        of the value is compared depends on the spelling category of the
        type of A.  */
3348 int
3349 _cpp_equiv_tokens (const cpp_token *a, const cpp_token *b)
3350 {
3351   if (a->type == b->type && a->flags == b->flags)
3352     switch (TOKEN_SPELL (a))
3353       {
3354       default:                  /* Keep compiler happy.  */
3355       case SPELL_OPERATOR:
3356         /* token_no is used to track where multiple consecutive ##
3357            tokens were originally located.  */
3358         return (a->type != CPP_PASTE || a->val.token_no == b->val.token_no);
3359       case SPELL_NONE:
             /* Only macro arguments carry a value to compare here; they
                must agree in argument number and spelling node.  */
3360         return (a->type != CPP_MACRO_ARG
3361                 || (a->val.macro_arg.arg_no == b->val.macro_arg.arg_no
3362                     && a->val.macro_arg.spelling == b->val.macro_arg.spelling));
3363       case SPELL_IDENT:
             /* The node pointers themselves are compared, for both the
                identifier and its original spelling.  */
3364         return (a->val.node.node == b->val.node.node
3365                 && a->val.node.spelling == b->val.node.spelling);
3366       case SPELL_LITERAL:
3367         return (a->val.str.len == b->val.str.len
3368                 && !memcmp (a->val.str.text, b->val.str.text,
3369                             a->val.str.len));
3370       }
3371
3372   return 0;
3373 }
3374
3375 /* Returns nonzero if a space should be inserted to avoid an
3376    accidental token paste for output.  For simplicity, it is
3377    conservative, and occasionally advises a space where one is not
3378    needed, e.g. "." and ".2".

        TOKEN1 is the left-hand token and TOKEN2 the token that would
        follow it.  Named operators on either side are treated as
        CPP_NAME.  */
3379 int
3380 cpp_avoid_paste (cpp_reader *pfile, const cpp_token *token1,
3381                  const cpp_token *token2)
3382 {
3383   enum cpp_ttype a = token1->type, b = token2->type;
3384   cppchar_t c;
3385
3386   if (token1->flags & NAMED_OP)
3387     a = CPP_NAME;
3388   if (token2->flags & NAMED_OP)
3389     b = CPP_NAME;
3390
       /* C is the first character of the spelling of TOKEN2 when TOKEN2
          is a digraph or an operator, and EOF otherwise, so that none of
          the character tests below can match.  */
3391   c = EOF;
3392   if (token2->flags & DIGRAPH)
3393     c = digraph_spellings[(int) b - (int) CPP_FIRST_DIGRAPH][0];
3394   else if (token_spellings[b].category == SPELL_OPERATOR)
3395     c = token_spellings[b].name[0];
3396
3397   /* Quickly get everything that can paste with an '='.  */
3398   if ((int) a <= (int) CPP_LAST_EQ && c == '=')
3399     return 1;
3400
       /* Each case lists, for one type of TOKEN1, the first characters
          (or types) of TOKEN2 for which a space is advised.  */
3401   switch (a)
3402     {
3403     case CPP_GREATER:   return c == '>';
3404     case CPP_LESS:      return c == '<' || c == '%' || c == ':';
3405     case CPP_PLUS:      return c == '+';
3406     case CPP_MINUS:     return c == '-' || c == '>';
3407     case CPP_DIV:       return c == '/' || c == '*'; /* Comments.  */
3408     case CPP_MOD:       return c == ':' || c == '>';
3409     case CPP_AND:       return c == '&';
3410     case CPP_OR:        return c == '|';
3411     case CPP_COLON:     return c == ':' || c == '>';
3412     case CPP_DEREF:     return c == '*';
3413     case CPP_DOT:       return c == '.' || c == '%' || b == CPP_NUMBER;
3414     case CPP_HASH:      return c == '#' || c == '%'; /* Digraph form.  */
3415     case CPP_NAME:      return ((b == CPP_NUMBER
3416                                  && name_p (pfile, &token2->val.str))
3417                                 || b == CPP_NAME
3418                                 || b == CPP_CHAR || b == CPP_STRING); /* L */
3419     case CPP_NUMBER:    return (b == CPP_NUMBER || b == CPP_NAME
3420                                 || c == '.' || c == '+' || c == '-');
3421                                       /* UCNs */
3422     case CPP_OTHER:     return ((token1->val.str.text[0] == '\\'
3423                                  && b == CPP_NAME)
3424                                 || (CPP_OPTION (pfile, objc)
3425                                     && token1->val.str.text[0] == '@'
3426                                     && (b == CPP_NAME || b == CPP_STRING)));
         /* With user-defined literals enabled, a string followed by a
            name, or by a literal starting with an identifier-start
            character, gets a space.  */
3427     case CPP_STRING:
3428     case CPP_WSTRING:
3429     case CPP_UTF8STRING:
3430     case CPP_STRING16:
3431     case CPP_STRING32:  return (CPP_OPTION (pfile, user_literals)
3432                                 && (b == CPP_NAME
3433                                     || (TOKEN_SPELL (token2) == SPELL_LITERAL
3434                                         && ISIDST (token2->val.str.text[0]))));
3435
3436     default:            break;
3437     }
3438
3439   return 0;
3440 }
3441
3442 /* Output all the remaining tokens on the current line, and a newline
3443    character, to FP.  Leading whitespace is removed.  If there are
3444    macros, special token padding is not performed.

        Tokens are read with cpp_get_token until CPP_EOF.  After each
        token, a single space is written if the token fetched next has
        PREV_WHITE set; this test is also applied to the final CPP_EOF
        token, so a space can precede the newline.  */
3445 void
3446 cpp_output_line (cpp_reader *pfile, FILE *fp)
3447 {
3448   const cpp_token *token;
3449
3450   token = cpp_get_token (pfile);
3451   while (token->type != CPP_EOF)
3452     {
3453       cpp_output_token (token, fp);
           /* Fetch the next token first: its PREV_WHITE flag decides
              whether a space follows the token just written.  */
3454       token = cpp_get_token (pfile);
3455       if (token->flags & PREV_WHITE)
3456         putc (' ', fp);
3457     }
3458
3459   putc ('\n', fp);
3460 }
3461
3462 /* Return a string representation of all the remaining tokens on the
3463    current line.  The result is allocated using xmalloc and must be
3464    freed by the caller.

        If DIR_NAME is not NULL, the string starts with a hash, DIR_NAME
        and one space.  Tokens are read with cpp_get_token until CPP_EOF
        and spelt by cpp_spell_token with FORSTRING false.  A single space
        is appended after a token when the token fetched next has
        PREV_WHITE set.  The result is NUL-terminated.  */
3465 unsigned char *
3466 cpp_output_line_to_string (cpp_reader *pfile, const unsigned char *dir_name)
3467 {
3468   const cpp_token *token;
3469   unsigned int out = dir_name ? ustrlen (dir_name) : 0;
3470   unsigned int alloced = 120 + out;
3471   unsigned char *result = (unsigned char *) xmalloc (alloced);
3472
3473   /* If DIR_NAME is NULL, there are no initial contents.  */
3474   if (dir_name)
3475     {
3476       sprintf ((char *) result, "#%s ", dir_name);
           /* OUT already counts DIR_NAME itself; add the hash and the
              space written around it.  */
3477       out += 2;
3478     }
3479
3480   token = cpp_get_token (pfile);
3481   while (token->type != CPP_EOF)
3482     {
3483       unsigned char *last;
3484       /* Include room for a possible space and the terminating nul.  */
3485       unsigned int len = cpp_token_len (token) + 2;
3486
3487       if (out + len > alloced)
3488         {
               /* Grow by doubling, or to exactly what is needed if
                  doubling is still not enough.  */
3489           alloced *= 2;
3490           if (out + len > alloced)
3491             alloced = out + len;
3492           result = (unsigned char *) xrealloc (result, alloced);
3493         }
3494
3495       last = cpp_spell_token (pfile, token, &result[out], 0);
3496       out = last - result;
3497
3498       token = cpp_get_token (pfile);
3499       if (token->flags & PREV_WHITE)
3500         result[out++] = ' ';
3501     }
3502
3503   result[out] = '\0';
3504   return result;
3505 }
3506
/* Memory buffers.  Changing these three definitions can have a
   dramatic effect on performance.  The values here are reasonable
   defaults, but might be tuned.  If you adjust them, be sure to test
   across a range of uses of cpplib, including heavy nested
   function-like macro expansion.  Also check the change in peak
   memory usage (NJAMD is a good tool for this).

   MIN_BUFF_SIZE is the smallest data area new_buff will allocate.

   BUFF_SIZE_UPPER_BOUND (MIN_SIZE) is the largest free-list buffer
   that _cpp_get_buff will hand out for a request of MIN_SIZE bytes.

   EXTENDED_BUFF_SIZE (BUFF, MIN_EXTRA) is the size requested when
   extending BUFF: MIN_EXTRA plus twice the span between BUFF->cur
   and BUFF->limit.  Every argument is parenthesized so that an
   argument containing a lower-precedence operator still expands as
   intended; BUFF is evaluated twice, so it must be free of side
   effects.  */
#define MIN_BUFF_SIZE 8000
#define BUFF_SIZE_UPPER_BOUND(MIN_SIZE) (MIN_BUFF_SIZE + (MIN_SIZE) * 3 / 2)
#define EXTENDED_BUFF_SIZE(BUFF, MIN_EXTRA) \
        ((MIN_EXTRA) + ((BUFF)->limit - (BUFF)->cur) * 2)

/* A request of zero bytes must still accept a minimum-sized buffer.  */
#if MIN_BUFF_SIZE > BUFF_SIZE_UPPER_BOUND (0)
  #error BUFF_SIZE_UPPER_BOUND must be at least as large as MIN_BUFF_SIZE!
#endif
3521
3522 /* Create a new allocation buffer.  Place the control block at the end
3523    of the buffer, so that buffer overflows will cause immediate chaos.  */
3524 static _cpp_buff *
3525 new_buff (size_t len)
3526 {
3527   _cpp_buff *result;
3528   unsigned char *base;
3529
3530   if (len < MIN_BUFF_SIZE)
3531     len = MIN_BUFF_SIZE;
3532   len = CPP_ALIGN (len);
3533
3534 #ifdef ENABLE_VALGRIND_ANNOTATIONS
3535   /* Valgrind warns about uses of interior pointers, so put _cpp_buff
3536      struct first.  */
3537   size_t slen = CPP_ALIGN2 (sizeof (_cpp_buff), 2 * DEFAULT_ALIGNMENT);
3538   base = XNEWVEC (unsigned char, len + slen);
3539   result = (_cpp_buff *) base;
3540   base += slen;
3541 #else
3542   base = XNEWVEC (unsigned char, len + sizeof (_cpp_buff));
3543   result = (_cpp_buff *) (base + len);
3544 #endif
3545   result->base = base;
3546   result->cur = base;
3547   result->limit = base + len;
3548   result->next = NULL;
3549   return result;
3550 }
3551
3552 /* Place a chain of unwanted allocation buffers on the free list.  */
3553 void
3554 _cpp_release_buff (cpp_reader *pfile, _cpp_buff *buff)
3555 {
3556   _cpp_buff *end = buff;
3557
3558   while (end->next)
3559     end = end->next;
3560   end->next = pfile->free_buffs;
3561   pfile->free_buffs = buff;
3562 }
3563
3564 /* Return a free buffer of size at least MIN_SIZE.  */
3565 _cpp_buff *
3566 _cpp_get_buff (cpp_reader *pfile, size_t min_size)
3567 {
3568   _cpp_buff *result, **p;
3569
3570   for (p = &pfile->free_buffs;; p = &(*p)->next)
3571     {
3572       size_t size;
3573
3574       if (*p == NULL)
3575         return new_buff (min_size);
3576       result = *p;
3577       size = result->limit - result->base;
3578       /* Return a buffer that's big enough, but don't waste one that's
3579          way too big.  */
3580       if (size >= min_size && size <= BUFF_SIZE_UPPER_BOUND (min_size))
3581         break;
3582     }
3583
3584   *p = result->next;
3585   result->next = NULL;
3586   result->cur = result->base;
3587   return result;
3588 }
3589
3590 /* Creates a new buffer with enough space to hold the uncommitted
3591    remaining bytes of BUFF, and at least MIN_EXTRA more bytes.  Copies
3592    the excess bytes to the new buffer.  Chains the new buffer after
3593    BUFF, and returns the new buffer.  */
3594 _cpp_buff *
3595 _cpp_append_extend_buff (cpp_reader *pfile, _cpp_buff *buff, size_t min_extra)
3596 {
3597   size_t size = EXTENDED_BUFF_SIZE (buff, min_extra);
3598   _cpp_buff *new_buff = _cpp_get_buff (pfile, size);
3599
3600   buff->next = new_buff;
3601   memcpy (new_buff->base, buff->cur, BUFF_ROOM (buff));
3602   return new_buff;
3603 }
3604
3605 /* Creates a new buffer with enough space to hold the uncommitted
3606    remaining bytes of the buffer pointed to by BUFF, and at least
3607    MIN_EXTRA more bytes.  Copies the excess bytes to the new buffer.
3608    Chains the new buffer before the buffer pointed to by BUFF, and
3609    updates the pointer to point to the new buffer.  */
3610 void
3611 _cpp_extend_buff (cpp_reader *pfile, _cpp_buff **pbuff, size_t min_extra)
3612 {
3613   _cpp_buff *new_buff, *old_buff = *pbuff;
3614   size_t size = EXTENDED_BUFF_SIZE (old_buff, min_extra);
3615
3616   new_buff = _cpp_get_buff (pfile, size);
3617   memcpy (new_buff->base, old_buff->cur, BUFF_ROOM (old_buff));
3618   new_buff->next = old_buff;
3619   *pbuff = new_buff;
3620 }
3621
3622 /* Free a chain of buffers starting at BUFF.  */
3623 void
3624 _cpp_free_buff (_cpp_buff *buff)
3625 {
3626   _cpp_buff *next;
3627
3628   for (; buff; buff = next)
3629     {
3630       next = buff->next;
3631 #ifdef ENABLE_VALGRIND_ANNOTATIONS
3632       free (buff);
3633 #else
3634       free (buff->base);
3635 #endif
3636     }
3637 }
3638
3639 /* Allocate permanent, unaligned storage of length LEN.  */
3640 unsigned char *
3641 _cpp_unaligned_alloc (cpp_reader *pfile, size_t len)
3642 {
3643   _cpp_buff *buff = pfile->u_buff;
3644   unsigned char *result = buff->cur;
3645
3646   if (len > (size_t) (buff->limit - result))
3647     {
3648       buff = _cpp_get_buff (pfile, len);
3649       buff->next = pfile->u_buff;
3650       pfile->u_buff = buff;
3651       result = buff->cur;
3652     }
3653
3654   buff->cur = result + len;
3655   return result;
3656 }
3657
3658 /* Allocate permanent, unaligned storage of length LEN from a_buff.
3659    That buffer is used for growing allocations when saving macro
3660    replacement lists in a #define, and when parsing an answer to an
3661    assertion in #assert, #unassert or #if (and therefore possibly
3662    whilst expanding macros).  It therefore must not be used by any
3663    code that they might call: specifically the lexer and the guts of
3664    the macro expander.
3665
3666    All existing other uses clearly fit this restriction: storing
3667    registered pragmas during initialization.  */
3668 unsigned char *
3669 _cpp_aligned_alloc (cpp_reader *pfile, size_t len)
3670 {
3671   _cpp_buff *buff = pfile->a_buff;
3672   unsigned char *result = buff->cur;
3673
3674   if (len > (size_t) (buff->limit - result))
3675     {
3676       buff = _cpp_get_buff (pfile, len);
3677       buff->next = pfile->a_buff;
3678       pfile->a_buff = buff;
3679       result = buff->cur;
3680     }
3681
3682   buff->cur = result + len;
3683   return result;
3684 }
3685
3686 /* Say which field of TOK is in use.  */
3687
3688 enum cpp_token_fld_kind
3689 cpp_token_val_index (const cpp_token *tok)
3690 {
3691   switch (TOKEN_SPELL (tok))
3692     {
3693     case SPELL_IDENT:
3694       return CPP_TOKEN_FLD_NODE;
3695     case SPELL_LITERAL:
3696       return CPP_TOKEN_FLD_STR;
3697     case SPELL_OPERATOR:
3698       if (tok->type == CPP_PASTE)
3699         return CPP_TOKEN_FLD_TOKEN_NO;
3700       else
3701         return CPP_TOKEN_FLD_NONE;
3702     case SPELL_NONE:
3703       if (tok->type == CPP_MACRO_ARG)
3704         return CPP_TOKEN_FLD_ARG_NO;
3705       else if (tok->type == CPP_PADDING)
3706         return CPP_TOKEN_FLD_SOURCE;
3707       else if (tok->type == CPP_PRAGMA)
3708         return CPP_TOKEN_FLD_PRAGMA;
3709       /* fall through */
3710     default:
3711       return CPP_TOKEN_FLD_NONE;
3712     }
3713 }
3714
3715 /* All tokens lexed in R after calling this function will be forced to have
3716    their source_location the same as the location referenced by P, until
3717    cpp_stop_forcing_token_locations is called for R.  */
3718
3719 void
3720 cpp_force_token_locations (cpp_reader *r, source_location *p)
3721 {
3722   r->forced_token_location_p = p;
3723 }
3724
3725 /* Go back to assigning locations naturally for lexed tokens.  */
3726
3727 void
3728 cpp_stop_forcing_token_locations (cpp_reader *r)
3729 {
3730   r->forced_token_location_p = NULL;
3731 }