libcpp/lex.c

   1 /* CPP Library - lexical analysis.
   2    Copyright (C) 2000-2014 Free Software Foundation, Inc.
   3    Contributed by Per Bothner, 1994-95.
   4    Based on CCCP program by Paul Rubin, June 1986
   5    Adapted to ANSI C, Richard Stallman, Jan 1987
   6    Broken out to separate file, Zack Weinberg, Mar 2000
   7
   8 This program is free software; you can redistribute it and/or modify it
   9 under the terms of the GNU General Public License as published by the
  10 Free Software Foundation; either version 3, or (at your option) any
  11 later version.
  12
  13 This program is distributed in the hope that it will be useful,
  14 but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16 GNU General Public License for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with this program; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "cpplib.h"
  25 #include "internal.h"
  26
  27 enum spell_type
  28 {
     /* Spelling category recorded for each token type in token_spellings.
        OP() table entries always get SPELL_OPERATOR; TK() entries name
        their category explicitly through the SPELL_ ## s paste.  */
  29   SPELL_OPERATOR = 0,
  30   SPELL_IDENT,
  31   SPELL_LITERAL,
  32   SPELL_NONE
  33 };
  34
  35 struct token_spelling
  36 {
     /* One entry of the token_spellings table.  */
  37   enum spell_type category;   /* Spelling category of the token type.  */
  38   const unsigned char *name;  /* OP: operator text; TK: enumerator name.  */
  39 };
  40
  41 static const unsigned char *const digraph_spellings[] =
     /* Source spellings of the six digraphs.  NOTE(review): the order
        presumably matches how digraph tokens are indexed where this table
        is used -- confirm before reordering.  */
  42 { UC"%:", UC"%:%:", UC"<:", UC":>", UC"<%", UC"%>" };
  43
     /* Build token_spellings by expanding TTYPE_TABLE with OP and TK defined
        to emit one initializer each: OP pairs SPELL_OPERATOR with the
        operator text S, TK pairs SPELL_<S> with the stringized enumerator
        name E.  Both macros are undefined again right after the table.  */
  44 #define OP(e, s) { SPELL_OPERATOR, UC s  },
  45 #define TK(e, s) { SPELL_ ## s,    UC #e },
  46 static const struct token_spelling token_spellings[N_TTYPES] = { TTYPE_TABLE };
  47 #undef OP
  48 #undef TK
  49
     /* Look up the spelling category / name recorded for TOKEN's type.  */
  50 #define TOKEN_SPELL(token) (token_spellings[(token)->type].category)
  51 #define TOKEN_NAME(token) (token_spellings[(token)->type].name)
  52
     /* Prototypes of the static helpers of this file.  */
  53 static void add_line_note (cpp_buffer *, const uchar *, unsigned int);
  54 static int skip_line_comment (cpp_reader *);
  55 static void skip_whitespace (cpp_reader *, cppchar_t);
  56 static void lex_string (cpp_reader *, cpp_token *, const uchar *);
  57 static void save_comment (cpp_reader *, cpp_token *, const uchar *, cppchar_t);
  58 static void store_comment (cpp_reader *, cpp_token *);
  59 static void create_literal (cpp_reader *, cpp_token *, const uchar *,
  60                             unsigned int, enum cpp_ttype);
  61 static bool warn_in_comment (cpp_reader *, _cpp_line_note *);
  62 static int name_p (cpp_reader *, const cpp_string *);
  63 static tokenrun *next_tokenrun (tokenrun *);
  64
  65 static _cpp_buff *new_buff (size_t);
  66
  67
  68 /* Utility routine:
  69
  70    Report whether TOKEN is a CPP_NAME whose identifier is spelled
  71    exactly like the NUL-terminated STRING: 1 if so, 0 otherwise.  */
  72 int
  73 cpp_ideq (const cpp_token *token, const char *string)
  74 {
  75   const uchar *wanted = (const uchar *) string;
  76
  77   return (token->type == CPP_NAME
  78           && ustrcmp (NODE_NAME (token->val.node.node), wanted) == 0);
  79 }
  80
  81 /* Add a TYPE note for byte POS to BUFFER's line notes, growing if full.  */
  82 static void
  83 add_line_note (cpp_buffer *buffer, const uchar *pos, unsigned int type)
  84 {
  85   _cpp_line_note *slot;
  86
  87   if (buffer->notes_cap == buffer->notes_used)
  88     {
  89       /* Same growth policy as before: double the capacity and add 200.  */
  90       buffer->notes_cap = 200 + 2 * buffer->notes_cap;
  91       buffer->notes = XRESIZEVEC (_cpp_line_note, buffer->notes,
  92                                   buffer->notes_cap);
  93     }
  94   slot = &buffer->notes[buffer->notes_used++];
  95   slot->pos = pos;
  96   slot->type = type;
  97 }
  97
  98 \f
  99 /* Fast path to find line special characters using optimized character
 100    scanning algorithms.  Anything complicated falls back to the slow
 101    path below.  Since this loop is very hot it's worth doing these kinds
 102    of optimizations.
 103
 104    One of the paths through the ifdefs should provide
 105
 106      const uchar *search_line_fast (const uchar *s, const uchar *end);
 107
 108    Between S and END, search for \n, \r, \\, ?.  Return a pointer to
 109    the found character.
 110
 111    Note that the last character of the buffer is *always* a newline,
 112    as forced by _cpp_convert_input.  This fact can be used to avoid
 113    explicitly looking for the end of the buffer.  */
 114
 115 /* Configure gives us an ifdef test.  Turn it into a macro that always
        has a value, so the code below can test it in ordinary `if'
        conditions as well as in #if.  */
 116 #ifndef WORDS_BIGENDIAN
 117 #define WORDS_BIGENDIAN 0
 118 #endif
 119
 120 /* We'd like the largest integer that fits into a register.  There's nothing
 121    in <stdint.h> that gives us that.  For most hosts this is unsigned long,
 122    but MS decided on an LLP64 model.  Thankfully when building with GCC we
 123    can get the "real" word size.  */
 124 #ifdef __GNUC__
 125 typedef unsigned int word_type __attribute__((__mode__(__word__)));
 126 #else
 127 typedef unsigned long word_type;
 128 #endif
 129
 130 /* The code below is only expecting sizes 4 or 8.
 131    Die at compile-time if this expectation is violated.  The array size
        below evaluates to 1 when the size is acceptable and to -1, an
        invalid array size, when it is not.  */
 132 typedef char check_word_type_size
 133   [(sizeof(word_type) == 8 || sizeof(word_type) == 4) * 2 - 1];
 134
 135 /* Return VAL with its first N bytes, counted in memory order, cleared
 136    to NUL, which is not one of the characters being searched for.  */
 137
 138 static inline word_type
 139 acc_char_mask_misalign (word_type val, unsigned int n)
 140 {
 141   const unsigned int shift = n * 8;
 142   const word_type all_ones = ~(word_type) 0;
 143
 144   /* The first bytes in memory are the most significant ones on a
 145      big-endian host and the least significant ones otherwise.  */
 146   return val & (WORDS_BIGENDIAN ? all_ones >> shift : all_ones << shift);
 147 }
 148
 149 /* Return X replicated to all byte positions within WORD_TYPE.  Work in
 150    word_type so that X >= 0x80 cannot overflow or sign-extend an int.  */
 151
 152 static inline word_type
 153 acc_char_replicate (uchar x)
 154 {
 155   /* 255 * 0x01010101 is 0xffffffff, so the low 32 bits never overflow.  */
 156   word_type ret = (word_type) x * 0x01010101U;
 157
 158   if (sizeof(word_type) == 8)
 159     ret |= ret << 16 << 16;
 160   return ret;
 161 }
 161
 162 /* Return non-zero if some byte of VAL is (probably) the byte that is
 163    replicated in C.  Only the alpha variant gives an exact answer.  */
 164 static inline word_type
 165 acc_char_cmp (word_type val, word_type c)
 166 {
 167   const word_type diff = val ^ c;
 168 #if defined(__GNUC__) && defined(__alpha__)
 169   /* The compare-bytes instruction is exact: (val == c) is obtained
 170      per byte as (0 >= diff).  */
 171   return __builtin_alpha_cmpbge (0, diff);
 172 #else
 173   /* All ones except bit 0 of each byte above the lowest, and the top bit.  */
 174   word_type holes = 0x7efefeffU;
 175
 176   if (sizeof(word_type) == 8)
 177     holes = ((word_type) 0x7efefefeU << 16 << 16) | 0xfefefeffU;
 178   return ((diff + holes) ^ ~diff) & ~holes;
 179 #endif
 180 }
 181
 182 /* CMP is a non-zero acc_char_cmp result for the word VAL.  Return the
 183    memory-order index of the first byte of VAL that really is one of
 184    '\n', '\r', '\\' or '?', or -1 if the hit was a false positive.  */
 185 static inline int
 186 acc_char_index (word_type cmp ATTRIBUTE_UNUSED,
 187                 word_type val ATTRIBUTE_UNUSED)
 188 {
 189 #if defined(__GNUC__) && defined(__alpha__) && !WORDS_BIGENDIAN
 190   /* cmpbge sets one result *bit* per matching byte, with no false
 191      positives, so the lowest set bit is the answer.  */
 192   return __builtin_ctzl (cmp);
 193 #else
 194   unsigned int idx = 0;
 195
 196   /* ??? It would be nice to force unrolling here,
 197      and have all of these constants folded.  */
 198   while (idx < sizeof(word_type))
 199     {
 200       /* Bring byte IDX, counted in memory order, down to the low bits.  */
 201       const unsigned int shift
 202         = (WORDS_BIGENDIAN ? sizeof(word_type) - idx - 1 : idx) * 8;
 203       const uchar ch = (val >> shift) & 0xff;
 204
 205       if (ch == '?' || ch == '\\' || ch == '\r' || ch == '\n')
 206         return idx;
 207       ++idx;
 208     }
 209
 210   return -1;
 211 #endif
 212 }
 213
 214 /* A version of the fast scanner using bit fiddling techniques.
 215
 216    Testing every byte of a 32-bit word for the four characters
 217    would normally take 16 comparisons and 16 branches; this
 218    algorithm takes 24 arithmetic operations and one branch.  Whether
 219    that wins with a 32-bit word size is somewhat system dependent.
 220
 221    With 64-bit words twice as many comparisons and branches are
 222    eliminated for the same number of arithmetic operations, so it
 223    is almost certainly a win with a 64-bit word size.  */
 224
 225 static const uchar * search_line_acc_char (const uchar *, const uchar *)
 226   ATTRIBUTE_UNUSED;
 227
 228 static const uchar *
 229 search_line_acc_char (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 230 {
 231   const word_type all_nl = acc_char_replicate ('\n');
 232   const word_type all_cr = acc_char_replicate ('\r');
 233   const word_type all_bs = acc_char_replicate ('\\');
 234   const word_type all_qm = acc_char_replicate ('?');
 235
 236   /* Start from the aligned word that contains S; the bytes in front
 237      of S are blanked below so that they cannot match.  */
 238   const unsigned int lead = (uintptr_t)s & (sizeof(word_type) - 1);
 239   const word_type *wp = (word_type *)((uintptr_t)s & -sizeof(word_type));
 240   word_type word = *wp;
 241
 242   if (lead != 0)
 243     word = acc_char_mask_misalign (word, lead);
 244
 245   /* One word per iteration.  END is never tested: the loop only stops
 246      on a confirmed match, relying on the buffer always ending in a
 247      newline.  */
 248   for (;; word = *++wp)
 249     {
 250       const word_type hits = (acc_char_cmp (word, all_nl)
 251                               | acc_char_cmp (word, all_cr)
 252                               | acc_char_cmp (word, all_bs)
 253                               | acc_char_cmp (word, all_qm));
 254
 255       if (__builtin_expect (hits != 0, 0))
 256         {
 257           /* HITS may be a false positive; the index is then negative.  */
 258           const int at = acc_char_index (hits, word);
 259
 260           if (at >= 0)
 261             return (const uchar *)wp + at;
 262         }
 263     }
 264 }
 265
 266 /* Disable on Solaris 2/x86 until the following problems can be properly
 267    autoconfed:
 268
 269    The Solaris 9 assembler cannot assemble SSE4.2 insns.
 270    Before Solaris 9 Update 6, SSE insns cannot be executed.
 271    The Solaris 10+ assembler tags objects with the instruction set
 272    extensions used, so SSE4.2 executables cannot run on machines that
 273    don't support that extension.  */
 274
 275 #if (GCC_VERSION >= 4005) && (defined(__i386__) || defined(__x86_64__)) && !(defined(__sun__) && defined(__svr4__))
 276
 277 /* Replicated character data to be shared between implementations.
 278    Recall that outside of a context with vector support we can't
 279    define compatible vector types, therefore these are all defined
 280    in terms of raw characters.  */
     /* Row 0 holds '\n', row 1 '\r', row 2 '\\' and row 3 '?'.  The MMX
        scanner loads the first 8 bytes of a row, the SSE2 scanner all 16.  */
 281 static const char repl_chars[4][16] __attribute__((aligned(16))) = {
 282   { '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
 283     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n' },
 284   { '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
 285     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r' },
 286   { '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
 287     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\' },
 288   { '?', '?', '?', '?', '?', '?', '?', '?',
 289     '?', '?', '?', '?', '?', '?', '?', '?' },
 290 };
 291
 292 /* A version of the fast scanner using MMX vectorized byte compare insns.
 293
 294    This uses the PMOVMSKB instruction which was introduced with "MMX2",
 295    which was packaged into SSE1; it is also present in the AMD MMX
 296    extension.  Mark the function as using "sse" so that we emit a real
 297    "emms" instruction, rather than the 3dNOW "femms" instruction.

        Returns a pointer to the first '\n', '\r', '\\' or '?' at or after S.
        END is not consulted; the loop runs until it finds a match.  */
 298
 299 static const uchar *
 300 #ifndef __SSE__
 301 __attribute__((__target__("sse")))
 302 #endif
 303 search_line_mmx (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 304 {
 305   typedef char v8qi __attribute__ ((__vector_size__ (8)));
 306   typedef int __m64 __attribute__ ((__vector_size__ (8), __may_alias__));
 307
       /* Only the first 8 of the 16 replicated bytes of each row are used.  */
 308   const v8qi repl_nl = *(const v8qi *)repl_chars[0];
 309   const v8qi repl_cr = *(const v8qi *)repl_chars[1];
 310   const v8qi repl_bs = *(const v8qi *)repl_chars[2];
 311   const v8qi repl_qm = *(const v8qi *)repl_chars[3];
 312
 313   unsigned int misalign, found, mask;
 314   const v8qi *p;
 315   v8qi data, t, c;
 316
 317   /* Align the source pointer.  While MMX doesn't generate unaligned data
 318      faults, this allows us to safely scan to the end of the buffer without
 319      reading beyond the end of the last page.  */
 320   misalign = (uintptr_t)s & 7;
 321   p = (const v8qi *)((uintptr_t)s & -8);
 322   data = *p;
 323
 324   /* Create a mask for the bytes that are valid within the first
 325      8-byte block.  The Idea here is that the AND with the mask
 326      within the loop is "free", since we need some AND or TEST
 327      insn in order to set the flags for the branch anyway.  */
 328   mask = -1u << misalign;
 329
 330   /* Main loop processing 8 bytes at a time.  The first pass enters at
          START so that the block already loaded above is tested with the
          misalignment mask; every later block is tested with all bits set.  */
 331   goto start;
 332   do
 333     {
 334       data = *++p;
 335       mask = -1;
 336
 337     start:
           /* T accumulates the byte-wise equality results for the four
              characters; PMOVMSKB then packs one bit per byte into FOUND.  */
 338       t = __builtin_ia32_pcmpeqb(data, repl_nl);
 339       c = __builtin_ia32_pcmpeqb(data, repl_cr);
 340       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
 341       c = __builtin_ia32_pcmpeqb(data, repl_bs);
 342       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
 343       c = __builtin_ia32_pcmpeqb(data, repl_qm);
 344       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
 345       found = __builtin_ia32_pmovmskb (t);
 346       found &= mask;
 347     }
 348   while (!found);
 349
 350   __builtin_ia32_emms ();
 351
 352   /* FOUND contains 1 in bits for which we matched a relevant
 353      character.  Conversion to the byte index is trivial.  */
 354   found = __builtin_ctz(found);
 355   return (const uchar *)p + found;
 356 }
 357
 358 /* A version of the fast scanner using SSE2 vectorized byte compare insns.
        Returns a pointer to the first '\n', '\r', '\\' or '?' at or after S.
        END is not consulted; the loop runs until it finds a match.  */
 359
 360 static const uchar *
 361 #ifndef __SSE2__
 362 __attribute__((__target__("sse2")))
 363 #endif
 364 search_line_sse2 (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 365 {
 366   typedef char v16qi __attribute__ ((__vector_size__ (16)));
 367
 368   const v16qi repl_nl = *(const v16qi *)repl_chars[0];
 369   const v16qi repl_cr = *(const v16qi *)repl_chars[1];
 370   const v16qi repl_bs = *(const v16qi *)repl_chars[2];
 371   const v16qi repl_qm = *(const v16qi *)repl_chars[3];
 372
 373   unsigned int misalign, found, mask;
 374   const v16qi *p;
 375   v16qi data, t;
 376
 377   /* Align the source pointer.  The first load therefore starts at the
          16-byte boundary at or below S and may cover bytes in front of S;
          those are discarded through MASK.  */
 378   misalign = (uintptr_t)s & 15;
 379   p = (const v16qi *)((uintptr_t)s & -16);
 380   data = *p;
 381
 382   /* Create a mask for the bytes that are valid within the first
 383      16-byte block.  The Idea here is that the AND with the mask
 384      within the loop is "free", since we need some AND or TEST
 385      insn in order to set the flags for the branch anyway.  */
 386   mask = -1u << misalign;
 387
 388   /* Main loop processing 16 bytes at a time.  The first pass enters at
          START so that the block already loaded above is tested with the
          misalignment mask; every later block is tested with all bits set.  */
 389   goto start;
 390   do
 391     {
 392       data = *++p;
 393       mask = -1;
 394
 395     start:
 396       t  = __builtin_ia32_pcmpeqb128(data, repl_nl);
 397       t |= __builtin_ia32_pcmpeqb128(data, repl_cr);
 398       t |= __builtin_ia32_pcmpeqb128(data, repl_bs);
 399       t |= __builtin_ia32_pcmpeqb128(data, repl_qm);
 400       found = __builtin_ia32_pmovmskb128 (t);
 401       found &= mask;
 402     }
 403   while (!found);
 404
 405   /* FOUND contains 1 in bits for which we matched a relevant
 406      character.  Conversion to the byte index is trivial.  */
 407   found = __builtin_ctz(found);
 408   return (const uchar *)p + found;
 409 }
 410
 411 #ifdef HAVE_SSE4
 412 /* A version of the fast scanner using SSE 4.2 vectorized string insns.
        Returns a pointer to the first '\n', '\r', '?' or '\\' at or after S.
        END is used only to decide whether an unaligned 16-byte read at S
        is safe.  */
 413
 414 static const uchar *
 415 #ifndef __SSE4_2__
 416 __attribute__((__target__("sse4.2")))
 417 #endif
 418 search_line_sse42 (const uchar *s, const uchar *end)
 419 {
 420   typedef char v16qi __attribute__ ((__vector_size__ (16)));
       /* The four characters to search for; the explicit length 4 is
          passed alongside this vector in both PCMPESTRI uses below.  */
 421   static const v16qi search = { '\n', '\r', '?', '\\' };
 422
 423   uintptr_t si = (uintptr_t)s;
 424   uintptr_t index;
 425
 426   /* Check for unaligned input.  */
 427   if (si & 15)
 428     {
 429       v16qi sv;
 430
 431       if (__builtin_expect (end - s < 16, 0)
 432           && __builtin_expect ((si & 0xfff) > 0xff0, 0))
 433         {
 434           /* There are less than 16 bytes left in the buffer, and less
 435              than 16 bytes left on the page.  Reading 16 bytes at this
 436              point might generate a spurious page fault.  Defer to the
 437              SSE2 implementation, which already handles alignment.  */
 438           return search_line_sse2 (s, end);
 439         }
 440
 441       /* ??? The builtin doesn't understand that the PCMPESTRI read from
 442          memory need not be aligned.  */
 443       sv = __builtin_ia32_loaddqu ((const char *) s);
 444       index = __builtin_ia32_pcmpestri128 (search, 4, sv, 16, 0);
 445
           /* An index below 16 means one of the 16 bytes at S matched.  */
 446       if (__builtin_expect (index < 16, 0))
 447         goto found;
 448
 449       /* Advance the pointer to an aligned address.  We will re-scan a
 450          few bytes, but we no longer need care for reading past the
 451          end of a page, since we're guaranteed a match.  */
 452       s = (const uchar *)((si + 16) & -16);
 453     }
 454
 455   /* Main loop, processing 16 bytes at a time.  By doing the whole loop
 456      in inline assembly, we can make proper use of the flags set.  S is
          biased by -16 first because the loop adds 16 before each compare;
          the loop repeats while the compare leaves the carry flag clear.
          Operands: INDEX is the early-clobbered "c" register output, S is
          updated in place, SEARCH lives in a vector register, and the "a"
          and "d" registers carry the lengths 4 and 16.  */
 457   __asm (      "sub $16, %1\n"
 458         "       .balign 16\n"
 459         "0:     add $16, %1\n"
 460         "       %vpcmpestri $0, (%1), %2\n"
 461         "       jnc 0b"
 462         : "=&c"(index), "+r"(s)
 463         : "x"(search), "a"(4), "d"(16));
 464
 465  found:
 466   return s + index;
 467 }
 468
 469 #else
 470 /* Work around out-dated assemblers without sse4 support.  */
 471 #define search_line_sse42 search_line_sse2
 472 #endif
 473
 474 /* Check the CPU capabilities.  */
 475
 476 #include "../gcc/config/i386/cpuid.h"
 477
     /* Signature shared by every search_line_* implementation, and the
        pointer through which the chosen one is called.  The pointer is
        assigned by init_vectorized_lexer.  */
 478 typedef const uchar * (*search_line_fast_type) (const uchar *, const uchar *);
 479 static search_line_fast_type search_line_fast;
 480
 481 #define HAVE_init_vectorized_lexer 1
 482 /* Pick search_line_fast from the compile-time ISA level and CPUID.  */
 483 static inline void
 484 init_vectorized_lexer (void)
 485 {
 486   unsigned scratch, ecx = 0, edx = 0;
 487   search_line_fast_type chosen = search_line_acc_char;
 488   /* Compile-time guarantee: 3 = SSE4.2, 2 = SSE2, 1 = SSE, 0 = none.  */
 489 #if defined(__SSE4_2__)
 490   const int baseline = 3;
 491 #elif defined(__SSE2__)
 492   const int baseline = 2;
 493 #elif defined(__SSE__)
 494   const int baseline = 1;
 495 #else
 496   const int baseline = 0;
 497 #endif
 498
 499   if (baseline == 3)
 500     chosen = search_line_sse42;
 501   else if (__get_cpuid (1, &scratch, &scratch, &ecx, &edx) || baseline == 2)
 502     {
 503       if (ecx & bit_SSE4_2)
 504         chosen = search_line_sse42;
 505       else if (baseline == 2 || (edx & bit_SSE2))
 506         chosen = search_line_sse2;
 507       else if (baseline == 1 || (edx & bit_SSE))
 508         chosen = search_line_mmx;
 509     }
 510   else if (__get_cpuid (0x80000001, &scratch, &scratch, &scratch, &edx)
 511            && (baseline == 1
 512                || (edx & (bit_MMXEXT | bit_CMOV)) == (bit_MMXEXT | bit_CMOV)))
 513     chosen = search_line_mmx;
 514
 515   search_line_fast = chosen;
 516 }
 517
 518 #elif defined(_ARCH_PWR8) && defined(__ALTIVEC__)
 519
 520 /* A version of the fast scanner using AltiVec vectorized byte compares
 521    and VSX unaligned loads (when VSX is available).  This is otherwise
 522    the same as the pre-GCC 5 version.

        Returns a pointer to the first '\n', '\r', '\\' or '?' at or after S.
        Unlike the other vector variants there is no alignment or masking
        step here: each iteration loads 16 bytes starting exactly at S.  */
 523
 524 static const uchar *
 525 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 526 {
 527   typedef __attribute__((altivec(vector))) unsigned char vc;
 528
 529   const vc repl_nl = {
 530     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
 531     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
 532   };
 533   const vc repl_cr = {
 534     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
 535     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
 536   };
 537   const vc repl_bs = {
 538     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
 539     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
 540   };
 541   const vc repl_qm = {
 542     '?', '?', '?', '?', '?', '?', '?', '?',
 543     '?', '?', '?', '?', '?', '?', '?', '?',
 544   };
 545   const vc zero = { 0 };
 546
 547   vc data, t;
 548
 549   /* Main loop processing 16 bytes at a time.  */
 550   do
 551     {
 552       vc m_nl, m_cr, m_bs, m_qm;
 553
 554       data = *((const vc *)s);
 555       s += 16;
 556
 557       m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
 558       m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
 559       m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
 560       m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
 561       t = (m_nl | m_cr) | (m_bs | m_qm);
 562
 563       /* T now contains 0xff in bytes for which we matched one of the relevant
 564          characters.  We want to exit the loop if any byte in T is non-zero.
 565          Below is the expansion of vec_any_ne(t, zero).  */
 566     }
 567   while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));
 568
 569   /* Restore s to point to the 16 bytes we just processed.  */
 570   s -= 16;
 571
 572   {
       /* Number of unsigned longs that make up one vector.  */
 573 #define N  (sizeof(vc) / sizeof(long))
 574
 575     union {
 576       vc v;
 577       /* Statically assert that N is 2 or 4.  */
 578       unsigned long l[(N == 2 || N == 4) ? N : -1];
 579     } u;
 580     unsigned long l, i = 0;
 581
 582     u.v = t;
 583
 584     /* Find the first word of T that is non-zero.  S is advanced past
            every word that turns out to be zero.  With N == 4 the first two
            words are tested here and control then falls into the N == 2
            case for the remaining two.  */
 585     switch (N)
 586       {
 587       case 4:
 588         l = u.l[i++];
 589         if (l != 0)
 590           break;
 591         s += sizeof(unsigned long);
 592         l = u.l[i++];
 593         if (l != 0)
 594           break;
 595         s += sizeof(unsigned long);
             /* Fall through.  */
 596       case 2:
 597         l = u.l[i++];
 598         if (l != 0)
 599           break;
 600         s += sizeof(unsigned long);
 601         l = u.l[i];
 602       }
 603
 604     /* L now contains 0xff in bytes for which we matched one of the
 605        relevant characters.  We can find the byte index by finding
 606        its bit index and dividing by 8.  The first byte in memory is
            the most significant one on big-endian hosts (count leading
            zeros) and the least significant one otherwise (count trailing
            zeros).  */
 607 #ifdef __BIG_ENDIAN__
 608     l = __builtin_clzl(l) >> 3;
 609 #else
 610     l = __builtin_ctzl(l) >> 3;
 611 #endif
 612     return s + l;
 613
 614 #undef N
 615   }
 616 }
 617
 618 #elif (GCC_VERSION >= 4005) && defined(__ALTIVEC__) && defined (__BIG_ENDIAN__)
 619
 620 /* A version of the fast scanner using AltiVec vectorized byte compares.
 621    This cannot be used for little endian because vec_lvsl/lvsr are
 622    deprecated for little endian and the code won't work properly.
        Returns a pointer to the first '\n', '\r', '\\' or '?' at or after S.  */
 623 /* ??? Unfortunately, attribute(target("altivec")) is not yet supported,
 624    so we can't compile this function without -maltivec on the command line
 625    (or implied by some other switch).  */
 626
 627 static const uchar *
 628 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 629 {
 630   typedef __attribute__((altivec(vector))) unsigned char vc;
 631
 632   const vc repl_nl = {
 633     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
 634     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
 635   };
 636   const vc repl_cr = {
 637     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
 638     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
 639   };
 640   const vc repl_bs = {
 641     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
 642     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
 643   };
 644   const vc repl_qm = {
 645     '?', '?', '?', '?', '?', '?', '?', '?',
 646     '?', '?', '?', '?', '?', '?', '?', '?',
 647   };
 648   const vc ones = {
 649     -1, -1, -1, -1, -1, -1, -1, -1,
 650     -1, -1, -1, -1, -1, -1, -1, -1,
 651   };
 652   const vc zero = { 0 };
 653
 654   vc data, mask, t;
 655
 656   /* Altivec loads automatically mask addresses with -16.  This lets us
 657      issue the first load as early as possible.  */
 658   data = __builtin_vec_ld(0, (const vc *)s);
 659
 660   /* Discard bytes before the beginning of the buffer.  Do this by
 661      beginning with all ones and shifting in zeros according to the
 662      mis-alignment.  The LVSR instruction pulls the exact shift we
 663      want from the address.  */
 664   mask = __builtin_vec_lvsr(0, s);
 665   mask = __builtin_vec_perm(zero, ones, mask);
 666   data &= mask;
 667
 668   /* While altivec loads mask addresses, we still need to align S so
 669      that the offset we compute at the end is correct.  */
 670   s = (const uchar *)((uintptr_t)s & -16);
 671
 672   /* Main loop processing 16 bytes at a time.  The first pass enters at
          START so that the masked block loaded above is tested before any
          further load is issued.  */
 673   goto start;
 674   do
 675     {
 676       vc m_nl, m_cr, m_bs, m_qm;
 677
 678       s += 16;
 679       data = __builtin_vec_ld(0, (const vc *)s);
 680
 681     start:
 682       m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
 683       m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
 684       m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
 685       m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
 686       t = (m_nl | m_cr) | (m_bs | m_qm);
 687
 688       /* T now contains 0xff in bytes for which we matched one of the relevant
 689          characters.  We want to exit the loop if any byte in T is non-zero.
 690          Below is the expansion of vec_any_ne(t, zero).  */
 691     }
 692   while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));
 693
 694   {
       /* Number of unsigned longs that make up one vector.  */
 695 #define N  (sizeof(vc) / sizeof(long))
 696
 697     union {
 698       vc v;
 699       /* Statically assert that N is 2 or 4.  */
 700       unsigned long l[(N == 2 || N == 4) ? N : -1];
 701     } u;
 702     unsigned long l, i = 0;
 703
 704     u.v = t;
 705
 706     /* Find the first word of T that is non-zero.  S is advanced past
            every word that turns out to be zero.  With N == 4 the first two
            words are tested here and control then falls into the N == 2
            case for the remaining two.  */
 707     switch (N)
 708       {
 709       case 4:
 710         l = u.l[i++];
 711         if (l != 0)
 712           break;
 713         s += sizeof(unsigned long);
 714         l = u.l[i++];
 715         if (l != 0)
 716           break;
 717         s += sizeof(unsigned long);
             /* Fall through.  */
 718       case 2:
 719         l = u.l[i++];
 720         if (l != 0)
 721           break;
 722         s += sizeof(unsigned long);
 723         l = u.l[i];
 724       }
 725
 726     /* L now contains 0xff in bytes for which we matched one of the
 727        relevant characters.  We can find the byte index by finding
 728        its bit index and dividing by 8.  This variant is big-endian
            only, so the first byte in memory is the most significant one
            and leading zeros are counted.  */
 729     l = __builtin_clzl(l) >> 3;
 730     return s + l;
 731
 732 #undef N
 733   }
 734 }
 735
 736 #elif defined (__ARM_NEON__)
 737 #include "arm_neon.h"
 738
     /* A version of the fast scanner using NEON vectorized byte compares.
        Returns a pointer to the first '\n', '\r', '\\' or '?' at or after S.
        END is not consulted; the loop runs until it finds a match.  */
 739 static const uchar *
 740 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 741 {
 742   const uint8x16_t repl_nl = vdupq_n_u8 ('\n');
 743   const uint8x16_t repl_cr = vdupq_n_u8 ('\r');
 744   const uint8x16_t repl_bs = vdupq_n_u8 ('\\');
 745   const uint8x16_t repl_qm = vdupq_n_u8 ('?');
       /* One distinct bit (0x01, 0x02, ... 0x80) per byte of each 64-bit
          half; ANDing it with the compare result leaves a unique bit for
          every matching byte.  */
 746   const uint8x16_t xmask = (uint8x16_t) vdupq_n_u64 (0x8040201008040201ULL);
 747
 748   unsigned int misalign, found, mask;
 749   const uint8_t *p;
 750   uint8x16_t data;
 751
 752   /* Align the source pointer.  */
 753   misalign = (uintptr_t)s & 15;
 754   p = (const uint8_t *)((uintptr_t)s & -16);
 755   data = vld1q_u8 (p);
 756
 757   /* Create a mask for the bytes that are valid within the first
 758      16-byte block.  The Idea here is that the AND with the mask
 759      within the loop is "free", since we need some AND or TEST
 760      insn in order to set the flags for the branch anyway.  */
 761   mask = (-1u << misalign) & 0xffff;
 762
 763   /* Main loop, processing 16 bytes at a time.  The first pass enters at
          START so that the block already loaded above is tested with the
          misalignment mask.  */
 764   goto start;
 765
 766   do
 767     {
 768       uint8x8_t l;
 769       uint16x4_t m;
 770       uint32x2_t n;
 771       uint8x16_t t, u, v, w;
 772
 773       p += 16;
 774       data = vld1q_u8 (p);
 775       mask = 0xffff;
 776
 777     start:
 778       t = vceqq_u8 (data, repl_nl);
 779       u = vceqq_u8 (data, repl_cr);
 780       v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
 781       w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
 782       t = vandq_u8 (vorrq_u8 (v, w), xmask);
           /* Fold the per-byte bits with pairwise adds: afterwards lane 0 of
              N holds the 8 match bits of the low half and lane 1 those of
              the high half.  */
 783       l = vpadd_u8 (vget_low_u8 (t), vget_high_u8 (t));
 784       m = vpaddl_u8 (l);
 785       n = vpaddl_u16 (m);
 786
           /* Viewing N as one 64-bit value, shifting right by 24 moves
              lane 1 into bits 8-15, so FOUND becomes a 16-bit mask with
              bit i set when byte i of the block matched.  */
 787       found = vget_lane_u32 ((uint32x2_t) vorr_u64 ((uint64x1_t) n,
 788               vshr_n_u64 ((uint64x1_t) n, 24)), 0);
 789       found &= mask;
 790     }
 791   while (!found);
 792
 793   /* FOUND contains 1 in bits for which we matched a relevant
 794      character.  Conversion to the byte index is trivial.  */
 795   found = __builtin_ctz (found);
 796   return (const uchar *)p + found;
 797 }
 798
 799 #else
 800
 801 /* We only have one accelerated alternative.  Use a direct call so that
 802    we encourage inlining.  On these hosts search_line_fast is simply
        another name for the word-at-a-time scanner.  */
 803
 804 #define search_line_fast  search_line_acc_char
 805
 806 #endif
 807
 808 /* One-time lexer setup: where init_vectorized_lexer is available,
 809    call it so that it selects the search_line_fast implementation.  */
 810 void
 811 _cpp_init_lexer (void)
 812 {
 813 #if defined (HAVE_init_vectorized_lexer)
 814   init_vectorized_lexer ();
 815 #endif
 816 }
 817
 818 /* Returns with a logical line that contains no escaped newlines or
 819    trigraphs.  This is a time-critical inner loop.

        The line starts at PFILE->buffer->next_line.  On return buffer->cur
        and buffer->line_base point at its start, the cleaned line ends in
        a '\n' written at D, the line notes recorded while cleaning are
        followed by a sentinel note, and buffer->next_line points just past
        the source characters that were consumed.  */
 820 void
 821 _cpp_clean_line (cpp_reader *pfile)
 822 {
 823   cpp_buffer *buffer;
       /* S is the read position in the raw text; D is the write position
          of the cleaned text, which is produced in place.  */
 824   const uchar *s;
 825   uchar c, *d, *p;
 826
 827   buffer = pfile->buffer;
 828   buffer->cur_note = buffer->notes_used = 0;
 829   buffer->cur = buffer->line_base = buffer->next_line;
 830   buffer->need_line = false;
 831   s = buffer->next_line;
 832
 833   if (!buffer->from_stage3)
 834     {
           /* Location of the most recent backslash seen on the fast path,
              or NULL if there was none.  */
 835       const uchar *pbackslash = NULL;
 836
 837       /* Fast path.  This is the common case of an un-escaped line with
 838          no trigraphs.  The primary win here is by not writing any
 839          data back to memory until we have to.  */
 840       while (1)
 841         {
 842           /* Perform an optimized search for \n, \r, \\, ?.  */
 843           s = search_line_fast (s, buffer->rlimit);
 844
 845           c = *s;
 846           if (c == '\\')
 847             {
 848               /* Record the location of the backslash and continue.  */
 849               pbackslash = s++;
 850             }
 851           else if (__builtin_expect (c == '?', 0))
 852             {
 853               if (__builtin_expect (s[1] == '?', false)
 854                    && _cpp_trigraph_map[s[2]])
 855                 {
 856                   /* Have a trigraph.  We may or may not have to convert
 857                      it.  Add a line note regardless, for -Wtrigraphs.  */
 858                   add_line_note (buffer, s, s[2]);
 859                   if (CPP_OPTION (pfile, trigraphs))
 860                     {
 861                       /* We do, and that means we have to switch to the
 862                          slow path.  The replacement character is written
                              over the first '?', and S is left on the last
                              character of the trigraph because the slow
                              path pre-increments before reading.  */
 863                       d = (uchar *) s;
 864                       *d = _cpp_trigraph_map[s[2]];
 865                       s += 2;
 866                       goto slow_path;
 867                     }
 868                 }
 869               /* Not a trigraph.  Continue on fast-path.  */
 870               s++;
 871             }
 872           else
 873             break;
 874         }
 875
 876       /* This must be \r or \n.  We're either done, or we'll be forced
 877          to write back to the buffer and continue on the slow path.  */
 878       d = (uchar *) s;
 879
 880       if (__builtin_expect (s == buffer->rlimit, false))
 881         goto done;
 882
 883       /* DOS line ending? */
 884       if (__builtin_expect (c == '\r', false) && s[1] == '\n')
 885         {
 886           s++;
 887           if (s == buffer->rlimit)
 888             goto done;
 889         }
 890
           /* Without any backslash on the line there is nothing to splice.  */
 891       if (__builtin_expect (pbackslash == NULL, true))
 892         goto done;
 893
 894       /* Check for escaped newline.  Trailing horizontal whitespace is
              skipped backwards from the line end; the newline is escaped
              only if the character in front of it is the last backslash
              recorded above.  */
 895       p = d;
 896       while (is_nvspace (p[-1]))
 897         p--;
 898       if (p - 1 != pbackslash)
 899         goto done;
 900
 901       /* Have an escaped newline; process it and proceed to
 902          the slow path.  The note is ' ' when whitespace separated the
              backslash from the newline and '\\' otherwise.  D is placed
              just before the backslash because the slow path pre-increments
              it, so the next character overwrites the backslash.  */
 903       add_line_note (buffer, p - 1, p != d ? ' ' : '\\');
 904       d = p - 2;
 905       buffer->next_line = p - 1;
 906
 907     slow_path:
           /* Slow path: copy the rest of the line character by character
              from S to D, dropping escaped newlines and, when enabled,
              replacing trigraphs.  */
 908       while (1)
 909         {
 910           c = *++s;
 911           *++d = c;
 912
 913           if (c == '\n' || c == '\r')
 914             {
 915               /* Handle DOS line endings.  */
 916               if (c == '\r' && s != buffer->rlimit && s[1] == '\n')
 917                 s++;
 918               if (s == buffer->rlimit)
 919                 break;
 920
 921               /* Escaped?  The backward scan stops at buffer->next_line,
                      which is set after every splice (and otherwise still
                      holds the start of the line), so a backslash in front
                      of that point is never looked at again.  */
 922               p = d;
 923               while (p != buffer->next_line && is_nvspace (p[-1]))
 924                 p--;
 925               if (p == buffer->next_line || p[-1] != '\\')
 926                 break;
 927
 928               add_line_note (buffer, p - 1, p != d ? ' ': '\\');
 929               d = p - 2;
 930               buffer->next_line = p - 1;
 931             }
 932           else if (c == '?' && s[1] == '?' && _cpp_trigraph_map[s[2]])
 933             {
 934               /* Add a note regardless, for the benefit of -Wtrigraphs.  */
 935               add_line_note (buffer, d, s[2]);
 936               if (CPP_OPTION (pfile, trigraphs))
 937                 {
 938                   *d = _cpp_trigraph_map[s[2]];
 939                   s += 2;
 940                 }
 941             }
 942         }
 943     }
 944   else
 945     {
           /* With from_stage3 set, no trigraph or escaped-newline
              processing is done; only the end of the line is located.  */
 946       while (*s != '\n' && *s != '\r')
 947         s++;
 948       d = (uchar *) s;
 949
 950       /* Handle DOS line endings.  */
 951       if (*s == '\r' && s != buffer->rlimit && s[1] == '\n')
 952         s++;
 953     }
 954
 955  done:
       /* Terminate the cleaned line at D; S is on the last source character
          that was consumed, so the next line starts right after it.  */
 956   *d = '\n';
 957   /* A sentinel note that should never be processed.  */
 958   add_line_note (buffer, d + 1, '\n');
 959   buffer->next_line = s + 1;
 960 }
 961
 962 /* Return true if the trigraph indicated by NOTE should be warned
 963    about in a comment.  */
 964 static bool
 965 warn_in_comment (cpp_reader *pfile, _cpp_line_note *note)
 966 {
 967   const uchar *p;
 968
 969   /* Within comments we don't warn about trigraphs, unless the
 970      trigraph forms an escaped newline, as that may change
 971      behavior.  */
 972   if (note->type != '/')
 973     return false;
 974
 975   /* If -trigraphs, then this was an escaped newline iff the next note
 976      is coincident.  */
 977   if (CPP_OPTION (pfile, trigraphs))
 978     return note[1].pos == note->pos;
 979
 980   /* Otherwise, see if this forms an escaped newline.  */
 981   p = note->pos + 3;
 982   while (is_nvspace (*p))
 983     p++;
 984
 985   /* There might have been escaped newlines between the trigraph and the
 986      newline we found.  Hence the position test.  */
 987   return (*p == '\n' && p < note[1].pos);
 988 }
 989
 990 /* Process the notes created by add_line_note as far as the current
 991    location.  */
 992 void
 993 _cpp_process_line_notes (cpp_reader *pfile, int in_comment)
 994 {
 995   cpp_buffer *buffer = pfile->buffer;
 996
 997   for (;;)
 998     {
 999       _cpp_line_note *note = &buffer->notes[buffer->cur_note];
1000       unsigned int col;
1001
1002       if (note->pos > buffer->cur)
1003         break;
1004
1005       buffer->cur_note++;
1006       col = CPP_BUF_COLUMN (buffer, note->pos + 1);
1007
1008       if (note->type == '\\' || note->type == ' ')
1009         {
1010           if (note->type == ' ' && !in_comment)
1011             cpp_error_with_line (pfile, CPP_DL_WARNING, pfile->line_table->highest_line, col,
1012                                  "backslash and newline separated by space");
1013
1014           if (buffer->next_line > buffer->rlimit)
1015             {
1016               cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line, col,
1017                                    "backslash-newline at end of file");
1018               /* Prevent "no newline at end of file" warning.  */
1019               buffer->next_line = buffer->rlimit;
1020             }
1021
1022           buffer->line_base = note->pos;
1023           CPP_INCREMENT_LINE (pfile, 0);
1024         }
1025       else if (_cpp_trigraph_map[note->type])
1026         {
1027           if (CPP_OPTION (pfile, warn_trigraphs)
1028               && (!in_comment || warn_in_comment (pfile, note)))
1029             {
1030               if (CPP_OPTION (pfile, trigraphs))
1031                 cpp_warning_with_line (pfile, CPP_W_TRIGRAPHS,
1032                                        pfile->line_table->highest_line, col,
1033                                        "trigraph ??%c converted to %c",
1034                                        note->type,
1035                                        (int) _cpp_trigraph_map[note->type]);
1036               else
1037                 {
1038                   cpp_warning_with_line
1039                     (pfile, CPP_W_TRIGRAPHS,
1040                      pfile->line_table->highest_line, col,
1041                      "trigraph ??%c ignored, use -trigraphs to enable",
1042                      note->type);
1043                 }
1044             }
1045         }
1046       else if (note->type == 0)
1047         /* Already processed in lex_raw_string.  */;
1048       else
1049         abort ();
1050     }
1051 }
1052
1053 /* Skip a C-style block comment.  We find the end of the comment by
1054    seeing if an asterisk is before every '/' we encounter.  Returns
1055    nonzero if comment terminated by EOF, zero otherwise.
1056
1057    Buffer->cur points to the initial asterisk of the comment.  */
1058 bool
1059 _cpp_skip_block_comment (cpp_reader *pfile)
1060 {
1061   cpp_buffer *buffer = pfile->buffer;
1062   const uchar *cur = buffer->cur;
1063   uchar c;
1064
1065   cur++;
1066   if (*cur == '/')
1067     cur++;
1068
1069   for (;;)
1070     {
1071       /* People like decorating comments with '*', so check for '/'
1072          instead for efficiency.  */
1073       c = *cur++;
1074
1075       if (c == '/')
1076         {
1077           if (cur[-2] == '*')
1078             break;
1079
1080           /* Warn about potential nested comments, but not if the '/'
1081              comes immediately before the true comment delimiter.
1082              Don't bother to get it right across escaped newlines.  */
1083           if (CPP_OPTION (pfile, warn_comments)
1084               && cur[0] == '*' && cur[1] != '/')
1085             {
1086               buffer->cur = cur;
1087               cpp_warning_with_line (pfile, CPP_W_COMMENTS,
1088                                      pfile->line_table->highest_line,
1089                                      CPP_BUF_COL (buffer),
1090                                      "\"/*\" within comment");
1091             }
1092         }
1093       else if (c == '\n')
1094         {
1095           unsigned int cols;
1096           buffer->cur = cur - 1;
1097           _cpp_process_line_notes (pfile, true);
1098           if (buffer->next_line >= buffer->rlimit)
1099             return true;
1100           _cpp_clean_line (pfile);
1101
1102           cols = buffer->next_line - buffer->line_base;
1103           CPP_INCREMENT_LINE (pfile, cols);
1104
1105           cur = buffer->cur;
1106         }
1107     }
1108
1109   buffer->cur = cur;
1110   _cpp_process_line_notes (pfile, true);
1111   return false;
1112 }
1113
1114 /* Skip a C++ line comment, leaving buffer->cur pointing to the
1115    terminating newline.  Handles escaped newlines.  Returns nonzero
1116    if a multiline comment.  */
1117 static int
1118 skip_line_comment (cpp_reader *pfile)
1119 {
1120   cpp_buffer *buffer = pfile->buffer;
1121   source_location orig_line = pfile->line_table->highest_line;
1122
1123   while (*buffer->cur != '\n')
1124     buffer->cur++;
1125
1126   _cpp_process_line_notes (pfile, true);
1127   return orig_line != pfile->line_table->highest_line;
1128 }
1129
1130 /* Skips whitespace, saving the next non-whitespace character.  */
1131 static void
1132 skip_whitespace (cpp_reader *pfile, cppchar_t c)
1133 {
1134   cpp_buffer *buffer = pfile->buffer;
1135   bool saw_NUL = false;
1136
1137   do
1138     {
1139       /* Horizontal space always OK.  */
1140       if (c == ' ' || c == '\t')
1141         ;
1142       /* Just \f \v or \0 left.  */
1143       else if (c == '\0')
1144         saw_NUL = true;
1145       else if (pfile->state.in_directive && CPP_PEDANTIC (pfile))
1146         cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line,
1147                              CPP_BUF_COL (buffer),
1148                              "%s in preprocessing directive",
1149                              c == '\f' ? "form feed" : "vertical tab");
1150
1151       c = *buffer->cur++;
1152     }
1153   /* We only want non-vertical space, i.e. ' ' \t \f \v \0.  */
1154   while (is_nvspace (c));
1155
1156   if (saw_NUL)
1157     cpp_error (pfile, CPP_DL_WARNING, "null character(s) ignored");
1158
1159   buffer->cur--;
1160 }
1161
1162 /* See if the characters of a number token are valid in a name (no
1163    '.', '+' or '-').  */
1164 static int
1165 name_p (cpp_reader *pfile, const cpp_string *string)
1166 {
1167   unsigned int i;
1168
1169   for (i = 0; i < string->len; i++)
1170     if (!is_idchar (string->text[i]))
1171       return 0;
1172
1173   return 1;
1174 }
1175
1176 /* After parsing an identifier or other sequence, produce a warning about
1177    sequences not in NFC/NFKC.  */
1178 static void
1179 warn_about_normalization (cpp_reader *pfile,
1180                           const cpp_token *token,
1181                           const struct normalize_state *s)
1182 {
1183   if (CPP_OPTION (pfile, warn_normalize) < NORMALIZE_STATE_RESULT (s)
1184       && !pfile->state.skipping)
1185     {
1186       /* Make sure that the token is printed using UCNs, even
1187          if we'd otherwise happily print UTF-8.  */
1188       unsigned char *buf = XNEWVEC (unsigned char, cpp_token_len (token));
1189       size_t sz;
1190
1191       sz = cpp_spell_token (pfile, token, buf, false) - buf;
1192       if (NORMALIZE_STATE_RESULT (s) == normalized_C)
1193         cpp_warning_with_line (pfile, CPP_W_NORMALIZE, token->src_loc, 0,
1194                                "`%.*s' is not in NFKC", (int) sz, buf);
1195       else
1196         cpp_warning_with_line (pfile, CPP_W_NORMALIZE, token->src_loc, 0,
1197                                "`%.*s' is not in NFC", (int) sz, buf);
1198       free (buf);
1199     }
1200 }
1201
1202 /* Returns TRUE if the sequence starting at buffer->cur is invalid in
1203    an identifier.  FIRST is TRUE if this starts an identifier.  */
1204 static bool
1205 forms_identifier_p (cpp_reader *pfile, int first,
1206                     struct normalize_state *state)
1207 {
1208   cpp_buffer *buffer = pfile->buffer;
1209
1210   if (*buffer->cur == '$')
1211     {
1212       if (!CPP_OPTION (pfile, dollars_in_ident))
1213         return false;
1214
1215       buffer->cur++;
1216       if (CPP_OPTION (pfile, warn_dollars) && !pfile->state.skipping)
1217         {
1218           CPP_OPTION (pfile, warn_dollars) = 0;
1219           cpp_error (pfile, CPP_DL_PEDWARN, "'$' in identifier or number");
1220         }
1221
1222       return true;
1223     }
1224
1225   /* Is this a syntactically valid UCN?  */
1226   if (CPP_OPTION (pfile, extended_identifiers)
1227       && *buffer->cur == '\\'
1228       && (buffer->cur[1] == 'u' || buffer->cur[1] == 'U'))
1229     {
1230       buffer->cur += 2;
1231       if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
1232                           state))
1233         return true;
1234       buffer->cur -= 2;
1235     }
1236
1237   return false;
1238 }
1239
1240 /* Helper function to get the cpp_hashnode of the identifier BASE.  */
1241 static cpp_hashnode *
1242 lex_identifier_intern (cpp_reader *pfile, const uchar *base)
1243 {
1244   cpp_hashnode *result;
1245   const uchar *cur;
1246   unsigned int len;
1247   unsigned int hash = HT_HASHSTEP (0, *base);
1248
1249   cur = base + 1;
1250   while (ISIDNUM (*cur))
1251     {
1252       hash = HT_HASHSTEP (hash, *cur);
1253       cur++;
1254     }
1255   len = cur - base;
1256   hash = HT_HASHFINISH (hash, len);
1257   result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
1258                                               base, len, hash, HT_ALLOC));
1259
1260   /* Rarely, identifiers require diagnostics when lexed.  */
1261   if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
1262                         && !pfile->state.skipping, 0))
1263     {
1264       /* It is allowed to poison the same identifier twice.  */
1265       if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
1266         cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
1267                    NODE_NAME (result));
1268
1269       /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
1270          replacement list of a variadic macro.  */
1271       if (result == pfile->spec_nodes.n__VA_ARGS__
1272           && !pfile->state.va_args_ok)
1273         cpp_error (pfile, CPP_DL_PEDWARN,
1274                    "__VA_ARGS__ can only appear in the expansion"
1275                    " of a C99 variadic macro");
1276
1277       /* For -Wc++-compat, warn about use of C++ named operators.  */
1278       if (result->flags & NODE_WARN_OPERATOR)
1279         cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
1280                      "identifier \"%s\" is a special operator name in C++",
1281                      NODE_NAME (result));
1282     }
1283
1284   return result;
1285 }
1286
1287 /* Get the cpp_hashnode of an identifier specified by NAME in
1288    the current cpp_reader object.  If none is found, NULL is returned.  */
1289 cpp_hashnode *
1290 _cpp_lex_identifier (cpp_reader *pfile, const char *name)
1291 {
1292   cpp_hashnode *result;
1293   result = lex_identifier_intern (pfile, (uchar *) name);
1294   return result;
1295 }
1296
1297 /* Lex an identifier starting at BUFFER->CUR - 1.  */
1298 static cpp_hashnode *
1299 lex_identifier (cpp_reader *pfile, const uchar *base, bool starts_ucn,
1300                 struct normalize_state *nst)
1301 {
1302   cpp_hashnode *result;
1303   const uchar *cur;
1304   unsigned int len;
1305   unsigned int hash = HT_HASHSTEP (0, *base);
1306
1307   cur = pfile->buffer->cur;
1308   if (! starts_ucn)
1309     {
1310       while (ISIDNUM (*cur))
1311         {
1312           hash = HT_HASHSTEP (hash, *cur);
1313           cur++;
1314         }
1315       NORMALIZE_STATE_UPDATE_IDNUM (nst, *(cur - 1));
1316     }
1317   pfile->buffer->cur = cur;
1318   if (starts_ucn || forms_identifier_p (pfile, false, nst))
1319     {
1320       /* Slower version for identifiers containing UCNs (or $).  */
1321       do {
1322         while (ISIDNUM (*pfile->buffer->cur))
1323           {
1324             NORMALIZE_STATE_UPDATE_IDNUM (nst, *pfile->buffer->cur);
1325             pfile->buffer->cur++;
1326           }
1327       } while (forms_identifier_p (pfile, false, nst));
1328       result = _cpp_interpret_identifier (pfile, base,
1329                                           pfile->buffer->cur - base);
1330     }
1331   else
1332     {
1333       len = cur - base;
1334       hash = HT_HASHFINISH (hash, len);
1335
1336       result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
1337                                                   base, len, hash, HT_ALLOC));
1338     }
1339
1340   /* Rarely, identifiers require diagnostics when lexed.  */
1341   if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
1342                         && !pfile->state.skipping, 0))
1343     {
1344       /* It is allowed to poison the same identifier twice.  */
1345       if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
1346         cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
1347                    NODE_NAME (result));
1348
1349       /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
1350          replacement list of a variadic macro.  */
1351       if (result == pfile->spec_nodes.n__VA_ARGS__
1352           && !pfile->state.va_args_ok)
1353         cpp_error (pfile, CPP_DL_PEDWARN,
1354                    "__VA_ARGS__ can only appear in the expansion"
1355                    " of a C99 variadic macro");
1356
1357       /* For -Wc++-compat, warn about use of C++ named operators.  */
1358       if (result->flags & NODE_WARN_OPERATOR)
1359         cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
1360                      "identifier \"%s\" is a special operator name in C++",
1361                      NODE_NAME (result));
1362     }
1363
1364   return result;
1365 }
1366
1367 /* Lex a number to NUMBER starting at BUFFER->CUR - 1.  */
1368 static void
1369 lex_number (cpp_reader *pfile, cpp_string *number,
1370             struct normalize_state *nst)
1371 {
1372   const uchar *cur;
1373   const uchar *base;
1374   uchar *dest;
1375
1376   base = pfile->buffer->cur - 1;
1377   do
1378     {
1379       cur = pfile->buffer->cur;
1380
1381       /* N.B. ISIDNUM does not include $.  */
1382       while (ISIDNUM (*cur) || *cur == '.' || DIGIT_SEP (*cur)
1383              || VALID_SIGN (*cur, cur[-1]))
1384         {
1385           NORMALIZE_STATE_UPDATE_IDNUM (nst, *cur);
1386           cur++;
1387         }
1388
1389       pfile->buffer->cur = cur;
1390     }
1391   while (forms_identifier_p (pfile, false, nst));
1392
1393   number->len = cur - base;
1394   dest = _cpp_unaligned_alloc (pfile, number->len + 1);
1395   memcpy (dest, base, number->len);
1396   dest[number->len] = '\0';
1397   number->text = dest;
1398 }
1399
1400 /* Create a token of type TYPE with a literal spelling.  */
1401 static void
1402 create_literal (cpp_reader *pfile, cpp_token *token, const uchar *base,
1403                 unsigned int len, enum cpp_ttype type)
1404 {
1405   uchar *dest = _cpp_unaligned_alloc (pfile, len + 1);
1406
1407   memcpy (dest, base, len);
1408   dest[len] = '\0';
1409   token->type = type;
1410   token->val.str.len = len;
1411   token->val.str.text = dest;
1412 }
1413
1414 /* Subroutine of lex_raw_string: Append LEN chars from BASE to the buffer
1415    sequence from *FIRST_BUFF_P to LAST_BUFF_P.  */
1416
1417 static void
1418 bufring_append (cpp_reader *pfile, const uchar *base, size_t len,
1419                 _cpp_buff **first_buff_p, _cpp_buff **last_buff_p)
1420 {
1421   _cpp_buff *first_buff = *first_buff_p;
1422   _cpp_buff *last_buff = *last_buff_p;
1423
1424   if (first_buff == NULL)
1425     first_buff = last_buff = _cpp_get_buff (pfile, len);
1426   else if (len > BUFF_ROOM (last_buff))
1427     {
1428       size_t room = BUFF_ROOM (last_buff);
1429       memcpy (BUFF_FRONT (last_buff), base, room);
1430       BUFF_FRONT (last_buff) += room;
1431       base += room;
1432       len -= room;
1433       last_buff = _cpp_append_extend_buff (pfile, last_buff, len);
1434     }
1435
1436   memcpy (BUFF_FRONT (last_buff), base, len);
1437   BUFF_FRONT (last_buff) += len;
1438
1439   *first_buff_p = first_buff;
1440   *last_buff_p = last_buff;
1441 }
1442
1443
1444 /* Returns true if a macro has been defined.
1445    This might not work if compile with -save-temps,
1446    or preprocess separately from compilation.  */
1447
1448 static bool
1449 is_macro(cpp_reader *pfile, const uchar *base)
1450 {
1451   const uchar *cur = base;
1452   if (! ISIDST (*cur))
1453     return false;
1454   unsigned int hash = HT_HASHSTEP (0, *cur);
1455   ++cur;
1456   while (ISIDNUM (*cur))
1457     {
1458       hash = HT_HASHSTEP (hash, *cur);
1459       ++cur;
1460     }
1461   hash = HT_HASHFINISH (hash, cur - base);
1462
1463   cpp_hashnode *result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
1464                                         base, cur - base, hash, HT_NO_INSERT));
1465
1466   return !result ? false : (result->type == NT_MACRO);
1467 }
1468
1469
1470 /* Lexes a raw string.  The stored string contains the spelling, including
1471    double quotes, delimiter string, '(' and ')', any leading
1472    'L', 'u', 'U' or 'u8' and 'R' modifier.  The function returns nothing;
1473    the type of the literal is stored in TOKEN.
1474
1475    The spelling is NUL-terminated, but it is not guaranteed that this
1476    is the first NUL since embedded NULs are preserved.

        BASE is the start of the literal's spelling; its first characters
        select the string type.  CUR is advanced by one before the delimiter
        is read, so it presumably points at the opening double quote on
        entry (confirm against the caller).

        Outcomes other than a complete literal:
        - invalid delimiter: an error is issued, buffer->cur is rewound, and
          TOKEN becomes CPP_OTHER spelling only the characters before the
          one CUR pointed at on entry;
        - newline inside a directive, or while parsing macro arguments with
          no further line in the buffer: an "unterminated raw string" error
          is issued and TOKEN becomes CPP_OTHER;
        - no fresh line available: TOKEN becomes CPP_EOF and the same error
          is issued.  */
1477
1478 static void
1479 lex_raw_string (cpp_reader *pfile, cpp_token *token, const uchar *base,
1480                 const uchar *cur)
1481 {
       /* raw_prefix holds the delimiter (at most 16 characters).  Once the
          '(' is reached a '"' is stored in its place, so that
          raw_prefix[0 .. raw_prefix_len] is exactly what must follow a ')'
          to close the literal.  */
1482   uchar raw_prefix[17];
       /* temp_buffer is 18 bytes because BUF_APPEND may add up to two bytes
          while temp_buffer_len is still below 17.  */
1483   uchar temp_buffer[18];
1484   const uchar *orig_base;
1485   unsigned int raw_prefix_len = 0, raw_suffix_len = 0;
1486   enum raw_str_phase { RAW_STR_PREFIX, RAW_STR, RAW_STR_SUFFIX };
1487   raw_str_phase phase = RAW_STR_PREFIX;
1488   enum cpp_ttype type;
1489   size_t total_len = 0;
1490   /* Index into temp_buffer during phases other than RAW_STR,
1491      during RAW_STR phase 17 to tell BUF_APPEND that nothing should
1492      be appended to temp_buffer.  */
1493   size_t temp_buffer_len = 0;
1494   _cpp_buff *first_buff = NULL, *last_buff = NULL;
1495   size_t raw_prefix_start;
1496   _cpp_line_note *note = &pfile->buffer->notes[pfile->buffer->cur_note];
1497
1498   type = (*base == 'L' ? CPP_WSTRING :
1499           *base == 'U' ? CPP_STRING32 :
1500           *base == 'u' ? (base[1] == '8' ? CPP_UTF8STRING : CPP_STRING16)
1501           : CPP_STRING);
1502
       /* BUF_APPEND adds LEN bytes at STR to the buffer chain and counts
          them in total_len.  While temp_buffer is active (temp_buffer_len
          below 17), short pieces (LEN <= 2) that are not the pending input
          at BASE are also copied into temp_buffer, so delimiter matching
          sees restored characters.  Note the expansion already ends in a
          semicolon.  */
1503 #define BUF_APPEND(STR,LEN)                                     \
1504       do {                                                      \
1505         bufring_append (pfile, (const uchar *)(STR), (LEN),     \
1506                         &first_buff, &last_buff);               \
1507         total_len += (LEN);                                     \
1508         if (__builtin_expect (temp_buffer_len < 17, 0)          \
1509             && (const uchar *)(STR) != base                     \
1510             && (LEN) <= 2)                                      \
1511           {                                                     \
1512             memcpy (temp_buffer + temp_buffer_len,              \
1513                     (const uchar *)(STR), (LEN));               \
1514             temp_buffer_len += (LEN);                           \
1515           }                                                     \
1516       } while (0);
1517
1518   orig_base = base;
1519   ++cur;
       /* Offset from BASE of the first delimiter character; used below to
          rewind when the delimiter turns out to be invalid.  */
1520   raw_prefix_start = cur - base;
1521   for (;;)
1522     {
1523       cppchar_t c;
1524
1525       /* If we previously performed any trigraph or line splicing
1526          transformations, undo them in between the opening and closing
1527          double quote.  */
1528       while (note->pos < cur)
1529         ++note;
1530       for (; note->pos == cur; ++note)
1531         {
1532           switch (note->type)
1533             {
1534             case '\\':
1535             case ' ':
1536               /* Restore backslash followed by newline.  */
1537               BUF_APPEND (base, cur - base);
1538               base = cur;
1539               BUF_APPEND ("\\", 1);
1540             after_backslash:
1541               if (note->type == ' ')
1542                 {
1543                   /* GNU backslash whitespace newline extension.  FIXME
1544                      could be any sequence of non-vertical space.  When we
1545                      can properly restore any such sequence, we should mark
1546                      this note as handled so _cpp_process_line_notes
1547                      doesn't warn.  */
1548                   BUF_APPEND (" ", 1);
1549                 }
1550
1551               BUF_APPEND ("\n", 1);
1552               break;
1553
1554             case 0:
1555               /* Already handled.  */
1556               break;
1557
1558             default:
1559               if (_cpp_trigraph_map[note->type])
1560                 {
1561                   /* Don't warn about this trigraph in
1562                      _cpp_process_line_notes, since trigraphs show up as
1563                      trigraphs in raw strings.  */
1564                   uchar type = note->type;
1565                   note->type = 0;
1566
1567                   if (!CPP_OPTION (pfile, trigraphs))
1568                     /* If we didn't convert the trigraph in the first
1569                        place, don't do anything now either.  */
1570                     break;
1571
1572                   BUF_APPEND (base, cur - base);
1573                   base = cur;
1574                   BUF_APPEND ("??", 2);
1575
1576                   /* ??/ followed by newline gets two line notes, one for
1577                      the trigraph and one for the backslash/newline.  */
1578                   if (type == '/' && note[1].pos == cur)
1579                     {
1580                       if (note[1].type != '\\'
1581                           && note[1].type != ' ')
1582                         abort ();
1583                       BUF_APPEND ("/", 1);
1584                       ++note;
1585                       goto after_backslash;
1586                     }
1587                   else
1588                     {
1589                       /* Skip the replacement character.  */
1590                       base = ++cur;
1591                       BUF_APPEND (&type, 1);
                           /* The restored character is examined in place of
                              one read from the input.  */
1592                       c = type;
1593                       goto check_c;
1594                     }
1595                 }
1596               else
1597                 abort ();
1598               break;
1599             }
1600         }
1601       c = *cur++;
1602       if (__builtin_expect (temp_buffer_len < 17, 0))
1603         temp_buffer[temp_buffer_len++] = c;
1604
1605      check_c:
1606       if (phase == RAW_STR_PREFIX)
1607         {
               /* Move newly buffered characters into raw_prefix one at a
                  time, validating each as a delimiter character.  */
1608           while (raw_prefix_len < temp_buffer_len)
1609             {
1610               raw_prefix[raw_prefix_len] = temp_buffer[raw_prefix_len];
1611               switch (raw_prefix[raw_prefix_len])
1612                 {
                     /* These, and anything not listed below, leave the
                        switch without extending the delimiter.  */
1613                 case ' ': case '(': case ')': case '\\': case '\t':
1614                 case '\v': case '\f': case '\n': default:
1615                   break;
1616                 /* Basic source charset except the above chars.  */
1617                 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
1618                 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
1619                 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
1620                 case 's': case 't': case 'u': case 'v': case 'w': case 'x':
1621                 case 'y': case 'z':
1622                 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
1623                 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
1624                 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
1625                 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
1626                 case 'Y': case 'Z':
1627                 case '0': case '1': case '2': case '3': case '4': case '5':
1628                 case '6': case '7': case '8': case '9':
1629                 case '_': case '{': case '}': case '#': case '[': case ']':
1630                 case '<': case '>': case '%': case ':': case ';': case '.':
1631                 case '?': case '*': case '+': case '-': case '/': case '^':
1632                 case '&': case '|': case '~': case '!': case '=': case ',':
1633                 case '"': case '\'':
                       /* A valid delimiter character is accepted only while
                          fewer than 16 have been collected.  */
1634                   if (raw_prefix_len < 16)
1635                     {
1636                       raw_prefix_len++;
1637                       continue;
1638                     }
1639                   break;
1640                 }
1641
                   /* Anything but '(' here makes the delimiter invalid.  */
1642               if (raw_prefix[raw_prefix_len] != '(')
1643                 {
1644                   int col = CPP_BUF_COLUMN (pfile->buffer, cur) + 1;
1645                   if (raw_prefix_len == 16)
1646                     cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
1647                                          col, "raw string delimiter longer "
1648                                               "than 16 characters");
1649                   else if (raw_prefix[raw_prefix_len] == '\n')
1650                     cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
1651                                          col, "invalid new-line in raw "
1652                                               "string delimiter");
1653                   else
1654                     cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
1655                                          col, "invalid character '%c' in "
1656                                               "raw string delimiter",
1657                                          (int) raw_prefix[raw_prefix_len]);
                       /* Rewind so that only the characters before the one
                          CUR pointed at on entry are consumed, and hand them
                          back as a CPP_OTHER token.  */
1658                   pfile->buffer->cur = orig_base + raw_prefix_start - 1;
1659                   create_literal (pfile, token, orig_base,
1660                                   raw_prefix_start - 1, CPP_OTHER);
1661                   if (first_buff)
1662                     _cpp_release_buff (pfile, first_buff);
1663                   return;
1664                 }
                   /* Delimiter complete: append the '"' that must follow it
                      in the closing sequence.  */
1665               raw_prefix[raw_prefix_len] = '"';
1666               phase = RAW_STR;
1667               /* Nothing should be appended to temp_buffer during
1668                  RAW_STR phase.  */
1669               temp_buffer_len = 17;
1670               break;
1671             }
1672           continue;
1673         }
1674       else if (phase == RAW_STR_SUFFIX)
1675         {
               /* Extend the match of the characters buffered since the last
                  ')' against the delimiter plus closing '"'.  */
1676           while (raw_suffix_len <= raw_prefix_len
1677                  && raw_suffix_len < temp_buffer_len
1678                  && temp_buffer[raw_suffix_len] == raw_prefix[raw_suffix_len])
1679             raw_suffix_len++;
               /* Whole closing sequence matched: the literal is complete.  */
1680           if (raw_suffix_len > raw_prefix_len)
1681             break;
               /* Everything buffered so far matches; read more input.  */
1682           if (raw_suffix_len == temp_buffer_len)
1683             continue;
               /* Mismatch: back to the body, and fall through so that the
                  current character is still checked for ')' and newline.  */
1684           phase = RAW_STR;
1685           /* Nothing should be appended to temp_buffer during
1686              RAW_STR phase.  */
1687           temp_buffer_len = 17;
1688         }
1689       if (c == ')')
1690         {
               /* Possible start of the closing sequence: restart matching
                  with an empty temp_buffer.  */
1691           phase = RAW_STR_SUFFIX;
1692           raw_suffix_len = 0;
1693           temp_buffer_len = 0;
1694         }
1695       else if (c == '\n')
1696         {
               /* The literal cannot continue onto another line inside a
                  directive, nor while parsing macro arguments when the
                  buffer has no further line.  */
1697           if (pfile->state.in_directive
1698               || (pfile->state.parsing_args
1699                   && pfile->buffer->next_line >= pfile->buffer->rlimit))
1700             {
1701               cur--;
1702               type = CPP_OTHER;
1703               cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, 0,
1704                                    "unterminated raw string");
1705               break;
1706             }
1707
               /* Save this line's text before fetching the next line.  */
1708           BUF_APPEND (base, cur - base);
1709
1710           if (pfile->buffer->cur < pfile->buffer->rlimit)
1711             CPP_INCREMENT_LINE (pfile, 0);
1712           pfile->buffer->need_line = true;
1713
1714           pfile->buffer->cur = cur-1;
1715           _cpp_process_line_notes (pfile, false);
1716           if (!_cpp_get_fresh_line (pfile))
1717             {
1718               source_location src_loc = token->src_loc;
1719               token->type = CPP_EOF;
1720               /* Tell the compiler the line number of the EOF token.  */
1721               token->src_loc = pfile->line_table->highest_line;
1722               token->flags = BOL;
1723               if (first_buff != NULL)
1724                 _cpp_release_buff (pfile, first_buff);
1725               cpp_error_with_line (pfile, CPP_DL_ERROR, src_loc, 0,
1726                                    "unterminated raw string");
1727               return;
1728             }
1729
               /* Continue from the start of the fresh line, with the note
                  cursor re-read from the buffer.  */
1730           cur = base = pfile->buffer->cur;
1731           note = &pfile->buffer->notes[pfile->buffer->cur_note];
1732         }
1733     }
1734
1735   if (CPP_OPTION (pfile, user_literals))
1736     {
1737       /* If a string format macro, say from inttypes.h, is placed touching
1738          a string literal it could be parsed as a C++11 user-defined string
1739          literal thus breaking the program.
1740          Try to identify macros with is_macro. A warning is issued. */
1741       if (is_macro (pfile, cur))
1742         {
1743           /* Raise a warning, but do not consume subsequent tokens.  */
1744           if (CPP_OPTION (pfile, warn_literal_suffix) && !pfile->state.skipping)
1745             cpp_warning_with_line (pfile, CPP_W_LITERAL_SUFFIX,
1746                                    token->src_loc, 0,
1747                                    "invalid suffix on literal; C++11 requires "
1748                                    "a space between literal and string macro");
1749         }
1750       /* Grab user defined literal suffix.  */
1751       else if (ISIDST (*cur))
1752         {
1753           type = cpp_userdef_string_add_type (type);
1754           ++cur;
1755
1756           while (ISIDNUM (*cur))
1757             ++cur;
1758         }
1759     }
1760
1761   pfile->buffer->cur = cur;
       /* Nothing was buffered: the whole spelling is the text from BASE
          to CUR.  */
1762   if (first_buff == NULL)
1763     create_literal (pfile, token, base, cur - base, type);
1764   else
1765     {
           /* Concatenate the buffered pieces, then the not yet buffered
              text from BASE to CUR, and NUL-terminate the result.  */
1766       uchar *dest = _cpp_unaligned_alloc (pfile, total_len + (cur - base) + 1);
1767
1768       token->type = type;
1769       token->val.str.len = total_len + (cur - base);
1770       token->val.str.text = dest;
1771       last_buff = first_buff;
1772       while (last_buff != NULL)
1773         {
1774           memcpy (dest, last_buff->base,
1775                   BUFF_FRONT (last_buff) - last_buff->base);
1776           dest += BUFF_FRONT (last_buff) - last_buff->base;
1777           last_buff = last_buff->next;
1778         }
1779       _cpp_release_buff (pfile, first_buff);
1780       memcpy (dest, base, cur - base);
1781       dest[cur - base] = '\0';
1782     }
1783 }
1784
1785 /* Lexes a string, character constant, or angle-bracketed header file
1786    name.  The stored string contains the spelling, including opening
1787    quote and any leading 'L', 'u', 'U' or 'u8' and optional
1788    'R' modifier.  It returns the type of the literal, or CPP_OTHER
1789    if it was not properly terminated, or CPP_LESS for an unterminated
1790    header name which must be relexed as normal tokens.
1791
1792    The spelling is NUL-terminated, but it is not guaranteed that this
1793    is the first NUL since embedded NULs are preserved.  */
1794 static void
1795 lex_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
1796 {
1797   bool saw_NUL = false;
1798   const uchar *cur;
1799   cppchar_t terminator;
1800   enum cpp_ttype type;
1801
1802   cur = base;
1803   terminator = *cur++;
1804   if (terminator == 'L' || terminator == 'U')
1805     terminator = *cur++;
1806   else if (terminator == 'u')
1807     {
1808       terminator = *cur++;
1809       if (terminator == '8')
1810         terminator = *cur++;
1811     }
1812   if (terminator == 'R')
1813     {
1814       lex_raw_string (pfile, token, base, cur);
1815       return;
1816     }
1817   if (terminator == '"')
1818     type = (*base == 'L' ? CPP_WSTRING :
1819             *base == 'U' ? CPP_STRING32 :
1820             *base == 'u' ? (base[1] == '8' ? CPP_UTF8STRING : CPP_STRING16)
1821                          : CPP_STRING);
1822   else if (terminator == '\'')
1823     type = (*base == 'L' ? CPP_WCHAR :
1824             *base == 'U' ? CPP_CHAR32 :
1825             *base == 'u' ? CPP_CHAR16 : CPP_CHAR);
1826   else
1827     terminator = '>', type = CPP_HEADER_NAME;
1828
1829   for (;;)
1830     {
1831       cppchar_t c = *cur++;
1832
1833       /* In #include-style directives, terminators are not escapable.  */
1834       if (c == '\\' && !pfile->state.angled_headers && *cur != '\n')
1835         cur++;
1836       else if (c == terminator)
1837         break;
1838       else if (c == '\n')
1839         {
1840           cur--;
1841           /* Unmatched quotes always yield undefined behavior, but
1842              greedy lexing means that what appears to be an unterminated
1843              header name may actually be a legitimate sequence of tokens.  */
1844           if (terminator == '>')
1845             {
1846               token->type = CPP_LESS;
1847               return;
1848             }
1849           type = CPP_OTHER;
1850           break;
1851         }
1852       else if (c == '\0')
1853         saw_NUL = true;
1854     }
1855
1856   if (saw_NUL && !pfile->state.skipping)
1857     cpp_error (pfile, CPP_DL_WARNING,
1858                "null character(s) preserved in literal");
1859
1860   if (type == CPP_OTHER && CPP_OPTION (pfile, lang) != CLK_ASM)
1861     cpp_error (pfile, CPP_DL_PEDWARN, "missing terminating %c character",
1862                (int) terminator);
1863
1864   if (CPP_OPTION (pfile, user_literals))
1865     {
1866       /* If a string format macro, say from inttypes.h, is placed touching
1867          a string literal it could be parsed as a C++11 user-defined string
1868          literal thus breaking the program.
1869          Try to identify macros with is_macro. A warning is issued. */
1870       if (is_macro (pfile, cur))
1871         {
1872           /* Raise a warning, but do not consume subsequent tokens.  */
1873           if (CPP_OPTION (pfile, warn_literal_suffix) && !pfile->state.skipping)
1874             cpp_warning_with_line (pfile, CPP_W_LITERAL_SUFFIX,
1875                                    token->src_loc, 0,
1876                                    "invalid suffix on literal; C++11 requires "
1877                                    "a space between literal and string macro");
1878         }
1879       /* Grab user defined literal suffix.  */
1880       else if (ISIDST (*cur))
1881         {
1882           type = cpp_userdef_char_add_type (type);
1883           type = cpp_userdef_string_add_type (type);
1884           ++cur;
1885
1886           while (ISIDNUM (*cur))
1887             ++cur;
1888         }
1889     }
1890
1891   pfile->buffer->cur = cur;
1892   create_literal (pfile, token, base, cur - base, type);
1893 }
1894
1895 /* Return the comment table. The client may not make any assumption
1896    about the ordering of the table.  */
1897 cpp_comment_table *
1898 cpp_get_comments (cpp_reader *pfile)
1899 {
1900   return &pfile->comments;
1901 }
1902
1903 /* Append a comment to the end of the comment table. */
1904 static void
1905 store_comment (cpp_reader *pfile, cpp_token *token)
1906 {
1907   int len;
1908
1909   if (pfile->comments.allocated == 0)
1910     {
1911       pfile->comments.allocated = 256;
1912       pfile->comments.entries = (cpp_comment *) xmalloc
1913         (pfile->comments.allocated * sizeof (cpp_comment));
1914     }
1915
1916   if (pfile->comments.count == pfile->comments.allocated)
1917     {
1918       pfile->comments.allocated *= 2;
1919       pfile->comments.entries = (cpp_comment *) xrealloc
1920         (pfile->comments.entries,
1921          pfile->comments.allocated * sizeof (cpp_comment));
1922     }
1923
1924   len = token->val.str.len;
1925
1926   /* Copy comment. Note, token may not be NULL terminated. */
1927   pfile->comments.entries[pfile->comments.count].comment =
1928     (char *) xmalloc (sizeof (char) * (len + 1));
1929   memcpy (pfile->comments.entries[pfile->comments.count].comment,
1930           token->val.str.text, len);
1931   pfile->comments.entries[pfile->comments.count].comment[len] = '\0';
1932
1933   /* Set source location. */
1934   pfile->comments.entries[pfile->comments.count].sloc = token->src_loc;
1935
1936   /* Increment the count of entries in the comment table. */
1937   pfile->comments.count++;
1938 }
1939
1940 /* The stored comment includes the comment start and any terminator.  */
1941 static void
1942 save_comment (cpp_reader *pfile, cpp_token *token, const unsigned char *from,
1943               cppchar_t type)
1944 {
1945   unsigned char *buffer;
1946   unsigned int len, clen, i;
1947
1948   len = pfile->buffer->cur - from + 1; /* + 1 for the initial '/'.  */
1949
1950   /* C++ comments probably (not definitely) have moved past a new
1951      line, which we don't want to save in the comment.  */
1952   if (is_vspace (pfile->buffer->cur[-1]))
1953     len--;
1954
1955   /* If we are currently in a directive or in argument parsing, then
1956      we need to store all C++ comments as C comments internally, and
1957      so we need to allocate a little extra space in that case.
1958
1959      Note that the only time we encounter a directive here is
1960      when we are saving comments in a "#define".  */
1961   clen = ((pfile->state.in_directive || pfile->state.parsing_args)
1962           && type == '/') ? len + 2 : len;
1963
1964   buffer = _cpp_unaligned_alloc (pfile, clen);
1965
1966   token->type = CPP_COMMENT;
1967   token->val.str.len = clen;
1968   token->val.str.text = buffer;
1969
1970   buffer[0] = '/';
1971   memcpy (buffer + 1, from, len - 1);
1972
1973   /* Finish conversion to a C comment, if necessary.  */
1974   if ((pfile->state.in_directive || pfile->state.parsing_args) && type == '/')
1975     {
1976       buffer[1] = '*';
1977       buffer[clen - 2] = '*';
1978       buffer[clen - 1] = '/';
1979       /* As there can be in a C++ comments illegal sequences for C comments
1980          we need to filter them out.  */
1981       for (i = 2; i < (clen - 2); i++)
1982         if (buffer[i] == '/' && (buffer[i - 1] == '*' || buffer[i + 1] == '*'))
1983           buffer[i] = '|';
1984     }
1985
1986   /* Finally store this comment for use by clients of libcpp. */
1987   store_comment (pfile, token);
1988 }
1989
1990 /* Allocate COUNT tokens for RUN.  */
1991 void
1992 _cpp_init_tokenrun (tokenrun *run, unsigned int count)
1993 {
1994   run->base = XNEWVEC (cpp_token, count);
1995   run->limit = run->base + count;
1996   run->next = NULL;
1997 }
1998
1999 /* Returns the next tokenrun, or creates one if there is none.  */
2000 static tokenrun *
2001 next_tokenrun (tokenrun *run)
2002 {
2003   if (run->next == NULL)
2004     {
2005       run->next = XNEW (tokenrun);
2006       run->next->prev = run;
2007       _cpp_init_tokenrun (run->next, 250);
2008     }
2009
2010   return run->next;
2011 }
2012
2013 /* Return the number of not yet processed token in a given
2014    context.  */
2015 int
2016 _cpp_remaining_tokens_num_in_context (cpp_context *context)
2017 {
2018   if (context->tokens_kind == TOKENS_KIND_DIRECT)
2019     return (LAST (context).token - FIRST (context).token);
2020   else if (context->tokens_kind == TOKENS_KIND_INDIRECT
2021            || context->tokens_kind == TOKENS_KIND_EXTENDED)
2022     return (LAST (context).ptoken - FIRST (context).ptoken);
2023   else
2024       abort ();
2025 }
2026
2027 /* Returns the token present at index INDEX in a given context.  If
2028    INDEX is zero, the next token to be processed is returned.  */
2029 static const cpp_token*
2030 _cpp_token_from_context_at (cpp_context *context, int index)
2031 {
2032   if (context->tokens_kind == TOKENS_KIND_DIRECT)
2033     return &(FIRST (context).token[index]);
2034   else if (context->tokens_kind == TOKENS_KIND_INDIRECT
2035            || context->tokens_kind == TOKENS_KIND_EXTENDED)
2036     return FIRST (context).ptoken[index];
2037  else
2038    abort ();
2039 }
2040
2041 /* Look ahead in the input stream.  */
2042 const cpp_token *
2043 cpp_peek_token (cpp_reader *pfile, int index)
2044 {
2045   cpp_context *context = pfile->context;
2046   const cpp_token *peektok;
2047   int count;
2048
2049   /* First, scan through any pending cpp_context objects.  */
2050   while (context->prev)
2051     {
2052       ptrdiff_t sz = _cpp_remaining_tokens_num_in_context (context);
2053
2054       if (index < (int) sz)
2055         return _cpp_token_from_context_at (context, index);
2056       index -= (int) sz;
2057       context = context->prev;
2058     }
2059
2060   /* We will have to read some new tokens after all (and do so
2061      without invalidating preceding tokens).  */
2062   count = index;
2063   pfile->keep_tokens++;
2064
2065   do
2066     {
2067       peektok = _cpp_lex_token (pfile);
2068       if (peektok->type == CPP_EOF)
2069         return peektok;
2070     }
2071   while (index--);
2072
2073   _cpp_backup_tokens_direct (pfile, count + 1);
2074   pfile->keep_tokens--;
2075
2076   return peektok;
2077 }
2078
2079 /* Allocate a single token that is invalidated at the same time as the
2080    rest of the tokens on the line.  Has its line and col set to the
2081    same as the last lexed token, so that diagnostics appear in the
2082    right place.  */
2083 cpp_token *
2084 _cpp_temp_token (cpp_reader *pfile)
2085 {
2086   cpp_token *old, *result;
2087   ptrdiff_t sz = pfile->cur_run->limit - pfile->cur_token;
2088   ptrdiff_t la = (ptrdiff_t) pfile->lookaheads;
2089
2090   old = pfile->cur_token - 1;
2091   /* Any pre-existing lookaheads must not be clobbered.  */
2092   if (la)
2093     {
2094       if (sz <= la)
2095         {
2096           tokenrun *next = next_tokenrun (pfile->cur_run);
2097
2098           if (sz < la)
2099             memmove (next->base + 1, next->base,
2100                      (la - sz) * sizeof (cpp_token));
2101
2102           next->base[0] = pfile->cur_run->limit[-1];
2103         }
2104
2105       if (sz > 1)
2106         memmove (pfile->cur_token + 1, pfile->cur_token,
2107                  MIN (la, sz - 1) * sizeof (cpp_token));
2108     }
2109
2110   if (!sz && pfile->cur_token == pfile->cur_run->limit)
2111     {
2112       pfile->cur_run = next_tokenrun (pfile->cur_run);
2113       pfile->cur_token = pfile->cur_run->base;
2114     }
2115
2116   result = pfile->cur_token++;
2117   result->src_loc = old->src_loc;
2118   return result;
2119 }
2120
2121 /* Lex a token into RESULT (external interface).  Takes care of issues
2122    like directive handling, token lookahead, multiple include
2123    optimization and skipping.  */
2124 const cpp_token *
2125 _cpp_lex_token (cpp_reader *pfile)
2126 {
2127   cpp_token *result;
2128
2129   for (;;)
2130     {
2131       if (pfile->cur_token == pfile->cur_run->limit)
2132         {
2133           pfile->cur_run = next_tokenrun (pfile->cur_run);
2134           pfile->cur_token = pfile->cur_run->base;
2135         }
2136       /* We assume that the current token is somewhere in the current
2137          run.  */
2138       if (pfile->cur_token < pfile->cur_run->base
2139           || pfile->cur_token >= pfile->cur_run->limit)
2140         abort ();
2141
2142       if (pfile->lookaheads)
2143         {
2144           pfile->lookaheads--;
2145           result = pfile->cur_token++;
2146         }
2147       else
2148         result = _cpp_lex_direct (pfile);
2149
2150       if (result->flags & BOL)
2151         {
2152           /* Is this a directive.  If _cpp_handle_directive returns
2153              false, it is an assembler #.  */
2154           if (result->type == CPP_HASH
2155               /* 6.10.3 p 11: Directives in a list of macro arguments
2156                  gives undefined behavior.  This implementation
2157                  handles the directive as normal.  */
2158               && pfile->state.parsing_args != 1)
2159             {
2160               if (_cpp_handle_directive (pfile, result->flags & PREV_WHITE))
2161                 {
2162                   if (pfile->directive_result.type == CPP_PADDING)
2163                     continue;
2164                   result = &pfile->directive_result;
2165                 }
2166             }
2167           else if (pfile->state.in_deferred_pragma)
2168             result = &pfile->directive_result;
2169
2170           if (pfile->cb.line_change && !pfile->state.skipping)
2171             pfile->cb.line_change (pfile, result, pfile->state.parsing_args);
2172         }
2173
2174       /* We don't skip tokens in directives.  */
2175       if (pfile->state.in_directive || pfile->state.in_deferred_pragma)
2176         break;
2177
2178       /* Outside a directive, invalidate controlling macros.  At file
2179          EOF, _cpp_lex_direct takes care of popping the buffer, so we never
2180          get here and MI optimization works.  */
2181       pfile->mi_valid = false;
2182
2183       if (!pfile->state.skipping || result->type == CPP_EOF)
2184         break;
2185     }
2186
2187   return result;
2188 }
2189
2190 /* Returns true if a fresh line has been loaded.  */
2191 bool
2192 _cpp_get_fresh_line (cpp_reader *pfile)
2193 {
2194   int return_at_eof;
2195
2196   /* We can't get a new line until we leave the current directive.  */
2197   if (pfile->state.in_directive)
2198     return false;
2199
2200   for (;;)
2201     {
2202       cpp_buffer *buffer = pfile->buffer;
2203
2204       if (!buffer->need_line)
2205         return true;
2206
2207       if (buffer->next_line < buffer->rlimit)
2208         {
2209           _cpp_clean_line (pfile);
2210           return true;
2211         }
2212
2213       /* First, get out of parsing arguments state.  */
2214       if (pfile->state.parsing_args)
2215         return false;
2216
2217       /* End of buffer.  Non-empty files should end in a newline.  */
2218       if (buffer->buf != buffer->rlimit
2219           && buffer->next_line > buffer->rlimit
2220           && !buffer->from_stage3)
2221         {
2222           /* Clip to buffer size.  */
2223           buffer->next_line = buffer->rlimit;
2224         }
2225
2226       return_at_eof = buffer->return_at_eof;
2227       _cpp_pop_buffer (pfile);
2228       if (pfile->buffer == NULL || return_at_eof)
2229         return false;
2230     }
2231 }
2232
/* Helper for two-character operators.  If the next input character is
   CHAR, consume it and set the token type to THEN_TYPE; otherwise
   leave the input alone and set the type to ELSE_TYPE.  Relies on
   variables named `buffer' and `result' being in scope at the point of
   use.  */
#define IF_NEXT_IS(CHAR, THEN_TYPE, ELSE_TYPE)          \
  do                                                    \
    {                                                   \
      if (*buffer->cur == CHAR)                         \
        {                                               \
          buffer->cur++;                                \
          result->type = THEN_TYPE;                     \
        }                                               \
      else                                              \
        result->type = ELSE_TYPE;                       \
    }                                                   \
  while (0)
2241
/* Lex a token into pfile->cur_token, which is also incremented, to
   get diagnostics pointing to the correct location.

   Does not handle issues such as token lookahead, multiple-include
   optimization, directives, skipping etc.  This function is only
   suitable for use by _cpp_lex_token, and in special cases like
   lex_expansion_token which doesn't care for any of these issues.

   When meeting a newline, returns CPP_EOF if parsing a directive,
   otherwise returns to the start of the token buffer if permissible.
   At the end of a line inside a deferred pragma, returns
   CPP_PRAGMA_EOL instead.

   Returns a pointer to the lexed token; its type, flags, src_loc and
   (where applicable) value have been filled in.  */
cpp_token *
_cpp_lex_direct (cpp_reader *pfile)
{
  cppchar_t c;
  cpp_buffer *buffer;
  const unsigned char *comment_start;
  /* Claim the next slot of the current token run for the result.  */
  cpp_token *result = pfile->cur_token++;

  /* Entered initially, and again after each newline.  */
 fresh_line:
  result->flags = 0;
  buffer = pfile->buffer;
  if (buffer->need_line)
    {
      /* The end of the line terminates a deferred pragma.  */
      if (pfile->state.in_deferred_pragma)
        {
          result->type = CPP_PRAGMA_EOL;
          pfile->state.in_deferred_pragma = false;
          if (!pfile->state.pragma_allow_expansion)
            pfile->state.prevent_expansion--;
          return result;
        }
      /* No further line is available: report end of input.  */
      if (!_cpp_get_fresh_line (pfile))
        {
          result->type = CPP_EOF;
          if (!pfile->state.in_directive)
            {
              /* Tell the compiler the line number of the EOF token.  */
              result->src_loc = pfile->line_table->highest_line;
              result->flags = BOL;
            }
          return result;
        }
      /* Unless earlier tokens have to be kept, start the new line at
         the beginning of the base token run, reusing its storage.  */
      if (!pfile->keep_tokens)
        {
          pfile->cur_run = &pfile->base_run;
          result = pfile->base_run.base;
          pfile->cur_token = result + 1;
        }
      /* This token is the first one on its line.  */
      result->flags = BOL;
      if (pfile->state.parsing_args == 2)
        result->flags |= PREV_WHITE;
    }
  /* _cpp_get_fresh_line may have popped buffers, so reload.  */
  buffer = pfile->buffer;
  /* Re-entered after a comment that is not being saved.  */
 update_tokens_line:
  result->src_loc = pfile->line_table->highest_line;

  /* Re-entered after a run of horizontal whitespace.  */
 skipped_white:
  /* Process any line notes whose position has been reached.  */
  if (buffer->cur >= buffer->notes[buffer->cur_note].pos
      && !pfile->overlaid_buffer)
    {
      _cpp_process_line_notes (pfile, false);
      result->src_loc = pfile->line_table->highest_line;
    }
  c = *buffer->cur++;

  /* A forced location overrides the one computed from the column.  */
  if (pfile->forced_token_location_p)
    result->src_loc = *pfile->forced_token_location_p;
  else
    result->src_loc = linemap_position_for_column (pfile->line_table,
                                          CPP_BUF_COLUMN (buffer, buffer->cur));

  /* Dispatch on the first character of the token.  */
  switch (c)
    {
    case ' ': case '\t': case '\f': case '\v': case '\0':
      result->flags |= PREV_WHITE;
      skip_whitespace (pfile, c);
      goto skipped_white;

    case '\n':
      if (buffer->cur < buffer->rlimit)
        CPP_INCREMENT_LINE (pfile, 0);
      buffer->need_line = true;
      goto fresh_line;

    case '0': case '1': case '2': case '3': case '4':
    case '5': case '6': case '7': case '8': case '9':
      {
        struct normalize_state nst = INITIAL_NORMALIZE_STATE;
        result->type = CPP_NUMBER;
        lex_number (pfile, &result->val.str, &nst);
        warn_about_normalization (pfile, result, &nst);
        break;
      }

    case 'L':
    case 'u':
    case 'U':
    case 'R':
      /* 'L', 'u', 'U', 'u8' or 'R' may introduce wide characters,
         wide strings or raw strings.  */
      if (c == 'L' || CPP_OPTION (pfile, rliterals)
          || (c != 'R' && CPP_OPTION (pfile, uliterals)))
        {
          /* Accept: prefix + quote (a character constant is not
             allowed after 'R'); prefix + R + double quote; u8 + double
             quote; u8 + R + double quote.  The raw forms need the
             rliterals option.  */
          if ((*buffer->cur == '\'' && c != 'R')
              || *buffer->cur == '"'
              || (*buffer->cur == 'R'
                  && c != 'R'
                  && buffer->cur[1] == '"'
                  && CPP_OPTION (pfile, rliterals))
              || (*buffer->cur == '8'
                  && c == 'u'
                  && (buffer->cur[1] == '"'
                      || (buffer->cur[1] == 'R' && buffer->cur[2] == '"'
                          && CPP_OPTION (pfile, rliterals)))))
            {
              lex_string (pfile, result, buffer->cur - 1);
              break;
            }
        }
      /* Not a literal prefix after all: lex it as an identifier.  */
      /* Fall through.  */

    /* The letters missing from the lists below ('u', 'L', 'R', 'U')
       are the cases handled just above.  */
    case '_':
    case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
    case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
    case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
    case 's': case 't':           case 'v': case 'w': case 'x':
    case 'y': case 'z':
    case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
    case 'G': case 'H': case 'I': case 'J': case 'K':
    case 'M': case 'N': case 'O': case 'P': case 'Q':
    case 'S': case 'T':           case 'V': case 'W': case 'X':
    case 'Y': case 'Z':
      result->type = CPP_NAME;
      {
        struct normalize_state nst = INITIAL_NORMALIZE_STATE;
        result->val.node.node = lex_identifier (pfile, buffer->cur - 1, false,
                                                &nst);
        warn_about_normalization (pfile, result, &nst);
      }

      /* Convert named operators to their proper types.  */
      if (result->val.node.node->flags & NODE_OPERATOR)
        {
          result->flags |= NAMED_OP;
          result->type = (enum cpp_ttype) result->val.node.node->directive_index;
        }
      break;

    case '\'':
    case '"':
      lex_string (pfile, result, buffer->cur - 1);
      break;

    case '/':
      /* A potential block or line comment.  */
      comment_start = buffer->cur;
      c = *buffer->cur;

      if (c == '*')
        {
          if (_cpp_skip_block_comment (pfile))
            cpp_error (pfile, CPP_DL_ERROR, "unterminated comment");
        }
      else if (c == '/' && (CPP_OPTION (pfile, cplusplus_comments)
                            || cpp_in_system_header (pfile)))
        {
          /* Warn about comments only if pedantically GNUC89, and not
             in system headers.  */
          if (CPP_OPTION (pfile, lang) == CLK_GNUC89 && CPP_PEDANTIC (pfile)
              && ! buffer->warned_cplusplus_comments)
            {
              cpp_error (pfile, CPP_DL_PEDWARN,
                         "C++ style comments are not allowed in ISO C90");
              cpp_error (pfile, CPP_DL_PEDWARN,
                         "(this will be reported only once per input file)");
              buffer->warned_cplusplus_comments = 1;
            }

          if (skip_line_comment (pfile) && CPP_OPTION (pfile, warn_comments))
            cpp_warning (pfile, CPP_W_COMMENTS, "multi-line comment");
        }
      else if (c == '=')
        {
          buffer->cur++;
          result->type = CPP_DIV_EQ;
          break;
        }
      else
        {
          result->type = CPP_DIV;
          break;
        }

      /* Only the two comment branches reach this point.  A comment
         that is not being saved counts as whitespace, so go on to lex
         the token that follows it.  */
      if (!pfile->state.save_comments)
        {
          result->flags |= PREV_WHITE;
          goto update_tokens_line;
        }

      /* Save the comment as a token in its own right.  */
      save_comment (pfile, result, comment_start, c);
      break;

    case '<':
      /* When header names are expected, try to lex one first.
         lex_string reports an unterminated header name by setting the
         type to CPP_LESS, in which case the '<' is lexed as an
         ordinary operator below.  */
      if (pfile->state.angled_headers)
        {
          lex_string (pfile, result, buffer->cur - 1);
          if (result->type != CPP_LESS)
            break;
        }

      result->type = CPP_LESS;
      if (*buffer->cur == '=')
        buffer->cur++, result->type = CPP_LESS_EQ;
      else if (*buffer->cur == '<')
        {
          buffer->cur++;
          IF_NEXT_IS ('=', CPP_LSHIFT_EQ, CPP_LSHIFT);
        }
      else if (CPP_OPTION (pfile, digraphs))
        {
          if (*buffer->cur == ':')
            {
              /* C++11 [2.5/3 lex.pptoken], "Otherwise, if the next
                 three characters are <:: and the subsequent character
                 is neither : nor >, the < is treated as a preprocessor
                 token by itself".  */
              if (CPP_OPTION (pfile, cplusplus)
                  && CPP_OPTION (pfile, lang) != CLK_CXX98
                  && CPP_OPTION (pfile, lang) != CLK_GNUCXX
                  && buffer->cur[1] == ':'
                  && buffer->cur[2] != ':' && buffer->cur[2] != '>')
                break;

              buffer->cur++;
              result->flags |= DIGRAPH;
              result->type = CPP_OPEN_SQUARE;
            }
          else if (*buffer->cur == '%')
            {
              buffer->cur++;
              result->flags |= DIGRAPH;
              result->type = CPP_OPEN_BRACE;
            }
        }
      break;

    case '>':
      result->type = CPP_GREATER;
      if (*buffer->cur == '=')
        buffer->cur++, result->type = CPP_GREATER_EQ;
      else if (*buffer->cur == '>')
        {
          buffer->cur++;
          IF_NEXT_IS ('=', CPP_RSHIFT_EQ, CPP_RSHIFT);
        }
      break;

    case '%':
      result->type = CPP_MOD;
      if (*buffer->cur == '=')
        buffer->cur++, result->type = CPP_MOD_EQ;
      else if (CPP_OPTION (pfile, digraphs))
        {
          if (*buffer->cur == ':')
            {
              /* "%:" is the digraph for '#', and "%:%:" for "##".  */
              buffer->cur++;
              result->flags |= DIGRAPH;
              result->type = CPP_HASH;
              if (*buffer->cur == '%' && buffer->cur[1] == ':')
                buffer->cur += 2, result->type = CPP_PASTE, result->val.token_no = 0;
            }
          else if (*buffer->cur == '>')
            {
              buffer->cur++;
              result->flags |= DIGRAPH;
              result->type = CPP_CLOSE_BRACE;
            }
        }
      break;

    case '.':
      result->type = CPP_DOT;
      /* A dot followed by a digit starts a number.  */
      if (ISDIGIT (*buffer->cur))
        {
          struct normalize_state nst = INITIAL_NORMALIZE_STATE;
          result->type = CPP_NUMBER;
          lex_number (pfile, &result->val.str, &nst);
          warn_about_normalization (pfile, result, &nst);
        }
      else if (*buffer->cur == '.' && buffer->cur[1] == '.')
        buffer->cur += 2, result->type = CPP_ELLIPSIS;
      else if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
        buffer->cur++, result->type = CPP_DOT_STAR;
      break;

    case '+':
      result->type = CPP_PLUS;
      if (*buffer->cur == '+')
        buffer->cur++, result->type = CPP_PLUS_PLUS;
      else if (*buffer->cur == '=')
        buffer->cur++, result->type = CPP_PLUS_EQ;
      break;

    case '-':
      result->type = CPP_MINUS;
      if (*buffer->cur == '>')
        {
          buffer->cur++;
          result->type = CPP_DEREF;
          if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
            buffer->cur++, result->type = CPP_DEREF_STAR;
        }
      else if (*buffer->cur == '-')
        buffer->cur++, result->type = CPP_MINUS_MINUS;
      else if (*buffer->cur == '=')
        buffer->cur++, result->type = CPP_MINUS_EQ;
      break;

    case '&':
      result->type = CPP_AND;
      if (*buffer->cur == '&')
        buffer->cur++, result->type = CPP_AND_AND;
      else if (*buffer->cur == '=')
        buffer->cur++, result->type = CPP_AND_EQ;
      break;

    case '|':
      result->type = CPP_OR;
      if (*buffer->cur == '|')
        buffer->cur++, result->type = CPP_OR_OR;
      else if (*buffer->cur == '=')
        buffer->cur++, result->type = CPP_OR_EQ;
      break;

    case ':':
      result->type = CPP_COLON;
      if (*buffer->cur == ':' && CPP_OPTION (pfile, cplusplus))
        buffer->cur++, result->type = CPP_SCOPE;
      else if (*buffer->cur == '>' && CPP_OPTION (pfile, digraphs))
        {
          buffer->cur++;
          result->flags |= DIGRAPH;
          result->type = CPP_CLOSE_SQUARE;
        }
      break;

    case '*': IF_NEXT_IS ('=', CPP_MULT_EQ, CPP_MULT); break;
    case '=': IF_NEXT_IS ('=', CPP_EQ_EQ, CPP_EQ); break;
    case '!': IF_NEXT_IS ('=', CPP_NOT_EQ, CPP_NOT); break;
    case '^': IF_NEXT_IS ('=', CPP_XOR_EQ, CPP_XOR); break;
    case '#': IF_NEXT_IS ('#', CPP_PASTE, CPP_HASH); result->val.token_no = 0; break;

    case '?': result->type = CPP_QUERY; break;
    case '~': result->type = CPP_COMPL; break;
    case ',': result->type = CPP_COMMA; break;
    case '(': result->type = CPP_OPEN_PAREN; break;
    case ')': result->type = CPP_CLOSE_PAREN; break;
    case '[': result->type = CPP_OPEN_SQUARE; break;
    case ']': result->type = CPP_CLOSE_SQUARE; break;
    case '{': result->type = CPP_OPEN_BRACE; break;
    case '}': result->type = CPP_CLOSE_BRACE; break;
    case ';': result->type = CPP_SEMICOLON; break;

      /* @ is a punctuator in Objective-C.  */
    case '@': result->type = CPP_ATSIGN; break;

    case '$':
    case '\\':
      {
        /* Step back onto the character and ask forms_identifier_p
           whether an identifier starts here.  */
        const uchar *base = --buffer->cur;
        struct normalize_state nst = INITIAL_NORMALIZE_STATE;

        if (forms_identifier_p (pfile, true, &nst))
          {
            result->type = CPP_NAME;
            result->val.node.node = lex_identifier (pfile, base, true, &nst);
            warn_about_normalization (pfile, result, &nst);
            break;
          }
        /* Not an identifier: step forward again and fall through so
           that the character becomes a CPP_OTHER token.  */
        buffer->cur++;
      }

    default:
      /* Any other character becomes a one-character CPP_OTHER token.  */
      create_literal (pfile, result, buffer->cur - 1, 1, CPP_OTHER);
      break;
    }

  return result;
}
2633
2634 /* An upper bound on the number of bytes needed to spell TOKEN.
2635    Does not include preceding whitespace.  */
2636 unsigned int
2637 cpp_token_len (const cpp_token *token)
2638 {
2639   unsigned int len;
2640
2641   switch (TOKEN_SPELL (token))
2642     {
2643     default:            len = 6;                                break;
2644     case SPELL_LITERAL: len = token->val.str.len;               break;
2645     case SPELL_IDENT:   len = NODE_LEN (token->val.node.node) * 10;     break;
2646     }
2647
2648   return len;
2649 }
2650
/* Decode the UTF-8 sequence that starts at NAME and store the
   equivalent \U escape in BUFFER: a backslash, 'U' and eight
   lower-case hexadecimal digits, with no terminating NUL.  Returns
   the number of bytes of NAME that were consumed; exactly 10 bytes
   of BUFFER are always written.  Aborts if a continuation byte is
   not of the form 10xxxxxx.  */

static size_t
utf8_to_ucn (unsigned char *buffer, const unsigned char *name)
{
  static const char hex_digits[] = "0123456789abcdef";
  unsigned char *out = buffer;
  unsigned long code_point;
  unsigned lead;
  int seq_len = 0;
  int consumed;
  int shift;

  /* The count of leading one bits in the first byte is the length of
     the whole sequence.  */
  lead = *name;
  while (lead & 0x80)
    {
      seq_len++;
      lead <<= 1;
    }

  /* Start with the payload bits of the lead byte, then fold in six
     payload bits from each continuation byte.  */
  code_point = *name & (0x7F >> seq_len);
  for (consumed = 1; consumed < seq_len; consumed++)
    {
      name++;
      code_point = (code_point << 6) | (*name & 0x3F);

      /* Ill-formed UTF-8.  */
      if ((*name & ~0x3F) != 0x80)
        abort ();
    }

  *out++ = '\\';
  *out++ = 'U';

  /* Eight nibbles, most significant first.  */
  for (shift = 28; shift >= 0; shift -= 4)
    *out++ = hex_digits[(code_point >> shift) & 0xF];

  return seq_len;
}
2684
2685 /* Given a token TYPE corresponding to a digraph, return a pointer to
2686    the spelling of the digraph.  */
2687 static const unsigned char *
2688 cpp_digraph2name (enum cpp_ttype type)
2689 {
2690   return digraph_spellings[(int) type - (int) CPP_FIRST_DIGRAPH];
2691 }
2692
2693 /* Write the spelling of a token TOKEN to BUFFER.  The buffer must
2694    already contain the enough space to hold the token's spelling.
2695    Returns a pointer to the character after the last character written.
2696    FORSTRING is true if this is to be the spelling after translation
2697    phase 1 (this is different for UCNs).
2698    FIXME: Would be nice if we didn't need the PFILE argument.  */
2699 unsigned char *
2700 cpp_spell_token (cpp_reader *pfile, const cpp_token *token,
2701                  unsigned char *buffer, bool forstring)
2702 {
2703   switch (TOKEN_SPELL (token))
2704     {
2705     case SPELL_OPERATOR:
2706       {
2707         const unsigned char *spelling;
2708         unsigned char c;
2709
2710         if (token->flags & DIGRAPH)
2711           spelling = cpp_digraph2name (token->type);
2712         else if (token->flags & NAMED_OP)
2713           goto spell_ident;
2714         else
2715           spelling = TOKEN_NAME (token);
2716
2717         while ((c = *spelling++) != '\0')
2718           *buffer++ = c;
2719       }
2720       break;
2721
2722     spell_ident:
2723     case SPELL_IDENT:
2724       if (forstring)
2725         {
2726           memcpy (buffer, NODE_NAME (token->val.node.node),
2727                   NODE_LEN (token->val.node.node));
2728           buffer += NODE_LEN (token->val.node.node);
2729         }
2730       else
2731         {
2732           size_t i;
2733           const unsigned char * name = NODE_NAME (token->val.node.node);
2734
2735           for (i = 0; i < NODE_LEN (token->val.node.node); i++)
2736             if (name[i] & ~0x7F)
2737               {
2738                 i += utf8_to_ucn (buffer, name + i) - 1;
2739                 buffer += 10;
2740               }
2741             else
2742               *buffer++ = NODE_NAME (token->val.node.node)[i];
2743         }
2744       break;
2745
2746     case SPELL_LITERAL:
2747       memcpy (buffer, token->val.str.text, token->val.str.len);
2748       buffer += token->val.str.len;
2749       break;
2750
2751     case SPELL_NONE:
2752       cpp_error (pfile, CPP_DL_ICE,
2753                  "unspellable token %s", TOKEN_NAME (token));
2754       break;
2755     }
2756
2757   return buffer;
2758 }
2759
2760 /* Returns TOKEN spelt as a null-terminated string.  The string is
2761    freed when the reader is destroyed.  Useful for diagnostics.  */
2762 unsigned char *
2763 cpp_token_as_text (cpp_reader *pfile, const cpp_token *token)
2764 {
2765   unsigned int len = cpp_token_len (token) + 1;
2766   unsigned char *start = _cpp_unaligned_alloc (pfile, len), *end;
2767
2768   end = cpp_spell_token (pfile, token, start, false);
2769   end[0] = '\0';
2770
2771   return start;
2772 }
2773
2774 /* Returns a pointer to a string which spells the token defined by
2775    TYPE and FLAGS.  Used by C front ends, which really should move to
2776    using cpp_token_as_text.  */
2777 const char *
2778 cpp_type2name (enum cpp_ttype type, unsigned char flags)
2779 {
2780   if (flags & DIGRAPH)
2781     return (const char *) cpp_digraph2name (type);
2782   else if (flags & NAMED_OP)
2783     return cpp_named_operator2name (type);
2784
2785   return (const char *) token_spellings[type].name;
2786 }
2787
2788 /* Writes the spelling of token to FP, without any preceding space.
2789    Separated from cpp_spell_token for efficiency - to avoid stdio
2790    double-buffering.  */
2791 void
2792 cpp_output_token (const cpp_token *token, FILE *fp)
2793 {
2794   switch (TOKEN_SPELL (token))
2795     {
2796     case SPELL_OPERATOR:
2797       {
2798         const unsigned char *spelling;
2799         int c;
2800
2801         if (token->flags & DIGRAPH)
2802           spelling = cpp_digraph2name (token->type);
2803         else if (token->flags & NAMED_OP)
2804           goto spell_ident;
2805         else
2806           spelling = TOKEN_NAME (token);
2807
2808         c = *spelling;
2809         do
2810           putc (c, fp);
2811         while ((c = *++spelling) != '\0');
2812       }
2813       break;
2814
2815     spell_ident:
2816     case SPELL_IDENT:
2817       {
2818         size_t i;
2819         const unsigned char * name = NODE_NAME (token->val.node.node);
2820
2821         for (i = 0; i < NODE_LEN (token->val.node.node); i++)
2822           if (name[i] & ~0x7F)
2823             {
2824               unsigned char buffer[10];
2825               i += utf8_to_ucn (buffer, name + i) - 1;
2826               fwrite (buffer, 1, 10, fp);
2827             }
2828           else
2829             fputc (NODE_NAME (token->val.node.node)[i], fp);
2830       }
2831       break;
2832
2833     case SPELL_LITERAL:
2834       fwrite (token->val.str.text, 1, token->val.str.len, fp);
2835       break;
2836
2837     case SPELL_NONE:
2838       /* An error, most probably.  */
2839       break;
2840     }
2841 }
2842
2843 /* Compare two tokens.  */
2844 int
2845 _cpp_equiv_tokens (const cpp_token *a, const cpp_token *b)
2846 {
2847   if (a->type == b->type && a->flags == b->flags)
2848     switch (TOKEN_SPELL (a))
2849       {
2850       default:                  /* Keep compiler happy.  */
2851       case SPELL_OPERATOR:
2852         /* token_no is used to track where multiple consecutive ##
2853            tokens were originally located.  */
2854         return (a->type != CPP_PASTE || a->val.token_no == b->val.token_no);
2855       case SPELL_NONE:
2856         return (a->type != CPP_MACRO_ARG
2857                 || a->val.macro_arg.arg_no == b->val.macro_arg.arg_no);
2858       case SPELL_IDENT:
2859         return a->val.node.node == b->val.node.node;
2860       case SPELL_LITERAL:
2861         return (a->val.str.len == b->val.str.len
2862                 && !memcmp (a->val.str.text, b->val.str.text,
2863                             a->val.str.len));
2864       }
2865
2866   return 0;
2867 }
2868
2869 /* Returns nonzero if a space should be inserted to avoid an
2870    accidental token paste for output.  For simplicity, it is
2871    conservative, and occasionally advises a space where one is not
2872    needed, e.g. "." and ".2".  */
2873 int
2874 cpp_avoid_paste (cpp_reader *pfile, const cpp_token *token1,
2875                  const cpp_token *token2)
2876 {
2877   enum cpp_ttype a = token1->type, b = token2->type;
2878   cppchar_t c;
2879
2880   if (token1->flags & NAMED_OP)
2881     a = CPP_NAME;
2882   if (token2->flags & NAMED_OP)
2883     b = CPP_NAME;
2884
2885   c = EOF;
2886   if (token2->flags & DIGRAPH)
2887     c = digraph_spellings[(int) b - (int) CPP_FIRST_DIGRAPH][0];
2888   else if (token_spellings[b].category == SPELL_OPERATOR)
2889     c = token_spellings[b].name[0];
2890
2891   /* Quickly get everything that can paste with an '='.  */
2892   if ((int) a <= (int) CPP_LAST_EQ && c == '=')
2893     return 1;
2894
2895   switch (a)
2896     {
2897     case CPP_GREATER:   return c == '>';
2898     case CPP_LESS:      return c == '<' || c == '%' || c == ':';
2899     case CPP_PLUS:      return c == '+';
2900     case CPP_MINUS:     return c == '-' || c == '>';
2901     case CPP_DIV:       return c == '/' || c == '*'; /* Comments.  */
2902     case CPP_MOD:       return c == ':' || c == '>';
2903     case CPP_AND:       return c == '&';
2904     case CPP_OR:        return c == '|';
2905     case CPP_COLON:     return c == ':' || c == '>';
2906     case CPP_DEREF:     return c == '*';
2907     case CPP_DOT:       return c == '.' || c == '%' || b == CPP_NUMBER;
2908     case CPP_HASH:      return c == '#' || c == '%'; /* Digraph form.  */
2909     case CPP_NAME:      return ((b == CPP_NUMBER
2910                                  && name_p (pfile, &token2->val.str))
2911                                 || b == CPP_NAME
2912                                 || b == CPP_CHAR || b == CPP_STRING); /* L */
2913     case CPP_NUMBER:    return (b == CPP_NUMBER || b == CPP_NAME
2914                                 || c == '.' || c == '+' || c == '-');
2915                                       /* UCNs */
2916     case CPP_OTHER:     return ((token1->val.str.text[0] == '\\'
2917                                  && b == CPP_NAME)
2918                                 || (CPP_OPTION (pfile, objc)
2919                                     && token1->val.str.text[0] == '@'
2920                                     && (b == CPP_NAME || b == CPP_STRING)));
2921     case CPP_STRING:
2922     case CPP_WSTRING:
2923     case CPP_UTF8STRING:
2924     case CPP_STRING16:
2925     case CPP_STRING32:  return (CPP_OPTION (pfile, user_literals)
2926                                 && (b == CPP_NAME
2927                                     || (TOKEN_SPELL (token2) == SPELL_LITERAL
2928                                         && ISIDST (token2->val.str.text[0]))));
2929
2930     default:            break;
2931     }
2932
2933   return 0;
2934 }
2935
2936 /* Output all the remaining tokens on the current line, and a newline
2937    character, to FP.  Leading whitespace is removed.  If there are
2938    macros, special token padding is not performed.  */
2939 void
2940 cpp_output_line (cpp_reader *pfile, FILE *fp)
2941 {
2942   const cpp_token *token;
2943
2944   token = cpp_get_token (pfile);
2945   while (token->type != CPP_EOF)
2946     {
2947       cpp_output_token (token, fp);
2948       token = cpp_get_token (pfile);
2949       if (token->flags & PREV_WHITE)
2950         putc (' ', fp);
2951     }
2952
2953   putc ('\n', fp);
2954 }
2955
2956 /* Return a string representation of all the remaining tokens on the
2957    current line.  The result is allocated using xmalloc and must be
2958    freed by the caller.  */
2959 unsigned char *
2960 cpp_output_line_to_string (cpp_reader *pfile, const unsigned char *dir_name)
2961 {
2962   const cpp_token *token;
2963   unsigned int out = dir_name ? ustrlen (dir_name) : 0;
2964   unsigned int alloced = 120 + out;
2965   unsigned char *result = (unsigned char *) xmalloc (alloced);
2966
2967   /* If DIR_NAME is empty, there are no initial contents.  */
2968   if (dir_name)
2969     {
2970       sprintf ((char *) result, "#%s ", dir_name);
2971       out += 2;
2972     }
2973
2974   token = cpp_get_token (pfile);
2975   while (token->type != CPP_EOF)
2976     {
2977       unsigned char *last;
2978       /* Include room for a possible space and the terminating nul.  */
2979       unsigned int len = cpp_token_len (token) + 2;
2980
2981       if (out + len > alloced)
2982         {
2983           alloced *= 2;
2984           if (out + len > alloced)
2985             alloced = out + len;
2986           result = (unsigned char *) xrealloc (result, alloced);
2987         }
2988
2989       last = cpp_spell_token (pfile, token, &result[out], 0);
2990       out = last - result;
2991
2992       token = cpp_get_token (pfile);
2993       if (token->flags & PREV_WHITE)
2994         result[out++] = ' ';
2995     }
2996
2997   result[out] = '\0';
2998   return result;
2999 }
3000
/* Memory buffers.  Changing these three constants can have a dramatic
   effect on performance.  The values here are reasonable defaults,
   but might be tuned.  If you adjust them, be sure to test across a
   range of uses of cpplib, including heavy nested function-like macro
   expansion.  Also check the change in peak memory usage (NJAMD is a
   good tool for this).

   MIN_BUFF_SIZE is the smallest buffer new_buff will create.
   BUFF_SIZE_UPPER_BOUND (MIN_SIZE) is the largest free buffer that
   _cpp_get_buff will hand out for a request of MIN_SIZE bytes.
   EXTENDED_BUFF_SIZE (BUFF, MIN_EXTRA) is the size requested when
   extending BUFF: MIN_EXTRA plus twice the distance from BUFF's cur
   to its limit.  Every macro argument is parenthesized so that an
   arbitrary expression can be passed safely.  */
#define MIN_BUFF_SIZE 8000
#define BUFF_SIZE_UPPER_BOUND(MIN_SIZE) (MIN_BUFF_SIZE + (MIN_SIZE) * 3 / 2)
#define EXTENDED_BUFF_SIZE(BUFF, MIN_EXTRA) \
        ((MIN_EXTRA) + ((BUFF)->limit - (BUFF)->cur) * 2)

#if MIN_BUFF_SIZE > BUFF_SIZE_UPPER_BOUND (0)
  #error BUFF_SIZE_UPPER_BOUND must be at least as large as MIN_BUFF_SIZE!
#endif
3015
3016 /* Create a new allocation buffer.  Place the control block at the end
3017    of the buffer, so that buffer overflows will cause immediate chaos.  */
3018 static _cpp_buff *
3019 new_buff (size_t len)
3020 {
3021   _cpp_buff *result;
3022   unsigned char *base;
3023
3024   if (len < MIN_BUFF_SIZE)
3025     len = MIN_BUFF_SIZE;
3026   len = CPP_ALIGN (len);
3027
3028 #ifdef ENABLE_VALGRIND_CHECKING
3029   /* Valgrind warns about uses of interior pointers, so put _cpp_buff
3030      struct first.  */
3031   size_t slen = CPP_ALIGN2 (sizeof (_cpp_buff), 2 * DEFAULT_ALIGNMENT);
3032   base = XNEWVEC (unsigned char, len + slen);
3033   result = (_cpp_buff *) base;
3034   base += slen;
3035 #else
3036   base = XNEWVEC (unsigned char, len + sizeof (_cpp_buff));
3037   result = (_cpp_buff *) (base + len);
3038 #endif
3039   result->base = base;
3040   result->cur = base;
3041   result->limit = base + len;
3042   result->next = NULL;
3043   return result;
3044 }
3045
3046 /* Place a chain of unwanted allocation buffers on the free list.  */
3047 void
3048 _cpp_release_buff (cpp_reader *pfile, _cpp_buff *buff)
3049 {
3050   _cpp_buff *end = buff;
3051
3052   while (end->next)
3053     end = end->next;
3054   end->next = pfile->free_buffs;
3055   pfile->free_buffs = buff;
3056 }
3057
3058 /* Return a free buffer of size at least MIN_SIZE.  */
3059 _cpp_buff *
3060 _cpp_get_buff (cpp_reader *pfile, size_t min_size)
3061 {
3062   _cpp_buff *result, **p;
3063
3064   for (p = &pfile->free_buffs;; p = &(*p)->next)
3065     {
3066       size_t size;
3067
3068       if (*p == NULL)
3069         return new_buff (min_size);
3070       result = *p;
3071       size = result->limit - result->base;
3072       /* Return a buffer that's big enough, but don't waste one that's
3073          way too big.  */
3074       if (size >= min_size && size <= BUFF_SIZE_UPPER_BOUND (min_size))
3075         break;
3076     }
3077
3078   *p = result->next;
3079   result->next = NULL;
3080   result->cur = result->base;
3081   return result;
3082 }
3083
3084 /* Creates a new buffer with enough space to hold the uncommitted
3085    remaining bytes of BUFF, and at least MIN_EXTRA more bytes.  Copies
3086    the excess bytes to the new buffer.  Chains the new buffer after
3087    BUFF, and returns the new buffer.  */
3088 _cpp_buff *
3089 _cpp_append_extend_buff (cpp_reader *pfile, _cpp_buff *buff, size_t min_extra)
3090 {
3091   size_t size = EXTENDED_BUFF_SIZE (buff, min_extra);
3092   _cpp_buff *new_buff = _cpp_get_buff (pfile, size);
3093
3094   buff->next = new_buff;
3095   memcpy (new_buff->base, buff->cur, BUFF_ROOM (buff));
3096   return new_buff;
3097 }
3098
3099 /* Creates a new buffer with enough space to hold the uncommitted
3100    remaining bytes of the buffer pointed to by BUFF, and at least
3101    MIN_EXTRA more bytes.  Copies the excess bytes to the new buffer.
3102    Chains the new buffer before the buffer pointed to by BUFF, and
3103    updates the pointer to point to the new buffer.  */
3104 void
3105 _cpp_extend_buff (cpp_reader *pfile, _cpp_buff **pbuff, size_t min_extra)
3106 {
3107   _cpp_buff *new_buff, *old_buff = *pbuff;
3108   size_t size = EXTENDED_BUFF_SIZE (old_buff, min_extra);
3109
3110   new_buff = _cpp_get_buff (pfile, size);
3111   memcpy (new_buff->base, old_buff->cur, BUFF_ROOM (old_buff));
3112   new_buff->next = old_buff;
3113   *pbuff = new_buff;
3114 }
3115
3116 /* Free a chain of buffers starting at BUFF.  */
3117 void
3118 _cpp_free_buff (_cpp_buff *buff)
3119 {
3120   _cpp_buff *next;
3121
3122   for (; buff; buff = next)
3123     {
3124       next = buff->next;
3125 #ifdef ENABLE_VALGRIND_CHECKING
3126       free (buff);
3127 #else
3128       free (buff->base);
3129 #endif
3130     }
3131 }
3132
3133 /* Allocate permanent, unaligned storage of length LEN.  */
3134 unsigned char *
3135 _cpp_unaligned_alloc (cpp_reader *pfile, size_t len)
3136 {
3137   _cpp_buff *buff = pfile->u_buff;
3138   unsigned char *result = buff->cur;
3139
3140   if (len > (size_t) (buff->limit - result))
3141     {
3142       buff = _cpp_get_buff (pfile, len);
3143       buff->next = pfile->u_buff;
3144       pfile->u_buff = buff;
3145       result = buff->cur;
3146     }
3147
3148   buff->cur = result + len;
3149   return result;
3150 }
3151
3152 /* Allocate permanent, unaligned storage of length LEN from a_buff.
3153    That buffer is used for growing allocations when saving macro
3154    replacement lists in a #define, and when parsing an answer to an
3155    assertion in #assert, #unassert or #if (and therefore possibly
3156    whilst expanding macros).  It therefore must not be used by any
3157    code that they might call: specifically the lexer and the guts of
3158    the macro expander.
3159
3160    All existing other uses clearly fit this restriction: storing
3161    registered pragmas during initialization.  */
3162 unsigned char *
3163 _cpp_aligned_alloc (cpp_reader *pfile, size_t len)
3164 {
3165   _cpp_buff *buff = pfile->a_buff;
3166   unsigned char *result = buff->cur;
3167
3168   if (len > (size_t) (buff->limit - result))
3169     {
3170       buff = _cpp_get_buff (pfile, len);
3171       buff->next = pfile->a_buff;
3172       pfile->a_buff = buff;
3173       result = buff->cur;
3174     }
3175
3176   buff->cur = result + len;
3177   return result;
3178 }
3179
3180 /* Say which field of TOK is in use.  */
3181
3182 enum cpp_token_fld_kind
3183 cpp_token_val_index (const cpp_token *tok)
3184 {
3185   switch (TOKEN_SPELL (tok))
3186     {
3187     case SPELL_IDENT:
3188       return CPP_TOKEN_FLD_NODE;
3189     case SPELL_LITERAL:
3190       return CPP_TOKEN_FLD_STR;
3191     case SPELL_OPERATOR:
3192       if (tok->type == CPP_PASTE)
3193         return CPP_TOKEN_FLD_TOKEN_NO;
3194       else
3195         return CPP_TOKEN_FLD_NONE;
3196     case SPELL_NONE:
3197       if (tok->type == CPP_MACRO_ARG)
3198         return CPP_TOKEN_FLD_ARG_NO;
3199       else if (tok->type == CPP_PADDING)
3200         return CPP_TOKEN_FLD_SOURCE;
3201       else if (tok->type == CPP_PRAGMA)
3202         return CPP_TOKEN_FLD_PRAGMA;
3203       /* else fall through */
3204     default:
3205       return CPP_TOKEN_FLD_NONE;
3206     }
3207 }
3208
3209 /* All tokens lexed in R after calling this function will be forced to have
3210    their source_location the same as the location referenced by P, until
3211    cpp_stop_forcing_token_locations is called for R.  */
3212
3213 void
3214 cpp_force_token_locations (cpp_reader *r, source_location *p)
3215 {
3216   r->forced_token_location_p = p;
3217 }
3218
3219 /* Go back to assigning locations naturally for lexed tokens.  */
3220
3221 void
3222 cpp_stop_forcing_token_locations (cpp_reader *r)
3223 {
3224   r->forced_token_location_p = NULL;
3225 }