libcpp/lex.c

   1 /* CPP Library - lexical analysis.
   2    Copyright (C) 2000-2013 Free Software Foundation, Inc.
   3    Contributed by Per Bothner, 1994-95.
   4    Based on CCCP program by Paul Rubin, June 1986
   5    Adapted to ANSI C, Richard Stallman, Jan 1987
   6    Broken out to separate file, Zack Weinberg, Mar 2000
   7
   8 This program is free software; you can redistribute it and/or modify it
   9 under the terms of the GNU General Public License as published by the
  10 Free Software Foundation; either version 3, or (at your option) any
  11 later version.
  12
  13 This program is distributed in the hope that it will be useful,
  14 but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16 GNU General Public License for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with this program; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "cpplib.h"
  25 #include "internal.h"
  26
  27 enum spell_type
  28 {
       /* How a token type is spelled.  Each token type is assigned one
          of these categories in token_spellings below.  */
  29   SPELL_OPERATOR = 0,
  30   SPELL_IDENT,
  31   SPELL_LITERAL,
  32   SPELL_NONE
  33 };
  34
     /* One row of the spelling table: the spelling category of a token
        type plus a string naming it.  */
  35 struct token_spelling
  36 {
  37   enum spell_type category;
  38   const unsigned char *name;
  39 };
  40
     /* Spellings of the digraph tokens.  */
  41 static const unsigned char *const digraph_spellings[] =
  42 { UC"%:", UC"%:%:", UC"<:", UC":>", UC"<%", UC"%>" };
  43
     /* Build token_spellings by expanding TTYPE_TABLE (defined outside this
        part of the file) with temporary definitions of OP and TK: an
        OP (e, s) entry becomes { SPELL_OPERATOR, s } and a TK (e, s) entry
        becomes { SPELL_<s>, "<e>" }, i.e. its name is the stringified
        enumerator.  */
  44 #define OP(e, s) { SPELL_OPERATOR, UC s  },
  45 #define TK(e, s) { SPELL_ ## s,    UC #e },
  46 static const struct token_spelling token_spellings[N_TTYPES] = { TTYPE_TABLE };
  47 #undef OP
  48 #undef TK
  49
     /* Look up the spelling category, respectively the name string, of
        TOKEN through its type.  */
  50 #define TOKEN_SPELL(token) (token_spellings[(token)->type].category)
  51 #define TOKEN_NAME(token) (token_spellings[(token)->type].name)
  52
     /* Forward declarations of helpers that are local to this file.  */
  53 static void add_line_note (cpp_buffer *, const uchar *, unsigned int);
  54 static int skip_line_comment (cpp_reader *);
  55 static void skip_whitespace (cpp_reader *, cppchar_t);
  56 static void lex_string (cpp_reader *, cpp_token *, const uchar *);
  57 static void save_comment (cpp_reader *, cpp_token *, const uchar *, cppchar_t);
  58 static void store_comment (cpp_reader *, cpp_token *);
  59 static void create_literal (cpp_reader *, cpp_token *, const uchar *,
  60                             unsigned int, enum cpp_ttype);
  61 static bool warn_in_comment (cpp_reader *, _cpp_line_note *);
  62 static int name_p (cpp_reader *, const cpp_string *);
  63 static tokenrun *next_tokenrun (tokenrun *);
  64
  65 static _cpp_buff *new_buff (size_t);
  66
  67
  68 /* Utility routine:
  69
  70    Compares, the token TOKEN to the NUL-terminated string STRING.
  71    TOKEN must be a CPP_NAME.  Returns 1 for equal, 0 for unequal.  */
  72 int
  73 cpp_ideq (const cpp_token *token, const char *string)
  74 {
  75   if (token->type != CPP_NAME)
  76     return 0;
  77
  78   return !ustrcmp (NODE_NAME (token->val.node.node), (const uchar *) string);
  79 }
  80
  81 /* Record a note TYPE at byte POS into the current cleaned logical
  82    line.  */
  83 static void
  84 add_line_note (cpp_buffer *buffer, const uchar *pos, unsigned int type)
  85 {
  86   if (buffer->notes_used == buffer->notes_cap)
  87     {
  88       buffer->notes_cap = buffer->notes_cap * 2 + 200;
  89       buffer->notes = XRESIZEVEC (_cpp_line_note, buffer->notes,
  90                                   buffer->notes_cap);
  91     }
  92
  93   buffer->notes[buffer->notes_used].pos = pos;
  94   buffer->notes[buffer->notes_used].type = type;
  95   buffer->notes_used++;
  96 }
  97
  98 \f
  99 /* Fast path to find line special characters using optimized character
 100    scanning algorithms.  Anything complicated falls back to the slow
 101    path below.  Since this loop is very hot it's worth doing these kinds
 102    of optimizations.
 103
 104    One of the paths through the ifdefs should provide
 105
 106      const uchar *search_line_fast (const uchar *s, const uchar *end);
 107
 108    Between S and END, search for \n, \r, \\, ?.  Return a pointer to
 109    the found character.
 110
 111    Note that the last character of the buffer is *always* a newline,
 112    as forced by _cpp_convert_input.  This fact can be used to avoid
 113    explicitly looking for the end of the buffer.  */
 114
 115 /* Configure gives us an ifdef test.  Make sure the macro is always
        defined so that it can be tested as a value, e.g. in a plain `if'.  */
 116 #ifndef WORDS_BIGENDIAN
 117 #define WORDS_BIGENDIAN 0
 118 #endif
 119
 120 /* We'd like the largest integer that fits into a register.  There's nothing
 121    in <stdint.h> that gives us that.  For most hosts this is unsigned long,
 122    but MS decided on an LLP64 model.  Thankfully when building with GCC we
 123    can get the "real" word size.  */
 124 #ifdef __GNUC__
 125 typedef unsigned int word_type __attribute__((__mode__(__word__)));
 126 #else
 127 typedef unsigned long word_type;
 128 #endif
 129
 130 /* The code below is only expecting sizes 4 or 8.
 131    Die at compile-time if this expectation is violated: the array
        size below evaluates to 1 when the size is acceptable and to -1,
        which is not a valid array size, otherwise.  */
 132 typedef char check_word_type_size
 133   [(sizeof(word_type) == 8 || sizeof(word_type) == 4) * 2 - 1];
 134
 135 /* Return X with the first N bytes forced to values that won't match one
 136    of the interesting characters.  Note that NUL is not interesting.  */
 137
 138 static inline word_type
 139 acc_char_mask_misalign (word_type val, unsigned int n)
 140 {
 141   word_type mask = -1;
 142   if (WORDS_BIGENDIAN)
 143     mask >>= n * 8;
 144   else
 145     mask <<= n * 8;
 146   return val & mask;
 147 }
 148
 149 /* Return X replicated to all byte positions within WORD_TYPE.  */
 150
 151 static inline word_type
 152 acc_char_replicate (uchar x)
 153 {
 154   word_type ret;
 155
 156   ret = (x << 24) | (x << 16) | (x << 8) | x;
 157   if (sizeof(word_type) == 8)
 158     ret = (ret << 16 << 16) | ret;
 159   return ret;
 160 }
 161
 162 /* Return non-zero if some byte of VAL is (probably) C.  */
 163
 164 static inline word_type
 165 acc_char_cmp (word_type val, word_type c)
 166 {
 167 #if defined(__GNUC__) && defined(__alpha__)
 168   /* We can get exact results using a compare-bytes instruction.
 169      Get (val == c) via (0 >= (val ^ c)).  */
 170   return __builtin_alpha_cmpbge (0, val ^ c);
 171 #else
 172   word_type magic = 0x7efefefeU;
 173   if (sizeof(word_type) == 8)
 174     magic = (magic << 16 << 16) | 0xfefefefeU;
 175   magic |= 1;
 176
 177   val ^= c;
 178   return ((val + magic) ^ ~val) & ~magic;
 179 #endif
 180 }
 181
 182 /* Given the result of acc_char_cmp is non-zero, return the index of
 183    the found character.  If this was a false positive, return -1.  */
 184
 185 static inline int
 186 acc_char_index (word_type cmp ATTRIBUTE_UNUSED,
 187                 word_type val ATTRIBUTE_UNUSED)
 188 {
 189 #if defined(__GNUC__) && defined(__alpha__) && !WORDS_BIGENDIAN
 190   /* The cmpbge instruction sets *bits* of the result corresponding to
 191      matches in the bytes with no false positives.  */
 192   return __builtin_ctzl (cmp);
 193 #else
 194   unsigned int i;
 195
 196   /* ??? It would be nice to force unrolling here,
 197      and have all of these constants folded.  */
 198   for (i = 0; i < sizeof(word_type); ++i)
 199     {
 200       uchar c;
 201       if (WORDS_BIGENDIAN)
 202         c = (val >> (sizeof(word_type) - i - 1) * 8) & 0xff;
 203       else
 204         c = (val >> i * 8) & 0xff;
 205
 206       if (c == '\n' || c == '\r' || c == '\\' || c == '?')
 207         return i;
 208     }
 209
 210   return -1;
 211 #endif
 212 }
 213
 214 /* A version of the fast scanner using bit fiddling techniques.
 215
 216    For 32-bit words, one would normally perform 16 comparisons and
 217    16 branches.  With this algorithm one performs 24 arithmetic
 218    operations and one branch.  Whether this is faster with a 32-bit
 219    word size is going to be somewhat system dependent.
 220
 221    For 64-bit words, we eliminate twice the number of comparisons
 222    and branches without increasing the number of arithmetic operations.
 223    It's almost certainly going to be a win with 64-bit word size.  */
 224
 225 static const uchar * search_line_acc_char (const uchar *, const uchar *)
 226   ATTRIBUTE_UNUSED;
 227
 228 static const uchar *
 229 search_line_acc_char (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 230 {
 231   const word_type repl_nl = acc_char_replicate ('\n');
 232   const word_type repl_cr = acc_char_replicate ('\r');
 233   const word_type repl_bs = acc_char_replicate ('\\');
 234   const word_type repl_qm = acc_char_replicate ('?');
 235
 236   unsigned int misalign;
 237   const word_type *p;
 238   word_type val, t;
 239
 240   /* Align the buffer.  Mask out any bytes from before the beginning.  */
 241   p = (word_type *)((uintptr_t)s & -sizeof(word_type));
 242   val = *p;
 243   misalign = (uintptr_t)s & (sizeof(word_type) - 1);
 244   if (misalign)
 245     val = acc_char_mask_misalign (val, misalign);
 246
 247   /* Main loop.  */
 248   while (1)
 249     {
 250       t  = acc_char_cmp (val, repl_nl);
 251       t |= acc_char_cmp (val, repl_cr);
 252       t |= acc_char_cmp (val, repl_bs);
 253       t |= acc_char_cmp (val, repl_qm);
 254
 255       if (__builtin_expect (t != 0, 0))
 256         {
 257           int i = acc_char_index (t, val);
 258           if (i >= 0)
 259             return (const uchar *)p + i;
 260         }
 261
 262       val = *++p;
 263     }
 264 }
 265
 266 /* Disable on Solaris 2/x86 until the following problems can be properly
 267    autoconfed:
 268
 269    The Solaris 9 assembler cannot assemble SSE4.2 insns.
 270    Before Solaris 9 Update 6, SSE insns cannot be executed.
 271    The Solaris 10+ assembler tags objects with the instruction set
 272    extensions used, so SSE4.2 executables cannot run on machines that
 273    don't support that extension.  */
 274
 275 #if (GCC_VERSION >= 4005) && (defined(__i386__) || defined(__x86_64__)) && !(defined(__sun__) && defined(__svr4__))
 276
 277 /* Replicated character data to be shared between implementations.
 278    Recall that outside of a context with vector support we can't
 279    define compatible vector types, therefore these are all defined
 280    in terms of raw characters.  Row 0 holds '\n', row 1 '\r', row 2
        '\\' and row 3 '?'.  Each row is 16 bytes long and the table is
        16-byte aligned; the MMX and SSE2 scanners below load a row (or
        its first 8 bytes) as one vector.  */
 281 static const char repl_chars[4][16] __attribute__((aligned(16))) = {
 282   { '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
 283     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n' },
 284   { '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
 285     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r' },
 286   { '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
 287     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\' },
 288   { '?', '?', '?', '?', '?', '?', '?', '?',
 289     '?', '?', '?', '?', '?', '?', '?', '?' },
 290 };
 291
 292 /* A version of the fast scanner using MMX vectorized byte compare insns.
 293
 294    This uses the PMOVMSKB instruction which was introduced with "MMX2",
 295    which was packaged into SSE1; it is also present in the AMD MMX
 296    extension.  Mark the function as using "sse" so that we emit a real
 297    "emms" instruction, rather than the 3dNOW "femms" instruction.  */
 298
 299 static const uchar *
 300 #ifndef __SSE__
 301 __attribute__((__target__("sse")))
 302 #endif
 303 search_line_mmx (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 304 {
 305   typedef char v8qi __attribute__ ((__vector_size__ (8)));
 306   typedef int __m64 __attribute__ ((__vector_size__ (8), __may_alias__));
 307
 308   const v8qi repl_nl = *(const v8qi *)repl_chars[0];
 309   const v8qi repl_cr = *(const v8qi *)repl_chars[1];
 310   const v8qi repl_bs = *(const v8qi *)repl_chars[2];
 311   const v8qi repl_qm = *(const v8qi *)repl_chars[3];
 312
 313   unsigned int misalign, found, mask;
 314   const v8qi *p;
 315   v8qi data, t, c;
 316
 317   /* Align the source pointer.  While MMX doesn't generate unaligned data
 318      faults, this allows us to safely scan to the end of the buffer without
 319      reading beyond the end of the last page.  */
 320   misalign = (uintptr_t)s & 7;
 321   p = (const v8qi *)((uintptr_t)s & -8);
 322   data = *p;
 323
 324   /* Create a mask for the bytes that are valid within the first
 325      16-byte block.  The Idea here is that the AND with the mask
 326      within the loop is "free", since we need some AND or TEST
 327      insn in order to set the flags for the branch anyway.  */
 328   mask = -1u << misalign;
 329
 330   /* Main loop processing 8 bytes at a time.  */
 331   goto start;
 332   do
 333     {
 334       data = *++p;
 335       mask = -1;
 336
 337     start:
 338       t = __builtin_ia32_pcmpeqb(data, repl_nl);
 339       c = __builtin_ia32_pcmpeqb(data, repl_cr);
 340       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
 341       c = __builtin_ia32_pcmpeqb(data, repl_bs);
 342       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
 343       c = __builtin_ia32_pcmpeqb(data, repl_qm);
 344       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
 345       found = __builtin_ia32_pmovmskb (t);
 346       found &= mask;
 347     }
 348   while (!found);
 349
 350   __builtin_ia32_emms ();
 351
 352   /* FOUND contains 1 in bits for which we matched a relevant
 353      character.  Conversion to the byte index is trivial.  */
 354   found = __builtin_ctz(found);
 355   return (const uchar *)p + found;
 356 }
 357
 358 /* A version of the fast scanner using SSE2 vectorized byte compare insns.  */
 359
 360 static const uchar *
 361 #ifndef __SSE2__
 362 __attribute__((__target__("sse2")))
 363 #endif
 364 search_line_sse2 (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 365 {
 366   typedef char v16qi __attribute__ ((__vector_size__ (16)));
 367
 368   const v16qi repl_nl = *(const v16qi *)repl_chars[0];
 369   const v16qi repl_cr = *(const v16qi *)repl_chars[1];
 370   const v16qi repl_bs = *(const v16qi *)repl_chars[2];
 371   const v16qi repl_qm = *(const v16qi *)repl_chars[3];
 372
 373   unsigned int misalign, found, mask;
 374   const v16qi *p;
 375   v16qi data, t;
 376
 377   /* Align the source pointer.  */
 378   misalign = (uintptr_t)s & 15;
 379   p = (const v16qi *)((uintptr_t)s & -16);
 380   data = *p;
 381
 382   /* Create a mask for the bytes that are valid within the first
 383      16-byte block.  The Idea here is that the AND with the mask
 384      within the loop is "free", since we need some AND or TEST
 385      insn in order to set the flags for the branch anyway.  */
 386   mask = -1u << misalign;
 387
 388   /* Main loop processing 16 bytes at a time.  */
 389   goto start;
 390   do
 391     {
 392       data = *++p;
 393       mask = -1;
 394
 395     start:
 396       t  = __builtin_ia32_pcmpeqb128(data, repl_nl);
 397       t |= __builtin_ia32_pcmpeqb128(data, repl_cr);
 398       t |= __builtin_ia32_pcmpeqb128(data, repl_bs);
 399       t |= __builtin_ia32_pcmpeqb128(data, repl_qm);
 400       found = __builtin_ia32_pmovmskb128 (t);
 401       found &= mask;
 402     }
 403   while (!found);
 404
 405   /* FOUND contains 1 in bits for which we matched a relevant
 406      character.  Conversion to the byte index is trivial.  */
 407   found = __builtin_ctz(found);
 408   return (const uchar *)p + found;
 409 }
 410
 411 #ifdef HAVE_SSE4
 412 /* A version of the fast scanner using SSE 4.2 vectorized string insns.
        Returns a pointer to the first \n, \r, ? or \\ at or after S.  END
        is only used to decide whether an unaligned 16-byte read at S is
        safe.  */
 413
 414 static const uchar *
 415 #ifndef __SSE4_2__
 416 __attribute__((__target__("sse4.2")))
 417 #endif
 418 search_line_sse42 (const uchar *s, const uchar *end)
 419 {
 420   typedef char v16qi __attribute__ ((__vector_size__ (16)));
       /* The set of characters to look for; only the first 4 bytes are
          meaningful, which is the length passed to PCMPESTRI below.  */
 421   static const v16qi search = { '\n', '\r', '?', '\\' };
 422
 423   uintptr_t si = (uintptr_t)s;
 424   uintptr_t index;
 425
 426   /* Check for unaligned input.  */
 427   if (si & 15)
 428     {
 429       v16qi sv;
 430
 431       if (__builtin_expect (end - s < 16, 0)
 432           && __builtin_expect ((si & 0xfff) > 0xff0, 0))
 433         {
 434           /* There are fewer than 16 bytes left in the buffer, and fewer
 435              than 16 bytes left on the page.  Reading 16 bytes at this
 436              point might generate a spurious page fault.  Defer to the
 437              SSE2 implementation, which already handles alignment.  */
 438           return search_line_sse2 (s, end);
 439         }
 440
 441       /* ??? The builtin doesn't understand that the PCMPESTRI read from
 442          memory need not be aligned.  */
 443       sv = __builtin_ia32_loaddqu ((const char *) s);
 444       index = __builtin_ia32_pcmpestri128 (search, 4, sv, 16, 0);
 445
           /* An index below 16 is the offset of a match within these 16
              bytes; anything else means there was none.  */
 446       if (__builtin_expect (index < 16, 0))
 447         goto found;
 448
 449       /* Advance the pointer to an aligned address.  We will re-scan a
 450          few bytes, but we no longer need to care about reading past the
 451          end of a page, since we're guaranteed a match.  */
 452       s = (const uchar *)((si + 16) & -16);
 453     }
 454
 455   /* Main loop, processing 16 bytes at a time.  By doing the whole loop
 456      in inline assembly, we can make proper use of the flags set.  S is
          pre-decremented by 16 because the loop adds 16 before each
          compare; `jnc' branches back while the carry flag is clear, which
          (per the PCMPESTRI definition) is while nothing matched.  On exit
          INDEX (in %ecx) is the offset of the match within the block at S.  */
 457   __asm (      "sub $16, %1\n"
 458         "       .balign 16\n"
 459         "0:     add $16, %1\n"
 460         "       %vpcmpestri $0, (%1), %2\n"
 461         "       jnc 0b"
 462         : "=&c"(index), "+r"(s)
 463         : "x"(search), "a"(4), "d"(16));
 464
 465  found:
 466   return s + index;
 467 }
 468
 469 #else
 470 /* Work around out-dated assemblers without sse4 support: fall back to
        the SSE2 scanner wherever the SSE4.2 one is asked for.  */
 471 #define search_line_sse42 search_line_sse2
 472 #endif
 473
 474 /* Check the CPU capabilities.  */
 475
 476 #include "../gcc/config/i386/cpuid.h"
 477
 478 typedef const uchar * (*search_line_fast_type) (const uchar *, const uchar *);
 479 static search_line_fast_type search_line_fast;
 480
 481 #define HAVE_init_vectorized_lexer 1
 482 static inline void
 483 init_vectorized_lexer (void)
 484 {
 485   unsigned dummy, ecx = 0, edx = 0;
 486   search_line_fast_type impl = search_line_acc_char;
 487   int minimum = 0;
 488
 489 #if defined(__SSE4_2__)
 490   minimum = 3;
 491 #elif defined(__SSE2__)
 492   minimum = 2;
 493 #elif defined(__SSE__)
 494   minimum = 1;
 495 #endif
 496
 497   if (minimum == 3)
 498     impl = search_line_sse42;
 499   else if (__get_cpuid (1, &dummy, &dummy, &ecx, &edx) || minimum == 2)
 500     {
 501       if (minimum == 3 || (ecx & bit_SSE4_2))
 502         impl = search_line_sse42;
 503       else if (minimum == 2 || (edx & bit_SSE2))
 504         impl = search_line_sse2;
 505       else if (minimum == 1 || (edx & bit_SSE))
 506         impl = search_line_mmx;
 507     }
 508   else if (__get_cpuid (0x80000001, &dummy, &dummy, &dummy, &edx))
 509     {
 510       if (minimum == 1
 511           || (edx & (bit_MMXEXT | bit_CMOV)) == (bit_MMXEXT | bit_CMOV))
 512         impl = search_line_mmx;
 513     }
 514
 515   search_line_fast = impl;
 516 }
 517
 518 #elif defined(_ARCH_PWR8) && defined(__ALTIVEC__)
 519
 520 /* A version of the fast scanner using AltiVec vectorized byte compares
 521    and VSX unaligned loads (when VSX is available).  This is otherwise
 522    the same as the pre-GCC 5 version.
        Returns a pointer to the first \n, \r, \\ or ? at or after S; END
        is not used.
        NOTE(review): unlike the other vector variants this one loads 16
        bytes directly from S without aligning it first, so the last load
        may extend past the terminating newline -- confirm that the buffer
        is padded accordingly.  */
 523
 524 static const uchar *
 525 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 526 {
 527   typedef __attribute__((altivec(vector))) unsigned char vc;
 528
 529   const vc repl_nl = {
 530     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
 531     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
 532   };
 533   const vc repl_cr = {
 534     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
 535     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
 536   };
 537   const vc repl_bs = {
 538     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
 539     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
 540   };
 541   const vc repl_qm = {
 542     '?', '?', '?', '?', '?', '?', '?', '?',
 543     '?', '?', '?', '?', '?', '?', '?', '?',
 544   };
 545   const vc zero = { 0 };
 546
 547   vc data, t;
 548
 549   /* Main loop processing 16 bytes at a time.  */
 550   do
 551     {
 552       vc m_nl, m_cr, m_bs, m_qm;
 553
 554       data = *((const vc *)s);
 555       s += 16;
 556
 557       m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
 558       m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
 559       m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
 560       m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
 561       t = (m_nl | m_cr) | (m_bs | m_qm);
 562
 563       /* T now contains 0xff in bytes for which we matched one of the relevant
 564          characters.  We want to exit the loop if any byte in T is non-zero.
 565          Below is the expansion of vec_any_ne(t, zero).  */
 566     }
 567   while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));
 568
 569   /* Restore s to point to the 16 bytes we just processed.  */
 570   s -= 16;
 571
 572   {
       /* Number of `long' words in one vector.  */
 573 #define N  (sizeof(vc) / sizeof(long))
 574
 575     union {
 576       vc v;
 577       /* Statically assert that N is 2 or 4.  */
 578       unsigned long l[(N == 2 || N == 4) ? N : -1];
 579     } u;
 580     unsigned long l, i = 0;
 581
 582     u.v = t;
 583
 584     /* Find the first word of T that is non-zero.  S is advanced past
            every all-zero word that is skipped, so that it ends up pointing
            at the bytes corresponding to L.  */
 585     switch (N)
 586       {
 587       case 4:
 588         l = u.l[i++];
 589         if (l != 0)
 590           break;
 591         s += sizeof(unsigned long);
 592         l = u.l[i++];
 593         if (l != 0)
 594           break;
 595         s += sizeof(unsigned long);
             /* Fall through: the remaining two words are handled exactly
                like the two-word case.  */
 596       case 2:
 597         l = u.l[i++];
 598         if (l != 0)
 599           break;
 600         s += sizeof(unsigned long);
 601         l = u.l[i];
 602       }
 603
 604     /* L now contains 0xff in bytes for which we matched one of the
 605        relevant characters.  We can find the byte index by finding
 606        its bit index and dividing by 8.  The first byte in memory is
            the most significant one on big-endian and the least significant
            one on little-endian, hence the choice between counting leading
            and trailing zeros.  */
 607 #ifdef __BIG_ENDIAN__
 608     l = __builtin_clzl(l) >> 3;
 609 #else
 610     l = __builtin_ctzl(l) >> 3;
 611 #endif
 612     return s + l;
 613
 614 #undef N
 615   }
 616 }
 617
 618 #elif (GCC_VERSION >= 4005) && defined(__ALTIVEC__) && defined (__BIG_ENDIAN__)
 619
 620 /* A version of the fast scanner using AltiVec vectorized byte compares.
 621    This cannot be used for little endian because vec_lvsl/lvsr are
 622    deprecated for little endian and the code won't work properly.
        Returns a pointer to the first \n, \r, \\ or ? at or after S; END
        is not used.  */
 623 /* ??? Unfortunately, attribute(target("altivec")) is not yet supported,
 624    so we can't compile this function without -maltivec on the command line
 625    (or implied by some other switch).  */
 626
 627 static const uchar *
 628 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 629 {
 630   typedef __attribute__((altivec(vector))) unsigned char vc;
 631
 632   const vc repl_nl = {
 633     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
 634     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
 635   };
 636   const vc repl_cr = {
 637     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
 638     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
 639   };
 640   const vc repl_bs = {
 641     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
 642     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
 643   };
 644   const vc repl_qm = {
 645     '?', '?', '?', '?', '?', '?', '?', '?',
 646     '?', '?', '?', '?', '?', '?', '?', '?',
 647   };
 648   const vc ones = {
 649     -1, -1, -1, -1, -1, -1, -1, -1,
 650     -1, -1, -1, -1, -1, -1, -1, -1,
 651   };
 652   const vc zero = { 0 };
 653
 654   vc data, mask, t;
 655
 656   /* Altivec loads automatically mask addresses with -16.  This lets us
 657      issue the first load as early as possible.  */
 658   data = __builtin_vec_ld(0, (const vc *)s);
 659
 660   /* Discard bytes before the beginning of the buffer.  Do this by
 661      beginning with all ones and shifting in zeros according to the
 662      mis-alignment.  The LVSR instruction pulls the exact shift we
 663      want from the address.  */
 664   mask = __builtin_vec_lvsr(0, s);
 665   mask = __builtin_vec_perm(zero, ones, mask);
 666   data &= mask;
 667
 668   /* While altivec loads mask addresses, we still need to align S so
 669      that the offset we compute at the end is correct.  */
 670   s = (const uchar *)((uintptr_t)s & -16);
 671
 672   /* Main loop processing 16 bytes at a time.  The first, already
          loaded and masked, block enters the loop at `start'.  */
 673   goto start;
 674   do
 675     {
 676       vc m_nl, m_cr, m_bs, m_qm;
 677
 678       s += 16;
 679       data = __builtin_vec_ld(0, (const vc *)s);
 680
 681     start:
 682       m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
 683       m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
 684       m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
 685       m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
 686       t = (m_nl | m_cr) | (m_bs | m_qm);
 687
 688       /* T now contains 0xff in bytes for which we matched one of the relevant
 689          characters.  We want to exit the loop if any byte in T is non-zero.
 690          Below is the expansion of vec_any_ne(t, zero).  */
 691     }
 692   while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));
 693
 694   {
       /* Number of `long' words in one vector.  */
 695 #define N  (sizeof(vc) / sizeof(long))
 696
 697     union {
 698       vc v;
 699       /* Statically assert that N is 2 or 4.  */
 700       unsigned long l[(N == 2 || N == 4) ? N : -1];
 701     } u;
 702     unsigned long l, i = 0;
 703
 704     u.v = t;
 705
 706     /* Find the first word of T that is non-zero.  S is advanced past
            every all-zero word that is skipped, so that it ends up pointing
            at the bytes corresponding to L.  */
 707     switch (N)
 708       {
 709       case 4:
 710         l = u.l[i++];
 711         if (l != 0)
 712           break;
 713         s += sizeof(unsigned long);
 714         l = u.l[i++];
 715         if (l != 0)
 716           break;
 717         s += sizeof(unsigned long);
             /* Fall through: the remaining two words are handled exactly
                like the two-word case.  */
 718       case 2:
 719         l = u.l[i++];
 720         if (l != 0)
 721           break;
 722         s += sizeof(unsigned long);
 723         l = u.l[i];
 724       }
 725
 726     /* L now contains 0xff in bytes for which we matched one of the
 727        relevant characters.  We can find the byte index by finding
 728        its bit index and dividing by 8.  This variant is big-endian
            only, so the first byte in memory is the most significant one
            and leading zeros are counted.  */
 729     l = __builtin_clzl(l) >> 3;
 730     return s + l;
 731
 732 #undef N
 733   }
 734 }
 735
 736 #elif defined (__ARM_NEON__)
 737 #include "arm_neon.h"
 738
 739 static const uchar *
 740 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 741 {
 742   const uint8x16_t repl_nl = vdupq_n_u8 ('\n');
 743   const uint8x16_t repl_cr = vdupq_n_u8 ('\r');
 744   const uint8x16_t repl_bs = vdupq_n_u8 ('\\');
 745   const uint8x16_t repl_qm = vdupq_n_u8 ('?');
 746   const uint8x16_t xmask = (uint8x16_t) vdupq_n_u64 (0x8040201008040201ULL);
 747
 748   unsigned int misalign, found, mask;
 749   const uint8_t *p;
 750   uint8x16_t data;
 751
 752   /* Align the source pointer.  */
 753   misalign = (uintptr_t)s & 15;
 754   p = (const uint8_t *)((uintptr_t)s & -16);
 755   data = vld1q_u8 (p);
 756
 757   /* Create a mask for the bytes that are valid within the first
 758      16-byte block.  The Idea here is that the AND with the mask
 759      within the loop is "free", since we need some AND or TEST
 760      insn in order to set the flags for the branch anyway.  */
 761   mask = (-1u << misalign) & 0xffff;
 762
 763   /* Main loop, processing 16 bytes at a time.  */
 764   goto start;
 765
 766   do
 767     {
 768       uint8x8_t l;
 769       uint16x4_t m;
 770       uint32x2_t n;
 771       uint8x16_t t, u, v, w;
 772
 773       p += 16;
 774       data = vld1q_u8 (p);
 775       mask = 0xffff;
 776
 777     start:
 778       t = vceqq_u8 (data, repl_nl);
 779       u = vceqq_u8 (data, repl_cr);
 780       v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
 781       w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
 782       t = vandq_u8 (vorrq_u8 (v, w), xmask);
 783       l = vpadd_u8 (vget_low_u8 (t), vget_high_u8 (t));
 784       m = vpaddl_u8 (l);
 785       n = vpaddl_u16 (m);
 786
 787       found = vget_lane_u32 ((uint32x2_t) vorr_u64 ((uint64x1_t) n,
 788               vshr_n_u64 ((uint64x1_t) n, 24)), 0);
 789       found &= mask;
 790     }
 791   while (!found);
 792
 793   /* FOUND contains 1 in bits for which we matched a relevant
 794      character.  Conversion to the byte index is trivial.  */
 795   found = __builtin_ctz (found);
 796   return (const uchar *)p + found;
 797 }
 798
 799 #else
 800
 801 /* We only have one accelerated alternative.  Use a direct call so that
 802    we encourage inlining.  */
 803
 804 #define search_line_fast  search_line_acc_char
 805
 806 #endif
 807
 808 /* Initialize the lexer if needed.  Where several vectorized line
        scanners are available (HAVE_init_vectorized_lexer is defined),
        this selects the one to use; otherwise there is nothing to do.  */
 809
 810 void
 811 _cpp_init_lexer (void)
 812 {
 813 #ifdef HAVE_init_vectorized_lexer
 814   init_vectorized_lexer ();
 815 #endif
 816 }
 817
 818 /* Returns with a logical line that contains no escaped newlines or
 819    trigraphs.  This is a time-critical inner loop.

        The line starts at buffer->next_line of PFILE's current buffer.
        Unless the buffer is marked from_stage3, escaped newlines are
        spliced out and, when the trigraphs option is on, trigraphs are
        replaced; both rewrite the buffer in place.  A line note is
        recorded for every escaped newline and every trigraph (converted
        or not).  On return the line is terminated by '\n', buffer->cur
        and buffer->line_base point to its start, and buffer->next_line
        points to the text that follows it.  */
 820 void
 821 _cpp_clean_line (cpp_reader *pfile)
 822 {
 823   cpp_buffer *buffer;
       /* S is the read position; D is the write position (it trails S once
          text has been removed); P is scratch for backward scans.  */
 824   const uchar *s;
 825   uchar c, *d, *p;
 826
 827   buffer = pfile->buffer;
 828   buffer->cur_note = buffer->notes_used = 0;
 829   buffer->cur = buffer->line_base = buffer->next_line;
 830   buffer->need_line = false;
 831   s = buffer->next_line;
 832
 833   if (!buffer->from_stage3)
 834     {
           /* Location of the most recent backslash seen on the fast path.  */
 835       const uchar *pbackslash = NULL;
 836
 837       /* Fast path.  This is the common case of an un-escaped line with
 838          no trigraphs.  The primary win here is by not writing any
 839          data back to memory until we have to.  */
 840       while (1)
 841         {
 842           /* Perform an optimized search for \n, \r, \\, ?.  */
 843           s = search_line_fast (s, buffer->rlimit);
 844
 845           c = *s;
 846           if (c == '\\')
 847             {
 848               /* Record the location of the backslash and continue.  */
 849               pbackslash = s++;
 850             }
 851           else if (__builtin_expect (c == '?', 0))
 852             {
 853               if (__builtin_expect (s[1] == '?', false)
 854                    && _cpp_trigraph_map[s[2]])
 855                 {
 856                   /* Have a trigraph.  We may or may not have to convert
 857                      it.  Add a line note regardless, for -Wtrigraphs.  */
 858                   add_line_note (buffer, s, s[2]);
 859                   if (CPP_OPTION (pfile, trigraphs))
 860                     {
 861                       /* We do, and that means we have to switch to the
 862                          slow path.  Overwrite the first `?' with the
                              replacement character and leave S on the last
                              character of the trigraph.  */
 863                       d = (uchar *) s;
 864                       *d = _cpp_trigraph_map[s[2]];
 865                       s += 2;
 866                       goto slow_path;
 867                     }
 868                 }
 869               /* Not a trigraph, or trigraphs are not being converted.
                      Continue on fast-path.  */
 870               s++;
 871             }
 872           else
 873             break;
 874         }
 875
 876       /* This must be \r or \n.  We're either done, or we'll be forced
 877          to write back to the buffer and continue on the slow path.  */
 878       d = (uchar *) s;
 879
 880       if (__builtin_expect (s == buffer->rlimit, false))
 881         goto done;
 882
 883       /* DOS line ending? */
 884       if (__builtin_expect (c == '\r', false) && s[1] == '\n')
 885         {
 886           s++;
 887           if (s == buffer->rlimit)
 888             goto done;
 889         }
 890
           /* Without a backslash on the line the newline cannot be escaped.  */
 891       if (__builtin_expect (pbackslash == NULL, true))
 892         goto done;
 893
 894       /* Check for escaped newline: skip backwards over white space and
              see whether that leads to the last backslash.  */
 895       p = d;
 896       while (is_nvspace (p[-1]))
 897         p--;
 898       if (p - 1 != pbackslash)
 899         goto done;
 900
 901       /* Have an escaped newline; process it and proceed to
 902          the slow path.  The note records whether white space separated
              the backslash from the newline (' ') or not ('\\').  From here
              on buffer->next_line temporarily holds the position of the most
              recent splice, which bounds the backward scans below; its real
              value is stored at `done'.  */
 903       add_line_note (buffer, p - 1, p != d ? ' ' : '\\');
 904       d = p - 2;
 905       buffer->next_line = p - 1;
 906
 907     slow_path:
           /* Slow path: copy the rest of the line down from S to D, one
              character at a time.  */
 908       while (1)
 909         {
 910           c = *++s;
 911           *++d = c;
 912
 913           if (c == '\n' || c == '\r')
 914             {
 915               /* Handle DOS line endings.  */
 916               if (c == '\r' && s != buffer->rlimit && s[1] == '\n')
 917                 s++;
 918               if (s == buffer->rlimit)
 919                 break;
 920
 921               /* Escaped?  */
 922               p = d;
 923               while (p != buffer->next_line && is_nvspace (p[-1]))
 924                 p--;
 925               if (p == buffer->next_line || p[-1] != '\\')
 926                 break;
 927
 928               add_line_note (buffer, p - 1, p != d ? ' ': '\\');
 929               d = p - 2;
 930               buffer->next_line = p - 1;
 931             }
 932           else if (c == '?' && s[1] == '?' && _cpp_trigraph_map[s[2]])
 933             {
 934               /* Add a note regardless, for the benefit of -Wtrigraphs.  */
 935               add_line_note (buffer, d, s[2]);
 936               if (CPP_OPTION (pfile, trigraphs))
 937                 {
 938                   *d = _cpp_trigraph_map[s[2]];
 939                   s += 2;
 940                 }
 941             }
 942         }
 943     }
 944   else
 945     {
           /* from_stage3 buffer: only look for the end of the line; no
              escaped newlines or trigraphs are processed.  */
 946       while (*s != '\n' && *s != '\r')
 947         s++;
 948       d = (uchar *) s;
 949
 950       /* Handle DOS line endings.  */
 951       if (*s == '\r' && s != buffer->rlimit && s[1] == '\n')
 952         s++;
 953     }
 954
 955  done:
       /* D is where the cleaned line ends; S is the last character consumed
          from the input.  */
 956   *d = '\n';
 957   /* A sentinel note that should never be processed.  */
 958   add_line_note (buffer, d + 1, '\n');
 959   buffer->next_line = s + 1;
 960 }
 961
 962 /* Return true if the trigraph indicated by NOTE should be warned
 963    about in a comment.  */
 964 static bool
 965 warn_in_comment (cpp_reader *pfile, _cpp_line_note *note)
 966 {
 967   const uchar *p;
 968
 969   /* Within comments we don't warn about trigraphs, unless the
 970      trigraph forms an escaped newline, as that may change
 971      behavior.  */
 972   if (note->type != '/')
 973     return false;
 974
 975   /* If -trigraphs, then this was an escaped newline iff the next note
 976      is coincident.  */
 977   if (CPP_OPTION (pfile, trigraphs))
 978     return note[1].pos == note->pos;
 979
 980   /* Otherwise, see if this forms an escaped newline.  */
 981   p = note->pos + 3;
 982   while (is_nvspace (*p))
 983     p++;
 984
 985   /* There might have been escaped newlines between the trigraph and the
 986      newline we found.  Hence the position test.  */
 987   return (*p == '\n' && p < note[1].pos);
 988 }
 989
 990 /* Process the notes created by add_line_note as far as the current
 991    location.  */
 992 void
 993 _cpp_process_line_notes (cpp_reader *pfile, int in_comment)
 994 {
 995   cpp_buffer *buffer = pfile->buffer;
 996
 997   for (;;)
 998     {
 999       _cpp_line_note *note = &buffer->notes[buffer->cur_note];
1000       unsigned int col;
1001
1002       if (note->pos > buffer->cur)
1003         break;
1004
1005       buffer->cur_note++;
1006       col = CPP_BUF_COLUMN (buffer, note->pos + 1);
1007
1008       if (note->type == '\\' || note->type == ' ')
1009         {
1010           if (note->type == ' ' && !in_comment)
1011             cpp_error_with_line (pfile, CPP_DL_WARNING, pfile->line_table->highest_line, col,
1012                                  "backslash and newline separated by space");
1013
1014           if (buffer->next_line > buffer->rlimit)
1015             {
1016               cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line, col,
1017                                    "backslash-newline at end of file");
1018               /* Prevent "no newline at end of file" warning.  */
1019               buffer->next_line = buffer->rlimit;
1020             }
1021
1022           buffer->line_base = note->pos;
1023           CPP_INCREMENT_LINE (pfile, 0);
1024         }
1025       else if (_cpp_trigraph_map[note->type])
1026         {
1027           if (CPP_OPTION (pfile, warn_trigraphs)
1028               && (!in_comment || warn_in_comment (pfile, note)))
1029             {
1030               if (CPP_OPTION (pfile, trigraphs))
1031                 cpp_warning_with_line (pfile, CPP_W_TRIGRAPHS,
1032                                        pfile->line_table->highest_line, col,
1033                                        "trigraph ??%c converted to %c",
1034                                        note->type,
1035                                        (int) _cpp_trigraph_map[note->type]);
1036               else
1037                 {
1038                   cpp_warning_with_line
1039                     (pfile, CPP_W_TRIGRAPHS,
1040                      pfile->line_table->highest_line, col,
1041                      "trigraph ??%c ignored, use -trigraphs to enable",
1042                      note->type);
1043                 }
1044             }
1045         }
1046       else if (note->type == 0)
1047         /* Already processed in lex_raw_string.  */;
1048       else
1049         abort ();
1050     }
1051 }
1052
1053 /* Skip a C-style block comment.  We find the end of the comment by
1054    seeing if an asterisk is before every '/' we encounter.  Returns
1055    nonzero if comment terminated by EOF, zero otherwise.
1056
1057    Buffer->cur points to the initial asterisk of the comment.  */
1058 bool
1059 _cpp_skip_block_comment (cpp_reader *pfile)
1060 {
1061   cpp_buffer *buffer = pfile->buffer;
1062   const uchar *cur = buffer->cur;
1063   uchar c;
1064
1065   cur++;
1066   if (*cur == '/')
1067     cur++;
1068
1069   for (;;)
1070     {
1071       /* People like decorating comments with '*', so check for '/'
1072          instead for efficiency.  */
1073       c = *cur++;
1074
1075       if (c == '/')
1076         {
1077           if (cur[-2] == '*')
1078             break;
1079
1080           /* Warn about potential nested comments, but not if the '/'
1081              comes immediately before the true comment delimiter.
1082              Don't bother to get it right across escaped newlines.  */
1083           if (CPP_OPTION (pfile, warn_comments)
1084               && cur[0] == '*' && cur[1] != '/')
1085             {
1086               buffer->cur = cur;
1087               cpp_warning_with_line (pfile, CPP_W_COMMENTS,
1088                                      pfile->line_table->highest_line,
1089                                      CPP_BUF_COL (buffer),
1090                                      "\"/*\" within comment");
1091             }
1092         }
1093       else if (c == '\n')
1094         {
1095           unsigned int cols;
1096           buffer->cur = cur - 1;
1097           _cpp_process_line_notes (pfile, true);
1098           if (buffer->next_line >= buffer->rlimit)
1099             return true;
1100           _cpp_clean_line (pfile);
1101
1102           cols = buffer->next_line - buffer->line_base;
1103           CPP_INCREMENT_LINE (pfile, cols);
1104
1105           cur = buffer->cur;
1106         }
1107     }
1108
1109   buffer->cur = cur;
1110   _cpp_process_line_notes (pfile, true);
1111   return false;
1112 }
1113
1114 /* Skip a C++ line comment, leaving buffer->cur pointing to the
1115    terminating newline.  Handles escaped newlines.  Returns nonzero
1116    if a multiline comment.  */
1117 static int
1118 skip_line_comment (cpp_reader *pfile)
1119 {
1120   cpp_buffer *buffer = pfile->buffer;
1121   source_location orig_line = pfile->line_table->highest_line;
1122
1123   while (*buffer->cur != '\n')
1124     buffer->cur++;
1125
1126   _cpp_process_line_notes (pfile, true);
1127   return orig_line != pfile->line_table->highest_line;
1128 }
1129
1130 /* Skips whitespace, saving the next non-whitespace character.  */
1131 static void
1132 skip_whitespace (cpp_reader *pfile, cppchar_t c)
1133 {
1134   cpp_buffer *buffer = pfile->buffer;
1135   bool saw_NUL = false;
1136
1137   do
1138     {
1139       /* Horizontal space always OK.  */
1140       if (c == ' ' || c == '\t')
1141         ;
1142       /* Just \f \v or \0 left.  */
1143       else if (c == '\0')
1144         saw_NUL = true;
1145       else if (pfile->state.in_directive && CPP_PEDANTIC (pfile))
1146         cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line,
1147                              CPP_BUF_COL (buffer),
1148                              "%s in preprocessing directive",
1149                              c == '\f' ? "form feed" : "vertical tab");
1150
1151       c = *buffer->cur++;
1152     }
1153   /* We only want non-vertical space, i.e. ' ' \t \f \v \0.  */
1154   while (is_nvspace (c));
1155
1156   if (saw_NUL)
1157     cpp_error (pfile, CPP_DL_WARNING, "null character(s) ignored");
1158
1159   buffer->cur--;
1160 }
1161
1162 /* See if the characters of a number token are valid in a name (no
1163    '.', '+' or '-').  */
1164 static int
1165 name_p (cpp_reader *pfile, const cpp_string *string)
1166 {
1167   unsigned int i;
1168
1169   for (i = 0; i < string->len; i++)
1170     if (!is_idchar (string->text[i]))
1171       return 0;
1172
1173   return 1;
1174 }
1175
1176 /* After parsing an identifier or other sequence, produce a warning about
1177    sequences not in NFC/NFKC.  */
1178 static void
1179 warn_about_normalization (cpp_reader *pfile,
1180                           const cpp_token *token,
1181                           const struct normalize_state *s)
1182 {
1183   if (CPP_OPTION (pfile, warn_normalize) < NORMALIZE_STATE_RESULT (s)
1184       && !pfile->state.skipping)
1185     {
1186       /* Make sure that the token is printed using UCNs, even
1187          if we'd otherwise happily print UTF-8.  */
1188       unsigned char *buf = XNEWVEC (unsigned char, cpp_token_len (token));
1189       size_t sz;
1190
1191       sz = cpp_spell_token (pfile, token, buf, false) - buf;
1192       if (NORMALIZE_STATE_RESULT (s) == normalized_C)
1193         cpp_warning_with_line (pfile, CPP_W_NORMALIZE, token->src_loc, 0,
1194                                "`%.*s' is not in NFKC", (int) sz, buf);
1195       else
1196         cpp_warning_with_line (pfile, CPP_W_NORMALIZE, token->src_loc, 0,
1197                                "`%.*s' is not in NFC", (int) sz, buf);
1198       free (buf);
1199     }
1200 }
1201
1202 /* Returns TRUE if the sequence starting at buffer->cur is invalid in
1203    an identifier.  FIRST is TRUE if this starts an identifier.  */
1204 static bool
1205 forms_identifier_p (cpp_reader *pfile, int first,
1206                     struct normalize_state *state)
1207 {
1208   cpp_buffer *buffer = pfile->buffer;
1209
1210   if (*buffer->cur == '$')
1211     {
1212       if (!CPP_OPTION (pfile, dollars_in_ident))
1213         return false;
1214
1215       buffer->cur++;
1216       if (CPP_OPTION (pfile, warn_dollars) && !pfile->state.skipping)
1217         {
1218           CPP_OPTION (pfile, warn_dollars) = 0;
1219           cpp_error (pfile, CPP_DL_PEDWARN, "'$' in identifier or number");
1220         }
1221
1222       return true;
1223     }
1224
1225   /* Is this a syntactically valid UCN?  */
1226   if (CPP_OPTION (pfile, extended_identifiers)
1227       && *buffer->cur == '\\'
1228       && (buffer->cur[1] == 'u' || buffer->cur[1] == 'U'))
1229     {
1230       buffer->cur += 2;
1231       if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
1232                           state))
1233         return true;
1234       buffer->cur -= 2;
1235     }
1236
1237   return false;
1238 }
1239
1240 /* Helper function to get the cpp_hashnode of the identifier BASE.  */
1241 static cpp_hashnode *
1242 lex_identifier_intern (cpp_reader *pfile, const uchar *base)
1243 {
1244   cpp_hashnode *result;
1245   const uchar *cur;
1246   unsigned int len;
1247   unsigned int hash = HT_HASHSTEP (0, *base);
1248
1249   cur = base + 1;
1250   while (ISIDNUM (*cur))
1251     {
1252       hash = HT_HASHSTEP (hash, *cur);
1253       cur++;
1254     }
1255   len = cur - base;
1256   hash = HT_HASHFINISH (hash, len);
1257   result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
1258                                               base, len, hash, HT_ALLOC));
1259
1260   /* Rarely, identifiers require diagnostics when lexed.  */
1261   if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
1262                         && !pfile->state.skipping, 0))
1263     {
1264       /* It is allowed to poison the same identifier twice.  */
1265       if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
1266         cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
1267                    NODE_NAME (result));
1268
1269       /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
1270          replacement list of a variadic macro.  */
1271       if (result == pfile->spec_nodes.n__VA_ARGS__
1272           && !pfile->state.va_args_ok)
1273         cpp_error (pfile, CPP_DL_PEDWARN,
1274                    "__VA_ARGS__ can only appear in the expansion"
1275                    " of a C99 variadic macro");
1276
1277       /* For -Wc++-compat, warn about use of C++ named operators.  */
1278       if (result->flags & NODE_WARN_OPERATOR)
1279         cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
1280                      "identifier \"%s\" is a special operator name in C++",
1281                      NODE_NAME (result));
1282     }
1283
1284   return result;
1285 }
1286
1287 /* Get the cpp_hashnode of an identifier specified by NAME in
1288    the current cpp_reader object.  If none is found, NULL is returned.  */
1289 cpp_hashnode *
1290 _cpp_lex_identifier (cpp_reader *pfile, const char *name)
1291 {
1292   cpp_hashnode *result;
1293   result = lex_identifier_intern (pfile, (uchar *) name);
1294   return result;
1295 }
1296
1297 /* Lex an identifier starting at BUFFER->CUR - 1.  */
1298 static cpp_hashnode *
1299 lex_identifier (cpp_reader *pfile, const uchar *base, bool starts_ucn,
1300                 struct normalize_state *nst)
1301 {
1302   cpp_hashnode *result;
1303   const uchar *cur;
1304   unsigned int len;
1305   unsigned int hash = HT_HASHSTEP (0, *base);
1306
1307   cur = pfile->buffer->cur;
1308   if (! starts_ucn)
1309     while (ISIDNUM (*cur))
1310       {
1311         hash = HT_HASHSTEP (hash, *cur);
1312         cur++;
1313       }
1314   pfile->buffer->cur = cur;
1315   if (starts_ucn || forms_identifier_p (pfile, false, nst))
1316     {
1317       /* Slower version for identifiers containing UCNs (or $).  */
1318       do {
1319         while (ISIDNUM (*pfile->buffer->cur))
1320           {
1321             pfile->buffer->cur++;
1322             NORMALIZE_STATE_UPDATE_IDNUM (nst);
1323           }
1324       } while (forms_identifier_p (pfile, false, nst));
1325       result = _cpp_interpret_identifier (pfile, base,
1326                                           pfile->buffer->cur - base);
1327     }
1328   else
1329     {
1330       len = cur - base;
1331       hash = HT_HASHFINISH (hash, len);
1332
1333       result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
1334                                                   base, len, hash, HT_ALLOC));
1335     }
1336
1337   /* Rarely, identifiers require diagnostics when lexed.  */
1338   if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
1339                         && !pfile->state.skipping, 0))
1340     {
1341       /* It is allowed to poison the same identifier twice.  */
1342       if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
1343         cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
1344                    NODE_NAME (result));
1345
1346       /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
1347          replacement list of a variadic macro.  */
1348       if (result == pfile->spec_nodes.n__VA_ARGS__
1349           && !pfile->state.va_args_ok)
1350         cpp_error (pfile, CPP_DL_PEDWARN,
1351                    "__VA_ARGS__ can only appear in the expansion"
1352                    " of a C99 variadic macro");
1353
1354       /* For -Wc++-compat, warn about use of C++ named operators.  */
1355       if (result->flags & NODE_WARN_OPERATOR)
1356         cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
1357                      "identifier \"%s\" is a special operator name in C++",
1358                      NODE_NAME (result));
1359     }
1360
1361   return result;
1362 }
1363
1364 /* Lex a number to NUMBER starting at BUFFER->CUR - 1.  */
1365 static void
1366 lex_number (cpp_reader *pfile, cpp_string *number,
1367             struct normalize_state *nst)
1368 {
1369   const uchar *cur;
1370   const uchar *base;
1371   uchar *dest;
1372
1373   base = pfile->buffer->cur - 1;
1374   do
1375     {
1376       cur = pfile->buffer->cur;
1377
1378       /* N.B. ISIDNUM does not include $.  */
1379       while (ISIDNUM (*cur) || *cur == '.' || VALID_SIGN (*cur, cur[-1]))
1380         {
1381           cur++;
1382           NORMALIZE_STATE_UPDATE_IDNUM (nst);
1383         }
1384
1385       pfile->buffer->cur = cur;
1386     }
1387   while (forms_identifier_p (pfile, false, nst));
1388
1389   number->len = cur - base;
1390   dest = _cpp_unaligned_alloc (pfile, number->len + 1);
1391   memcpy (dest, base, number->len);
1392   dest[number->len] = '\0';
1393   number->text = dest;
1394 }
1395
1396 /* Create a token of type TYPE with a literal spelling.  */
1397 static void
1398 create_literal (cpp_reader *pfile, cpp_token *token, const uchar *base,
1399                 unsigned int len, enum cpp_ttype type)
1400 {
1401   uchar *dest = _cpp_unaligned_alloc (pfile, len + 1);
1402
1403   memcpy (dest, base, len);
1404   dest[len] = '\0';
1405   token->type = type;
1406   token->val.str.len = len;
1407   token->val.str.text = dest;
1408 }
1409
1410 /* Subroutine of lex_raw_string: Append LEN chars from BASE to the buffer
1411    sequence from *FIRST_BUFF_P to LAST_BUFF_P.  */
1412
1413 static void
1414 bufring_append (cpp_reader *pfile, const uchar *base, size_t len,
1415                 _cpp_buff **first_buff_p, _cpp_buff **last_buff_p)
1416 {
1417   _cpp_buff *first_buff = *first_buff_p;
1418   _cpp_buff *last_buff = *last_buff_p;
1419
1420   if (first_buff == NULL)
1421     first_buff = last_buff = _cpp_get_buff (pfile, len);
1422   else if (len > BUFF_ROOM (last_buff))
1423     {
1424       size_t room = BUFF_ROOM (last_buff);
1425       memcpy (BUFF_FRONT (last_buff), base, room);
1426       BUFF_FRONT (last_buff) += room;
1427       base += room;
1428       len -= room;
1429       last_buff = _cpp_append_extend_buff (pfile, last_buff, len);
1430     }
1431
1432   memcpy (BUFF_FRONT (last_buff), base, len);
1433   BUFF_FRONT (last_buff) += len;
1434
1435   *first_buff_p = first_buff;
1436   *last_buff_p = last_buff;
1437 }
1438
1439 /* Lexes a raw string.  The stored string contains the spelling, including
1440    double quotes, delimiter string, '(' and ')', any leading
1441    'L', 'u', 'U' or 'u8' and 'R' modifier.  It returns the type of the
1442    literal, or CPP_OTHER if it was not properly terminated.
1443
1444    The spelling is NUL-terminated, but it is not guaranteed that this
1445    is the first NUL since embedded NULs are preserved.  */
1446
1447 static void
1448 lex_raw_string (cpp_reader *pfile, cpp_token *token, const uchar *base,
1449                 const uchar *cur)
1450 {
1451   const uchar *raw_prefix;
1452   unsigned int raw_prefix_len = 0;
1453   enum cpp_ttype type;
1454   size_t total_len = 0;
1455   _cpp_buff *first_buff = NULL, *last_buff = NULL;
1456   _cpp_line_note *note = &pfile->buffer->notes[pfile->buffer->cur_note];
1457
1458   type = (*base == 'L' ? CPP_WSTRING :
1459           *base == 'U' ? CPP_STRING32 :
1460           *base == 'u' ? (base[1] == '8' ? CPP_UTF8STRING : CPP_STRING16)
1461           : CPP_STRING);
1462
1463   raw_prefix = cur + 1;
1464   while (raw_prefix_len < 16)
1465     {
1466       switch (raw_prefix[raw_prefix_len])
1467         {
1468         case ' ': case '(': case ')': case '\\': case '\t':
1469         case '\v': case '\f': case '\n': default:
1470           break;
1471         /* Basic source charset except the above chars.  */
1472         case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
1473         case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
1474         case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
1475         case 's': case 't': case 'u': case 'v': case 'w': case 'x':
1476         case 'y': case 'z':
1477         case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
1478         case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
1479         case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
1480         case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
1481         case 'Y': case 'Z':
1482         case '0': case '1': case '2': case '3': case '4': case '5':
1483         case '6': case '7': case '8': case '9':
1484         case '_': case '{': case '}': case '#': case '[': case ']':
1485         case '<': case '>': case '%': case ':': case ';': case '.':
1486         case '?': case '*': case '+': case '-': case '/': case '^':
1487         case '&': case '|': case '~': case '!': case '=': case ',':
1488         case '"': case '\'':
1489           raw_prefix_len++;
1490           continue;
1491         }
1492       break;
1493     }
1494
1495   if (raw_prefix[raw_prefix_len] != '(')
1496     {
1497       int col = CPP_BUF_COLUMN (pfile->buffer, raw_prefix + raw_prefix_len)
1498                 + 1;
1499       if (raw_prefix_len == 16)
1500         cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, col,
1501                              "raw string delimiter longer than 16 characters");
1502       else
1503         cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, col,
1504                              "invalid character '%c' in raw string delimiter",
1505                              (int) raw_prefix[raw_prefix_len]);
1506       pfile->buffer->cur = raw_prefix - 1;
1507       create_literal (pfile, token, base, raw_prefix - 1 - base, CPP_OTHER);
1508       return;
1509     }
1510
1511   cur = raw_prefix + raw_prefix_len + 1;
1512   for (;;)
1513     {
1514 #define BUF_APPEND(STR,LEN)                                     \
1515       do {                                                      \
1516         bufring_append (pfile, (const uchar *)(STR), (LEN),     \
1517                         &first_buff, &last_buff);               \
1518         total_len += (LEN);                                     \
1519       } while (0);
1520
1521       cppchar_t c;
1522
1523       /* If we previously performed any trigraph or line splicing
1524          transformations, undo them within the body of the raw string.  */
1525       while (note->pos < cur)
1526         ++note;
1527       for (; note->pos == cur; ++note)
1528         {
1529           switch (note->type)
1530             {
1531             case '\\':
1532             case ' ':
1533               /* Restore backslash followed by newline.  */
1534               BUF_APPEND (base, cur - base);
1535               base = cur;
1536               BUF_APPEND ("\\", 1);
1537             after_backslash:
1538               if (note->type == ' ')
1539                 {
1540                   /* GNU backslash whitespace newline extension.  FIXME
1541                      could be any sequence of non-vertical space.  When we
1542                      can properly restore any such sequence, we should mark
1543                      this note as handled so _cpp_process_line_notes
1544                      doesn't warn.  */
1545                   BUF_APPEND (" ", 1);
1546                 }
1547
1548               BUF_APPEND ("\n", 1);
1549               break;
1550
1551             case 0:
1552               /* Already handled.  */
1553               break;
1554
1555             default:
1556               if (_cpp_trigraph_map[note->type])
1557                 {
1558                   /* Don't warn about this trigraph in
1559                      _cpp_process_line_notes, since trigraphs show up as
1560                      trigraphs in raw strings.  */
1561                   uchar type = note->type;
1562                   note->type = 0;
1563
1564                   if (!CPP_OPTION (pfile, trigraphs))
1565                     /* If we didn't convert the trigraph in the first
1566                        place, don't do anything now either.  */
1567                     break;
1568
1569                   BUF_APPEND (base, cur - base);
1570                   base = cur;
1571                   BUF_APPEND ("??", 2);
1572
1573                   /* ??/ followed by newline gets two line notes, one for
1574                      the trigraph and one for the backslash/newline.  */
1575                   if (type == '/' && note[1].pos == cur)
1576                     {
1577                       if (note[1].type != '\\'
1578                           && note[1].type != ' ')
1579                         abort ();
1580                       BUF_APPEND ("/", 1);
1581                       ++note;
1582                       goto after_backslash;
1583                     }
1584                   /* The ) from ??) could be part of the suffix.  */
1585                   else if (type == ')'
1586                            && strncmp ((const char *) cur+1,
1587                                        (const char *) raw_prefix,
1588                                        raw_prefix_len) == 0
1589                            && cur[raw_prefix_len+1] == '"')
1590                     {
1591                       BUF_APPEND (")", 1);
1592                       base++;
1593                       cur += raw_prefix_len + 2;
1594                       goto break_outer_loop;
1595                     }
1596                   else
1597                     {
1598                       /* Skip the replacement character.  */
1599                       base = ++cur;
1600                       BUF_APPEND (&type, 1);
1601                     }
1602                 }
1603               else
1604                 abort ();
1605               break;
1606             }
1607         }
1608       c = *cur++;
1609
1610       if (c == ')'
1611           && strncmp ((const char *) cur, (const char *) raw_prefix,
1612                       raw_prefix_len) == 0
1613           && cur[raw_prefix_len] == '"')
1614         {
1615           cur += raw_prefix_len + 1;
1616           break;
1617         }
1618       else if (c == '\n')
1619         {
1620           if (pfile->state.in_directive
1621               || pfile->state.parsing_args
1622               || pfile->state.in_deferred_pragma)
1623             {
1624               cur--;
1625               type = CPP_OTHER;
1626               cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, 0,
1627                                    "unterminated raw string");
1628               break;
1629             }
1630
1631           BUF_APPEND (base, cur - base);
1632
1633           if (pfile->buffer->cur < pfile->buffer->rlimit)
1634             CPP_INCREMENT_LINE (pfile, 0);
1635           pfile->buffer->need_line = true;
1636
1637           pfile->buffer->cur = cur-1;
1638           _cpp_process_line_notes (pfile, false);
1639           if (!_cpp_get_fresh_line (pfile))
1640             {
1641               source_location src_loc = token->src_loc;
1642               token->type = CPP_EOF;
1643               /* Tell the compiler the line number of the EOF token.  */
1644               token->src_loc = pfile->line_table->highest_line;
1645               token->flags = BOL;
1646               if (first_buff != NULL)
1647                 _cpp_release_buff (pfile, first_buff);
1648               cpp_error_with_line (pfile, CPP_DL_ERROR, src_loc, 0,
1649                                    "unterminated raw string");
1650               return;
1651             }
1652
1653           cur = base = pfile->buffer->cur;
1654           note = &pfile->buffer->notes[pfile->buffer->cur_note];
1655         }
1656     }
1657  break_outer_loop:
1658
1659   if (CPP_OPTION (pfile, user_literals))
1660     {
1661       /* According to C++11 [lex.ext]p10, a ud-suffix not starting with an
1662          underscore is ill-formed.  Since this breaks programs using macros
1663          from inttypes.h, we generate a warning and treat the ud-suffix as a
1664          separate preprocessing token.  This approach is under discussion by
1665          the standards committee, and has been adopted as a conforming
1666          extension by other front ends such as clang.
1667          A special exception is made for the suffix 's' which will be
1668          standardized as a user-defined literal suffix for strings.  */
1669       if (ISALPHA (*cur) && *cur != 's')
1670         {
1671           /* Raise a warning, but do not consume subsequent tokens.  */
1672           if (CPP_OPTION (pfile, warn_literal_suffix))
1673             cpp_warning_with_line (pfile, CPP_W_LITERAL_SUFFIX,
1674                                    token->src_loc, 0,
1675                                    "invalid suffix on literal; C++11 requires "
1676                                    "a space between literal and identifier");
1677         }
1678       /* Grab user defined literal suffix.  */
1679       else if (ISIDST (*cur))
1680         {
1681           type = cpp_userdef_string_add_type (type);
1682           ++cur;
1683
1684           while (ISIDNUM (*cur))
1685             ++cur;
1686         }
1687     }
1688
1689   pfile->buffer->cur = cur;
1690   if (first_buff == NULL)
1691     create_literal (pfile, token, base, cur - base, type);
1692   else
1693     {
1694       uchar *dest = _cpp_unaligned_alloc (pfile, total_len + (cur - base) + 1);
1695
1696       token->type = type;
1697       token->val.str.len = total_len + (cur - base);
1698       token->val.str.text = dest;
1699       last_buff = first_buff;
1700       while (last_buff != NULL)
1701         {
1702           memcpy (dest, last_buff->base,
1703                   BUFF_FRONT (last_buff) - last_buff->base);
1704           dest += BUFF_FRONT (last_buff) - last_buff->base;
1705           last_buff = last_buff->next;
1706         }
1707       _cpp_release_buff (pfile, first_buff);
1708       memcpy (dest, base, cur - base);
1709       dest[cur - base] = '\0';
1710     }
1711 }
1712
1713 /* Lexes a string, character constant, or angle-bracketed header file
1714    name.  The stored string contains the spelling, including opening
1715    quote and any leading 'L', 'u', 'U' or 'u8' and optional
1716    'R' modifier.  It returns the type of the literal, or CPP_OTHER
1717    if it was not properly terminated, or CPP_LESS for an unterminated
1718    header name which must be relexed as normal tokens.
1719
1720    The spelling is NUL-terminated, but it is not guaranteed that this
1721    is the first NUL since embedded NULs are preserved.  */
1722 static void
1723 lex_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
1724 {
1725   bool saw_NUL = false;
1726   const uchar *cur;
1727   cppchar_t terminator;
1728   enum cpp_ttype type;
1729
1730   cur = base;
1731   terminator = *cur++;
1732   if (terminator == 'L' || terminator == 'U')
1733     terminator = *cur++;
1734   else if (terminator == 'u')
1735     {
1736       terminator = *cur++;
1737       if (terminator == '8')
1738         terminator = *cur++;
1739     }
1740   if (terminator == 'R')
1741     {
1742       lex_raw_string (pfile, token, base, cur);
1743       return;
1744     }
1745   if (terminator == '"')
1746     type = (*base == 'L' ? CPP_WSTRING :
1747             *base == 'U' ? CPP_STRING32 :
1748             *base == 'u' ? (base[1] == '8' ? CPP_UTF8STRING : CPP_STRING16)
1749                          : CPP_STRING);
1750   else if (terminator == '\'')
1751     type = (*base == 'L' ? CPP_WCHAR :
1752             *base == 'U' ? CPP_CHAR32 :
1753             *base == 'u' ? CPP_CHAR16 : CPP_CHAR);
1754   else
1755     terminator = '>', type = CPP_HEADER_NAME;
1756
1757   for (;;)
1758     {
1759       cppchar_t c = *cur++;
1760
1761       /* In #include-style directives, terminators are not escapable.  */
1762       if (c == '\\' && !pfile->state.angled_headers && *cur != '\n')
1763         cur++;
1764       else if (c == terminator)
1765         break;
1766       else if (c == '\n')
1767         {
1768           cur--;
1769           /* Unmatched quotes always yield undefined behavior, but
1770              greedy lexing means that what appears to be an unterminated
1771              header name may actually be a legitimate sequence of tokens.  */
1772           if (terminator == '>')
1773             {
1774               token->type = CPP_LESS;
1775               return;
1776             }
1777           type = CPP_OTHER;
1778           break;
1779         }
1780       else if (c == '\0')
1781         saw_NUL = true;
1782     }
1783
1784   if (saw_NUL && !pfile->state.skipping)
1785     cpp_error (pfile, CPP_DL_WARNING,
1786                "null character(s) preserved in literal");
1787
1788   if (type == CPP_OTHER && CPP_OPTION (pfile, lang) != CLK_ASM)
1789     cpp_error (pfile, CPP_DL_PEDWARN, "missing terminating %c character",
1790                (int) terminator);
1791
1792   if (CPP_OPTION (pfile, user_literals))
1793     {
1794       /* According to C++11 [lex.ext]p10, a ud-suffix not starting with an
1795          underscore is ill-formed.  Since this breaks programs using macros
1796          from inttypes.h, we generate a warning and treat the ud-suffix as a
1797          separate preprocessing token.  This approach is under discussion by
1798          the standards committee, and has been adopted as a conforming
1799          extension by other front ends such as clang.
1800          A special exception is made for the suffix 's' which will be
1801          standardized as a user-defined literal suffix for strings.  */
1802       if (ISALPHA (*cur) && *cur != 's')
1803         {
1804           /* Raise a warning, but do not consume subsequent tokens.  */
1805           if (CPP_OPTION (pfile, warn_literal_suffix))
1806             cpp_warning_with_line (pfile, CPP_W_LITERAL_SUFFIX,
1807                                    token->src_loc, 0,
1808                                    "invalid suffix on literal; C++11 requires "
1809                                    "a space between literal and identifier");
1810         }
1811       /* Grab user defined literal suffix.  */
1812       else if (ISIDST (*cur))
1813         {
1814           type = cpp_userdef_char_add_type (type);
1815           type = cpp_userdef_string_add_type (type);
1816           ++cur;
1817
1818           while (ISIDNUM (*cur))
1819             ++cur;
1820         }
1821     }
1822
1823   pfile->buffer->cur = cur;
1824   create_literal (pfile, token, base, cur - base, type);
1825 }
1826
1827 /* Return the comment table. The client may not make any assumption
1828    about the ordering of the table.  */
1829 cpp_comment_table *
1830 cpp_get_comments (cpp_reader *pfile)
1831 {
1832   return &pfile->comments;
1833 }
1834
1835 /* Append a comment to the end of the comment table. */
1836 static void
1837 store_comment (cpp_reader *pfile, cpp_token *token)
1838 {
1839   int len;
1840
1841   if (pfile->comments.allocated == 0)
1842     {
1843       pfile->comments.allocated = 256;
1844       pfile->comments.entries = (cpp_comment *) xmalloc
1845         (pfile->comments.allocated * sizeof (cpp_comment));
1846     }
1847
1848   if (pfile->comments.count == pfile->comments.allocated)
1849     {
1850       pfile->comments.allocated *= 2;
1851       pfile->comments.entries = (cpp_comment *) xrealloc
1852         (pfile->comments.entries,
1853          pfile->comments.allocated * sizeof (cpp_comment));
1854     }
1855
1856   len = token->val.str.len;
1857
1858   /* Copy comment. Note, token may not be NULL terminated. */
1859   pfile->comments.entries[pfile->comments.count].comment =
1860     (char *) xmalloc (sizeof (char) * (len + 1));
1861   memcpy (pfile->comments.entries[pfile->comments.count].comment,
1862           token->val.str.text, len);
1863   pfile->comments.entries[pfile->comments.count].comment[len] = '\0';
1864
1865   /* Set source location. */
1866   pfile->comments.entries[pfile->comments.count].sloc = token->src_loc;
1867
1868   /* Increment the count of entries in the comment table. */
1869   pfile->comments.count++;
1870 }
1871
1872 /* The stored comment includes the comment start and any terminator.  */
1873 static void
1874 save_comment (cpp_reader *pfile, cpp_token *token, const unsigned char *from,
1875               cppchar_t type)
1876 {
1877   unsigned char *buffer;
1878   unsigned int len, clen, i;
1879
1880   len = pfile->buffer->cur - from + 1; /* + 1 for the initial '/'.  */
1881
1882   /* C++ comments probably (not definitely) have moved past a new
1883      line, which we don't want to save in the comment.  */
1884   if (is_vspace (pfile->buffer->cur[-1]))
1885     len--;
1886
1887   /* If we are currently in a directive or in argument parsing, then
1888      we need to store all C++ comments as C comments internally, and
1889      so we need to allocate a little extra space in that case.
1890
1891      Note that the only time we encounter a directive here is
1892      when we are saving comments in a "#define".  */
1893   clen = ((pfile->state.in_directive || pfile->state.parsing_args)
1894           && type == '/') ? len + 2 : len;
1895
1896   buffer = _cpp_unaligned_alloc (pfile, clen);
1897
1898   token->type = CPP_COMMENT;
1899   token->val.str.len = clen;
1900   token->val.str.text = buffer;
1901
1902   buffer[0] = '/';
1903   memcpy (buffer + 1, from, len - 1);
1904
1905   /* Finish conversion to a C comment, if necessary.  */
1906   if ((pfile->state.in_directive || pfile->state.parsing_args) && type == '/')
1907     {
1908       buffer[1] = '*';
1909       buffer[clen - 2] = '*';
1910       buffer[clen - 1] = '/';
1911       /* As there can be in a C++ comments illegal sequences for C comments
1912          we need to filter them out.  */
1913       for (i = 2; i < (clen - 2); i++)
1914         if (buffer[i] == '/' && (buffer[i - 1] == '*' || buffer[i + 1] == '*'))
1915           buffer[i] = '|';
1916     }
1917
1918   /* Finally store this comment for use by clients of libcpp. */
1919   store_comment (pfile, token);
1920 }
1921
1922 /* Allocate COUNT tokens for RUN.  */
1923 void
1924 _cpp_init_tokenrun (tokenrun *run, unsigned int count)
1925 {
1926   run->base = XNEWVEC (cpp_token, count);
1927   run->limit = run->base + count;
1928   run->next = NULL;
1929 }
1930
1931 /* Returns the next tokenrun, or creates one if there is none.  */
1932 static tokenrun *
1933 next_tokenrun (tokenrun *run)
1934 {
1935   if (run->next == NULL)
1936     {
1937       run->next = XNEW (tokenrun);
1938       run->next->prev = run;
1939       _cpp_init_tokenrun (run->next, 250);
1940     }
1941
1942   return run->next;
1943 }
1944
1945 /* Return the number of not yet processed token in a given
1946    context.  */
1947 int
1948 _cpp_remaining_tokens_num_in_context (cpp_context *context)
1949 {
1950   if (context->tokens_kind == TOKENS_KIND_DIRECT)
1951     return (LAST (context).token - FIRST (context).token);
1952   else if (context->tokens_kind == TOKENS_KIND_INDIRECT
1953            || context->tokens_kind == TOKENS_KIND_EXTENDED)
1954     return (LAST (context).ptoken - FIRST (context).ptoken);
1955   else
1956       abort ();
1957 }
1958
1959 /* Returns the token present at index INDEX in a given context.  If
1960    INDEX is zero, the next token to be processed is returned.  */
1961 static const cpp_token*
1962 _cpp_token_from_context_at (cpp_context *context, int index)
1963 {
1964   if (context->tokens_kind == TOKENS_KIND_DIRECT)
1965     return &(FIRST (context).token[index]);
1966   else if (context->tokens_kind == TOKENS_KIND_INDIRECT
1967            || context->tokens_kind == TOKENS_KIND_EXTENDED)
1968     return FIRST (context).ptoken[index];
1969  else
1970    abort ();
1971 }
1972
1973 /* Look ahead in the input stream.  */
1974 const cpp_token *
1975 cpp_peek_token (cpp_reader *pfile, int index)
1976 {
1977   cpp_context *context = pfile->context;
1978   const cpp_token *peektok;
1979   int count;
1980
1981   /* First, scan through any pending cpp_context objects.  */
1982   while (context->prev)
1983     {
1984       ptrdiff_t sz = _cpp_remaining_tokens_num_in_context (context);
1985
1986       if (index < (int) sz)
1987         return _cpp_token_from_context_at (context, index);
1988       index -= (int) sz;
1989       context = context->prev;
1990     }
1991
1992   /* We will have to read some new tokens after all (and do so
1993      without invalidating preceding tokens).  */
1994   count = index;
1995   pfile->keep_tokens++;
1996
1997   do
1998     {
1999       peektok = _cpp_lex_token (pfile);
2000       if (peektok->type == CPP_EOF)
2001         return peektok;
2002     }
2003   while (index--);
2004
2005   _cpp_backup_tokens_direct (pfile, count + 1);
2006   pfile->keep_tokens--;
2007
2008   return peektok;
2009 }
2010
2011 /* Allocate a single token that is invalidated at the same time as the
2012    rest of the tokens on the line.  Has its line and col set to the
2013    same as the last lexed token, so that diagnostics appear in the
2014    right place.
     
        The new token takes the slot at pfile->cur_token, which is then
        advanced.  If lookahead tokens are pending (pfile->lookaheads is
        nonzero) they are first moved up by one slot, spilling into the
        following token run when the current one has no room left.  */
2015 cpp_token *
2016 _cpp_temp_token (cpp_reader *pfile)
2017 {
2018   cpp_token *old, *result;
       /* SZ: slots left between cur_token and the end of the current run.
          LA: number of pending lookahead tokens.  */
2019   ptrdiff_t sz = pfile->cur_run->limit - pfile->cur_token;
2020   ptrdiff_t la = (ptrdiff_t) pfile->lookaheads;
2021
       /* The token just before cur_token; its location is copied to the
          result at the end.  */
2022   old = pfile->cur_token - 1;
2023   /* Any pre-existing lookaheads must not be clobbered.  */
2024   if (la)
2025     {
           /* The lookaheads fill the rest of this run (SZ == LA) or
              continue into the next one (SZ < LA), so moving them up by
              one slot pushes a token across the run boundary.  */
2026       if (sz <= la)
2027         {
2028           tokenrun *next = next_tokenrun (pfile->cur_run);
2029
               /* Make room at the start of the next run by moving the
                  LA - SZ tokens already there up by one.  */
2030           if (sz < la)
2031             memmove (next->base + 1, next->base,
2032                      (la - sz) * sizeof (cpp_token));
2033
               /* The last token of the current run becomes the first
                  token of the next run.  */
2034           next->base[0] = pfile->cur_run->limit[-1];
2035         }
2036
           /* Shift the lookaheads that stay in the current run up by one
              slot: all LA of them, or SZ - 1 if the run is too short.  */
2037       if (sz > 1)
2038         memmove (pfile->cur_token + 1, pfile->cur_token,
2039                  MIN (la, sz - 1) * sizeof (cpp_token));
2040     }
2041
       /* No free slot in the current run: continue in the next run.  */
2042   if (!sz && pfile->cur_token == pfile->cur_run->limit)
2043     {
2044       pfile->cur_run = next_tokenrun (pfile->cur_run);
2045       pfile->cur_token = pfile->cur_run->base;
2046     }
2047
2048   result = pfile->cur_token++;
2049   result->src_loc = old->src_loc;
2050   return result;
2051 }
2052
2053 /* Lex a token into RESULT (external interface).  Takes care of issues
2054    like directive handling, token lookahead, multiple include
2055    optimization and skipping.  */
2056 const cpp_token *
2057 _cpp_lex_token (cpp_reader *pfile)
2058 {
2059   cpp_token *result;
2060
2061   for (;;)
2062     {
2063       if (pfile->cur_token == pfile->cur_run->limit)
2064         {
2065           pfile->cur_run = next_tokenrun (pfile->cur_run);
2066           pfile->cur_token = pfile->cur_run->base;
2067         }
2068       /* We assume that the current token is somewhere in the current
2069          run.  */
2070       if (pfile->cur_token < pfile->cur_run->base
2071           || pfile->cur_token >= pfile->cur_run->limit)
2072         abort ();
2073
2074       if (pfile->lookaheads)
2075         {
2076           pfile->lookaheads--;
2077           result = pfile->cur_token++;
2078         }
2079       else
2080         result = _cpp_lex_direct (pfile);
2081
2082       if (result->flags & BOL)
2083         {
2084           /* Is this a directive.  If _cpp_handle_directive returns
2085              false, it is an assembler #.  */
2086           if (result->type == CPP_HASH
2087               /* 6.10.3 p 11: Directives in a list of macro arguments
2088                  gives undefined behavior.  This implementation
2089                  handles the directive as normal.  */
2090               && pfile->state.parsing_args != 1)
2091             {
2092               if (_cpp_handle_directive (pfile, result->flags & PREV_WHITE))
2093                 {
2094                   if (pfile->directive_result.type == CPP_PADDING)
2095                     continue;
2096                   result = &pfile->directive_result;
2097                 }
2098             }
2099           else if (pfile->state.in_deferred_pragma)
2100             result = &pfile->directive_result;
2101
2102           if (pfile->cb.line_change && !pfile->state.skipping)
2103             pfile->cb.line_change (pfile, result, pfile->state.parsing_args);
2104         }
2105
2106       /* We don't skip tokens in directives.  */
2107       if (pfile->state.in_directive || pfile->state.in_deferred_pragma)
2108         break;
2109
2110       /* Outside a directive, invalidate controlling macros.  At file
2111          EOF, _cpp_lex_direct takes care of popping the buffer, so we never
2112          get here and MI optimization works.  */
2113       pfile->mi_valid = false;
2114
2115       if (!pfile->state.skipping || result->type == CPP_EOF)
2116         break;
2117     }
2118
2119   return result;
2120 }
2121
2122 /* Returns true if a fresh line has been loaded.  */
2123 bool
2124 _cpp_get_fresh_line (cpp_reader *pfile)
2125 {
2126   int return_at_eof;
2127
2128   /* We can't get a new line until we leave the current directive.  */
2129   if (pfile->state.in_directive)
2130     return false;
2131
2132   for (;;)
2133     {
2134       cpp_buffer *buffer = pfile->buffer;
2135
2136       if (!buffer->need_line)
2137         return true;
2138
2139       if (buffer->next_line < buffer->rlimit)
2140         {
2141           _cpp_clean_line (pfile);
2142           return true;
2143         }
2144
2145       /* First, get out of parsing arguments state.  */
2146       if (pfile->state.parsing_args)
2147         return false;
2148
2149       /* End of buffer.  Non-empty files should end in a newline.  */
2150       if (buffer->buf != buffer->rlimit
2151           && buffer->next_line > buffer->rlimit
2152           && !buffer->from_stage3)
2153         {
2154           /* Clip to buffer size.  */
2155           buffer->next_line = buffer->rlimit;
2156         }
2157
2158       return_at_eof = buffer->return_at_eof;
2159       _cpp_pop_buffer (pfile);
2160       if (pfile->buffer == NULL || return_at_eof)
2161         return false;
2162     }
2163 }
2164
/* Helper for lexing two-character operators.  It expects the locals
   `result' (the token being built) and `buffer' (the current input
   buffer) to be in scope at the point of use.  If the next input
   character is CHAR, consume it and set the token type to THEN_TYPE;
   otherwise leave the input alone and set the type to ELSE_TYPE.
   The arguments are parenthesized so that any expression can be
   passed safely.  */
#define IF_NEXT_IS(CHAR, THEN_TYPE, ELSE_TYPE)          \
  do                                                    \
    {                                                   \
      result->type = (ELSE_TYPE);                       \
      if (*buffer->cur == (CHAR))                       \
        buffer->cur++, result->type = (THEN_TYPE);      \
    }                                                   \
  while (0)
2173
2174 /* Lex a token into pfile->cur_token, which is also incremented, to
2175    get diagnostics pointing to the correct location.
2176
2177    Does not handle issues such as token lookahead, multiple-include
2178    optimization, directives, skipping etc.  This function is only
2179    suitable for use by _cpp_lex_token, and in special cases like
2180    lex_expansion_token which doesn't care for any of these issues.
2181
2182    When meeting a newline, returns CPP_EOF if parsing a directive,
2183    otherwise returns to the start of the token buffer if permissible.
2184    Returns a pointer to the lexed token.  */
2185 cpp_token *
2186 _cpp_lex_direct (cpp_reader *pfile)
2187 {
2188   cppchar_t c;
2189   cpp_buffer *buffer;
2190   const unsigned char *comment_start;
       /* Claim the slot at cur_token for the token about to be lexed.  */
2191   cpp_token *result = pfile->cur_token++;
2192
       /* Entered at the start, and again each time a newline has been
          consumed.  */
2193  fresh_line:
2194   result->flags = 0;
2195   buffer = pfile->buffer;
2196   if (buffer->need_line)
2197     {
           /* The line that held a deferred pragma is finished: report its
              end and leave deferred-pragma state.  */
2198       if (pfile->state.in_deferred_pragma)
2199         {
2200           result->type = CPP_PRAGMA_EOL;
2201           pfile->state.in_deferred_pragma = false;
2202           if (!pfile->state.pragma_allow_expansion)
2203             pfile->state.prevent_expansion--;
2204           return result;
2205         }
2206       if (!_cpp_get_fresh_line (pfile))
2207         {
2208           result->type = CPP_EOF;
2209           if (!pfile->state.in_directive)
2210             {
2211               /* Tell the compiler the line number of the EOF token.  */
2212               result->src_loc = pfile->line_table->highest_line;
2213               result->flags = BOL;
2214             }
2215           return result;
2216         }
           /* Unless earlier tokens must be kept, reuse the token storage
              from the start of the base run.  */
2217       if (!pfile->keep_tokens)
2218         {
2219           pfile->cur_run = &pfile->base_run;
2220           result = pfile->base_run.base;
2221           pfile->cur_token = result + 1;
2222         }
2223       result->flags = BOL;
2224       if (pfile->state.parsing_args == 2)
2225         result->flags |= PREV_WHITE;
2226     }
       /* Reload: _cpp_get_fresh_line may have changed pfile->buffer.  */
2227   buffer = pfile->buffer;
2228  update_tokens_line:
2229   result->src_loc = pfile->line_table->highest_line;
2230
       /* Re-entered after horizontal whitespace has been skipped.  */
2231  skipped_white:
       /* Handle any line notes at or before the current position, unless
          an overlaid buffer is in use.  */
2232   if (buffer->cur >= buffer->notes[buffer->cur_note].pos
2233       && !pfile->overlaid_buffer)
2234     {
2235       _cpp_process_line_notes (pfile, false);
2236       result->src_loc = pfile->line_table->highest_line;
2237     }
2238   c = *buffer->cur++;
2239
       /* Record where the token starts, unless a forced location is in
          effect.  */
2240   if (pfile->forced_token_location_p)
2241     result->src_loc = *pfile->forced_token_location_p;
2242   else
2243     result->src_loc = linemap_position_for_column (pfile->line_table,
2244                                           CPP_BUF_COLUMN (buffer, buffer->cur));
2245
2246   switch (c)
2247     {
2248     case ' ': case '\t': case '\f': case '\v': case '\0':
2249       result->flags |= PREV_WHITE;
2250       skip_whitespace (pfile, c);
2251       goto skipped_white;
2252
2253     case '\n':
2254       if (buffer->cur < buffer->rlimit)
2255         CPP_INCREMENT_LINE (pfile, 0);
2256       buffer->need_line = true;
2257       goto fresh_line;
2258
2259     case '0': case '1': case '2': case '3': case '4':
2260     case '5': case '6': case '7': case '8': case '9':
2261       {
2262         struct normalize_state nst = INITIAL_NORMALIZE_STATE;
2263         result->type = CPP_NUMBER;
2264         lex_number (pfile, &result->val.str, &nst);
2265         warn_about_normalization (pfile, result, &nst);
2266         break;
2267       }
2268
2269     case 'L':
2270     case 'u':
2271     case 'U':
2272     case 'R':
2273       /* 'L', 'u', 'U', 'u8' or 'R' may introduce wide characters,
2274          wide strings or raw strings.  */
2275       if (c == 'L' || CPP_OPTION (pfile, rliterals)
2276           || (c != 'R' && CPP_OPTION (pfile, uliterals)))
2277         {
               /* Accepted after the prefix: a quote (a single quote only
                  when the prefix is not 'R'), R" when raw literals are
                  enabled, or for 'u' an '8' followed by " or by R".  */
2278           if ((*buffer->cur == '\'' && c != 'R')
2279               || *buffer->cur == '"'
2280               || (*buffer->cur == 'R'
2281                   && c != 'R'
2282                   && buffer->cur[1] == '"'
2283                   && CPP_OPTION (pfile, rliterals))
2284               || (*buffer->cur == '8'
2285                   && c == 'u'
2286                   && (buffer->cur[1] == '"'
2287                       || (buffer->cur[1] == 'R' && buffer->cur[2] == '"'
2288                           && CPP_OPTION (pfile, rliterals)))))
2289             {
2290               lex_string (pfile, result, buffer->cur - 1);
2291               break;
2292             }
2293         }
2294       /* Fall through.  */
2295
2296     case '_':
2297     case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
2298     case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
2299     case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
2300     case 's': case 't':           case 'v': case 'w': case 'x':
2301     case 'y': case 'z':
2302     case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
2303     case 'G': case 'H': case 'I': case 'J': case 'K':
2304     case 'M': case 'N': case 'O': case 'P': case 'Q':
2305     case 'S': case 'T':           case 'V': case 'W': case 'X':
2306     case 'Y': case 'Z':
2307       result->type = CPP_NAME;
2308       {
2309         struct normalize_state nst = INITIAL_NORMALIZE_STATE;
2310         result->val.node.node = lex_identifier (pfile, buffer->cur - 1, false,
2311                                                 &nst);
2312         warn_about_normalization (pfile, result, &nst);
2313       }
2314
2315       /* Convert named operators to their proper types.  */
2316       if (result->val.node.node->flags & NODE_OPERATOR)
2317         {
2318           result->flags |= NAMED_OP;
2319           result->type = (enum cpp_ttype) result->val.node.node->directive_index;
2320         }
2321       break;
2322
2323     case '\'':
2324     case '"':
2325       lex_string (pfile, result, buffer->cur - 1);
2326       break;
2327
2328     case '/':
2329       /* A potential block or line comment.  */
2330       comment_start = buffer->cur;
2331       c = *buffer->cur;
2332
2333       if (c == '*')
2334         {
2335           if (_cpp_skip_block_comment (pfile))
2336             cpp_error (pfile, CPP_DL_ERROR, "unterminated comment");
2337         }
2338       else if (c == '/' && (CPP_OPTION (pfile, cplusplus_comments)
2339                             || cpp_in_system_header (pfile)))
2340         {
2341           /* Warn about comments only if pedantically GNUC89, and not
2342              in system headers.  */
2343           if (CPP_OPTION (pfile, lang) == CLK_GNUC89 && CPP_PEDANTIC (pfile)
2344               && ! buffer->warned_cplusplus_comments)
2345             {
2346               cpp_error (pfile, CPP_DL_PEDWARN,
2347                          "C++ style comments are not allowed in ISO C90");
2348               cpp_error (pfile, CPP_DL_PEDWARN,
2349                          "(this will be reported only once per input file)");
2350               buffer->warned_cplusplus_comments = 1;
2351             }
2352
2353           if (skip_line_comment (pfile) && CPP_OPTION (pfile, warn_comments))
2354             cpp_warning (pfile, CPP_W_COMMENTS, "multi-line comment");
2355         }
2356       else if (c == '=')
2357         {
2358           buffer->cur++;
2359           result->type = CPP_DIV_EQ;
2360           break;
2361         }
2362       else
2363         {
2364           result->type = CPP_DIV;
2365           break;
2366         }
2367
           /* Only the two comment branches reach this point.  When
              comments are not being saved, the comment counts as
              whitespace and lexing continues after it.  */
2368       if (!pfile->state.save_comments)
2369         {
2370           result->flags |= PREV_WHITE;
2371           goto update_tokens_line;
2372         }
2373
2374       /* Save the comment as a token in its own right.  */
2375       save_comment (pfile, result, comment_start, c);
2376       break;
2377
2378     case '<':
2379       if (pfile->state.angled_headers)
2380         {
               /* lex_string sets the type to CPP_LESS when the header name
                  is unterminated; in that case carry on and lex the '<'
                  as an ordinary operator.  */
2381           lex_string (pfile, result, buffer->cur - 1);
2382           if (result->type != CPP_LESS)
2383             break;
2384         }
2385
2386       result->type = CPP_LESS;
2387       if (*buffer->cur == '=')
2388         buffer->cur++, result->type = CPP_LESS_EQ;
2389       else if (*buffer->cur == '<')
2390         {
2391           buffer->cur++;
2392           IF_NEXT_IS ('=', CPP_LSHIFT_EQ, CPP_LSHIFT);
2393         }
2394       else if (CPP_OPTION (pfile, digraphs))
2395         {
2396           if (*buffer->cur == ':')
2397             {
2398               /* C++11 [2.5/3 lex.pptoken], "Otherwise, if the next
2399                  three characters are <:: and the subsequent character
2400                  is neither : nor >, the < is treated as a preprocessor
2401                  token by itself".  */
2402               if (CPP_OPTION (pfile, cplusplus)
2403                   && (CPP_OPTION (pfile, lang) == CLK_CXX11
2404                       || CPP_OPTION (pfile, lang) == CLK_GNUCXX11)
2405                   && buffer->cur[1] == ':'
2406                   && buffer->cur[2] != ':' && buffer->cur[2] != '>')
2407                 break;
2408
2409               buffer->cur++;
2410               result->flags |= DIGRAPH;
2411               result->type = CPP_OPEN_SQUARE;
2412             }
2413           else if (*buffer->cur == '%')
2414             {
2415               buffer->cur++;
2416               result->flags |= DIGRAPH;
2417               result->type = CPP_OPEN_BRACE;
2418             }
2419         }
2420       break;
2421
2422     case '>':
2423       result->type = CPP_GREATER;
2424       if (*buffer->cur == '=')
2425         buffer->cur++, result->type = CPP_GREATER_EQ;
2426       else if (*buffer->cur == '>')
2427         {
2428           buffer->cur++;
2429           IF_NEXT_IS ('=', CPP_RSHIFT_EQ, CPP_RSHIFT);
2430         }
2431       break;
2432
2433     case '%':
2434       result->type = CPP_MOD;
2435       if (*buffer->cur == '=')
2436         buffer->cur++, result->type = CPP_MOD_EQ;
2437       else if (CPP_OPTION (pfile, digraphs))
2438         {
2439           if (*buffer->cur == ':')
2440             {
                   /* "%:" is the digraph spelling of '#', and "%:%:" that
                      of "##".  */
2441               buffer->cur++;
2442               result->flags |= DIGRAPH;
2443               result->type = CPP_HASH;
2444               if (*buffer->cur == '%' && buffer->cur[1] == ':')
2445                 buffer->cur += 2, result->type = CPP_PASTE, result->val.token_no = 0;
2446             }
2447           else if (*buffer->cur == '>')
2448             {
2449               buffer->cur++;
2450               result->flags |= DIGRAPH;
2451               result->type = CPP_CLOSE_BRACE;
2452             }
2453         }
2454       break;
2455
2456     case '.':
2457       result->type = CPP_DOT;
2458       if (ISDIGIT (*buffer->cur))
2459         {
2460           struct normalize_state nst = INITIAL_NORMALIZE_STATE;
2461           result->type = CPP_NUMBER;
2462           lex_number (pfile, &result->val.str, &nst);
2463           warn_about_normalization (pfile, result, &nst);
2464         }
2465       else if (*buffer->cur == '.' && buffer->cur[1] == '.')
2466         buffer->cur += 2, result->type = CPP_ELLIPSIS;
2467       else if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
2468         buffer->cur++, result->type = CPP_DOT_STAR;
2469       break;
2470
2471     case '+':
2472       result->type = CPP_PLUS;
2473       if (*buffer->cur == '+')
2474         buffer->cur++, result->type = CPP_PLUS_PLUS;
2475       else if (*buffer->cur == '=')
2476         buffer->cur++, result->type = CPP_PLUS_EQ;
2477       break;
2478
2479     case '-':
2480       result->type = CPP_MINUS;
2481       if (*buffer->cur == '>')
2482         {
2483           buffer->cur++;
2484           result->type = CPP_DEREF;
2485           if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
2486             buffer->cur++, result->type = CPP_DEREF_STAR;
2487         }
2488       else if (*buffer->cur == '-')
2489         buffer->cur++, result->type = CPP_MINUS_MINUS;
2490       else if (*buffer->cur == '=')
2491         buffer->cur++, result->type = CPP_MINUS_EQ;
2492       break;
2493
2494     case '&':
2495       result->type = CPP_AND;
2496       if (*buffer->cur == '&')
2497         buffer->cur++, result->type = CPP_AND_AND;
2498       else if (*buffer->cur == '=')
2499         buffer->cur++, result->type = CPP_AND_EQ;
2500       break;
2501
2502     case '|':
2503       result->type = CPP_OR;
2504       if (*buffer->cur == '|')
2505         buffer->cur++, result->type = CPP_OR_OR;
2506       else if (*buffer->cur == '=')
2507         buffer->cur++, result->type = CPP_OR_EQ;
2508       break;
2509
2510     case ':':
2511       result->type = CPP_COLON;
2512       if (*buffer->cur == ':' && CPP_OPTION (pfile, cplusplus))
2513         buffer->cur++, result->type = CPP_SCOPE;
2514       else if (*buffer->cur == '>' && CPP_OPTION (pfile, digraphs))
2515         {
2516           buffer->cur++;
2517           result->flags |= DIGRAPH;
2518           result->type = CPP_CLOSE_SQUARE;
2519         }
2520       break;
2521
2522     case '*': IF_NEXT_IS ('=', CPP_MULT_EQ, CPP_MULT); break;
2523     case '=': IF_NEXT_IS ('=', CPP_EQ_EQ, CPP_EQ); break;
2524     case '!': IF_NEXT_IS ('=', CPP_NOT_EQ, CPP_NOT); break;
2525     case '^': IF_NEXT_IS ('=', CPP_XOR_EQ, CPP_XOR); break;
2526     case '#': IF_NEXT_IS ('#', CPP_PASTE, CPP_HASH); result->val.token_no = 0; break;
2527
2528     case '?': result->type = CPP_QUERY; break;
2529     case '~': result->type = CPP_COMPL; break;
2530     case ',': result->type = CPP_COMMA; break;
2531     case '(': result->type = CPP_OPEN_PAREN; break;
2532     case ')': result->type = CPP_CLOSE_PAREN; break;
2533     case '[': result->type = CPP_OPEN_SQUARE; break;
2534     case ']': result->type = CPP_CLOSE_SQUARE; break;
2535     case '{': result->type = CPP_OPEN_BRACE; break;
2536     case '}': result->type = CPP_CLOSE_BRACE; break;
2537     case ';': result->type = CPP_SEMICOLON; break;
2538
2539       /* @ is a punctuator in Objective-C.  */
2540     case '@': result->type = CPP_ATSIGN; break;
2541
2542     case '$':
2543     case '\\':
2544       {
             /* Step back onto the character and let forms_identifier_p
                decide whether it starts an identifier.  */
2545         const uchar *base = --buffer->cur;
2546         struct normalize_state nst = INITIAL_NORMALIZE_STATE;
2547
2548         if (forms_identifier_p (pfile, true, &nst))
2549           {
2550             result->type = CPP_NAME;
2551             result->val.node.node = lex_identifier (pfile, base, true, &nst);
2552             warn_about_normalization (pfile, result, &nst);
2553             break;
2554           }
             /* Not an identifier: consume the character again and treat
                it like any other stray character.  */
2555         buffer->cur++;
2556       }
           /* Fall through.  */
2557
2558     default:
2559       create_literal (pfile, result, buffer->cur - 1, 1, CPP_OTHER);
2560       break;
2561     }
2562
2563   return result;
2564 }
2565
2566 /* An upper bound on the number of bytes needed to spell TOKEN.
2567    Does not include preceding whitespace.  */
2568 unsigned int
2569 cpp_token_len (const cpp_token *token)
2570 {
2571   unsigned int len;
2572
2573   switch (TOKEN_SPELL (token))
2574     {
2575     default:            len = 6;                                break;
2576     case SPELL_LITERAL: len = token->val.str.len;               break;
2577     case SPELL_IDENT:   len = NODE_LEN (token->val.node.node) * 10;     break;
2578     }
2579
2580   return len;
2581 }
2582
/* Decode the UTF-8 sequence that starts at NAME and write it to
   BUFFER as a \U escape with eight lowercase hex digits.  Exactly 10
   bytes are written to BUFFER, with no terminating NUL.  Returns the
   number of bytes of NAME that made up the sequence.  Aborts if a
   byte after the first is not a UTF-8 continuation byte.  */

static size_t
utf8_to_ucn (unsigned char *buffer, const unsigned char *name)
{
  static const char hex_digits[] = "0123456789abcdef";
  const unsigned char *p = name;
  unsigned char *out = buffer;
  unsigned lead = *p;
  unsigned long code_point;
  int seq_len = 0;
  int remaining;
  int shift;

  /* The number of leading one bits in the first byte is the length
     of the whole sequence.  */
  while (lead & 0x80)
    {
      seq_len++;
      lead <<= 1;
    }

  /* The payload of the first byte is whatever follows its length
     marker bits.  */
  code_point = *p & (0x7F >> seq_len);

  /* Every following byte contributes six more bits.  */
  for (remaining = seq_len - 1; remaining > 0; remaining--)
    {
      unsigned char cont = *++p;

      code_point = (code_point << 6) | (cont & 0x3F);

      /* Ill-formed UTF-8.  */
      if ((cont & 0xC0) != 0x80)
        abort ();
    }

  *out++ = '\\';
  *out++ = 'U';
  for (shift = 28; shift >= 0; shift -= 4)
    *out++ = hex_digits[(code_point >> shift) & 0xF];

  return seq_len;
}
2616
2617 /* Given a token TYPE corresponding to a digraph, return a pointer to
2618    the spelling of the digraph.  */
2619 static const unsigned char *
2620 cpp_digraph2name (enum cpp_ttype type)
2621 {
2622   return digraph_spellings[(int) type - (int) CPP_FIRST_DIGRAPH];
2623 }
2624
2625 /* Write the spelling of a token TOKEN to BUFFER.  The buffer must
2626    already contain the enough space to hold the token's spelling.
2627    Returns a pointer to the character after the last character written.
2628    FORSTRING is true if this is to be the spelling after translation
2629    phase 1 (this is different for UCNs).
2630    FIXME: Would be nice if we didn't need the PFILE argument.  */
2631 unsigned char *
2632 cpp_spell_token (cpp_reader *pfile, const cpp_token *token,
2633                  unsigned char *buffer, bool forstring)
2634 {
2635   switch (TOKEN_SPELL (token))
2636     {
2637     case SPELL_OPERATOR:
2638       {
2639         const unsigned char *spelling;
2640         unsigned char c;
2641
2642         if (token->flags & DIGRAPH)
2643           spelling = cpp_digraph2name (token->type);
2644         else if (token->flags & NAMED_OP)
2645           goto spell_ident;
2646         else
2647           spelling = TOKEN_NAME (token);
2648
2649         while ((c = *spelling++) != '\0')
2650           *buffer++ = c;
2651       }
2652       break;
2653
2654     spell_ident:
2655     case SPELL_IDENT:
2656       if (forstring)
2657         {
2658           memcpy (buffer, NODE_NAME (token->val.node.node),
2659                   NODE_LEN (token->val.node.node));
2660           buffer += NODE_LEN (token->val.node.node);
2661         }
2662       else
2663         {
2664           size_t i;
2665           const unsigned char * name = NODE_NAME (token->val.node.node);
2666
2667           for (i = 0; i < NODE_LEN (token->val.node.node); i++)
2668             if (name[i] & ~0x7F)
2669               {
2670                 i += utf8_to_ucn (buffer, name + i) - 1;
2671                 buffer += 10;
2672               }
2673             else
2674               *buffer++ = NODE_NAME (token->val.node.node)[i];
2675         }
2676       break;
2677
2678     case SPELL_LITERAL:
2679       memcpy (buffer, token->val.str.text, token->val.str.len);
2680       buffer += token->val.str.len;
2681       break;
2682
2683     case SPELL_NONE:
2684       cpp_error (pfile, CPP_DL_ICE,
2685                  "unspellable token %s", TOKEN_NAME (token));
2686       break;
2687     }
2688
2689   return buffer;
2690 }
2691
2692 /* Returns TOKEN spelt as a null-terminated string.  The string is
2693    freed when the reader is destroyed.  Useful for diagnostics.  */
2694 unsigned char *
2695 cpp_token_as_text (cpp_reader *pfile, const cpp_token *token)
2696 {
2697   unsigned int len = cpp_token_len (token) + 1;
2698   unsigned char *start = _cpp_unaligned_alloc (pfile, len), *end;
2699
2700   end = cpp_spell_token (pfile, token, start, false);
2701   end[0] = '\0';
2702
2703   return start;
2704 }
2705
2706 /* Returns a pointer to a string which spells the token defined by
2707    TYPE and FLAGS.  Used by C front ends, which really should move to
2708    using cpp_token_as_text.  */
2709 const char *
2710 cpp_type2name (enum cpp_ttype type, unsigned char flags)
2711 {
2712   if (flags & DIGRAPH)
2713     return (const char *) cpp_digraph2name (type);
2714   else if (flags & NAMED_OP)
2715     return cpp_named_operator2name (type);
2716
2717   return (const char *) token_spellings[type].name;
2718 }
2719
2720 /* Writes the spelling of token to FP, without any preceding space.
2721    Separated from cpp_spell_token for efficiency - to avoid stdio
2722    double-buffering.  */
2723 void
2724 cpp_output_token (const cpp_token *token, FILE *fp)
2725 {
2726   switch (TOKEN_SPELL (token))
2727     {
2728     case SPELL_OPERATOR:
2729       {
2730         const unsigned char *spelling;
2731         int c;
2732
2733         if (token->flags & DIGRAPH)
2734           spelling = cpp_digraph2name (token->type);
2735         else if (token->flags & NAMED_OP)
2736           goto spell_ident;
2737         else
2738           spelling = TOKEN_NAME (token);
2739
2740         c = *spelling;
2741         do
2742           putc (c, fp);
2743         while ((c = *++spelling) != '\0');
2744       }
2745       break;
2746
2747     spell_ident:
2748     case SPELL_IDENT:
2749       {
2750         size_t i;
2751         const unsigned char * name = NODE_NAME (token->val.node.node);
2752
2753         for (i = 0; i < NODE_LEN (token->val.node.node); i++)
2754           if (name[i] & ~0x7F)
2755             {
2756               unsigned char buffer[10];
2757               i += utf8_to_ucn (buffer, name + i) - 1;
2758               fwrite (buffer, 1, 10, fp);
2759             }
2760           else
2761             fputc (NODE_NAME (token->val.node.node)[i], fp);
2762       }
2763       break;
2764
2765     case SPELL_LITERAL:
2766       fwrite (token->val.str.text, 1, token->val.str.len, fp);
2767       break;
2768
2769     case SPELL_NONE:
2770       /* An error, most probably.  */
2771       break;
2772     }
2773 }
2774
2775 /* Compare two tokens.  */
2776 int
2777 _cpp_equiv_tokens (const cpp_token *a, const cpp_token *b)
2778 {
2779   if (a->type == b->type && a->flags == b->flags)
2780     switch (TOKEN_SPELL (a))
2781       {
2782       default:                  /* Keep compiler happy.  */
2783       case SPELL_OPERATOR:
2784         /* token_no is used to track where multiple consecutive ##
2785            tokens were originally located.  */
2786         return (a->type != CPP_PASTE || a->val.token_no == b->val.token_no);
2787       case SPELL_NONE:
2788         return (a->type != CPP_MACRO_ARG
2789                 || a->val.macro_arg.arg_no == b->val.macro_arg.arg_no);
2790       case SPELL_IDENT:
2791         return a->val.node.node == b->val.node.node;
2792       case SPELL_LITERAL:
2793         return (a->val.str.len == b->val.str.len
2794                 && !memcmp (a->val.str.text, b->val.str.text,
2795                             a->val.str.len));
2796       }
2797
2798   return 0;
2799 }
2800
2801 /* Returns nonzero if a space should be inserted to avoid an
2802    accidental token paste for output.  For simplicity, it is
2803    conservative, and occasionally advises a space where one is not
2804    needed, e.g. "." and ".2".  */
2805 int
2806 cpp_avoid_paste (cpp_reader *pfile, const cpp_token *token1,
2807                  const cpp_token *token2)
2808 {
2809   enum cpp_ttype a = token1->type, b = token2->type;
2810   cppchar_t c;
2811
2812   if (token1->flags & NAMED_OP)
2813     a = CPP_NAME;
2814   if (token2->flags & NAMED_OP)
2815     b = CPP_NAME;
2816
2817   c = EOF;
2818   if (token2->flags & DIGRAPH)
2819     c = digraph_spellings[(int) b - (int) CPP_FIRST_DIGRAPH][0];
2820   else if (token_spellings[b].category == SPELL_OPERATOR)
2821     c = token_spellings[b].name[0];
2822
2823   /* Quickly get everything that can paste with an '='.  */
2824   if ((int) a <= (int) CPP_LAST_EQ && c == '=')
2825     return 1;
2826
2827   switch (a)
2828     {
2829     case CPP_GREATER:   return c == '>';
2830     case CPP_LESS:      return c == '<' || c == '%' || c == ':';
2831     case CPP_PLUS:      return c == '+';
2832     case CPP_MINUS:     return c == '-' || c == '>';
2833     case CPP_DIV:       return c == '/' || c == '*'; /* Comments.  */
2834     case CPP_MOD:       return c == ':' || c == '>';
2835     case CPP_AND:       return c == '&';
2836     case CPP_OR:        return c == '|';
2837     case CPP_COLON:     return c == ':' || c == '>';
2838     case CPP_DEREF:     return c == '*';
2839     case CPP_DOT:       return c == '.' || c == '%' || b == CPP_NUMBER;
2840     case CPP_HASH:      return c == '#' || c == '%'; /* Digraph form.  */
2841     case CPP_NAME:      return ((b == CPP_NUMBER
2842                                  && name_p (pfile, &token2->val.str))
2843                                 || b == CPP_NAME
2844                                 || b == CPP_CHAR || b == CPP_STRING); /* L */
2845     case CPP_NUMBER:    return (b == CPP_NUMBER || b == CPP_NAME
2846                                 || c == '.' || c == '+' || c == '-');
2847                                       /* UCNs */
2848     case CPP_OTHER:     return ((token1->val.str.text[0] == '\\'
2849                                  && b == CPP_NAME)
2850                                 || (CPP_OPTION (pfile, objc)
2851                                     && token1->val.str.text[0] == '@'
2852                                     && (b == CPP_NAME || b == CPP_STRING)));
2853     default:            break;
2854     }
2855
2856   return 0;
2857 }
2858
2859 /* Output all the remaining tokens on the current line, and a newline
2860    character, to FP.  Leading whitespace is removed.  If there are
2861    macros, special token padding is not performed.  */
2862 void
2863 cpp_output_line (cpp_reader *pfile, FILE *fp)
2864 {
2865   const cpp_token *token;
2866
2867   token = cpp_get_token (pfile);
2868   while (token->type != CPP_EOF)
2869     {
2870       cpp_output_token (token, fp);
2871       token = cpp_get_token (pfile);
2872       if (token->flags & PREV_WHITE)
2873         putc (' ', fp);
2874     }
2875
2876   putc ('\n', fp);
2877 }
2878
2879 /* Return a string representation of all the remaining tokens on the
2880    current line.  The result is allocated using xmalloc and must be
2881    freed by the caller.  */
2882 unsigned char *
2883 cpp_output_line_to_string (cpp_reader *pfile, const unsigned char *dir_name)
2884 {
2885   const cpp_token *token;
2886   unsigned int out = dir_name ? ustrlen (dir_name) : 0;
2887   unsigned int alloced = 120 + out;
2888   unsigned char *result = (unsigned char *) xmalloc (alloced);
2889
2890   /* If DIR_NAME is empty, there are no initial contents.  */
2891   if (dir_name)
2892     {
2893       sprintf ((char *) result, "#%s ", dir_name);
2894       out += 2;
2895     }
2896
2897   token = cpp_get_token (pfile);
2898   while (token->type != CPP_EOF)
2899     {
2900       unsigned char *last;
2901       /* Include room for a possible space and the terminating nul.  */
2902       unsigned int len = cpp_token_len (token) + 2;
2903
2904       if (out + len > alloced)
2905         {
2906           alloced *= 2;
2907           if (out + len > alloced)
2908             alloced = out + len;
2909           result = (unsigned char *) xrealloc (result, alloced);
2910         }
2911
2912       last = cpp_spell_token (pfile, token, &result[out], 0);
2913       out = last - result;
2914
2915       token = cpp_get_token (pfile);
2916       if (token->flags & PREV_WHITE)
2917         result[out++] = ' ';
2918     }
2919
2920   result[out] = '\0';
2921   return result;
2922 }
2923
/* Memory buffers.  Changing these three constants can have a dramatic
   effect on performance.  The values here are reasonable defaults,
   but might be tuned.  If you adjust them, be sure to test across a
   range of uses of cpplib, including heavy nested function-like macro
   expansion.  Also check the change in peak memory usage (NJAMD is a
   good tool for this).

   MIN_BUFF_SIZE is the smallest buffer new_buff will create.
   BUFF_SIZE_UPPER_BOUND (MIN_SIZE) is the largest free buffer that
   _cpp_get_buff will hand out for a request of MIN_SIZE bytes.
   EXTENDED_BUFF_SIZE (BUFF, MIN_EXTRA) is the size requested when
   BUFF is extended: MIN_EXTRA plus twice the room left in BUFF.
   Every macro argument is parenthesized so that an argument which is
   itself an expression expands as intended.  */
#define MIN_BUFF_SIZE 8000
#define BUFF_SIZE_UPPER_BOUND(MIN_SIZE) (MIN_BUFF_SIZE + (MIN_SIZE) * 3 / 2)
#define EXTENDED_BUFF_SIZE(BUFF, MIN_EXTRA) \
        ((MIN_EXTRA) + ((BUFF)->limit - (BUFF)->cur) * 2)

#if MIN_BUFF_SIZE > BUFF_SIZE_UPPER_BOUND (0)
  #error BUFF_SIZE_UPPER_BOUND must be at least as large as MIN_BUFF_SIZE!
#endif
2938
2939 /* Create a new allocation buffer.  Place the control block at the end
2940    of the buffer, so that buffer overflows will cause immediate chaos.  */
2941 static _cpp_buff *
2942 new_buff (size_t len)
2943 {
2944   _cpp_buff *result;
2945   unsigned char *base;
2946
2947   if (len < MIN_BUFF_SIZE)
2948     len = MIN_BUFF_SIZE;
2949   len = CPP_ALIGN (len);
2950
2951 #ifdef ENABLE_VALGRIND_CHECKING
2952   /* Valgrind warns about uses of interior pointers, so put _cpp_buff
2953      struct first.  */
2954   size_t slen = CPP_ALIGN2 (sizeof (_cpp_buff), 2 * DEFAULT_ALIGNMENT);
2955   base = XNEWVEC (unsigned char, len + slen);
2956   result = (_cpp_buff *) base;
2957   base += slen;
2958 #else
2959   base = XNEWVEC (unsigned char, len + sizeof (_cpp_buff));
2960   result = (_cpp_buff *) (base + len);
2961 #endif
2962   result->base = base;
2963   result->cur = base;
2964   result->limit = base + len;
2965   result->next = NULL;
2966   return result;
2967 }
2968
2969 /* Place a chain of unwanted allocation buffers on the free list.  */
2970 void
2971 _cpp_release_buff (cpp_reader *pfile, _cpp_buff *buff)
2972 {
2973   _cpp_buff *end = buff;
2974
2975   while (end->next)
2976     end = end->next;
2977   end->next = pfile->free_buffs;
2978   pfile->free_buffs = buff;
2979 }
2980
2981 /* Return a free buffer of size at least MIN_SIZE.  */
2982 _cpp_buff *
2983 _cpp_get_buff (cpp_reader *pfile, size_t min_size)
2984 {
2985   _cpp_buff *result, **p;
2986
2987   for (p = &pfile->free_buffs;; p = &(*p)->next)
2988     {
2989       size_t size;
2990
2991       if (*p == NULL)
2992         return new_buff (min_size);
2993       result = *p;
2994       size = result->limit - result->base;
2995       /* Return a buffer that's big enough, but don't waste one that's
2996          way too big.  */
2997       if (size >= min_size && size <= BUFF_SIZE_UPPER_BOUND (min_size))
2998         break;
2999     }
3000
3001   *p = result->next;
3002   result->next = NULL;
3003   result->cur = result->base;
3004   return result;
3005 }
3006
3007 /* Creates a new buffer with enough space to hold the uncommitted
3008    remaining bytes of BUFF, and at least MIN_EXTRA more bytes.  Copies
3009    the excess bytes to the new buffer.  Chains the new buffer after
3010    BUFF, and returns the new buffer.  */
3011 _cpp_buff *
3012 _cpp_append_extend_buff (cpp_reader *pfile, _cpp_buff *buff, size_t min_extra)
3013 {
3014   size_t size = EXTENDED_BUFF_SIZE (buff, min_extra);
3015   _cpp_buff *new_buff = _cpp_get_buff (pfile, size);
3016
3017   buff->next = new_buff;
3018   memcpy (new_buff->base, buff->cur, BUFF_ROOM (buff));
3019   return new_buff;
3020 }
3021
3022 /* Creates a new buffer with enough space to hold the uncommitted
3023    remaining bytes of the buffer pointed to by BUFF, and at least
3024    MIN_EXTRA more bytes.  Copies the excess bytes to the new buffer.
3025    Chains the new buffer before the buffer pointed to by BUFF, and
3026    updates the pointer to point to the new buffer.  */
3027 void
3028 _cpp_extend_buff (cpp_reader *pfile, _cpp_buff **pbuff, size_t min_extra)
3029 {
3030   _cpp_buff *new_buff, *old_buff = *pbuff;
3031   size_t size = EXTENDED_BUFF_SIZE (old_buff, min_extra);
3032
3033   new_buff = _cpp_get_buff (pfile, size);
3034   memcpy (new_buff->base, old_buff->cur, BUFF_ROOM (old_buff));
3035   new_buff->next = old_buff;
3036   *pbuff = new_buff;
3037 }
3038
3039 /* Free a chain of buffers starting at BUFF.  */
3040 void
3041 _cpp_free_buff (_cpp_buff *buff)
3042 {
3043   _cpp_buff *next;
3044
3045   for (; buff; buff = next)
3046     {
3047       next = buff->next;
3048 #ifdef ENABLE_VALGRIND_CHECKING
3049       free (buff);
3050 #else
3051       free (buff->base);
3052 #endif
3053     }
3054 }
3055
3056 /* Allocate permanent, unaligned storage of length LEN.  */
3057 unsigned char *
3058 _cpp_unaligned_alloc (cpp_reader *pfile, size_t len)
3059 {
3060   _cpp_buff *buff = pfile->u_buff;
3061   unsigned char *result = buff->cur;
3062
3063   if (len > (size_t) (buff->limit - result))
3064     {
3065       buff = _cpp_get_buff (pfile, len);
3066       buff->next = pfile->u_buff;
3067       pfile->u_buff = buff;
3068       result = buff->cur;
3069     }
3070
3071   buff->cur = result + len;
3072   return result;
3073 }
3074
3075 /* Allocate permanent, unaligned storage of length LEN from a_buff.
3076    That buffer is used for growing allocations when saving macro
3077    replacement lists in a #define, and when parsing an answer to an
3078    assertion in #assert, #unassert or #if (and therefore possibly
3079    whilst expanding macros).  It therefore must not be used by any
3080    code that they might call: specifically the lexer and the guts of
3081    the macro expander.
3082
3083    All existing other uses clearly fit this restriction: storing
3084    registered pragmas during initialization.  */
3085 unsigned char *
3086 _cpp_aligned_alloc (cpp_reader *pfile, size_t len)
3087 {
3088   _cpp_buff *buff = pfile->a_buff;
3089   unsigned char *result = buff->cur;
3090
3091   if (len > (size_t) (buff->limit - result))
3092     {
3093       buff = _cpp_get_buff (pfile, len);
3094       buff->next = pfile->a_buff;
3095       pfile->a_buff = buff;
3096       result = buff->cur;
3097     }
3098
3099   buff->cur = result + len;
3100   return result;
3101 }
3102
3103 /* Say which field of TOK is in use.  */
3104
3105 enum cpp_token_fld_kind
3106 cpp_token_val_index (cpp_token *tok)
3107 {
3108   switch (TOKEN_SPELL (tok))
3109     {
3110     case SPELL_IDENT:
3111       return CPP_TOKEN_FLD_NODE;
3112     case SPELL_LITERAL:
3113       return CPP_TOKEN_FLD_STR;
3114     case SPELL_OPERATOR:
3115       if (tok->type == CPP_PASTE)
3116         return CPP_TOKEN_FLD_TOKEN_NO;
3117       else
3118         return CPP_TOKEN_FLD_NONE;
3119     case SPELL_NONE:
3120       if (tok->type == CPP_MACRO_ARG)
3121         return CPP_TOKEN_FLD_ARG_NO;
3122       else if (tok->type == CPP_PADDING)
3123         return CPP_TOKEN_FLD_SOURCE;
3124       else if (tok->type == CPP_PRAGMA)
3125         return CPP_TOKEN_FLD_PRAGMA;
3126       /* else fall through */
3127     default:
3128       return CPP_TOKEN_FLD_NONE;
3129     }
3130 }
3131
3132 /* All tokens lexed in R after calling this function will be forced to have
3133    their source_location the same as the location referenced by P, until
3134    cpp_stop_forcing_token_locations is called for R.  */
3135
3136 void
3137 cpp_force_token_locations (cpp_reader *r, source_location *p)
3138 {
3139   r->forced_token_location_p = p;
3140 }
3141
3142 /* Go back to assigning locations naturally for lexed tokens.  */
3143
3144 void
3145 cpp_stop_forcing_token_locations (cpp_reader *r)
3146 {
3147   r->forced_token_location_p = NULL;
3148 }