libcpp/lex.c

   1 /* CPP Library - lexical analysis.
   2    Copyright (C) 2000-2014 Free Software Foundation, Inc.
   3    Contributed by Per Bothner, 1994-95.
   4    Based on CCCP program by Paul Rubin, June 1986
   5    Adapted to ANSI C, Richard Stallman, Jan 1987
   6    Broken out to separate file, Zack Weinberg, Mar 2000
   7
   8 This program is free software; you can redistribute it and/or modify it
   9 under the terms of the GNU General Public License as published by the
  10 Free Software Foundation; either version 3, or (at your option) any
  11 later version.
  12
  13 This program is distributed in the hope that it will be useful,
  14 but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16 GNU General Public License for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with this program; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "cpplib.h"
  25 #include "internal.h"
  26
     /* Spelling category recorded for each token type in the
        token_spellings table below and read back with TOKEN_SPELL.
        Table rows generated by OP() are always SPELL_OPERATOR; rows
        generated by TK() use the SPELL_* value named by TK's second
        argument.
        NOTE(review): what each category implies for how a token is
        spelled is decided by code outside this part of the file --
        confirm there before relying on the names alone.  */
  27 enum spell_type
  28 {
  29   SPELL_OPERATOR = 0,
  30   SPELL_IDENT,
  31   SPELL_LITERAL,
  32   SPELL_NONE
  33 };
  34
     /* One row of the token_spellings table.  CATEGORY is the row's
        SPELL_* class.  NAME is the operator text for rows built by OP()
        and the stringified enumerator name for rows built by TK().  */
  35 struct token_spelling
  36 {
  37   enum spell_type category;
  38   const unsigned char *name;
  39 };
  40
     /* Spellings of the six C digraphs, i.e. the alternative spellings
        of #, ##, [, ], { and } in that order.  */
  41 static const unsigned char *const digraph_spellings[] =
  42 { UC"%:", UC"%:%:", UC"<:", UC":>", UC"<%", UC"%>" };
  43
     /* Build token_spellings by expanding TTYPE_TABLE with OP and TK
        temporarily defined to emit one struct token_spelling initializer
        each.  OP keeps the operator's text S; TK stringifies the
        enumerator E and pastes S onto SPELL_ to form the category.
        NOTE(review): TTYPE_TABLE and N_TTYPES are defined outside this
        file; presumably the table has one OP()/TK() per token type, in
        enum cpp_ttype order -- confirm.  */
  44 #define OP(e, s) { SPELL_OPERATOR, UC s  },
  45 #define TK(e, s) { SPELL_ ## s,    UC #e },
  46 static const struct token_spelling token_spellings[N_TTYPES] = { TTYPE_TABLE };
  47 #undef OP
  48 #undef TK
  49
     /* Look up the spelling category / name of TOKEN, indexing the table
        by TOKEN->type.  TOKEN is evaluated exactly once by each macro.  */
  50 #define TOKEN_SPELL(token) (token_spellings[(token)->type].category)
  51 #define TOKEN_NAME(token) (token_spellings[(token)->type].name)
  52
     /* Forward declarations of file-local (static) helpers, so that
        they can be used before their definitions appear.  */
  53 static void add_line_note (cpp_buffer *, const uchar *, unsigned int);
  54 static int skip_line_comment (cpp_reader *);
  55 static void skip_whitespace (cpp_reader *, cppchar_t);
  56 static void lex_string (cpp_reader *, cpp_token *, const uchar *);
  57 static void save_comment (cpp_reader *, cpp_token *, const uchar *, cppchar_t);
  58 static void store_comment (cpp_reader *, cpp_token *);
  59 static void create_literal (cpp_reader *, cpp_token *, const uchar *,
  60                             unsigned int, enum cpp_ttype);
  61 static bool warn_in_comment (cpp_reader *, _cpp_line_note *);
  62 static int name_p (cpp_reader *, const cpp_string *);
  63 static tokenrun *next_tokenrun (tokenrun *);
  64
  65 static _cpp_buff *new_buff (size_t);
  66
  67
  68 /* Utility routine:
  69
  70    Compares the token TOKEN to the NUL-terminated string STRING.
  71    Returns 1 if TOKEN is a CPP_NAME whose identifier spelling equals
        STRING, and 0 otherwise.  A token of any other type is never
        equal, so it also yields 0.  */
  72 int
  73 cpp_ideq (const cpp_token *token, const char *string)
  74 {
       /* Only CPP_NAME tokens are compared; the node is not read for
          other token types.  */
  75   if (token->type != CPP_NAME)
  76     return 0;
  77
       /* ustrcmp result is negated: a zero (equal) comparison gives 1.  */
  78   return !ustrcmp (NODE_NAME (token->val.node.node), (const uchar *) string);
  79 }
  80
  81 /* Record a note TYPE at byte POS into the current cleaned logical
  82    line.  The note is appended to BUFFER->notes, which is enlarged
        first when it is full.  */
  83 static void
  84 add_line_note (cpp_buffer *buffer, const uchar *pos, unsigned int type)
  85 {
       /* No free slot left: more than double the capacity.  The +200
          makes the formula work from a capacity of zero as well.
          NOTE(review): XRESIZEVEC is defined outside this file and its
          result is used unchecked; presumably it never returns NULL --
          confirm.  */
  86   if (buffer->notes_used == buffer->notes_cap)
  87     {
  88       buffer->notes_cap = buffer->notes_cap * 2 + 200;
  89       buffer->notes = XRESIZEVEC (_cpp_line_note, buffer->notes,
  90                                   buffer->notes_cap);
  91     }
  92
       /* Fill in the next free slot and claim it.  */
  93   buffer->notes[buffer->notes_used].pos = pos;
  94   buffer->notes[buffer->notes_used].type = type;
  95   buffer->notes_used++;
  96 }
  97
  98 \f
  99 /* Fast path to find line special characters using optimized character
 100    scanning algorithms.  Anything complicated falls back to the slow
 101    path below.  Since this loop is very hot it's worth doing these kinds
 102    of optimizations.
 103
 104    One of the paths through the ifdefs should provide
 105
 106      const uchar *search_line_fast (const uchar *s, const uchar *end);
 107
 108    Between S and END, search for \n, \r, \\, ?.  Return a pointer to
 109    the found character.
 110
 111    Note that the last character of the buffer is *always* a newline,
 112    as forced by _cpp_convert_input.  This fact can be used to avoid
 113    explicitly looking for the end of the buffer.  */
 114
 115 /* Configure gives us an ifdef test.  Give the macro the value 0 when
        it is not defined, so that the code below can test it both in
        ordinary `if' conditions and in #if expressions.  */
 116 #ifndef WORDS_BIGENDIAN
 117 #define WORDS_BIGENDIAN 0
 118 #endif
 119
 120 /* We'd like the largest integer that fits into a register.  There's nothing
 121    in <stdint.h> that gives us that.  For most hosts this is unsigned long,
 122    but MS decided on an LLP64 model.  Thankfully when building with GCC we
 123    can get the "real" word size.
        The GCC branch asks for an unsigned integer in the target's word
        mode; other compilers fall back to unsigned long.  */
 124 #ifdef __GNUC__
 125 typedef unsigned int word_type __attribute__((__mode__(__word__)));
 126 #else
 127 typedef unsigned long word_type;
 128 #endif
 129
 130 /* The code below is only expecting sizes 4 or 8.
 131    Die at compile-time if this expectation is violated.
        The array size evaluates to 1 when the size is acceptable and to
        -1 otherwise; a negative array size is a compile-time error.  The
        typedef itself is never used.  */
 132 typedef char check_word_type_size
 133   [(sizeof(word_type) == 8 || sizeof(word_type) == 4) * 2 - 1];
 134
 135 /* Return VAL with the first N bytes (in memory order) forced to zero,
 136    a value that won't match one of the interesting characters -- NUL
        is not interesting.  N must be smaller than sizeof (word_type),
        since shifting by the full width of the type is undefined; the
        caller in this file passes the offset of a pointer within a
        word, and only when that offset is non-zero.  */
 137
 138 static inline word_type
 139 acc_char_mask_misalign (word_type val, unsigned int n)
 140 {
       /* Start with all bits set and shift zeros in over the bytes to
          be discarded.  The first bytes in memory are the most
          significant ones on a big-endian host and the least
          significant ones otherwise.  */
 141   word_type mask = -1;
 142   if (WORDS_BIGENDIAN)
 143     mask >>= n * 8;
 144   else
 145     mask <<= n * 8;
 146   return val & mask;
 147 }
 148
 149 /* Return X replicated to all byte positions within WORD_TYPE.
        NOTE(review): X is promoted to int before the shifts, so
        `x << 24' overflows int for X >= 0x80, and the negative result
        would also be sign-extended when widened to an 8-byte word_type.
        Every caller in this file passes one of '\n', '\r', '\\' or '?',
        all below 0x80, so this is latent only.  */
 150
 151 static inline word_type
 152 acc_char_replicate (uchar x)
 153 {
 154   word_type ret;
 155
       /* Fill the low four bytes, then copy them into the high four
          bytes when the word is 8 bytes wide.  The shift by 32 is
          written as two shifts by 16 so that the expression remains
          valid, although unused, when word_type is only 32 bits.  */
 156   ret = (x << 24) | (x << 16) | (x << 8) | x;
 157   if (sizeof(word_type) == 8)
 158     ret = (ret << 16 << 16) | ret;
 159   return ret;
 160 }
 161
 162 /* Return non-zero if some byte of VAL is (probably) C.  C is expected
        to hold the wanted character in every byte, as produced by
        acc_char_replicate.  Outside the Alpha branch a non-zero result
        can be a false positive, so callers must confirm it; see
        acc_char_index.  */
 163
 164 static inline word_type
 165 acc_char_cmp (word_type val, word_type c)
 166 {
 167 #if defined(__GNUC__) && defined(__alpha__)
 168   /* We can get exact results using a compare-bytes instruction.
 169      Get (val == c) via (0 >= (val ^ c)).  */
 170   return __builtin_alpha_cmpbge (0, val ^ c);
 171 #else
       /* MAGIC becomes 0x7efefeff, or 0x7efefefefefefeff for 8-byte
          words: every bit is set except the lowest bit of each byte
          other than the first, and the top bit of the word.  */
 172   word_type magic = 0x7efefefeU;
 173   if (sizeof(word_type) == 8)
 174     magic = (magic << 16 << 16) | 0xfefefefeU;
 175   magic |= 1;
 176
       /* After the XOR, a byte of VAL that was equal to C is zero.  Each
          clear bit of MAGIC ends up set in the result exactly when the
          addition produced no carry into that bit position, which is
          what a zero byte just below it causes.  Some non-zero byte
          values have the same effect, hence "probably".  */
 177   val ^= c;
 178   return ((val + magic) ^ ~val) & ~magic;
 179 #endif
 180 }
 181
 182 /* Given the result of acc_char_cmp is non-zero, return the index of
 183    the found character.  If this was a false positive, return -1.
        The index counts bytes in memory order starting from 0 and refers
        to the first interesting byte of the word.  CMP is the combined
        acc_char_cmp result and VAL the word that was tested; each
        preprocessor branch below uses only one of them, hence both are
        marked ATTRIBUTE_UNUSED.  */
 184
 185 static inline int
 186 acc_char_index (word_type cmp ATTRIBUTE_UNUSED,
 187                 word_type val ATTRIBUTE_UNUSED)
 188 {
 189 #if defined(__GNUC__) && defined(__alpha__) && !WORDS_BIGENDIAN
 190   /* The cmpbge instruction sets *bits* of the result corresponding to
 191      matches in the bytes with no false positives.  */
 192   return __builtin_ctzl (cmp);
 193 #else
 194   unsigned int i;
 195
 196   /* ??? It would be nice to force unrolling here,
 197      and have all of these constants folded.  */
 198   for (i = 0; i < sizeof(word_type); ++i)
 199     {
 200       uchar c;
           /* Extract byte I in memory order: counted from the most
              significant end on big-endian hosts, from the least
              significant end otherwise.  */
 201       if (WORDS_BIGENDIAN)
 202         c = (val >> (sizeof(word_type) - i - 1) * 8) & 0xff;
 203       else
 204         c = (val >> i * 8) & 0xff;
 205
           /* Re-test exactly; this is what filters out the false
              positives of the arithmetic comparison.  */
 206       if (c == '\n' || c == '\r' || c == '\\' || c == '?')
 207         return i;
 208     }
 209
 210   return -1;
 211 #endif
 212 }
 213
 214 /* A version of the fast scanner using bit fiddling techniques.
 215
 216    For 32-bit words, one would normally perform 16 comparisons and
 217    16 branches.  With this algorithm one performs 24 arithmetic
 218    operations and one branch.  Whether this is faster with a 32-bit
 219    word size is going to be somewhat system dependent.
 220
 221    For 64-bit words, we eliminate twice the number of comparisons
 222    and branches without increasing the number of arithmetic operations.
 223    It's almost certainly going to be a win with 64-bit word size.

        Returns a pointer to the first \n, \r, \\ or ? at or after S.
        END is not used: the loop has no bound of its own and stops only
        when it finds one of those characters, relying on the newline
        that always terminates the buffer.  Memory is read one aligned
        word at a time, so bytes of the first word that precede S may be
        read; they are masked out before testing.  */
 224
 225 static const uchar * search_line_acc_char (const uchar *, const uchar *)
 226   ATTRIBUTE_UNUSED;
 227
 228 static const uchar *
 229 search_line_acc_char (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 230 {
       /* One word per searched character, with that character in every
          byte.  */
 231   const word_type repl_nl = acc_char_replicate ('\n');
 232   const word_type repl_cr = acc_char_replicate ('\r');
 233   const word_type repl_bs = acc_char_replicate ('\\');
 234   const word_type repl_qm = acc_char_replicate ('?');
 235
 236   unsigned int misalign;
 237   const word_type *p;
 238   word_type val, t;
 239
 240   /* Align the buffer.  Mask out any bytes from before the beginning.  */
 241   p = (word_type *)((uintptr_t)s & -sizeof(word_type));
 242   val = *p;
 243   misalign = (uintptr_t)s & (sizeof(word_type) - 1);
 244   if (misalign)
 245     val = acc_char_mask_misalign (val, misalign);
 246
 247   /* Main loop.  */
 248   while (1)
 249     {
           /* T is non-zero if any of the four characters may be present
              in VAL.  */
 250       t  = acc_char_cmp (val, repl_nl);
 251       t |= acc_char_cmp (val, repl_cr);
 252       t |= acc_char_cmp (val, repl_bs);
 253       t |= acc_char_cmp (val, repl_qm);
 254
 255       if (__builtin_expect (t != 0, 0))
 256         {
               /* A negative index means the hit was a false positive;
                  keep scanning.  */
 257           int i = acc_char_index (t, val);
 258           if (i >= 0)
 259             return (const uchar *)p + i;
 260         }
 261
 262       val = *++p;
 263     }
 264 }
 265
 266 /* Disable on Solaris 2/x86 until the following problem can be properly
 267    autoconfed:
 268
 269    The Solaris 10+ assembler tags objects with the instruction set
 270    extensions used, so SSE4.2 executables cannot run on machines that
 271    don't support that extension.  */
 272
 273 #if (GCC_VERSION >= 4005) && (defined(__i386__) || defined(__x86_64__)) && !(defined(__sun__) && defined(__svr4__))
 274
 275 /* Replicated character data to be shared between implementations.
 276    Recall that outside of a context with vector support we can't
 277    define compatible vector types, therefore these are all defined
 278    in terms of raw characters.
        Row 0 holds '\n', row 1 '\r', row 2 '\\' and row 3 '?', each
        repeated 16 times.  The 16-byte alignment lets the scanners
        below load a row directly as an 8-byte or 16-byte vector.  */
 279 static const char repl_chars[4][16] __attribute__((aligned(16))) = {
 280   { '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
 281     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n' },
 282   { '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
 283     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r' },
 284   { '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
 285     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\' },
 286   { '?', '?', '?', '?', '?', '?', '?', '?',
 287     '?', '?', '?', '?', '?', '?', '?', '?' },
 288 };
 289
 290 /* A version of the fast scanner using MMX vectorized byte compare insns.
 291
 292    This uses the PMOVMSKB instruction which was introduced with "MMX2",
 293    which was packaged into SSE1; it is also present in the AMD MMX
 294    extension.  Mark the function as using "sse" so that we emit a real
 295    "emms" instruction, rather than the 3dNOW "femms" instruction.

        Returns a pointer to the first \n, \r, \\ or ? at or after S.
        END is unused; the scan stops only on a match.  */
 296
 297 static const uchar *
 298 #ifndef __SSE__
 299 __attribute__((__target__("sse")))
 300 #endif
 301 search_line_mmx (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 302 {
 303   typedef char v8qi __attribute__ ((__vector_size__ (8)));
 304   typedef int __m64 __attribute__ ((__vector_size__ (8), __may_alias__));
 305
       /* Only the first 8 bytes of each 16-byte row are needed here.  */
 306   const v8qi repl_nl = *(const v8qi *)repl_chars[0];
 307   const v8qi repl_cr = *(const v8qi *)repl_chars[1];
 308   const v8qi repl_bs = *(const v8qi *)repl_chars[2];
 309   const v8qi repl_qm = *(const v8qi *)repl_chars[3];
 310
 311   unsigned int misalign, found, mask;
 312   const v8qi *p;
 313   v8qi data, t, c;
 314
 315   /* Align the source pointer.  While MMX doesn't generate unaligned data
 316      faults, this allows us to safely scan to the end of the buffer without
 317      reading beyond the end of the last page.  */
 318   misalign = (uintptr_t)s & 7;
 319   p = (const v8qi *)((uintptr_t)s & -8);
 320   data = *p;
 321
 322   /* Create a mask for the bytes that are valid within the first
 323      8-byte block.  The idea here is that the AND with the mask
 324      within the loop is "free", since we need some AND or TEST
 325      insn in order to set the flags for the branch anyway.  */
 326   mask = -1u << misalign;
 327
 328   /* Main loop processing 8 bytes at a time.  The loop is entered at
          START so that the first block is tested with the misalignment
          mask; every later block is tested with all bits valid.  */
 329   goto start;
 330   do
 331     {
 332       data = *++p;
 333       mask = -1;
 334
 335     start:
           /* OR together the byte-wise equality results for the four
              characters, then collect one bit per byte into FOUND.  */
 336       t = __builtin_ia32_pcmpeqb(data, repl_nl);
 337       c = __builtin_ia32_pcmpeqb(data, repl_cr);
 338       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
 339       c = __builtin_ia32_pcmpeqb(data, repl_bs);
 340       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
 341       c = __builtin_ia32_pcmpeqb(data, repl_qm);
 342       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
 343       found = __builtin_ia32_pmovmskb (t);
 344       found &= mask;
 345     }
 346   while (!found);
 347
 348   __builtin_ia32_emms ();
 349
 350   /* FOUND contains 1 in bits for which we matched a relevant
 351      character.  Conversion to the byte index is trivial.  */
 352   found = __builtin_ctz(found);
 353   return (const uchar *)p + found;
 354 }
 355
 356 /* A version of the fast scanner using SSE2 vectorized byte compare insns.
        Returns a pointer to the first \n, \r, \\ or ? at or after S.
        END is unused; the scan stops only on a match.  All loads are
        16-byte aligned, so bytes of the first block that precede S may
        be read; they are masked out of the result.  */
 357
 358 static const uchar *
 359 #ifndef __SSE2__
 360 __attribute__((__target__("sse2")))
 361 #endif
 362 search_line_sse2 (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 363 {
 364   typedef char v16qi __attribute__ ((__vector_size__ (16)));
 365
 366   const v16qi repl_nl = *(const v16qi *)repl_chars[0];
 367   const v16qi repl_cr = *(const v16qi *)repl_chars[1];
 368   const v16qi repl_bs = *(const v16qi *)repl_chars[2];
 369   const v16qi repl_qm = *(const v16qi *)repl_chars[3];
 370
 371   unsigned int misalign, found, mask;
 372   const v16qi *p;
 373   v16qi data, t;
 374
 375   /* Align the source pointer.  */
 376   misalign = (uintptr_t)s & 15;
 377   p = (const v16qi *)((uintptr_t)s & -16);
 378   data = *p;
 379
 380   /* Create a mask for the bytes that are valid within the first
 381      16-byte block.  The idea here is that the AND with the mask
 382      within the loop is "free", since we need some AND or TEST
 383      insn in order to set the flags for the branch anyway.  */
 384   mask = -1u << misalign;
 385
 386   /* Main loop processing 16 bytes at a time.  The loop is entered at
          START so that the first block is tested with the misalignment
          mask; every later block is tested with all bits valid.  */
 387   goto start;
 388   do
 389     {
 390       data = *++p;
 391       mask = -1;
 392
 393     start:
 394       t  = __builtin_ia32_pcmpeqb128(data, repl_nl);
 395       t |= __builtin_ia32_pcmpeqb128(data, repl_cr);
 396       t |= __builtin_ia32_pcmpeqb128(data, repl_bs);
 397       t |= __builtin_ia32_pcmpeqb128(data, repl_qm);
 398       found = __builtin_ia32_pmovmskb128 (t);
 399       found &= mask;
 400     }
 401   while (!found);
 402
 403   /* FOUND contains 1 in bits for which we matched a relevant
 404      character.  Conversion to the byte index is trivial.  */
 405   found = __builtin_ctz(found);
 406   return (const uchar *)p + found;
 407 }
 408
 409 #ifdef HAVE_SSE4
 410 /* A version of the fast scanner using SSE 4.2 vectorized string insns.
        Returns a pointer to the first \n, \r, ? or \\ at or after S.
        END is used only to decide whether an unaligned 16-byte load at
        S is safe; the main loop stops only on a match.  */
 411
 412 static const uchar *
 413 #ifndef __SSE4_2__
 414 __attribute__((__target__("sse4.2")))
 415 #endif
 416 search_line_sse42 (const uchar *s, const uchar *end)
 417 {
 418   typedef char v16qi __attribute__ ((__vector_size__ (16)));
       /* The set of characters to look for; only the first four
          elements are meaningful and the explicit length 4 is passed to
          the instruction below.  */
 419   static const v16qi search = { '\n', '\r', '?', '\\' };
 420
 421   uintptr_t si = (uintptr_t)s;
 422   uintptr_t index;
 423
 424   /* Check for unaligned input.  */
 425   if (si & 15)
 426     {
 427       v16qi sv;
 428
 429       if (__builtin_expect (end - s < 16, 0)
 430           && __builtin_expect ((si & 0xfff) > 0xff0, 0))
 431         {
 432           /* There are less than 16 bytes left in the buffer, and less
 433              than 16 bytes left on the page.  Reading 16 bytes at this
 434              point might generate a spurious page fault.  Defer to the
 435              SSE2 implementation, which already handles alignment.  */
 436           return search_line_sse2 (s, end);
 437         }
 438
 439       /* ??? The builtin doesn't understand that the PCMPESTRI read from
 440          memory need not be aligned.  */
 441       sv = __builtin_ia32_loaddqu ((const char *) s);
 442       index = __builtin_ia32_pcmpestri128 (search, 4, sv, 16, 0);
 443
           /* An index below 16 is the offset of a match within SV.  */
 444       if (__builtin_expect (index < 16, 0))
 445         goto found;
 446
 447       /* Advance the pointer to an aligned address.  We will re-scan a
 448          few bytes, but we no longer need care for reading past the
 449          end of a page, since we're guaranteed a match.  */
 450       s = (const uchar *)((si + 16) & -16);
 451     }
 452
 453   /* Main loop, processing 16 bytes at a time.  By doing the whole loop
 454      in inline assembly, we can make proper use of the flags set.
          S is decremented by 16 up front because the loop adds 16 before
          every comparison.  The loop repeats while the carry flag is
          clear, and leaves the match offset in %ecx (INDEX).  */
 455   __asm (      "sub $16, %1\n"
 456         "       .balign 16\n"
 457         "0:     add $16, %1\n"
 458         "       %vpcmpestri $0, (%1), %2\n"
 459         "       jnc 0b"
 460         : "=&c"(index), "+r"(s)
 461         : "x"(search), "a"(4), "d"(16));
 462
 463  found:
 464   return s + index;
 465 }
 466
 467 #else
 468 /* Work around out-dated assemblers without sse4 support.  */
 469 #define search_line_sse42 search_line_sse2
 470 #endif
 471
 472 /* Check the CPU capabilities.  */
 473
 474 #include "../gcc/config/i386/cpuid.h"
 475
     /* All scanner implementations share this signature.
        search_line_fast holds the implementation selected at run time
        by init_vectorized_lexer.  */
 476 typedef const uchar * (*search_line_fast_type) (const uchar *, const uchar *);
 477 static search_line_fast_type search_line_fast;
 478
     /* Choose the scanner to use on this CPU and store it in
        search_line_fast.  HAVE_init_vectorized_lexer tells
        _cpp_init_lexer that this function exists.  */
 479 #define HAVE_init_vectorized_lexer 1
 480 static inline void
 481 init_vectorized_lexer (void)
 482 {
 483   unsigned dummy, ecx = 0, edx = 0;
 484   search_line_fast_type impl = search_line_acc_char;
 485   int minimum = 0;
 486
       /* MINIMUM records what the compiler flags already guarantee:
          3 for SSE4.2, 2 for SSE2, 1 for SSE, 0 for none of them.  */
 487 #if defined(__SSE4_2__)
 488   minimum = 3;
 489 #elif defined(__SSE2__)
 490   minimum = 2;
 491 #elif defined(__SSE__)
 492   minimum = 1;
 493 #endif
 494
       /* With SSE4.2 guaranteed there is nothing to detect.  Otherwise
          query CPUID leaf 1.  If that query fails while SSE2 is
          guaranteed, the block is still entered with ECX and EDX left at
          zero, and the `minimum == 2' test below selects SSE2.  */
 495   if (minimum == 3)
 496     impl = search_line_sse42;
 497   else if (__get_cpuid (1, &dummy, &dummy, &ecx, &edx) || minimum == 2)
 498     {
           /* NOTE(review): `minimum == 3' cannot be true here, as that
              case was taken by the first branch above.  */
 499       if (minimum == 3 || (ecx & bit_SSE4_2))
 500         impl = search_line_sse42;
 501       else if (minimum == 2 || (edx & bit_SSE2))
 502         impl = search_line_sse2;
 503       else if (minimum == 1 || (edx & bit_SSE))
 504         impl = search_line_mmx;
 505     }
       /* Leaf 1 was unavailable: try the extended leaf, where the MMX
          scanner is accepted only if both the MMXEXT and CMOV bits are
          reported.  */
 506   else if (__get_cpuid (0x80000001, &dummy, &dummy, &dummy, &edx))
 507     {
 508       if (minimum == 1
 509           || (edx & (bit_MMXEXT | bit_CMOV)) == (bit_MMXEXT | bit_CMOV))
 510         impl = search_line_mmx;
 511     }
 512
 513   search_line_fast = impl;
 514 }
 515
 516 #elif (GCC_VERSION >= 4005) && defined(__ALTIVEC__)
 517
 518 /* A version of the fast scanner using AltiVec vectorized byte compares.
        Returns a pointer to the first \n, \r, \\ or ? at or after S.
        END is unused; the scan stops only on a match.  */
 519 /* ??? Unfortunately, attribute(target("altivec")) is not yet supported,
 520    so we can't compile this function without -maltivec on the command line
 521    (or implied by some other switch).  */
 522
 523 static const uchar *
 524 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 525 {
 526   typedef __attribute__((altivec(vector))) unsigned char vc;
 527
 528   const vc repl_nl = {
 529     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
 530     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
 531   };
 532   const vc repl_cr = {
 533     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
 534     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
 535   };
 536   const vc repl_bs = {
 537     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
 538     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
 539   };
 540   const vc repl_qm = {
 541     '?', '?', '?', '?', '?', '?', '?', '?',
 542     '?', '?', '?', '?', '?', '?', '?', '?',
 543   };
 544   const vc ones = {
 545     -1, -1, -1, -1, -1, -1, -1, -1,
 546     -1, -1, -1, -1, -1, -1, -1, -1,
 547   };
 548   const vc zero = { 0 };
 549
 550   vc data, mask, t;
 551
 552   /* Altivec loads automatically mask addresses with -16.  This lets us
 553      issue the first load as early as possible.  */
 554   data = __builtin_vec_ld(0, (const vc *)s);
 555
 556   /* Discard bytes before the beginning of the buffer.  Do this by
 557      beginning with all ones and shifting in zeros according to the
 558      mis-alignment.  The LVSR instruction pulls the exact shift we
 559      want from the address.  */
 560 #ifdef __BIG_ENDIAN__
 561   mask = __builtin_vec_lvsr(0, s);
 562   mask = __builtin_vec_perm(zero, ones, mask);
 563 #else
 564   mask = __builtin_vec_lvsl(0, s);
 565   mask = __builtin_vec_perm(ones, zero, mask);
 566 #endif
 567   data &= mask;
 568
 569   /* While altivec loads mask addresses, we still need to align S so
 570      that the offset we compute at the end is correct.  */
 571   s = (const uchar *)((uintptr_t)s & -16);
 572
 573   /* Main loop processing 16 bytes at a time.  The loop is entered at
          START so that the already loaded and masked first block is
          tested before S is advanced.  */
 574   goto start;
 575   do
 576     {
 577       vc m_nl, m_cr, m_bs, m_qm;
 578
 579       s += 16;
 580       data = __builtin_vec_ld(0, (const vc *)s);
 581
 582     start:
 583       m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
 584       m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
 585       m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
 586       m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
 587       t = (m_nl | m_cr) | (m_bs | m_qm);
 588
 589       /* T now contains 0xff in bytes for which we matched one of the relevant
 590          characters.  We want to exit the loop if any byte in T is non-zero.
 591          Below is the expansion of vec_any_ne(t, zero).  */
 592     }
 593   while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));
 594
 595   {
       /* Number of unsigned long words in one vector.  */
 596 #define N  (sizeof(vc) / sizeof(long))
 597
         /* View the match vector as an array of words.  */
 598     union {
 599       vc v;
 600       /* Statically assert that N is 2 or 4.  */
 601       unsigned long l[(N == 2 || N == 4) ? N : -1];
 602     } u;
 603     unsigned long l, i = 0;
 604
 605     u.v = t;
 606
 607     /* Find the first word of T that is non-zero.  S is advanced past
            every all-zero word that is skipped.  The last word is not
            tested: the loop above exits only when T is non-zero, so if
            all earlier words are zero the last one cannot be.  */
 608     switch (N)
 609       {
 610       case 4:
 611         l = u.l[i++];
 612         if (l != 0)
 613           break;
 614         s += sizeof(unsigned long);
 615         l = u.l[i++];
 616         if (l != 0)
 617           break;
 618         s += sizeof(unsigned long);
             /* Fall through to examine the remaining two words.  */
 619       case 2:
 620         l = u.l[i++];
 621         if (l != 0)
 622           break;
 623         s += sizeof(unsigned long);
 624         l = u.l[i];
 625       }
 626
 627     /* L now contains 0xff in bytes for which we matched one of the
 628        relevant characters.  We can find the byte index by finding
 629        its bit index and dividing by 8.  */
 630 #ifdef __BIG_ENDIAN__
 631     l = __builtin_clzl(l) >> 3;
 632 #else
 633     l = __builtin_ctzl(l) >> 3;
 634 #endif
 635     return s + l;
 636
 637 #undef N
 638   }
 639 }
 640
 641 #elif defined (__ARM_NEON__)
 642 #include "arm_neon.h"
 643
     /* A version of the fast scanner using ARM NEON vectorized byte
        compares.  Returns a pointer to the first \n, \r, \\ or ? at or
        after S.  END is unused; the scan stops only on a match.  */
 644 static const uchar *
 645 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 646 {
 647   const uint8x16_t repl_nl = vdupq_n_u8 ('\n');
 648   const uint8x16_t repl_cr = vdupq_n_u8 ('\r');
 649   const uint8x16_t repl_bs = vdupq_n_u8 ('\\');
 650   const uint8x16_t repl_qm = vdupq_n_u8 ('?');
       /* Within each 8-byte half, byte J of XMASK has only bit J set.  */
 651   const uint8x16_t xmask = (uint8x16_t) vdupq_n_u64 (0x8040201008040201ULL);
 652
 653   unsigned int misalign, found, mask;
 654   const uint8_t *p;
 655   uint8x16_t data;
 656
 657   /* Align the source pointer.  */
 658   misalign = (uintptr_t)s & 15;
 659   p = (const uint8_t *)((uintptr_t)s & -16);
 660   data = vld1q_u8 (p);
 661
 662   /* Create a mask for the bytes that are valid within the first
 663      16-byte block.  The idea here is that the AND with the mask
 664      within the loop is "free", since we need some AND or TEST
 665      insn in order to set the flags for the branch anyway.  */
 666   mask = (-1u << misalign) & 0xffff;
 667
 668   /* Main loop, processing 16 bytes at a time.  The loop is entered
          at START so that the first block is tested with the
          misalignment mask; later blocks use all 16 bits.  */
 669   goto start;
 670
 671   do
 672     {
 673       uint8x8_t l;
 674       uint16x4_t m;
 675       uint32x2_t n;
 676       uint8x16_t t, u, v, w;
 677
 678       p += 16;
 679       data = vld1q_u8 (p);
 680       mask = 0xffff;
 681
 682     start:
           /* Compare against all four characters and OR the results;
              ANDing with XMASK then leaves each matching byte holding
              its own distinct bit.  */
 683       t = vceqq_u8 (data, repl_nl);
 684       u = vceqq_u8 (data, repl_cr);
 685       v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
 686       w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
 687       t = vandq_u8 (vorrq_u8 (v, w), xmask);
           /* Three rounds of pairwise addition fold each 8-byte half
              into an 8-bit mask, held in one 32-bit lane of N.  */
 688       l = vpadd_u8 (vget_low_u8 (t), vget_high_u8 (t));
 689       m = vpaddl_u8 (l);
 690       n = vpaddl_u16 (m);
 691
           /* Shifting the 64-bit view right by 24 moves the upper half's
              mask to bits 8..15 of lane 0, giving a 16-bit mask with
              one bit per byte of the block.  */
 692       found = vget_lane_u32 ((uint32x2_t) vorr_u64 ((uint64x1_t) n,
 693               vshr_n_u64 ((uint64x1_t) n, 24)), 0);
 694       found &= mask;
 695     }
 696   while (!found);
 697
 698   /* FOUND contains 1 in bits for which we matched a relevant
 699      character.  Conversion to the byte index is trivial.  */
 700   found = __builtin_ctz (found);
 701   return (const uchar *)p + found;
 702 }
 703
 704 #else
 705
 706 /* We only have one accelerated alternative.  Use a direct call so that
 707    we encourage inlining.  */
 708
 709 #define search_line_fast  search_line_acc_char
 710
 711 #endif
 712
 713 /* Initialize the lexer if needed.  This selects the line scanner at
        run time on configurations that define HAVE_init_vectorized_lexer,
        and does nothing on all others.  */
 714
 715 void
 716 _cpp_init_lexer (void)
 717 {
 718 #ifdef HAVE_init_vectorized_lexer
 719   init_vectorized_lexer ();
 720 #endif
 721 }
 722
 723 /* Returns with a logical line that contains no escaped newlines or
 724    trigraphs.  This is a time-critical inner loop.

        The line starting at PFILE->buffer->next_line is cleaned in
        place.  On return the buffer's cur and line_base point at the
        start of that line, a '\n' has been stored at the end of the
        cleaned text, next_line points one past the last character
        consumed, and the buffer's notes array describes every escaped
        newline and trigraph that was seen, followed by a sentinel
        note.  */
 725 void
 726 _cpp_clean_line (cpp_reader *pfile)
 727 {
 728   cpp_buffer *buffer;
 729   const uchar *s;
 730   uchar c, *d, *p;
 731
       /* S is the read position; D, once set, is the write position.  */
 732   buffer = pfile->buffer;
 733   buffer->cur_note = buffer->notes_used = 0;
 734   buffer->cur = buffer->line_base = buffer->next_line;
 735   buffer->need_line = false;
 736   s = buffer->next_line;
 737
 738   if (!buffer->from_stage3)
 739     {
 740       const uchar *pbackslash = NULL;
 741
 742       /* Fast path.  This is the common case of an un-escaped line with
 743          no trigraphs.  The primary win here is by not writing any
 744          data back to memory until we have to.  */
 745       while (1)
 746         {
 747           /* Perform an optimized search for \n, \r, \\, ?.  */
 748           s = search_line_fast (s, buffer->rlimit);
 749
 750           c = *s;
 751           if (c == '\\')
 752             {
 753               /* Record the location of the backslash and continue.  */
 754               pbackslash = s++;
 755             }
 756           else if (__builtin_expect (c == '?', 0))
 757             {
 758               if (__builtin_expect (s[1] == '?', false)
 759                    && _cpp_trigraph_map[s[2]])
 760                 {
 761                   /* Have a trigraph.  We may or may not have to convert
 762                      it.  Add a line note regardless, for -Wtrigraphs.  */
 763                   add_line_note (buffer, s, s[2]);
 764                   if (CPP_OPTION (pfile, trigraphs))
 765                     {
 766                       /* We do, and that means we have to switch to the
 767                          slow path.  The replacement is stored over the
                              first '?'.  S is left on the last character
                              of the trigraph because the slow path
                              advances S before reading.  */
 768                       d = (uchar *) s;
 769                       *d = _cpp_trigraph_map[s[2]];
 770                       s += 2;
 771                       goto slow_path;
 772                     }
 773                 }
 774               /* Not a trigraph.  Continue on fast-path.  */
 775               s++;
 776             }
 777           else
 778             break;
 779         }
 780
 781       /* This must be \r or \n.  We're either done, or we'll be forced
 782          to write back to the buffer and continue on the slow path.  */
 783       d = (uchar *) s;
 784
 785       if (__builtin_expect (s == buffer->rlimit, false))
 786         goto done;
 787
 788       /* DOS line ending? */
 789       if (__builtin_expect (c == '\r', false) && s[1] == '\n')
 790         {
 791           s++;
 792           if (s == buffer->rlimit)
 793             goto done;
 794         }
 795
           /* No backslash was seen on this line, so the newline cannot
              be escaped.  */
 796       if (__builtin_expect (pbackslash == NULL, true))
 797         goto done;
 798
 799       /* Check for escaped newline.  Only the last backslash seen
              matters, and only if nothing but horizontal space separates
              it from the newline.  */
 800       p = d;
 801       while (is_nvspace (p[-1]))
 802         p--;
 803       if (p - 1 != pbackslash)
 804         goto done;
 805
 806       /* Have an escaped newline; process it and proceed to
 807          the slow path.  The note type is ' ' when space separated
              the backslash from the newline and '\\' otherwise.  D is
              set so that the first store of the slow path, which
              advances D before writing, overwrites the backslash.
              next_line is reused as the lower bound for the backward
              scans of the slow path.  */
 808       add_line_note (buffer, p - 1, p != d ? ' ' : '\\');
 809       d = p - 2;
 810       buffer->next_line = p - 1;
 811
 812     slow_path:
           /* Copy the rest of the logical line down over the deleted
              text, one character at a time.  */
 813       while (1)
 814         {
 815           c = *++s;
 816           *++d = c;
 817
 818           if (c == '\n' || c == '\r')
 819             {
 820               /* Handle DOS line endings.  */
 821               if (c == '\r' && s != buffer->rlimit && s[1] == '\n')
 822                 s++;
 823               if (s == buffer->rlimit)
 824                 break;
 825
 826               /* Escaped?  */
 827               p = d;
 828               while (p != buffer->next_line && is_nvspace (p[-1]))
 829                 p--;
 830               if (p == buffer->next_line || p[-1] != '\\')
 831                 break;
 832
 833               add_line_note (buffer, p - 1, p != d ? ' ': '\\');
 834               d = p - 2;
 835               buffer->next_line = p - 1;
 836             }
 837           else if (c == '?' && s[1] == '?' && _cpp_trigraph_map[s[2]])
 838             {
 839               /* Add a note regardless, for the benefit of -Wtrigraphs.  */
 840               add_line_note (buffer, d, s[2]);
 841               if (CPP_OPTION (pfile, trigraphs))
 842                 {
 843                   *d = _cpp_trigraph_map[s[2]];
 844                   s += 2;
 845                 }
 846             }
 847         }
 848     }
 849   else
 850     {
           /* Stage-3 input is not cleaned; just locate the end of the
              line.  */
 851       while (*s != '\n' && *s != '\r')
 852         s++;
 853       d = (uchar *) s;
 854
 855       /* Handle DOS line endings.  */
 856       if (*s == '\r' && s != buffer->rlimit && s[1] == '\n')
 857         s++;
 858     }
 859
 860  done:
       /* Terminate the cleaned line with a plain newline.  */
 861   *d = '\n';
 862   /* A sentinel note that should never be processed.  */
 863   add_line_note (buffer, d + 1, '\n');
 864   buffer->next_line = s + 1;
 865 }
 866
 867 /* Return true if the trigraph indicated by NOTE should be warned
 868    about in a comment.  NOTE must be followed by another note in the
        notes array, because NOTE[1] is examined.  */
 869 static bool
 870 warn_in_comment (cpp_reader *pfile, _cpp_line_note *note)
 871 {
 872   const uchar *p;
 873
 874   /* Within comments we don't warn about trigraphs, unless the
 875      trigraph forms an escaped newline, as that may change
 876      behavior.  A note type of '/' identifies the trigraph that
          stands for a backslash.  */
 877   if (note->type != '/')
 878     return false;
 879
 880   /* If -trigraphs, then this was an escaped newline iff the next note
 881      is coincident.  */
 882   if (CPP_OPTION (pfile, trigraphs))
 883     return note[1].pos == note->pos;
 884
 885   /* Otherwise, see if this forms an escaped newline.  The trigraph
          was left unconverted, so skip its three characters and any
          horizontal space after them.  */
 886   p = note->pos + 3;
 887   while (is_nvspace (*p))
 888     p++;
 889
 890   /* There might have been escaped newlines between the trigraph and the
 891      newline we found.  Hence the position test.  */
 892   return (*p == '\n' && p < note[1].pos);
 893 }
 894
 895 /* Process the notes created by add_line_note as far as the current
 896    location.  Notes positioned at or before the buffer's cur pointer
        are consumed in order; escaped-newline notes advance the line
        number and trigraph notes may produce warnings.  A non-zero
        IN_COMMENT suppresses the backslash-space warning and all
        trigraph warnings except those accepted by warn_in_comment.  */
 897 void
 898 _cpp_process_line_notes (cpp_reader *pfile, int in_comment)
 899 {
 900   cpp_buffer *buffer = pfile->buffer;
 901
 902   for (;;)
 903     {
 904       _cpp_line_note *note = &buffer->notes[buffer->cur_note];
 905       unsigned int col;
 906
           /* Stop at the first note that lies beyond the current
              position.  */
 907       if (note->pos > buffer->cur)
 908         break;
 909
 910       buffer->cur_note++;
 911       col = CPP_BUF_COLUMN (buffer, note->pos + 1);
 912
           /* Escaped newline: type ' ' means horizontal space separated
              the backslash from the newline, type '\\' means it did
              not.  */
 913       if (note->type == '\\' || note->type == ' ')
 914         {
 915           if (note->type == ' ' && !in_comment)
 916             cpp_error_with_line (pfile, CPP_DL_WARNING, pfile->line_table->highest_line, col,
 917                                  "backslash and newline separated by space");
 918
               /* _cpp_clean_line leaves next_line one past rlimit when
                  the line it cleaned ended at rlimit.  */
 919           if (buffer->next_line > buffer->rlimit)
 920             {
 921               cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line, col,
 922                                    "backslash-newline at end of file");
 923               /* Prevent "no newline at end of file" warning.  */
 924               buffer->next_line = buffer->rlimit;
 925             }
 926
               /* The text after the escaped newline starts a new
                  physical line.  */
 927           buffer->line_base = note->pos;
 928           CPP_INCREMENT_LINE (pfile, 0);
 929         }
           /* Trigraph: the note type is the trigraph's third
              character.  */
 930       else if (_cpp_trigraph_map[note->type])
 931         {
 932           if (CPP_OPTION (pfile, warn_trigraphs)
 933               && (!in_comment || warn_in_comment (pfile, note)))
 934             {
 935               if (CPP_OPTION (pfile, trigraphs))
 936                 cpp_warning_with_line (pfile, CPP_W_TRIGRAPHS,
 937                                        pfile->line_table->highest_line, col,
 938                                        "trigraph ??%c converted to %c",
 939                                        note->type,
 940                                        (int) _cpp_trigraph_map[note->type]);
 941               else
 942                 {
 943                   cpp_warning_with_line
 944                     (pfile, CPP_W_TRIGRAPHS,
 945                      pfile->line_table->highest_line, col,
 946                      "trigraph ??%c ignored, use -trigraphs to enable",
 947                      note->type);
 948                 }
 949             }
 950         }
 951       else if (note->type == 0)
 952         /* Already processed in lex_raw_string.  */;
 953       else
             /* Any other note type is unexpected here.  */
 954         abort ();
 955     }
 956 }
 957
 958 /* Skip a C-style block comment.  We find the end of the comment by
 959    seeing if an asterisk is before every '/' we encounter.  Returns
 960    nonzero if comment terminated by EOF, zero otherwise.
 961
 962    Buffer->cur points to the initial asterisk of the comment.  On a
        zero return buffer->cur is left just past the closing '/'.  */
 963 bool
 964 _cpp_skip_block_comment (cpp_reader *pfile)
 965 {
 966   cpp_buffer *buffer = pfile->buffer;
 967   const uchar *cur = buffer->cur;
 968   uchar c;
 969
       /* Step past the opening asterisk.  A slash directly after it is
          skipped as well, so that the opening asterisk cannot double as
          the asterisk of the terminator.  */
 970   cur++;
 971   if (*cur == '/')
 972     cur++;
 973
 974   for (;;)
 975     {
 976       /* People like decorating comments with '*', so check for '/'
 977          instead for efficiency.  */
 978       c = *cur++;
 979
 980       if (c == '/')
 981         {
               /* CUR is already one past the '/', so cur[-2] is the
                  character in front of it.  */
 982           if (cur[-2] == '*')
 983             break;
 984
 985           /* Warn about potential nested comments, but not if the '/'
 986              comes immediately before the true comment delimiter.
 987              Don't bother to get it right across escaped newlines.  */
 988           if (CPP_OPTION (pfile, warn_comments)
 989               && cur[0] == '*' && cur[1] != '/')
 990             {
                   /* CPP_BUF_COL is given the buffer, so publish the
                      current position before warning.  */
 991               buffer->cur = cur;
 992               cpp_warning_with_line (pfile, CPP_W_COMMENTS,
 993                                      pfile->line_table->highest_line,
 994                                      CPP_BUF_COL (buffer),
 995                                      "\"/*\" within comment");
 996             }
 997         }
 998       else if (c == '\n')
 999         {
1000           unsigned int cols;
               /* Park buffer->cur on the newline and handle the notes
                  of the line just finished.  */
1001           buffer->cur = cur - 1;
1002           _cpp_process_line_notes (pfile, true);
               /* No further line available: the comment is unterminated.  */
1003           if (buffer->next_line >= buffer->rlimit)
1004             return true;
1005           _cpp_clean_line (pfile);
1006
1007           cols = buffer->next_line - buffer->line_base;
1008           CPP_INCREMENT_LINE (pfile, cols);
1009
               /* Resume scanning from wherever buffer->cur now points.  */
1010           cur = buffer->cur;
1011         }
1012     }
1013
       /* Terminator found: publish the position and flush pending notes.  */
1014   buffer->cur = cur;
1015   _cpp_process_line_notes (pfile, true);
1016   return false;
1017 }
1018
1019 /* Advance buffer->cur to the newline that terminates a C++ line
1020    comment, then process the line notes passed on the way.  Returns
1021    nonzero if the comment turned out to span more than one line.  */
1022 static int
1023 skip_line_comment (cpp_reader *pfile)
1024 {
1025   cpp_buffer *buf = pfile->buffer;
1026   const source_location start_line = pfile->line_table->highest_line;
1027
1028   for (; *buf->cur != '\n'; buf->cur++)
1029     continue;
1030
1031   _cpp_process_line_notes (pfile, true);
1032   return pfile->line_table->highest_line != start_line;
1033 }
1034
1035 /* Skip non-vertical whitespace that starts with the already-read C,
1036    leaving buffer->cur on the first character after the run.  */
1037 static void
1038 skip_whitespace (cpp_reader *pfile, cppchar_t c)
1039 {
1040   cpp_buffer *buf = pfile->buffer;
1041   bool nul_seen = false;
1042
1043   for (;;)
1044     {
1045       if (c == '\0')
1046         nul_seen = true;
1047       /* Blank and tab are always fine; that leaves only \f and \v.  */
1048       else if (c != ' ' && c != '\t'
1049                && pfile->state.in_directive && CPP_PEDANTIC (pfile))
1050         cpp_error_with_line (pfile, CPP_DL_PEDWARN,
1051                              pfile->line_table->highest_line,
1052                              CPP_BUF_COL (buf),
1053                              "%s in preprocessing directive",
1054                              c == '\f' ? "form feed" : "vertical tab");
1055
1056       /* Keep going only over ' ' \t \f \v \0.  */
1057       c = *buf->cur++;
1058       if (!is_nvspace (c))
1059         break;
1060     }
1061
1062   if (nul_seen)
1063     cpp_error (pfile, CPP_DL_WARNING, "null character(s) ignored");
1064   buf->cur--;
1065 }
1066
1067 /* Return 1 if every character of the number token STRING passes
1068    is_idchar (so it has no '.', '+' or '-'), and 0 otherwise.  */
1069 static int
1070 name_p (cpp_reader *pfile, const cpp_string *string)
1071 {
1072   const uchar *ch;
1073   const uchar *const end = string->text + string->len;
1074
1075   for (ch = string->text; ch < end; ch++)
1076     if (!is_idchar (*ch))
1077       return 0;
1078   return 1;
1079 }
1080
1081 /* Once an identifier or similar sequence has been lexed, warn if S
1082    shows that it is not in NFC/NFKC.  Stays quiet while skipping.  */
1083 static void
1084 warn_about_normalization (cpp_reader *pfile,
1085                           const cpp_token *token,
1086                           const struct normalize_state *s)
1087 {
1088   unsigned char *spelling;
1089   size_t len;
1090
1091   if (pfile->state.skipping
1092       || CPP_OPTION (pfile, warn_normalize) >= NORMALIZE_STATE_RESULT (s))
1093     return;
1094
1095   /* Spell the token with UCNs, even where UTF-8 could be printed.  */
1096   spelling = XNEWVEC (unsigned char, cpp_token_len (token));
1097   len = cpp_spell_token (pfile, token, spelling, false) - spelling;
1098   if (NORMALIZE_STATE_RESULT (s) != normalized_C)
1099     cpp_warning_with_line (pfile, CPP_W_NORMALIZE, token->src_loc, 0,
1100                            "`%.*s' is not in NFC", (int) len, spelling);
1101   else
1102     cpp_warning_with_line (pfile, CPP_W_NORMALIZE, token->src_loc, 0,
1103                            "`%.*s' is not in NFKC", (int) len, spelling);
1104   free (spelling);
1105 }
1106
1107 /* Returns TRUE if the sequence starting at buffer->cur is valid in
1108    an identifier, and in that case advances buffer->cur past it.
        Two sequences qualify: a '$' when dollars_in_ident is set, and a
        \u or \U escape accepted by _cpp_valid_ucn when
        extended_identifiers is set.  FIRST is TRUE if this starts an
        identifier.  STATE is handed on to _cpp_valid_ucn.  */
1109 static bool
1110 forms_identifier_p (cpp_reader *pfile, int first,
1111                     struct normalize_state *state)
1112 {
1113   cpp_buffer *buffer = pfile->buffer;
1114
1115   if (*buffer->cur == '$')
1116     {
1117       if (!CPP_OPTION (pfile, dollars_in_ident))
1118         return false;
1119
1120       buffer->cur++;
           /* Diagnose '$' once only: the option is cleared after the
              first pedwarn.  */
1121       if (CPP_OPTION (pfile, warn_dollars) && !pfile->state.skipping)
1122         {
1123           CPP_OPTION (pfile, warn_dollars) = 0;
1124           cpp_error (pfile, CPP_DL_PEDWARN, "'$' in identifier or number");
1125         }
1126
1127       return true;
1128     }
1129
1130   /* Is this a syntactically valid UCN?  */
1131   if (CPP_OPTION (pfile, extended_identifiers)
1132       && *buffer->cur == '\\'
1133       && (buffer->cur[1] == 'u' || buffer->cur[1] == 'U'))
1134     {
           /* Step over the backslash and the u/U before validating.  */
1135       buffer->cur += 2;
1136       if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
1137                           state))
1138         return true;
           /* Rejected: undo the two-character step.  NOTE(review): this
              restores the original position only if _cpp_valid_ucn leaves
              buffer->cur untouched on failure -- confirm.  */
1139       buffer->cur -= 2;
1140     }
1141
1142   return false;
1143 }
1144
1145 /* Helper function: return the cpp_hashnode of the identifier that
1146    starts at BASE, issuing any diagnostics the node is flagged for.  */
1147 static cpp_hashnode *
1148 lex_identifier_intern (cpp_reader *pfile, const uchar *base)
1149 {
1150   cpp_hashnode *node;
1151   const uchar *end;
1152   unsigned int length;
1153   unsigned int h = HT_HASHSTEP (0, *base);
1154
1155   /* The first character is taken as given; the identifier then runs
1156      for as long as ISIDNUM holds.  */
1157   for (end = base + 1; ISIDNUM (*end); end++)
1158     h = HT_HASHSTEP (h, *end);
1159
1160   length = end - base;
1161   h = HT_HASHFINISH (h, length);
1162   node = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
1163                                             base, length, h, HT_ALLOC));
1164
1165   /* Rarely, identifiers require diagnostics when lexed; the usual
1166      case is to return straight away.  */
1167   if (__builtin_expect (!(node->flags & NODE_DIAGNOSTIC)
1168                         || pfile->state.skipping, 1))
1169     return node;
1170
1171   /* It is allowed to poison the same identifier twice.  */
1172   if ((node->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
1173     cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
1174                NODE_NAME (node));
1175
1176   /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
1177      replacement list of a variadic macro.  */
1178   if (node == pfile->spec_nodes.n__VA_ARGS__ && !pfile->state.va_args_ok)
1179     cpp_error (pfile, CPP_DL_PEDWARN,
1180                "__VA_ARGS__ can only appear in the expansion"
1181                " of a C99 variadic macro");
1182
1183   /* For -Wc++-compat, warn about use of C++ named operators.  */
1184   if (node->flags & NODE_WARN_OPERATOR)
1185     cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
1186                  "identifier \"%s\" is a special operator name in C++",
1187                  NODE_NAME (node));
1188
1189   return node;
1190 }
1191
1192 /* Return the cpp_hashnode for the identifier spelled by NAME in PFILE.
1193    The work is done by lex_identifier_intern, which looks the name up
1194    with HT_ALLOC and dereferences the node, so NULL is not expected.  */
1195 cpp_hashnode *
1196 _cpp_lex_identifier (cpp_reader *pfile, const char *name)
1197 {
1198   const uchar *spelling = (const uchar *) name;
1199   return lex_identifier_intern (pfile, spelling);
1200 }
1201
1202 /* Lex an identifier starting at BUFFER->CUR - 1.
        BASE points at the first character of the spelling.  When
        STARTS_UCN is true the fast scan below is skipped and the slower
        path is always taken.  NST is the normalization state updated
        while scanning.  Returns the identifier's hash node after issuing
        any diagnostics it is flagged for; buffer->cur is left just past
        the identifier.  */
1203 static cpp_hashnode *
1204 lex_identifier (cpp_reader *pfile, const uchar *base, bool starts_ucn,
1205                 struct normalize_state *nst)
1206 {
1207   cpp_hashnode *result;
1208   const uchar *cur;
1209   unsigned int len;
1210   unsigned int hash = HT_HASHSTEP (0, *base);
1211
1212   cur = pfile->buffer->cur;
1213   if (! starts_ucn)
1214     {
           /* Fast scan over plain identifier characters, hashing as we
              go.  */
1215       while (ISIDNUM (*cur))
1216         {
1217           hash = HT_HASHSTEP (hash, *cur);
1218           cur++;
1219         }
           /* Only the last character scanned is fed to the normalization
              state here.  */
1220       NORMALIZE_STATE_UPDATE_IDNUM (nst, *(cur - 1));
1221     }
1222   pfile->buffer->cur = cur;
1223   if (starts_ucn || forms_identifier_p (pfile, false, nst))
1224     {
1225       /* Slower version for identifiers containing UCNs (or $).  */
1226       do {
1227         while (ISIDNUM (*pfile->buffer->cur))
1228           {
1229             NORMALIZE_STATE_UPDATE_IDNUM (nst, *pfile->buffer->cur);
1230             pfile->buffer->cur++;
1231           }
1232       } while (forms_identifier_p (pfile, false, nst));
           /* The hash computed above is not used on this path; the whole
              spelling is handed to _cpp_interpret_identifier.  */
1233       result = _cpp_interpret_identifier (pfile, base,
1234                                           pfile->buffer->cur - base);
1235     }
1236   else
1237     {
           /* Plain identifier: look up BASE .. CUR with the hash built
              during the fast scan.  */
1238       len = cur - base;
1239       hash = HT_HASHFINISH (hash, len);
1240
1241       result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
1242                                                   base, len, hash, HT_ALLOC));
1243     }
1244
1245   /* Rarely, identifiers require diagnostics when lexed.  */
1246   if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
1247                         && !pfile->state.skipping, 0))
1248     {
1249       /* It is allowed to poison the same identifier twice.  */
1250       if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
1251         cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
1252                    NODE_NAME (result));
1253
1254       /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
1255          replacement list of a variadic macro.  */
1256       if (result == pfile->spec_nodes.n__VA_ARGS__
1257           && !pfile->state.va_args_ok)
1258         cpp_error (pfile, CPP_DL_PEDWARN,
1259                    "__VA_ARGS__ can only appear in the expansion"
1260                    " of a C99 variadic macro");
1261
1262       /* For -Wc++-compat, warn about use of C++ named operators.  */
1263       if (result->flags & NODE_WARN_OPERATOR)
1264         cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
1265                      "identifier \"%s\" is a special operator name in C++",
1266                      NODE_NAME (result));
1267     }
1268
1269   return result;
1270 }
1271
1272 /* Lex the number whose first character is at BUFFER->CUR - 1, storing
1273    a NUL-terminated copy of its spelling in NUMBER.  */
1274 static void
1275 lex_number (cpp_reader *pfile, cpp_string *number,
1276             struct normalize_state *nst)
1277 {
1278   const uchar *const start = pfile->buffer->cur - 1;
1279   const uchar *p;
1280   uchar *copy;
1281
1282   do
1283     {
1284       /* N.B. ISIDNUM does not include $.  */
1285       for (p = pfile->buffer->cur;
1286            (ISIDNUM (*p) || *p == '.' || DIGIT_SEP (*p)
1287             || VALID_SIGN (*p, p[-1]));
1288            p++)
1289         {
1290           NORMALIZE_STATE_UPDATE_IDNUM (nst, *p);
1291         }
1292
1293       pfile->buffer->cur = p;
1294     }
1295   /* A '$' or UCN accepted here moves buffer->cur; scan on from it.  */
1296   while (forms_identifier_p (pfile, false, nst));
1297
1298   number->len = p - start;
1299   copy = _cpp_unaligned_alloc (pfile, number->len + 1);
1300   copy[number->len] = '\0';
1301   memcpy (copy, start, number->len);
1302   number->text = copy;
1303 }
1304
1305 /* Fill in TOKEN: give it type TYPE and, as its spelling, a newly
1306    allocated NUL-terminated copy of the LEN characters at BASE.  */
1307 static void
1308 create_literal (cpp_reader *pfile, cpp_token *token, const uchar *base,
1309                 unsigned int len, enum cpp_ttype type)
1310 {
1311   uchar *const copy = _cpp_unaligned_alloc (pfile, len + 1);
1312   copy[len] = '\0';
1313   memcpy (copy, base, len);
1314   token->val.str.text = copy;
1315   token->val.str.len = len;
1316   token->type = type;
1317 }
1318
1319 /* Helper for lex_raw_string.  Copies LEN characters starting at BASE
1320    onto the end of the buffer chain running from *FIRST_BUFF_P to
1321    *LAST_BUFF_P, starting or extending the chain as needed.  */
1322 static void
1323 bufring_append (cpp_reader *pfile, const uchar *base, size_t len,
1324                 _cpp_buff **first_buff_p, _cpp_buff **last_buff_p)
1325 {
1326   _cpp_buff *tail = *last_buff_p;
1327
1328   if (*first_buff_p == NULL)
1329     /* No chain yet: the first buffer is both head and tail.  */
1330     *first_buff_p = tail = _cpp_get_buff (pfile, len);
1331   else if (BUFF_ROOM (tail) < len)
1332     {
1333       /* Top up the current tail, then chain on a buffer for the rest.  */
1334       const size_t fits = BUFF_ROOM (tail);
1335
1336       memcpy (BUFF_FRONT (tail), base, fits);
1337       BUFF_FRONT (tail) += fits;
1338       base += fits;
1339       len -= fits;
1340       tail = _cpp_append_extend_buff (pfile, tail, len);
1341     }
1342
1343   memcpy (BUFF_FRONT (tail), base, len);
1344   BUFF_FRONT (tail) += len;
1345   *last_buff_p = tail;
1346 }
1347
1348
1349 /* Return true when the identifier starting at BASE is currently
1350    defined as a macro.  This might not work if compiling with
1351    -save-temps, or preprocessing separately from compilation.  */
1352 static bool
1353 is_macro (cpp_reader *pfile, const uchar *base)
1354 {
1355   const uchar *end;
1356   unsigned int hash;
1357   cpp_hashnode *node;
1358
1359   if (! ISIDST (*base))
1360     return false;
1361
1362   /* Hash the identifier while finding where it stops.  */
1363   hash = HT_HASHSTEP (0, *base);
1364   for (end = base + 1; ISIDNUM (*end); end++)
1365     hash = HT_HASHSTEP (hash, *end);
1366   hash = HT_HASHFINISH (hash, end - base);
1367
1368   /* The lookup uses HT_NO_INSERT, so the result may be null.  */
1369   node = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table, base,
1370                                             end - base, hash, HT_NO_INSERT));
1371   return node != NULL && node->type == NT_MACRO;
1372 }
1373
1374
1375 /* Lexes a raw string.  The stored string contains the spelling, including
1376    double quotes, delimiter string, '(' and ')', any leading
1377    'L', 'u', 'U' or 'u8' and 'R' modifier.  Nothing is returned; the
1378    result is stored in TOKEN, whose type is the type of the
        literal, or CPP_OTHER if it was not properly terminated, or
        CPP_EOF if the input ran out inside the string.

        BASE points at the first character of the spelling.  CUR is
        advanced once before scanning starts, so it is presumably the
        opening double quote (lex_string passes the position just past
        the 'R').
1379
1380    The spelling is NUL-terminated, but it is not guaranteed that this
1381    is the first NUL since embedded NULs are preserved.  */
1382
1383 static void
1384 lex_raw_string (cpp_reader *pfile, cpp_token *token, const uchar *base,
1385                 const uchar *cur)
1386 {
       /* raw_prefix collects the delimiter (at most 16 characters).  Once
          the '(' is reached a '"' is stored after it, so that it holds
          exactly the text that must follow a ')' to close the string.
          temp_buffer stages the characters being examined while the
          delimiter or a candidate terminator is read.  */
1387   uchar raw_prefix[17];
1388   uchar temp_buffer[18];
1389   const uchar *orig_base;
1390   unsigned int raw_prefix_len = 0, raw_suffix_len = 0;
1391   enum raw_str_phase { RAW_STR_PREFIX, RAW_STR, RAW_STR_SUFFIX };
1392   raw_str_phase phase = RAW_STR_PREFIX;
1393   enum cpp_ttype type;
1394   size_t total_len = 0;
1395   /* Index into temp_buffer during phases other than RAW_STR,
1396      during RAW_STR phase 17 to tell BUF_APPEND that nothing should
1397      be appended to temp_buffer.  */
1398   size_t temp_buffer_len = 0;
1399   _cpp_buff *first_buff = NULL, *last_buff = NULL;
1400   size_t raw_prefix_start;
1401   _cpp_line_note *note = &pfile->buffer->notes[pfile->buffer->cur_note];
1402
1403   type = (*base == 'L' ? CPP_WSTRING :
1404           *base == 'U' ? CPP_STRING32 :
1405           *base == 'u' ? (base[1] == '8' ? CPP_UTF8STRING : CPP_STRING16)
1406           : CPP_STRING);
1407
       /* Append LEN characters at STR to the buffer chain and count them
          in total_len.  Short pieces (at most two characters) that do not
          come straight from BASE are also staged in temp_buffer while it
          is accepting input.  Note that the expansion already ends in a
          semicolon.  */
1408 #define BUF_APPEND(STR,LEN)                                     \
1409       do {                                                      \
1410         bufring_append (pfile, (const uchar *)(STR), (LEN),     \
1411                         &first_buff, &last_buff);               \
1412         total_len += (LEN);                                     \
1413         if (__builtin_expect (temp_buffer_len < 17, 0)          \
1414             && (const uchar *)(STR) != base                     \
1415             && (LEN) <= 2)                                      \
1416           {                                                     \
1417             memcpy (temp_buffer + temp_buffer_len,              \
1418                     (const uchar *)(STR), (LEN));               \
1419             temp_buffer_len += (LEN);                           \
1420           }                                                     \
1421       } while (0);
1422
       /* Remember where the spelling starts, step over one character and
          record the offset at which the delimiter begins.  */
1423   orig_base = base;
1424   ++cur;
1425   raw_prefix_start = cur - base;
1426   for (;;)
1427     {
1428       cppchar_t c;
1429
1430       /* If we previously performed any trigraph or line splicing
1431          transformations, undo them in between the opening and closing
1432          double quote.  */
1433       while (note->pos < cur)
1434         ++note;
1435       for (; note->pos == cur; ++note)
1436         {
1437           switch (note->type)
1438             {
1439             case '\\':
1440             case ' ':
1441               /* Restore backslash followed by newline.  */
1442               BUF_APPEND (base, cur - base);
1443               base = cur;
1444               BUF_APPEND ("\\", 1);
1445             after_backslash:
1446               if (note->type == ' ')
1447                 {
1448                   /* GNU backslash whitespace newline extension.  FIXME
1449                      could be any sequence of non-vertical space.  When we
1450                      can properly restore any such sequence, we should mark
1451                      this note as handled so _cpp_process_line_notes
1452                      doesn't warn.  */
1453                   BUF_APPEND (" ", 1);
1454                 }
1455
1456               BUF_APPEND ("\n", 1);
1457               break;
1458
1459             case 0:
1460               /* Already handled.  */
1461               break;
1462
1463             default:
1464               if (_cpp_trigraph_map[note->type])
1465                 {
1466                   /* Don't warn about this trigraph in
1467                      _cpp_process_line_notes, since trigraphs show up as
1468                      trigraphs in raw strings.  */
                       /* This local TYPE (the trigraph's third character)
                          shadows the function-level token TYPE inside
                          this block.  */
1469                   uchar type = note->type;
1470                   note->type = 0;
1471
1472                   if (!CPP_OPTION (pfile, trigraphs))
1473                     /* If we didn't convert the trigraph in the first
1474                        place, don't do anything now either.  */
1475                     break;
1476
1477                   BUF_APPEND (base, cur - base);
1478                   base = cur;
1479                   BUF_APPEND ("??", 2);
1480
1481                   /* ??/ followed by newline gets two line notes, one for
1482                      the trigraph and one for the backslash/newline.  */
1483                   if (type == '/' && note[1].pos == cur)
1484                     {
1485                       if (note[1].type != '\\'
1486                           && note[1].type != ' ')
1487                         abort ();
1488                       BUF_APPEND ("/", 1);
1489                       ++note;
1490                       goto after_backslash;
1491                     }
1492                   else
1493                     {
1494                       /* Skip the replacement character.  */
1495                       base = ++cur;
1496                       BUF_APPEND (&type, 1);
1497                       c = type;
1498                       goto check_c;
1499                     }
1500                 }
1501               else
1502                 abort ();
1503               break;
1504             }
1505         }
           /* Fetch the next character, staging it while temp_buffer is
              accepting input.  */
1506       c = *cur++;
1507       if (__builtin_expect (temp_buffer_len < 17, 0))
1508         temp_buffer[temp_buffer_len++] = c;
1509
1510      check_c:
           /* Still reading the delimiter: move staged characters into
              raw_prefix until '(' or an unacceptable character appears.  */
1511       if (phase == RAW_STR_PREFIX)
1512         {
1513           while (raw_prefix_len < temp_buffer_len)
1514             {
1515               raw_prefix[raw_prefix_len] = temp_buffer[raw_prefix_len];
1516               switch (raw_prefix[raw_prefix_len])
1517                 {
                     /* These characters, and anything not listed below,
                        leave the switch to be examined after it.  */
1518                 case ' ': case '(': case ')': case '\\': case '\t':
1519                 case '\v': case '\f': case '\n': default:
1520                   break;
1521                 /* Basic source charset except the above chars.  */
1522                 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
1523                 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
1524                 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
1525                 case 's': case 't': case 'u': case 'v': case 'w': case 'x':
1526                 case 'y': case 'z':
1527                 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
1528                 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
1529                 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
1530                 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
1531                 case 'Y': case 'Z':
1532                 case '0': case '1': case '2': case '3': case '4': case '5':
1533                 case '6': case '7': case '8': case '9':
1534                 case '_': case '{': case '}': case '#': case '[': case ']':
1535                 case '<': case '>': case '%': case ':': case ';': case '.':
1536                 case '?': case '*': case '+': case '-': case '/': case '^':
1537                 case '&': case '|': case '~': case '!': case '=': case ',':
1538                 case '"': case '\'':
                       /* A valid delimiter character is kept only while
                          fewer than 16 have been collected; a 17th one
                          falls out of the switch and is diagnosed.  */
1539                   if (raw_prefix_len < 16)
1540                     {
1541                       raw_prefix_len++;
1542                       continue;
1543                     }
1544                   break;
1545                 }
1546
1547               if (raw_prefix[raw_prefix_len] != '(')
1548                 {
1549                   int col = CPP_BUF_COLUMN (pfile->buffer, cur) + 1;
1550                   if (raw_prefix_len == 16)
1551                     cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
1552                                          col, "raw string delimiter longer "
1553                                               "than 16 characters");
1554                   else if (raw_prefix[raw_prefix_len] == '\n')
1555                     cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
1556                                          col, "invalid new-line in raw "
1557                                               "string delimiter");
1558                   else
1559                     cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
1560                                          col, "invalid character '%c' in "
1561                                               "raw string delimiter",
1562                                          (int) raw_prefix[raw_prefix_len]);
                       /* Give up on the raw string: reset buffer->cur to
                          one character before the delimiter start and
                          turn the characters in front of that position
                          into a CPP_OTHER token.  */
1563                   pfile->buffer->cur = orig_base + raw_prefix_start - 1;
1564                   create_literal (pfile, token, orig_base,
1565                                   raw_prefix_start - 1, CPP_OTHER);
1566                   if (first_buff)
1567                     _cpp_release_buff (pfile, first_buff);
1568                   return;
1569                 }
                   /* Delimiter complete: replace the '(' slot by '"' so
                      raw_prefix now spells the required terminator.  */
1570               raw_prefix[raw_prefix_len] = '"';
1571               phase = RAW_STR;
1572               /* Nothing should be appended to temp_buffer during
1573                  RAW_STR phase.  */
1574               temp_buffer_len = 17;
1575               break;
1576             }
1577           continue;
1578         }
           /* After a ')': compare the staged characters with the
              delimiter followed by '"'.  A complete match ends the
              literal, a match of everything staged so far waits for more
              input, and a mismatch drops back to RAW_STR with C examined
              again below.  */
1579       else if (phase == RAW_STR_SUFFIX)
1580         {
1581           while (raw_suffix_len <= raw_prefix_len
1582                  && raw_suffix_len < temp_buffer_len
1583                  && temp_buffer[raw_suffix_len] == raw_prefix[raw_suffix_len])
1584             raw_suffix_len++;
1585           if (raw_suffix_len > raw_prefix_len)
1586             break;
1587           if (raw_suffix_len == temp_buffer_len)
1588             continue;
1589           phase = RAW_STR;
1590           /* Nothing should be appended to temp_buffer during
1591              RAW_STR phase.  */
1592           temp_buffer_len = 17;
1593         }
           /* A ')' may start the terminator: begin staging what follows.  */
1594       if (c == ')')
1595         {
1596           phase = RAW_STR_SUFFIX;
1597           raw_suffix_len = 0;
1598           temp_buffer_len = 0;
1599         }
1600       else if (c == '\n')
1601         {
               /* The string cannot continue onto another line inside a
                  directive, nor while parsing macro arguments when no
                  further line is available.  */
1602           if (pfile->state.in_directive
1603               || (pfile->state.parsing_args
1604                   && pfile->buffer->next_line >= pfile->buffer->rlimit))
1605             {
1606               cur--;
1607               type = CPP_OTHER;
1608               cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, 0,
1609                                    "unterminated raw string");
1610               break;
1611             }
1612
               /* Save this line's part of the spelling before moving on
                  to the next line.  */
1613           BUF_APPEND (base, cur - base);
1614
1615           if (pfile->buffer->cur < pfile->buffer->rlimit)
1616             CPP_INCREMENT_LINE (pfile, 0);
1617           pfile->buffer->need_line = true;
1618
1619           pfile->buffer->cur = cur-1;
1620           _cpp_process_line_notes (pfile, false);
1621           if (!_cpp_get_fresh_line (pfile))
1622             {
1623               source_location src_loc = token->src_loc;
1624               token->type = CPP_EOF;
1625               /* Tell the compiler the line number of the EOF token.  */
1626               token->src_loc = pfile->line_table->highest_line;
1627               token->flags = BOL;
1628               if (first_buff != NULL)
1629                 _cpp_release_buff (pfile, first_buff);
1630               cpp_error_with_line (pfile, CPP_DL_ERROR, src_loc, 0,
1631                                    "unterminated raw string");
1632               return;
1633             }
1634
               /* Continue from the start of the fresh line with its
                  notes.  */
1635           cur = base = pfile->buffer->cur;
1636           note = &pfile->buffer->notes[pfile->buffer->cur_note];
1637         }
1638     }
1639
1640   if (CPP_OPTION (pfile, user_literals))
1641     {
1642       /* If a string format macro, say from inttypes.h, is placed touching
1643          a string literal it could be parsed as a C++11 user-defined string
1644          literal thus breaking the program.
1645          Try to identify macros with is_macro. A warning is issued. */
1646       if (is_macro (pfile, cur))
1647         {
1648           /* Raise a warning, but do not consume subsequent tokens.  */
1649           if (CPP_OPTION (pfile, warn_literal_suffix))
1650             cpp_warning_with_line (pfile, CPP_W_LITERAL_SUFFIX,
1651                                    token->src_loc, 0,
1652                                    "invalid suffix on literal; C++11 requires "
1653                                    "a space between literal and string macro");
1654         }
1655       /* Grab user defined literal suffix.  */
1656       else if (ISIDST (*cur))
1657         {
1658           type = cpp_userdef_string_add_type (type);
1659           ++cur;
1660
1661           while (ISIDNUM (*cur))
1662             ++cur;
1663         }
1664     }
1665
1666   pfile->buffer->cur = cur;
       /* If nothing went through the buffer chain the spelling is the
          contiguous range BASE .. CUR.  */
1667   if (first_buff == NULL)
1668     create_literal (pfile, token, base, cur - base, type);
1669   else
1670     {
           /* Otherwise concatenate every buffer of the chain and then the
              tail BASE .. CUR into one NUL-terminated allocation.  */
1671       uchar *dest = _cpp_unaligned_alloc (pfile, total_len + (cur - base) + 1);
1672
1673       token->type = type;
1674       token->val.str.len = total_len + (cur - base);
1675       token->val.str.text = dest;
1676       last_buff = first_buff;
1677       while (last_buff != NULL)
1678         {
1679           memcpy (dest, last_buff->base,
1680                   BUFF_FRONT (last_buff) - last_buff->base);
1681           dest += BUFF_FRONT (last_buff) - last_buff->base;
1682           last_buff = last_buff->next;
1683         }
1684       _cpp_release_buff (pfile, first_buff);
1685       memcpy (dest, base, cur - base);
1686       dest[cur - base] = '\0';
1687     }
1688 }
1689
1690 /* Lexes a string, character constant, or angle-bracketed header file
1691    name.  The stored string contains the spelling, including opening
1692    quote and any leading 'L', 'u', 'U' or 'u8' and optional
1693    'R' modifier.  It returns the type of the literal, or CPP_OTHER
1694    if it was not properly terminated, or CPP_LESS for an unterminated
1695    header name which must be relexed as normal tokens.
1696
1697    The spelling is NUL-terminated, but it is not guaranteed that this
1698    is the first NUL since embedded NULs are preserved.  */
1699 static void
1700 lex_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
1701 {
1702   bool saw_NUL = false;
1703   const uchar *cur;
1704   cppchar_t terminator;
1705   enum cpp_ttype type;
1706
1707   cur = base;
1708   terminator = *cur++;
1709   if (terminator == 'L' || terminator == 'U')
1710     terminator = *cur++;
1711   else if (terminator == 'u')
1712     {
1713       terminator = *cur++;
1714       if (terminator == '8')
1715         terminator = *cur++;
1716     }
1717   if (terminator == 'R')
1718     {
1719       lex_raw_string (pfile, token, base, cur);
1720       return;
1721     }
1722   if (terminator == '"')
1723     type = (*base == 'L' ? CPP_WSTRING :
1724             *base == 'U' ? CPP_STRING32 :
1725             *base == 'u' ? (base[1] == '8' ? CPP_UTF8STRING : CPP_STRING16)
1726                          : CPP_STRING);
1727   else if (terminator == '\'')
1728     type = (*base == 'L' ? CPP_WCHAR :
1729             *base == 'U' ? CPP_CHAR32 :
1730             *base == 'u' ? CPP_CHAR16 : CPP_CHAR);
1731   else
1732     terminator = '>', type = CPP_HEADER_NAME;
1733
1734   for (;;)
1735     {
1736       cppchar_t c = *cur++;
1737
1738       /* In #include-style directives, terminators are not escapable.  */
1739       if (c == '\\' && !pfile->state.angled_headers && *cur != '\n')
1740         cur++;
1741       else if (c == terminator)
1742         break;
1743       else if (c == '\n')
1744         {
1745           cur--;
1746           /* Unmatched quotes always yield undefined behavior, but
1747              greedy lexing means that what appears to be an unterminated
1748              header name may actually be a legitimate sequence of tokens.  */
1749           if (terminator == '>')
1750             {
1751               token->type = CPP_LESS;
1752               return;
1753             }
1754           type = CPP_OTHER;
1755           break;
1756         }
1757       else if (c == '\0')
1758         saw_NUL = true;
1759     }
1760
1761   if (saw_NUL && !pfile->state.skipping)
1762     cpp_error (pfile, CPP_DL_WARNING,
1763                "null character(s) preserved in literal");
1764
1765   if (type == CPP_OTHER && CPP_OPTION (pfile, lang) != CLK_ASM)
1766     cpp_error (pfile, CPP_DL_PEDWARN, "missing terminating %c character",
1767                (int) terminator);
1768
1769   if (CPP_OPTION (pfile, user_literals))
1770     {
1771       /* If a string format macro, say from inttypes.h, is placed touching
1772          a string literal it could be parsed as a C++11 user-defined string
1773          literal thus breaking the program.
1774          Try to identify macros with is_macro. A warning is issued. */
1775       if (is_macro (pfile, cur))
1776         {
1777           /* Raise a warning, but do not consume subsequent tokens.  */
1778           if (CPP_OPTION (pfile, warn_literal_suffix))
1779             cpp_warning_with_line (pfile, CPP_W_LITERAL_SUFFIX,
1780                                    token->src_loc, 0,
1781                                    "invalid suffix on literal; C++11 requires "
1782                                    "a space between literal and string macro");
1783         }
1784       /* Grab user defined literal suffix.  */
1785       else if (ISIDST (*cur))
1786         {
1787           type = cpp_userdef_char_add_type (type);
1788           type = cpp_userdef_string_add_type (type);
1789           ++cur;
1790
1791           while (ISIDNUM (*cur))
1792             ++cur;
1793         }
1794     }
1795
1796   pfile->buffer->cur = cur;
1797   create_literal (pfile, token, base, cur - base, type);
1798 }
1799
1800 /* Return the comment table. The client may not make any assumption
1801    about the ordering of the table.  */
1802 cpp_comment_table *
1803 cpp_get_comments (cpp_reader *pfile)
1804 {
1805   return &pfile->comments;
1806 }
1807
1808 /* Append a comment to the end of the comment table. */
1809 static void
1810 store_comment (cpp_reader *pfile, cpp_token *token)
1811 {
1812   int len;
1813
1814   if (pfile->comments.allocated == 0)
1815     {
1816       pfile->comments.allocated = 256;
1817       pfile->comments.entries = (cpp_comment *) xmalloc
1818         (pfile->comments.allocated * sizeof (cpp_comment));
1819     }
1820
1821   if (pfile->comments.count == pfile->comments.allocated)
1822     {
1823       pfile->comments.allocated *= 2;
1824       pfile->comments.entries = (cpp_comment *) xrealloc
1825         (pfile->comments.entries,
1826          pfile->comments.allocated * sizeof (cpp_comment));
1827     }
1828
1829   len = token->val.str.len;
1830
1831   /* Copy comment. Note, token may not be NULL terminated. */
1832   pfile->comments.entries[pfile->comments.count].comment =
1833     (char *) xmalloc (sizeof (char) * (len + 1));
1834   memcpy (pfile->comments.entries[pfile->comments.count].comment,
1835           token->val.str.text, len);
1836   pfile->comments.entries[pfile->comments.count].comment[len] = '\0';
1837
1838   /* Set source location. */
1839   pfile->comments.entries[pfile->comments.count].sloc = token->src_loc;
1840
1841   /* Increment the count of entries in the comment table. */
1842   pfile->comments.count++;
1843 }
1844
1845 /* The stored comment includes the comment start and any terminator.  */
1846 static void
1847 save_comment (cpp_reader *pfile, cpp_token *token, const unsigned char *from,
1848               cppchar_t type)
1849 {
1850   unsigned char *buffer;
1851   unsigned int len, clen, i;
1852
1853   len = pfile->buffer->cur - from + 1; /* + 1 for the initial '/'.  */
1854
1855   /* C++ comments probably (not definitely) have moved past a new
1856      line, which we don't want to save in the comment.  */
1857   if (is_vspace (pfile->buffer->cur[-1]))
1858     len--;
1859
1860   /* If we are currently in a directive or in argument parsing, then
1861      we need to store all C++ comments as C comments internally, and
1862      so we need to allocate a little extra space in that case.
1863
1864      Note that the only time we encounter a directive here is
1865      when we are saving comments in a "#define".  */
1866   clen = ((pfile->state.in_directive || pfile->state.parsing_args)
1867           && type == '/') ? len + 2 : len;
1868
1869   buffer = _cpp_unaligned_alloc (pfile, clen);
1870
1871   token->type = CPP_COMMENT;
1872   token->val.str.len = clen;
1873   token->val.str.text = buffer;
1874
1875   buffer[0] = '/';
1876   memcpy (buffer + 1, from, len - 1);
1877
1878   /* Finish conversion to a C comment, if necessary.  */
1879   if ((pfile->state.in_directive || pfile->state.parsing_args) && type == '/')
1880     {
1881       buffer[1] = '*';
1882       buffer[clen - 2] = '*';
1883       buffer[clen - 1] = '/';
1884       /* As there can be in a C++ comments illegal sequences for C comments
1885          we need to filter them out.  */
1886       for (i = 2; i < (clen - 2); i++)
1887         if (buffer[i] == '/' && (buffer[i - 1] == '*' || buffer[i + 1] == '*'))
1888           buffer[i] = '|';
1889     }
1890
1891   /* Finally store this comment for use by clients of libcpp. */
1892   store_comment (pfile, token);
1893 }
1894
1895 /* Allocate COUNT tokens for RUN.  */
1896 void
1897 _cpp_init_tokenrun (tokenrun *run, unsigned int count)
1898 {
1899   run->base = XNEWVEC (cpp_token, count);
1900   run->limit = run->base + count;
1901   run->next = NULL;
1902 }
1903
1904 /* Returns the next tokenrun, or creates one if there is none.  */
1905 static tokenrun *
1906 next_tokenrun (tokenrun *run)
1907 {
1908   if (run->next == NULL)
1909     {
1910       run->next = XNEW (tokenrun);
1911       run->next->prev = run;
1912       _cpp_init_tokenrun (run->next, 250);
1913     }
1914
1915   return run->next;
1916 }
1917
1918 /* Return the number of not yet processed token in a given
1919    context.  */
1920 int
1921 _cpp_remaining_tokens_num_in_context (cpp_context *context)
1922 {
1923   if (context->tokens_kind == TOKENS_KIND_DIRECT)
1924     return (LAST (context).token - FIRST (context).token);
1925   else if (context->tokens_kind == TOKENS_KIND_INDIRECT
1926            || context->tokens_kind == TOKENS_KIND_EXTENDED)
1927     return (LAST (context).ptoken - FIRST (context).ptoken);
1928   else
1929       abort ();
1930 }
1931
1932 /* Returns the token present at index INDEX in a given context.  If
1933    INDEX is zero, the next token to be processed is returned.  */
1934 static const cpp_token*
1935 _cpp_token_from_context_at (cpp_context *context, int index)
1936 {
1937   if (context->tokens_kind == TOKENS_KIND_DIRECT)
1938     return &(FIRST (context).token[index]);
1939   else if (context->tokens_kind == TOKENS_KIND_INDIRECT
1940            || context->tokens_kind == TOKENS_KIND_EXTENDED)
1941     return FIRST (context).ptoken[index];
1942  else
1943    abort ();
1944 }
1945
1946 /* Look ahead in the input stream.  */
1947 const cpp_token *
1948 cpp_peek_token (cpp_reader *pfile, int index)
1949 {
1950   cpp_context *context = pfile->context;
1951   const cpp_token *peektok;
1952   int count;
1953
1954   /* First, scan through any pending cpp_context objects.  */
1955   while (context->prev)
1956     {
1957       ptrdiff_t sz = _cpp_remaining_tokens_num_in_context (context);
1958
1959       if (index < (int) sz)
1960         return _cpp_token_from_context_at (context, index);
1961       index -= (int) sz;
1962       context = context->prev;
1963     }
1964
1965   /* We will have to read some new tokens after all (and do so
1966      without invalidating preceding tokens).  */
1967   count = index;
1968   pfile->keep_tokens++;
1969
1970   do
1971     {
1972       peektok = _cpp_lex_token (pfile);
1973       if (peektok->type == CPP_EOF)
1974         return peektok;
1975     }
1976   while (index--);
1977
1978   _cpp_backup_tokens_direct (pfile, count + 1);
1979   pfile->keep_tokens--;
1980
1981   return peektok;
1982 }
1983
1984 /* Allocate a single token that is invalidated at the same time as the
1985    rest of the tokens on the line.  Has its line and col set to the
1986    same as the last lexed token, so that diagnostics appear in the
1987    right place.

        The token is taken from the slot at pfile->cur_token, which is
        then advanced.  If lookahead tokens are pending they are first
        moved up by one slot, spilling into the next token run when
        needed, so that none of them is overwritten.  Only src_loc is
        set here; no other field of the returned token is initialized.  */
1988 cpp_token *
1989 _cpp_temp_token (cpp_reader *pfile)
1990 {
1991   cpp_token *old, *result;
       /* SZ: number of slots from cur_token to the end of the current
          run.  LA: number of pending lookahead tokens.  */
1992   ptrdiff_t sz = pfile->cur_run->limit - pfile->cur_token;
1993   ptrdiff_t la = (ptrdiff_t) pfile->lookaheads;
1994
       /* OLD is the slot just before cur_token; its location is copied
          into the new token at the end.  */
1995   old = pfile->cur_token - 1;
1996   /* Any pre-existing lookaheads must not be clobbered.  */
1997   if (la)
1998     {
           /* The lookaheads fill the rest of this run (or more), so
              moving them up by one reaches into the next run.  */
1999       if (sz <= la)
2000         {
2001           tokenrun *next = next_tokenrun (pfile->cur_run);
2002
               /* Make room at the start of the next run by moving the
                  LA - SZ tokens there (presumably the lookaheads that
                  already spilled over) up by one slot ...  */
2003           if (sz < la)
2004             memmove (next->base + 1, next->base,
2005                      (la - sz) * sizeof (cpp_token));
2006
               /* ... then copy the last token of this run into the
                  freed first slot of the next run.  */
2007           next->base[0] = pfile->cur_run->limit[-1];
2008         }
2009
           /* Move the tokens that stay inside this run up by one slot;
              at most SZ - 1 of them fit.  */
2010       if (sz > 1)
2011         memmove (pfile->cur_token + 1, pfile->cur_token,
2012                  MIN (la, sz - 1) * sizeof (cpp_token));
2013     }
2014
       /* No slot is left in the current run: continue in the next.  */
2015   if (!sz && pfile->cur_token == pfile->cur_run->limit)
2016     {
2017       pfile->cur_run = next_tokenrun (pfile->cur_run);
2018       pfile->cur_token = pfile->cur_run->base;
2019     }
2020
       /* Hand out the slot at cur_token and give it OLD's location.  */
2021   result = pfile->cur_token++;
2022   result->src_loc = old->src_loc;
2023   return result;
2024 }
2025
2026 /* Lex a token into RESULT (external interface).  Takes care of issues
2027    like directive handling, token lookahead, multiple include
2028    optimization and skipping.  */
2029 const cpp_token *
2030 _cpp_lex_token (cpp_reader *pfile)
2031 {
2032   cpp_token *result;
2033
2034   for (;;)
2035     {
2036       if (pfile->cur_token == pfile->cur_run->limit)
2037         {
2038           pfile->cur_run = next_tokenrun (pfile->cur_run);
2039           pfile->cur_token = pfile->cur_run->base;
2040         }
2041       /* We assume that the current token is somewhere in the current
2042          run.  */
2043       if (pfile->cur_token < pfile->cur_run->base
2044           || pfile->cur_token >= pfile->cur_run->limit)
2045         abort ();
2046
2047       if (pfile->lookaheads)
2048         {
2049           pfile->lookaheads--;
2050           result = pfile->cur_token++;
2051         }
2052       else
2053         result = _cpp_lex_direct (pfile);
2054
2055       if (result->flags & BOL)
2056         {
2057           /* Is this a directive.  If _cpp_handle_directive returns
2058              false, it is an assembler #.  */
2059           if (result->type == CPP_HASH
2060               /* 6.10.3 p 11: Directives in a list of macro arguments
2061                  gives undefined behavior.  This implementation
2062                  handles the directive as normal.  */
2063               && pfile->state.parsing_args != 1)
2064             {
2065               if (_cpp_handle_directive (pfile, result->flags & PREV_WHITE))
2066                 {
2067                   if (pfile->directive_result.type == CPP_PADDING)
2068                     continue;
2069                   result = &pfile->directive_result;
2070                 }
2071             }
2072           else if (pfile->state.in_deferred_pragma)
2073             result = &pfile->directive_result;
2074
2075           if (pfile->cb.line_change && !pfile->state.skipping)
2076             pfile->cb.line_change (pfile, result, pfile->state.parsing_args);
2077         }
2078
2079       /* We don't skip tokens in directives.  */
2080       if (pfile->state.in_directive || pfile->state.in_deferred_pragma)
2081         break;
2082
2083       /* Outside a directive, invalidate controlling macros.  At file
2084          EOF, _cpp_lex_direct takes care of popping the buffer, so we never
2085          get here and MI optimization works.  */
2086       pfile->mi_valid = false;
2087
2088       if (!pfile->state.skipping || result->type == CPP_EOF)
2089         break;
2090     }
2091
2092   return result;
2093 }
2094
2095 /* Returns true if a fresh line has been loaded.  */
2096 bool
2097 _cpp_get_fresh_line (cpp_reader *pfile)
2098 {
2099   int return_at_eof;
2100
2101   /* We can't get a new line until we leave the current directive.  */
2102   if (pfile->state.in_directive)
2103     return false;
2104
2105   for (;;)
2106     {
2107       cpp_buffer *buffer = pfile->buffer;
2108
2109       if (!buffer->need_line)
2110         return true;
2111
2112       if (buffer->next_line < buffer->rlimit)
2113         {
2114           _cpp_clean_line (pfile);
2115           return true;
2116         }
2117
2118       /* First, get out of parsing arguments state.  */
2119       if (pfile->state.parsing_args)
2120         return false;
2121
2122       /* End of buffer.  Non-empty files should end in a newline.  */
2123       if (buffer->buf != buffer->rlimit
2124           && buffer->next_line > buffer->rlimit
2125           && !buffer->from_stage3)
2126         {
2127           /* Clip to buffer size.  */
2128           buffer->next_line = buffer->rlimit;
2129         }
2130
2131       return_at_eof = buffer->return_at_eof;
2132       _cpp_pop_buffer (pfile);
2133       if (pfile->buffer == NULL || return_at_eof)
2134         return false;
2135     }
2136 }
2137
/* Helper for _cpp_lex_direct: set result->type to THEN_TYPE and
   consume one character if the next input character is CHAR,
   otherwise set it to ELSE_TYPE.  Relies on variables named `result'
   and `buffer' being in scope at the point of use.  Every argument
   is parenthesized so that compound expressions expand as the caller
   wrote them.  */
#define IF_NEXT_IS(CHAR, THEN_TYPE, ELSE_TYPE)          \
  do                                                    \
    {                                                   \
      result->type = (ELSE_TYPE);                       \
      if (*buffer->cur == (CHAR))                       \
        buffer->cur++, result->type = (THEN_TYPE);      \
    }                                                   \
  while (0)
2146
2147 /* Lex a token into pfile->cur_token, which is also incremented, to
2148    get diagnostics pointing to the correct location.
2149
2150    Does not handle issues such as token lookahead, multiple-include
2151    optimization, directives, skipping etc.  This function is only
2152    suitable for use by _cpp_lex_token, and in special cases like
2153    lex_expansion_token which doesn't care for any of these issues.
2154
2155    When meeting a newline, returns CPP_EOF if parsing a directive,
2156    otherwise returns to the start of the token buffer if permissible.
2157    Returns the location of the lexed token, that is, a pointer to the
        cpp_token that was filled in.

        The body is one dispatch on the first character of the token,
        with three re-entry labels: fresh_line (a new line is needed),
        update_tokens_line (an unsaved comment was skipped) and
        skipped_white (horizontal whitespace was skipped).  */
2158 cpp_token *
2159 _cpp_lex_direct (cpp_reader *pfile)
2160 {
2161   cppchar_t c;
2162   cpp_buffer *buffer;
2163   const unsigned char *comment_start;
       /* Claim the slot at cur_token for the token about to be lexed.  */
2164   cpp_token *result = pfile->cur_token++;
2165
       /* Entered at the start and again after every newline: make sure
          there is a line to lex from.  */
2166  fresh_line:
2167   result->flags = 0;
2168   buffer = pfile->buffer;
2169   if (buffer->need_line)
2170     {
           /* The line of a deferred pragma has ended: report that
              instead of fetching a new line.  */
2171       if (pfile->state.in_deferred_pragma)
2172         {
2173           result->type = CPP_PRAGMA_EOL;
2174           pfile->state.in_deferred_pragma = false;
2175           if (!pfile->state.pragma_allow_expansion)
2176             pfile->state.prevent_expansion--;
2177           return result;
2178         }
           /* No further line could be loaded: yield CPP_EOF.  */
2179       if (!_cpp_get_fresh_line (pfile))
2180         {
2181           result->type = CPP_EOF;
2182           if (!pfile->state.in_directive)
2183             {
2184               /* Tell the compiler the line number of the EOF token.  */
2185               result->src_loc = pfile->line_table->highest_line;
2186               result->flags = BOL;
2187             }
2188           return result;
2189         }
           /* Earlier tokens need not be kept: restart at the beginning
              of the base token run.  */
2190       if (!pfile->keep_tokens)
2191         {
2192           pfile->cur_run = &pfile->base_run;
2193           result = pfile->base_run.base;
2194           pfile->cur_token = result + 1;
2195         }
           /* The token lexed next is the first one on its line.  */
2196       result->flags = BOL;
2197       if (pfile->state.parsing_args == 2)
2198         result->flags |= PREV_WHITE;
2199     }
       /* Reload: _cpp_get_fresh_line may have popped the buffer.  */
2200   buffer = pfile->buffer;
       /* Re-entered after a comment that is not being saved.  */
2201  update_tokens_line:
2202   result->src_loc = pfile->line_table->highest_line;
2203
       /* Re-entered after horizontal whitespace has been skipped.  */
2204  skipped_white:
2205   if (buffer->cur >= buffer->notes[buffer->cur_note].pos
2206       && !pfile->overlaid_buffer)
2207     {
2208       _cpp_process_line_notes (pfile, false);
2209       result->src_loc = pfile->line_table->highest_line;
2210     }
       /* Consume the first character of the token; from here on
          buffer->cur points just past it.  */
2211   c = *buffer->cur++;
2212
2213   if (pfile->forced_token_location_p)
2214     result->src_loc = *pfile->forced_token_location_p;
2215   else
2216     result->src_loc = linemap_position_for_column (pfile->line_table,
2217                                           CPP_BUF_COLUMN (buffer, buffer->cur));
2218
2219   switch (c)
2220     {
2221     case ' ': case '\t': case '\f': case '\v': case '\0':
2222       result->flags |= PREV_WHITE;
2223       skip_whitespace (pfile, c);
2224       goto skipped_white;
2225
2226     case '\n':
           /* End of the current line: ask for a new one and start over.  */
2227       if (buffer->cur < buffer->rlimit)
2228         CPP_INCREMENT_LINE (pfile, 0);
2229       buffer->need_line = true;
2230       goto fresh_line;
2231
2232     case '0': case '1': case '2': case '3': case '4':
2233     case '5': case '6': case '7': case '8': case '9':
2234       {
2235         struct normalize_state nst = INITIAL_NORMALIZE_STATE;
2236         result->type = CPP_NUMBER;
2237         lex_number (pfile, &result->val.str, &nst);
2238         warn_about_normalization (pfile, result, &nst);
2239         break;
2240       }
2241
2242     case 'L':
2243     case 'u':
2244     case 'U':
2245     case 'R':
2246       /* 'L', 'u', 'U', 'u8' or 'R' may introduce wide characters,
2247          wide strings or raw strings.  */
2248       if (c == 'L' || CPP_OPTION (pfile, rliterals)
2249           || (c != 'R' && CPP_OPTION (pfile, uliterals)))
2250         {
               /* Accept: prefix + ' (not for R), prefix + ", prefix + R"
                  and u8 followed by " or R", the raw forms only when
                  raw literals are enabled.  */
2251           if ((*buffer->cur == '\'' && c != 'R')
2252               || *buffer->cur == '"'
2253               || (*buffer->cur == 'R'
2254                   && c != 'R'
2255                   && buffer->cur[1] == '"'
2256                   && CPP_OPTION (pfile, rliterals))
2257               || (*buffer->cur == '8'
2258                   && c == 'u'
2259                   && (buffer->cur[1] == '"'
2260                       || (buffer->cur[1] == 'R' && buffer->cur[2] == '"'
2261                           && CPP_OPTION (pfile, rliterals)))))
2262             {
2263               lex_string (pfile, result, buffer->cur - 1);
2264               break;
2265             }
2266         }
2267       /* Fall through.  */
2268
2269     case '_':
2270     case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
2271     case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
2272     case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
2273     case 's': case 't':           case 'v': case 'w': case 'x':
2274     case 'y': case 'z':
2275     case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
2276     case 'G': case 'H': case 'I': case 'J': case 'K':
2277     case 'M': case 'N': case 'O': case 'P': case 'Q':
2278     case 'S': case 'T':           case 'V': case 'W': case 'X':
2279     case 'Y': case 'Z':
           /* The letters missing above ('u', 'L', 'R', 'U') have their
              own case labels earlier and fall through to here.  */
2280       result->type = CPP_NAME;
2281       {
2282         struct normalize_state nst = INITIAL_NORMALIZE_STATE;
2283         result->val.node.node = lex_identifier (pfile, buffer->cur - 1, false,
2284                                                 &nst);
2285         warn_about_normalization (pfile, result, &nst);
2286       }
2287
2288       /* Convert named operators to their proper types.  */
2289       if (result->val.node.node->flags & NODE_OPERATOR)
2290         {
2291           result->flags |= NAMED_OP;
2292           result->type = (enum cpp_ttype) result->val.node.node->directive_index;
2293         }
2294       break;
2295
2296     case '\'':
2297     case '"':
2298       lex_string (pfile, result, buffer->cur - 1);
2299       break;
2300
2301     case '/':
2302       /* A potential block or line comment.  */
2303       comment_start = buffer->cur;
2304       c = *buffer->cur;
2305
2306       if (c == '*')
2307         {
2308           if (_cpp_skip_block_comment (pfile))
2309             cpp_error (pfile, CPP_DL_ERROR, "unterminated comment");
2310         }
2311       else if (c == '/' && (CPP_OPTION (pfile, cplusplus_comments)
2312                             || cpp_in_system_header (pfile)))
2313         {
2314           /* Warn about comments only if pedantically GNUC89, and not
2315              in system headers.  */
2316           if (CPP_OPTION (pfile, lang) == CLK_GNUC89 && CPP_PEDANTIC (pfile)
2317               && ! buffer->warned_cplusplus_comments)
2318             {
2319               cpp_error (pfile, CPP_DL_PEDWARN,
2320                          "C++ style comments are not allowed in ISO C90");
2321               cpp_error (pfile, CPP_DL_PEDWARN,
2322                          "(this will be reported only once per input file)");
2323               buffer->warned_cplusplus_comments = 1;
2324             }
2325
2326           if (skip_line_comment (pfile) && CPP_OPTION (pfile, warn_comments))
2327             cpp_warning (pfile, CPP_W_COMMENTS, "multi-line comment");
2328         }
2329       else if (c == '=')
2330         {
2331           buffer->cur++;
2332           result->type = CPP_DIV_EQ;
2333           break;
2334         }
2335       else
2336         {
2337           result->type = CPP_DIV;
2338           break;
2339         }
2340
           /* Only the two comment branches above reach this point.  A
              comment that is not saved counts as whitespace in front
              of the token lexed next.  */
2341       if (!pfile->state.save_comments)
2342         {
2343           result->flags |= PREV_WHITE;
2344           goto update_tokens_line;
2345         }
2346
2347       /* Save the comment as a token in its own right.  */
2348       save_comment (pfile, result, comment_start, c);
2349       break;
2350
2351     case '<':
2352       if (pfile->state.angled_headers)
2353         {
               /* lex_string sets CPP_LESS when the header name is not
                  terminated; the '<' is then lexed as an ordinary
                  operator by the code below.  */
2354           lex_string (pfile, result, buffer->cur - 1);
2355           if (result->type != CPP_LESS)
2356             break;
2357         }
2358
2359       result->type = CPP_LESS;
2360       if (*buffer->cur == '=')
2361         buffer->cur++, result->type = CPP_LESS_EQ;
2362       else if (*buffer->cur == '<')
2363         {
2364           buffer->cur++;
2365           IF_NEXT_IS ('=', CPP_LSHIFT_EQ, CPP_LSHIFT);
2366         }
2367       else if (CPP_OPTION (pfile, digraphs))
2368         {
2369           if (*buffer->cur == ':')
2370             {
2371               /* C++11 [2.5/3 lex.pptoken], "Otherwise, if the next
2372                  three characters are <:: and the subsequent character
2373                  is neither : nor >, the < is treated as a preprocessor
2374                  token by itself".  */
2375               if (CPP_OPTION (pfile, cplusplus)
2376                   && CPP_OPTION (pfile, lang) != CLK_CXX98
2377                   && CPP_OPTION (pfile, lang) != CLK_GNUCXX
2378                   && buffer->cur[1] == ':'
2379                   && buffer->cur[2] != ':' && buffer->cur[2] != '>')
2380                 break;
2381
2382               buffer->cur++;
2383               result->flags |= DIGRAPH;
2384               result->type = CPP_OPEN_SQUARE;
2385             }
2386           else if (*buffer->cur == '%')
2387             {
2388               buffer->cur++;
2389               result->flags |= DIGRAPH;
2390               result->type = CPP_OPEN_BRACE;
2391             }
2392         }
2393       break;
2394
2395     case '>':
2396       result->type = CPP_GREATER;
2397       if (*buffer->cur == '=')
2398         buffer->cur++, result->type = CPP_GREATER_EQ;
2399       else if (*buffer->cur == '>')
2400         {
2401           buffer->cur++;
2402           IF_NEXT_IS ('=', CPP_RSHIFT_EQ, CPP_RSHIFT);
2403         }
2404       break;
2405
2406     case '%':
2407       result->type = CPP_MOD;
2408       if (*buffer->cur == '=')
2409         buffer->cur++, result->type = CPP_MOD_EQ;
2410       else if (CPP_OPTION (pfile, digraphs))
2411         {
2412           if (*buffer->cur == ':')
2413             {
                   /* "%:" is the digraph for '#', and "%:%:" the one
                      for "##".  */
2414               buffer->cur++;
2415               result->flags |= DIGRAPH;
2416               result->type = CPP_HASH;
2417               if (*buffer->cur == '%' && buffer->cur[1] == ':')
2418                 buffer->cur += 2, result->type = CPP_PASTE, result->val.token_no = 0;
2419             }
2420           else if (*buffer->cur == '>')
2421             {
2422               buffer->cur++;
2423               result->flags |= DIGRAPH;
2424               result->type = CPP_CLOSE_BRACE;
2425             }
2426         }
2427       break;
2428
2429     case '.':
2430       result->type = CPP_DOT;
2431       if (ISDIGIT (*buffer->cur))
2432         {
2433           struct normalize_state nst = INITIAL_NORMALIZE_STATE;
2434           result->type = CPP_NUMBER;
2435           lex_number (pfile, &result->val.str, &nst);
2436           warn_about_normalization (pfile, result, &nst);
2437         }
2438       else if (*buffer->cur == '.' && buffer->cur[1] == '.')
2439         buffer->cur += 2, result->type = CPP_ELLIPSIS;
2440       else if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
2441         buffer->cur++, result->type = CPP_DOT_STAR;
2442       break;
2443
2444     case '+':
2445       result->type = CPP_PLUS;
2446       if (*buffer->cur == '+')
2447         buffer->cur++, result->type = CPP_PLUS_PLUS;
2448       else if (*buffer->cur == '=')
2449         buffer->cur++, result->type = CPP_PLUS_EQ;
2450       break;
2451
2452     case '-':
2453       result->type = CPP_MINUS;
2454       if (*buffer->cur == '>')
2455         {
2456           buffer->cur++;
2457           result->type = CPP_DEREF;
2458           if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
2459             buffer->cur++, result->type = CPP_DEREF_STAR;
2460         }
2461       else if (*buffer->cur == '-')
2462         buffer->cur++, result->type = CPP_MINUS_MINUS;
2463       else if (*buffer->cur == '=')
2464         buffer->cur++, result->type = CPP_MINUS_EQ;
2465       break;
2466
2467     case '&':
2468       result->type = CPP_AND;
2469       if (*buffer->cur == '&')
2470         buffer->cur++, result->type = CPP_AND_AND;
2471       else if (*buffer->cur == '=')
2472         buffer->cur++, result->type = CPP_AND_EQ;
2473       break;
2474
2475     case '|':
2476       result->type = CPP_OR;
2477       if (*buffer->cur == '|')
2478         buffer->cur++, result->type = CPP_OR_OR;
2479       else if (*buffer->cur == '=')
2480         buffer->cur++, result->type = CPP_OR_EQ;
2481       break;
2482
2483     case ':':
2484       result->type = CPP_COLON;
2485       if (*buffer->cur == ':' && CPP_OPTION (pfile, cplusplus))
2486         buffer->cur++, result->type = CPP_SCOPE;
2487       else if (*buffer->cur == '>' && CPP_OPTION (pfile, digraphs))
2488         {
2489           buffer->cur++;
2490           result->flags |= DIGRAPH;
2491           result->type = CPP_CLOSE_SQUARE;
2492         }
2493       break;
2494
2495     case '*': IF_NEXT_IS ('=', CPP_MULT_EQ, CPP_MULT); break;
2496     case '=': IF_NEXT_IS ('=', CPP_EQ_EQ, CPP_EQ); break;
2497     case '!': IF_NEXT_IS ('=', CPP_NOT_EQ, CPP_NOT); break;
2498     case '^': IF_NEXT_IS ('=', CPP_XOR_EQ, CPP_XOR); break;
2499     case '#': IF_NEXT_IS ('#', CPP_PASTE, CPP_HASH); result->val.token_no = 0; break;
2500
2501     case '?': result->type = CPP_QUERY; break;
2502     case '~': result->type = CPP_COMPL; break;
2503     case ',': result->type = CPP_COMMA; break;
2504     case '(': result->type = CPP_OPEN_PAREN; break;
2505     case ')': result->type = CPP_CLOSE_PAREN; break;
2506     case '[': result->type = CPP_OPEN_SQUARE; break;
2507     case ']': result->type = CPP_CLOSE_SQUARE; break;
2508     case '{': result->type = CPP_OPEN_BRACE; break;
2509     case '}': result->type = CPP_CLOSE_BRACE; break;
2510     case ';': result->type = CPP_SEMICOLON; break;
2511
2512       /* @ is a punctuator in Objective-C.  */
2513     case '@': result->type = CPP_ATSIGN; break;
2514
2515     case '$':
2516     case '\\':
2517       {
             /* Step back onto the character so that forms_identifier_p
                can examine it.  */
2518         const uchar *base = --buffer->cur;
2519         struct normalize_state nst = INITIAL_NORMALIZE_STATE;
2520
2521         if (forms_identifier_p (pfile, true, &nst))
2522           {
2523             result->type = CPP_NAME;
2524             result->val.node.node = lex_identifier (pfile, base, true, &nst);
2525             warn_about_normalization (pfile, result, &nst);
2526             break;
2527           }
             /* Not the start of an identifier: consume the character
                again and let the default case emit it.  */
2528         buffer->cur++;
2529       }
           /* Fall through.  */
2530
2531     default:
           /* Any other character becomes a one-character CPP_OTHER
              token.  */
2532       create_literal (pfile, result, buffer->cur - 1, 1, CPP_OTHER);
2533       break;
2534     }
2535
2536   return result;
2537 }
2538
2539 /* An upper bound on the number of bytes needed to spell TOKEN.
2540    Does not include preceding whitespace.  */
2541 unsigned int
2542 cpp_token_len (const cpp_token *token)
2543 {
2544   unsigned int len;
2545
2546   switch (TOKEN_SPELL (token))
2547     {
2548     default:            len = 6;                                break;
2549     case SPELL_LITERAL: len = token->val.str.len;               break;
2550     case SPELL_IDENT:   len = NODE_LEN (token->val.node.node) * 10;     break;
2551     }
2552
2553   return len;
2554 }
2555
/* Decode the UTF-8 sequence that starts at NAME and write it to
   BUFFER as a \U escape with eight lower-case hexadecimal digits.
   Exactly 10 bytes are written to BUFFER, with no terminating NUL.
   Returns the number of bytes of NAME that made up the sequence.
   Aborts if a continuation byte is ill-formed.  */

static size_t
utf8_to_ucn (unsigned char *buffer, const unsigned char *name)
{
  static const char hex_digits[] = "0123456789abcdef";
  unsigned lead = *name;
  unsigned long code_point;
  int seq_len = 0;
  int k, shift;

  /* The run of one bits at the top of the first byte gives the
     length of the whole sequence.  */
  while (lead & 0x80)
    {
      seq_len++;
      lead <<= 1;
    }

  /* What is left of the first byte holds the most significant bits;
     every continuation byte contributes six more.  */
  code_point = *name & (0x7F >> seq_len);
  for (k = 1; k < seq_len; k++)
    {
      name++;
      code_point = (code_point << 6) | (*name & 0x3F);

      /* Ill-formed UTF-8.  */
      if ((*name & ~0x3F) != 0x80)
        abort ();
    }

  buffer[0] = '\\';
  buffer[1] = 'U';
  for (k = 0, shift = 28; k < 8; k++, shift -= 4)
    buffer[2 + k] = hex_digits[(code_point >> shift) & 0xF];

  return seq_len;
}
2589
2590 /* Given a token TYPE corresponding to a digraph, return a pointer to
2591    the spelling of the digraph.  */
2592 static const unsigned char *
2593 cpp_digraph2name (enum cpp_ttype type)
2594 {
2595   return digraph_spellings[(int) type - (int) CPP_FIRST_DIGRAPH];
2596 }
2597
2598 /* Write the spelling of a token TOKEN to BUFFER.  The buffer must
2599    already contain the enough space to hold the token's spelling.
2600    Returns a pointer to the character after the last character written.
2601    FORSTRING is true if this is to be the spelling after translation
2602    phase 1 (this is different for UCNs).
2603    FIXME: Would be nice if we didn't need the PFILE argument.  */
2604 unsigned char *
2605 cpp_spell_token (cpp_reader *pfile, const cpp_token *token,
2606                  unsigned char *buffer, bool forstring)
2607 {
2608   switch (TOKEN_SPELL (token))
2609     {
2610     case SPELL_OPERATOR:
2611       {
2612         const unsigned char *spelling;
2613         unsigned char c;
2614
2615         if (token->flags & DIGRAPH)
2616           spelling = cpp_digraph2name (token->type);
2617         else if (token->flags & NAMED_OP)
2618           goto spell_ident;
2619         else
2620           spelling = TOKEN_NAME (token);
2621
2622         while ((c = *spelling++) != '\0')
2623           *buffer++ = c;
2624       }
2625       break;
2626
2627     spell_ident:
2628     case SPELL_IDENT:
2629       if (forstring)
2630         {
2631           memcpy (buffer, NODE_NAME (token->val.node.node),
2632                   NODE_LEN (token->val.node.node));
2633           buffer += NODE_LEN (token->val.node.node);
2634         }
2635       else
2636         {
2637           size_t i;
2638           const unsigned char * name = NODE_NAME (token->val.node.node);
2639
2640           for (i = 0; i < NODE_LEN (token->val.node.node); i++)
2641             if (name[i] & ~0x7F)
2642               {
2643                 i += utf8_to_ucn (buffer, name + i) - 1;
2644                 buffer += 10;
2645               }
2646             else
2647               *buffer++ = NODE_NAME (token->val.node.node)[i];
2648         }
2649       break;
2650
2651     case SPELL_LITERAL:
2652       memcpy (buffer, token->val.str.text, token->val.str.len);
2653       buffer += token->val.str.len;
2654       break;
2655
2656     case SPELL_NONE:
2657       cpp_error (pfile, CPP_DL_ICE,
2658                  "unspellable token %s", TOKEN_NAME (token));
2659       break;
2660     }
2661
2662   return buffer;
2663 }
2664
2665 /* Returns TOKEN spelt as a null-terminated string.  The string is
2666    freed when the reader is destroyed.  Useful for diagnostics.  */
2667 unsigned char *
2668 cpp_token_as_text (cpp_reader *pfile, const cpp_token *token)
2669 {
2670   unsigned int len = cpp_token_len (token) + 1;
2671   unsigned char *start = _cpp_unaligned_alloc (pfile, len), *end;
2672
2673   end = cpp_spell_token (pfile, token, start, false);
2674   end[0] = '\0';
2675
2676   return start;
2677 }
2678
2679 /* Returns a pointer to a string which spells the token defined by
2680    TYPE and FLAGS.  Used by C front ends, which really should move to
2681    using cpp_token_as_text.  */
2682 const char *
2683 cpp_type2name (enum cpp_ttype type, unsigned char flags)
2684 {
2685   if (flags & DIGRAPH)
2686     return (const char *) cpp_digraph2name (type);
2687   else if (flags & NAMED_OP)
2688     return cpp_named_operator2name (type);
2689
2690   return (const char *) token_spellings[type].name;
2691 }
2692
2693 /* Writes the spelling of token to FP, without any preceding space.
2694    Separated from cpp_spell_token for efficiency - to avoid stdio
2695    double-buffering.  */
2696 void
2697 cpp_output_token (const cpp_token *token, FILE *fp)
2698 {
2699   switch (TOKEN_SPELL (token))
2700     {
2701     case SPELL_OPERATOR:
2702       {
2703         const unsigned char *spelling;
2704         int c;
2705
2706         if (token->flags & DIGRAPH)
2707           spelling = cpp_digraph2name (token->type);
2708         else if (token->flags & NAMED_OP)
2709           goto spell_ident;
2710         else
2711           spelling = TOKEN_NAME (token);
2712
2713         c = *spelling;
2714         do
2715           putc (c, fp);
2716         while ((c = *++spelling) != '\0');
2717       }
2718       break;
2719
2720     spell_ident:
2721     case SPELL_IDENT:
2722       {
2723         size_t i;
2724         const unsigned char * name = NODE_NAME (token->val.node.node);
2725
2726         for (i = 0; i < NODE_LEN (token->val.node.node); i++)
2727           if (name[i] & ~0x7F)
2728             {
2729               unsigned char buffer[10];
2730               i += utf8_to_ucn (buffer, name + i) - 1;
2731               fwrite (buffer, 1, 10, fp);
2732             }
2733           else
2734             fputc (NODE_NAME (token->val.node.node)[i], fp);
2735       }
2736       break;
2737
2738     case SPELL_LITERAL:
2739       fwrite (token->val.str.text, 1, token->val.str.len, fp);
2740       break;
2741
2742     case SPELL_NONE:
2743       /* An error, most probably.  */
2744       break;
2745     }
2746 }
2747
2748 /* Compare two tokens.  */
2749 int
2750 _cpp_equiv_tokens (const cpp_token *a, const cpp_token *b)
2751 {
2752   if (a->type == b->type && a->flags == b->flags)
2753     switch (TOKEN_SPELL (a))
2754       {
2755       default:                  /* Keep compiler happy.  */
2756       case SPELL_OPERATOR:
2757         /* token_no is used to track where multiple consecutive ##
2758            tokens were originally located.  */
2759         return (a->type != CPP_PASTE || a->val.token_no == b->val.token_no);
2760       case SPELL_NONE:
2761         return (a->type != CPP_MACRO_ARG
2762                 || a->val.macro_arg.arg_no == b->val.macro_arg.arg_no);
2763       case SPELL_IDENT:
2764         return a->val.node.node == b->val.node.node;
2765       case SPELL_LITERAL:
2766         return (a->val.str.len == b->val.str.len
2767                 && !memcmp (a->val.str.text, b->val.str.text,
2768                             a->val.str.len));
2769       }
2770
2771   return 0;
2772 }
2773
2774 /* Returns nonzero if a space should be inserted to avoid an
2775    accidental token paste for output.  For simplicity, it is
2776    conservative, and occasionally advises a space where one is not
2777    needed, e.g. "." and ".2".  */
2778 int
2779 cpp_avoid_paste (cpp_reader *pfile, const cpp_token *token1,
2780                  const cpp_token *token2)
2781 {
2782   enum cpp_ttype a = token1->type, b = token2->type;
2783   cppchar_t c;
2784
2785   if (token1->flags & NAMED_OP)
2786     a = CPP_NAME;
2787   if (token2->flags & NAMED_OP)
2788     b = CPP_NAME;
2789
2790   c = EOF;
2791   if (token2->flags & DIGRAPH)
2792     c = digraph_spellings[(int) b - (int) CPP_FIRST_DIGRAPH][0];
2793   else if (token_spellings[b].category == SPELL_OPERATOR)
2794     c = token_spellings[b].name[0];
2795
2796   /* Quickly get everything that can paste with an '='.  */
2797   if ((int) a <= (int) CPP_LAST_EQ && c == '=')
2798     return 1;
2799
2800   switch (a)
2801     {
2802     case CPP_GREATER:   return c == '>';
2803     case CPP_LESS:      return c == '<' || c == '%' || c == ':';
2804     case CPP_PLUS:      return c == '+';
2805     case CPP_MINUS:     return c == '-' || c == '>';
2806     case CPP_DIV:       return c == '/' || c == '*'; /* Comments.  */
2807     case CPP_MOD:       return c == ':' || c == '>';
2808     case CPP_AND:       return c == '&';
2809     case CPP_OR:        return c == '|';
2810     case CPP_COLON:     return c == ':' || c == '>';
2811     case CPP_DEREF:     return c == '*';
2812     case CPP_DOT:       return c == '.' || c == '%' || b == CPP_NUMBER;
2813     case CPP_HASH:      return c == '#' || c == '%'; /* Digraph form.  */
2814     case CPP_NAME:      return ((b == CPP_NUMBER
2815                                  && name_p (pfile, &token2->val.str))
2816                                 || b == CPP_NAME
2817                                 || b == CPP_CHAR || b == CPP_STRING); /* L */
2818     case CPP_NUMBER:    return (b == CPP_NUMBER || b == CPP_NAME
2819                                 || c == '.' || c == '+' || c == '-');
2820                                       /* UCNs */
2821     case CPP_OTHER:     return ((token1->val.str.text[0] == '\\'
2822                                  && b == CPP_NAME)
2823                                 || (CPP_OPTION (pfile, objc)
2824                                     && token1->val.str.text[0] == '@'
2825                                     && (b == CPP_NAME || b == CPP_STRING)));
2826     case CPP_STRING:
2827     case CPP_WSTRING:
2828     case CPP_UTF8STRING:
2829     case CPP_STRING16:
2830     case CPP_STRING32:  return (CPP_OPTION (pfile, user_literals)
2831                                 && (b == CPP_NAME
2832                                     || (TOKEN_SPELL (token2) == SPELL_LITERAL
2833                                         && ISIDST (token2->val.str.text[0]))));
2834
2835     default:            break;
2836     }
2837
2838   return 0;
2839 }
2840
2841 /* Output all the remaining tokens on the current line, and a newline
2842    character, to FP.  Leading whitespace is removed.  If there are
2843    macros, special token padding is not performed.  */
2844 void
2845 cpp_output_line (cpp_reader *pfile, FILE *fp)
2846 {
2847   const cpp_token *token;
2848
2849   token = cpp_get_token (pfile);
2850   while (token->type != CPP_EOF)
2851     {
2852       cpp_output_token (token, fp);
2853       token = cpp_get_token (pfile);
2854       if (token->flags & PREV_WHITE)
2855         putc (' ', fp);
2856     }
2857
2858   putc ('\n', fp);
2859 }
2860
2861 /* Return a string representation of all the remaining tokens on the
2862    current line.  The result is allocated using xmalloc and must be
2863    freed by the caller.  */
2864 unsigned char *
2865 cpp_output_line_to_string (cpp_reader *pfile, const unsigned char *dir_name)
2866 {
2867   const cpp_token *token;
2868   unsigned int out = dir_name ? ustrlen (dir_name) : 0;
2869   unsigned int alloced = 120 + out;
2870   unsigned char *result = (unsigned char *) xmalloc (alloced);
2871
2872   /* If DIR_NAME is empty, there are no initial contents.  */
2873   if (dir_name)
2874     {
2875       sprintf ((char *) result, "#%s ", dir_name);
2876       out += 2;
2877     }
2878
2879   token = cpp_get_token (pfile);
2880   while (token->type != CPP_EOF)
2881     {
2882       unsigned char *last;
2883       /* Include room for a possible space and the terminating nul.  */
2884       unsigned int len = cpp_token_len (token) + 2;
2885
2886       if (out + len > alloced)
2887         {
2888           alloced *= 2;
2889           if (out + len > alloced)
2890             alloced = out + len;
2891           result = (unsigned char *) xrealloc (result, alloced);
2892         }
2893
2894       last = cpp_spell_token (pfile, token, &result[out], 0);
2895       out = last - result;
2896
2897       token = cpp_get_token (pfile);
2898       if (token->flags & PREV_WHITE)
2899         result[out++] = ' ';
2900     }
2901
2902   result[out] = '\0';
2903   return result;
2904 }
2905
/* Memory buffers.  Changing these three constants can have a dramatic
   effect on performance.  The values here are reasonable defaults,
   but might be tuned.  If you adjust them, be sure to test across a
   range of uses of cpplib, including heavy nested function-like macro
   expansion.  Also check the change in peak memory usage (NJAMD is a
   good tool for this).

   MIN_BUFF_SIZE is the smallest buffer that is ever created.
   BUFF_SIZE_UPPER_BOUND (MIN_SIZE) is the largest buffer worth
   handing out for a request of MIN_SIZE bytes.
   EXTENDED_BUFF_SIZE (BUFF, MIN_EXTRA) is the size requested when
   BUFF is extended: MIN_EXTRA plus twice the room left in BUFF.
   Every macro argument is parenthesized so that any expression,
   including a conditional one, can be passed safely.  */
#define MIN_BUFF_SIZE 8000
#define BUFF_SIZE_UPPER_BOUND(MIN_SIZE) (MIN_BUFF_SIZE + (MIN_SIZE) * 3 / 2)
#define EXTENDED_BUFF_SIZE(BUFF, MIN_EXTRA) \
        ((MIN_EXTRA) + ((BUFF)->limit - (BUFF)->cur) * 2)

#if MIN_BUFF_SIZE > BUFF_SIZE_UPPER_BOUND (0)
  #error BUFF_SIZE_UPPER_BOUND must be at least as large as MIN_BUFF_SIZE!
#endif
2920
2921 /* Create a new allocation buffer.  Place the control block at the end
2922    of the buffer, so that buffer overflows will cause immediate chaos.  */
2923 static _cpp_buff *
2924 new_buff (size_t len)
2925 {
2926   _cpp_buff *result;
2927   unsigned char *base;
2928
2929   if (len < MIN_BUFF_SIZE)
2930     len = MIN_BUFF_SIZE;
2931   len = CPP_ALIGN (len);
2932
2933 #ifdef ENABLE_VALGRIND_CHECKING
2934   /* Valgrind warns about uses of interior pointers, so put _cpp_buff
2935      struct first.  */
2936   size_t slen = CPP_ALIGN2 (sizeof (_cpp_buff), 2 * DEFAULT_ALIGNMENT);
2937   base = XNEWVEC (unsigned char, len + slen);
2938   result = (_cpp_buff *) base;
2939   base += slen;
2940 #else
2941   base = XNEWVEC (unsigned char, len + sizeof (_cpp_buff));
2942   result = (_cpp_buff *) (base + len);
2943 #endif
2944   result->base = base;
2945   result->cur = base;
2946   result->limit = base + len;
2947   result->next = NULL;
2948   return result;
2949 }
2950
2951 /* Place a chain of unwanted allocation buffers on the free list.  */
2952 void
2953 _cpp_release_buff (cpp_reader *pfile, _cpp_buff *buff)
2954 {
2955   _cpp_buff *end = buff;
2956
2957   while (end->next)
2958     end = end->next;
2959   end->next = pfile->free_buffs;
2960   pfile->free_buffs = buff;
2961 }
2962
2963 /* Return a free buffer of size at least MIN_SIZE.  */
2964 _cpp_buff *
2965 _cpp_get_buff (cpp_reader *pfile, size_t min_size)
2966 {
2967   _cpp_buff *result, **p;
2968
2969   for (p = &pfile->free_buffs;; p = &(*p)->next)
2970     {
2971       size_t size;
2972
2973       if (*p == NULL)
2974         return new_buff (min_size);
2975       result = *p;
2976       size = result->limit - result->base;
2977       /* Return a buffer that's big enough, but don't waste one that's
2978          way too big.  */
2979       if (size >= min_size && size <= BUFF_SIZE_UPPER_BOUND (min_size))
2980         break;
2981     }
2982
2983   *p = result->next;
2984   result->next = NULL;
2985   result->cur = result->base;
2986   return result;
2987 }
2988
2989 /* Creates a new buffer with enough space to hold the uncommitted
2990    remaining bytes of BUFF, and at least MIN_EXTRA more bytes.  Copies
2991    the excess bytes to the new buffer.  Chains the new buffer after
2992    BUFF, and returns the new buffer.  */
2993 _cpp_buff *
2994 _cpp_append_extend_buff (cpp_reader *pfile, _cpp_buff *buff, size_t min_extra)
2995 {
2996   size_t size = EXTENDED_BUFF_SIZE (buff, min_extra);
2997   _cpp_buff *new_buff = _cpp_get_buff (pfile, size);
2998
2999   buff->next = new_buff;
3000   memcpy (new_buff->base, buff->cur, BUFF_ROOM (buff));
3001   return new_buff;
3002 }
3003
3004 /* Creates a new buffer with enough space to hold the uncommitted
3005    remaining bytes of the buffer pointed to by BUFF, and at least
3006    MIN_EXTRA more bytes.  Copies the excess bytes to the new buffer.
3007    Chains the new buffer before the buffer pointed to by BUFF, and
3008    updates the pointer to point to the new buffer.  */
3009 void
3010 _cpp_extend_buff (cpp_reader *pfile, _cpp_buff **pbuff, size_t min_extra)
3011 {
3012   _cpp_buff *new_buff, *old_buff = *pbuff;
3013   size_t size = EXTENDED_BUFF_SIZE (old_buff, min_extra);
3014
3015   new_buff = _cpp_get_buff (pfile, size);
3016   memcpy (new_buff->base, old_buff->cur, BUFF_ROOM (old_buff));
3017   new_buff->next = old_buff;
3018   *pbuff = new_buff;
3019 }
3020
3021 /* Free a chain of buffers starting at BUFF.  */
3022 void
3023 _cpp_free_buff (_cpp_buff *buff)
3024 {
3025   _cpp_buff *next;
3026
3027   for (; buff; buff = next)
3028     {
3029       next = buff->next;
3030 #ifdef ENABLE_VALGRIND_CHECKING
3031       free (buff);
3032 #else
3033       free (buff->base);
3034 #endif
3035     }
3036 }
3037
3038 /* Allocate permanent, unaligned storage of length LEN.  */
3039 unsigned char *
3040 _cpp_unaligned_alloc (cpp_reader *pfile, size_t len)
3041 {
3042   _cpp_buff *buff = pfile->u_buff;
3043   unsigned char *result = buff->cur;
3044
3045   if (len > (size_t) (buff->limit - result))
3046     {
3047       buff = _cpp_get_buff (pfile, len);
3048       buff->next = pfile->u_buff;
3049       pfile->u_buff = buff;
3050       result = buff->cur;
3051     }
3052
3053   buff->cur = result + len;
3054   return result;
3055 }
3056
3057 /* Allocate permanent, unaligned storage of length LEN from a_buff.
3058    That buffer is used for growing allocations when saving macro
3059    replacement lists in a #define, and when parsing an answer to an
3060    assertion in #assert, #unassert or #if (and therefore possibly
3061    whilst expanding macros).  It therefore must not be used by any
3062    code that they might call: specifically the lexer and the guts of
3063    the macro expander.
3064
3065    All existing other uses clearly fit this restriction: storing
3066    registered pragmas during initialization.  */
3067 unsigned char *
3068 _cpp_aligned_alloc (cpp_reader *pfile, size_t len)
3069 {
3070   _cpp_buff *buff = pfile->a_buff;
3071   unsigned char *result = buff->cur;
3072
3073   if (len > (size_t) (buff->limit - result))
3074     {
3075       buff = _cpp_get_buff (pfile, len);
3076       buff->next = pfile->a_buff;
3077       pfile->a_buff = buff;
3078       result = buff->cur;
3079     }
3080
3081   buff->cur = result + len;
3082   return result;
3083 }
3084
3085 /* Say which field of TOK is in use.  */
3086
3087 enum cpp_token_fld_kind
3088 cpp_token_val_index (const cpp_token *tok)
3089 {
3090   switch (TOKEN_SPELL (tok))
3091     {
3092     case SPELL_IDENT:
3093       return CPP_TOKEN_FLD_NODE;
3094     case SPELL_LITERAL:
3095       return CPP_TOKEN_FLD_STR;
3096     case SPELL_OPERATOR:
3097       if (tok->type == CPP_PASTE)
3098         return CPP_TOKEN_FLD_TOKEN_NO;
3099       else
3100         return CPP_TOKEN_FLD_NONE;
3101     case SPELL_NONE:
3102       if (tok->type == CPP_MACRO_ARG)
3103         return CPP_TOKEN_FLD_ARG_NO;
3104       else if (tok->type == CPP_PADDING)
3105         return CPP_TOKEN_FLD_SOURCE;
3106       else if (tok->type == CPP_PRAGMA)
3107         return CPP_TOKEN_FLD_PRAGMA;
3108       /* else fall through */
3109     default:
3110       return CPP_TOKEN_FLD_NONE;
3111     }
3112 }
3113
3114 /* All tokens lexed in R after calling this function will be forced to have
3115    their source_location the same as the location referenced by P, until
3116    cpp_stop_forcing_token_locations is called for R.  */
3117
3118 void
3119 cpp_force_token_locations (cpp_reader *r, source_location *p)
3120 {
3121   r->forced_token_location_p = p;
3122 }
3123
3124 /* Go back to assigning locations naturally for lexed tokens.  */
3125
3126 void
3127 cpp_stop_forcing_token_locations (cpp_reader *r)
3128 {
3129   r->forced_token_location_p = NULL;
3130 }