libcpp/lex.c

   1 /* CPP Library - lexical analysis.
   2    Copyright (C) 2000-2013 Free Software Foundation, Inc.
   3    Contributed by Per Bothner, 1994-95.
   4    Based on CCCP program by Paul Rubin, June 1986
   5    Adapted to ANSI C, Richard Stallman, Jan 1987
   6    Broken out to separate file, Zack Weinberg, Mar 2000
   7
   8 This program is free software; you can redistribute it and/or modify it
   9 under the terms of the GNU General Public License as published by the
  10 Free Software Foundation; either version 3, or (at your option) any
  11 later version.
  12
  13 This program is distributed in the hope that it will be useful,
  14 but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16 GNU General Public License for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with this program; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "cpplib.h"
  25 #include "internal.h"
  26
/* Spelling category of a token type.  This is the type of the
   CATEGORY field of struct token_spelling, and the OP and TK macros
   below build table entries from these values.  */
enum spell_type
{
  SPELL_OPERATOR = 0,
  SPELL_IDENT,
  SPELL_LITERAL,
  SPELL_NONE
};
  34
/* One entry of the token_spellings table: the spelling category of a
   token type paired with a name string for it.  */
struct token_spelling
{
  /* Which SPELL_* category the token type belongs to.  */
  enum spell_type category;
  /* Name string recorded for the token type by the OP/TK macros.  */
  const unsigned char *name;
};
  40
/* The six digraph spellings.  NOTE(review): the index order is
   presumably tied to the digraph token types declared elsewhere;
   confirm before reordering.  */
static const unsigned char *const digraph_spellings[] =
{ UC"%:", UC"%:%:", UC"<:", UC":>", UC"<%", UC"%>" };

/* TTYPE_TABLE (defined outside this file) is expanded with the two
   helper macros below to build one token_spelling entry per token
   type.  OP entries are always SPELL_OPERATOR and use the string S as
   the name; TK entries take their category from S (pasted onto
   SPELL_) and use the stringified enumerator E as the name.  */
#define OP(e, s) { SPELL_OPERATOR, UC s  },
#define TK(e, s) { SPELL_ ## s,    UC #e },
static const struct token_spelling token_spellings[N_TTYPES] = { TTYPE_TABLE };
#undef OP
#undef TK

/* Look up the spelling category, or the name string, recorded in
   token_spellings for the type of TOKEN.  */
#define TOKEN_SPELL(token) (token_spellings[(token)->type].category)
#define TOKEN_NAME(token) (token_spellings[(token)->type].name)
  52
/* Forward declarations of the file-local helpers.  */
static void add_line_note (cpp_buffer *, const uchar *, unsigned int);
static int skip_line_comment (cpp_reader *);
static void skip_whitespace (cpp_reader *, cppchar_t);
static void lex_string (cpp_reader *, cpp_token *, const uchar *);
static void save_comment (cpp_reader *, cpp_token *, const uchar *, cppchar_t);
static void store_comment (cpp_reader *, cpp_token *);
static void create_literal (cpp_reader *, cpp_token *, const uchar *,
                            unsigned int, enum cpp_ttype);
static bool warn_in_comment (cpp_reader *, _cpp_line_note *);
static int name_p (cpp_reader *, const cpp_string *);
static tokenrun *next_tokenrun (tokenrun *);

static _cpp_buff *new_buff (size_t);
  66
  67
  68 /* Utility routine:
  69
  70    Compares, the token TOKEN to the NUL-terminated string STRING.
  71    TOKEN must be a CPP_NAME.  Returns 1 for equal, 0 for unequal.  */
  72 int
  73 cpp_ideq (const cpp_token *token, const char *string)
  74 {
  75   if (token->type != CPP_NAME)
  76     return 0;
  77
  78   return !ustrcmp (NODE_NAME (token->val.node.node), (const uchar *) string);
  79 }
  80
  81 /* Record a note TYPE at byte POS into the current cleaned logical
  82    line.  */
  83 static void
  84 add_line_note (cpp_buffer *buffer, const uchar *pos, unsigned int type)
  85 {
  86   if (buffer->notes_used == buffer->notes_cap)
  87     {
  88       buffer->notes_cap = buffer->notes_cap * 2 + 200;
  89       buffer->notes = XRESIZEVEC (_cpp_line_note, buffer->notes,
  90                                   buffer->notes_cap);
  91     }
  92
  93   buffer->notes[buffer->notes_used].pos = pos;
  94   buffer->notes[buffer->notes_used].type = type;
  95   buffer->notes_used++;
  96 }
  97
  98 \f
  99 /* Fast path to find line special characters using optimized character
 100    scanning algorithms.  Anything complicated falls back to the slow
 101    path below.  Since this loop is very hot it's worth doing these kinds
 102    of optimizations.
 103
 104    One of the paths through the ifdefs should provide
 105
 106      const uchar *search_line_fast (const uchar *s, const uchar *end);
 107
 108    Between S and END, search for \n, \r, \\, ?.  Return a pointer to
 109    the found character.
 110
 111    Note that the last character of the buffer is *always* a newline,
 112    as forced by _cpp_convert_input.  This fact can be used to avoid
 113    explicitly looking for the end of the buffer.  */
 114
/* Configure gives us an ifdef test.  Default the macro to 0 when it is
   not defined so that it can be tested with an ordinary `if' in the
   code below.  */
#ifndef WORDS_BIGENDIAN
#define WORDS_BIGENDIAN 0
#endif

/* We'd like the largest integer that fits into a register.  There's nothing
   in <stdint.h> that gives us that.  For most hosts this is unsigned long,
   but MS decided on an LLP64 model.  Thankfully when building with GCC we
   can get the "real" word size.  */
#ifdef __GNUC__
typedef unsigned int word_type __attribute__((__mode__(__word__)));
#else
typedef unsigned long word_type;
#endif

/* The code below is only expecting sizes 4 or 8.
   Die at compile-time if this expectation is violated: the array
   bound evaluates to 1 when the size is acceptable and to -1, which
   is an invalid array size, otherwise.  */
typedef char check_word_type_size
  [(sizeof(word_type) == 8 || sizeof(word_type) == 4) * 2 - 1];
 134
 135 /* Return X with the first N bytes forced to values that won't match one
 136    of the interesting characters.  Note that NUL is not interesting.  */
 137
 138 static inline word_type
 139 acc_char_mask_misalign (word_type val, unsigned int n)
 140 {
 141   word_type mask = -1;
 142   if (WORDS_BIGENDIAN)
 143     mask >>= n * 8;
 144   else
 145     mask <<= n * 8;
 146   return val & mask;
 147 }
 148
 149 /* Return X replicated to all byte positions within WORD_TYPE.  */
 150
 151 static inline word_type
 152 acc_char_replicate (uchar x)
 153 {
 154   word_type ret;
 155
 156   ret = (x << 24) | (x << 16) | (x << 8) | x;
 157   if (sizeof(word_type) == 8)
 158     ret = (ret << 16 << 16) | ret;
 159   return ret;
 160 }
 161
 162 /* Return non-zero if some byte of VAL is (probably) C.  */
 163
 164 static inline word_type
 165 acc_char_cmp (word_type val, word_type c)
 166 {
 167 #if defined(__GNUC__) && defined(__alpha__)
 168   /* We can get exact results using a compare-bytes instruction.
 169      Get (val == c) via (0 >= (val ^ c)).  */
 170   return __builtin_alpha_cmpbge (0, val ^ c);
 171 #else
 172   word_type magic = 0x7efefefeU;
 173   if (sizeof(word_type) == 8)
 174     magic = (magic << 16 << 16) | 0xfefefefeU;
 175   magic |= 1;
 176
 177   val ^= c;
 178   return ((val + magic) ^ ~val) & ~magic;
 179 #endif
 180 }
 181
 182 /* Given the result of acc_char_cmp is non-zero, return the index of
 183    the found character.  If this was a false positive, return -1.  */
 184
 185 static inline int
 186 acc_char_index (word_type cmp ATTRIBUTE_UNUSED,
 187                 word_type val ATTRIBUTE_UNUSED)
 188 {
 189 #if defined(__GNUC__) && defined(__alpha__) && !WORDS_BIGENDIAN
 190   /* The cmpbge instruction sets *bits* of the result corresponding to
 191      matches in the bytes with no false positives.  */
 192   return __builtin_ctzl (cmp);
 193 #else
 194   unsigned int i;
 195
 196   /* ??? It would be nice to force unrolling here,
 197      and have all of these constants folded.  */
 198   for (i = 0; i < sizeof(word_type); ++i)
 199     {
 200       uchar c;
 201       if (WORDS_BIGENDIAN)
 202         c = (val >> (sizeof(word_type) - i - 1) * 8) & 0xff;
 203       else
 204         c = (val >> i * 8) & 0xff;
 205
 206       if (c == '\n' || c == '\r' || c == '\\' || c == '?')
 207         return i;
 208     }
 209
 210   return -1;
 211 #endif
 212 }
 213
 214 /* A version of the fast scanner using bit fiddling techniques.
 215
 216    For 32-bit words, one would normally perform 16 comparisons and
 217    16 branches.  With this algorithm one performs 24 arithmetic
 218    operations and one branch.  Whether this is faster with a 32-bit
 219    word size is going to be somewhat system dependent.
 220
 221    For 64-bit words, we eliminate twice the number of comparisons
 222    and branches without increasing the number of arithmetic operations.
 223    It's almost certainly going to be a win with 64-bit word size.  */
 224
 225 static const uchar * search_line_acc_char (const uchar *, const uchar *)
 226   ATTRIBUTE_UNUSED;
 227
 228 static const uchar *
 229 search_line_acc_char (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 230 {
 231   const word_type repl_nl = acc_char_replicate ('\n');
 232   const word_type repl_cr = acc_char_replicate ('\r');
 233   const word_type repl_bs = acc_char_replicate ('\\');
 234   const word_type repl_qm = acc_char_replicate ('?');
 235
 236   unsigned int misalign;
 237   const word_type *p;
 238   word_type val, t;
 239
 240   /* Align the buffer.  Mask out any bytes from before the beginning.  */
 241   p = (word_type *)((uintptr_t)s & -sizeof(word_type));
 242   val = *p;
 243   misalign = (uintptr_t)s & (sizeof(word_type) - 1);
 244   if (misalign)
 245     val = acc_char_mask_misalign (val, misalign);
 246
 247   /* Main loop.  */
 248   while (1)
 249     {
 250       t  = acc_char_cmp (val, repl_nl);
 251       t |= acc_char_cmp (val, repl_cr);
 252       t |= acc_char_cmp (val, repl_bs);
 253       t |= acc_char_cmp (val, repl_qm);
 254
 255       if (__builtin_expect (t != 0, 0))
 256         {
 257           int i = acc_char_index (t, val);
 258           if (i >= 0)
 259             return (const uchar *)p + i;
 260         }
 261
 262       val = *++p;
 263     }
 264 }
 265
 266 /* Disable on Solaris 2/x86 until the following problems can be properly
 267    autoconfed:
 268
 269    The Solaris 9 assembler cannot assemble SSE4.2 insns.
 270    Before Solaris 9 Update 6, SSE insns cannot be executed.
 271    The Solaris 10+ assembler tags objects with the instruction set
 272    extensions used, so SSE4.2 executables cannot run on machines that
 273    don't support that extension.  */
 274
 275 #if (GCC_VERSION >= 4005) && (defined(__i386__) || defined(__x86_64__)) && !(defined(__sun__) && defined(__svr4__))
 276
/* Replicated character data to be shared between implementations.
   Recall that outside of a context with vector support we can't
   define compatible vector types, therefore these are all defined
   in terms of raw characters.

   Row 0 holds '\n', row 1 '\r', row 2 '\\' and row 3 '?'.  Each row is
   16 bytes long and the array is 16-byte aligned, so a row can be
   loaded directly as an 8-byte or 16-byte vector.  */
static const char repl_chars[4][16] __attribute__((aligned(16))) = {
  { '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
    '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n' },
  { '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
    '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r' },
  { '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
    '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\' },
  { '?', '?', '?', '?', '?', '?', '?', '?',
    '?', '?', '?', '?', '?', '?', '?', '?' },
};
 291
/* A version of the fast scanner using MMX vectorized byte compare insns.

   This uses the PMOVMSKB instruction which was introduced with "MMX2",
   which was packaged into SSE1; it is also present in the AMD MMX
   extension.  Mark the function as using "sse" so that we emit a real
   "emms" instruction, rather than the 3dNOW "femms" instruction.

   Starting at S, return a pointer to the first '\n', '\r', '\\' or
   '?'.  END is unused; the loop stops only when a match is found.  */

static const uchar *
#ifndef __SSE__
__attribute__((__target__("sse")))
#endif
search_line_mmx (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
{
  typedef char v8qi __attribute__ ((__vector_size__ (8)));
  typedef int __m64 __attribute__ ((__vector_size__ (8), __may_alias__));

  /* Only the first 8 bytes of each 16-byte row are used here.  */
  const v8qi repl_nl = *(const v8qi *)repl_chars[0];
  const v8qi repl_cr = *(const v8qi *)repl_chars[1];
  const v8qi repl_bs = *(const v8qi *)repl_chars[2];
  const v8qi repl_qm = *(const v8qi *)repl_chars[3];

  unsigned int misalign, found, mask;
  const v8qi *p;
  v8qi data, t, c;

  /* Align the source pointer.  While MMX doesn't generate unaligned data
     faults, this allows us to safely scan to the end of the buffer without
     reading beyond the end of the last page.  */
  misalign = (uintptr_t)s & 7;
  p = (const v8qi *)((uintptr_t)s & -8);
  data = *p;

  /* Create a mask for the bytes that are valid within the first
     8-byte block, i.e. clear the low MISALIGN bits, which correspond
     to the bytes located before S.  The Idea here is that the AND
     with the mask within the loop is "free", since we need some AND
     or TEST insn in order to set the flags for the branch anyway.  */
  mask = -1u << misalign;

  /* Main loop processing 8 bytes at a time.  The first block has
     already been loaded, so jump over the load at the top of the
     loop body.  */
  goto start;
  do
    {
      data = *++p;
      /* Every byte of the second and later blocks is valid.  */
      mask = -1;

    start:
      /* OR together the per-byte equality results for the four
         characters, then collect one bit per byte into FOUND.  */
      t = __builtin_ia32_pcmpeqb(data, repl_nl);
      c = __builtin_ia32_pcmpeqb(data, repl_cr);
      t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
      c = __builtin_ia32_pcmpeqb(data, repl_bs);
      t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
      c = __builtin_ia32_pcmpeqb(data, repl_qm);
      t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
      found = __builtin_ia32_pmovmskb (t);
      found &= mask;
    }
  while (!found);

  __builtin_ia32_emms ();

  /* FOUND contains 1 in bits for which we matched a relevant
     character.  Conversion to the byte index is trivial.  */
  found = __builtin_ctz(found);
  return (const uchar *)p + found;
}
 357
/* A version of the fast scanner using SSE2 vectorized byte compare insns.

   Starting at S, return a pointer to the first '\n', '\r', '\\' or
   '?'.  END is unused; the loop stops only when a match is found.  */

static const uchar *
#ifndef __SSE2__
__attribute__((__target__("sse2")))
#endif
search_line_sse2 (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
{
  typedef char v16qi __attribute__ ((__vector_size__ (16)));

  const v16qi repl_nl = *(const v16qi *)repl_chars[0];
  const v16qi repl_cr = *(const v16qi *)repl_chars[1];
  const v16qi repl_bs = *(const v16qi *)repl_chars[2];
  const v16qi repl_qm = *(const v16qi *)repl_chars[3];

  unsigned int misalign, found, mask;
  const v16qi *p;
  v16qi data, t;

  /* Align the source pointer.  The first load therefore starts up to
     15 bytes before S; those bytes are discarded through MASK.  */
  misalign = (uintptr_t)s & 15;
  p = (const v16qi *)((uintptr_t)s & -16);
  data = *p;

  /* Create a mask for the bytes that are valid within the first
     16-byte block.  The Idea here is that the AND with the mask
     within the loop is "free", since we need some AND or TEST
     insn in order to set the flags for the branch anyway.  */
  mask = -1u << misalign;

  /* Main loop processing 16 bytes at a time.  The first block has
     already been loaded, so jump over the load at the top of the
     loop body.  */
  goto start;
  do
    {
      data = *++p;
      /* Every byte of the second and later blocks is valid.  */
      mask = -1;

    start:
      t  = __builtin_ia32_pcmpeqb128(data, repl_nl);
      t |= __builtin_ia32_pcmpeqb128(data, repl_cr);
      t |= __builtin_ia32_pcmpeqb128(data, repl_bs);
      t |= __builtin_ia32_pcmpeqb128(data, repl_qm);
      found = __builtin_ia32_pmovmskb128 (t);
      found &= mask;
    }
  while (!found);

  /* FOUND contains 1 in bits for which we matched a relevant
     character.  Conversion to the byte index is trivial.  */
  found = __builtin_ctz(found);
  return (const uchar *)p + found;
}
 410
#ifdef HAVE_SSE4
/* A version of the fast scanner using SSE 4.2 vectorized string insns.

   Starting at S, return a pointer to the first '\n', '\r', '\\' or
   '?'.  END is consulted only to decide whether an unaligned 16-byte
   read at S is safe.  */

static const uchar *
#ifndef __SSE4_2__
__attribute__((__target__("sse4.2")))
#endif
search_line_sse42 (const uchar *s, const uchar *end)
{
  typedef char v16qi __attribute__ ((__vector_size__ (16)));
  /* The set of characters to look for; only the first four elements
     are significant (the explicit length passed to PCMPESTRI is 4).  */
  static const v16qi search = { '\n', '\r', '?', '\\' };

  uintptr_t si = (uintptr_t)s;
  uintptr_t index;

  /* Check for unaligned input.  */
  if (si & 15)
    {
      v16qi sv;

      if (__builtin_expect (end - s < 16, 0)
          && __builtin_expect ((si & 0xfff) > 0xff0, 0))
        {
          /* There are less than 16 bytes left in the buffer, and less
             than 16 bytes left on the page.  Reading 16 bytes at this
             point might generate a spurious page fault.  Defer to the
             SSE2 implementation, which already handles alignment.  */
          return search_line_sse2 (s, end);
        }

      /* ??? The builtin doesn't understand that the PCMPESTRI read from
         memory need not be aligned.  */
      sv = __builtin_ia32_loaddqu ((const char *) s);
      index = __builtin_ia32_pcmpestri128 (search, 4, sv, 16, 0);

      /* An index below 16 means one of the 16 bytes matched.  */
      if (__builtin_expect (index < 16, 0))
        goto found;

      /* Advance the pointer to an aligned address.  We will re-scan a
         few bytes, but we no longer need care for reading past the
         end of a page, since we're guaranteed a match.  */
      s = (const uchar *)((si + 16) & -16);
    }

  /* Main loop, processing 16 bytes at a time.  By doing the whole loop
     in inline assembly, we can make proper use of the flags set.
     S is pre-decremented by 16 so that the add at the top of the loop
     restores it on the first iteration; the loop repeats for as long
     as the carry flag is clear after PCMPESTRI.  INDEX is bound to
     the C register, and the A and D registers carry the two explicit
     lengths (4 and 16).  */
  __asm (      "sub $16, %1\n"
        "       .balign 16\n"
        "0:     add $16, %1\n"
        "       %vpcmpestri $0, (%1), %2\n"
        "       jnc 0b"
        : "=&c"(index), "+r"(s)
        : "x"(search), "a"(4), "d"(16));

 found:
  return s + index;
}

#else
/* Work around out-dated assemblers without sse4 support.  */
#define search_line_sse42 search_line_sse2
#endif
 473
 474 /* Check the CPU capabilities.  */
 475
 476 #include "../gcc/config/i386/cpuid.h"
 477
 478 typedef const uchar * (*search_line_fast_type) (const uchar *, const uchar *);
 479 static search_line_fast_type search_line_fast;
 480
 481 #define HAVE_init_vectorized_lexer 1
 482 static inline void
 483 init_vectorized_lexer (void)
 484 {
 485   unsigned dummy, ecx = 0, edx = 0;
 486   search_line_fast_type impl = search_line_acc_char;
 487   int minimum = 0;
 488
 489 #if defined(__SSE4_2__)
 490   minimum = 3;
 491 #elif defined(__SSE2__)
 492   minimum = 2;
 493 #elif defined(__SSE__)
 494   minimum = 1;
 495 #endif
 496
 497   if (minimum == 3)
 498     impl = search_line_sse42;
 499   else if (__get_cpuid (1, &dummy, &dummy, &ecx, &edx) || minimum == 2)
 500     {
 501       if (minimum == 3 || (ecx & bit_SSE4_2))
 502         impl = search_line_sse42;
 503       else if (minimum == 2 || (edx & bit_SSE2))
 504         impl = search_line_sse2;
 505       else if (minimum == 1 || (edx & bit_SSE))
 506         impl = search_line_mmx;
 507     }
 508   else if (__get_cpuid (0x80000001, &dummy, &dummy, &dummy, &edx))
 509     {
 510       if (minimum == 1
 511           || (edx & (bit_MMXEXT | bit_CMOV)) == (bit_MMXEXT | bit_CMOV))
 512         impl = search_line_mmx;
 513     }
 514
 515   search_line_fast = impl;
 516 }
 517
 518 #elif (GCC_VERSION >= 4005) && defined(__ALTIVEC__)
 519
/* A version of the fast scanner using AltiVec vectorized byte compares.

   Starting at S, return a pointer to the first '\n', '\r', '\\' or
   '?'.  END is unused; the loop stops only when a match is found.  */
/* ??? Unfortunately, attribute(target("altivec")) is not yet supported,
   so we can't compile this function without -maltivec on the command line
   (or implied by some other switch).  */

static const uchar *
search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
{
  typedef __attribute__((altivec(vector))) unsigned char vc;

  const vc repl_nl = {
    '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
    '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
  };
  const vc repl_cr = {
    '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
    '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
  };
  const vc repl_bs = {
    '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
    '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
  };
  const vc repl_qm = {
    '?', '?', '?', '?', '?', '?', '?', '?',
    '?', '?', '?', '?', '?', '?', '?', '?',
  };
  const vc ones = {
    -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1,
  };
  const vc zero = { 0 };

  vc data, mask, t;

  /* Altivec loads automatically mask addresses with -16.  This lets us
     issue the first load as early as possible.  */
  data = __builtin_vec_ld(0, (const vc *)s);

  /* Discard bytes before the beginning of the buffer.  Do this by
     beginning with all ones and shifting in zeros according to the
     mis-alignment.  The LVSR instruction pulls the exact shift we
     want from the address.  */
  mask = __builtin_vec_lvsr(0, s);
  mask = __builtin_vec_perm(zero, ones, mask);
  data &= mask;

  /* While altivec loads mask addresses, we still need to align S so
     that the offset we compute at the end is correct.  */
  s = (const uchar *)((uintptr_t)s & -16);

  /* Main loop processing 16 bytes at a time.  The first block has
     already been loaded and masked, so jump over the load at the top
     of the loop body.  */
  goto start;
  do
    {
      vc m_nl, m_cr, m_bs, m_qm;

      s += 16;
      data = __builtin_vec_ld(0, (const vc *)s);

    start:
      m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
      m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
      m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
      m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
      t = (m_nl | m_cr) | (m_bs | m_qm);

      /* T now contains 0xff in bytes for which we matched one of the relevant
         characters.  We want to exit the loop if any byte in T is non-zero.
         Below is the expansion of vec_any_ne(t, zero).  */
    }
  while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));

  {
    /* Number of unsigned longs that make up one vector.  */
#define N  (sizeof(vc) / sizeof(long))

    union {
      vc v;
      /* Statically assert that N is 2 or 4.  */
      unsigned long l[(N == 2 || N == 4) ? N : -1];
    } u;
    unsigned long l, i = 0;

    u.v = t;

    /* Find the first word of T that is non-zero.  S is advanced past
       every all-zero word that is skipped.  The final word is taken
       without a test, because the loop above only exits once some
       byte of T is non-zero.  */
    switch (N)
      {
      case 4:
        l = u.l[i++];
        if (l != 0)
          break;
        s += sizeof(unsigned long);
        l = u.l[i++];
        if (l != 0)
          break;
        s += sizeof(unsigned long);
        /* Fall through: the remaining two words are handled exactly
           as in the two-word case.  */
      case 2:
        l = u.l[i++];
        if (l != 0)
          break;
        s += sizeof(unsigned long);
        l = u.l[i];
      }

    /* L now contains 0xff in bytes for which we matched one of the
       relevant characters.  We can find the byte index by finding
       its bit index and dividing by 8.  */
    l = __builtin_clzl(l) >> 3;
    return s + l;

#undef N
  }
}
 633
 634 #elif defined (__ARM_NEON__)
 635 #include "arm_neon.h"
 636
/* A version of the fast scanner using ARM NEON vectorized byte
   compares.

   Starting at S, return a pointer to the first '\n', '\r', '\\' or
   '?'.  END is unused; the loop stops only when a match is found.  */
static const uchar *
search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
{
  const uint8x16_t repl_nl = vdupq_n_u8 ('\n');
  const uint8x16_t repl_cr = vdupq_n_u8 ('\r');
  const uint8x16_t repl_bs = vdupq_n_u8 ('\\');
  const uint8x16_t repl_qm = vdupq_n_u8 ('?');
  /* Gives each byte within an 8-byte half its own distinct bit, so
     that the pairwise additions below gather the matches of each half
     into an 8-bit mask.  */
  const uint8x16_t xmask = (uint8x16_t) vdupq_n_u64 (0x8040201008040201ULL);

  unsigned int misalign, found, mask;
  const uint8_t *p;
  uint8x16_t data;

  /* Align the source pointer.  */
  misalign = (uintptr_t)s & 15;
  p = (const uint8_t *)((uintptr_t)s & -16);
  data = vld1q_u8 (p);

  /* Create a mask for the bytes that are valid within the first
     16-byte block.  The Idea here is that the AND with the mask
     within the loop is "free", since we need some AND or TEST
     insn in order to set the flags for the branch anyway.  */
  mask = (-1u << misalign) & 0xffff;

  /* Main loop, processing 16 bytes at a time.  The first block has
     already been loaded, so jump over the load at the top of the
     loop body.  */
  goto start;

  do
    {
      uint8x8_t l;
      uint16x4_t m;
      uint32x2_t n;
      uint8x16_t t, u, v, w;

      p += 16;
      data = vld1q_u8 (p);
      /* Every byte of the second and later blocks is valid.  */
      mask = 0xffff;

    start:
      t = vceqq_u8 (data, repl_nl);
      u = vceqq_u8 (data, repl_cr);
      v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
      w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
      /* Keep one distinct bit per matching byte, then add neighbours
         pairwise three times: afterwards the two 32-bit lanes of N
         hold the 8-bit match masks of the low and the high half.  */
      t = vandq_u8 (vorrq_u8 (v, w), xmask);
      l = vpadd_u8 (vget_low_u8 (t), vget_high_u8 (t));
      m = vpaddl_u8 (l);
      n = vpaddl_u16 (m);

      /* Shifting the 64-bit view right by 24 places the high-half mask
         at bits 8-15 of lane 0; OR-ing it in yields a 16-bit mask with
         one bit per input byte.  */
      found = vget_lane_u32 ((uint32x2_t) vorr_u64 ((uint64x1_t) n,
              vshr_n_u64 ((uint64x1_t) n, 24)), 0);
      found &= mask;
    }
  while (!found);

  /* FOUND contains 1 in bits for which we matched a relevant
     character.  Conversion to the byte index is trivial.  */
  found = __builtin_ctz (found);
  return (const uchar *)p + found;
}
 696
 697 #else
 698
/* We only have one accelerated alternative.  Use a direct call so that
   we encourage inlining.  */

#define search_line_fast  search_line_acc_char
 703
 704 #endif
 705
/* Initialize the lexer if needed.  When a run-time selectable
   vectorized scanner was compiled in (HAVE_init_vectorized_lexer is
   defined), pick the implementation to use; otherwise this does
   nothing.  */

void
_cpp_init_lexer (void)
{
#ifdef HAVE_init_vectorized_lexer
  init_vectorized_lexer ();
#endif
}
 715
/* Returns with a logical line that contains no escaped newlines or
   trigraphs.  This is a time-critical inner loop.

   The line starts at PFILE->buffer->next_line.  On return, cur and
   line_base point at the start of the line, the line is terminated by
   a '\n' written into the buffer, next_line points just past the text
   that was consumed, and the note array describes every escaped
   newline and trigraph that was encountered, followed by a sentinel
   note.  */
void
_cpp_clean_line (cpp_reader *pfile)
{
  cpp_buffer *buffer;
  /* S is the read position; D is the write position used once the
     line has to be rewritten in place; P is a scratch pointer for
     scanning backwards over horizontal whitespace.  */
  const uchar *s;
  uchar c, *d, *p;

  buffer = pfile->buffer;
  buffer->cur_note = buffer->notes_used = 0;
  buffer->cur = buffer->line_base = buffer->next_line;
  buffer->need_line = false;
  s = buffer->next_line;

  if (!buffer->from_stage3)
    {
      /* Location of the most recent backslash seen on the fast path,
         if any.  */
      const uchar *pbackslash = NULL;

      /* Fast path.  This is the common case of an un-escaped line with
         no trigraphs.  The primary win here is by not writing any
         data back to memory until we have to.  */
      while (1)
        {
          /* Perform an optimized search for \n, \r, \\, ?.  */
          s = search_line_fast (s, buffer->rlimit);

          c = *s;
          if (c == '\\')
            {
              /* Record the location of the backslash and continue.  */
              pbackslash = s++;
            }
          else if (__builtin_expect (c == '?', 0))
            {
              if (__builtin_expect (s[1] == '?', false)
                   && _cpp_trigraph_map[s[2]])
                {
                  /* Have a trigraph.  We may or may not have to convert
                     it.  Add a line note regardless, for -Wtrigraphs.  */
                  add_line_note (buffer, s, s[2]);
                  if (CPP_OPTION (pfile, trigraphs))
                    {
                      /* We do, and that means we have to switch to the
                         slow path.  The replacement character is
                         stored over the first '?', and S is left on
                         the last character of the trigraph.  */
                      d = (uchar *) s;
                      *d = _cpp_trigraph_map[s[2]];
                      s += 2;
                      goto slow_path;
                    }
                }
              /* Not a trigraph.  Continue on fast-path.  */
              s++;
            }
          else
            break;
        }

      /* This must be \r or \n.  We're either done, or we'll be forced
         to write back to the buffer and continue on the slow path.  */
      d = (uchar *) s;

      if (__builtin_expect (s == buffer->rlimit, false))
        goto done;

      /* DOS line ending? */
      if (__builtin_expect (c == '\r', false) && s[1] == '\n')
        {
          s++;
          if (s == buffer->rlimit)
            goto done;
        }

      /* Without a backslash on the line the newline cannot be
         escaped.  */
      if (__builtin_expect (pbackslash == NULL, true))
        goto done;

      /* Check for escaped newline: walk back over horizontal
         whitespace and see whether the character before it is the
         last backslash that was recorded.  */
      p = d;
      while (is_nvspace (p[-1]))
        p--;
      if (p - 1 != pbackslash)
        goto done;

      /* Have an escaped newline; process it and proceed to
         the slow path.  The note type records whether whitespace
         separated the backslash from the newline (' ') or not
         ('\\').  */
      add_line_note (buffer, p - 1, p != d ? ' ' : '\\');
      d = p - 2;
      buffer->next_line = p - 1;

    slow_path:
      /* Copy characters from S to D one at a time, so that the text
         following a removed escaped newline or a converted trigraph
         is moved down over the gap.  */
      while (1)
        {
          c = *++s;
          *++d = c;

          if (c == '\n' || c == '\r')
            {
              /* Handle DOS line endings.  */
              if (c == '\r' && s != buffer->rlimit && s[1] == '\n')
                s++;
              if (s == buffer->rlimit)
                break;

              /* Escaped?  next_line marks the position of the previous
                 escaped newline, which bounds the backward scan.  */
              p = d;
              while (p != buffer->next_line && is_nvspace (p[-1]))
                p--;
              if (p == buffer->next_line || p[-1] != '\\')
                break;

              add_line_note (buffer, p - 1, p != d ? ' ': '\\');
              d = p - 2;
              buffer->next_line = p - 1;
            }
          else if (c == '?' && s[1] == '?' && _cpp_trigraph_map[s[2]])
            {
              /* Add a note regardless, for the benefit of -Wtrigraphs.  */
              add_line_note (buffer, d, s[2]);
              if (CPP_OPTION (pfile, trigraphs))
                {
                  *d = _cpp_trigraph_map[s[2]];
                  s += 2;
                }
            }
        }
    }
  else
    {
      /* For a from_stage3 buffer no trigraph or escaped-newline
         processing is done; only the end of the line is located.  */
      while (*s != '\n' && *s != '\r')
        s++;
      d = (uchar *) s;

      /* Handle DOS line endings.  */
      if (*s == '\r' && s != buffer->rlimit && s[1] == '\n')
        s++;
    }

 done:
  /* Terminate the cleaned line, which may be shorter than the text
     that was consumed.  */
  *d = '\n';
  /* A sentinel note that should never be processed.  */
  add_line_note (buffer, d + 1, '\n');
  buffer->next_line = s + 1;
}
 859
 860 /* Return true if the trigraph indicated by NOTE should be warned
 861    about in a comment.  */
 862 static bool
 863 warn_in_comment (cpp_reader *pfile, _cpp_line_note *note)
 864 {
 865   const uchar *p;
 866
 867   /* Within comments we don't warn about trigraphs, unless the
 868      trigraph forms an escaped newline, as that may change
 869      behavior.  */
 870   if (note->type != '/')
 871     return false;
 872
 873   /* If -trigraphs, then this was an escaped newline iff the next note
 874      is coincident.  */
 875   if (CPP_OPTION (pfile, trigraphs))
 876     return note[1].pos == note->pos;
 877
 878   /* Otherwise, see if this forms an escaped newline.  */
 879   p = note->pos + 3;
 880   while (is_nvspace (*p))
 881     p++;
 882
 883   /* There might have been escaped newlines between the trigraph and the
 884      newline we found.  Hence the position test.  */
 885   return (*p == '\n' && p < note[1].pos);
 886 }
 887
 888 /* Process the notes created by add_line_note as far as the current
 889    location.  */
 890 void
 891 _cpp_process_line_notes (cpp_reader *pfile, int in_comment)
 892 {
 893   cpp_buffer *buffer = pfile->buffer;
 894
 895   for (;;)
 896     {
 897       _cpp_line_note *note = &buffer->notes[buffer->cur_note];
 898       unsigned int col;
 899
 900       if (note->pos > buffer->cur)
 901         break;
 902
 903       buffer->cur_note++;
 904       col = CPP_BUF_COLUMN (buffer, note->pos + 1);
 905
 906       if (note->type == '\\' || note->type == ' ')
 907         {
 908           if (note->type == ' ' && !in_comment)
 909             cpp_error_with_line (pfile, CPP_DL_WARNING, pfile->line_table->highest_line, col,
 910                                  "backslash and newline separated by space");
 911
 912           if (buffer->next_line > buffer->rlimit)
 913             {
 914               cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line, col,
 915                                    "backslash-newline at end of file");
 916               /* Prevent "no newline at end of file" warning.  */
 917               buffer->next_line = buffer->rlimit;
 918             }
 919
 920           buffer->line_base = note->pos;
 921           CPP_INCREMENT_LINE (pfile, 0);
 922         }
 923       else if (_cpp_trigraph_map[note->type])
 924         {
 925           if (CPP_OPTION (pfile, warn_trigraphs)
 926               && (!in_comment || warn_in_comment (pfile, note)))
 927             {
 928               if (CPP_OPTION (pfile, trigraphs))
 929                 cpp_warning_with_line (pfile, CPP_W_TRIGRAPHS,
 930                                        pfile->line_table->highest_line, col,
 931                                        "trigraph ??%c converted to %c",
 932                                        note->type,
 933                                        (int) _cpp_trigraph_map[note->type]);
 934               else
 935                 {
 936                   cpp_warning_with_line
 937                     (pfile, CPP_W_TRIGRAPHS,
 938                      pfile->line_table->highest_line, col,
 939                      "trigraph ??%c ignored, use -trigraphs to enable",
 940                      note->type);
 941                 }
 942             }
 943         }
 944       else if (note->type == 0)
 945         /* Already processed in lex_raw_string.  */;
 946       else
 947         abort ();
 948     }
 949 }
 950
 951 /* Skip a C-style block comment.  We find the end of the comment by
 952    seeing if an asterisk is before every '/' we encounter.  Returns
 953    nonzero if comment terminated by EOF, zero otherwise.
 954
 955    Buffer->cur points to the initial asterisk of the comment.  */
 956 bool
 957 _cpp_skip_block_comment (cpp_reader *pfile)
 958 {
 959   cpp_buffer *buffer = pfile->buffer;
 960   const uchar *cur = buffer->cur;
 961   uchar c;
 962
 963   cur++;
 964   if (*cur == '/')
 965     cur++;
 966
 967   for (;;)
 968     {
 969       /* People like decorating comments with '*', so check for '/'
 970          instead for efficiency.  */
 971       c = *cur++;
 972
 973       if (c == '/')
 974         {
 975           if (cur[-2] == '*')
 976             break;
 977
 978           /* Warn about potential nested comments, but not if the '/'
 979              comes immediately before the true comment delimiter.
 980              Don't bother to get it right across escaped newlines.  */
 981           if (CPP_OPTION (pfile, warn_comments)
 982               && cur[0] == '*' && cur[1] != '/')
 983             {
 984               buffer->cur = cur;
 985               cpp_warning_with_line (pfile, CPP_W_COMMENTS,
 986                                      pfile->line_table->highest_line,
 987                                      CPP_BUF_COL (buffer),
 988                                      "\"/*\" within comment");
 989             }
 990         }
 991       else if (c == '\n')
 992         {
 993           unsigned int cols;
 994           buffer->cur = cur - 1;
 995           _cpp_process_line_notes (pfile, true);
 996           if (buffer->next_line >= buffer->rlimit)
 997             return true;
 998           _cpp_clean_line (pfile);
 999
1000           cols = buffer->next_line - buffer->line_base;
1001           CPP_INCREMENT_LINE (pfile, cols);
1002
1003           cur = buffer->cur;
1004         }
1005     }
1006
1007   buffer->cur = cur;
1008   _cpp_process_line_notes (pfile, true);
1009   return false;
1010 }
1011
1012 /* Skip a C++ line comment, leaving buffer->cur pointing to the
1013    terminating newline.  Handles escaped newlines.  Returns nonzero
1014    if a multiline comment.  */
1015 static int
1016 skip_line_comment (cpp_reader *pfile)
1017 {
1018   cpp_buffer *buffer = pfile->buffer;
1019   source_location orig_line = pfile->line_table->highest_line;
1020
1021   while (*buffer->cur != '\n')
1022     buffer->cur++;
1023
1024   _cpp_process_line_notes (pfile, true);
1025   return orig_line != pfile->line_table->highest_line;
1026 }
1027
1028 /* Skips whitespace, saving the next non-whitespace character.  */
1029 static void
1030 skip_whitespace (cpp_reader *pfile, cppchar_t c)
1031 {
1032   cpp_buffer *buffer = pfile->buffer;
1033   bool saw_NUL = false;
1034
1035   do
1036     {
1037       /* Horizontal space always OK.  */
1038       if (c == ' ' || c == '\t')
1039         ;
1040       /* Just \f \v or \0 left.  */
1041       else if (c == '\0')
1042         saw_NUL = true;
1043       else if (pfile->state.in_directive && CPP_PEDANTIC (pfile))
1044         cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line,
1045                              CPP_BUF_COL (buffer),
1046                              "%s in preprocessing directive",
1047                              c == '\f' ? "form feed" : "vertical tab");
1048
1049       c = *buffer->cur++;
1050     }
1051   /* We only want non-vertical space, i.e. ' ' \t \f \v \0.  */
1052   while (is_nvspace (c));
1053
1054   if (saw_NUL)
1055     cpp_error (pfile, CPP_DL_WARNING, "null character(s) ignored");
1056
1057   buffer->cur--;
1058 }
1059
1060 /* See if the characters of a number token are valid in a name (no
1061    '.', '+' or '-').  */
1062 static int
1063 name_p (cpp_reader *pfile, const cpp_string *string)
1064 {
1065   unsigned int i;
1066
1067   for (i = 0; i < string->len; i++)
1068     if (!is_idchar (string->text[i]))
1069       return 0;
1070
1071   return 1;
1072 }
1073
1074 /* After parsing an identifier or other sequence, produce a warning about
1075    sequences not in NFC/NFKC.  */
1076 static void
1077 warn_about_normalization (cpp_reader *pfile,
1078                           const cpp_token *token,
1079                           const struct normalize_state *s)
1080 {
1081   if (CPP_OPTION (pfile, warn_normalize) < NORMALIZE_STATE_RESULT (s)
1082       && !pfile->state.skipping)
1083     {
1084       /* Make sure that the token is printed using UCNs, even
1085          if we'd otherwise happily print UTF-8.  */
1086       unsigned char *buf = XNEWVEC (unsigned char, cpp_token_len (token));
1087       size_t sz;
1088
1089       sz = cpp_spell_token (pfile, token, buf, false) - buf;
1090       if (NORMALIZE_STATE_RESULT (s) == normalized_C)
1091         cpp_warning_with_line (pfile, CPP_W_NORMALIZE, token->src_loc, 0,
1092                                "`%.*s' is not in NFKC", (int) sz, buf);
1093       else
1094         cpp_warning_with_line (pfile, CPP_W_NORMALIZE, token->src_loc, 0,
1095                                "`%.*s' is not in NFC", (int) sz, buf);
1096       free (buf);
1097     }
1098 }
1099
1100 /* Returns TRUE if the sequence starting at buffer->cur is invalid in
1101    an identifier.  FIRST is TRUE if this starts an identifier.  */
1102 static bool
1103 forms_identifier_p (cpp_reader *pfile, int first,
1104                     struct normalize_state *state)
1105 {
1106   cpp_buffer *buffer = pfile->buffer;
1107
1108   if (*buffer->cur == '$')
1109     {
1110       if (!CPP_OPTION (pfile, dollars_in_ident))
1111         return false;
1112
1113       buffer->cur++;
1114       if (CPP_OPTION (pfile, warn_dollars) && !pfile->state.skipping)
1115         {
1116           CPP_OPTION (pfile, warn_dollars) = 0;
1117           cpp_error (pfile, CPP_DL_PEDWARN, "'$' in identifier or number");
1118         }
1119
1120       return true;
1121     }
1122
1123   /* Is this a syntactically valid UCN?  */
1124   if (CPP_OPTION (pfile, extended_identifiers)
1125       && *buffer->cur == '\\'
1126       && (buffer->cur[1] == 'u' || buffer->cur[1] == 'U'))
1127     {
1128       buffer->cur += 2;
1129       if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
1130                           state))
1131         return true;
1132       buffer->cur -= 2;
1133     }
1134
1135   return false;
1136 }
1137
1138 /* Helper function to get the cpp_hashnode of the identifier BASE.  */
1139 static cpp_hashnode *
1140 lex_identifier_intern (cpp_reader *pfile, const uchar *base)
1141 {
1142   cpp_hashnode *result;
1143   const uchar *cur;
1144   unsigned int len;
1145   unsigned int hash = HT_HASHSTEP (0, *base);
1146
1147   cur = base + 1;
1148   while (ISIDNUM (*cur))
1149     {
1150       hash = HT_HASHSTEP (hash, *cur);
1151       cur++;
1152     }
1153   len = cur - base;
1154   hash = HT_HASHFINISH (hash, len);
1155   result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
1156                                               base, len, hash, HT_ALLOC));
1157
1158   /* Rarely, identifiers require diagnostics when lexed.  */
1159   if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
1160                         && !pfile->state.skipping, 0))
1161     {
1162       /* It is allowed to poison the same identifier twice.  */
1163       if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
1164         cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
1165                    NODE_NAME (result));
1166
1167       /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
1168          replacement list of a variadic macro.  */
1169       if (result == pfile->spec_nodes.n__VA_ARGS__
1170           && !pfile->state.va_args_ok)
1171         cpp_error (pfile, CPP_DL_PEDWARN,
1172                    "__VA_ARGS__ can only appear in the expansion"
1173                    " of a C99 variadic macro");
1174
1175       /* For -Wc++-compat, warn about use of C++ named operators.  */
1176       if (result->flags & NODE_WARN_OPERATOR)
1177         cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
1178                      "identifier \"%s\" is a special operator name in C++",
1179                      NODE_NAME (result));
1180     }
1181
1182   return result;
1183 }
1184
1185 /* Get the cpp_hashnode of an identifier specified by NAME in
1186    the current cpp_reader object.  If none is found, NULL is returned.  */
1187 cpp_hashnode *
1188 _cpp_lex_identifier (cpp_reader *pfile, const char *name)
1189 {
1190   cpp_hashnode *result;
1191   result = lex_identifier_intern (pfile, (uchar *) name);
1192   return result;
1193 }
1194
1195 /* Lex an identifier starting at BUFFER->CUR - 1.  */
1196 static cpp_hashnode *
1197 lex_identifier (cpp_reader *pfile, const uchar *base, bool starts_ucn,
1198                 struct normalize_state *nst)
1199 {
1200   cpp_hashnode *result;
1201   const uchar *cur;
1202   unsigned int len;
1203   unsigned int hash = HT_HASHSTEP (0, *base);
1204
1205   cur = pfile->buffer->cur;
1206   if (! starts_ucn)
1207     {
1208       while (ISIDNUM (*cur))
1209         {
1210           hash = HT_HASHSTEP (hash, *cur);
1211           cur++;
1212         }
1213       NORMALIZE_STATE_UPDATE_IDNUM (nst, *(cur - 1));
1214     }
1215   pfile->buffer->cur = cur;
1216   if (starts_ucn || forms_identifier_p (pfile, false, nst))
1217     {
1218       /* Slower version for identifiers containing UCNs (or $).  */
1219       do {
1220         while (ISIDNUM (*pfile->buffer->cur))
1221           {
1222             NORMALIZE_STATE_UPDATE_IDNUM (nst, *pfile->buffer->cur);
1223             pfile->buffer->cur++;
1224           }
1225       } while (forms_identifier_p (pfile, false, nst));
1226       result = _cpp_interpret_identifier (pfile, base,
1227                                           pfile->buffer->cur - base);
1228     }
1229   else
1230     {
1231       len = cur - base;
1232       hash = HT_HASHFINISH (hash, len);
1233
1234       result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
1235                                                   base, len, hash, HT_ALLOC));
1236     }
1237
1238   /* Rarely, identifiers require diagnostics when lexed.  */
1239   if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
1240                         && !pfile->state.skipping, 0))
1241     {
1242       /* It is allowed to poison the same identifier twice.  */
1243       if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
1244         cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
1245                    NODE_NAME (result));
1246
1247       /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
1248          replacement list of a variadic macro.  */
1249       if (result == pfile->spec_nodes.n__VA_ARGS__
1250           && !pfile->state.va_args_ok)
1251         cpp_error (pfile, CPP_DL_PEDWARN,
1252                    "__VA_ARGS__ can only appear in the expansion"
1253                    " of a C99 variadic macro");
1254
1255       /* For -Wc++-compat, warn about use of C++ named operators.  */
1256       if (result->flags & NODE_WARN_OPERATOR)
1257         cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
1258                      "identifier \"%s\" is a special operator name in C++",
1259                      NODE_NAME (result));
1260     }
1261
1262   return result;
1263 }
1264
1265 /* Lex a number to NUMBER starting at BUFFER->CUR - 1.  */
1266 static void
1267 lex_number (cpp_reader *pfile, cpp_string *number,
1268             struct normalize_state *nst)
1269 {
1270   const uchar *cur;
1271   const uchar *base;
1272   uchar *dest;
1273
1274   base = pfile->buffer->cur - 1;
1275   do
1276     {
1277       cur = pfile->buffer->cur;
1278
1279       /* N.B. ISIDNUM does not include $.  */
1280       while (ISIDNUM (*cur) || *cur == '.' || DIGIT_SEP (*cur)
1281              || VALID_SIGN (*cur, cur[-1]))
1282         {
1283           NORMALIZE_STATE_UPDATE_IDNUM (nst, *cur);
1284           cur++;
1285         }
1286
1287       pfile->buffer->cur = cur;
1288     }
1289   while (forms_identifier_p (pfile, false, nst));
1290
1291   number->len = cur - base;
1292   dest = _cpp_unaligned_alloc (pfile, number->len + 1);
1293   memcpy (dest, base, number->len);
1294   dest[number->len] = '\0';
1295   number->text = dest;
1296 }
1297
1298 /* Create a token of type TYPE with a literal spelling.  */
1299 static void
1300 create_literal (cpp_reader *pfile, cpp_token *token, const uchar *base,
1301                 unsigned int len, enum cpp_ttype type)
1302 {
1303   uchar *dest = _cpp_unaligned_alloc (pfile, len + 1);
1304
1305   memcpy (dest, base, len);
1306   dest[len] = '\0';
1307   token->type = type;
1308   token->val.str.len = len;
1309   token->val.str.text = dest;
1310 }
1311
1312 /* Subroutine of lex_raw_string: Append LEN chars from BASE to the buffer
1313    sequence from *FIRST_BUFF_P to LAST_BUFF_P.  */
1314
1315 static void
1316 bufring_append (cpp_reader *pfile, const uchar *base, size_t len,
1317                 _cpp_buff **first_buff_p, _cpp_buff **last_buff_p)
1318 {
1319   _cpp_buff *first_buff = *first_buff_p;
1320   _cpp_buff *last_buff = *last_buff_p;
1321
1322   if (first_buff == NULL)
1323     first_buff = last_buff = _cpp_get_buff (pfile, len);
1324   else if (len > BUFF_ROOM (last_buff))
1325     {
1326       size_t room = BUFF_ROOM (last_buff);
1327       memcpy (BUFF_FRONT (last_buff), base, room);
1328       BUFF_FRONT (last_buff) += room;
1329       base += room;
1330       len -= room;
1331       last_buff = _cpp_append_extend_buff (pfile, last_buff, len);
1332     }
1333
1334   memcpy (BUFF_FRONT (last_buff), base, len);
1335   BUFF_FRONT (last_buff) += len;
1336
1337   *first_buff_p = first_buff;
1338   *last_buff_p = last_buff;
1339 }
1340
1341
1342 /* Returns true if a macro has been defined.
1343    This might not work if compile with -save-temps,
1344    or preprocess separately from compilation.  */
1345
1346 static bool
1347 is_macro(cpp_reader *pfile, const uchar *base)
1348 {
1349   const uchar *cur = base;
1350   if (! ISIDST (*cur))
1351     return false;
1352   unsigned int hash = HT_HASHSTEP (0, *cur);
1353   ++cur;
1354   while (ISIDNUM (*cur))
1355     {
1356       hash = HT_HASHSTEP (hash, *cur);
1357       ++cur;
1358     }
1359   hash = HT_HASHFINISH (hash, cur - base);
1360
1361   cpp_hashnode *result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
1362                                         base, cur - base, hash, HT_NO_INSERT));
1363
1364   return !result ? false : (result->type == NT_MACRO);
1365 }
1366
1367
1368 /* Lexes a raw string.  The stored string contains the spelling, including
1369    double quotes, delimiter string, '(' and ')', any leading
1370    'L', 'u', 'U' or 'u8' and 'R' modifier.  It returns the type of the
1371    literal, or CPP_OTHER if it was not properly terminated.
1372
1373    The spelling is NUL-terminated, but it is not guaranteed that this
1374    is the first NUL since embedded NULs are preserved.  */
1375
1376 static void
1377 lex_raw_string (cpp_reader *pfile, cpp_token *token, const uchar *base,
1378                 const uchar *cur)
1379 {
1380   uchar raw_prefix[17];
1381   uchar temp_buffer[18];
1382   const uchar *orig_base;
1383   unsigned int raw_prefix_len = 0, raw_suffix_len = 0;
1384   enum raw_str_phase { RAW_STR_PREFIX, RAW_STR, RAW_STR_SUFFIX };
1385   raw_str_phase phase = RAW_STR_PREFIX;
1386   enum cpp_ttype type;
1387   size_t total_len = 0;
1388   /* Index into temp_buffer during phases other than RAW_STR,
1389      during RAW_STR phase 17 to tell BUF_APPEND that nothing should
1390      be appended to temp_buffer.  */
1391   size_t temp_buffer_len = 0;
1392   _cpp_buff *first_buff = NULL, *last_buff = NULL;
1393   size_t raw_prefix_start;
1394   _cpp_line_note *note = &pfile->buffer->notes[pfile->buffer->cur_note];
1395
1396   type = (*base == 'L' ? CPP_WSTRING :
1397           *base == 'U' ? CPP_STRING32 :
1398           *base == 'u' ? (base[1] == '8' ? CPP_UTF8STRING : CPP_STRING16)
1399           : CPP_STRING);
1400
1401 #define BUF_APPEND(STR,LEN)                                     \
1402       do {                                                      \
1403         bufring_append (pfile, (const uchar *)(STR), (LEN),     \
1404                         &first_buff, &last_buff);               \
1405         total_len += (LEN);                                     \
1406         if (__builtin_expect (temp_buffer_len < 17, 0)          \
1407             && (const uchar *)(STR) != base                     \
1408             && (LEN) <= 2)                                      \
1409           {                                                     \
1410             memcpy (temp_buffer + temp_buffer_len,              \
1411                     (const uchar *)(STR), (LEN));               \
1412             temp_buffer_len += (LEN);                           \
1413           }                                                     \
1414       } while (0);
1415
1416   orig_base = base;
1417   ++cur;
1418   raw_prefix_start = cur - base;
1419   for (;;)
1420     {
1421       cppchar_t c;
1422
1423       /* If we previously performed any trigraph or line splicing
1424          transformations, undo them in between the opening and closing
1425          double quote.  */
1426       while (note->pos < cur)
1427         ++note;
1428       for (; note->pos == cur; ++note)
1429         {
1430           switch (note->type)
1431             {
1432             case '\\':
1433             case ' ':
1434               /* Restore backslash followed by newline.  */
1435               BUF_APPEND (base, cur - base);
1436               base = cur;
1437               BUF_APPEND ("\\", 1);
1438             after_backslash:
1439               if (note->type == ' ')
1440                 {
1441                   /* GNU backslash whitespace newline extension.  FIXME
1442                      could be any sequence of non-vertical space.  When we
1443                      can properly restore any such sequence, we should mark
1444                      this note as handled so _cpp_process_line_notes
1445                      doesn't warn.  */
1446                   BUF_APPEND (" ", 1);
1447                 }
1448
1449               BUF_APPEND ("\n", 1);
1450               break;
1451
1452             case 0:
1453               /* Already handled.  */
1454               break;
1455
1456             default:
1457               if (_cpp_trigraph_map[note->type])
1458                 {
1459                   /* Don't warn about this trigraph in
1460                      _cpp_process_line_notes, since trigraphs show up as
1461                      trigraphs in raw strings.  */
1462                   uchar type = note->type;
1463                   note->type = 0;
1464
1465                   if (!CPP_OPTION (pfile, trigraphs))
1466                     /* If we didn't convert the trigraph in the first
1467                        place, don't do anything now either.  */
1468                     break;
1469
1470                   BUF_APPEND (base, cur - base);
1471                   base = cur;
1472                   BUF_APPEND ("??", 2);
1473
1474                   /* ??/ followed by newline gets two line notes, one for
1475                      the trigraph and one for the backslash/newline.  */
1476                   if (type == '/' && note[1].pos == cur)
1477                     {
1478                       if (note[1].type != '\\'
1479                           && note[1].type != ' ')
1480                         abort ();
1481                       BUF_APPEND ("/", 1);
1482                       ++note;
1483                       goto after_backslash;
1484                     }
1485                   else
1486                     {
1487                       /* Skip the replacement character.  */
1488                       base = ++cur;
1489                       BUF_APPEND (&type, 1);
1490                       c = type;
1491                       goto check_c;
1492                     }
1493                 }
1494               else
1495                 abort ();
1496               break;
1497             }
1498         }
1499       c = *cur++;
1500       if (__builtin_expect (temp_buffer_len < 17, 0))
1501         temp_buffer[temp_buffer_len++] = c;
1502
1503      check_c:
1504       if (phase == RAW_STR_PREFIX)
1505         {
1506           while (raw_prefix_len < temp_buffer_len)
1507             {
1508               raw_prefix[raw_prefix_len] = temp_buffer[raw_prefix_len];
1509               switch (raw_prefix[raw_prefix_len])
1510                 {
1511                 case ' ': case '(': case ')': case '\\': case '\t':
1512                 case '\v': case '\f': case '\n': default:
1513                   break;
1514                 /* Basic source charset except the above chars.  */
1515                 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
1516                 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
1517                 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
1518                 case 's': case 't': case 'u': case 'v': case 'w': case 'x':
1519                 case 'y': case 'z':
1520                 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
1521                 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
1522                 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
1523                 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
1524                 case 'Y': case 'Z':
1525                 case '0': case '1': case '2': case '3': case '4': case '5':
1526                 case '6': case '7': case '8': case '9':
1527                 case '_': case '{': case '}': case '#': case '[': case ']':
1528                 case '<': case '>': case '%': case ':': case ';': case '.':
1529                 case '?': case '*': case '+': case '-': case '/': case '^':
1530                 case '&': case '|': case '~': case '!': case '=': case ',':
1531                 case '"': case '\'':
1532                   if (raw_prefix_len < 16)
1533                     {
1534                       raw_prefix_len++;
1535                       continue;
1536                     }
1537                   break;
1538                 }
1539
1540               if (raw_prefix[raw_prefix_len] != '(')
1541                 {
1542                   int col = CPP_BUF_COLUMN (pfile->buffer, cur) + 1;
1543                   if (raw_prefix_len == 16)
1544                     cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
1545                                          col, "raw string delimiter longer "
1546                                               "than 16 characters");
1547                   else if (raw_prefix[raw_prefix_len] == '\n')
1548                     cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
1549                                          col, "invalid new-line in raw "
1550                                               "string delimiter");
1551                   else
1552                     cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
1553                                          col, "invalid character '%c' in "
1554                                               "raw string delimiter",
1555                                          (int) raw_prefix[raw_prefix_len]);
1556                   pfile->buffer->cur = orig_base + raw_prefix_start - 1;
1557                   create_literal (pfile, token, orig_base,
1558                                   raw_prefix_start - 1, CPP_OTHER);
1559                   if (first_buff)
1560                     _cpp_release_buff (pfile, first_buff);
1561                   return;
1562                 }
1563               raw_prefix[raw_prefix_len] = '"';
1564               phase = RAW_STR;
1565               /* Nothing should be appended to temp_buffer during
1566                  RAW_STR phase.  */
1567               temp_buffer_len = 17;
1568               break;
1569             }
1570           continue;
1571         }
1572       else if (phase == RAW_STR_SUFFIX)
1573         {
1574           while (raw_suffix_len <= raw_prefix_len
1575                  && raw_suffix_len < temp_buffer_len
1576                  && temp_buffer[raw_suffix_len] == raw_prefix[raw_suffix_len])
1577             raw_suffix_len++;
1578           if (raw_suffix_len > raw_prefix_len)
1579             break;
1580           if (raw_suffix_len == temp_buffer_len)
1581             continue;
1582           phase = RAW_STR;
1583           /* Nothing should be appended to temp_buffer during
1584              RAW_STR phase.  */
1585           temp_buffer_len = 17;
1586         }
1587       if (c == ')')
1588         {
1589           phase = RAW_STR_SUFFIX;
1590           raw_suffix_len = 0;
1591           temp_buffer_len = 0;
1592         }
1593       else if (c == '\n')
1594         {
1595           if (pfile->state.in_directive
1596               || (pfile->state.parsing_args
1597                   && pfile->buffer->next_line >= pfile->buffer->rlimit))
1598             {
1599               cur--;
1600               type = CPP_OTHER;
1601               cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, 0,
1602                                    "unterminated raw string");
1603               break;
1604             }
1605
1606           BUF_APPEND (base, cur - base);
1607
1608           if (pfile->buffer->cur < pfile->buffer->rlimit)
1609             CPP_INCREMENT_LINE (pfile, 0);
1610           pfile->buffer->need_line = true;
1611
1612           pfile->buffer->cur = cur-1;
1613           _cpp_process_line_notes (pfile, false);
1614           if (!_cpp_get_fresh_line (pfile))
1615             {
1616               source_location src_loc = token->src_loc;
1617               token->type = CPP_EOF;
1618               /* Tell the compiler the line number of the EOF token.  */
1619               token->src_loc = pfile->line_table->highest_line;
1620               token->flags = BOL;
1621               if (first_buff != NULL)
1622                 _cpp_release_buff (pfile, first_buff);
1623               cpp_error_with_line (pfile, CPP_DL_ERROR, src_loc, 0,
1624                                    "unterminated raw string");
1625               return;
1626             }
1627
1628           cur = base = pfile->buffer->cur;
1629           note = &pfile->buffer->notes[pfile->buffer->cur_note];
1630         }
1631     }
1632
1633   if (CPP_OPTION (pfile, user_literals))
1634     {
1635       /* If a string format macro, say from inttypes.h, is placed touching
1636          a string literal it could be parsed as a C++11 user-defined string
1637          literal thus breaking the program.
1638          Try to identify macros with is_macro. A warning is issued. */
1639       if (is_macro (pfile, cur))
1640         {
1641           /* Raise a warning, but do not consume subsequent tokens.  */
1642           if (CPP_OPTION (pfile, warn_literal_suffix))
1643             cpp_warning_with_line (pfile, CPP_W_LITERAL_SUFFIX,
1644                                    token->src_loc, 0,
1645                                    "invalid suffix on literal; C++11 requires "
1646                                    "a space between literal and string macro");
1647         }
1648       /* Grab user defined literal suffix.  */
1649       else if (ISIDST (*cur))
1650         {
1651           type = cpp_userdef_string_add_type (type);
1652           ++cur;
1653
1654           while (ISIDNUM (*cur))
1655             ++cur;
1656         }
1657     }
1658
1659   pfile->buffer->cur = cur;
1660   if (first_buff == NULL)
1661     create_literal (pfile, token, base, cur - base, type);
1662   else
1663     {
1664       uchar *dest = _cpp_unaligned_alloc (pfile, total_len + (cur - base) + 1);
1665
1666       token->type = type;
1667       token->val.str.len = total_len + (cur - base);
1668       token->val.str.text = dest;
1669       last_buff = first_buff;
1670       while (last_buff != NULL)
1671         {
1672           memcpy (dest, last_buff->base,
1673                   BUFF_FRONT (last_buff) - last_buff->base);
1674           dest += BUFF_FRONT (last_buff) - last_buff->base;
1675           last_buff = last_buff->next;
1676         }
1677       _cpp_release_buff (pfile, first_buff);
1678       memcpy (dest, base, cur - base);
1679       dest[cur - base] = '\0';
1680     }
1681 }
1682
1683 /* Lexes a string, character constant, or angle-bracketed header file
1684    name whose spelling starts at BASE.  The stored string contains the
1685    spelling, including opening quote and any leading 'L', 'u', 'U' or
1686    'u8' and optional 'R' modifier.  Nothing is returned: the type of the
1687    literal, or CPP_OTHER if it was not properly terminated, is handed
1688    to create_literal together with the spelling.  For an unterminated
     header name, which must be relexed as normal tokens, TOKEN's type
     is set to CPP_LESS and neither the buffer position nor the token
     value is touched.  Raw strings are delegated to lex_raw_string.
1689
1690    The spelling is NUL-terminated, but it is not guaranteed that this
1691    is the first NUL since embedded NULs are preserved.  */
1692 static void
1693 lex_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
1694 {
1695   bool saw_NUL = false;
1696   const uchar *cur;
1697   cppchar_t terminator;
1698   enum cpp_ttype type;
1699
       /* Step over an encoding prefix (L, U, u or u8), if any, so that
          TERMINATOR ends up holding the opening delimiter (or 'R').  */
1700   cur = base;
1701   terminator = *cur++;
1702   if (terminator == 'L' || terminator == 'U')
1703     terminator = *cur++;
1704   else if (terminator == 'u')
1705     {
1706       terminator = *cur++;
1707       if (terminator == '8')
1708         terminator = *cur++;
1709     }
1710   if (terminator == 'R')
1711     {
1712       lex_raw_string (pfile, token, base, cur);
1713       return;
1714     }
       /* Pick the token type from the delimiter and the prefix at BASE.
          Anything that is neither '"' nor '\'' is treated as the '<' of
          a header name, which is closed by '>'.  */
1715   if (terminator == '"')
1716     type = (*base == 'L' ? CPP_WSTRING :
1717             *base == 'U' ? CPP_STRING32 :
1718             *base == 'u' ? (base[1] == '8' ? CPP_UTF8STRING : CPP_STRING16)
1719                          : CPP_STRING);
1720   else if (terminator == '\'')
1721     type = (*base == 'L' ? CPP_WCHAR :
1722             *base == 'U' ? CPP_CHAR32 :
1723             *base == 'u' ? CPP_CHAR16 : CPP_CHAR);
1724   else
1725     terminator = '>', type = CPP_HEADER_NAME;
1726
       /* Scan forward to the closing delimiter or the end of the line.  */
1727   for (;;)
1728     {
1729       cppchar_t c = *cur++;
1730
1731       /* In #include-style directives, terminators are not escapable.  */
1732       if (c == '\\' && !pfile->state.angled_headers && *cur != '\n')
             /* Skip the escaped character so it cannot end the literal.  */
1733         cur++;
1734       else if (c == terminator)
1735         break;
1736       else if (c == '\n')
1737         {
               /* Leave the newline itself unconsumed.  */
1738           cur--;
1739           /* Unmatched quotes always yield undefined behavior, but
1740              greedy lexing means that what appears to be an unterminated
1741              header name may actually be a legitimate sequence of tokens.  */
1742           if (terminator == '>')
1743             {
1744               token->type = CPP_LESS;
1745               return;
1746             }
1747           type = CPP_OTHER;
1748           break;
1749         }
1750       else if (c == '\0')
1751         saw_NUL = true;
1752     }
1753
1754   if (saw_NUL && !pfile->state.skipping)
1755     cpp_error (pfile, CPP_DL_WARNING,
1756                "null character(s) preserved in literal");
1757
       /* An unterminated literal is diagnosed except when the language
          option is CLK_ASM.  */
1758   if (type == CPP_OTHER && CPP_OPTION (pfile, lang) != CLK_ASM)
1759     cpp_error (pfile, CPP_DL_PEDWARN, "missing terminating %c character",
1760                (int) terminator);
1761
1762   if (CPP_OPTION (pfile, user_literals))
1763     {
1764       /* If a string format macro, say from inttypes.h, is placed touching
1765          a string literal it could be parsed as a C++11 user-defined string
1766          literal thus breaking the program.
1767          Try to identify macros with is_macro. A warning is issued. */
1768       if (is_macro (pfile, cur))
1769         {
1770           /* Raise a warning, but do not consume subsequent tokens.  */
1771           if (CPP_OPTION (pfile, warn_literal_suffix))
1772             cpp_warning_with_line (pfile, CPP_W_LITERAL_SUFFIX,
1773                                    token->src_loc, 0,
1774                                    "invalid suffix on literal; C++11 requires "
1775                                    "a space between literal and string macro");
1776         }
1777       /* Grab user defined literal suffix.  */
1778       else if (ISIDST (*cur))
1779         {
1780           type = cpp_userdef_char_add_type (type);
1781           type = cpp_userdef_string_add_type (type);
1782           ++cur;
1783
               /* Extend the spelling over the rest of the suffix.  */
1784           while (ISIDNUM (*cur))
1785             ++cur;
1786         }
1787     }
1788
       /* Commit the consumed characters and record the spelling
          [BASE, CUR) with the chosen type.  */
1789   pfile->buffer->cur = cur;
1790   create_literal (pfile, token, base, cur - base, type);
1791 }
1792
1793 /* Return the comment table. The client may not make any assumption
1794    about the ordering of the table.  */
1795 cpp_comment_table *
1796 cpp_get_comments (cpp_reader *pfile)
1797 {
1798   return &pfile->comments;
1799 }
1800
1801 /* Append a comment to the end of the comment table. */
1802 static void
1803 store_comment (cpp_reader *pfile, cpp_token *token)
1804 {
1805   int len;
1806
1807   if (pfile->comments.allocated == 0)
1808     {
1809       pfile->comments.allocated = 256;
1810       pfile->comments.entries = (cpp_comment *) xmalloc
1811         (pfile->comments.allocated * sizeof (cpp_comment));
1812     }
1813
1814   if (pfile->comments.count == pfile->comments.allocated)
1815     {
1816       pfile->comments.allocated *= 2;
1817       pfile->comments.entries = (cpp_comment *) xrealloc
1818         (pfile->comments.entries,
1819          pfile->comments.allocated * sizeof (cpp_comment));
1820     }
1821
1822   len = token->val.str.len;
1823
1824   /* Copy comment. Note, token may not be NULL terminated. */
1825   pfile->comments.entries[pfile->comments.count].comment =
1826     (char *) xmalloc (sizeof (char) * (len + 1));
1827   memcpy (pfile->comments.entries[pfile->comments.count].comment,
1828           token->val.str.text, len);
1829   pfile->comments.entries[pfile->comments.count].comment[len] = '\0';
1830
1831   /* Set source location. */
1832   pfile->comments.entries[pfile->comments.count].sloc = token->src_loc;
1833
1834   /* Increment the count of entries in the comment table. */
1835   pfile->comments.count++;
1836 }
1837
1838 /* The stored comment includes the comment start and any terminator.  */
1839 static void
1840 save_comment (cpp_reader *pfile, cpp_token *token, const unsigned char *from,
1841               cppchar_t type)
1842 {
1843   unsigned char *buffer;
1844   unsigned int len, clen, i;
1845
1846   len = pfile->buffer->cur - from + 1; /* + 1 for the initial '/'.  */
1847
1848   /* C++ comments probably (not definitely) have moved past a new
1849      line, which we don't want to save in the comment.  */
1850   if (is_vspace (pfile->buffer->cur[-1]))
1851     len--;
1852
1853   /* If we are currently in a directive or in argument parsing, then
1854      we need to store all C++ comments as C comments internally, and
1855      so we need to allocate a little extra space in that case.
1856
1857      Note that the only time we encounter a directive here is
1858      when we are saving comments in a "#define".  */
1859   clen = ((pfile->state.in_directive || pfile->state.parsing_args)
1860           && type == '/') ? len + 2 : len;
1861
1862   buffer = _cpp_unaligned_alloc (pfile, clen);
1863
1864   token->type = CPP_COMMENT;
1865   token->val.str.len = clen;
1866   token->val.str.text = buffer;
1867
1868   buffer[0] = '/';
1869   memcpy (buffer + 1, from, len - 1);
1870
1871   /* Finish conversion to a C comment, if necessary.  */
1872   if ((pfile->state.in_directive || pfile->state.parsing_args) && type == '/')
1873     {
1874       buffer[1] = '*';
1875       buffer[clen - 2] = '*';
1876       buffer[clen - 1] = '/';
1877       /* As there can be in a C++ comments illegal sequences for C comments
1878          we need to filter them out.  */
1879       for (i = 2; i < (clen - 2); i++)
1880         if (buffer[i] == '/' && (buffer[i - 1] == '*' || buffer[i + 1] == '*'))
1881           buffer[i] = '|';
1882     }
1883
1884   /* Finally store this comment for use by clients of libcpp. */
1885   store_comment (pfile, token);
1886 }
1887
1888 /* Allocate COUNT tokens for RUN.  */
1889 void
1890 _cpp_init_tokenrun (tokenrun *run, unsigned int count)
1891 {
1892   run->base = XNEWVEC (cpp_token, count);
1893   run->limit = run->base + count;
1894   run->next = NULL;
1895 }
1896
1897 /* Returns the next tokenrun, or creates one if there is none.  */
1898 static tokenrun *
1899 next_tokenrun (tokenrun *run)
1900 {
1901   if (run->next == NULL)
1902     {
1903       run->next = XNEW (tokenrun);
1904       run->next->prev = run;
1905       _cpp_init_tokenrun (run->next, 250);
1906     }
1907
1908   return run->next;
1909 }
1910
1911 /* Return the number of not yet processed token in a given
1912    context.  */
1913 int
1914 _cpp_remaining_tokens_num_in_context (cpp_context *context)
1915 {
1916   if (context->tokens_kind == TOKENS_KIND_DIRECT)
1917     return (LAST (context).token - FIRST (context).token);
1918   else if (context->tokens_kind == TOKENS_KIND_INDIRECT
1919            || context->tokens_kind == TOKENS_KIND_EXTENDED)
1920     return (LAST (context).ptoken - FIRST (context).ptoken);
1921   else
1922       abort ();
1923 }
1924
1925 /* Returns the token present at index INDEX in a given context.  If
1926    INDEX is zero, the next token to be processed is returned.  */
1927 static const cpp_token*
1928 _cpp_token_from_context_at (cpp_context *context, int index)
1929 {
1930   if (context->tokens_kind == TOKENS_KIND_DIRECT)
1931     return &(FIRST (context).token[index]);
1932   else if (context->tokens_kind == TOKENS_KIND_INDIRECT
1933            || context->tokens_kind == TOKENS_KIND_EXTENDED)
1934     return FIRST (context).ptoken[index];
1935  else
1936    abort ();
1937 }
1938
1939 /* Look ahead in the input stream.  */
1940 const cpp_token *
1941 cpp_peek_token (cpp_reader *pfile, int index)
1942 {
1943   cpp_context *context = pfile->context;
1944   const cpp_token *peektok;
1945   int count;
1946
1947   /* First, scan through any pending cpp_context objects.  */
1948   while (context->prev)
1949     {
1950       ptrdiff_t sz = _cpp_remaining_tokens_num_in_context (context);
1951
1952       if (index < (int) sz)
1953         return _cpp_token_from_context_at (context, index);
1954       index -= (int) sz;
1955       context = context->prev;
1956     }
1957
1958   /* We will have to read some new tokens after all (and do so
1959      without invalidating preceding tokens).  */
1960   count = index;
1961   pfile->keep_tokens++;
1962
1963   do
1964     {
1965       peektok = _cpp_lex_token (pfile);
1966       if (peektok->type == CPP_EOF)
1967         return peektok;
1968     }
1969   while (index--);
1970
1971   _cpp_backup_tokens_direct (pfile, count + 1);
1972   pfile->keep_tokens--;
1973
1974   return peektok;
1975 }
1976
1977 /* Allocate a single token that is invalidated at the same time as the
1978    rest of the tokens on the line.  Has its line and col set to the
1979    same as the last lexed token, so that diagnostics appear in the
1980    right place.  Returns a pointer to the new token; only its source
     location is initialized here.  */
1981 cpp_token *
1982 _cpp_temp_token (cpp_reader *pfile)
1983 {
1984   cpp_token *old, *result;
       /* SZ is the number of slots from the current token to the end of
          the current run; LA is the number of pending lookahead tokens.  */
1985   ptrdiff_t sz = pfile->cur_run->limit - pfile->cur_token;
1986   ptrdiff_t la = (ptrdiff_t) pfile->lookaheads;
1987
       /* The token just before the current position supplies the
          source location for the new token.  */
1988   old = pfile->cur_token - 1;
1989   /* Any pre-existing lookaheads must not be clobbered.  */
1990   if (la)
1991     {
           /* The lookaheads reach the end of the current run, so moving
              them up by one slot spills into the next run.  */
1992       if (sz <= la)
1993         {
1994           tokenrun *next = next_tokenrun (pfile->cur_run);
1995
               /* Lookaheads already stored in the next run move up by
                  one slot first...  */
1996           if (sz < la)
1997             memmove (next->base + 1, next->base,
1998                      (la - sz) * sizeof (cpp_token));
1999
               /* ...then the last token of the current run takes the
                  first slot of the next run.  */
2000           next->base[0] = pfile->cur_run->limit[-1];
2001         }
2002
           /* Move the lookaheads that stay inside the current run up by
              one slot, freeing the slot at cur_token.  */
2003       if (sz > 1)
2004         memmove (pfile->cur_token + 1, pfile->cur_token,
2005                  MIN (la, sz - 1) * sizeof (cpp_token));
2006     }
2007
       /* The current run is exhausted: continue at the start of the
          next run.  */
2008   if (!sz && pfile->cur_token == pfile->cur_run->limit)
2009     {
2010       pfile->cur_run = next_tokenrun (pfile->cur_run);
2011       pfile->cur_token = pfile->cur_run->base;
2012     }
2013
2014   result = pfile->cur_token++;
2015   result->src_loc = old->src_loc;
2016   return result;
2017 }
2018
2019 /* Lex the next token and return a pointer to it (external interface).
2020    Takes care of issues like directive handling, token lookahead,
2021    multiple include optimization and skipping.  */
2022 const cpp_token *
2023 _cpp_lex_token (cpp_reader *pfile)
2024 {
2025   cpp_token *result;
2026
2027   for (;;)
2028     {
           /* The current run is full: continue in the next one.  */
2029       if (pfile->cur_token == pfile->cur_run->limit)
2030         {
2031           pfile->cur_run = next_tokenrun (pfile->cur_run);
2032           pfile->cur_token = pfile->cur_run->base;
2033         }
2034       /* We assume that the current token is somewhere in the current
2035          run.  */
2036       if (pfile->cur_token < pfile->cur_run->base
2037           || pfile->cur_token >= pfile->cur_run->limit)
2038         abort ();
2039
           /* Hand out a pending lookahead token again before lexing
              anything new.  */
2040       if (pfile->lookaheads)
2041         {
2042           pfile->lookaheads--;
2043           result = pfile->cur_token++;
2044         }
2045       else
2046         result = _cpp_lex_direct (pfile);
2047
           /* Tokens flagged BOL may start a directive.  */
2048       if (result->flags & BOL)
2049         {
2050           /* Is this a directive.  If _cpp_handle_directive returns
2051              false, it is an assembler #.  */
2052           if (result->type == CPP_HASH
2053               /* 6.10.3 p 11: Directives in a list of macro arguments
2054                  gives undefined behavior.  This implementation
2055                  handles the directive as normal.  */
2056               && pfile->state.parsing_args != 1)
2057             {
2058               if (_cpp_handle_directive (pfile, result->flags & PREV_WHITE))
2059                 {
                       /* A CPP_PADDING directive result is not returned;
                          go round the loop for another token.  */
2060                   if (pfile->directive_result.type == CPP_PADDING)
2061                     continue;
2062                   result = &pfile->directive_result;
2063                 }
2064             }
2065           else if (pfile->state.in_deferred_pragma)
2066             result = &pfile->directive_result;
2067
               /* Report the line change to the client unless the token
                  is being skipped.  */
2068           if (pfile->cb.line_change && !pfile->state.skipping)
2069             pfile->cb.line_change (pfile, result, pfile->state.parsing_args);
2070         }
2071
2072       /* We don't skip tokens in directives.  */
2073       if (pfile->state.in_directive || pfile->state.in_deferred_pragma)
2074         break;
2075
2076       /* Outside a directive, invalidate controlling macros.  At file
2077          EOF, _cpp_lex_direct takes care of popping the buffer, so we never
2078          get here and MI optimization works.  */
2079       pfile->mi_valid = false;
2080
           /* While skipping, only CPP_EOF is returned to the caller;
              every other token is dropped by looping again.  */
2081       if (!pfile->state.skipping || result->type == CPP_EOF)
2082         break;
2083     }
2084
2085   return result;
2086 }
2087
2088 /* Returns true if a fresh line has been loaded.  */
2089 bool
2090 _cpp_get_fresh_line (cpp_reader *pfile)
2091 {
2092   int return_at_eof;
2093
2094   /* We can't get a new line until we leave the current directive.  */
2095   if (pfile->state.in_directive)
2096     return false;
2097
2098   for (;;)
2099     {
2100       cpp_buffer *buffer = pfile->buffer;
2101
2102       if (!buffer->need_line)
2103         return true;
2104
2105       if (buffer->next_line < buffer->rlimit)
2106         {
2107           _cpp_clean_line (pfile);
2108           return true;
2109         }
2110
2111       /* First, get out of parsing arguments state.  */
2112       if (pfile->state.parsing_args)
2113         return false;
2114
2115       /* End of buffer.  Non-empty files should end in a newline.  */
2116       if (buffer->buf != buffer->rlimit
2117           && buffer->next_line > buffer->rlimit
2118           && !buffer->from_stage3)
2119         {
2120           /* Clip to buffer size.  */
2121           buffer->next_line = buffer->rlimit;
2122         }
2123
2124       return_at_eof = buffer->return_at_eof;
2125       _cpp_pop_buffer (pfile);
2126       if (pfile->buffer == NULL || return_at_eof)
2127         return false;
2128     }
2129 }
2130
/* Helper for lexing two-character operators.  If the next unread
   character is CHAR, consume it and set result->type to THEN_TYPE;
   otherwise leave the input alone and set result->type to ELSE_TYPE.
   Relies on variables named `buffer' and `result' being in scope at
   the point of use.  */
#define IF_NEXT_IS(CHAR, THEN_TYPE, ELSE_TYPE)          \
  do                                                    \
    {                                                   \
      if (*buffer->cur == (CHAR))                       \
        {                                               \
          buffer->cur++;                                \
          result->type = (THEN_TYPE);                   \
        }                                               \
      else                                              \
        result->type = (ELSE_TYPE);                     \
    }                                                   \
  while (0)
2139
2140 /* Lex a token into pfile->cur_token, which is also incremented, to
2141    get diagnostics pointing to the correct location.
2142
2143    Does not handle issues such as token lookahead, multiple-include
2144    optimization, directives, skipping etc.  This function is only
2145    suitable for use by _cpp_lex_token, and in special cases like
2146    lex_expansion_token which doesn't care for any of these issues.
2147
2148    When meeting a newline, returns CPP_EOF if parsing a directive,
2149    otherwise returns to the start of the token buffer if permissible.
2150    Returns a pointer to the lexed token.  */
2151 cpp_token *
2152 _cpp_lex_direct (cpp_reader *pfile)
2153 {
2154   cppchar_t c;
2155   cpp_buffer *buffer;
2156   const unsigned char *comment_start;
2157   cpp_token *result = pfile->cur_token++;
2158
       /* Entered initially and again after every newline.  */
2159  fresh_line:
2160   result->flags = 0;
2161   buffer = pfile->buffer;
2162   if (buffer->need_line)
2163     {
           /* The end of the line closes a deferred pragma.  */
2164       if (pfile->state.in_deferred_pragma)
2165         {
2166           result->type = CPP_PRAGMA_EOL;
2167           pfile->state.in_deferred_pragma = false;
2168           if (!pfile->state.pragma_allow_expansion)
2169             pfile->state.prevent_expansion--;
2170           return result;
2171         }
2172       if (!_cpp_get_fresh_line (pfile))
2173         {
2174           result->type = CPP_EOF;
2175           if (!pfile->state.in_directive)
2176             {
2177               /* Tell the compiler the line number of the EOF token.  */
2178               result->src_loc = pfile->line_table->highest_line;
2179               result->flags = BOL;
2180             }
2181           return result;
2182         }
           /* Unless earlier tokens must be kept, start again at the
              first slot of the base run.  */
2183       if (!pfile->keep_tokens)
2184         {
2185           pfile->cur_run = &pfile->base_run;
2186           result = pfile->base_run.base;
2187           pfile->cur_token = result + 1;
2188         }
2189       result->flags = BOL;
2190       if (pfile->state.parsing_args == 2)
2191         result->flags |= PREV_WHITE;
2192     }
       /* _cpp_get_fresh_line may have popped buffers, so reload.  */
2193   buffer = pfile->buffer;
2194  update_tokens_line:
2195   result->src_loc = pfile->line_table->highest_line;
2196
       /* Re-entered after whitespace has been skipped.  */
2197  skipped_white:
2198   if (buffer->cur >= buffer->notes[buffer->cur_note].pos
2199       && !pfile->overlaid_buffer)
2200     {
2201       _cpp_process_line_notes (pfile, false);
2202       result->src_loc = pfile->line_table->highest_line;
2203     }
2204   c = *buffer->cur++;
2205
       /* A forced location, when set, overrides the location computed
          from the buffer column.  */
2206   if (pfile->forced_token_location_p)
2207     result->src_loc = *pfile->forced_token_location_p;
2208   else
2209     result->src_loc = linemap_position_for_column (pfile->line_table,
2210                                           CPP_BUF_COLUMN (buffer, buffer->cur));
2211
       /* Dispatch on the first character of the token.  */
2212   switch (c)
2213     {
2214     case ' ': case '\t': case '\f': case '\v': case '\0':
2215       result->flags |= PREV_WHITE;
2216       skip_whitespace (pfile, c);
2217       goto skipped_white;
2218
2219     case '\n':
           /* The line is only counted when more input follows in the
              buffer.  */
2220       if (buffer->cur < buffer->rlimit)
2221         CPP_INCREMENT_LINE (pfile, 0);
2222       buffer->need_line = true;
2223       goto fresh_line;
2224
2225     case '0': case '1': case '2': case '3': case '4':
2226     case '5': case '6': case '7': case '8': case '9':
2227       {
2228         struct normalize_state nst = INITIAL_NORMALIZE_STATE;
2229         result->type = CPP_NUMBER;
2230         lex_number (pfile, &result->val.str, &nst);
2231         warn_about_normalization (pfile, result, &nst);
2232         break;
2233       }
2234
2235     case 'L':
2236     case 'u':
2237     case 'U':
2238     case 'R':
2239       /* 'L', 'u', 'U', 'u8' or 'R' may introduce wide characters,
2240          wide strings or raw strings.  */
2241       if (c == 'L' || CPP_OPTION (pfile, rliterals)
2242           || (c != 'R' && CPP_OPTION (pfile, uliterals)))
2243         {
               /* Only lex a literal if a quote, or a valid longer
                  prefix ending in a quote, follows; otherwise this is
                  an ordinary identifier.  */
2244           if ((*buffer->cur == '\'' && c != 'R')
2245               || *buffer->cur == '"'
2246               || (*buffer->cur == 'R'
2247                   && c != 'R'
2248                   && buffer->cur[1] == '"'
2249                   && CPP_OPTION (pfile, rliterals))
2250               || (*buffer->cur == '8'
2251                   && c == 'u'
2252                   && (buffer->cur[1] == '"'
2253                       || (buffer->cur[1] == 'R' && buffer->cur[2] == '"'
2254                           && CPP_OPTION (pfile, rliterals)))))
2255             {
2256               lex_string (pfile, result, buffer->cur - 1);
2257               break;
2258             }
2259         }
2260       /* Fall through.  */
2261
2262     case '_':
2263     case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
2264     case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
2265     case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
2266     case 's': case 't':           case 'v': case 'w': case 'x':
2267     case 'y': case 'z':
2268     case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
2269     case 'G': case 'H': case 'I': case 'J': case 'K':
2270     case 'M': case 'N': case 'O': case 'P': case 'Q':
2271     case 'S': case 'T':           case 'V': case 'W': case 'X':
2272     case 'Y': case 'Z':
2273       result->type = CPP_NAME;
2274       {
2275         struct normalize_state nst = INITIAL_NORMALIZE_STATE;
2276         result->val.node.node = lex_identifier (pfile, buffer->cur - 1, false,
2277                                                 &nst);
2278         warn_about_normalization (pfile, result, &nst);
2279       }
2280
2281       /* Convert named operators to their proper types.  */
2282       if (result->val.node.node->flags & NODE_OPERATOR)
2283         {
2284           result->flags |= NAMED_OP;
2285           result->type = (enum cpp_ttype) result->val.node.node->directive_index;
2286         }
2287       break;
2288
2289     case '\'':
2290     case '"':
2291       lex_string (pfile, result, buffer->cur - 1);
2292       break;
2293
2294     case '/':
2295       /* A potential block or line comment.  */
2296       comment_start = buffer->cur;
2297       c = *buffer->cur;
2298
2299       if (c == '*')
2300         {
2301           if (_cpp_skip_block_comment (pfile))
2302             cpp_error (pfile, CPP_DL_ERROR, "unterminated comment");
2303         }
2304       else if (c == '/' && (CPP_OPTION (pfile, cplusplus_comments)
2305                             || cpp_in_system_header (pfile)))
2306         {
2307           /* Warn about comments only if pedantically GNUC89, and not
2308              in system headers.  */
2309           if (CPP_OPTION (pfile, lang) == CLK_GNUC89 && CPP_PEDANTIC (pfile)
2310               && ! buffer->warned_cplusplus_comments)
2311             {
2312               cpp_error (pfile, CPP_DL_PEDWARN,
2313                          "C++ style comments are not allowed in ISO C90");
2314               cpp_error (pfile, CPP_DL_PEDWARN,
2315                          "(this will be reported only once per input file)");
2316               buffer->warned_cplusplus_comments = 1;
2317             }
2318
2319           if (skip_line_comment (pfile) && CPP_OPTION (pfile, warn_comments))
2320             cpp_warning (pfile, CPP_W_COMMENTS, "multi-line comment");
2321         }
2322       else if (c == '=')
2323         {
2324           buffer->cur++;
2325           result->type = CPP_DIV_EQ;
2326           break;
2327         }
2328       else
2329         {
2330           result->type = CPP_DIV;
2331           break;
2332         }
2333
           /* Only the two comment branches reach this point.  A
              comment that is not saved counts as whitespace before
              the next token.  */
2334       if (!pfile->state.save_comments)
2335         {
2336           result->flags |= PREV_WHITE;
2337           goto update_tokens_line;
2338         }
2339
2340       /* Save the comment as a token in its own right.  */
           /* C is '*' or '/' here and tells save_comment which kind of
              comment was lexed.  */
2341       save_comment (pfile, result, comment_start, c);
2342       break;
2343
2344     case '<':
2345       if (pfile->state.angled_headers)
2346         {
               /* lex_string leaves CPP_LESS in RESULT for an
                  unterminated header name; in that case lex the '<'
                  as an ordinary operator below.  */
2347           lex_string (pfile, result, buffer->cur - 1);
2348           if (result->type != CPP_LESS)
2349             break;
2350         }
2351
2352       result->type = CPP_LESS;
2353       if (*buffer->cur == '=')
2354         buffer->cur++, result->type = CPP_LESS_EQ;
2355       else if (*buffer->cur == '<')
2356         {
2357           buffer->cur++;
2358           IF_NEXT_IS ('=', CPP_LSHIFT_EQ, CPP_LSHIFT);
2359         }
2360       else if (CPP_OPTION (pfile, digraphs))
2361         {
2362           if (*buffer->cur == ':')
2363             {
2364               /* C++11 [2.5/3 lex.pptoken], "Otherwise, if the next
2365                  three characters are <:: and the subsequent character
2366                  is neither : nor >, the < is treated as a preprocessor
2367                  token by itself".  */
2368               if (CPP_OPTION (pfile, cplusplus)
2369                   && CPP_OPTION (pfile, lang) != CLK_CXX98
2370                   && CPP_OPTION (pfile, lang) != CLK_GNUCXX
2371                   && buffer->cur[1] == ':'
2372                   && buffer->cur[2] != ':' && buffer->cur[2] != '>')
2373                 break;
2374
2375               buffer->cur++;
2376               result->flags |= DIGRAPH;
2377               result->type = CPP_OPEN_SQUARE;
2378             }
2379           else if (*buffer->cur == '%')
2380             {
2381               buffer->cur++;
2382               result->flags |= DIGRAPH;
2383               result->type = CPP_OPEN_BRACE;
2384             }
2385         }
2386       break;
2387
2388     case '>':
2389       result->type = CPP_GREATER;
2390       if (*buffer->cur == '=')
2391         buffer->cur++, result->type = CPP_GREATER_EQ;
2392       else if (*buffer->cur == '>')
2393         {
2394           buffer->cur++;
2395           IF_NEXT_IS ('=', CPP_RSHIFT_EQ, CPP_RSHIFT);
2396         }
2397       break;
2398
2399     case '%':
2400       result->type = CPP_MOD;
2401       if (*buffer->cur == '=')
2402         buffer->cur++, result->type = CPP_MOD_EQ;
2403       else if (CPP_OPTION (pfile, digraphs))
2404         {
2405           if (*buffer->cur == ':')
2406             {
2407               buffer->cur++;
2408               result->flags |= DIGRAPH;
2409               result->type = CPP_HASH;
                   /* "%:%:" is the digraph spelling of the paste
                      operator.  */
2410               if (*buffer->cur == '%' && buffer->cur[1] == ':')
2411                 buffer->cur += 2, result->type = CPP_PASTE, result->val.token_no = 0;
2412             }
2413           else if (*buffer->cur == '>')
2414             {
2415               buffer->cur++;
2416               result->flags |= DIGRAPH;
2417               result->type = CPP_CLOSE_BRACE;
2418             }
2419         }
2420       break;
2421
2422     case '.':
2423       result->type = CPP_DOT;
           /* A '.' followed by a digit starts a number.  */
2424       if (ISDIGIT (*buffer->cur))
2425         {
2426           struct normalize_state nst = INITIAL_NORMALIZE_STATE;
2427           result->type = CPP_NUMBER;
2428           lex_number (pfile, &result->val.str, &nst);
2429           warn_about_normalization (pfile, result, &nst);
2430         }
2431       else if (*buffer->cur == '.' && buffer->cur[1] == '.')
2432         buffer->cur += 2, result->type = CPP_ELLIPSIS;
2433       else if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
2434         buffer->cur++, result->type = CPP_DOT_STAR;
2435       break;
2436
2437     case '+':
2438       result->type = CPP_PLUS;
2439       if (*buffer->cur == '+')
2440         buffer->cur++, result->type = CPP_PLUS_PLUS;
2441       else if (*buffer->cur == '=')
2442         buffer->cur++, result->type = CPP_PLUS_EQ;
2443       break;
2444
2445     case '-':
2446       result->type = CPP_MINUS;
2447       if (*buffer->cur == '>')
2448         {
2449           buffer->cur++;
2450           result->type = CPP_DEREF;
2451           if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
2452             buffer->cur++, result->type = CPP_DEREF_STAR;
2453         }
2454       else if (*buffer->cur == '-')
2455         buffer->cur++, result->type = CPP_MINUS_MINUS;
2456       else if (*buffer->cur == '=')
2457         buffer->cur++, result->type = CPP_MINUS_EQ;
2458       break;
2459
2460     case '&':
2461       result->type = CPP_AND;
2462       if (*buffer->cur == '&')
2463         buffer->cur++, result->type = CPP_AND_AND;
2464       else if (*buffer->cur == '=')
2465         buffer->cur++, result->type = CPP_AND_EQ;
2466       break;
2467
2468     case '|':
2469       result->type = CPP_OR;
2470       if (*buffer->cur == '|')
2471         buffer->cur++, result->type = CPP_OR_OR;
2472       else if (*buffer->cur == '=')
2473         buffer->cur++, result->type = CPP_OR_EQ;
2474       break;
2475
2476     case ':':
2477       result->type = CPP_COLON;
2478       if (*buffer->cur == ':' && CPP_OPTION (pfile, cplusplus))
2479         buffer->cur++, result->type = CPP_SCOPE;
2480       else if (*buffer->cur == '>' && CPP_OPTION (pfile, digraphs))
2481         {
2482           buffer->cur++;
2483           result->flags |= DIGRAPH;
2484           result->type = CPP_CLOSE_SQUARE;
2485         }
2486       break;
2487
2488     case '*': IF_NEXT_IS ('=', CPP_MULT_EQ, CPP_MULT); break;
2489     case '=': IF_NEXT_IS ('=', CPP_EQ_EQ, CPP_EQ); break;
2490     case '!': IF_NEXT_IS ('=', CPP_NOT_EQ, CPP_NOT); break;
2491     case '^': IF_NEXT_IS ('=', CPP_XOR_EQ, CPP_XOR); break;
2492     case '#': IF_NEXT_IS ('#', CPP_PASTE, CPP_HASH); result->val.token_no = 0; break;
2493
2494     case '?': result->type = CPP_QUERY; break;
2495     case '~': result->type = CPP_COMPL; break;
2496     case ',': result->type = CPP_COMMA; break;
2497     case '(': result->type = CPP_OPEN_PAREN; break;
2498     case ')': result->type = CPP_CLOSE_PAREN; break;
2499     case '[': result->type = CPP_OPEN_SQUARE; break;
2500     case ']': result->type = CPP_CLOSE_SQUARE; break;
2501     case '{': result->type = CPP_OPEN_BRACE; break;
2502     case '}': result->type = CPP_CLOSE_BRACE; break;
2503     case ';': result->type = CPP_SEMICOLON; break;
2504
2505       /* @ is a punctuator in Objective-C.  */
2506     case '@': result->type = CPP_ATSIGN; break;
2507
2508     case '$':
2509     case '\\':
2510       {
             /* Step back onto the character so that forms_identifier_p
                can examine it.  */
2511         const uchar *base = --buffer->cur;
2512         struct normalize_state nst = INITIAL_NORMALIZE_STATE;
2513
2514         if (forms_identifier_p (pfile, true, &nst))
2515           {
2516             result->type = CPP_NAME;
2517             result->val.node.node = lex_identifier (pfile, base, true, &nst);
2518             warn_about_normalization (pfile, result, &nst);
2519             break;
2520           }
             /* Not an identifier: consume the character again.  */
2521         buffer->cur++;
2522       }
           /* Fall through: the lone character becomes a CPP_OTHER
              token.  */
2523
2524     default:
2525       create_literal (pfile, result, buffer->cur - 1, 1, CPP_OTHER);
2526       break;
2527     }
2528
2529   return result;
2530 }
2531
2532 /* An upper bound on the number of bytes needed to spell TOKEN.
2533    Does not include preceding whitespace.  */
2534 unsigned int
2535 cpp_token_len (const cpp_token *token)
2536 {
2537   unsigned int len;
2538
2539   switch (TOKEN_SPELL (token))
2540     {
2541     default:            len = 6;                                break;
2542     case SPELL_LITERAL: len = token->val.str.len;               break;
2543     case SPELL_IDENT:   len = NODE_LEN (token->val.node.node) * 10;     break;
2544     }
2545
2546   return len;
2547 }
2548
/* Decode the UTF-8 sequence that starts at NAME and write it to BUFFER
   as a \U escape followed by eight lower-case hexadecimal digits.
   Returns the number of bytes read out of NAME.  (There are always
   10 bytes written to BUFFER, and no NUL terminator.)  Aborts if a
   continuation byte is ill-formed.  */

static size_t
utf8_to_ucn (unsigned char *buffer, const unsigned char *name)
{
  static const char hex_digits[] = "0123456789abcdef";
  const unsigned char *p = name;
  unsigned long code_point;
  unsigned lead;
  int seq_len = 0;
  int remaining;
  int shift;

  /* The number of leading one bits in the first byte gives the length
     of the UTF-8 sequence.  */
  for (lead = *p; (lead & 0x80) != 0; lead <<= 1)
    seq_len++;

  /* The payload of the first byte is whatever follows the length
     marker.  */
  code_point = *p & (0x7F >> seq_len);

  /* Each continuation byte contributes six more bits.  */
  for (remaining = seq_len - 1; remaining > 0; remaining--)
    {
      unsigned char trail = *++p;

      /* Ill-formed UTF-8.  */
      if ((trail & ~0x3F) != 0x80)
        abort ();

      code_point = (code_point << 6) | (trail & 0x3F);
    }

  *buffer++ = '\\';
  *buffer++ = 'U';
  for (shift = 28; shift >= 0; shift -= 4)
    *buffer++ = hex_digits[(code_point >> shift) & 0xF];

  return seq_len;
}
2582
2583 /* Given a token TYPE corresponding to a digraph, return a pointer to
2584    the spelling of the digraph.  */
2585 static const unsigned char *
2586 cpp_digraph2name (enum cpp_ttype type)
2587 {
2588   return digraph_spellings[(int) type - (int) CPP_FIRST_DIGRAPH];
2589 }
2590
2591 /* Write the spelling of a token TOKEN to BUFFER.  The buffer must
2592    already contain the enough space to hold the token's spelling.
2593    Returns a pointer to the character after the last character written.
2594    FORSTRING is true if this is to be the spelling after translation
2595    phase 1 (this is different for UCNs).
2596    FIXME: Would be nice if we didn't need the PFILE argument.  */
2597 unsigned char *
2598 cpp_spell_token (cpp_reader *pfile, const cpp_token *token,
2599                  unsigned char *buffer, bool forstring)
2600 {
2601   switch (TOKEN_SPELL (token))
2602     {
2603     case SPELL_OPERATOR:
2604       {
2605         const unsigned char *spelling;
2606         unsigned char c;
2607
2608         if (token->flags & DIGRAPH)
2609           spelling = cpp_digraph2name (token->type);
2610         else if (token->flags & NAMED_OP)
2611           goto spell_ident;
2612         else
2613           spelling = TOKEN_NAME (token);
2614
2615         while ((c = *spelling++) != '\0')
2616           *buffer++ = c;
2617       }
2618       break;
2619
2620     spell_ident:
2621     case SPELL_IDENT:
2622       if (forstring)
2623         {
2624           memcpy (buffer, NODE_NAME (token->val.node.node),
2625                   NODE_LEN (token->val.node.node));
2626           buffer += NODE_LEN (token->val.node.node);
2627         }
2628       else
2629         {
2630           size_t i;
2631           const unsigned char * name = NODE_NAME (token->val.node.node);
2632
2633           for (i = 0; i < NODE_LEN (token->val.node.node); i++)
2634             if (name[i] & ~0x7F)
2635               {
2636                 i += utf8_to_ucn (buffer, name + i) - 1;
2637                 buffer += 10;
2638               }
2639             else
2640               *buffer++ = NODE_NAME (token->val.node.node)[i];
2641         }
2642       break;
2643
2644     case SPELL_LITERAL:
2645       memcpy (buffer, token->val.str.text, token->val.str.len);
2646       buffer += token->val.str.len;
2647       break;
2648
2649     case SPELL_NONE:
2650       cpp_error (pfile, CPP_DL_ICE,
2651                  "unspellable token %s", TOKEN_NAME (token));
2652       break;
2653     }
2654
2655   return buffer;
2656 }
2657
2658 /* Returns TOKEN spelt as a null-terminated string.  The string is
2659    freed when the reader is destroyed.  Useful for diagnostics.  */
2660 unsigned char *
2661 cpp_token_as_text (cpp_reader *pfile, const cpp_token *token)
2662 {
2663   unsigned int len = cpp_token_len (token) + 1;
2664   unsigned char *start = _cpp_unaligned_alloc (pfile, len), *end;
2665
2666   end = cpp_spell_token (pfile, token, start, false);
2667   end[0] = '\0';
2668
2669   return start;
2670 }
2671
2672 /* Returns a pointer to a string which spells the token defined by
2673    TYPE and FLAGS.  Used by C front ends, which really should move to
2674    using cpp_token_as_text.  */
2675 const char *
2676 cpp_type2name (enum cpp_ttype type, unsigned char flags)
2677 {
2678   if (flags & DIGRAPH)
2679     return (const char *) cpp_digraph2name (type);
2680   else if (flags & NAMED_OP)
2681     return cpp_named_operator2name (type);
2682
2683   return (const char *) token_spellings[type].name;
2684 }
2685
2686 /* Writes the spelling of token to FP, without any preceding space.
2687    Separated from cpp_spell_token for efficiency - to avoid stdio
2688    double-buffering.  */
2689 void
2690 cpp_output_token (const cpp_token *token, FILE *fp)
2691 {
2692   switch (TOKEN_SPELL (token))
2693     {
2694     case SPELL_OPERATOR:
2695       {
2696         const unsigned char *spelling;
2697         int c;
2698
2699         if (token->flags & DIGRAPH)
2700           spelling = cpp_digraph2name (token->type);
2701         else if (token->flags & NAMED_OP)
2702           goto spell_ident;
2703         else
2704           spelling = TOKEN_NAME (token);
2705
2706         c = *spelling;
2707         do
2708           putc (c, fp);
2709         while ((c = *++spelling) != '\0');
2710       }
2711       break;
2712
2713     spell_ident:
2714     case SPELL_IDENT:
2715       {
2716         size_t i;
2717         const unsigned char * name = NODE_NAME (token->val.node.node);
2718
2719         for (i = 0; i < NODE_LEN (token->val.node.node); i++)
2720           if (name[i] & ~0x7F)
2721             {
2722               unsigned char buffer[10];
2723               i += utf8_to_ucn (buffer, name + i) - 1;
2724               fwrite (buffer, 1, 10, fp);
2725             }
2726           else
2727             fputc (NODE_NAME (token->val.node.node)[i], fp);
2728       }
2729       break;
2730
2731     case SPELL_LITERAL:
2732       fwrite (token->val.str.text, 1, token->val.str.len, fp);
2733       break;
2734
2735     case SPELL_NONE:
2736       /* An error, most probably.  */
2737       break;
2738     }
2739 }
2740
2741 /* Compare two tokens.  */
2742 int
2743 _cpp_equiv_tokens (const cpp_token *a, const cpp_token *b)
2744 {
2745   if (a->type == b->type && a->flags == b->flags)
2746     switch (TOKEN_SPELL (a))
2747       {
2748       default:                  /* Keep compiler happy.  */
2749       case SPELL_OPERATOR:
2750         /* token_no is used to track where multiple consecutive ##
2751            tokens were originally located.  */
2752         return (a->type != CPP_PASTE || a->val.token_no == b->val.token_no);
2753       case SPELL_NONE:
2754         return (a->type != CPP_MACRO_ARG
2755                 || a->val.macro_arg.arg_no == b->val.macro_arg.arg_no);
2756       case SPELL_IDENT:
2757         return a->val.node.node == b->val.node.node;
2758       case SPELL_LITERAL:
2759         return (a->val.str.len == b->val.str.len
2760                 && !memcmp (a->val.str.text, b->val.str.text,
2761                             a->val.str.len));
2762       }
2763
2764   return 0;
2765 }
2766
2767 /* Returns nonzero if a space should be inserted to avoid an
2768    accidental token paste for output.  For simplicity, it is
2769    conservative, and occasionally advises a space where one is not
2770    needed, e.g. "." and ".2".  */
2771 int
2772 cpp_avoid_paste (cpp_reader *pfile, const cpp_token *token1,
2773                  const cpp_token *token2)
2774 {
2775   enum cpp_ttype a = token1->type, b = token2->type;
2776   cppchar_t c;
2777
2778   if (token1->flags & NAMED_OP)
2779     a = CPP_NAME;
2780   if (token2->flags & NAMED_OP)
2781     b = CPP_NAME;
2782
2783   c = EOF;
2784   if (token2->flags & DIGRAPH)
2785     c = digraph_spellings[(int) b - (int) CPP_FIRST_DIGRAPH][0];
2786   else if (token_spellings[b].category == SPELL_OPERATOR)
2787     c = token_spellings[b].name[0];
2788
2789   /* Quickly get everything that can paste with an '='.  */
2790   if ((int) a <= (int) CPP_LAST_EQ && c == '=')
2791     return 1;
2792
2793   switch (a)
2794     {
2795     case CPP_GREATER:   return c == '>';
2796     case CPP_LESS:      return c == '<' || c == '%' || c == ':';
2797     case CPP_PLUS:      return c == '+';
2798     case CPP_MINUS:     return c == '-' || c == '>';
2799     case CPP_DIV:       return c == '/' || c == '*'; /* Comments.  */
2800     case CPP_MOD:       return c == ':' || c == '>';
2801     case CPP_AND:       return c == '&';
2802     case CPP_OR:        return c == '|';
2803     case CPP_COLON:     return c == ':' || c == '>';
2804     case CPP_DEREF:     return c == '*';
2805     case CPP_DOT:       return c == '.' || c == '%' || b == CPP_NUMBER;
2806     case CPP_HASH:      return c == '#' || c == '%'; /* Digraph form.  */
2807     case CPP_NAME:      return ((b == CPP_NUMBER
2808                                  && name_p (pfile, &token2->val.str))
2809                                 || b == CPP_NAME
2810                                 || b == CPP_CHAR || b == CPP_STRING); /* L */
2811     case CPP_NUMBER:    return (b == CPP_NUMBER || b == CPP_NAME
2812                                 || c == '.' || c == '+' || c == '-');
2813                                       /* UCNs */
2814     case CPP_OTHER:     return ((token1->val.str.text[0] == '\\'
2815                                  && b == CPP_NAME)
2816                                 || (CPP_OPTION (pfile, objc)
2817                                     && token1->val.str.text[0] == '@'
2818                                     && (b == CPP_NAME || b == CPP_STRING)));
2819     case CPP_STRING:
2820     case CPP_WSTRING:
2821     case CPP_UTF8STRING:
2822     case CPP_STRING16:
2823     case CPP_STRING32:  return (CPP_OPTION (pfile, user_literals)
2824                                 && (b == CPP_NAME
2825                                     || (TOKEN_SPELL (token2) == SPELL_LITERAL
2826                                         && ISIDST (token2->val.str.text[0]))));
2827
2828     default:            break;
2829     }
2830
2831   return 0;
2832 }
2833
2834 /* Output all the remaining tokens on the current line, and a newline
2835    character, to FP.  Leading whitespace is removed.  If there are
2836    macros, special token padding is not performed.  */
2837 void
2838 cpp_output_line (cpp_reader *pfile, FILE *fp)
2839 {
2840   const cpp_token *token;
2841
2842   token = cpp_get_token (pfile);
2843   while (token->type != CPP_EOF)
2844     {
2845       cpp_output_token (token, fp);
2846       token = cpp_get_token (pfile);
2847       if (token->flags & PREV_WHITE)
2848         putc (' ', fp);
2849     }
2850
2851   putc ('\n', fp);
2852 }
2853
2854 /* Return a string representation of all the remaining tokens on the
2855    current line.  The result is allocated using xmalloc and must be
2856    freed by the caller.  */
2857 unsigned char *
2858 cpp_output_line_to_string (cpp_reader *pfile, const unsigned char *dir_name)
2859 {
2860   const cpp_token *token;
2861   unsigned int out = dir_name ? ustrlen (dir_name) : 0;
2862   unsigned int alloced = 120 + out;
2863   unsigned char *result = (unsigned char *) xmalloc (alloced);
2864
2865   /* If DIR_NAME is empty, there are no initial contents.  */
2866   if (dir_name)
2867     {
2868       sprintf ((char *) result, "#%s ", dir_name);
2869       out += 2;
2870     }
2871
2872   token = cpp_get_token (pfile);
2873   while (token->type != CPP_EOF)
2874     {
2875       unsigned char *last;
2876       /* Include room for a possible space and the terminating nul.  */
2877       unsigned int len = cpp_token_len (token) + 2;
2878
2879       if (out + len > alloced)
2880         {
2881           alloced *= 2;
2882           if (out + len > alloced)
2883             alloced = out + len;
2884           result = (unsigned char *) xrealloc (result, alloced);
2885         }
2886
2887       last = cpp_spell_token (pfile, token, &result[out], 0);
2888       out = last - result;
2889
2890       token = cpp_get_token (pfile);
2891       if (token->flags & PREV_WHITE)
2892         result[out++] = ' ';
2893     }
2894
2895   result[out] = '\0';
2896   return result;
2897 }
2898
/* Memory buffers.  Changing these three definitions can have a
   dramatic effect on performance.  The values here are reasonable
   defaults, but might be tuned.  If you adjust them, be sure to test
   across a range of uses of cpplib, including heavy nested
   function-like macro expansion.  Also check the change in peak
   memory usage (NJAMD is a good tool for this).

   MIN_BUFF_SIZE is the smallest size a newly created buffer is given.
   BUFF_SIZE_UPPER_BOUND (MIN_SIZE) is the largest size a free buffer
   may have and still be handed out for a request of MIN_SIZE bytes.
   EXTENDED_BUFF_SIZE (BUFF, MIN_EXTRA) is the size requested when
   extending BUFF: MIN_EXTRA plus twice the distance from BUFF->cur
   to BUFF->limit.  Every macro argument is parenthesized in the
   expansion so that any expression may be passed.  */
#define MIN_BUFF_SIZE 8000
#define BUFF_SIZE_UPPER_BOUND(MIN_SIZE) (MIN_BUFF_SIZE + (MIN_SIZE) * 3 / 2)
#define EXTENDED_BUFF_SIZE(BUFF, MIN_EXTRA) \
        ((MIN_EXTRA) + ((BUFF)->limit - (BUFF)->cur) * 2)

#if MIN_BUFF_SIZE > BUFF_SIZE_UPPER_BOUND (0)
  #error BUFF_SIZE_UPPER_BOUND must be at least as large as MIN_BUFF_SIZE!
#endif
2913
2914 /* Create a new allocation buffer.  Place the control block at the end
2915    of the buffer, so that buffer overflows will cause immediate chaos.  */
2916 static _cpp_buff *
2917 new_buff (size_t len)
2918 {
2919   _cpp_buff *result;
2920   unsigned char *base;
2921
2922   if (len < MIN_BUFF_SIZE)
2923     len = MIN_BUFF_SIZE;
2924   len = CPP_ALIGN (len);
2925
2926 #ifdef ENABLE_VALGRIND_CHECKING
2927   /* Valgrind warns about uses of interior pointers, so put _cpp_buff
2928      struct first.  */
2929   size_t slen = CPP_ALIGN2 (sizeof (_cpp_buff), 2 * DEFAULT_ALIGNMENT);
2930   base = XNEWVEC (unsigned char, len + slen);
2931   result = (_cpp_buff *) base;
2932   base += slen;
2933 #else
2934   base = XNEWVEC (unsigned char, len + sizeof (_cpp_buff));
2935   result = (_cpp_buff *) (base + len);
2936 #endif
2937   result->base = base;
2938   result->cur = base;
2939   result->limit = base + len;
2940   result->next = NULL;
2941   return result;
2942 }
2943
2944 /* Place a chain of unwanted allocation buffers on the free list.  */
2945 void
2946 _cpp_release_buff (cpp_reader *pfile, _cpp_buff *buff)
2947 {
2948   _cpp_buff *end = buff;
2949
2950   while (end->next)
2951     end = end->next;
2952   end->next = pfile->free_buffs;
2953   pfile->free_buffs = buff;
2954 }
2955
2956 /* Return a free buffer of size at least MIN_SIZE.  */
2957 _cpp_buff *
2958 _cpp_get_buff (cpp_reader *pfile, size_t min_size)
2959 {
2960   _cpp_buff *result, **p;
2961
2962   for (p = &pfile->free_buffs;; p = &(*p)->next)
2963     {
2964       size_t size;
2965
2966       if (*p == NULL)
2967         return new_buff (min_size);
2968       result = *p;
2969       size = result->limit - result->base;
2970       /* Return a buffer that's big enough, but don't waste one that's
2971          way too big.  */
2972       if (size >= min_size && size <= BUFF_SIZE_UPPER_BOUND (min_size))
2973         break;
2974     }
2975
2976   *p = result->next;
2977   result->next = NULL;
2978   result->cur = result->base;
2979   return result;
2980 }
2981
2982 /* Creates a new buffer with enough space to hold the uncommitted
2983    remaining bytes of BUFF, and at least MIN_EXTRA more bytes.  Copies
2984    the excess bytes to the new buffer.  Chains the new buffer after
2985    BUFF, and returns the new buffer.  */
2986 _cpp_buff *
2987 _cpp_append_extend_buff (cpp_reader *pfile, _cpp_buff *buff, size_t min_extra)
2988 {
2989   size_t size = EXTENDED_BUFF_SIZE (buff, min_extra);
2990   _cpp_buff *new_buff = _cpp_get_buff (pfile, size);
2991
2992   buff->next = new_buff;
2993   memcpy (new_buff->base, buff->cur, BUFF_ROOM (buff));
2994   return new_buff;
2995 }
2996
2997 /* Creates a new buffer with enough space to hold the uncommitted
2998    remaining bytes of the buffer pointed to by BUFF, and at least
2999    MIN_EXTRA more bytes.  Copies the excess bytes to the new buffer.
3000    Chains the new buffer before the buffer pointed to by BUFF, and
3001    updates the pointer to point to the new buffer.  */
3002 void
3003 _cpp_extend_buff (cpp_reader *pfile, _cpp_buff **pbuff, size_t min_extra)
3004 {
3005   _cpp_buff *new_buff, *old_buff = *pbuff;
3006   size_t size = EXTENDED_BUFF_SIZE (old_buff, min_extra);
3007
3008   new_buff = _cpp_get_buff (pfile, size);
3009   memcpy (new_buff->base, old_buff->cur, BUFF_ROOM (old_buff));
3010   new_buff->next = old_buff;
3011   *pbuff = new_buff;
3012 }
3013
3014 /* Free a chain of buffers starting at BUFF.  */
3015 void
3016 _cpp_free_buff (_cpp_buff *buff)
3017 {
3018   _cpp_buff *next;
3019
3020   for (; buff; buff = next)
3021     {
3022       next = buff->next;
3023 #ifdef ENABLE_VALGRIND_CHECKING
3024       free (buff);
3025 #else
3026       free (buff->base);
3027 #endif
3028     }
3029 }
3030
3031 /* Allocate permanent, unaligned storage of length LEN.  */
3032 unsigned char *
3033 _cpp_unaligned_alloc (cpp_reader *pfile, size_t len)
3034 {
3035   _cpp_buff *buff = pfile->u_buff;
3036   unsigned char *result = buff->cur;
3037
3038   if (len > (size_t) (buff->limit - result))
3039     {
3040       buff = _cpp_get_buff (pfile, len);
3041       buff->next = pfile->u_buff;
3042       pfile->u_buff = buff;
3043       result = buff->cur;
3044     }
3045
3046   buff->cur = result + len;
3047   return result;
3048 }
3049
3050 /* Allocate permanent, unaligned storage of length LEN from a_buff.
3051    That buffer is used for growing allocations when saving macro
3052    replacement lists in a #define, and when parsing an answer to an
3053    assertion in #assert, #unassert or #if (and therefore possibly
3054    whilst expanding macros).  It therefore must not be used by any
3055    code that they might call: specifically the lexer and the guts of
3056    the macro expander.
3057
3058    All existing other uses clearly fit this restriction: storing
3059    registered pragmas during initialization.  */
3060 unsigned char *
3061 _cpp_aligned_alloc (cpp_reader *pfile, size_t len)
3062 {
3063   _cpp_buff *buff = pfile->a_buff;
3064   unsigned char *result = buff->cur;
3065
3066   if (len > (size_t) (buff->limit - result))
3067     {
3068       buff = _cpp_get_buff (pfile, len);
3069       buff->next = pfile->a_buff;
3070       pfile->a_buff = buff;
3071       result = buff->cur;
3072     }
3073
3074   buff->cur = result + len;
3075   return result;
3076 }
3077
3078 /* Say which field of TOK is in use.  */
3079
3080 enum cpp_token_fld_kind
3081 cpp_token_val_index (const cpp_token *tok)
3082 {
3083   switch (TOKEN_SPELL (tok))
3084     {
3085     case SPELL_IDENT:
3086       return CPP_TOKEN_FLD_NODE;
3087     case SPELL_LITERAL:
3088       return CPP_TOKEN_FLD_STR;
3089     case SPELL_OPERATOR:
3090       if (tok->type == CPP_PASTE)
3091         return CPP_TOKEN_FLD_TOKEN_NO;
3092       else
3093         return CPP_TOKEN_FLD_NONE;
3094     case SPELL_NONE:
3095       if (tok->type == CPP_MACRO_ARG)
3096         return CPP_TOKEN_FLD_ARG_NO;
3097       else if (tok->type == CPP_PADDING)
3098         return CPP_TOKEN_FLD_SOURCE;
3099       else if (tok->type == CPP_PRAGMA)
3100         return CPP_TOKEN_FLD_PRAGMA;
3101       /* else fall through */
3102     default:
3103       return CPP_TOKEN_FLD_NONE;
3104     }
3105 }
3106
3107 /* All tokens lexed in R after calling this function will be forced to have
3108    their source_location the same as the location referenced by P, until
3109    cpp_stop_forcing_token_locations is called for R.  */
3110
3111 void
3112 cpp_force_token_locations (cpp_reader *r, source_location *p)
3113 {
3114   r->forced_token_location_p = p;
3115 }
3116
3117 /* Go back to assigning locations naturally for lexed tokens.  */
3118
3119 void
3120 cpp_stop_forcing_token_locations (cpp_reader *r)
3121 {
3122   r->forced_token_location_p = NULL;
3123 }