libcpp/lex.c

   1 /* CPP Library - lexical analysis.
   2    Copyright (C) 2000-2013 Free Software Foundation, Inc.
   3    Contributed by Per Bothner, 1994-95.
   4    Based on CCCP program by Paul Rubin, June 1986
   5    Adapted to ANSI C, Richard Stallman, Jan 1987
   6    Broken out to separate file, Zack Weinberg, Mar 2000
   7
   8 This program is free software; you can redistribute it and/or modify it
   9 under the terms of the GNU General Public License as published by the
  10 Free Software Foundation; either version 3, or (at your option) any
  11 later version.
  12
  13 This program is distributed in the hope that it will be useful,
  14 but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16 GNU General Public License for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with this program; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "cpplib.h"
  25 #include "internal.h"
  26
   27 enum spell_type
   28 {
   29   SPELL_OPERATOR = 0,  /* Category given to every OP() entry of token_spellings.  */
   30   SPELL_IDENT,         /* This and the following categories are selected by the  */
   31   SPELL_LITERAL,       /* TK() entries through token pasting (SPELL_ ## s).  */
   32   SPELL_NONE
   33 };
  34
   35 struct token_spelling
   36 {
   37   enum spell_type category;     /* Spelling category of the token type.  */
   38   const unsigned char *name;    /* OP(): the spelling string; TK(): the stringified enumerator.  */
   39 };
  40
   41 static const unsigned char *const digraph_spellings[] =  /* The six C digraph spellings.  */
   42 { UC"%:", UC"%:%:", UC"<:", UC":>", UC"<%", UC"%>" };  /* Alternative forms of # ## [ ] { }; indexed by code outside this chunk.  */
  43
   44 #define OP(e, s) { SPELL_OPERATOR, UC s  },  /* Operator entry: spelled by the string S.  */
   45 #define TK(e, s) { SPELL_ ## s,    UC #e },  /* Other entry: category SPELL_<s>, name is the text of E.  */
      /* One entry per token type, generated by expanding TTYPE_TABLE with
         the two temporary macros above; indexed by token type (see
         TOKEN_SPELL and TOKEN_NAME).  */
   46 static const struct token_spelling token_spellings[N_TTYPES] = { TTYPE_TABLE };
   47 #undef OP
   48 #undef TK
  49
   50 #define TOKEN_SPELL(token) (token_spellings[(token)->type].category)  /* Spelling category of TOKEN.  */
   51 #define TOKEN_NAME(token) (token_spellings[(token)->type].name)  /* Table name string for TOKEN's type.  */
  52
   53 static void add_line_note (cpp_buffer *, const uchar *, unsigned int);  /* (buffer, pos, type)  */
   54 static int skip_line_comment (cpp_reader *);
   55 static void skip_whitespace (cpp_reader *, cppchar_t);
   56 static void lex_string (cpp_reader *, cpp_token *, const uchar *);
   57 static void save_comment (cpp_reader *, cpp_token *, const uchar *, cppchar_t);
   58 static void store_comment (cpp_reader *, cpp_token *);
   59 static void create_literal (cpp_reader *, cpp_token *, const uchar *,
   60                             unsigned int, enum cpp_ttype);
   61 static bool warn_in_comment (cpp_reader *, _cpp_line_note *);  /* (pfile, note)  */
   62 static int name_p (cpp_reader *, const cpp_string *);
   63 static tokenrun *next_tokenrun (tokenrun *);
   64
      /* Forward declarations of file-local helpers.  Of these, only
         add_line_note and warn_in_comment are defined in this part of
         the file.  */
   65 static _cpp_buff *new_buff (size_t);
  66
  67
  68 /* Utility routine:
  69
  70    Compares, the token TOKEN to the NUL-terminated string STRING.
  71    TOKEN must be a CPP_NAME.  Returns 1 for equal, 0 for unequal.  */
  72 int
  73 cpp_ideq (const cpp_token *token, const char *string)
  74 {
  75   if (token->type != CPP_NAME)
  76     return 0;
  77
  78   return !ustrcmp (NODE_NAME (token->val.node.node), (const uchar *) string);
  79 }
  80
  81 /* Record a note TYPE at byte POS into the current cleaned logical
  82    line.  */
  83 static void
  84 add_line_note (cpp_buffer *buffer, const uchar *pos, unsigned int type)
  85 {
  86   if (buffer->notes_used == buffer->notes_cap)
  87     {
  88       buffer->notes_cap = buffer->notes_cap * 2 + 200;
  89       buffer->notes = XRESIZEVEC (_cpp_line_note, buffer->notes,
  90                                   buffer->notes_cap);
  91     }
  92
  93   buffer->notes[buffer->notes_used].pos = pos;
  94   buffer->notes[buffer->notes_used].type = type;
  95   buffer->notes_used++;
  96 }
  97
  98 \f
  99 /* Fast path to find line special characters using optimized character
 100    scanning algorithms.  Anything complicated falls back to the slow
 101    path below.  Since this loop is very hot it's worth doing these kinds
 102    of optimizations.
 103
 104    One of the paths through the ifdefs should provide
 105
 106      const uchar *search_line_fast (const uchar *s, const uchar *end);
 107
 108    Between S and END, search for \n, \r, \\, ?.  Return a pointer to
 109    the found character.
 110
 111    Note that the last character of the buffer is *always* a newline,
 112    as forced by _cpp_convert_input.  This fact can be used to avoid
 113    explicitly looking for the end of the buffer.  */
 114
 115 /* Configure gives us an ifdef test.  Turn it into a macro that always
        has a value, so that it can be tested in ordinary `if' conditions
        as well as in `#if' directives.  */
 116 #ifndef WORDS_BIGENDIAN
 117 #define WORDS_BIGENDIAN 0
 118 #endif
 119
 120 /* We'd like the largest integer that fits into a register.  There's nothing
 121    in <stdint.h> that gives us that.  For most hosts this is unsigned long,
 122    but MS decided on an LLP64 model.  Thankfully when building with GCC we
 123    can get the "real" word size.  */
 124 #ifdef __GNUC__
 125 typedef unsigned int word_type __attribute__((__mode__(__word__)));
 126 #else
 127 typedef unsigned long word_type;
 128 #endif
 129
 130 /* The code below is only expecting sizes 4 or 8.
 131    Die at compile-time if this expectation is violated.  The array
        bound evaluates to 1 when the size is acceptable and to -1, an
        invalid array size, when it is not.  */
 132 typedef char check_word_type_size
 133   [(sizeof(word_type) == 8 || sizeof(word_type) == 4) * 2 - 1];
 134
 135 /* Return X with the first N bytes forced to values that won't match one
 136    of the interesting characters.  Note that NUL is not interesting.  */
 137
 138 static inline word_type
 139 acc_char_mask_misalign (word_type val, unsigned int n)
 140 {
 141   word_type mask = -1;
 142   if (WORDS_BIGENDIAN)
 143     mask >>= n * 8;
 144   else
 145     mask <<= n * 8;
 146   return val & mask;
 147 }
 148
 149 /* Return X replicated to all byte positions within WORD_TYPE.  */
 150
 151 static inline word_type
 152 acc_char_replicate (uchar x)
 153 {
 154   word_type ret;
 155
 156   ret = (x << 24) | (x << 16) | (x << 8) | x;
 157   if (sizeof(word_type) == 8)
 158     ret = (ret << 16 << 16) | ret;
 159   return ret;
 160 }
 161
 162 /* Return non-zero if some byte of VAL is (probably) C.  */
 163
 164 static inline word_type
 165 acc_char_cmp (word_type val, word_type c)
 166 {
 167 #if defined(__GNUC__) && defined(__alpha__)
 168   /* We can get exact results using a compare-bytes instruction.
 169      Get (val == c) via (0 >= (val ^ c)).  */
 170   return __builtin_alpha_cmpbge (0, val ^ c);
 171 #else
 172   word_type magic = 0x7efefefeU;
 173   if (sizeof(word_type) == 8)
 174     magic = (magic << 16 << 16) | 0xfefefefeU;
 175   magic |= 1;
 176
 177   val ^= c;
 178   return ((val + magic) ^ ~val) & ~magic;
 179 #endif
 180 }
 181
 182 /* Given the result of acc_char_cmp is non-zero, return the index of
 183    the found character.  If this was a false positive, return -1.  */
 184
 185 static inline int
 186 acc_char_index (word_type cmp ATTRIBUTE_UNUSED,
 187                 word_type val ATTRIBUTE_UNUSED)
 188 {
 189 #if defined(__GNUC__) && defined(__alpha__) && !WORDS_BIGENDIAN
 190   /* The cmpbge instruction sets *bits* of the result corresponding to
 191      matches in the bytes with no false positives.  */
 192   return __builtin_ctzl (cmp);
 193 #else
 194   unsigned int i;
 195
 196   /* ??? It would be nice to force unrolling here,
 197      and have all of these constants folded.  */
 198   for (i = 0; i < sizeof(word_type); ++i)
 199     {
 200       uchar c;
 201       if (WORDS_BIGENDIAN)
 202         c = (val >> (sizeof(word_type) - i - 1) * 8) & 0xff;
 203       else
 204         c = (val >> i * 8) & 0xff;
 205
 206       if (c == '\n' || c == '\r' || c == '\\' || c == '?')
 207         return i;
 208     }
 209
 210   return -1;
 211 #endif
 212 }
 213
 214 /* A version of the fast scanner using bit fiddling techniques.
 215
 216    For 32-bit words, one would normally perform 16 comparisons and
 217    16 branches.  With this algorithm one performs 24 arithmetic
 218    operations and one branch.  Whether this is faster with a 32-bit
 219    word size is going to be somewhat system dependent.
 220
 221    For 64-bit words, we eliminate twice the number of comparisons
 222    and branches without increasing the number of arithmetic operations.
 223    It's almost certainly going to be a win with 64-bit word size.  */
 224
 225 static const uchar * search_line_acc_char (const uchar *, const uchar *)
 226   ATTRIBUTE_UNUSED;
 227
 228 static const uchar *
 229 search_line_acc_char (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 230 {
 231   const word_type repl_nl = acc_char_replicate ('\n');
 232   const word_type repl_cr = acc_char_replicate ('\r');
 233   const word_type repl_bs = acc_char_replicate ('\\');
 234   const word_type repl_qm = acc_char_replicate ('?');
 235
 236   unsigned int misalign;
 237   const word_type *p;
 238   word_type val, t;
 239
 240   /* Align the buffer.  Mask out any bytes from before the beginning.  */
 241   p = (word_type *)((uintptr_t)s & -sizeof(word_type));
 242   val = *p;
 243   misalign = (uintptr_t)s & (sizeof(word_type) - 1);
 244   if (misalign)
 245     val = acc_char_mask_misalign (val, misalign);
 246
 247   /* Main loop.  */
 248   while (1)
 249     {
 250       t  = acc_char_cmp (val, repl_nl);
 251       t |= acc_char_cmp (val, repl_cr);
 252       t |= acc_char_cmp (val, repl_bs);
 253       t |= acc_char_cmp (val, repl_qm);
 254
 255       if (__builtin_expect (t != 0, 0))
 256         {
 257           int i = acc_char_index (t, val);
 258           if (i >= 0)
 259             return (const uchar *)p + i;
 260         }
 261
 262       val = *++p;
 263     }
 264 }
 265
 266 /* Disable on Solaris 2/x86 until the following problems can be properly
 267    autoconfed:
 268
 269    The Solaris 9 assembler cannot assemble SSE4.2 insns.
 270    Before Solaris 9 Update 6, SSE insns cannot be executed.
 271    The Solaris 10+ assembler tags objects with the instruction set
 272    extensions used, so SSE4.2 executables cannot run on machines that
 273    don't support that extension.  */
 274
 275 #if (GCC_VERSION >= 4005) && (defined(__i386__) || defined(__x86_64__)) && !(defined(__sun__) && defined(__svr4__))
 276
 277 /* Replicated character data to be shared between implementations.
 278    Recall that outside of a context with vector support we can't
 279    define compatible vector types, therefore these are all defined
 280    in terms of raw characters.  Each row holds 16 copies of one
        character; the rows are, in order, '\n', '\r', '\\' and '?'.  */
 281 static const char repl_chars[4][16] __attribute__((aligned(16))) = {
 282   { '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
 283     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n' },
 284   { '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
 285     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r' },
 286   { '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
 287     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\' },
 288   { '?', '?', '?', '?', '?', '?', '?', '?',
 289     '?', '?', '?', '?', '?', '?', '?', '?' },
 290 };
 291
 292 /* A version of the fast scanner using MMX vectorized byte compare insns.
 293
 294    This uses the PMOVMSKB instruction which was introduced with "MMX2",
 295    which was packaged into SSE1; it is also present in the AMD MMX
 296    extension.  Mark the function as using "sse" so that we emit a real
 297    "emms" instruction, rather than the 3dNOW "femms" instruction.  */
 298
 299 static const uchar *
 300 #ifndef __SSE__
 301 __attribute__((__target__("sse")))
 302 #endif
 303 search_line_mmx (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 304 {
 305   typedef char v8qi __attribute__ ((__vector_size__ (8)));
 306   typedef int __m64 __attribute__ ((__vector_size__ (8), __may_alias__));
 307
 308   const v8qi repl_nl = *(const v8qi *)repl_chars[0];
 309   const v8qi repl_cr = *(const v8qi *)repl_chars[1];
 310   const v8qi repl_bs = *(const v8qi *)repl_chars[2];
 311   const v8qi repl_qm = *(const v8qi *)repl_chars[3];
 312
 313   unsigned int misalign, found, mask;
 314   const v8qi *p;
 315   v8qi data, t, c;
 316
 317   /* Align the source pointer.  While MMX doesn't generate unaligned data
 318      faults, this allows us to safely scan to the end of the buffer without
 319      reading beyond the end of the last page.  */
 320   misalign = (uintptr_t)s & 7;
 321   p = (const v8qi *)((uintptr_t)s & -8);
 322   data = *p;
 323
 324   /* Create a mask for the bytes that are valid within the first
 325      16-byte block.  The Idea here is that the AND with the mask
 326      within the loop is "free", since we need some AND or TEST
 327      insn in order to set the flags for the branch anyway.  */
 328   mask = -1u << misalign;
 329
 330   /* Main loop processing 8 bytes at a time.  */
 331   goto start;
 332   do
 333     {
 334       data = *++p;
 335       mask = -1;
 336
 337     start:
 338       t = __builtin_ia32_pcmpeqb(data, repl_nl);
 339       c = __builtin_ia32_pcmpeqb(data, repl_cr);
 340       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
 341       c = __builtin_ia32_pcmpeqb(data, repl_bs);
 342       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
 343       c = __builtin_ia32_pcmpeqb(data, repl_qm);
 344       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
 345       found = __builtin_ia32_pmovmskb (t);
 346       found &= mask;
 347     }
 348   while (!found);
 349
 350   __builtin_ia32_emms ();
 351
 352   /* FOUND contains 1 in bits for which we matched a relevant
 353      character.  Conversion to the byte index is trivial.  */
 354   found = __builtin_ctz(found);
 355   return (const uchar *)p + found;
 356 }
 357
 358 /* A version of the fast scanner using SSE2 vectorized byte compare insns.  */
 359
 360 static const uchar *
 361 #ifndef __SSE2__
 362 __attribute__((__target__("sse2")))
 363 #endif
 364 search_line_sse2 (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 365 {
 366   typedef char v16qi __attribute__ ((__vector_size__ (16)));
 367
 368   const v16qi repl_nl = *(const v16qi *)repl_chars[0];
 369   const v16qi repl_cr = *(const v16qi *)repl_chars[1];
 370   const v16qi repl_bs = *(const v16qi *)repl_chars[2];
 371   const v16qi repl_qm = *(const v16qi *)repl_chars[3];
 372
 373   unsigned int misalign, found, mask;
 374   const v16qi *p;
 375   v16qi data, t;
 376
 377   /* Align the source pointer.  */
 378   misalign = (uintptr_t)s & 15;
 379   p = (const v16qi *)((uintptr_t)s & -16);
 380   data = *p;
 381
 382   /* Create a mask for the bytes that are valid within the first
 383      16-byte block.  The Idea here is that the AND with the mask
 384      within the loop is "free", since we need some AND or TEST
 385      insn in order to set the flags for the branch anyway.  */
 386   mask = -1u << misalign;
 387
 388   /* Main loop processing 16 bytes at a time.  */
 389   goto start;
 390   do
 391     {
 392       data = *++p;
 393       mask = -1;
 394
 395     start:
 396       t  = __builtin_ia32_pcmpeqb128(data, repl_nl);
 397       t |= __builtin_ia32_pcmpeqb128(data, repl_cr);
 398       t |= __builtin_ia32_pcmpeqb128(data, repl_bs);
 399       t |= __builtin_ia32_pcmpeqb128(data, repl_qm);
 400       found = __builtin_ia32_pmovmskb128 (t);
 401       found &= mask;
 402     }
 403   while (!found);
 404
 405   /* FOUND contains 1 in bits for which we matched a relevant
 406      character.  Conversion to the byte index is trivial.  */
 407   found = __builtin_ctz(found);
 408   return (const uchar *)p + found;
 409 }
 410
 411 #ifdef HAVE_SSE4
 412 /* A version of the fast scanner using SSE 4.2 vectorized string insns.
        Returns a pointer to the first character from the SEARCH set
        (\n, \r, ?, \\) at or after S.  END is consulted only to decide
        whether an unaligned 16-byte read at S is safe.  */
 413
 414 static const uchar *
 415 #ifndef __SSE4_2__
 416 __attribute__((__target__("sse4.2")))
 417 #endif
 418 search_line_sse42 (const uchar *s, const uchar *end)
 419 {
 420   typedef char v16qi __attribute__ ((__vector_size__ (16)));
       /* Only the first four elements are meaningful; the length 4 is
          passed explicitly to the string instruction below.  */
 421   static const v16qi search = { '\n', '\r', '?', '\\' };
 422
 423   uintptr_t si = (uintptr_t)s;
 424   uintptr_t index;
 425
 426   /* Check for unaligned input.  */
 427   if (si & 15)
 428     {
 429       v16qi sv;
 430
 431       if (__builtin_expect (end - s < 16, 0)
 432           && __builtin_expect ((si & 0xfff) > 0xff0, 0))
 433         {
 434           /* There are less than 16 bytes left in the buffer, and less
 435              than 16 bytes left on the page.  Reading 16 bytes at this
 436              point might generate a spurious page fault.  Defer to the
 437              SSE2 implementation, which already handles alignment.  */
 438           return search_line_sse2 (s, end);
 439         }
 440
 441       /* ??? The builtin doesn't understand that the PCMPESTRI read from
 442          memory need not be aligned.  */
 443       sv = __builtin_ia32_loaddqu ((const char *) s);
 444       index = __builtin_ia32_pcmpestri128 (search, 4, sv, 16, 0);
 445
           /* The code treats an index below 16 as a match within the 16
              bytes just examined.  */
 446       if (__builtin_expect (index < 16, 0))
 447         goto found;
 448
 449       /* Advance the pointer to an aligned address.  We will re-scan a
 450          few bytes, but we no longer need care for reading past the
 451          end of a page, since we're guaranteed a match.  */
 452       s = (const uchar *)((si + 16) & -16);
 453     }
 454
 455   /* Main loop, processing 16 bytes at a time.  By doing the whole loop
 456      in inline assembly, we can make proper use of the flags set.

          Operands: %0 is INDEX, an early-clobber output in the "c"
          register; %1 is S, updated in place; %2 is SEARCH in an SSE
          register; the "a" and "d" registers carry the lengths 4 and 16.
          The initial SUB cancels the first ADD, so the first compare is
          done at S itself.  JNC repeats the loop while the carry flag is
          clear, presumably meaning "no match in this block" -- see the
          PCMPESTRI definition.

          NOTE(review): the asm declares neither a memory input for the
          bytes read through %1 nor a "memory" clobber -- confirm that
          the compiler cannot move buffer stores across it.  */
 457   __asm (      "sub $16, %1\n"
 458         "       .balign 16\n"
 459         "0:     add $16, %1\n"
 460         "       %vpcmpestri $0, (%1), %2\n"
 461         "       jnc 0b"
 462         : "=&c"(index), "+r"(s)
 463         : "x"(search), "a"(4), "d"(16));
 464
 465  found:
 466   return s + index;
 467 }
 468
 469 #else
 470 /* Work around out-dated assemblers without sse4 support: every use of
        search_line_sse42 then resolves to the SSE2 scanner.  */
 471 #define search_line_sse42 search_line_sse2
 472 #endif
 473
 474 /* Check the CPU capabilities.  */
 475
 476 #include "../gcc/config/i386/cpuid.h"
 477
 478 typedef const uchar * (*search_line_fast_type) (const uchar *, const uchar *);  /* Signature shared by all scanners.  */
 479 static search_line_fast_type search_line_fast;  /* Scanner in use; assigned by init_vectorized_lexer.  */
 480
 481 #define HAVE_init_vectorized_lexer 1  /* Tested by _cpp_init_lexer to decide whether to call the initializer.  */
 482 static inline void
 483 init_vectorized_lexer (void)
 484 {
 485   unsigned dummy, ecx = 0, edx = 0;
 486   search_line_fast_type impl = search_line_acc_char;
 487   int minimum = 0;
 488
 489 #if defined(__SSE4_2__)
 490   minimum = 3;
 491 #elif defined(__SSE2__)
 492   minimum = 2;
 493 #elif defined(__SSE__)
 494   minimum = 1;
 495 #endif
 496
 497   if (minimum == 3)
 498     impl = search_line_sse42;
 499   else if (__get_cpuid (1, &dummy, &dummy, &ecx, &edx) || minimum == 2)
 500     {
 501       if (minimum == 3 || (ecx & bit_SSE4_2))
 502         impl = search_line_sse42;
 503       else if (minimum == 2 || (edx & bit_SSE2))
 504         impl = search_line_sse2;
 505       else if (minimum == 1 || (edx & bit_SSE))
 506         impl = search_line_mmx;
 507     }
 508   else if (__get_cpuid (0x80000001, &dummy, &dummy, &dummy, &edx))
 509     {
 510       if (minimum == 1
 511           || (edx & (bit_MMXEXT | bit_CMOV)) == (bit_MMXEXT | bit_CMOV))
 512         impl = search_line_mmx;
 513     }
 514
 515   search_line_fast = impl;
 516 }
 517
 518 #elif (GCC_VERSION >= 4005) && defined(__ALTIVEC__)
 519
 520 /* A version of the fast scanner using AltiVec vectorized byte compares.
        Returns a pointer to the first \n, \r, \\ or ? at or after S.  END
        is not consulted.  */
 521 /* ??? Unfortunately, attribute(target("altivec")) is not yet supported,
 522    so we can't compile this function without -maltivec on the command line
 523    (or implied by some other switch).  */
 524
 525 static const uchar *
 526 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 527 {
 528   typedef __attribute__((altivec(vector))) unsigned char vc;
 529
 530   const vc repl_nl = {
 531     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
 532     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
 533   };
 534   const vc repl_cr = {
 535     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
 536     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
 537   };
 538   const vc repl_bs = {
 539     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
 540     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
 541   };
 542   const vc repl_qm = {
 543     '?', '?', '?', '?', '?', '?', '?', '?',
 544     '?', '?', '?', '?', '?', '?', '?', '?',
 545   };
 546   const vc ones = {
 547     -1, -1, -1, -1, -1, -1, -1, -1,
 548     -1, -1, -1, -1, -1, -1, -1, -1,
 549   };
 550   const vc zero = { 0 };
 551
 552   vc data, mask, t;
 553
 554   /* Altivec loads automatically mask addresses with -16.  This lets us
 555      issue the first load as early as possible.  */
 556   data = __builtin_vec_ld(0, (const vc *)s);
 557
 558   /* Discard bytes before the beginning of the buffer.  Do this by
 559      beginning with all ones and shifting in zeros according to the
 560      mis-alignment.  The LVSR instruction pulls the exact shift we
 561      want from the address.  */
 562   mask = __builtin_vec_lvsr(0, s);
 563   mask = __builtin_vec_perm(zero, ones, mask);
 564   data &= mask;
 565
 566   /* While altivec loads mask addresses, we still need to align S so
 567      that the offset we compute at the end is correct.  */
 568   s = (const uchar *)((uintptr_t)s & -16);
 569
 570   /* Main loop processing 16 bytes at a time.  The first iteration is
          entered at START so that the block loaded (and masked) above
          is examined before S is advanced.  */
 571   goto start;
 572   do
 573     {
 574       vc m_nl, m_cr, m_bs, m_qm;
 575
 576       s += 16;
 577       data = __builtin_vec_ld(0, (const vc *)s);
 578
 579     start:
 580       m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
 581       m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
 582       m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
 583       m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
 584       t = (m_nl | m_cr) | (m_bs | m_qm);
 585
 586       /* T now contains 0xff in bytes for which we matched one of the relevant
 587          characters.  We want to exit the loop if any byte in T is non-zero.
 588          Below is the expansion of vec_any_ne(t, zero).  */
 589     }
 590   while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));
 591
 592   {
 593 #define N  (sizeof(vc) / sizeof(long))
 594
 595     union {
 596       vc v;
 597       /* Statically assert that N is 2 or 4.  */
 598       unsigned long l[(N == 2 || N == 4) ? N : -1];
 599     } u;
 600     unsigned long l, i = 0;
 601
 602     u.v = t;
 603
 604     /* Find the first word of T that is non-zero.  S is advanced past
            each all-zero word that is skipped.  */
 605     switch (N)
 606       {
 607       case 4:
 608         l = u.l[i++];
 609         if (l != 0)
 610           break;
 611         s += sizeof(unsigned long);
 612         l = u.l[i++];
 613         if (l != 0)
 614           break;
 615         s += sizeof(unsigned long);
             /* Fall through: the two remaining words are handled exactly
                like the N == 2 case.  */
 616       case 2:
 617         l = u.l[i++];
 618         if (l != 0)
 619           break;
 620         s += sizeof(unsigned long);
 621         l = u.l[i];
 622       }
 623
 624     /* L now contains 0xff in bytes for which we matched one of the
 625        relevant characters.  We can find the byte index by finding
 626        its bit index and dividing by 8.
            NOTE(review): counting leading zeros picks the lowest-addressed
            matching byte only when the bytes of L are in big-endian
            order -- confirm before enabling this on a little-endian
            target.  */
 627     l = __builtin_clzl(l) >> 3;
 628     return s + l;
 629
 630 #undef N
 631   }
 632 }
 633
 634 #elif defined (__ARM_NEON__)
 635 #include "arm_neon.h"
 636
 637 static const uchar *
 638 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 639 {
 640   const uint8x16_t repl_nl = vdupq_n_u8 ('\n');
 641   const uint8x16_t repl_cr = vdupq_n_u8 ('\r');
 642   const uint8x16_t repl_bs = vdupq_n_u8 ('\\');
 643   const uint8x16_t repl_qm = vdupq_n_u8 ('?');
 644   const uint8x16_t xmask = (uint8x16_t) vdupq_n_u64 (0x8040201008040201ULL);
 645
 646   unsigned int misalign, found, mask;
 647   const uint8_t *p;
 648   uint8x16_t data;
 649
 650   /* Align the source pointer.  */
 651   misalign = (uintptr_t)s & 15;
 652   p = (const uint8_t *)((uintptr_t)s & -16);
 653   data = vld1q_u8 (p);
 654
 655   /* Create a mask for the bytes that are valid within the first
 656      16-byte block.  The Idea here is that the AND with the mask
 657      within the loop is "free", since we need some AND or TEST
 658      insn in order to set the flags for the branch anyway.  */
 659   mask = (-1u << misalign) & 0xffff;
 660
 661   /* Main loop, processing 16 bytes at a time.  */
 662   goto start;
 663
 664   do
 665     {
 666       uint8x8_t l;
 667       uint16x4_t m;
 668       uint32x2_t n;
 669       uint8x16_t t, u, v, w;
 670
 671       p += 16;
 672       data = vld1q_u8 (p);
 673       mask = 0xffff;
 674
 675     start:
 676       t = vceqq_u8 (data, repl_nl);
 677       u = vceqq_u8 (data, repl_cr);
 678       v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
 679       w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
 680       t = vandq_u8 (vorrq_u8 (v, w), xmask);
 681       l = vpadd_u8 (vget_low_u8 (t), vget_high_u8 (t));
 682       m = vpaddl_u8 (l);
 683       n = vpaddl_u16 (m);
 684
 685       found = vget_lane_u32 ((uint32x2_t) vorr_u64 ((uint64x1_t) n,
 686               vshr_n_u64 ((uint64x1_t) n, 24)), 0);
 687       found &= mask;
 688     }
 689   while (!found);
 690
 691   /* FOUND contains 1 in bits for which we matched a relevant
 692      character.  Conversion to the byte index is trivial.  */
 693   found = __builtin_ctz (found);
 694   return (const uchar *)p + found;
 695 }
 696
 697 #else
 698
 699 /* We only have one accelerated alternative.  Use a direct call so that
 700    we encourage inlining.  */
 701
 702 #define search_line_fast  search_line_acc_char
 703
 704 #endif
 705
 706 /* Initialize the lexer if needed.  This calls init_vectorized_lexer
        when HAVE_init_vectorized_lexer is defined and does nothing
        otherwise.  */
 707
 708 void
 709 _cpp_init_lexer (void)
 710 {
 711 #ifdef HAVE_init_vectorized_lexer
 712   init_vectorized_lexer ();
 713 #endif
 714 }
 715
 716 /* Returns with a logical line that contains no escaped newlines or
 717    trigraphs.  This is a time-critical inner loop.

        On entry the line starts at the buffer's next_line.  The note list
        is reset, and cur and line_base are set to the start of the line.
        On exit the cleaned line ends in '\n', a sentinel note follows the
        last real note, and next_line points just past the end of the
        text that was consumed.  Line notes are recorded for every
        escaped newline and for every trigraph.  */
 718 void
 719 _cpp_clean_line (cpp_reader *pfile)
 720 {
 721   cpp_buffer *buffer;
 722   const uchar *s;
       /* S is the read pointer; D is the write pointer, which falls
          behind S once characters have been removed from the line.  */
 723   uchar c, *d, *p;
 724
 725   buffer = pfile->buffer;
 726   buffer->cur_note = buffer->notes_used = 0;
 727   buffer->cur = buffer->line_base = buffer->next_line;
 728   buffer->need_line = false;
 729   s = buffer->next_line;
 730
 731   if (!buffer->from_stage3)
 732     {
 733       const uchar *pbackslash = NULL;
 734
 735       /* Fast path.  This is the common case of an un-escaped line with
 736          no trigraphs.  The primary win here is by not writing any
 737          data back to memory until we have to.  */
 738       while (1)
 739         {
 740           /* Perform an optimized search for \n, \r, \\, ?.  */
 741           s = search_line_fast (s, buffer->rlimit);
 742
 743           c = *s;
 744           if (c == '\\')
 745             {
 746               /* Record the location of the backslash and continue.  */
 747               pbackslash = s++;
 748             }
 749           else if (__builtin_expect (c == '?', 0))
 750             {
 751               if (__builtin_expect (s[1] == '?', false)
 752                    && _cpp_trigraph_map[s[2]])
 753                 {
 754                   /* Have a trigraph.  We may or may not have to convert
 755                      it.  Add a line note regardless, for -Wtrigraphs.  */
 756                   add_line_note (buffer, s, s[2]);
 757                   if (CPP_OPTION (pfile, trigraphs))
 758                     {
 759                       /* We do, and that means we have to switch to the
 760                          slow path.  */
 761                       d = (uchar *) s;
 762                       *d = _cpp_trigraph_map[s[2]];
 763                       s += 2;
 764                       goto slow_path;
 765                     }
 766                 }
 767               /* Not a trigraph, or one that is left unconverted.
                      Continue on fast-path.  */
 768               s++;
 769             }
 770           else
 771             break;
 772         }
 773
 774       /* This must be \r or \n.  We're either done, or we'll be forced
 775          to write back to the buffer and continue on the slow path.  */
 776       d = (uchar *) s;
 777
 778       if (__builtin_expect (s == buffer->rlimit, false))
 779         goto done;
 780
 781       /* DOS line ending? */
 782       if (__builtin_expect (c == '\r', false) && s[1] == '\n')
 783         {
 784           s++;
 785           if (s == buffer->rlimit)
 786             goto done;
 787         }
 788
 789       if (__builtin_expect (pbackslash == NULL, true))
 790         goto done;
 791
 792       /* Check for escaped newline.  Only the most recent backslash is
              remembered, so the newline is escaped only if nothing but
              non-vertical whitespace separates it from that backslash.  */
 793       p = d;
 794       while (is_nvspace (p[-1]))
 795         p--;
 796       if (p - 1 != pbackslash)
 797         goto done;
 798
 799       /* Have an escaped newline; process it and proceed to
 800          the slow path.  The note type is ' ' when whitespace
              separated the backslash from the newline, '\\' otherwise.  */
 801       add_line_note (buffer, p - 1, p != d ? ' ' : '\\');
 802       d = p - 2;
 803       buffer->next_line = p - 1;
 804
 805     slow_path:
           /* Slow path: copy characters down from S to D one at a time,
              dropping escaped newlines and replacing converted trigraphs.  */
 806       while (1)
 807         {
 808           c = *++s;
 809           *++d = c;
 810
 811           if (c == '\n' || c == '\r')
 812             {
 813               /* Handle DOS line endings.  */
 814               if (c == '\r' && s != buffer->rlimit && s[1] == '\n')
 815                 s++;
 816               if (s == buffer->rlimit)
 817                 break;
 818
 819               /* Escaped?  The backward scan stops at next_line.  */
 820               p = d;
 821               while (p != buffer->next_line && is_nvspace (p[-1]))
 822                 p--;
 823               if (p == buffer->next_line || p[-1] != '\\')
 824                 break;
 825
 826               add_line_note (buffer, p - 1, p != d ? ' ': '\\');
 827               d = p - 2;
 828               buffer->next_line = p - 1;
 829             }
 830           else if (c == '?' && s[1] == '?' && _cpp_trigraph_map[s[2]])
 831             {
 832               /* Add a note regardless, for the benefit of -Wtrigraphs.  */
 833               add_line_note (buffer, d, s[2]);
 834               if (CPP_OPTION (pfile, trigraphs))
 835                 {
 836                   *d = _cpp_trigraph_map[s[2]];
 837                   s += 2;
 838                 }
 839             }
 840         }
 841     }
 842   else
 843     {
           /* No trigraph or backslash processing is done in this branch;
              just find the end of the line.  */
 844       while (*s != '\n' && *s != '\r')
 845         s++;
 846       d = (uchar *) s;
 847
 848       /* Handle DOS line endings.  */
 849       if (*s == '\r' && s != buffer->rlimit && s[1] == '\n')
 850         s++;
 851     }
 852
 853  done:
       /* Terminate the cleaned line at D; S is at the last character
          consumed from the input.  */
 854   *d = '\n';
 855   /* A sentinel note that should never be processed.  */
 856   add_line_note (buffer, d + 1, '\n');
 857   buffer->next_line = s + 1;
 858 }
 859
 860 /* Return true if the trigraph indicated by NOTE should be warned
 861    about in a comment.  */
 862 static bool
 863 warn_in_comment (cpp_reader *pfile, _cpp_line_note *note)
 864 {
 865   const uchar *p;
 866
 867   /* Within comments we don't warn about trigraphs, unless the
 868      trigraph forms an escaped newline, as that may change
 869      behavior.  */
 870   if (note->type != '/')
 871     return false;
 872
 873   /* If -trigraphs, then this was an escaped newline iff the next note
 874      is coincident.  */
 875   if (CPP_OPTION (pfile, trigraphs))
 876     return note[1].pos == note->pos;
 877
 878   /* Otherwise, see if this forms an escaped newline.  */
 879   p = note->pos + 3;
 880   while (is_nvspace (*p))
 881     p++;
 882
 883   /* There might have been escaped newlines between the trigraph and the
 884      newline we found.  Hence the position test.  */
 885   return (*p == '\n' && p < note[1].pos);
 886 }
 887
 888 /* Consume every note recorded by add_line_note that lies at or before
 889    buffer->cur, issuing whatever diagnostic each one calls for.
 890    IN_COMMENT is nonzero when called while skipping a comment.  */
 891 void
 892 _cpp_process_line_notes (cpp_reader *pfile, int in_comment)
 893 {
 894   cpp_buffer *buffer = pfile->buffer;
 895
 896   for (;;)
 897     {
 898       _cpp_line_note *note = &buffer->notes[buffer->cur_note];
 899       unsigned int col;
 900
 901       if (note->pos > buffer->cur)
 902         return;
 903       buffer->cur_note++;
 904       col = CPP_BUF_COLUMN (buffer, note->pos + 1);
 905
 906       /* An escaped newline, possibly with a space before the newline.  */
 907       if (note->type == '\\' || note->type == ' ')
 908         {
 909           if (!in_comment && note->type == ' ')
 910             cpp_error_with_line (pfile, CPP_DL_WARNING,
 911                                  pfile->line_table->highest_line, col,
 912                                  "backslash and newline separated by space");
 913           if (buffer->next_line > buffer->rlimit)
 914             {
 915               cpp_error_with_line (pfile, CPP_DL_PEDWARN,
 916                                    pfile->line_table->highest_line, col,
 917                                    "backslash-newline at end of file");
 918               /* Prevent "no newline at end of file" warning.  */
 919               buffer->next_line = buffer->rlimit;
 920             }
 921           buffer->line_base = note->pos;
 922           CPP_INCREMENT_LINE (pfile, 0);
 923           continue;
 924         }
 925
 926       if (!_cpp_trigraph_map[note->type])
 927         {
 928           /* Zero: already processed in lex_raw_string.  Else a bug.  */
 929           if (note->type != 0)
 930             abort ();
 931           continue;
 932         }
 933
 934       if (!CPP_OPTION (pfile, warn_trigraphs)
 935           || (in_comment && !warn_in_comment (pfile, note)))
 936         continue;
 937
 938       if (CPP_OPTION (pfile, trigraphs))
 939         cpp_warning_with_line (pfile, CPP_W_TRIGRAPHS,
 940                                pfile->line_table->highest_line, col,
 941                                "trigraph ??%c converted to %c", note->type,
 942                                (int) _cpp_trigraph_map[note->type]);
 943       else
 944         cpp_warning_with_line (pfile, CPP_W_TRIGRAPHS,
 945                                pfile->line_table->highest_line, col,
 946                                "trigraph ??%c ignored, use -trigraphs to enable",
 947                                note->type);
 948     }
 949 }
 950
 951 /* Skip a C-style block comment.  On entry buffer->cur points to the
 952    asterisk that opens the comment.  The end is found by checking, at
 953    every '/', whether an asterisk comes right before it.  Returns true
 954    if the comment was terminated by EOF; otherwise returns false with
 955    buffer->cur moved past the closing delimiter.  */
 956 bool
 957 _cpp_skip_block_comment (cpp_reader *pfile)
 958 {
 959   cpp_buffer *buffer = pfile->buffer;
 960   const uchar *p = buffer->cur + 1;
 961
 962   /* A '/' right after the opening asterisk does not end the comment.  */
 963   if (*p == '/')
 964     p++;
 965
 966   for (;;)
 967     switch (*p++)
 968       {
 969       case '/':
 970         /* People like decorating comments with '*', so it is cheaper
 971            to look for '/' and then check what precedes it.  */
 972         if (p[-2] == '*')
 973           {
 974             buffer->cur = p;
 975             _cpp_process_line_notes (pfile, true);
 976             return false;
 977           }
 978
 979         /* Warn about potential nested comments, but not if the '/'
 980            comes immediately before the true comment delimiter.
 981            Escaped newlines are deliberately not accounted for.  */
 982         if (CPP_OPTION (pfile, warn_comments) && p[0] == '*' && p[1] != '/')
 983           {
 984             buffer->cur = p;
 985             cpp_warning_with_line (pfile, CPP_W_COMMENTS,
 986                                    pfile->line_table->highest_line,
 987                                    CPP_BUF_COL (buffer),
 988                                    "\"/*\" within comment");
 989           }
 990         break;
 991
 992       case '\n':
 993         {
 994           unsigned int cols;
 995
 996           /* Deal with the notes of the line just finished, and give
 997              up if no further line is available.  */
 998           buffer->cur = p - 1;
 999           _cpp_process_line_notes (pfile, true);
1000           if (buffer->next_line >= buffer->rlimit)
1001             return true;
1002
1003           _cpp_clean_line (pfile);
1004           cols = buffer->next_line - buffer->line_base;
1005           CPP_INCREMENT_LINE (pfile, cols);
1006           p = buffer->cur;
1007         }
1008         break;
1009       }
1010 }
1011
1012 /* Advance buffer->cur to the newline that ends a C++ line comment and
1013    process the line notes up to there.  Returns nonzero if that moved
1014    the line table's highest_line, i.e. the comment spans several lines.  */
1015 static int
1016 skip_line_comment (cpp_reader *pfile)
1017 {
1018   const source_location start_line = pfile->line_table->highest_line;
1019   const uchar *end = pfile->buffer->cur;
1020
1021   while (*end != '\n')
1022     end++;
1023   pfile->buffer->cur = end;
1024   _cpp_process_line_notes (pfile, true);
1025   return pfile->line_table->highest_line != start_line;
1026 }
1027
1028 /* Skip a run of non-vertical whitespace (' ', \t, \f, \v and \0).  C is
1029    its first character; the following ones are read from buffer->cur,
1030    which is left on the first character that is not part of the run.  */
1031 static void
1032 skip_whitespace (cpp_reader *pfile, cppchar_t c)
1033 {
1034   cpp_buffer *buffer = pfile->buffer;
1035   bool nul_seen = false;
1036
1037   for (;;)
1038     {
1039       if (c == '\0')
1040         nul_seen = true;
1041       /* Apart from blanks and tabs, only \f and \v can get here.  */
1042       else if (c != ' ' && c != '\t'
1043                && pfile->state.in_directive && CPP_PEDANTIC (pfile))
1044         cpp_error_with_line (pfile, CPP_DL_PEDWARN,
1045                              pfile->line_table->highest_line,
1046                              CPP_BUF_COL (buffer),
1047                              "%s in preprocessing directive",
1048                              c == '\f' ? "form feed" : "vertical tab");
1049       c = *buffer->cur++;
1050       if (!is_nvspace (c))
1051         break;
1052     }
1053
1054   if (nul_seen)
1055     cpp_error (pfile, CPP_DL_WARNING, "null character(s) ignored");
1056
1057   buffer->cur--;
1058 }
1059
1060 /* Return 1 if the spelling STRING of a number token consists only of
1061    identifier characters (no '.', '+' or '-'), else 0.  PFILE is unused.  */
1062 static int
1063 name_p (cpp_reader *pfile, const cpp_string *string)
1064 {
1065   const uchar *p = string->text;
1066   unsigned int left;
1067
1068   for (left = string->len; left > 0; left--, p++)
1069     if (!is_idchar (*p))
1070       return 0;
1071   return 1;
1072 }
1073
1074 /* After parsing an identifier or other sequence into TOKEN, warn when
1075    NORMALIZE_STATE_RESULT (S) exceeds the warn_normalize option, i.e.
1076    the sequence is not in NFC/NFKC.  Says nothing while skipping.  */
1077 static void
1078 warn_about_normalization (cpp_reader *pfile,
1079                           const cpp_token *token,
1080                           const struct normalize_state *s)
1081 {
1082   if (pfile->state.skipping
1083       || CPP_OPTION (pfile, warn_normalize) >= NORMALIZE_STATE_RESULT (s))
1084     return;
1085
1086   /* Make sure that the token is printed using UCNs, even if we'd
1087      otherwise happily print UTF-8.  */
1088   unsigned char *spelling = XNEWVEC (unsigned char, cpp_token_len (token));
1089   size_t len = cpp_spell_token (pfile, token, spelling, false) - spelling;
1090
1091   if (NORMALIZE_STATE_RESULT (s) == normalized_C)
1092     cpp_warning_with_line (pfile, CPP_W_NORMALIZE, token->src_loc, 0,
1093                            "`%.*s' is not in NFKC", (int) len, spelling);
1094   else
1095     cpp_warning_with_line (pfile, CPP_W_NORMALIZE, token->src_loc, 0,
1096                            "`%.*s' is not in NFC", (int) len, spelling);
1097   free (spelling);
1098 }
1099
1100 /* Returns TRUE if the sequence starting at buffer->cur can continue
1101    (or, when FIRST is nonzero, start) an identifier: either a '$' that
1102    the dollars_in_ident option allows, or a UCN that _cpp_valid_ucn
1103    accepts.  A successful match leaves buffer->cur advanced.  */
1104 static bool
1105 forms_identifier_p (cpp_reader *pfile, int first,
1106                     struct normalize_state *state)
1107 {
1108   cpp_buffer *buffer = pfile->buffer;
1109   const uchar lead = *buffer->cur;
1110
1111   if (lead == '$')
1112     {
1113       if (!CPP_OPTION (pfile, dollars_in_ident))
1114         return false;
1115       buffer->cur++;
1116       if (!pfile->state.skipping && CPP_OPTION (pfile, warn_dollars))
1117         {
1118           /* Clearing the option keeps this from being repeated.  */
1119           CPP_OPTION (pfile, warn_dollars) = 0;
1120           cpp_error (pfile, CPP_DL_PEDWARN, "'$' in identifier or number");
1121         }
1122       return true;
1123     }
1124
1125   /* Anything else must look like a UCN: a backslash, then 'u' or 'U'.  */
1126   if (lead != '\\' || !CPP_OPTION (pfile, extended_identifiers))
1127     return false;
1128   if (buffer->cur[1] != 'u' && buffer->cur[1] != 'U')
1129     return false;
1130   /* Step over those two characters; give them back on failure.  */
1131   buffer->cur += 2;
1132   if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first, state))
1133     return true;
1134   buffer->cur -= 2;
1135   return false;
1136 }
1137
1138 /* Subroutine of lex_identifier_intern and lex_identifier: issue the
1139    diagnostics that lexing NODE calls for.  The callers have already
1140    checked that NODE carries NODE_DIAGNOSTIC and that we are not
1141    skipping, which keeps the common case down to a single test.  */
1142 static void
1143 identifier_diagnostics_on_lex (cpp_reader *pfile, cpp_hashnode *node)
1144 {
1145   /* It is allowed to poison the same identifier twice.  */
1146   if ((node->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
1147     cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
1148                NODE_NAME (node));
1149
1150   /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
1151      replacement list of a variadic macro.  */
1152   if (node == pfile->spec_nodes.n__VA_ARGS__
1153       && !pfile->state.va_args_ok)
1154     cpp_error (pfile, CPP_DL_PEDWARN,
1155                "__VA_ARGS__ can only appear in the expansion"
1156                " of a C99 variadic macro");
1157
1158   /* For -Wc++-compat, warn about use of C++ named operators.  */
1159   if (node->flags & NODE_WARN_OPERATOR)
1160     cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
1161                  "identifier \"%s\" is a special operator name in C++",
1162                  NODE_NAME (node));
1163 }
1164
1165 /* Helper function to get the cpp_hashnode of the identifier BASE.
1166    The identifier runs from BASE up to the first character that
1167    ISIDNUM rejects, and is looked up with HT_ALLOC.  */
1168 static cpp_hashnode *
1169 lex_identifier_intern (cpp_reader *pfile, const uchar *base)
1170 {
1171   cpp_hashnode *result;
1172   const uchar *cur;
1173   unsigned int len;
1174   unsigned int hash = HT_HASHSTEP (0, *base);
1175
1176   cur = base + 1;
1177   while (ISIDNUM (*cur))
1178     {
1179       hash = HT_HASHSTEP (hash, *cur);
1180       cur++;
1181     }
1182   len = cur - base;
1183   hash = HT_HASHFINISH (hash, len);
1184   result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
1185                                               base, len, hash, HT_ALLOC));
1186
1187   /* Rarely, identifiers require diagnostics when lexed.  */
1188   if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
1189                         && !pfile->state.skipping, 0))
1190     identifier_diagnostics_on_lex (pfile, result);
1191
1192   return result;
1193 }
1194
1195 /* Get the cpp_hashnode of an identifier specified by NAME in
1196    the current cpp_reader object.  The lookup is done by
1197    lex_identifier_intern, which dereferences its result
1198    unconditionally, so NULL is not returned here.  */
1199 cpp_hashnode *
1200 _cpp_lex_identifier (cpp_reader *pfile, const char *name)
1201 {
1202   return lex_identifier_intern (pfile, (const uchar *) name);
1203 }
1204
1205 /* Lex an identifier starting at BUFFER->CUR - 1; BASE points at its
1206    first character.  When STARTS_UCN is true (presumably because the
1207    identifier begins with a UCN) the quick scan is skipped and only
1208    the slower path is used.  NST receives the normalization state
1209    updates made on that slower path.  The result is the identifier's
1210    hash node, and BUFFER->CUR is left at the first character that is
1211    not part of the identifier.  */
1212 static cpp_hashnode *
1213 lex_identifier (cpp_reader *pfile, const uchar *base, bool starts_ucn,
1214                 struct normalize_state *nst)
1215 {
1216   cpp_hashnode *result;
1217   const uchar *cur;
1218   unsigned int len;
1219   unsigned int hash = HT_HASHSTEP (0, *base);
1220
1221   /* Fast path: take plain identifier characters, hashing them as they
1222      are scanned.  */
1223   cur = pfile->buffer->cur;
1224   if (! starts_ucn)
1225     while (ISIDNUM (*cur))
1226       {
1227         hash = HT_HASHSTEP (hash, *cur);
1228         cur++;
1229       }
1230   pfile->buffer->cur = cur;
1231   if (starts_ucn || forms_identifier_p (pfile, false, nst))
1232     {
1233       /* Slower version for identifiers containing UCNs (or $).  */
1234       do {
1235         while (ISIDNUM (*pfile->buffer->cur))
1236           {
1237             pfile->buffer->cur++;
1238             NORMALIZE_STATE_UPDATE_IDNUM (nst);
1239           }
1240       } while (forms_identifier_p (pfile, false, nst));
1241       result = _cpp_interpret_identifier (pfile, base,
1242                                           pfile->buffer->cur - base);
1243     }
1244   else
1245     {
1246       /* The hash accumulated above covers the whole identifier.  */
1247       len = cur - base;
1248       hash = HT_HASHFINISH (hash, len);
1249
1250       result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
1251                                                   base, len, hash, HT_ALLOC));
1252     }
1253
1254   /* Rarely, identifiers require diagnostics when lexed.  */
1255   if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
1256                         && !pfile->state.skipping, 0))
1257     identifier_diagnostics_on_lex (pfile, result);
1258
1259   return result;
1260 }
1261
1262 /* Lex a number whose first character is at BUFFER->CUR - 1.  BUFFER->CUR
1263    ends up at the first character not taken, and NUMBER describes a
1264    NUL-terminated copy of the spelling from _cpp_unaligned_alloc.  */
1265 static void
1266 lex_number (cpp_reader *pfile, cpp_string *number,
1267             struct normalize_state *nst)
1268 {
1269   const uchar *const start = pfile->buffer->cur - 1;
1270   const uchar *p;
1271   uchar *copy;
1272
1273   for (;;)
1274     {
1275       /* N.B. ISIDNUM does not include $.  */
1276       p = pfile->buffer->cur;
1277       while (ISIDNUM (*p) || *p == '.' || DIGIT_SEP (*p)
1278              || VALID_SIGN (*p, p[-1]))
1279         {
1280           p++;
1281           NORMALIZE_STATE_UPDATE_IDNUM (nst);
1282         }
1283       pfile->buffer->cur = p;
1284       if (!forms_identifier_p (pfile, false, nst))
1285         break;
1286     }
1287
1288   number->len = p - start;
1289   copy = _cpp_unaligned_alloc (pfile, number->len + 1);
1290   memcpy (copy, start, number->len);
1291   copy[number->len] = '\0';
1292   number->text = copy;
1293 }
1294
1295 /* Make TOKEN a literal of type TYPE whose spelling is a NUL-terminated
1296    copy of the LEN characters at BASE.  */
1297 static void
1298 create_literal (cpp_reader *pfile, cpp_token *token, const uchar *base,
1299                 unsigned int len, enum cpp_ttype type)
1300 {
1301   uchar *copy = _cpp_unaligned_alloc (pfile, len + 1);
1302   copy[len] = '\0';
1303   memcpy (copy, base, len);
1304   token->val.str.text = copy;
1305   token->val.str.len = len;
1306   token->type = type;
1307 }
1308
1309 /* Subroutine of lex_raw_string: append the LEN characters at BASE to
1310    the buffer sequence running from *FIRST_BUFF_P to *LAST_BUFF_P, which
1311    is created on first use.  Both pointers are written back.  */
1312 static void
1313 bufring_append (cpp_reader *pfile, const uchar *base, size_t len,
1314                 _cpp_buff **first_buff_p, _cpp_buff **last_buff_p)
1315 {
1316   _cpp_buff *head = *first_buff_p;
1317   _cpp_buff *tail = *last_buff_p;
1318
1319   if (head == NULL)
1320     head = tail = _cpp_get_buff (pfile, len);
1321   else if (BUFF_ROOM (tail) < len)
1322     {
1323       /* Fill the last buffer completely, then carry on in the buffer
1324          that _cpp_append_extend_buff hands back for the remainder.  */
1325       const size_t room = BUFF_ROOM (tail);
1326       memcpy (BUFF_FRONT (tail), base, room);
1327       BUFF_FRONT (tail) += room;
1328       base += room;
1329       len -= room;
1330       tail = _cpp_append_extend_buff (pfile, tail, len);
1331     }
1332   memcpy (BUFF_FRONT (tail), base, len);
1333   BUFF_FRONT (tail) += len;
1334   *first_buff_p = head;
1335   *last_buff_p = tail;
1336 }
1337
1338
1339 /* Returns true if the identifier that starts at BASE is currently the
1340    name of a macro; false if BASE does not start an identifier, or the
1341    lookup finds no node.  This might not work if compiled with
1342    -save-temps, or preprocessed separately from compilation.  */
1343 static bool
1344 is_macro (cpp_reader *pfile, const uchar *base)
1345 {
1346   const uchar *end;
1347   unsigned int hash;
1348   cpp_hashnode *node;
1349
1350   if (! ISIDST (*base))
1351     return false;
1352   /* Hash the identifier while finding where it ends.  */
1353   hash = HT_HASHSTEP (0, *base);
1354   for (end = base + 1; ISIDNUM (*end); end++)
1355     hash = HT_HASHSTEP (hash, *end);
1356   hash = HT_HASHFINISH (hash, end - base);
1357
1358   /* HT_NO_INSERT: the lookup may come back empty, hence the test.  */
1359   node = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table, base,
1360                                             end - base, hash, HT_NO_INSERT));
1361   return node != NULL && node->type == NT_MACRO;
1362 }
1363
1364
1365 /* Lexes a raw string and stores it in TOKEN; nothing is returned.  BASE
1366    points at the literal's first character (any leading 'L', 'u', 'U' or
1367    'u8' included) and CUR just past the 'R' modifier.  The stored spelling
1368    includes double quotes, delimiter string, '(' and ')'.  TOKEN's type is
1369    that of the literal, CPP_OTHER if the delimiter is invalid or the string
1370    is not properly terminated, or CPP_EOF if no fresh line can be read.  The
1371    spelling is NUL-terminated; embedded NULs before that are preserved.  */
1372
1373 static void
1374 lex_raw_string (cpp_reader *pfile, cpp_token *token, const uchar *base,
1375                 const uchar *cur)
1376 {
1377   uchar raw_prefix[17];         /* Delimiter, then the closing '"'.  */
1378   uchar temp_buffer[18];
1379   const uchar *orig_base;
1380   unsigned int raw_prefix_len = 0, raw_suffix_len = 0;
1381   enum raw_str_phase { RAW_STR_PREFIX, RAW_STR, RAW_STR_SUFFIX };
1382   raw_str_phase phase = RAW_STR_PREFIX;
1383   enum cpp_ttype type;
1384   size_t total_len = 0;
1385   /* Index into temp_buffer during phases other than RAW_STR,
1386      during RAW_STR phase 17 to tell BUF_APPEND that nothing should
1387      be appended to temp_buffer.  */
1388   size_t temp_buffer_len = 0;
1389   _cpp_buff *first_buff = NULL, *last_buff = NULL;
1390   size_t raw_prefix_start;
1391   _cpp_line_note *note = &pfile->buffer->notes[pfile->buffer->cur_note];
1392
1393   type = (*base == 'L' ? CPP_WSTRING :
1394           *base == 'U' ? CPP_STRING32 :
1395           *base == 'u' ? (base[1] == '8' ? CPP_UTF8STRING : CPP_STRING16)
1396           : CPP_STRING);
1397
1398 #define BUF_APPEND(STR,LEN)                                     \
1399       do {                                                      \
1400         bufring_append (pfile, (const uchar *)(STR), (LEN),     \
1401                         &first_buff, &last_buff);               \
1402         total_len += (LEN);                                     \
1403         if (__builtin_expect (temp_buffer_len < 17, 0)          \
1404             && (const uchar *)(STR) != base                     \
1405             && (LEN) <= 2)                                      \
1406           {                                                     \
1407             memcpy (temp_buffer + temp_buffer_len,              \
1408                     (const uchar *)(STR), (LEN));               \
1409             temp_buffer_len += (LEN);                           \
1410           }                                                     \
1411       } while (0);
1412
1413   orig_base = base;
1414   ++cur;                        /* Presumably skips the opening '"'.  */
1415   raw_prefix_start = cur - base;        /* Offset of the delimiter.  */
1416   for (;;)
1417     {
1418       cppchar_t c;
1419
1420       /* If we previously performed any trigraph or line splicing
1421          transformations, undo them in between the opening and closing
1422          double quote.  */
1423       while (note->pos < cur)
1424         ++note;
1425       for (; note->pos == cur; ++note)
1426         {
1427           switch (note->type)
1428             {
1429             case '\\':
1430             case ' ':
1431               /* Restore backslash followed by newline.  */
1432               BUF_APPEND (base, cur - base);
1433               base = cur;
1434               BUF_APPEND ("\\", 1);
1435             after_backslash:
1436               if (note->type == ' ')
1437                 {
1438                   /* GNU backslash whitespace newline extension.  FIXME
1439                      could be any sequence of non-vertical space.  When we
1440                      can properly restore any such sequence, we should mark
1441                      this note as handled so _cpp_process_line_notes
1442                      doesn't warn.  */
1443                   BUF_APPEND (" ", 1);
1444                 }
1445
1446               BUF_APPEND ("\n", 1);
1447               break;
1448
1449             case 0:
1450               /* Already handled.  */
1451               break;
1452
1453             default:
1454               if (_cpp_trigraph_map[note->type])
1455                 {
1456                   /* Don't warn about this trigraph in
1457                      _cpp_process_line_notes, since trigraphs show up as
1458                      trigraphs in raw strings.  */
1459                   uchar type = note->type;
1460                   note->type = 0;
1461
1462                   if (!CPP_OPTION (pfile, trigraphs))
1463                     /* If we didn't convert the trigraph in the first
1464                        place, don't do anything now either.  */
1465                     break;
1466
1467                   BUF_APPEND (base, cur - base);
1468                   base = cur;
1469                   BUF_APPEND ("??", 2);
1470
1471                   /* ??/ followed by newline gets two line notes, one for
1472                      the trigraph and one for the backslash/newline.  */
1473                   if (type == '/' && note[1].pos == cur)
1474                     {
1475                       if (note[1].type != '\\'
1476                           && note[1].type != ' ')
1477                         abort ();
1478                       BUF_APPEND ("/", 1);
1479                       ++note;
1480                       goto after_backslash;
1481                     }
1482                   else
1483                     {
1484                       /* Skip the replacement character.  */
1485                       base = ++cur;
1486                       BUF_APPEND (&type, 1);
1487                       c = type;
1488                       goto check_c;
1489                     }
1490                 }
1491               else
1492                 abort ();
1493               break;
1494             }
1495         }
1496       c = *cur++;
1497       if (__builtin_expect (temp_buffer_len < 17, 0))
1498         temp_buffer[temp_buffer_len++] = c;
1499
1500      check_c:
1501       if (phase == RAW_STR_PREFIX)
1502         {
1503           while (raw_prefix_len < temp_buffer_len)
1504             {
1505               raw_prefix[raw_prefix_len] = temp_buffer[raw_prefix_len];
1506               switch (raw_prefix[raw_prefix_len])
1507                 {
1508                 case ' ': case '(': case ')': case '\\': case '\t':
1509                 case '\v': case '\f': case '\n': default:
1510                   break;
1511                 /* Basic source charset except the above chars.  */
1512                 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
1513                 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
1514                 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
1515                 case 's': case 't': case 'u': case 'v': case 'w': case 'x':
1516                 case 'y': case 'z':
1517                 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
1518                 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
1519                 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
1520                 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
1521                 case 'Y': case 'Z':
1522                 case '0': case '1': case '2': case '3': case '4': case '5':
1523                 case '6': case '7': case '8': case '9':
1524                 case '_': case '{': case '}': case '#': case '[': case ']':
1525                 case '<': case '>': case '%': case ':': case ';': case '.':
1526                 case '?': case '*': case '+': case '-': case '/': case '^':
1527                 case '&': case '|': case '~': case '!': case '=': case ',':
1528                 case '"': case '\'':
1529                   if (raw_prefix_len < 16)
1530                     {
1531                       raw_prefix_len++;
1532                       continue;
1533                     }
1534                   break;
1535                 }
1536
1537               if (raw_prefix[raw_prefix_len] != '(')
1538                 {
1539                   int col = CPP_BUF_COLUMN (pfile->buffer, cur) + 1;
1540                   if (raw_prefix_len == 16)
1541                     cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
1542                                          col, "raw string delimiter longer "
1543                                               "than 16 characters");
1544                   else if (raw_prefix[raw_prefix_len] == '\n')
1545                     cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
1546                                          col, "invalid new-line in raw "
1547                                               "string delimiter");
1548                   else
1549                     cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
1550                                          col, "invalid character '%c' in "
1551                                               "raw string delimiter",
1552                                          (int) raw_prefix[raw_prefix_len]);
1553                   pfile->buffer->cur = orig_base + raw_prefix_start - 1;
1554                   create_literal (pfile, token, orig_base,
1555                                   raw_prefix_start - 1, CPP_OTHER);
1556                   if (first_buff)
1557                     _cpp_release_buff (pfile, first_buff);
1558                   return;
1559                 }
1560               raw_prefix[raw_prefix_len] = '"';
1561               phase = RAW_STR;
1562               /* Nothing should be appended to temp_buffer during
1563                  RAW_STR phase.  */
1564               temp_buffer_len = 17;
1565               break;
1566             }
1567           continue;
1568         }
1569       else if (phase == RAW_STR_SUFFIX)
1570         {
1571           while (raw_suffix_len <= raw_prefix_len
1572                  && raw_suffix_len < temp_buffer_len
1573                  && temp_buffer[raw_suffix_len] == raw_prefix[raw_suffix_len])
1574             raw_suffix_len++;
1575           if (raw_suffix_len > raw_prefix_len)
1576             break;              /* Delimiter and '"' matched: done.  */
1577           if (raw_suffix_len == temp_buffer_len)
1578             continue;           /* Only a partial match so far.  */
1579           phase = RAW_STR;
1580           /* Nothing should be appended to temp_buffer during
1581              RAW_STR phase.  */
1582           temp_buffer_len = 17;
1583         }
1584       if (c == ')')
1585         {
1586           phase = RAW_STR_SUFFIX;
1587           raw_suffix_len = 0;
1588           temp_buffer_len = 0;
1589         }
1590       else if (c == '\n')
1591         {
1592           if (pfile->state.in_directive
1593               || (pfile->state.parsing_args
1594                   && pfile->buffer->next_line >= pfile->buffer->rlimit))
1595             {
1596               cur--;
1597               type = CPP_OTHER;
1598               cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, 0,
1599                                    "unterminated raw string");
1600               break;
1601             }
1602
1603           BUF_APPEND (base, cur - base);
1604
1605           if (pfile->buffer->cur < pfile->buffer->rlimit)
1606             CPP_INCREMENT_LINE (pfile, 0);
1607           pfile->buffer->need_line = true;
1608
1609           pfile->buffer->cur = cur-1;
1610           _cpp_process_line_notes (pfile, false);
1611           if (!_cpp_get_fresh_line (pfile))
1612             {
1613               source_location src_loc = token->src_loc;
1614               token->type = CPP_EOF;
1615               /* Tell the compiler the line number of the EOF token.  */
1616               token->src_loc = pfile->line_table->highest_line;
1617               token->flags = BOL;
1618               if (first_buff != NULL)
1619                 _cpp_release_buff (pfile, first_buff);
1620               cpp_error_with_line (pfile, CPP_DL_ERROR, src_loc, 0,
1621                                    "unterminated raw string");
1622               return;
1623             }
1624
1625           cur = base = pfile->buffer->cur;
1626           note = &pfile->buffer->notes[pfile->buffer->cur_note];
1627         }
1628     }
1629
1630   if (CPP_OPTION (pfile, user_literals))
1631     {
1632       /* If a string format macro, say from inttypes.h, is placed touching
1633          a string literal it could be parsed as a C++11 user-defined string
1634          literal thus breaking the program.
1635          Try to identify macros with is_macro. A warning is issued. */
1636       if (is_macro (pfile, cur))
1637         {
1638           /* Raise a warning, but do not consume subsequent tokens.  */
1639           if (CPP_OPTION (pfile, warn_literal_suffix))
1640             cpp_warning_with_line (pfile, CPP_W_LITERAL_SUFFIX,
1641                                    token->src_loc, 0,
1642                                    "invalid suffix on literal; C++11 requires "
1643                                    "a space between literal and string macro");
1644         }
1645       /* Grab user defined literal suffix.  */
1646       else if (ISIDST (*cur))
1647         {
1648           type = cpp_userdef_string_add_type (type);
1649           ++cur;
1650
1651           while (ISIDNUM (*cur))
1652             ++cur;
1653         }
1654     }
1655
1656   pfile->buffer->cur = cur;
1657   if (first_buff == NULL)
1658     create_literal (pfile, token, base, cur - base, type);
1659   else
1660     {
1661       uchar *dest = _cpp_unaligned_alloc (pfile, total_len + (cur - base) + 1);
1662
1663       token->type = type;
1664       token->val.str.len = total_len + (cur - base);
1665       token->val.str.text = dest;
1666       last_buff = first_buff;
1667       while (last_buff != NULL)
1668         {
1669           memcpy (dest, last_buff->base,
1670                   BUFF_FRONT (last_buff) - last_buff->base);
1671           dest += BUFF_FRONT (last_buff) - last_buff->base;
1672           last_buff = last_buff->next;
1673         }
1674       _cpp_release_buff (pfile, first_buff);
1675       memcpy (dest, base, cur - base);
1676       dest[cur - base] = '\0';
1677     }
1678 }
1679
1680 /* Lexes a string, character constant, or angle-bracketed header file
1681    name.  The stored string contains the spelling, including opening
1682    quote and any leading 'L', 'u', 'U' or 'u8' and optional
1683    'R' modifier.  It returns the type of the literal, or CPP_OTHER
1684    if it was not properly terminated, or CPP_LESS for an unterminated
1685    header name which must be relexed as normal tokens.
1686
1687    The spelling is NUL-terminated, but it is not guaranteed that this
1688    is the first NUL since embedded NULs are preserved.  */
1689 static void
1690 lex_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
1691 {
1692   bool saw_NUL = false;
1693   const uchar *cur;
1694   cppchar_t terminator;
1695   enum cpp_ttype type;
1696
1697   cur = base;
1698   terminator = *cur++;
1699   if (terminator == 'L' || terminator == 'U')
1700     terminator = *cur++;
1701   else if (terminator == 'u')
1702     {
1703       terminator = *cur++;
1704       if (terminator == '8')
1705         terminator = *cur++;
1706     }
1707   if (terminator == 'R')
1708     {
1709       lex_raw_string (pfile, token, base, cur);
1710       return;
1711     }
1712   if (terminator == '"')
1713     type = (*base == 'L' ? CPP_WSTRING :
1714             *base == 'U' ? CPP_STRING32 :
1715             *base == 'u' ? (base[1] == '8' ? CPP_UTF8STRING : CPP_STRING16)
1716                          : CPP_STRING);
1717   else if (terminator == '\'')
1718     type = (*base == 'L' ? CPP_WCHAR :
1719             *base == 'U' ? CPP_CHAR32 :
1720             *base == 'u' ? CPP_CHAR16 : CPP_CHAR);
1721   else
1722     terminator = '>', type = CPP_HEADER_NAME;
1723
1724   for (;;)
1725     {
1726       cppchar_t c = *cur++;
1727
1728       /* In #include-style directives, terminators are not escapable.  */
1729       if (c == '\\' && !pfile->state.angled_headers && *cur != '\n')
1730         cur++;
1731       else if (c == terminator)
1732         break;
1733       else if (c == '\n')
1734         {
1735           cur--;
1736           /* Unmatched quotes always yield undefined behavior, but
1737              greedy lexing means that what appears to be an unterminated
1738              header name may actually be a legitimate sequence of tokens.  */
1739           if (terminator == '>')
1740             {
1741               token->type = CPP_LESS;
1742               return;
1743             }
1744           type = CPP_OTHER;
1745           break;
1746         }
1747       else if (c == '\0')
1748         saw_NUL = true;
1749     }
1750
1751   if (saw_NUL && !pfile->state.skipping)
1752     cpp_error (pfile, CPP_DL_WARNING,
1753                "null character(s) preserved in literal");
1754
1755   if (type == CPP_OTHER && CPP_OPTION (pfile, lang) != CLK_ASM)
1756     cpp_error (pfile, CPP_DL_PEDWARN, "missing terminating %c character",
1757                (int) terminator);
1758
1759   if (CPP_OPTION (pfile, user_literals))
1760     {
1761       /* If a string format macro, say from inttypes.h, is placed touching
1762          a string literal it could be parsed as a C++11 user-defined string
1763          literal thus breaking the program.
1764          Try to identify macros with is_macro. A warning is issued. */
1765       if (is_macro (pfile, cur))
1766         {
1767           /* Raise a warning, but do not consume subsequent tokens.  */
1768           if (CPP_OPTION (pfile, warn_literal_suffix))
1769             cpp_warning_with_line (pfile, CPP_W_LITERAL_SUFFIX,
1770                                    token->src_loc, 0,
1771                                    "invalid suffix on literal; C++11 requires "
1772                                    "a space between literal and string macro");
1773         }
1774       /* Grab user defined literal suffix.  */
1775       else if (ISIDST (*cur))
1776         {
1777           type = cpp_userdef_char_add_type (type);
1778           type = cpp_userdef_string_add_type (type);
1779           ++cur;
1780
1781           while (ISIDNUM (*cur))
1782             ++cur;
1783         }
1784     }
1785
1786   pfile->buffer->cur = cur;
1787   create_literal (pfile, token, base, cur - base, type);
1788 }
1789
1790 /* Return the comment table. The client may not make any assumption
1791    about the ordering of the table.  */
1792 cpp_comment_table *
1793 cpp_get_comments (cpp_reader *pfile)
1794 {
1795   return &pfile->comments;
1796 }
1797
1798 /* Append a comment to the end of the comment table. */
1799 static void
1800 store_comment (cpp_reader *pfile, cpp_token *token)
1801 {
1802   int len;
1803
1804   if (pfile->comments.allocated == 0)
1805     {
1806       pfile->comments.allocated = 256;
1807       pfile->comments.entries = (cpp_comment *) xmalloc
1808         (pfile->comments.allocated * sizeof (cpp_comment));
1809     }
1810
1811   if (pfile->comments.count == pfile->comments.allocated)
1812     {
1813       pfile->comments.allocated *= 2;
1814       pfile->comments.entries = (cpp_comment *) xrealloc
1815         (pfile->comments.entries,
1816          pfile->comments.allocated * sizeof (cpp_comment));
1817     }
1818
1819   len = token->val.str.len;
1820
1821   /* Copy comment. Note, token may not be NULL terminated. */
1822   pfile->comments.entries[pfile->comments.count].comment =
1823     (char *) xmalloc (sizeof (char) * (len + 1));
1824   memcpy (pfile->comments.entries[pfile->comments.count].comment,
1825           token->val.str.text, len);
1826   pfile->comments.entries[pfile->comments.count].comment[len] = '\0';
1827
1828   /* Set source location. */
1829   pfile->comments.entries[pfile->comments.count].sloc = token->src_loc;
1830
1831   /* Increment the count of entries in the comment table. */
1832   pfile->comments.count++;
1833 }
1834
1835 /* The stored comment includes the comment start and any terminator.  */
1836 static void
1837 save_comment (cpp_reader *pfile, cpp_token *token, const unsigned char *from,
1838               cppchar_t type)
1839 {
1840   unsigned char *buffer;
1841   unsigned int len, clen, i;
1842
1843   len = pfile->buffer->cur - from + 1; /* + 1 for the initial '/'.  */
1844
1845   /* C++ comments probably (not definitely) have moved past a new
1846      line, which we don't want to save in the comment.  */
1847   if (is_vspace (pfile->buffer->cur[-1]))
1848     len--;
1849
1850   /* If we are currently in a directive or in argument parsing, then
1851      we need to store all C++ comments as C comments internally, and
1852      so we need to allocate a little extra space in that case.
1853
1854      Note that the only time we encounter a directive here is
1855      when we are saving comments in a "#define".  */
1856   clen = ((pfile->state.in_directive || pfile->state.parsing_args)
1857           && type == '/') ? len + 2 : len;
1858
1859   buffer = _cpp_unaligned_alloc (pfile, clen);
1860
1861   token->type = CPP_COMMENT;
1862   token->val.str.len = clen;
1863   token->val.str.text = buffer;
1864
1865   buffer[0] = '/';
1866   memcpy (buffer + 1, from, len - 1);
1867
1868   /* Finish conversion to a C comment, if necessary.  */
1869   if ((pfile->state.in_directive || pfile->state.parsing_args) && type == '/')
1870     {
1871       buffer[1] = '*';
1872       buffer[clen - 2] = '*';
1873       buffer[clen - 1] = '/';
1874       /* As there can be in a C++ comments illegal sequences for C comments
1875          we need to filter them out.  */
1876       for (i = 2; i < (clen - 2); i++)
1877         if (buffer[i] == '/' && (buffer[i - 1] == '*' || buffer[i + 1] == '*'))
1878           buffer[i] = '|';
1879     }
1880
1881   /* Finally store this comment for use by clients of libcpp. */
1882   store_comment (pfile, token);
1883 }
1884
1885 /* Allocate COUNT tokens for RUN.  */
1886 void
1887 _cpp_init_tokenrun (tokenrun *run, unsigned int count)
1888 {
1889   run->base = XNEWVEC (cpp_token, count);
1890   run->limit = run->base + count;
1891   run->next = NULL;
1892 }
1893
1894 /* Returns the next tokenrun, or creates one if there is none.  */
1895 static tokenrun *
1896 next_tokenrun (tokenrun *run)
1897 {
1898   if (run->next == NULL)
1899     {
1900       run->next = XNEW (tokenrun);
1901       run->next->prev = run;
1902       _cpp_init_tokenrun (run->next, 250);
1903     }
1904
1905   return run->next;
1906 }
1907
1908 /* Return the number of not yet processed token in a given
1909    context.  */
1910 int
1911 _cpp_remaining_tokens_num_in_context (cpp_context *context)
1912 {
1913   if (context->tokens_kind == TOKENS_KIND_DIRECT)
1914     return (LAST (context).token - FIRST (context).token);
1915   else if (context->tokens_kind == TOKENS_KIND_INDIRECT
1916            || context->tokens_kind == TOKENS_KIND_EXTENDED)
1917     return (LAST (context).ptoken - FIRST (context).ptoken);
1918   else
1919       abort ();
1920 }
1921
1922 /* Returns the token present at index INDEX in a given context.  If
1923    INDEX is zero, the next token to be processed is returned.  */
1924 static const cpp_token*
1925 _cpp_token_from_context_at (cpp_context *context, int index)
1926 {
1927   if (context->tokens_kind == TOKENS_KIND_DIRECT)
1928     return &(FIRST (context).token[index]);
1929   else if (context->tokens_kind == TOKENS_KIND_INDIRECT
1930            || context->tokens_kind == TOKENS_KIND_EXTENDED)
1931     return FIRST (context).ptoken[index];
1932  else
1933    abort ();
1934 }
1935
1936 /* Look ahead in the input stream.  */
1937 const cpp_token *
1938 cpp_peek_token (cpp_reader *pfile, int index)
1939 {
1940   cpp_context *context = pfile->context;
1941   const cpp_token *peektok;
1942   int count;
1943
1944   /* First, scan through any pending cpp_context objects.  */
1945   while (context->prev)
1946     {
1947       ptrdiff_t sz = _cpp_remaining_tokens_num_in_context (context);
1948
1949       if (index < (int) sz)
1950         return _cpp_token_from_context_at (context, index);
1951       index -= (int) sz;
1952       context = context->prev;
1953     }
1954
1955   /* We will have to read some new tokens after all (and do so
1956      without invalidating preceding tokens).  */
1957   count = index;
1958   pfile->keep_tokens++;
1959
1960   do
1961     {
1962       peektok = _cpp_lex_token (pfile);
1963       if (peektok->type == CPP_EOF)
1964         return peektok;
1965     }
1966   while (index--);
1967
1968   _cpp_backup_tokens_direct (pfile, count + 1);
1969   pfile->keep_tokens--;
1970
1971   return peektok;
1972 }
1973
1974 /* Allocate a single token that is invalidated at the same time as the
1975    rest of the tokens on the line.  Has its line and col set to the
1976    same as the last lexed token, so that diagnostics appear in the
1977    right place.  */
1978 cpp_token *
1979 _cpp_temp_token (cpp_reader *pfile)
1980 {
1981   cpp_token *old, *result;
1982   ptrdiff_t sz = pfile->cur_run->limit - pfile->cur_token;
1983   ptrdiff_t la = (ptrdiff_t) pfile->lookaheads;
1984
1985   old = pfile->cur_token - 1;
1986   /* Any pre-existing lookaheads must not be clobbered.  */
1987   if (la)
1988     {
1989       if (sz <= la)
1990         {
1991           tokenrun *next = next_tokenrun (pfile->cur_run);
1992
1993           if (sz < la)
1994             memmove (next->base + 1, next->base,
1995                      (la - sz) * sizeof (cpp_token));
1996
1997           next->base[0] = pfile->cur_run->limit[-1];
1998         }
1999
2000       if (sz > 1)
2001         memmove (pfile->cur_token + 1, pfile->cur_token,
2002                  MIN (la, sz - 1) * sizeof (cpp_token));
2003     }
2004
2005   if (!sz && pfile->cur_token == pfile->cur_run->limit)
2006     {
2007       pfile->cur_run = next_tokenrun (pfile->cur_run);
2008       pfile->cur_token = pfile->cur_run->base;
2009     }
2010
2011   result = pfile->cur_token++;
2012   result->src_loc = old->src_loc;
2013   return result;
2014 }
2015
2016 /* Lex a token into RESULT (external interface).  Takes care of issues
2017    like directive handling, token lookahead, multiple include
2018    optimization and skipping.  */
2019 const cpp_token *
2020 _cpp_lex_token (cpp_reader *pfile)
2021 {
2022   cpp_token *result;
2023
2024   for (;;)
2025     {
2026       if (pfile->cur_token == pfile->cur_run->limit)
2027         {
2028           pfile->cur_run = next_tokenrun (pfile->cur_run);
2029           pfile->cur_token = pfile->cur_run->base;
2030         }
2031       /* We assume that the current token is somewhere in the current
2032          run.  */
2033       if (pfile->cur_token < pfile->cur_run->base
2034           || pfile->cur_token >= pfile->cur_run->limit)
2035         abort ();
2036
2037       if (pfile->lookaheads)
2038         {
2039           pfile->lookaheads--;
2040           result = pfile->cur_token++;
2041         }
2042       else
2043         result = _cpp_lex_direct (pfile);
2044
2045       if (result->flags & BOL)
2046         {
2047           /* Is this a directive.  If _cpp_handle_directive returns
2048              false, it is an assembler #.  */
2049           if (result->type == CPP_HASH
2050               /* 6.10.3 p 11: Directives in a list of macro arguments
2051                  gives undefined behavior.  This implementation
2052                  handles the directive as normal.  */
2053               && pfile->state.parsing_args != 1)
2054             {
2055               if (_cpp_handle_directive (pfile, result->flags & PREV_WHITE))
2056                 {
2057                   if (pfile->directive_result.type == CPP_PADDING)
2058                     continue;
2059                   result = &pfile->directive_result;
2060                 }
2061             }
2062           else if (pfile->state.in_deferred_pragma)
2063             result = &pfile->directive_result;
2064
2065           if (pfile->cb.line_change && !pfile->state.skipping)
2066             pfile->cb.line_change (pfile, result, pfile->state.parsing_args);
2067         }
2068
2069       /* We don't skip tokens in directives.  */
2070       if (pfile->state.in_directive || pfile->state.in_deferred_pragma)
2071         break;
2072
2073       /* Outside a directive, invalidate controlling macros.  At file
2074          EOF, _cpp_lex_direct takes care of popping the buffer, so we never
2075          get here and MI optimization works.  */
2076       pfile->mi_valid = false;
2077
2078       if (!pfile->state.skipping || result->type == CPP_EOF)
2079         break;
2080     }
2081
2082   return result;
2083 }
2084
2085 /* Returns true if a fresh line has been loaded.  */
2086 bool
2087 _cpp_get_fresh_line (cpp_reader *pfile)
2088 {
2089   int return_at_eof;
2090
2091   /* We can't get a new line until we leave the current directive.  */
2092   if (pfile->state.in_directive)
2093     return false;
2094
2095   for (;;)
2096     {
2097       cpp_buffer *buffer = pfile->buffer;
2098
2099       if (!buffer->need_line)
2100         return true;
2101
2102       if (buffer->next_line < buffer->rlimit)
2103         {
2104           _cpp_clean_line (pfile);
2105           return true;
2106         }
2107
2108       /* First, get out of parsing arguments state.  */
2109       if (pfile->state.parsing_args)
2110         return false;
2111
2112       /* End of buffer.  Non-empty files should end in a newline.  */
2113       if (buffer->buf != buffer->rlimit
2114           && buffer->next_line > buffer->rlimit
2115           && !buffer->from_stage3)
2116         {
2117           /* Clip to buffer size.  */
2118           buffer->next_line = buffer->rlimit;
2119         }
2120
2121       return_at_eof = buffer->return_at_eof;
2122       _cpp_pop_buffer (pfile);
2123       if (pfile->buffer == NULL || return_at_eof)
2124         return false;
2125     }
2126 }
2127
/* Helper for two-character operators.  If the next input character is
   CHAR, consume it and give the token the type THEN_TYPE; otherwise
   consume nothing and give it the type ELSE_TYPE.  Expects `buffer'
   and `result' to be in scope at the point of use.  */
#define IF_NEXT_IS(CHAR, THEN_TYPE, ELSE_TYPE)          \
  do                                                    \
    {                                                   \
      if (*buffer->cur == CHAR)                         \
        {                                               \
          buffer->cur++;                                \
          result->type = THEN_TYPE;                     \
        }                                               \
      else                                              \
        result->type = ELSE_TYPE;                       \
    }                                                   \
  while (0)
2136
2137 /* Lex a token into pfile->cur_token, which is also incremented, to
2138    get diagnostics pointing to the correct location.
2139
2140    Does not handle issues such as token lookahead, multiple-include
2141    optimization, directives, skipping etc.  This function is only
2142    suitable for use by _cpp_lex_token, and in special cases like
2143    lex_expansion_token which doesn't care for any of these issues.
2144
2145    When meeting a newline, returns CPP_EOF if parsing a directive,
2146    otherwise returns to the start of the token buffer if permissible.
2147    Returns a pointer to the lexed token.

        The body is driven by three labels:

        fresh_line          clears the token's flags and, if the buffer
                            needs a new line, fetches one.  This is where
                            CPP_PRAGMA_EOL and CPP_EOF are produced.
        update_tokens_line  re-entered after a comment that is not being
                            saved; refreshes the token's source location.
        skipped_white       re-entered after horizontal whitespace;
                            processes pending line notes and reads the
                            next character.  */
2148 cpp_token *
2149 _cpp_lex_direct (cpp_reader *pfile)
2150 {
2151   cppchar_t c;
2152   cpp_buffer *buffer;
2153   const unsigned char *comment_start;
       /* Claim the next slot of the current token run for the result.  */
2154   cpp_token *result = pfile->cur_token++;
2155
2156  fresh_line:
2157   result->flags = 0;
2158   buffer = pfile->buffer;
2159   if (buffer->need_line)
2160     {
           /* The line holding a deferred pragma has ended.  */
2161       if (pfile->state.in_deferred_pragma)
2162         {
2163           result->type = CPP_PRAGMA_EOL;
2164           pfile->state.in_deferred_pragma = false;
2165           if (!pfile->state.pragma_allow_expansion)
2166             pfile->state.prevent_expansion--;
2167           return result;
2168         }
2169       if (!_cpp_get_fresh_line (pfile))
2170         {
2171           result->type = CPP_EOF;
2172           if (!pfile->state.in_directive)
2173             {
2174               /* Tell the compiler the line number of the EOF token.  */
2175               result->src_loc = pfile->line_table->highest_line;
2176               result->flags = BOL;
2177             }
2178           return result;
2179         }
           /* No caller asked for earlier tokens to be kept, so restart
              at the beginning of the base token run.  */
2180       if (!pfile->keep_tokens)
2181         {
2182           pfile->cur_run = &pfile->base_run;
2183           result = pfile->base_run.base;
2184           pfile->cur_token = result + 1;
2185         }
2186       result->flags = BOL;
2187       if (pfile->state.parsing_args == 2)
2188         result->flags |= PREV_WHITE;
2189     }
2190   buffer = pfile->buffer;
2191  update_tokens_line:
2192   result->src_loc = pfile->line_table->highest_line;
2193
2194  skipped_white:
2195   if (buffer->cur >= buffer->notes[buffer->cur_note].pos
2196       && !pfile->overlaid_buffer)
2197     {
2198       _cpp_process_line_notes (pfile, false);
2199       result->src_loc = pfile->line_table->highest_line;
2200     }
2201   c = *buffer->cur++;
2202
       /* The token's location is the column of the character just read,
          unless a forced location is in effect.  */
2203   if (pfile->forced_token_location_p)
2204     result->src_loc = *pfile->forced_token_location_p;
2205   else
2206     result->src_loc = linemap_position_for_column (pfile->line_table,
2207                                           CPP_BUF_COLUMN (buffer, buffer->cur));
2208
2209   switch (c)
2210     {
2211     case ' ': case '\t': case '\f': case '\v': case '\0':
2212       result->flags |= PREV_WHITE;
2213       skip_whitespace (pfile, c);
2214       goto skipped_white;
2215
2216     case '\n':
           /* End of the current line: ask for a new one and start over.  */
2217       if (buffer->cur < buffer->rlimit)
2218         CPP_INCREMENT_LINE (pfile, 0);
2219       buffer->need_line = true;
2220       goto fresh_line;
2221
2222     case '0': case '1': case '2': case '3': case '4':
2223     case '5': case '6': case '7': case '8': case '9':
2224       {
2225         struct normalize_state nst = INITIAL_NORMALIZE_STATE;
2226         result->type = CPP_NUMBER;
2227         lex_number (pfile, &result->val.str, &nst);
2228         warn_about_normalization (pfile, result, &nst);
2229         break;
2230       }
2231
2232     case 'L':
2233     case 'u':
2234     case 'U':
2235     case 'R':
2236       /* 'L', 'u', 'U', 'u8' or 'R' may introduce wide characters,
2237          wide strings or raw strings.  */
2238       if (c == 'L' || CPP_OPTION (pfile, rliterals)
2239           || (c != 'R' && CPP_OPTION (pfile, uliterals)))
2240         {
2241           if ((*buffer->cur == '\'' && c != 'R')
2242               || *buffer->cur == '"'
2243               || (*buffer->cur == 'R'
2244                   && c != 'R'
2245                   && buffer->cur[1] == '"'
2246                   && CPP_OPTION (pfile, rliterals))
2247               || (*buffer->cur == '8'
2248                   && c == 'u'
2249                   && (buffer->cur[1] == '"'
2250                       || (buffer->cur[1] == 'R' && buffer->cur[2] == '"'
2251                           && CPP_OPTION (pfile, rliterals)))))
2252             {
2253               lex_string (pfile, result, buffer->cur - 1);
2254               break;
2255             }
2256         }
2257       /* Fall through.  */
2258
2259     case '_':
2260     case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
2261     case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
2262     case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
2263     case 's': case 't':           case 'v': case 'w': case 'x':
2264     case 'y': case 'z':
2265     case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
2266     case 'G': case 'H': case 'I': case 'J': case 'K':
2267     case 'M': case 'N': case 'O': case 'P': case 'Q':
2268     case 'S': case 'T':           case 'V': case 'W': case 'X':
2269     case 'Y': case 'Z':
2270       result->type = CPP_NAME;
2271       {
2272         struct normalize_state nst = INITIAL_NORMALIZE_STATE;
2273         result->val.node.node = lex_identifier (pfile, buffer->cur - 1, false,
2274                                                 &nst);
2275         warn_about_normalization (pfile, result, &nst);
2276       }
2277
2278       /* Convert named operators to their proper types.  */
2279       if (result->val.node.node->flags & NODE_OPERATOR)
2280         {
2281           result->flags |= NAMED_OP;
2282           result->type = (enum cpp_ttype) result->val.node.node->directive_index;
2283         }
2284       break;
2285
2286     case '\'':
2287     case '"':
2288       lex_string (pfile, result, buffer->cur - 1);
2289       break;
2290
2291     case '/':
2292       /* A potential block or line comment.  */
2293       comment_start = buffer->cur;
2294       c = *buffer->cur;
2295
2296       if (c == '*')
2297         {
2298           if (_cpp_skip_block_comment (pfile))
2299             cpp_error (pfile, CPP_DL_ERROR, "unterminated comment");
2300         }
2301       else if (c == '/' && (CPP_OPTION (pfile, cplusplus_comments)
2302                             || cpp_in_system_header (pfile)))
2303         {
2304           /* Warn about comments only if pedantically GNUC89, and not
2305              in system headers.  */
2306           if (CPP_OPTION (pfile, lang) == CLK_GNUC89 && CPP_PEDANTIC (pfile)
2307               && ! buffer->warned_cplusplus_comments)
2308             {
2309               cpp_error (pfile, CPP_DL_PEDWARN,
2310                          "C++ style comments are not allowed in ISO C90");
2311               cpp_error (pfile, CPP_DL_PEDWARN,
2312                          "(this will be reported only once per input file)");
2313               buffer->warned_cplusplus_comments = 1;
2314             }
2315
2316           if (skip_line_comment (pfile) && CPP_OPTION (pfile, warn_comments))
2317             cpp_warning (pfile, CPP_W_COMMENTS, "multi-line comment");
2318         }
2319       else if (c == '=')
2320         {
2321           buffer->cur++;
2322           result->type = CPP_DIV_EQ;
2323           break;
2324         }
2325       else
2326         {
2327           result->type = CPP_DIV;
2328           break;
2329         }
2330
           /* Only the two comment branches above reach this point.  A
              comment that is not saved counts as whitespace in front of
              the next token, which is lexed right away.  */
2331       if (!pfile->state.save_comments)
2332         {
2333           result->flags |= PREV_WHITE;
2334           goto update_tokens_line;
2335         }
2336
2337       /* Save the comment as a token in its own right.  */
2338       save_comment (pfile, result, comment_start, c);
2339       break;
2340
2341     case '<':
2342       if (pfile->state.angled_headers)
2343         {
               /* lex_string sets the type to CPP_LESS for an unterminated
                  header name; in that case the '<' is lexed below as an
                  ordinary operator.  */
2344           lex_string (pfile, result, buffer->cur - 1);
2345           if (result->type != CPP_LESS)
2346             break;
2347         }
2348
2349       result->type = CPP_LESS;
2350       if (*buffer->cur == '=')
2351         buffer->cur++, result->type = CPP_LESS_EQ;
2352       else if (*buffer->cur == '<')
2353         {
2354           buffer->cur++;
2355           IF_NEXT_IS ('=', CPP_LSHIFT_EQ, CPP_LSHIFT);
2356         }
2357       else if (CPP_OPTION (pfile, digraphs))
2358         {
2359           if (*buffer->cur == ':')
2360             {
2361               /* C++11 [2.5/3 lex.pptoken], "Otherwise, if the next
2362                  three characters are <:: and the subsequent character
2363                  is neither : nor >, the < is treated as a preprocessor
2364                  token by itself".  */
2365               if (CPP_OPTION (pfile, cplusplus)
2366                   && CPP_OPTION (pfile, lang) != CLK_CXX98
2367                   && CPP_OPTION (pfile, lang) != CLK_GNUCXX
2368                   && buffer->cur[1] == ':'
2369                   && buffer->cur[2] != ':' && buffer->cur[2] != '>')
2370                 break;
2371
2372               buffer->cur++;
2373               result->flags |= DIGRAPH;
2374               result->type = CPP_OPEN_SQUARE;
2375             }
2376           else if (*buffer->cur == '%')
2377             {
2378               buffer->cur++;
2379               result->flags |= DIGRAPH;
2380               result->type = CPP_OPEN_BRACE;
2381             }
2382         }
2383       break;
2384
2385     case '>':
2386       result->type = CPP_GREATER;
2387       if (*buffer->cur == '=')
2388         buffer->cur++, result->type = CPP_GREATER_EQ;
2389       else if (*buffer->cur == '>')
2390         {
2391           buffer->cur++;
2392           IF_NEXT_IS ('=', CPP_RSHIFT_EQ, CPP_RSHIFT);
2393         }
2394       break;
2395
2396     case '%':
2397       result->type = CPP_MOD;
2398       if (*buffer->cur == '=')
2399         buffer->cur++, result->type = CPP_MOD_EQ;
2400       else if (CPP_OPTION (pfile, digraphs))
2401         {
2402           if (*buffer->cur == ':')
2403             {
                   /* "%:" is the digraph for '#', "%:%:" the one for "##".  */
2404               buffer->cur++;
2405               result->flags |= DIGRAPH;
2406               result->type = CPP_HASH;
2407               if (*buffer->cur == '%' && buffer->cur[1] == ':')
2408                 buffer->cur += 2, result->type = CPP_PASTE, result->val.token_no = 0;
2409             }
2410           else if (*buffer->cur == '>')
2411             {
2412               buffer->cur++;
2413               result->flags |= DIGRAPH;
2414               result->type = CPP_CLOSE_BRACE;
2415             }
2416         }
2417       break;
2418
2419     case '.':
2420       result->type = CPP_DOT;
2421       if (ISDIGIT (*buffer->cur))
2422         {
2423           struct normalize_state nst = INITIAL_NORMALIZE_STATE;
2424           result->type = CPP_NUMBER;
2425           lex_number (pfile, &result->val.str, &nst);
2426           warn_about_normalization (pfile, result, &nst);
2427         }
2428       else if (*buffer->cur == '.' && buffer->cur[1] == '.')
2429         buffer->cur += 2, result->type = CPP_ELLIPSIS;
2430       else if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
2431         buffer->cur++, result->type = CPP_DOT_STAR;
2432       break;
2433
2434     case '+':
2435       result->type = CPP_PLUS;
2436       if (*buffer->cur == '+')
2437         buffer->cur++, result->type = CPP_PLUS_PLUS;
2438       else if (*buffer->cur == '=')
2439         buffer->cur++, result->type = CPP_PLUS_EQ;
2440       break;
2441
2442     case '-':
2443       result->type = CPP_MINUS;
2444       if (*buffer->cur == '>')
2445         {
2446           buffer->cur++;
2447           result->type = CPP_DEREF;
2448           if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
2449             buffer->cur++, result->type = CPP_DEREF_STAR;
2450         }
2451       else if (*buffer->cur == '-')
2452         buffer->cur++, result->type = CPP_MINUS_MINUS;
2453       else if (*buffer->cur == '=')
2454         buffer->cur++, result->type = CPP_MINUS_EQ;
2455       break;
2456
2457     case '&':
2458       result->type = CPP_AND;
2459       if (*buffer->cur == '&')
2460         buffer->cur++, result->type = CPP_AND_AND;
2461       else if (*buffer->cur == '=')
2462         buffer->cur++, result->type = CPP_AND_EQ;
2463       break;
2464
2465     case '|':
2466       result->type = CPP_OR;
2467       if (*buffer->cur == '|')
2468         buffer->cur++, result->type = CPP_OR_OR;
2469       else if (*buffer->cur == '=')
2470         buffer->cur++, result->type = CPP_OR_EQ;
2471       break;
2472
2473     case ':':
2474       result->type = CPP_COLON;
2475       if (*buffer->cur == ':' && CPP_OPTION (pfile, cplusplus))
2476         buffer->cur++, result->type = CPP_SCOPE;
2477       else if (*buffer->cur == '>' && CPP_OPTION (pfile, digraphs))
2478         {
2479           buffer->cur++;
2480           result->flags |= DIGRAPH;
2481           result->type = CPP_CLOSE_SQUARE;
2482         }
2483       break;
2484
2485     case '*': IF_NEXT_IS ('=', CPP_MULT_EQ, CPP_MULT); break;
2486     case '=': IF_NEXT_IS ('=', CPP_EQ_EQ, CPP_EQ); break;
2487     case '!': IF_NEXT_IS ('=', CPP_NOT_EQ, CPP_NOT); break;
2488     case '^': IF_NEXT_IS ('=', CPP_XOR_EQ, CPP_XOR); break;
2489     case '#': IF_NEXT_IS ('#', CPP_PASTE, CPP_HASH); result->val.token_no = 0; break;
2490
2491     case '?': result->type = CPP_QUERY; break;
2492     case '~': result->type = CPP_COMPL; break;
2493     case ',': result->type = CPP_COMMA; break;
2494     case '(': result->type = CPP_OPEN_PAREN; break;
2495     case ')': result->type = CPP_CLOSE_PAREN; break;
2496     case '[': result->type = CPP_OPEN_SQUARE; break;
2497     case ']': result->type = CPP_CLOSE_SQUARE; break;
2498     case '{': result->type = CPP_OPEN_BRACE; break;
2499     case '}': result->type = CPP_CLOSE_BRACE; break;
2500     case ';': result->type = CPP_SEMICOLON; break;
2501
2502       /* @ is a punctuator in Objective-C.  */
2503     case '@': result->type = CPP_ATSIGN; break;
2504
2505     case '$':
2506     case '\\':
2507       {
             /* Step back onto the character so that forms_identifier_p
                can examine it.  */
2508         const uchar *base = --buffer->cur;
2509         struct normalize_state nst = INITIAL_NORMALIZE_STATE;
2510
2511         if (forms_identifier_p (pfile, true, &nst))
2512           {
2513             result->type = CPP_NAME;
2514             result->val.node.node = lex_identifier (pfile, base, true, &nst);
2515             warn_about_normalization (pfile, result, &nst);
2516             break;
2517           }
             /* Not the start of an identifier: consume the character
                again and emit it as a stray character below.  */
2518         buffer->cur++;
2519       }
           /* Fall through.  */
2520
2521     default:
2522       create_literal (pfile, result, buffer->cur - 1, 1, CPP_OTHER);
2523       break;
2524     }
2525
2526   return result;
2527 }
2528
2529 /* An upper bound on the number of bytes needed to spell TOKEN.
2530    Does not include preceding whitespace.  */
2531 unsigned int
2532 cpp_token_len (const cpp_token *token)
2533 {
2534   unsigned int len;
2535
2536   switch (TOKEN_SPELL (token))
2537     {
2538     default:            len = 6;                                break;
2539     case SPELL_LITERAL: len = token->val.str.len;               break;
2540     case SPELL_IDENT:   len = NODE_LEN (token->val.node.node) * 10;     break;
2541     }
2542
2543   return len;
2544 }
2545
/* Decode the UTF-8 sequence that starts at NAME and write it to BUFFER
   as a universal character name of the form \UXXXXXXXX, using
   lower-case hexadecimal digits.  Exactly 10 bytes are written and no
   terminating NUL is added.  Returns the number of bytes the sequence
   occupies in NAME.  Aborts on a malformed continuation byte.  */

static size_t
utf8_to_ucn (unsigned char *buffer, const unsigned char *name)
{
  static const char hex_digits[] = "0123456789abcdef";
  const unsigned char *src = name;
  unsigned char *out = buffer;
  unsigned long code_point;
  unsigned lead;
  int seq_len = 0;
  int seen;
  int shift;

  /* The number of leading one bits in the first byte is the length of
     the whole sequence.  */
  lead = *src;
  while (lead & 0x80)
    {
      seq_len++;
      lead <<= 1;
    }

  /* What remains of the first byte holds the topmost payload bits.  */
  code_point = *src & (0x7F >> seq_len);

  /* Every continuation byte has the form 10xxxxxx and contributes six
     more bits.  */
  for (seen = 1; seen < seq_len; seen++)
    {
      const unsigned char cont = *++src;

      /* Ill-formed UTF-8.  */
      if ((cont & 0xC0) != 0x80)
        abort ();

      code_point = (code_point << 6) | (cont & 0x3F);
    }

  *out++ = '\\';
  *out++ = 'U';
  for (shift = 28; shift >= 0; shift -= 4)
    *out++ = hex_digits[(code_point >> shift) & 0xF];

  return seq_len;
}
2579
2580 /* Given a token TYPE corresponding to a digraph, return a pointer to
2581    the spelling of the digraph.  */
2582 static const unsigned char *
2583 cpp_digraph2name (enum cpp_ttype type)
2584 {
2585   return digraph_spellings[(int) type - (int) CPP_FIRST_DIGRAPH];
2586 }
2587
2588 /* Write the spelling of a token TOKEN to BUFFER.  The buffer must
2589    already contain the enough space to hold the token's spelling.
2590    Returns a pointer to the character after the last character written.
2591    FORSTRING is true if this is to be the spelling after translation
2592    phase 1 (this is different for UCNs).
2593    FIXME: Would be nice if we didn't need the PFILE argument.  */
2594 unsigned char *
2595 cpp_spell_token (cpp_reader *pfile, const cpp_token *token,
2596                  unsigned char *buffer, bool forstring)
2597 {
2598   switch (TOKEN_SPELL (token))
2599     {
2600     case SPELL_OPERATOR:
2601       {
2602         const unsigned char *spelling;
2603         unsigned char c;
2604
2605         if (token->flags & DIGRAPH)
2606           spelling = cpp_digraph2name (token->type);
2607         else if (token->flags & NAMED_OP)
2608           goto spell_ident;
2609         else
2610           spelling = TOKEN_NAME (token);
2611
2612         while ((c = *spelling++) != '\0')
2613           *buffer++ = c;
2614       }
2615       break;
2616
2617     spell_ident:
2618     case SPELL_IDENT:
2619       if (forstring)
2620         {
2621           memcpy (buffer, NODE_NAME (token->val.node.node),
2622                   NODE_LEN (token->val.node.node));
2623           buffer += NODE_LEN (token->val.node.node);
2624         }
2625       else
2626         {
2627           size_t i;
2628           const unsigned char * name = NODE_NAME (token->val.node.node);
2629
2630           for (i = 0; i < NODE_LEN (token->val.node.node); i++)
2631             if (name[i] & ~0x7F)
2632               {
2633                 i += utf8_to_ucn (buffer, name + i) - 1;
2634                 buffer += 10;
2635               }
2636             else
2637               *buffer++ = NODE_NAME (token->val.node.node)[i];
2638         }
2639       break;
2640
2641     case SPELL_LITERAL:
2642       memcpy (buffer, token->val.str.text, token->val.str.len);
2643       buffer += token->val.str.len;
2644       break;
2645
2646     case SPELL_NONE:
2647       cpp_error (pfile, CPP_DL_ICE,
2648                  "unspellable token %s", TOKEN_NAME (token));
2649       break;
2650     }
2651
2652   return buffer;
2653 }
2654
2655 /* Returns TOKEN spelt as a null-terminated string.  The string is
2656    freed when the reader is destroyed.  Useful for diagnostics.  */
2657 unsigned char *
2658 cpp_token_as_text (cpp_reader *pfile, const cpp_token *token)
2659 {
2660   unsigned int len = cpp_token_len (token) + 1;
2661   unsigned char *start = _cpp_unaligned_alloc (pfile, len), *end;
2662
2663   end = cpp_spell_token (pfile, token, start, false);
2664   end[0] = '\0';
2665
2666   return start;
2667 }
2668
2669 /* Returns a pointer to a string which spells the token defined by
2670    TYPE and FLAGS.  Used by C front ends, which really should move to
2671    using cpp_token_as_text.  */
2672 const char *
2673 cpp_type2name (enum cpp_ttype type, unsigned char flags)
2674 {
2675   if (flags & DIGRAPH)
2676     return (const char *) cpp_digraph2name (type);
2677   else if (flags & NAMED_OP)
2678     return cpp_named_operator2name (type);
2679
2680   return (const char *) token_spellings[type].name;
2681 }
2682
2683 /* Writes the spelling of token to FP, without any preceding space.
2684    Separated from cpp_spell_token for efficiency - to avoid stdio
2685    double-buffering.  */
2686 void
2687 cpp_output_token (const cpp_token *token, FILE *fp)
2688 {
2689   switch (TOKEN_SPELL (token))
2690     {
2691     case SPELL_OPERATOR:
2692       {
2693         const unsigned char *spelling;
2694         int c;
2695
2696         if (token->flags & DIGRAPH)
2697           spelling = cpp_digraph2name (token->type);
2698         else if (token->flags & NAMED_OP)
2699           goto spell_ident;
2700         else
2701           spelling = TOKEN_NAME (token);
2702
2703         c = *spelling;
2704         do
2705           putc (c, fp);
2706         while ((c = *++spelling) != '\0');
2707       }
2708       break;
2709
2710     spell_ident:
2711     case SPELL_IDENT:
2712       {
2713         size_t i;
2714         const unsigned char * name = NODE_NAME (token->val.node.node);
2715
2716         for (i = 0; i < NODE_LEN (token->val.node.node); i++)
2717           if (name[i] & ~0x7F)
2718             {
2719               unsigned char buffer[10];
2720               i += utf8_to_ucn (buffer, name + i) - 1;
2721               fwrite (buffer, 1, 10, fp);
2722             }
2723           else
2724             fputc (NODE_NAME (token->val.node.node)[i], fp);
2725       }
2726       break;
2727
2728     case SPELL_LITERAL:
2729       fwrite (token->val.str.text, 1, token->val.str.len, fp);
2730       break;
2731
2732     case SPELL_NONE:
2733       /* An error, most probably.  */
2734       break;
2735     }
2736 }
2737
2738 /* Compare two tokens.  */
2739 int
2740 _cpp_equiv_tokens (const cpp_token *a, const cpp_token *b)
2741 {
2742   if (a->type == b->type && a->flags == b->flags)
2743     switch (TOKEN_SPELL (a))
2744       {
2745       default:                  /* Keep compiler happy.  */
2746       case SPELL_OPERATOR:
2747         /* token_no is used to track where multiple consecutive ##
2748            tokens were originally located.  */
2749         return (a->type != CPP_PASTE || a->val.token_no == b->val.token_no);
2750       case SPELL_NONE:
2751         return (a->type != CPP_MACRO_ARG
2752                 || a->val.macro_arg.arg_no == b->val.macro_arg.arg_no);
2753       case SPELL_IDENT:
2754         return a->val.node.node == b->val.node.node;
2755       case SPELL_LITERAL:
2756         return (a->val.str.len == b->val.str.len
2757                 && !memcmp (a->val.str.text, b->val.str.text,
2758                             a->val.str.len));
2759       }
2760
2761   return 0;
2762 }
2763
2764 /* Returns nonzero if a space should be inserted to avoid an
2765    accidental token paste for output.  For simplicity, it is
2766    conservative, and occasionally advises a space where one is not
2767    needed, e.g. "." and ".2".  */
2768 int
2769 cpp_avoid_paste (cpp_reader *pfile, const cpp_token *token1,
2770                  const cpp_token *token2)
2771 {
2772   enum cpp_ttype a = token1->type, b = token2->type;
2773   cppchar_t c;
2774
2775   if (token1->flags & NAMED_OP)
2776     a = CPP_NAME;
2777   if (token2->flags & NAMED_OP)
2778     b = CPP_NAME;
2779
2780   c = EOF;
2781   if (token2->flags & DIGRAPH)
2782     c = digraph_spellings[(int) b - (int) CPP_FIRST_DIGRAPH][0];
2783   else if (token_spellings[b].category == SPELL_OPERATOR)
2784     c = token_spellings[b].name[0];
2785
2786   /* Quickly get everything that can paste with an '='.  */
2787   if ((int) a <= (int) CPP_LAST_EQ && c == '=')
2788     return 1;
2789
2790   switch (a)
2791     {
2792     case CPP_GREATER:   return c == '>';
2793     case CPP_LESS:      return c == '<' || c == '%' || c == ':';
2794     case CPP_PLUS:      return c == '+';
2795     case CPP_MINUS:     return c == '-' || c == '>';
2796     case CPP_DIV:       return c == '/' || c == '*'; /* Comments.  */
2797     case CPP_MOD:       return c == ':' || c == '>';
2798     case CPP_AND:       return c == '&';
2799     case CPP_OR:        return c == '|';
2800     case CPP_COLON:     return c == ':' || c == '>';
2801     case CPP_DEREF:     return c == '*';
2802     case CPP_DOT:       return c == '.' || c == '%' || b == CPP_NUMBER;
2803     case CPP_HASH:      return c == '#' || c == '%'; /* Digraph form.  */
2804     case CPP_NAME:      return ((b == CPP_NUMBER
2805                                  && name_p (pfile, &token2->val.str))
2806                                 || b == CPP_NAME
2807                                 || b == CPP_CHAR || b == CPP_STRING); /* L */
2808     case CPP_NUMBER:    return (b == CPP_NUMBER || b == CPP_NAME
2809                                 || c == '.' || c == '+' || c == '-');
2810                                       /* UCNs */
2811     case CPP_OTHER:     return ((token1->val.str.text[0] == '\\'
2812                                  && b == CPP_NAME)
2813                                 || (CPP_OPTION (pfile, objc)
2814                                     && token1->val.str.text[0] == '@'
2815                                     && (b == CPP_NAME || b == CPP_STRING)));
2816     case CPP_STRING:
2817     case CPP_WSTRING:
2818     case CPP_UTF8STRING:
2819     case CPP_STRING16:
2820     case CPP_STRING32:  return (CPP_OPTION (pfile, user_literals)
2821                                 && (b == CPP_NAME
2822                                     || (TOKEN_SPELL (token2) == SPELL_LITERAL
2823                                         && ISIDST (token2->val.str.text[0]))));
2824
2825     default:            break;
2826     }
2827
2828   return 0;
2829 }
2830
2831 /* Output all the remaining tokens on the current line, and a newline
2832    character, to FP.  Leading whitespace is removed.  If there are
2833    macros, special token padding is not performed.  */
2834 void
2835 cpp_output_line (cpp_reader *pfile, FILE *fp)
2836 {
2837   const cpp_token *token;
2838
2839   token = cpp_get_token (pfile);
2840   while (token->type != CPP_EOF)
2841     {
2842       cpp_output_token (token, fp);
2843       token = cpp_get_token (pfile);
2844       if (token->flags & PREV_WHITE)
2845         putc (' ', fp);
2846     }
2847
2848   putc ('\n', fp);
2849 }
2850
2851 /* Return a string representation of all the remaining tokens on the
2852    current line.  The result is allocated using xmalloc and must be
2853    freed by the caller.  */
2854 unsigned char *
2855 cpp_output_line_to_string (cpp_reader *pfile, const unsigned char *dir_name)
2856 {
2857   const cpp_token *token;
2858   unsigned int out = dir_name ? ustrlen (dir_name) : 0;
2859   unsigned int alloced = 120 + out;
2860   unsigned char *result = (unsigned char *) xmalloc (alloced);
2861
2862   /* If DIR_NAME is empty, there are no initial contents.  */
2863   if (dir_name)
2864     {
2865       sprintf ((char *) result, "#%s ", dir_name);
2866       out += 2;
2867     }
2868
2869   token = cpp_get_token (pfile);
2870   while (token->type != CPP_EOF)
2871     {
2872       unsigned char *last;
2873       /* Include room for a possible space and the terminating nul.  */
2874       unsigned int len = cpp_token_len (token) + 2;
2875
2876       if (out + len > alloced)
2877         {
2878           alloced *= 2;
2879           if (out + len > alloced)
2880             alloced = out + len;
2881           result = (unsigned char *) xrealloc (result, alloced);
2882         }
2883
2884       last = cpp_spell_token (pfile, token, &result[out], 0);
2885       out = last - result;
2886
2887       token = cpp_get_token (pfile);
2888       if (token->flags & PREV_WHITE)
2889         result[out++] = ' ';
2890     }
2891
2892   result[out] = '\0';
2893   return result;
2894 }
2895
/* Memory buffers.  Changing these three constants can have a dramatic
   effect on performance.  The values here are reasonable defaults,
   but might be tuned.  If you adjust them, be sure to test across a
   range of uses of cpplib, including heavy nested function-like macro
   expansion.  Also check the change in peak memory usage (NJAMD is a
   good tool for this).

   MIN_BUFF_SIZE is the smallest buffer new_buff will create.
   BUFF_SIZE_UPPER_BOUND (MIN_SIZE) is the largest free buffer that
   _cpp_get_buff will reuse for a request of MIN_SIZE bytes.
   EXTENDED_BUFF_SIZE (BUFF, MIN_EXTRA) is the size requested when
   extending BUFF: MIN_EXTRA plus twice the room left in BUFF.  Both
   macro arguments are parenthesized so that any expression may be
   passed.  */
#define MIN_BUFF_SIZE 8000
#define BUFF_SIZE_UPPER_BOUND(MIN_SIZE) (MIN_BUFF_SIZE + (MIN_SIZE) * 3 / 2)
#define EXTENDED_BUFF_SIZE(BUFF, MIN_EXTRA) \
        ((MIN_EXTRA) + ((BUFF)->limit - (BUFF)->cur) * 2)

#if MIN_BUFF_SIZE > BUFF_SIZE_UPPER_BOUND (0)
  #error BUFF_SIZE_UPPER_BOUND must be at least as large as MIN_BUFF_SIZE!
#endif
2910
2911 /* Create a new allocation buffer.  Place the control block at the end
2912    of the buffer, so that buffer overflows will cause immediate chaos.  */
2913 static _cpp_buff *
2914 new_buff (size_t len)
2915 {
2916   _cpp_buff *result;
2917   unsigned char *base;
2918
2919   if (len < MIN_BUFF_SIZE)
2920     len = MIN_BUFF_SIZE;
2921   len = CPP_ALIGN (len);
2922
2923 #ifdef ENABLE_VALGRIND_CHECKING
2924   /* Valgrind warns about uses of interior pointers, so put _cpp_buff
2925      struct first.  */
2926   size_t slen = CPP_ALIGN2 (sizeof (_cpp_buff), 2 * DEFAULT_ALIGNMENT);
2927   base = XNEWVEC (unsigned char, len + slen);
2928   result = (_cpp_buff *) base;
2929   base += slen;
2930 #else
2931   base = XNEWVEC (unsigned char, len + sizeof (_cpp_buff));
2932   result = (_cpp_buff *) (base + len);
2933 #endif
2934   result->base = base;
2935   result->cur = base;
2936   result->limit = base + len;
2937   result->next = NULL;
2938   return result;
2939 }
2940
2941 /* Place a chain of unwanted allocation buffers on the free list.  */
2942 void
2943 _cpp_release_buff (cpp_reader *pfile, _cpp_buff *buff)
2944 {
2945   _cpp_buff *end = buff;
2946
2947   while (end->next)
2948     end = end->next;
2949   end->next = pfile->free_buffs;
2950   pfile->free_buffs = buff;
2951 }
2952
2953 /* Return a free buffer of size at least MIN_SIZE.  */
2954 _cpp_buff *
2955 _cpp_get_buff (cpp_reader *pfile, size_t min_size)
2956 {
2957   _cpp_buff *result, **p;
2958
2959   for (p = &pfile->free_buffs;; p = &(*p)->next)
2960     {
2961       size_t size;
2962
2963       if (*p == NULL)
2964         return new_buff (min_size);
2965       result = *p;
2966       size = result->limit - result->base;
2967       /* Return a buffer that's big enough, but don't waste one that's
2968          way too big.  */
2969       if (size >= min_size && size <= BUFF_SIZE_UPPER_BOUND (min_size))
2970         break;
2971     }
2972
2973   *p = result->next;
2974   result->next = NULL;
2975   result->cur = result->base;
2976   return result;
2977 }
2978
2979 /* Creates a new buffer with enough space to hold the uncommitted
2980    remaining bytes of BUFF, and at least MIN_EXTRA more bytes.  Copies
2981    the excess bytes to the new buffer.  Chains the new buffer after
2982    BUFF, and returns the new buffer.  */
2983 _cpp_buff *
2984 _cpp_append_extend_buff (cpp_reader *pfile, _cpp_buff *buff, size_t min_extra)
2985 {
2986   size_t size = EXTENDED_BUFF_SIZE (buff, min_extra);
2987   _cpp_buff *new_buff = _cpp_get_buff (pfile, size);
2988
2989   buff->next = new_buff;
2990   memcpy (new_buff->base, buff->cur, BUFF_ROOM (buff));
2991   return new_buff;
2992 }
2993
2994 /* Creates a new buffer with enough space to hold the uncommitted
2995    remaining bytes of the buffer pointed to by BUFF, and at least
2996    MIN_EXTRA more bytes.  Copies the excess bytes to the new buffer.
2997    Chains the new buffer before the buffer pointed to by BUFF, and
2998    updates the pointer to point to the new buffer.  */
2999 void
3000 _cpp_extend_buff (cpp_reader *pfile, _cpp_buff **pbuff, size_t min_extra)
3001 {
3002   _cpp_buff *new_buff, *old_buff = *pbuff;
3003   size_t size = EXTENDED_BUFF_SIZE (old_buff, min_extra);
3004
3005   new_buff = _cpp_get_buff (pfile, size);
3006   memcpy (new_buff->base, old_buff->cur, BUFF_ROOM (old_buff));
3007   new_buff->next = old_buff;
3008   *pbuff = new_buff;
3009 }
3010
3011 /* Free a chain of buffers starting at BUFF.  */
3012 void
3013 _cpp_free_buff (_cpp_buff *buff)
3014 {
3015   _cpp_buff *next;
3016
3017   for (; buff; buff = next)
3018     {
3019       next = buff->next;
3020 #ifdef ENABLE_VALGRIND_CHECKING
3021       free (buff);
3022 #else
3023       free (buff->base);
3024 #endif
3025     }
3026 }
3027
3028 /* Allocate permanent, unaligned storage of length LEN.  */
3029 unsigned char *
3030 _cpp_unaligned_alloc (cpp_reader *pfile, size_t len)
3031 {
3032   _cpp_buff *buff = pfile->u_buff;
3033   unsigned char *result = buff->cur;
3034
3035   if (len > (size_t) (buff->limit - result))
3036     {
3037       buff = _cpp_get_buff (pfile, len);
3038       buff->next = pfile->u_buff;
3039       pfile->u_buff = buff;
3040       result = buff->cur;
3041     }
3042
3043   buff->cur = result + len;
3044   return result;
3045 }
3046
3047 /* Allocate permanent, unaligned storage of length LEN from a_buff.
3048    That buffer is used for growing allocations when saving macro
3049    replacement lists in a #define, and when parsing an answer to an
3050    assertion in #assert, #unassert or #if (and therefore possibly
3051    whilst expanding macros).  It therefore must not be used by any
3052    code that they might call: specifically the lexer and the guts of
3053    the macro expander.
3054
3055    All existing other uses clearly fit this restriction: storing
3056    registered pragmas during initialization.  */
3057 unsigned char *
3058 _cpp_aligned_alloc (cpp_reader *pfile, size_t len)
3059 {
3060   _cpp_buff *buff = pfile->a_buff;
3061   unsigned char *result = buff->cur;
3062
3063   if (len > (size_t) (buff->limit - result))
3064     {
3065       buff = _cpp_get_buff (pfile, len);
3066       buff->next = pfile->a_buff;
3067       pfile->a_buff = buff;
3068       result = buff->cur;
3069     }
3070
3071   buff->cur = result + len;
3072   return result;
3073 }
3074
3075 /* Say which field of TOK is in use.  */
3076
3077 enum cpp_token_fld_kind
3078 cpp_token_val_index (const cpp_token *tok)
3079 {
3080   switch (TOKEN_SPELL (tok))
3081     {
3082     case SPELL_IDENT:
3083       return CPP_TOKEN_FLD_NODE;
3084     case SPELL_LITERAL:
3085       return CPP_TOKEN_FLD_STR;
3086     case SPELL_OPERATOR:
3087       if (tok->type == CPP_PASTE)
3088         return CPP_TOKEN_FLD_TOKEN_NO;
3089       else
3090         return CPP_TOKEN_FLD_NONE;
3091     case SPELL_NONE:
3092       if (tok->type == CPP_MACRO_ARG)
3093         return CPP_TOKEN_FLD_ARG_NO;
3094       else if (tok->type == CPP_PADDING)
3095         return CPP_TOKEN_FLD_SOURCE;
3096       else if (tok->type == CPP_PRAGMA)
3097         return CPP_TOKEN_FLD_PRAGMA;
3098       /* else fall through */
3099     default:
3100       return CPP_TOKEN_FLD_NONE;
3101     }
3102 }
3103
3104 /* All tokens lexed in R after calling this function will be forced to have
3105    their source_location the same as the location referenced by P, until
3106    cpp_stop_forcing_token_locations is called for R.  */
3107
3108 void
3109 cpp_force_token_locations (cpp_reader *r, source_location *p)
3110 {
3111   r->forced_token_location_p = p;
3112 }
3113
3114 /* Go back to assigning locations naturally for lexed tokens.  */
3115
3116 void
3117 cpp_stop_forcing_token_locations (cpp_reader *r)
3118 {
3119   r->forced_token_location_p = NULL;
3120 }