libcpp/lex.c

   1 /* CPP Library - lexical analysis.
   2    Copyright (C) 2000-2013 Free Software Foundation, Inc.
   3    Contributed by Per Bothner, 1994-95.
   4    Based on CCCP program by Paul Rubin, June 1986
   5    Adapted to ANSI C, Richard Stallman, Jan 1987
   6    Broken out to separate file, Zack Weinberg, Mar 2000
   7
   8 This program is free software; you can redistribute it and/or modify it
   9 under the terms of the GNU General Public License as published by the
  10 Free Software Foundation; either version 3, or (at your option) any
  11 later version.
  12
  13 This program is distributed in the hope that it will be useful,
  14 but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16 GNU General Public License for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with this program; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "cpplib.h"
  25 #include "internal.h"
  26
/* Category stored with every entry of token_spellings (see the OP and
   TK macros that build that table).  OP entries always use
   SPELL_OPERATOR and keep the operator's text in the entry's NAME;
   TK entries select one of the categories by name.
   NOTE(review): the precise storage convention behind SPELL_IDENT,
   SPELL_LITERAL and SPELL_NONE is defined by users of the table that
   are not visible in this part of the file.  */
enum spell_type
{
  SPELL_OPERATOR = 0,
  SPELL_IDENT,
  SPELL_LITERAL,
  SPELL_NONE
};
  34
/* One entry of the token_spellings table: how a token type is spelled
   (CATEGORY) and the text associated with that token type (NAME).  */
struct token_spelling
{
  enum spell_type category;   /* Spelling category of the token type.  */
  const unsigned char *name;  /* Operator text for OP entries, the
                                 stringized enumerator for TK entries.  */
};
  40
/* Source spellings of the six digraphs, stored as unsigned-character
   strings.  NOTE(review): the index used to select an entry is
   computed by code outside this part of the file.  */
static const unsigned char *const digraph_spellings[] =
{ UC"%:", UC"%:%:", UC"<:", UC":>", UC"<%", UC"%>" };
  43
/* Build token_spellings by expanding TTYPE_TABLE with local definitions
   of OP and TK: OP(e, s) yields { SPELL_OPERATOR, s } and TK(e, s)
   yields { SPELL_<s>, "<e>" }, i.e. the stringized enumerator name.
   The table has N_TTYPES entries and is indexed by token type.  Both
   helper macros are undefined again immediately afterwards.  */
#define OP(e, s) { SPELL_OPERATOR, UC s  },
#define TK(e, s) { SPELL_ ## s,    UC #e },
static const struct token_spelling token_spellings[N_TTYPES] = { TTYPE_TABLE };
#undef OP
#undef TK

/* Accessors for the spelling category and the name recorded for the
   type of TOKEN (a pointer to a token).  */
#define TOKEN_SPELL(token) (token_spellings[(token)->type].category)
#define TOKEN_NAME(token) (token_spellings[(token)->type].name)
  52
/* Forward declarations of the file-local helpers, so that they can be
   used before their definitions.  */
static void add_line_note (cpp_buffer *, const uchar *, unsigned int);
static int skip_line_comment (cpp_reader *);
static void skip_whitespace (cpp_reader *, cppchar_t);
static void lex_string (cpp_reader *, cpp_token *, const uchar *);
static void save_comment (cpp_reader *, cpp_token *, const uchar *, cppchar_t);
static void store_comment (cpp_reader *, cpp_token *);
static void create_literal (cpp_reader *, cpp_token *, const uchar *,
                            unsigned int, enum cpp_ttype);
static bool warn_in_comment (cpp_reader *, _cpp_line_note *);
static int name_p (cpp_reader *, const cpp_string *);
static tokenrun *next_tokenrun (tokenrun *);

static _cpp_buff *new_buff (size_t);
  66
  67
  68 /* Utility routine:
  69
  70    Compares, the token TOKEN to the NUL-terminated string STRING.
  71    TOKEN must be a CPP_NAME.  Returns 1 for equal, 0 for unequal.  */
  72 int
  73 cpp_ideq (const cpp_token *token, const char *string)
  74 {
  75   if (token->type != CPP_NAME)
  76     return 0;
  77
  78   return !ustrcmp (NODE_NAME (token->val.node.node), (const uchar *) string);
  79 }
  80
  81 /* Record a note TYPE at byte POS into the current cleaned logical
  82    line.  */
  83 static void
  84 add_line_note (cpp_buffer *buffer, const uchar *pos, unsigned int type)
  85 {
  86   if (buffer->notes_used == buffer->notes_cap)
  87     {
  88       buffer->notes_cap = buffer->notes_cap * 2 + 200;
  89       buffer->notes = XRESIZEVEC (_cpp_line_note, buffer->notes,
  90                                   buffer->notes_cap);
  91     }
  92
  93   buffer->notes[buffer->notes_used].pos = pos;
  94   buffer->notes[buffer->notes_used].type = type;
  95   buffer->notes_used++;
  96 }
  97
  98 \f
  99 /* Fast path to find line special characters using optimized character
 100    scanning algorithms.  Anything complicated falls back to the slow
 101    path below.  Since this loop is very hot it's worth doing these kinds
 102    of optimizations.
 103
 104    One of the paths through the ifdefs should provide
 105
 106      const uchar *search_line_fast (const uchar *s, const uchar *end);
 107
 108    Between S and END, search for \n, \r, \\, ?.  Return a pointer to
 109    the found character.
 110
 111    Note that the last character of the buffer is *always* a newline,
 112    as forced by _cpp_convert_input.  This fact can be used to avoid
 113    explicitly looking for the end of the buffer.  */
 114
 115 /* Configure gives us an ifdef test.  */
 116 #ifndef WORDS_BIGENDIAN
 117 #define WORDS_BIGENDIAN 0
 118 #endif
 119
 120 /* We'd like the largest integer that fits into a register.  There's nothing
 121    in <stdint.h> that gives us that.  For most hosts this is unsigned long,
 122    but MS decided on an LLP64 model.  Thankfully when building with GCC we
 123    can get the "real" word size.  */
 124 #ifdef __GNUC__
 125 typedef unsigned int word_type __attribute__((__mode__(__word__)));
 126 #else
 127 typedef unsigned long word_type;
 128 #endif
 129
 130 /* The code below is only expecting sizes 4 or 8.
 131    Die at compile-time if this expectation is violated.  */
 132 typedef char check_word_type_size
 133   [(sizeof(word_type) == 8 || sizeof(word_type) == 4) * 2 - 1];
 134
 135 /* Return X with the first N bytes forced to values that won't match one
 136    of the interesting characters.  Note that NUL is not interesting.  */
 137
 138 static inline word_type
 139 acc_char_mask_misalign (word_type val, unsigned int n)
 140 {
 141   word_type mask = -1;
 142   if (WORDS_BIGENDIAN)
 143     mask >>= n * 8;
 144   else
 145     mask <<= n * 8;
 146   return val & mask;
 147 }
 148
 149 /* Return X replicated to all byte positions within WORD_TYPE.  */
 150
 151 static inline word_type
 152 acc_char_replicate (uchar x)
 153 {
 154   word_type ret;
 155
 156   ret = (x << 24) | (x << 16) | (x << 8) | x;
 157   if (sizeof(word_type) == 8)
 158     ret = (ret << 16 << 16) | ret;
 159   return ret;
 160 }
 161
 162 /* Return non-zero if some byte of VAL is (probably) C.  */
 163
 164 static inline word_type
 165 acc_char_cmp (word_type val, word_type c)
 166 {
 167 #if defined(__GNUC__) && defined(__alpha__)
 168   /* We can get exact results using a compare-bytes instruction.
 169      Get (val == c) via (0 >= (val ^ c)).  */
 170   return __builtin_alpha_cmpbge (0, val ^ c);
 171 #else
 172   word_type magic = 0x7efefefeU;
 173   if (sizeof(word_type) == 8)
 174     magic = (magic << 16 << 16) | 0xfefefefeU;
 175   magic |= 1;
 176
 177   val ^= c;
 178   return ((val + magic) ^ ~val) & ~magic;
 179 #endif
 180 }
 181
 182 /* Given the result of acc_char_cmp is non-zero, return the index of
 183    the found character.  If this was a false positive, return -1.  */
 184
 185 static inline int
 186 acc_char_index (word_type cmp ATTRIBUTE_UNUSED,
 187                 word_type val ATTRIBUTE_UNUSED)
 188 {
 189 #if defined(__GNUC__) && defined(__alpha__) && !WORDS_BIGENDIAN
 190   /* The cmpbge instruction sets *bits* of the result corresponding to
 191      matches in the bytes with no false positives.  */
 192   return __builtin_ctzl (cmp);
 193 #else
 194   unsigned int i;
 195
 196   /* ??? It would be nice to force unrolling here,
 197      and have all of these constants folded.  */
 198   for (i = 0; i < sizeof(word_type); ++i)
 199     {
 200       uchar c;
 201       if (WORDS_BIGENDIAN)
 202         c = (val >> (sizeof(word_type) - i - 1) * 8) & 0xff;
 203       else
 204         c = (val >> i * 8) & 0xff;
 205
 206       if (c == '\n' || c == '\r' || c == '\\' || c == '?')
 207         return i;
 208     }
 209
 210   return -1;
 211 #endif
 212 }
 213
 214 /* A version of the fast scanner using bit fiddling techniques.
 215
 216    For 32-bit words, one would normally perform 16 comparisons and
 217    16 branches.  With this algorithm one performs 24 arithmetic
 218    operations and one branch.  Whether this is faster with a 32-bit
 219    word size is going to be somewhat system dependent.
 220
 221    For 64-bit words, we eliminate twice the number of comparisons
 222    and branches without increasing the number of arithmetic operations.
 223    It's almost certainly going to be a win with 64-bit word size.  */
 224
 225 static const uchar * search_line_acc_char (const uchar *, const uchar *)
 226   ATTRIBUTE_UNUSED;
 227
 228 static const uchar *
 229 search_line_acc_char (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 230 {
 231   const word_type repl_nl = acc_char_replicate ('\n');
 232   const word_type repl_cr = acc_char_replicate ('\r');
 233   const word_type repl_bs = acc_char_replicate ('\\');
 234   const word_type repl_qm = acc_char_replicate ('?');
 235
 236   unsigned int misalign;
 237   const word_type *p;
 238   word_type val, t;
 239
 240   /* Align the buffer.  Mask out any bytes from before the beginning.  */
 241   p = (word_type *)((uintptr_t)s & -sizeof(word_type));
 242   val = *p;
 243   misalign = (uintptr_t)s & (sizeof(word_type) - 1);
 244   if (misalign)
 245     val = acc_char_mask_misalign (val, misalign);
 246
 247   /* Main loop.  */
 248   while (1)
 249     {
 250       t  = acc_char_cmp (val, repl_nl);
 251       t |= acc_char_cmp (val, repl_cr);
 252       t |= acc_char_cmp (val, repl_bs);
 253       t |= acc_char_cmp (val, repl_qm);
 254
 255       if (__builtin_expect (t != 0, 0))
 256         {
 257           int i = acc_char_index (t, val);
 258           if (i >= 0)
 259             return (const uchar *)p + i;
 260         }
 261
 262       val = *++p;
 263     }
 264 }
 265
 266 /* Disable on Solaris 2/x86 until the following problems can be properly
 267    autoconfed:
 268
 269    The Solaris 9 assembler cannot assemble SSE4.2 insns.
 270    Before Solaris 9 Update 6, SSE insns cannot be executed.
 271    The Solaris 10+ assembler tags objects with the instruction set
 272    extensions used, so SSE4.2 executables cannot run on machines that
 273    don't support that extension.  */
 274
 275 #if (GCC_VERSION >= 4005) && (defined(__i386__) || defined(__x86_64__)) && !(defined(__sun__) && defined(__svr4__))
 276
/* Replicated character data to be shared between implementations.
   Recall that outside of a context with vector support we can't
   define compatible vector types, therefore these are all defined
   in terms of raw characters.

   Row 0 holds newlines, row 1 carriage returns, row 2 backslashes and
   row 3 question marks.  Each row is 16 bytes long and the array is
   16-byte aligned, so a row can be loaded as a single 16-byte vector
   (search_line_sse2) or its first half as an 8-byte vector
   (search_line_mmx).  */
static const char repl_chars[4][16] __attribute__((aligned(16))) = {
  { '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
    '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n' },
  { '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
    '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r' },
  { '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
    '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\' },
  { '?', '?', '?', '?', '?', '?', '?', '?',
    '?', '?', '?', '?', '?', '?', '?', '?' },
};
 291
 292 /* A version of the fast scanner using MMX vectorized byte compare insns.
 293
 294    This uses the PMOVMSKB instruction which was introduced with "MMX2",
 295    which was packaged into SSE1; it is also present in the AMD MMX
 296    extension.  Mark the function as using "sse" so that we emit a real
 297    "emms" instruction, rather than the 3dNOW "femms" instruction.  */
 298
 299 static const uchar *
 300 #ifndef __SSE__
 301 __attribute__((__target__("sse")))
 302 #endif
 303 search_line_mmx (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 304 {
 305   typedef char v8qi __attribute__ ((__vector_size__ (8)));
 306   typedef int __m64 __attribute__ ((__vector_size__ (8), __may_alias__));
 307
 308   const v8qi repl_nl = *(const v8qi *)repl_chars[0];
 309   const v8qi repl_cr = *(const v8qi *)repl_chars[1];
 310   const v8qi repl_bs = *(const v8qi *)repl_chars[2];
 311   const v8qi repl_qm = *(const v8qi *)repl_chars[3];
 312
 313   unsigned int misalign, found, mask;
 314   const v8qi *p;
 315   v8qi data, t, c;
 316
 317   /* Align the source pointer.  While MMX doesn't generate unaligned data
 318      faults, this allows us to safely scan to the end of the buffer without
 319      reading beyond the end of the last page.  */
 320   misalign = (uintptr_t)s & 7;
 321   p = (const v8qi *)((uintptr_t)s & -8);
 322   data = *p;
 323
 324   /* Create a mask for the bytes that are valid within the first
 325      16-byte block.  The Idea here is that the AND with the mask
 326      within the loop is "free", since we need some AND or TEST
 327      insn in order to set the flags for the branch anyway.  */
 328   mask = -1u << misalign;
 329
 330   /* Main loop processing 8 bytes at a time.  */
 331   goto start;
 332   do
 333     {
 334       data = *++p;
 335       mask = -1;
 336
 337     start:
 338       t = __builtin_ia32_pcmpeqb(data, repl_nl);
 339       c = __builtin_ia32_pcmpeqb(data, repl_cr);
 340       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
 341       c = __builtin_ia32_pcmpeqb(data, repl_bs);
 342       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
 343       c = __builtin_ia32_pcmpeqb(data, repl_qm);
 344       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
 345       found = __builtin_ia32_pmovmskb (t);
 346       found &= mask;
 347     }
 348   while (!found);
 349
 350   __builtin_ia32_emms ();
 351
 352   /* FOUND contains 1 in bits for which we matched a relevant
 353      character.  Conversion to the byte index is trivial.  */
 354   found = __builtin_ctz(found);
 355   return (const uchar *)p + found;
 356 }
 357
 358 /* A version of the fast scanner using SSE2 vectorized byte compare insns.  */
 359
 360 static const uchar *
 361 #ifndef __SSE2__
 362 __attribute__((__target__("sse2")))
 363 #endif
 364 search_line_sse2 (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 365 {
 366   typedef char v16qi __attribute__ ((__vector_size__ (16)));
 367
 368   const v16qi repl_nl = *(const v16qi *)repl_chars[0];
 369   const v16qi repl_cr = *(const v16qi *)repl_chars[1];
 370   const v16qi repl_bs = *(const v16qi *)repl_chars[2];
 371   const v16qi repl_qm = *(const v16qi *)repl_chars[3];
 372
 373   unsigned int misalign, found, mask;
 374   const v16qi *p;
 375   v16qi data, t;
 376
 377   /* Align the source pointer.  */
 378   misalign = (uintptr_t)s & 15;
 379   p = (const v16qi *)((uintptr_t)s & -16);
 380   data = *p;
 381
 382   /* Create a mask for the bytes that are valid within the first
 383      16-byte block.  The Idea here is that the AND with the mask
 384      within the loop is "free", since we need some AND or TEST
 385      insn in order to set the flags for the branch anyway.  */
 386   mask = -1u << misalign;
 387
 388   /* Main loop processing 16 bytes at a time.  */
 389   goto start;
 390   do
 391     {
 392       data = *++p;
 393       mask = -1;
 394
 395     start:
 396       t  = __builtin_ia32_pcmpeqb128(data, repl_nl);
 397       t |= __builtin_ia32_pcmpeqb128(data, repl_cr);
 398       t |= __builtin_ia32_pcmpeqb128(data, repl_bs);
 399       t |= __builtin_ia32_pcmpeqb128(data, repl_qm);
 400       found = __builtin_ia32_pmovmskb128 (t);
 401       found &= mask;
 402     }
 403   while (!found);
 404
 405   /* FOUND contains 1 in bits for which we matched a relevant
 406      character.  Conversion to the byte index is trivial.  */
 407   found = __builtin_ctz(found);
 408   return (const uchar *)p + found;
 409 }
 410
 411 #ifdef HAVE_SSE4
/* A version of the fast scanner using SSE 4.2 vectorized string insns.

   Returns a pointer to the first \n, \r, ? or \\ at or after S.  END
   is used only to decide whether an unaligned 16-byte read at S is
   safe; the main loop does not test it and relies on a matching
   character being present.  */

static const uchar *
#ifndef __SSE4_2__
__attribute__((__target__("sse4.2")))
#endif
search_line_sse42 (const uchar *s, const uchar *end)
{
  typedef char v16qi __attribute__ ((__vector_size__ (16)));
  /* The set of characters to look for.  Only the first four elements
     are used: the explicit set length given to the instruction below
     is 4.  */
  static const v16qi search = { '\n', '\r', '?', '\\' };

  uintptr_t si = (uintptr_t)s;
  uintptr_t index;

  /* Check for unaligned input.  */
  if (si & 15)
    {
      v16qi sv;

      if (__builtin_expect (end - s < 16, 0)
          && __builtin_expect ((si & 0xfff) > 0xff0, 0))
        {
          /* There are less than 16 bytes left in the buffer, and less
             than 16 bytes left on the page.  Reading 16 bytes at this
             point might generate a spurious page fault.  Defer to the
             SSE2 implementation, which already handles alignment.  */
          return search_line_sse2 (s, end);
        }

      /* ??? The builtin doesn't understand that the PCMPESTRI read from
         memory need not be aligned.  */
      sv = __builtin_ia32_loaddqu ((const char *) s);
      index = __builtin_ia32_pcmpestri128 (search, 4, sv, 16, 0);

      /* An index below 16 is the offset of a match within the 16 bytes
         just examined.  */
      if (__builtin_expect (index < 16, 0))
        goto found;

      /* Advance the pointer to an aligned address.  We will re-scan a
         few bytes, but we no longer need care for reading past the
         end of a page, since we're guaranteed a match.  */
      s = (const uchar *)((si + 16) & -16);
    }

  /* Main loop, processing 16 bytes at a time.  By doing the whole loop
     in inline assembly, we can make proper use of the flags set.

     S is decremented by 16 up front because the loop adds 16 before
     every comparison.  The loop repeats for as long as the comparison
     leaves the carry flag clear ("jnc 0b").  The instruction's result
     is read back from the C register into INDEX, and the A and D
     registers supply the two explicit lengths: 4 for SEARCH and 16 for
     the data at S.  */
  __asm (      "sub $16, %1\n"
        "       .balign 16\n"
        "0:     add $16, %1\n"
        "       %vpcmpestri $0, (%1), %2\n"
        "       jnc 0b"
        : "=&c"(index), "+r"(s)
        : "x"(search), "a"(4), "d"(16));

 found:
  return s + index;
}
 468
 469 #else
 470 /* Work around out-dated assemblers without sse4 support.  */
 471 #define search_line_sse42 search_line_sse2
 472 #endif
 473
 474 /* Check the CPU capabilities.  */
 475
 476 #include "../gcc/config/i386/cpuid.h"
 477
/* Signature shared by every line scanner in this file: given the start
   and the end of the text to scan, return a pointer to the character
   that was found.  */
typedef const uchar * (*search_line_fast_type) (const uchar *, const uchar *);
/* The scanner in use.  It is assigned by init_vectorized_lexer.  */
static search_line_fast_type search_line_fast;

/* Tells _cpp_init_lexer that init_vectorized_lexer exists and must be
   called.  */
#define HAVE_init_vectorized_lexer 1
 482 static inline void
 483 init_vectorized_lexer (void)
 484 {
 485   unsigned dummy, ecx = 0, edx = 0;
 486   search_line_fast_type impl = search_line_acc_char;
 487   int minimum = 0;
 488
 489 #if defined(__SSE4_2__)
 490   minimum = 3;
 491 #elif defined(__SSE2__)
 492   minimum = 2;
 493 #elif defined(__SSE__)
 494   minimum = 1;
 495 #endif
 496
 497   if (minimum == 3)
 498     impl = search_line_sse42;
 499   else if (__get_cpuid (1, &dummy, &dummy, &ecx, &edx) || minimum == 2)
 500     {
 501       if (minimum == 3 || (ecx & bit_SSE4_2))
 502         impl = search_line_sse42;
 503       else if (minimum == 2 || (edx & bit_SSE2))
 504         impl = search_line_sse2;
 505       else if (minimum == 1 || (edx & bit_SSE))
 506         impl = search_line_mmx;
 507     }
 508   else if (__get_cpuid (0x80000001, &dummy, &dummy, &dummy, &edx))
 509     {
 510       if (minimum == 1
 511           || (edx & (bit_MMXEXT | bit_CMOV)) == (bit_MMXEXT | bit_CMOV))
 512         impl = search_line_mmx;
 513     }
 514
 515   search_line_fast = impl;
 516 }
 517
 518 #elif (GCC_VERSION >= 4005) && defined(__ALTIVEC__)
 519
 520 /* A vection of the fast scanner using AltiVec vectorized byte compares.  */
 521 /* ??? Unfortunately, attribute(target("altivec")) is not yet supported,
 522    so we can't compile this function without -maltivec on the command line
 523    (or implied by some other switch).  */
 524
 525 static const uchar *
 526 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 527 {
 528   typedef __attribute__((altivec(vector))) unsigned char vc;
 529
 530   const vc repl_nl = {
 531     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
 532     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
 533   };
 534   const vc repl_cr = {
 535     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
 536     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
 537   };
 538   const vc repl_bs = {
 539     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
 540     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
 541   };
 542   const vc repl_qm = {
 543     '?', '?', '?', '?', '?', '?', '?', '?',
 544     '?', '?', '?', '?', '?', '?', '?', '?',
 545   };
 546   const vc ones = {
 547     -1, -1, -1, -1, -1, -1, -1, -1,
 548     -1, -1, -1, -1, -1, -1, -1, -1,
 549   };
 550   const vc zero = { 0 };
 551
 552   vc data, mask, t;
 553
 554   /* Altivec loads automatically mask addresses with -16.  This lets us
 555      issue the first load as early as possible.  */
 556   data = __builtin_vec_ld(0, (const vc *)s);
 557
 558   /* Discard bytes before the beginning of the buffer.  Do this by
 559      beginning with all ones and shifting in zeros according to the
 560      mis-alignment.  The LVSR instruction pulls the exact shift we
 561      want from the address.  */
 562   mask = __builtin_vec_lvsr(0, s);
 563   mask = __builtin_vec_perm(zero, ones, mask);
 564   data &= mask;
 565
 566   /* While altivec loads mask addresses, we still need to align S so
 567      that the offset we compute at the end is correct.  */
 568   s = (const uchar *)((uintptr_t)s & -16);
 569
 570   /* Main loop processing 16 bytes at a time.  */
 571   goto start;
 572   do
 573     {
 574       vc m_nl, m_cr, m_bs, m_qm;
 575
 576       s += 16;
 577       data = __builtin_vec_ld(0, (const vc *)s);
 578
 579     start:
 580       m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
 581       m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
 582       m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
 583       m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
 584       t = (m_nl | m_cr) | (m_bs | m_qm);
 585
 586       /* T now contains 0xff in bytes for which we matched one of the relevant
 587          characters.  We want to exit the loop if any byte in T is non-zero.
 588          Below is the expansion of vec_any_ne(t, zero).  */
 589     }
 590   while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));
 591
 592   {
 593 #define N  (sizeof(vc) / sizeof(long))
 594
 595     union {
 596       vc v;
 597       /* Statically assert that N is 2 or 4.  */
 598       unsigned long l[(N == 2 || N == 4) ? N : -1];
 599     } u;
 600     unsigned long l, i = 0;
 601
 602     u.v = t;
 603
 604     /* Find the first word of T that is non-zero.  */
 605     switch (N)
 606       {
 607       case 4:
 608         l = u.l[i++];
 609         if (l != 0)
 610           break;
 611         s += sizeof(unsigned long);
 612         l = u.l[i++];
 613         if (l != 0)
 614           break;
 615         s += sizeof(unsigned long);
 616       case 2:
 617         l = u.l[i++];
 618         if (l != 0)
 619           break;
 620         s += sizeof(unsigned long);
 621         l = u.l[i];
 622       }
 623
 624     /* L now contains 0xff in bytes for which we matched one of the
 625        relevant characters.  We can find the byte index by finding
 626        its bit index and dividing by 8.  */
 627     l = __builtin_clzl(l) >> 3;
 628     return s + l;
 629
 630 #undef N
 631   }
 632 }
 633
 634 #elif defined (__ARM_NEON__)
 635 #include "arm_neon.h"
 636
 637 static const uchar *
 638 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 639 {
 640   const uint8x16_t repl_nl = vdupq_n_u8 ('\n');
 641   const uint8x16_t repl_cr = vdupq_n_u8 ('\r');
 642   const uint8x16_t repl_bs = vdupq_n_u8 ('\\');
 643   const uint8x16_t repl_qm = vdupq_n_u8 ('?');
 644   const uint8x16_t xmask = (uint8x16_t) vdupq_n_u64 (0x8040201008040201ULL);
 645
 646   unsigned int misalign, found, mask;
 647   const uint8_t *p;
 648   uint8x16_t data;
 649
 650   /* Align the source pointer.  */
 651   misalign = (uintptr_t)s & 15;
 652   p = (const uint8_t *)((uintptr_t)s & -16);
 653   data = vld1q_u8 (p);
 654
 655   /* Create a mask for the bytes that are valid within the first
 656      16-byte block.  The Idea here is that the AND with the mask
 657      within the loop is "free", since we need some AND or TEST
 658      insn in order to set the flags for the branch anyway.  */
 659   mask = (-1u << misalign) & 0xffff;
 660
 661   /* Main loop, processing 16 bytes at a time.  */
 662   goto start;
 663
 664   do
 665     {
 666       uint8x8_t l;
 667       uint16x4_t m;
 668       uint32x2_t n;
 669       uint8x16_t t, u, v, w;
 670
 671       p += 16;
 672       data = vld1q_u8 (p);
 673       mask = 0xffff;
 674
 675     start:
 676       t = vceqq_u8 (data, repl_nl);
 677       u = vceqq_u8 (data, repl_cr);
 678       v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
 679       w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
 680       t = vandq_u8 (vorrq_u8 (v, w), xmask);
 681       l = vpadd_u8 (vget_low_u8 (t), vget_high_u8 (t));
 682       m = vpaddl_u8 (l);
 683       n = vpaddl_u16 (m);
 684
 685       found = vget_lane_u32 ((uint32x2_t) vorr_u64 ((uint64x1_t) n,
 686               vshr_n_u64 ((uint64x1_t) n, 24)), 0);
 687       found &= mask;
 688     }
 689   while (!found);
 690
 691   /* FOUND contains 1 in bits for which we matched a relevant
 692      character.  Conversion to the byte index is trivial.  */
 693   found = __builtin_ctz (found);
 694   return (const uchar *)p + found;
 695 }
 696
 697 #else
 698
/* We only have one accelerated alternative.  Use a direct call so that
   we encourage inlining.  */
 701
 702 #define search_line_fast  search_line_acc_char
 703
 704 #endif
 705
 706 /* Initialize the lexer if needed.  */
 707
 708 void
 709 _cpp_init_lexer (void)
 710 {
 711 #ifdef HAVE_init_vectorized_lexer
 712   init_vectorized_lexer ();
 713 #endif
 714 }
 715
/* Returns with a logical line that contains no escaped newlines or
   trigraphs.  This is a time-critical inner loop.

   The line starts at PFILE's buffer's NEXT_LINE.  On entry the note
   list is emptied, CUR and LINE_BASE are set to the start of the line
   and NEED_LINE is cleared.  On return the logical line is terminated
   by a '\n' stored in the buffer, a sentinel note follows the notes
   recorded for the line, and NEXT_LINE points just past the physical
   line ending that was consumed.

   Escaped newlines are removed, and trigraphs are replaced if the
   trigraphs option is set, by rewriting the buffer in place.  A note
   is recorded for each of them.  Buffers with FROM_STAGE3 set get
   neither treatment; only the end of the line is located.  */
void
_cpp_clean_line (cpp_reader *pfile)
{
  cpp_buffer *buffer;
  const uchar *s;      /* Read position.  */
  uchar c, *d, *p;     /* D is the write position, P a scratch pointer.  */

  buffer = pfile->buffer;
  buffer->cur_note = buffer->notes_used = 0;
  buffer->cur = buffer->line_base = buffer->next_line;
  buffer->need_line = false;
  s = buffer->next_line;

  if (!buffer->from_stage3)
    {
      /* Position of the most recent backslash seen on the fast path.  */
      const uchar *pbackslash = NULL;

      /* Fast path.  This is the common case of an un-escaped line with
         no trigraphs.  The primary win here is by not writing any
         data back to memory until we have to.  */
      while (1)
        {
          /* Perform an optimized search for \n, \r, \\, ?.  */
          s = search_line_fast (s, buffer->rlimit);

          c = *s;
          if (c == '\\')
            {
              /* Record the location of the backslash and continue.  */
              pbackslash = s++;
            }
          else if (__builtin_expect (c == '?', 0))
            {
              if (__builtin_expect (s[1] == '?', false)
                   && _cpp_trigraph_map[s[2]])
                {
                  /* Have a trigraph.  We may or may not have to convert
                     it.  Add a line note regardless, for -Wtrigraphs.  */
                  add_line_note (buffer, s, s[2]);
                  if (CPP_OPTION (pfile, trigraphs))
                    {
                      /* We do, and that means we have to switch to the
                         slow path.  The replacement character is stored
                         over the first '?', and S is left on the last
                         character of the trigraph so that the slow path
                         resumes reading right after it.  */
                      d = (uchar *) s;
                      *d = _cpp_trigraph_map[s[2]];
                      s += 2;
                      goto slow_path;
                    }
                }
              /* Not a trigraph.  Continue on fast-path.  */
              s++;
            }
          else
            break;
        }

      /* This must be \r or \n.  We're either done, or we'll be forced
         to write back to the buffer and continue on the slow path.  */
      d = (uchar *) s;

      if (__builtin_expect (s == buffer->rlimit, false))
        goto done;

      /* DOS line ending? */
      if (__builtin_expect (c == '\r', false) && s[1] == '\n')
        {
          s++;
          if (s == buffer->rlimit)
            goto done;
        }

      /* Without a backslash on the line, the newline cannot be escaped.  */
      if (__builtin_expect (pbackslash == NULL, true))
        goto done;

      /* Check for escaped newline.  Skip backwards over the whitespace
         that precedes the line ending; the newline is escaped only if
         the character before that whitespace is the last backslash
         seen.  */
      p = d;
      while (is_nvspace (p[-1]))
        p--;
      if (p - 1 != pbackslash)
        goto done;

      /* Have an escaped newline; process it and proceed to
         the slow path.  The note type records whether whitespace
         separated the backslash from the newline.  D is set so that the
         next character stored by the slow path overwrites the
         backslash.  */
      add_line_note (buffer, p - 1, p != d ? ' ' : '\\');
      d = p - 2;
      /* From here on NEXT_LINE is the lower bound for the backward
         scans of the slow path.  */
      buffer->next_line = p - 1;

    slow_path:
      /* Copy characters from S down to D one at a time, dropping
         escaped newlines and replacing trigraphs on the way.  */
      while (1)
        {
          c = *++s;
          *++d = c;

          if (c == '\n' || c == '\r')
            {
              /* Handle DOS line endings.  */
              if (c == '\r' && s != buffer->rlimit && s[1] == '\n')
                s++;
              if (s == buffer->rlimit)
                break;

              /* Escaped?  */
              p = d;
              while (p != buffer->next_line && is_nvspace (p[-1]))
                p--;
              if (p == buffer->next_line || p[-1] != '\\')
                break;

              add_line_note (buffer, p - 1, p != d ? ' ': '\\');
              d = p - 2;
              buffer->next_line = p - 1;
            }
          else if (c == '?' && s[1] == '?' && _cpp_trigraph_map[s[2]])
            {
              /* Add a note regardless, for the benefit of -Wtrigraphs.  */
              add_line_note (buffer, d, s[2]);
              if (CPP_OPTION (pfile, trigraphs))
                {
                  *d = _cpp_trigraph_map[s[2]];
                  s += 2;
                }
            }
        }
    }
  else
    {
      /* No escaped newlines or trigraphs are looked for here; just find
         the end of the line.  */
      while (*s != '\n' && *s != '\r')
        s++;
      d = (uchar *) s;

      /* Handle DOS line endings.  */
      if (*s == '\r' && s != buffer->rlimit && s[1] == '\n')
        s++;
    }

 done:
  /* Terminate the cleaned line, which may end before the place where
     reading stopped.  */
  *d = '\n';
  /* A sentinel note that should never be processed.  */
  add_line_note (buffer, d + 1, '\n');
  buffer->next_line = s + 1;
}
 859
 860 /* Return true if the trigraph indicated by NOTE should be warned
 861    about in a comment.  */
 862 static bool
 863 warn_in_comment (cpp_reader *pfile, _cpp_line_note *note)
 864 {
 865   const uchar *p;
 866
 867   /* Within comments we don't warn about trigraphs, unless the
 868      trigraph forms an escaped newline, as that may change
 869      behavior.  */
 870   if (note->type != '/')
 871     return false;
 872
 873   /* If -trigraphs, then this was an escaped newline iff the next note
 874      is coincident.  */
 875   if (CPP_OPTION (pfile, trigraphs))
 876     return note[1].pos == note->pos;
 877
 878   /* Otherwise, see if this forms an escaped newline.  */
 879   p = note->pos + 3;
 880   while (is_nvspace (*p))
 881     p++;
 882
 883   /* There might have been escaped newlines between the trigraph and the
 884      newline we found.  Hence the position test.  */
 885   return (*p == '\n' && p < note[1].pos);
 886 }
 887
 888 /* Process the notes created by add_line_note as far as the current
 889    location.  */
 890 void
 891 _cpp_process_line_notes (cpp_reader *pfile, int in_comment)
 892 {
 893   cpp_buffer *buffer = pfile->buffer;
 894
 895   for (;;)
 896     {
 897       _cpp_line_note *note = &buffer->notes[buffer->cur_note];
 898       unsigned int col;
 899
 900       if (note->pos > buffer->cur)
 901         break;
 902
 903       buffer->cur_note++;
 904       col = CPP_BUF_COLUMN (buffer, note->pos + 1);
 905
 906       if (note->type == '\\' || note->type == ' ')
 907         {
 908           if (note->type == ' ' && !in_comment)
 909             cpp_error_with_line (pfile, CPP_DL_WARNING, pfile->line_table->highest_line, col,
 910                                  "backslash and newline separated by space");
 911
 912           if (buffer->next_line > buffer->rlimit)
 913             {
 914               cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line, col,
 915                                    "backslash-newline at end of file");
 916               /* Prevent "no newline at end of file" warning.  */
 917               buffer->next_line = buffer->rlimit;
 918             }
 919
 920           buffer->line_base = note->pos;
 921           CPP_INCREMENT_LINE (pfile, 0);
 922         }
 923       else if (_cpp_trigraph_map[note->type])
 924         {
 925           if (CPP_OPTION (pfile, warn_trigraphs)
 926               && (!in_comment || warn_in_comment (pfile, note)))
 927             {
 928               if (CPP_OPTION (pfile, trigraphs))
 929                 cpp_warning_with_line (pfile, CPP_W_TRIGRAPHS,
 930                                        pfile->line_table->highest_line, col,
 931                                        "trigraph ??%c converted to %c",
 932                                        note->type,
 933                                        (int) _cpp_trigraph_map[note->type]);
 934               else
 935                 {
 936                   cpp_warning_with_line
 937                     (pfile, CPP_W_TRIGRAPHS,
 938                      pfile->line_table->highest_line, col,
 939                      "trigraph ??%c ignored, use -trigraphs to enable",
 940                      note->type);
 941                 }
 942             }
 943         }
 944       else if (note->type == 0)
 945         /* Already processed in lex_raw_string.  */;
 946       else
 947         abort ();
 948     }
 949 }
 950
 951 /* Skip a C-style block comment.  We find the end of the comment by
 952    seeing if an asterisk is before every '/' we encounter.  Returns
 953    nonzero if comment terminated by EOF, zero otherwise.
 954
 955    Buffer->cur points to the initial asterisk of the comment.  */
 956 bool
 957 _cpp_skip_block_comment (cpp_reader *pfile)
 958 {
 959   cpp_buffer *buffer = pfile->buffer;
 960   const uchar *cur = buffer->cur;
 961   uchar c;
 962
 963   cur++;
 964   if (*cur == '/')
 965     cur++;
 966
 967   for (;;)
 968     {
 969       /* People like decorating comments with '*', so check for '/'
 970          instead for efficiency.  */
 971       c = *cur++;
 972
 973       if (c == '/')
 974         {
 975           if (cur[-2] == '*')
 976             break;
 977
 978           /* Warn about potential nested comments, but not if the '/'
 979              comes immediately before the true comment delimiter.
 980              Don't bother to get it right across escaped newlines.  */
 981           if (CPP_OPTION (pfile, warn_comments)
 982               && cur[0] == '*' && cur[1] != '/')
 983             {
 984               buffer->cur = cur;
 985               cpp_warning_with_line (pfile, CPP_W_COMMENTS,
 986                                      pfile->line_table->highest_line,
 987                                      CPP_BUF_COL (buffer),
 988                                      "\"/*\" within comment");
 989             }
 990         }
 991       else if (c == '\n')
 992         {
 993           unsigned int cols;
 994           buffer->cur = cur - 1;
 995           _cpp_process_line_notes (pfile, true);
 996           if (buffer->next_line >= buffer->rlimit)
 997             return true;
 998           _cpp_clean_line (pfile);
 999
1000           cols = buffer->next_line - buffer->line_base;
1001           CPP_INCREMENT_LINE (pfile, cols);
1002
1003           cur = buffer->cur;
1004         }
1005     }
1006
1007   buffer->cur = cur;
1008   _cpp_process_line_notes (pfile, true);
1009   return false;
1010 }
1011
1012 /* Skip a C++ line comment, leaving buffer->cur pointing to the
1013    terminating newline.  Handles escaped newlines.  Returns nonzero
1014    if a multiline comment.  */
1015 static int
1016 skip_line_comment (cpp_reader *pfile)
1017 {
1018   cpp_buffer *buffer = pfile->buffer;
1019   source_location orig_line = pfile->line_table->highest_line;
1020
1021   while (*buffer->cur != '\n')
1022     buffer->cur++;
1023
1024   _cpp_process_line_notes (pfile, true);
1025   return orig_line != pfile->line_table->highest_line;
1026 }
1027
1028 /* Skips whitespace, saving the next non-whitespace character.  */
1029 static void
1030 skip_whitespace (cpp_reader *pfile, cppchar_t c)
1031 {
1032   cpp_buffer *buffer = pfile->buffer;
1033   bool saw_NUL = false;
1034
1035   do
1036     {
1037       /* Horizontal space always OK.  */
1038       if (c == ' ' || c == '\t')
1039         ;
1040       /* Just \f \v or \0 left.  */
1041       else if (c == '\0')
1042         saw_NUL = true;
1043       else if (pfile->state.in_directive && CPP_PEDANTIC (pfile))
1044         cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line,
1045                              CPP_BUF_COL (buffer),
1046                              "%s in preprocessing directive",
1047                              c == '\f' ? "form feed" : "vertical tab");
1048
1049       c = *buffer->cur++;
1050     }
1051   /* We only want non-vertical space, i.e. ' ' \t \f \v \0.  */
1052   while (is_nvspace (c));
1053
1054   if (saw_NUL)
1055     cpp_error (pfile, CPP_DL_WARNING, "null character(s) ignored");
1056
1057   buffer->cur--;
1058 }
1059
1060 /* See if the characters of a number token are valid in a name (no
1061    '.', '+' or '-').  */
1062 static int
1063 name_p (cpp_reader *pfile, const cpp_string *string)
1064 {
1065   unsigned int i;
1066
1067   for (i = 0; i < string->len; i++)
1068     if (!is_idchar (string->text[i]))
1069       return 0;
1070
1071   return 1;
1072 }
1073
1074 /* After parsing an identifier or other sequence, produce a warning about
1075    sequences not in NFC/NFKC.  */
1076 static void
1077 warn_about_normalization (cpp_reader *pfile,
1078                           const cpp_token *token,
1079                           const struct normalize_state *s)
1080 {
1081   if (CPP_OPTION (pfile, warn_normalize) < NORMALIZE_STATE_RESULT (s)
1082       && !pfile->state.skipping)
1083     {
1084       /* Make sure that the token is printed using UCNs, even
1085          if we'd otherwise happily print UTF-8.  */
1086       unsigned char *buf = XNEWVEC (unsigned char, cpp_token_len (token));
1087       size_t sz;
1088
1089       sz = cpp_spell_token (pfile, token, buf, false) - buf;
1090       if (NORMALIZE_STATE_RESULT (s) == normalized_C)
1091         cpp_warning_with_line (pfile, CPP_W_NORMALIZE, token->src_loc, 0,
1092                                "`%.*s' is not in NFKC", (int) sz, buf);
1093       else
1094         cpp_warning_with_line (pfile, CPP_W_NORMALIZE, token->src_loc, 0,
1095                                "`%.*s' is not in NFC", (int) sz, buf);
1096       free (buf);
1097     }
1098 }
1099
1100 /* Returns TRUE if the sequence starting at buffer->cur is invalid in
1101    an identifier.  FIRST is TRUE if this starts an identifier.  */
1102 static bool
1103 forms_identifier_p (cpp_reader *pfile, int first,
1104                     struct normalize_state *state)
1105 {
1106   cpp_buffer *buffer = pfile->buffer;
1107
1108   if (*buffer->cur == '$')
1109     {
1110       if (!CPP_OPTION (pfile, dollars_in_ident))
1111         return false;
1112
1113       buffer->cur++;
1114       if (CPP_OPTION (pfile, warn_dollars) && !pfile->state.skipping)
1115         {
1116           CPP_OPTION (pfile, warn_dollars) = 0;
1117           cpp_error (pfile, CPP_DL_PEDWARN, "'$' in identifier or number");
1118         }
1119
1120       return true;
1121     }
1122
1123   /* Is this a syntactically valid UCN?  */
1124   if (CPP_OPTION (pfile, extended_identifiers)
1125       && *buffer->cur == '\\'
1126       && (buffer->cur[1] == 'u' || buffer->cur[1] == 'U'))
1127     {
1128       buffer->cur += 2;
1129       if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
1130                           state))
1131         return true;
1132       buffer->cur -= 2;
1133     }
1134
1135   return false;
1136 }
1137
1138 /* Helper function to get the cpp_hashnode of the identifier BASE.  */
1139 static cpp_hashnode *
1140 lex_identifier_intern (cpp_reader *pfile, const uchar *base)
1141 {
1142   cpp_hashnode *result;
1143   const uchar *cur;
1144   unsigned int len;
1145   unsigned int hash = HT_HASHSTEP (0, *base);
1146
1147   cur = base + 1;
1148   while (ISIDNUM (*cur))
1149     {
1150       hash = HT_HASHSTEP (hash, *cur);
1151       cur++;
1152     }
1153   len = cur - base;
1154   hash = HT_HASHFINISH (hash, len);
1155   result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
1156                                               base, len, hash, HT_ALLOC));
1157
1158   /* Rarely, identifiers require diagnostics when lexed.  */
1159   if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
1160                         && !pfile->state.skipping, 0))
1161     {
1162       /* It is allowed to poison the same identifier twice.  */
1163       if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
1164         cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
1165                    NODE_NAME (result));
1166
1167       /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
1168          replacement list of a variadic macro.  */
1169       if (result == pfile->spec_nodes.n__VA_ARGS__
1170           && !pfile->state.va_args_ok)
1171         cpp_error (pfile, CPP_DL_PEDWARN,
1172                    "__VA_ARGS__ can only appear in the expansion"
1173                    " of a C99 variadic macro");
1174
1175       /* For -Wc++-compat, warn about use of C++ named operators.  */
1176       if (result->flags & NODE_WARN_OPERATOR)
1177         cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
1178                      "identifier \"%s\" is a special operator name in C++",
1179                      NODE_NAME (result));
1180     }
1181
1182   return result;
1183 }
1184
1185 /* Get the cpp_hashnode of an identifier specified by NAME in
1186    the current cpp_reader object.  If none is found, NULL is returned.  */
1187 cpp_hashnode *
1188 _cpp_lex_identifier (cpp_reader *pfile, const char *name)
1189 {
1190   cpp_hashnode *result;
1191   result = lex_identifier_intern (pfile, (uchar *) name);
1192   return result;
1193 }
1194
1195 /* Lex an identifier starting at BUFFER->CUR - 1.  */
1196 static cpp_hashnode *
1197 lex_identifier (cpp_reader *pfile, const uchar *base, bool starts_ucn,
1198                 struct normalize_state *nst)
1199 {
1200   cpp_hashnode *result;
1201   const uchar *cur;
1202   unsigned int len;
1203   unsigned int hash = HT_HASHSTEP (0, *base);
1204
1205   cur = pfile->buffer->cur;
1206   if (! starts_ucn)
1207     while (ISIDNUM (*cur))
1208       {
1209         hash = HT_HASHSTEP (hash, *cur);
1210         cur++;
1211       }
1212   pfile->buffer->cur = cur;
1213   if (starts_ucn || forms_identifier_p (pfile, false, nst))
1214     {
1215       /* Slower version for identifiers containing UCNs (or $).  */
1216       do {
1217         while (ISIDNUM (*pfile->buffer->cur))
1218           {
1219             pfile->buffer->cur++;
1220             NORMALIZE_STATE_UPDATE_IDNUM (nst);
1221           }
1222       } while (forms_identifier_p (pfile, false, nst));
1223       result = _cpp_interpret_identifier (pfile, base,
1224                                           pfile->buffer->cur - base);
1225     }
1226   else
1227     {
1228       len = cur - base;
1229       hash = HT_HASHFINISH (hash, len);
1230
1231       result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
1232                                                   base, len, hash, HT_ALLOC));
1233     }
1234
1235   /* Rarely, identifiers require diagnostics when lexed.  */
1236   if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
1237                         && !pfile->state.skipping, 0))
1238     {
1239       /* It is allowed to poison the same identifier twice.  */
1240       if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
1241         cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
1242                    NODE_NAME (result));
1243
1244       /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
1245          replacement list of a variadic macro.  */
1246       if (result == pfile->spec_nodes.n__VA_ARGS__
1247           && !pfile->state.va_args_ok)
1248         cpp_error (pfile, CPP_DL_PEDWARN,
1249                    "__VA_ARGS__ can only appear in the expansion"
1250                    " of a C99 variadic macro");
1251
1252       /* For -Wc++-compat, warn about use of C++ named operators.  */
1253       if (result->flags & NODE_WARN_OPERATOR)
1254         cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
1255                      "identifier \"%s\" is a special operator name in C++",
1256                      NODE_NAME (result));
1257     }
1258
1259   return result;
1260 }
1261
1262 /* Lex a number to NUMBER starting at BUFFER->CUR - 1.  */
1263 static void
1264 lex_number (cpp_reader *pfile, cpp_string *number,
1265             struct normalize_state *nst)
1266 {
1267   const uchar *cur;
1268   const uchar *base;
1269   uchar *dest;
1270
1271   base = pfile->buffer->cur - 1;
1272   do
1273     {
1274       cur = pfile->buffer->cur;
1275
1276       /* N.B. ISIDNUM does not include $.  */
1277       while (ISIDNUM (*cur) || *cur == '.' || VALID_SIGN (*cur, cur[-1]))
1278         {
1279           cur++;
1280           NORMALIZE_STATE_UPDATE_IDNUM (nst);
1281         }
1282
1283       pfile->buffer->cur = cur;
1284     }
1285   while (forms_identifier_p (pfile, false, nst));
1286
1287   number->len = cur - base;
1288   dest = _cpp_unaligned_alloc (pfile, number->len + 1);
1289   memcpy (dest, base, number->len);
1290   dest[number->len] = '\0';
1291   number->text = dest;
1292 }
1293
1294 /* Create a token of type TYPE with a literal spelling.  */
1295 static void
1296 create_literal (cpp_reader *pfile, cpp_token *token, const uchar *base,
1297                 unsigned int len, enum cpp_ttype type)
1298 {
1299   uchar *dest = _cpp_unaligned_alloc (pfile, len + 1);
1300
1301   memcpy (dest, base, len);
1302   dest[len] = '\0';
1303   token->type = type;
1304   token->val.str.len = len;
1305   token->val.str.text = dest;
1306 }
1307
1308 /* Subroutine of lex_raw_string: Append LEN chars from BASE to the buffer
1309    sequence from *FIRST_BUFF_P to LAST_BUFF_P.  */
1310
1311 static void
1312 bufring_append (cpp_reader *pfile, const uchar *base, size_t len,
1313                 _cpp_buff **first_buff_p, _cpp_buff **last_buff_p)
1314 {
1315   _cpp_buff *first_buff = *first_buff_p;
1316   _cpp_buff *last_buff = *last_buff_p;
1317
1318   if (first_buff == NULL)
1319     first_buff = last_buff = _cpp_get_buff (pfile, len);
1320   else if (len > BUFF_ROOM (last_buff))
1321     {
1322       size_t room = BUFF_ROOM (last_buff);
1323       memcpy (BUFF_FRONT (last_buff), base, room);
1324       BUFF_FRONT (last_buff) += room;
1325       base += room;
1326       len -= room;
1327       last_buff = _cpp_append_extend_buff (pfile, last_buff, len);
1328     }
1329
1330   memcpy (BUFF_FRONT (last_buff), base, len);
1331   BUFF_FRONT (last_buff) += len;
1332
1333   *first_buff_p = first_buff;
1334   *last_buff_p = last_buff;
1335 }
1336
1337 /* Lexes a raw string.  The stored string contains the spelling, including
1338    double quotes, delimiter string, '(' and ')', any leading
1339    'L', 'u', 'U' or 'u8' and 'R' modifier.  It returns the type of the
1340    literal, or CPP_OTHER if it was not properly terminated.
1341
1342    The spelling is NUL-terminated, but it is not guaranteed that this
1343    is the first NUL since embedded NULs are preserved.  */
1344
1345 static void
1346 lex_raw_string (cpp_reader *pfile, cpp_token *token, const uchar *base,
1347                 const uchar *cur)
1348 {
1349   const uchar *raw_prefix;
1350   unsigned int raw_prefix_len = 0;
1351   enum cpp_ttype type;
1352   size_t total_len = 0;
1353   _cpp_buff *first_buff = NULL, *last_buff = NULL;
1354   _cpp_line_note *note = &pfile->buffer->notes[pfile->buffer->cur_note];
1355
1356   type = (*base == 'L' ? CPP_WSTRING :
1357           *base == 'U' ? CPP_STRING32 :
1358           *base == 'u' ? (base[1] == '8' ? CPP_UTF8STRING : CPP_STRING16)
1359           : CPP_STRING);
1360
1361   raw_prefix = cur + 1;
1362   while (raw_prefix_len < 16)
1363     {
1364       switch (raw_prefix[raw_prefix_len])
1365         {
1366         case ' ': case '(': case ')': case '\\': case '\t':
1367         case '\v': case '\f': case '\n': default:
1368           break;
1369         /* Basic source charset except the above chars.  */
1370         case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
1371         case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
1372         case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
1373         case 's': case 't': case 'u': case 'v': case 'w': case 'x':
1374         case 'y': case 'z':
1375         case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
1376         case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
1377         case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
1378         case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
1379         case 'Y': case 'Z':
1380         case '0': case '1': case '2': case '3': case '4': case '5':
1381         case '6': case '7': case '8': case '9':
1382         case '_': case '{': case '}': case '#': case '[': case ']':
1383         case '<': case '>': case '%': case ':': case ';': case '.':
1384         case '?': case '*': case '+': case '-': case '/': case '^':
1385         case '&': case '|': case '~': case '!': case '=': case ',':
1386         case '"': case '\'':
1387           raw_prefix_len++;
1388           continue;
1389         }
1390       break;
1391     }
1392
1393   if (raw_prefix[raw_prefix_len] != '(')
1394     {
1395       int col = CPP_BUF_COLUMN (pfile->buffer, raw_prefix + raw_prefix_len)
1396                 + 1;
1397       if (raw_prefix_len == 16)
1398         cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, col,
1399                              "raw string delimiter longer than 16 characters");
1400       else
1401         cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, col,
1402                              "invalid character '%c' in raw string delimiter",
1403                              (int) raw_prefix[raw_prefix_len]);
1404       pfile->buffer->cur = raw_prefix - 1;
1405       create_literal (pfile, token, base, raw_prefix - 1 - base, CPP_OTHER);
1406       return;
1407     }
1408
1409   cur = raw_prefix + raw_prefix_len + 1;
1410   for (;;)
1411     {
1412 #define BUF_APPEND(STR,LEN)                                     \
1413       do {                                                      \
1414         bufring_append (pfile, (const uchar *)(STR), (LEN),     \
1415                         &first_buff, &last_buff);               \
1416         total_len += (LEN);                                     \
1417       } while (0);
1418
1419       cppchar_t c;
1420
1421       /* If we previously performed any trigraph or line splicing
1422          transformations, undo them within the body of the raw string.  */
1423       while (note->pos < cur)
1424         ++note;
1425       for (; note->pos == cur; ++note)
1426         {
1427           switch (note->type)
1428             {
1429             case '\\':
1430             case ' ':
1431               /* Restore backslash followed by newline.  */
1432               BUF_APPEND (base, cur - base);
1433               base = cur;
1434               BUF_APPEND ("\\", 1);
1435             after_backslash:
1436               if (note->type == ' ')
1437                 {
1438                   /* GNU backslash whitespace newline extension.  FIXME
1439                      could be any sequence of non-vertical space.  When we
1440                      can properly restore any such sequence, we should mark
1441                      this note as handled so _cpp_process_line_notes
1442                      doesn't warn.  */
1443                   BUF_APPEND (" ", 1);
1444                 }
1445
1446               BUF_APPEND ("\n", 1);
1447               break;
1448
1449             case 0:
1450               /* Already handled.  */
1451               break;
1452
1453             default:
1454               if (_cpp_trigraph_map[note->type])
1455                 {
1456                   /* Don't warn about this trigraph in
1457                      _cpp_process_line_notes, since trigraphs show up as
1458                      trigraphs in raw strings.  */
1459                   uchar type = note->type;
1460                   note->type = 0;
1461
1462                   if (!CPP_OPTION (pfile, trigraphs))
1463                     /* If we didn't convert the trigraph in the first
1464                        place, don't do anything now either.  */
1465                     break;
1466
1467                   BUF_APPEND (base, cur - base);
1468                   base = cur;
1469                   BUF_APPEND ("??", 2);
1470
1471                   /* ??/ followed by newline gets two line notes, one for
1472                      the trigraph and one for the backslash/newline.  */
1473                   if (type == '/' && note[1].pos == cur)
1474                     {
1475                       if (note[1].type != '\\'
1476                           && note[1].type != ' ')
1477                         abort ();
1478                       BUF_APPEND ("/", 1);
1479                       ++note;
1480                       goto after_backslash;
1481                     }
1482                   /* The ) from ??) could be part of the suffix.  */
1483                   else if (type == ')'
1484                            && strncmp ((const char *) cur+1,
1485                                        (const char *) raw_prefix,
1486                                        raw_prefix_len) == 0
1487                            && cur[raw_prefix_len+1] == '"')
1488                     {
1489                       BUF_APPEND (")", 1);
1490                       base++;
1491                       cur += raw_prefix_len + 2;
1492                       goto break_outer_loop;
1493                     }
1494                   else
1495                     {
1496                       /* Skip the replacement character.  */
1497                       base = ++cur;
1498                       BUF_APPEND (&type, 1);
1499                     }
1500                 }
1501               else
1502                 abort ();
1503               break;
1504             }
1505         }
1506       c = *cur++;
1507
1508       if (c == ')'
1509           && strncmp ((const char *) cur, (const char *) raw_prefix,
1510                       raw_prefix_len) == 0
1511           && cur[raw_prefix_len] == '"')
1512         {
1513           cur += raw_prefix_len + 1;
1514           break;
1515         }
1516       else if (c == '\n')
1517         {
1518           if (pfile->state.in_directive
1519               || pfile->state.parsing_args
1520               || pfile->state.in_deferred_pragma)
1521             {
1522               cur--;
1523               type = CPP_OTHER;
1524               cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, 0,
1525                                    "unterminated raw string");
1526               break;
1527             }
1528
1529           BUF_APPEND (base, cur - base);
1530
1531           if (pfile->buffer->cur < pfile->buffer->rlimit)
1532             CPP_INCREMENT_LINE (pfile, 0);
1533           pfile->buffer->need_line = true;
1534
1535           pfile->buffer->cur = cur-1;
1536           _cpp_process_line_notes (pfile, false);
1537           if (!_cpp_get_fresh_line (pfile))
1538             {
1539               source_location src_loc = token->src_loc;
1540               token->type = CPP_EOF;
1541               /* Tell the compiler the line number of the EOF token.  */
1542               token->src_loc = pfile->line_table->highest_line;
1543               token->flags = BOL;
1544               if (first_buff != NULL)
1545                 _cpp_release_buff (pfile, first_buff);
1546               cpp_error_with_line (pfile, CPP_DL_ERROR, src_loc, 0,
1547                                    "unterminated raw string");
1548               return;
1549             }
1550
1551           cur = base = pfile->buffer->cur;
1552           note = &pfile->buffer->notes[pfile->buffer->cur_note];
1553         }
1554     }
1555  break_outer_loop:
1556
1557   if (CPP_OPTION (pfile, user_literals))
1558     {
1559       /* According to C++11 [lex.ext]p10, a ud-suffix not starting with an
1560          underscore is ill-formed.  Since this breaks programs using macros
1561          from inttypes.h, we generate a warning and treat the ud-suffix as a
1562          separate preprocessing token.  This approach is under discussion by
1563          the standards committee, and has been adopted as a conforming
1564          extension by other front ends such as clang.
1565          A special exception is made for the suffix 's' which will be
1566          standardized as a user-defined literal suffix for strings.  */
1567       if (ISALPHA (*cur) && *cur != 's')
1568         {
1569           /* Raise a warning, but do not consume subsequent tokens.  */
1570           if (CPP_OPTION (pfile, warn_literal_suffix))
1571             cpp_warning_with_line (pfile, CPP_W_LITERAL_SUFFIX,
1572                                    token->src_loc, 0,
1573                                    "invalid suffix on literal; C++11 requires "
1574                                    "a space between literal and identifier");
1575         }
1576       /* Grab user defined literal suffix.  */
1577       else if (ISIDST (*cur))
1578         {
1579           type = cpp_userdef_string_add_type (type);
1580           ++cur;
1581
1582           while (ISIDNUM (*cur))
1583             ++cur;
1584         }
1585     }
1586
1587   pfile->buffer->cur = cur;
1588   if (first_buff == NULL)
1589     create_literal (pfile, token, base, cur - base, type);
1590   else
1591     {
1592       uchar *dest = _cpp_unaligned_alloc (pfile, total_len + (cur - base) + 1);
1593
1594       token->type = type;
1595       token->val.str.len = total_len + (cur - base);
1596       token->val.str.text = dest;
1597       last_buff = first_buff;
1598       while (last_buff != NULL)
1599         {
1600           memcpy (dest, last_buff->base,
1601                   BUFF_FRONT (last_buff) - last_buff->base);
1602           dest += BUFF_FRONT (last_buff) - last_buff->base;
1603           last_buff = last_buff->next;
1604         }
1605       _cpp_release_buff (pfile, first_buff);
1606       memcpy (dest, base, cur - base);
1607       dest[cur - base] = '\0';
1608     }
1609 }
1610
1611 /* Lexes a string, character constant, or angle-bracketed header file
1612    name.  The stored string contains the spelling, including opening
1613    quote and any leading 'L', 'u', 'U' or 'u8' and optional
1614    'R' modifier.  It returns the type of the literal, or CPP_OTHER
1615    if it was not properly terminated, or CPP_LESS for an unterminated
1616    header name which must be relexed as normal tokens.
1617
1618    The spelling is NUL-terminated, but it is not guaranteed that this
1619    is the first NUL since embedded NULs are preserved.  */
1620 static void
1621 lex_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
1622 {
1623   bool saw_NUL = false;
1624   const uchar *cur;
1625   cppchar_t terminator;
1626   enum cpp_ttype type;
1627
1628   cur = base;
1629   terminator = *cur++;
1630   if (terminator == 'L' || terminator == 'U')
1631     terminator = *cur++;
1632   else if (terminator == 'u')
1633     {
1634       terminator = *cur++;
1635       if (terminator == '8')
1636         terminator = *cur++;
1637     }
1638   if (terminator == 'R')
1639     {
1640       lex_raw_string (pfile, token, base, cur);
1641       return;
1642     }
1643   if (terminator == '"')
1644     type = (*base == 'L' ? CPP_WSTRING :
1645             *base == 'U' ? CPP_STRING32 :
1646             *base == 'u' ? (base[1] == '8' ? CPP_UTF8STRING : CPP_STRING16)
1647                          : CPP_STRING);
1648   else if (terminator == '\'')
1649     type = (*base == 'L' ? CPP_WCHAR :
1650             *base == 'U' ? CPP_CHAR32 :
1651             *base == 'u' ? CPP_CHAR16 : CPP_CHAR);
1652   else
1653     terminator = '>', type = CPP_HEADER_NAME;
1654
1655   for (;;)
1656     {
1657       cppchar_t c = *cur++;
1658
1659       /* In #include-style directives, terminators are not escapable.  */
1660       if (c == '\\' && !pfile->state.angled_headers && *cur != '\n')
1661         cur++;
1662       else if (c == terminator)
1663         break;
1664       else if (c == '\n')
1665         {
1666           cur--;
1667           /* Unmatched quotes always yield undefined behavior, but
1668              greedy lexing means that what appears to be an unterminated
1669              header name may actually be a legitimate sequence of tokens.  */
1670           if (terminator == '>')
1671             {
1672               token->type = CPP_LESS;
1673               return;
1674             }
1675           type = CPP_OTHER;
1676           break;
1677         }
1678       else if (c == '\0')
1679         saw_NUL = true;
1680     }
1681
1682   if (saw_NUL && !pfile->state.skipping)
1683     cpp_error (pfile, CPP_DL_WARNING,
1684                "null character(s) preserved in literal");
1685
1686   if (type == CPP_OTHER && CPP_OPTION (pfile, lang) != CLK_ASM)
1687     cpp_error (pfile, CPP_DL_PEDWARN, "missing terminating %c character",
1688                (int) terminator);
1689
1690   if (CPP_OPTION (pfile, user_literals))
1691     {
1692       /* According to C++11 [lex.ext]p10, a ud-suffix not starting with an
1693          underscore is ill-formed.  Since this breaks programs using macros
1694          from inttypes.h, we generate a warning and treat the ud-suffix as a
1695          separate preprocessing token.  This approach is under discussion by
1696          the standards committee, and has been adopted as a conforming
1697          extension by other front ends such as clang.
1698          A special exception is made for the suffix 's' which will be
1699          standardized as a user-defined literal suffix for strings.  */
1700       if (ISALPHA (*cur) && *cur != 's')
1701         {
1702           /* Raise a warning, but do not consume subsequent tokens.  */
1703           if (CPP_OPTION (pfile, warn_literal_suffix))
1704             cpp_warning_with_line (pfile, CPP_W_LITERAL_SUFFIX,
1705                                    token->src_loc, 0,
1706                                    "invalid suffix on literal; C++11 requires "
1707                                    "a space between literal and identifier");
1708         }
1709       /* Grab user defined literal suffix.  */
1710       else if (ISIDST (*cur))
1711         {
1712           type = cpp_userdef_char_add_type (type);
1713           type = cpp_userdef_string_add_type (type);
1714           ++cur;
1715
1716           while (ISIDNUM (*cur))
1717             ++cur;
1718         }
1719     }
1720
1721   pfile->buffer->cur = cur;
1722   create_literal (pfile, token, base, cur - base, type);
1723 }
1724
1725 /* Return the comment table. The client may not make any assumption
1726    about the ordering of the table.  */
1727 cpp_comment_table *
1728 cpp_get_comments (cpp_reader *pfile)
1729 {
1730   return &pfile->comments;
1731 }
1732
1733 /* Append a comment to the end of the comment table. */
1734 static void
1735 store_comment (cpp_reader *pfile, cpp_token *token)
1736 {
1737   int len;
1738
1739   if (pfile->comments.allocated == 0)
1740     {
1741       pfile->comments.allocated = 256;
1742       pfile->comments.entries = (cpp_comment *) xmalloc
1743         (pfile->comments.allocated * sizeof (cpp_comment));
1744     }
1745
1746   if (pfile->comments.count == pfile->comments.allocated)
1747     {
1748       pfile->comments.allocated *= 2;
1749       pfile->comments.entries = (cpp_comment *) xrealloc
1750         (pfile->comments.entries,
1751          pfile->comments.allocated * sizeof (cpp_comment));
1752     }
1753
1754   len = token->val.str.len;
1755
1756   /* Copy comment. Note, token may not be NULL terminated. */
1757   pfile->comments.entries[pfile->comments.count].comment =
1758     (char *) xmalloc (sizeof (char) * (len + 1));
1759   memcpy (pfile->comments.entries[pfile->comments.count].comment,
1760           token->val.str.text, len);
1761   pfile->comments.entries[pfile->comments.count].comment[len] = '\0';
1762
1763   /* Set source location. */
1764   pfile->comments.entries[pfile->comments.count].sloc = token->src_loc;
1765
1766   /* Increment the count of entries in the comment table. */
1767   pfile->comments.count++;
1768 }
1769
1770 /* The stored comment includes the comment start and any terminator.  */
1771 static void
1772 save_comment (cpp_reader *pfile, cpp_token *token, const unsigned char *from,
1773               cppchar_t type)
1774 {
1775   unsigned char *buffer;
1776   unsigned int len, clen, i;
1777
1778   len = pfile->buffer->cur - from + 1; /* + 1 for the initial '/'.  */
1779
1780   /* C++ comments probably (not definitely) have moved past a new
1781      line, which we don't want to save in the comment.  */
1782   if (is_vspace (pfile->buffer->cur[-1]))
1783     len--;
1784
1785   /* If we are currently in a directive or in argument parsing, then
1786      we need to store all C++ comments as C comments internally, and
1787      so we need to allocate a little extra space in that case.
1788
1789      Note that the only time we encounter a directive here is
1790      when we are saving comments in a "#define".  */
1791   clen = ((pfile->state.in_directive || pfile->state.parsing_args)
1792           && type == '/') ? len + 2 : len;
1793
1794   buffer = _cpp_unaligned_alloc (pfile, clen);
1795
1796   token->type = CPP_COMMENT;
1797   token->val.str.len = clen;
1798   token->val.str.text = buffer;
1799
1800   buffer[0] = '/';
1801   memcpy (buffer + 1, from, len - 1);
1802
1803   /* Finish conversion to a C comment, if necessary.  */
1804   if ((pfile->state.in_directive || pfile->state.parsing_args) && type == '/')
1805     {
1806       buffer[1] = '*';
1807       buffer[clen - 2] = '*';
1808       buffer[clen - 1] = '/';
1809       /* As there can be in a C++ comments illegal sequences for C comments
1810          we need to filter them out.  */
1811       for (i = 2; i < (clen - 2); i++)
1812         if (buffer[i] == '/' && (buffer[i - 1] == '*' || buffer[i + 1] == '*'))
1813           buffer[i] = '|';
1814     }
1815
1816   /* Finally store this comment for use by clients of libcpp. */
1817   store_comment (pfile, token);
1818 }
1819
1820 /* Allocate COUNT tokens for RUN.  */
1821 void
1822 _cpp_init_tokenrun (tokenrun *run, unsigned int count)
1823 {
1824   run->base = XNEWVEC (cpp_token, count);
1825   run->limit = run->base + count;
1826   run->next = NULL;
1827 }
1828
1829 /* Returns the next tokenrun, or creates one if there is none.  */
1830 static tokenrun *
1831 next_tokenrun (tokenrun *run)
1832 {
1833   if (run->next == NULL)
1834     {
1835       run->next = XNEW (tokenrun);
1836       run->next->prev = run;
1837       _cpp_init_tokenrun (run->next, 250);
1838     }
1839
1840   return run->next;
1841 }
1842
1843 /* Return the number of not yet processed token in a given
1844    context.  */
1845 int
1846 _cpp_remaining_tokens_num_in_context (cpp_context *context)
1847 {
1848   if (context->tokens_kind == TOKENS_KIND_DIRECT)
1849     return (LAST (context).token - FIRST (context).token);
1850   else if (context->tokens_kind == TOKENS_KIND_INDIRECT
1851            || context->tokens_kind == TOKENS_KIND_EXTENDED)
1852     return (LAST (context).ptoken - FIRST (context).ptoken);
1853   else
1854       abort ();
1855 }
1856
1857 /* Returns the token present at index INDEX in a given context.  If
1858    INDEX is zero, the next token to be processed is returned.  */
1859 static const cpp_token*
1860 _cpp_token_from_context_at (cpp_context *context, int index)
1861 {
1862   if (context->tokens_kind == TOKENS_KIND_DIRECT)
1863     return &(FIRST (context).token[index]);
1864   else if (context->tokens_kind == TOKENS_KIND_INDIRECT
1865            || context->tokens_kind == TOKENS_KIND_EXTENDED)
1866     return FIRST (context).ptoken[index];
1867  else
1868    abort ();
1869 }
1870
1871 /* Look ahead in the input stream.  */
1872 const cpp_token *
1873 cpp_peek_token (cpp_reader *pfile, int index)
1874 {
1875   cpp_context *context = pfile->context;
1876   const cpp_token *peektok;
1877   int count;
1878
1879   /* First, scan through any pending cpp_context objects.  */
1880   while (context->prev)
1881     {
1882       ptrdiff_t sz = _cpp_remaining_tokens_num_in_context (context);
1883
1884       if (index < (int) sz)
1885         return _cpp_token_from_context_at (context, index);
1886       index -= (int) sz;
1887       context = context->prev;
1888     }
1889
1890   /* We will have to read some new tokens after all (and do so
1891      without invalidating preceding tokens).  */
1892   count = index;
1893   pfile->keep_tokens++;
1894
1895   do
1896     {
1897       peektok = _cpp_lex_token (pfile);
1898       if (peektok->type == CPP_EOF)
1899         return peektok;
1900     }
1901   while (index--);
1902
1903   _cpp_backup_tokens_direct (pfile, count + 1);
1904   pfile->keep_tokens--;
1905
1906   return peektok;
1907 }
1908
/* Allocate a single token that is invalidated at the same time as the
   rest of the tokens on the line.  Has its source location set to the
   same as the last lexed token, so that diagnostics appear in the
   right place.

   The new token occupies the slot at pfile->cur_token.  Any lookahead
   tokens already stored from that slot onwards are first shifted up by
   one position, spilling into the following token run when the current
   run has no room left.  */
cpp_token *
_cpp_temp_token (cpp_reader *pfile)
{
  cpp_token *old, *result;
  /* SZ: slots left in the current run, counting from cur_token.
     LA: number of pending lookahead tokens.  */
  ptrdiff_t sz = pfile->cur_run->limit - pfile->cur_token;
  ptrdiff_t la = (ptrdiff_t) pfile->lookaheads;

  /* The token just before cur_token supplies the source location.  */
  old = pfile->cur_token - 1;
  /* Any pre-existing lookaheads must not be clobbered.  */
  if (la)
    {
      /* The lookaheads reach the end of the current run (or beyond),
         so shifting them by one needs space in the next run.  */
      if (sz <= la)
        {
          tokenrun *next = next_tokenrun (pfile->cur_run);

          /* Shift the LA - SZ lookaheads that already live in the next
             run up by one slot.  */
          if (sz < la)
            memmove (next->base + 1, next->base,
                     (la - sz) * sizeof (cpp_token));

          /* The last token of the current run moves to the start of
             the next run.  */
          next->base[0] = pfile->cur_run->limit[-1];
        }

      /* Shift the lookaheads that stay within the current run; at most
         SZ - 1 of them fit once the new token has taken a slot.  */
      if (sz > 1)
        memmove (pfile->cur_token + 1, pfile->cur_token,
                 MIN (la, sz - 1) * sizeof (cpp_token));
    }

  /* With no room left in the current run, the new token goes at the
     start of the next one.  */
  if (!sz && pfile->cur_token == pfile->cur_run->limit)
    {
      pfile->cur_run = next_tokenrun (pfile->cur_run);
      pfile->cur_token = pfile->cur_run->base;
    }

  result = pfile->cur_token++;
  result->src_loc = old->src_loc;
  return result;
}
1950
/* Lex a token and return a pointer to it (external interface).  Takes
   care of issues like directive handling, token lookahead, multiple
   include optimization and skipping.

   The function loops until it has a token to hand back: a token is
   taken from the pending lookaheads if there are any, otherwise from
   _cpp_lex_direct.  Directives found at the beginning of a line are
   processed here, and tokens lexed while pfile->state.skipping is set
   are discarded unless they are CPP_EOF.  */
const cpp_token *
_cpp_lex_token (cpp_reader *pfile)
{
  cpp_token *result;

  for (;;)
    {
      /* Move on to the next token run once the current one is full.  */
      if (pfile->cur_token == pfile->cur_run->limit)
        {
          pfile->cur_run = next_tokenrun (pfile->cur_run);
          pfile->cur_token = pfile->cur_run->base;
        }
      /* We assume that the current token is somewhere in the current
         run.  */
      if (pfile->cur_token < pfile->cur_run->base
          || pfile->cur_token >= pfile->cur_run->limit)
        abort ();

      /* Tokens that were lexed earlier and then backed up are re-used
         before any new input is read.  */
      if (pfile->lookaheads)
        {
          pfile->lookaheads--;
          result = pfile->cur_token++;
        }
      else
        result = _cpp_lex_direct (pfile);

      /* BOL marks a token that begins a line.  */
      if (result->flags & BOL)
        {
          /* Is this a directive.  If _cpp_handle_directive returns
             false, it is an assembler #.  */
          if (result->type == CPP_HASH
              /* 6.10.3 p 11: Directives in a list of macro arguments
                 gives undefined behavior.  This implementation
                 handles the directive as normal.  */
              && pfile->state.parsing_args != 1)
            {
              if (_cpp_handle_directive (pfile, result->flags & PREV_WHITE))
                {
                  /* A CPP_PADDING directive result means the directive
                     produced no token to return; lex the next one.  */
                  if (pfile->directive_result.type == CPP_PADDING)
                    continue;
                  result = &pfile->directive_result;
                }
            }
          else if (pfile->state.in_deferred_pragma)
            result = &pfile->directive_result;

          /* Notify the client of the new line, unless skipping.  */
          if (pfile->cb.line_change && !pfile->state.skipping)
            pfile->cb.line_change (pfile, result, pfile->state.parsing_args);
        }

      /* We don't skip tokens in directives.  */
      if (pfile->state.in_directive || pfile->state.in_deferred_pragma)
        break;

      /* Outside a directive, invalidate controlling macros.  At file
         EOF, _cpp_lex_direct takes care of popping the buffer, so we never
         get here and MI optimization works.  */
      pfile->mi_valid = false;

      /* Return the token unless it is being skipped; CPP_EOF is always
         returned.  */
      if (!pfile->state.skipping || result->type == CPP_EOF)
        break;
    }

  return result;
}
2019
2020 /* Returns true if a fresh line has been loaded.  */
2021 bool
2022 _cpp_get_fresh_line (cpp_reader *pfile)
2023 {
2024   int return_at_eof;
2025
2026   /* We can't get a new line until we leave the current directive.  */
2027   if (pfile->state.in_directive)
2028     return false;
2029
2030   for (;;)
2031     {
2032       cpp_buffer *buffer = pfile->buffer;
2033
2034       if (!buffer->need_line)
2035         return true;
2036
2037       if (buffer->next_line < buffer->rlimit)
2038         {
2039           _cpp_clean_line (pfile);
2040           return true;
2041         }
2042
2043       /* First, get out of parsing arguments state.  */
2044       if (pfile->state.parsing_args)
2045         return false;
2046
2047       /* End of buffer.  Non-empty files should end in a newline.  */
2048       if (buffer->buf != buffer->rlimit
2049           && buffer->next_line > buffer->rlimit
2050           && !buffer->from_stage3)
2051         {
2052           /* Clip to buffer size.  */
2053           buffer->next_line = buffer->rlimit;
2054         }
2055
2056       return_at_eof = buffer->return_at_eof;
2057       _cpp_pop_buffer (pfile);
2058       if (pfile->buffer == NULL || return_at_eof)
2059         return false;
2060     }
2061 }
2062
/* Helper for two-character operators: if the next input character is
   CHAR, consume it and give the token type THEN_TYPE; otherwise leave
   the input alone and give it ELSE_TYPE.  Expects variables named
   `buffer' and `result' to be in scope where the macro is used.  */
#define IF_NEXT_IS(CHAR, THEN_TYPE, ELSE_TYPE)          \
  do                                                    \
    {                                                   \
      if (*buffer->cur == CHAR)                         \
        {                                               \
          buffer->cur++;                                \
          result->type = THEN_TYPE;                     \
        }                                               \
      else                                              \
        result->type = ELSE_TYPE;                       \
    }                                                   \
  while (0)
2071
/* Lex a token into pfile->cur_token, which is also incremented, to
   get diagnostics pointing to the correct location.

   Does not handle issues such as token lookahead, multiple-include
   optimization, directives, skipping etc.  This function is only
   suitable for use by _cpp_lex_token, and in special cases like
   lex_expansion_token which doesn't care for any of these issues.

   When meeting a newline, returns CPP_EOF if parsing a directive,
   otherwise returns to the start of the token buffer if permissible.
   Returns a pointer to the lexed token, whose src_loc field holds its
   location.  */
cpp_token *
_cpp_lex_direct (cpp_reader *pfile)
{
  cppchar_t c;
  cpp_buffer *buffer;
  const unsigned char *comment_start;
  /* Claim the next slot of the current token run for this token.  */
  cpp_token *result = pfile->cur_token++;

  /* Entered at the start of lexing and again after each newline.  */
 fresh_line:
  result->flags = 0;
  buffer = pfile->buffer;
  if (buffer->need_line)
    {
      /* The end of the line terminates a deferred pragma.  */
      if (pfile->state.in_deferred_pragma)
        {
          result->type = CPP_PRAGMA_EOL;
          pfile->state.in_deferred_pragma = false;
          if (!pfile->state.pragma_allow_expansion)
            pfile->state.prevent_expansion--;
          return result;
        }
      /* No further line could be obtained: report CPP_EOF.  */
      if (!_cpp_get_fresh_line (pfile))
        {
          result->type = CPP_EOF;
          if (!pfile->state.in_directive)
            {
              /* Tell the compiler the line number of the EOF token.  */
              result->src_loc = pfile->line_table->highest_line;
              result->flags = BOL;
            }
          return result;
        }
      /* Unless earlier tokens must be kept, start over at the beginning
         of the base token run so the token buffer gets reused.  */
      if (!pfile->keep_tokens)
        {
          pfile->cur_run = &pfile->base_run;
          result = pfile->base_run.base;
          pfile->cur_token = result + 1;
        }
      /* This token is the first on its line.  */
      result->flags = BOL;
      if (pfile->state.parsing_args == 2)
        result->flags |= PREV_WHITE;
    }
  /* _cpp_get_fresh_line may have popped buffers, so re-read it.  */
  buffer = pfile->buffer;
  /* Re-entered after a comment that is not being saved.  */
 update_tokens_line:
  result->src_loc = pfile->line_table->highest_line;

  /* Re-entered after horizontal whitespace has been skipped.  */
 skipped_white:
  if (buffer->cur >= buffer->notes[buffer->cur_note].pos
      && !pfile->overlaid_buffer)
    {
      _cpp_process_line_notes (pfile, false);
      result->src_loc = pfile->line_table->highest_line;
    }
  /* Fetch the character that decides the token kind; buffer->cur now
     points just past it.  */
  c = *buffer->cur++;

  if (pfile->forced_token_location_p)
    result->src_loc = *pfile->forced_token_location_p;
  else
    result->src_loc = linemap_position_for_column (pfile->line_table,
                                          CPP_BUF_COLUMN (buffer, buffer->cur));

  switch (c)
    {
    case ' ': case '\t': case '\f': case '\v': case '\0':
      result->flags |= PREV_WHITE;
      skip_whitespace (pfile, c);
      goto skipped_white;

    case '\n':
      if (buffer->cur < buffer->rlimit)
        CPP_INCREMENT_LINE (pfile, 0);
      buffer->need_line = true;
      goto fresh_line;

    case '0': case '1': case '2': case '3': case '4':
    case '5': case '6': case '7': case '8': case '9':
      {
        struct normalize_state nst = INITIAL_NORMALIZE_STATE;
        result->type = CPP_NUMBER;
        lex_number (pfile, &result->val.str, &nst);
        warn_about_normalization (pfile, result, &nst);
        break;
      }

    case 'L':
    case 'u':
    case 'U':
    case 'R':
      /* 'L', 'u', 'U', 'u8' or 'R' may introduce wide characters,
         wide strings or raw strings.  */
      if (c == 'L' || CPP_OPTION (pfile, rliterals)
          || (c != 'R' && CPP_OPTION (pfile, uliterals)))
        {
          if ((*buffer->cur == '\'' && c != 'R')
              || *buffer->cur == '"'
              || (*buffer->cur == 'R'
                  && c != 'R'
                  && buffer->cur[1] == '"'
                  && CPP_OPTION (pfile, rliterals))
              || (*buffer->cur == '8'
                  && c == 'u'
                  && (buffer->cur[1] == '"'
                      || (buffer->cur[1] == 'R' && buffer->cur[2] == '"'
                          && CPP_OPTION (pfile, rliterals)))))
            {
              lex_string (pfile, result, buffer->cur - 1);
              break;
            }
        }
      /* Fall through.  */

      /* Ordinary identifier start characters.  'L', 'u', 'U' and 'R'
         are absent from this list because the case above handles them
         and falls through to here when no literal follows.  */
    case '_':
    case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
    case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
    case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
    case 's': case 't':           case 'v': case 'w': case 'x':
    case 'y': case 'z':
    case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
    case 'G': case 'H': case 'I': case 'J': case 'K':
    case 'M': case 'N': case 'O': case 'P': case 'Q':
    case 'S': case 'T':           case 'V': case 'W': case 'X':
    case 'Y': case 'Z':
      result->type = CPP_NAME;
      {
        struct normalize_state nst = INITIAL_NORMALIZE_STATE;
        result->val.node.node = lex_identifier (pfile, buffer->cur - 1, false,
                                                &nst);
        warn_about_normalization (pfile, result, &nst);
      }

      /* Convert named operators to their proper types.  */
      if (result->val.node.node->flags & NODE_OPERATOR)
        {
          result->flags |= NAMED_OP;
          result->type = (enum cpp_ttype) result->val.node.node->directive_index;
        }
      break;

    case '\'':
    case '"':
      lex_string (pfile, result, buffer->cur - 1);
      break;

    case '/':
      /* A potential block or line comment.  */
      comment_start = buffer->cur;
      c = *buffer->cur;

      if (c == '*')
        {
          if (_cpp_skip_block_comment (pfile))
            cpp_error (pfile, CPP_DL_ERROR, "unterminated comment");
        }
      else if (c == '/' && (CPP_OPTION (pfile, cplusplus_comments)
                            || cpp_in_system_header (pfile)))
        {
          /* Warn about comments only if pedantically GNUC89, and not
             in system headers.  */
          if (CPP_OPTION (pfile, lang) == CLK_GNUC89 && CPP_PEDANTIC (pfile)
              && ! buffer->warned_cplusplus_comments)
            {
              cpp_error (pfile, CPP_DL_PEDWARN,
                         "C++ style comments are not allowed in ISO C90");
              cpp_error (pfile, CPP_DL_PEDWARN,
                         "(this will be reported only once per input file)");
              buffer->warned_cplusplus_comments = 1;
            }

          if (skip_line_comment (pfile) && CPP_OPTION (pfile, warn_comments))
            cpp_warning (pfile, CPP_W_COMMENTS, "multi-line comment");
        }
      else if (c == '=')
        {
          buffer->cur++;
          result->type = CPP_DIV_EQ;
          break;
        }
      else
        {
          result->type = CPP_DIV;
          break;
        }

      /* Only the two comment branches reach this point.  A comment
         that is not being saved counts as whitespace: lexing resumes
         with the token that follows it.  */
      if (!pfile->state.save_comments)
        {
          result->flags |= PREV_WHITE;
          goto update_tokens_line;
        }

      /* Save the comment as a token in its own right.  */
      save_comment (pfile, result, comment_start, c);
      break;

    case '<':
      /* When angled header names are expected, let lex_string try
         first; if it produced anything other than CPP_LESS that token
         stands, otherwise the ordinary '<' handling below applies.  */
      if (pfile->state.angled_headers)
        {
          lex_string (pfile, result, buffer->cur - 1);
          if (result->type != CPP_LESS)
            break;
        }

      result->type = CPP_LESS;
      if (*buffer->cur == '=')
        buffer->cur++, result->type = CPP_LESS_EQ;
      else if (*buffer->cur == '<')
        {
          buffer->cur++;
          IF_NEXT_IS ('=', CPP_LSHIFT_EQ, CPP_LSHIFT);
        }
      else if (CPP_OPTION (pfile, digraphs))
        {
          if (*buffer->cur == ':')
            {
              /* C++11 [2.5/3 lex.pptoken], "Otherwise, if the next
                 three characters are <:: and the subsequent character
                 is neither : nor >, the < is treated as a preprocessor
                 token by itself".  */
              if (CPP_OPTION (pfile, cplusplus)
                  && (CPP_OPTION (pfile, lang) == CLK_CXX11
                      || CPP_OPTION (pfile, lang) == CLK_GNUCXX11)
                  && buffer->cur[1] == ':'
                  && buffer->cur[2] != ':' && buffer->cur[2] != '>')
                break;

              buffer->cur++;
              result->flags |= DIGRAPH;
              result->type = CPP_OPEN_SQUARE;
            }
          else if (*buffer->cur == '%')
            {
              buffer->cur++;
              result->flags |= DIGRAPH;
              result->type = CPP_OPEN_BRACE;
            }
        }
      break;

    case '>':
      result->type = CPP_GREATER;
      if (*buffer->cur == '=')
        buffer->cur++, result->type = CPP_GREATER_EQ;
      else if (*buffer->cur == '>')
        {
          buffer->cur++;
          IF_NEXT_IS ('=', CPP_RSHIFT_EQ, CPP_RSHIFT);
        }
      break;

    case '%':
      result->type = CPP_MOD;
      if (*buffer->cur == '=')
        buffer->cur++, result->type = CPP_MOD_EQ;
      else if (CPP_OPTION (pfile, digraphs))
        {
          if (*buffer->cur == ':')
            {
              /* "%:" is the digraph for '#', and "%:%:" the one for
                 "##".  */
              buffer->cur++;
              result->flags |= DIGRAPH;
              result->type = CPP_HASH;
              if (*buffer->cur == '%' && buffer->cur[1] == ':')
                buffer->cur += 2, result->type = CPP_PASTE, result->val.token_no = 0;
            }
          else if (*buffer->cur == '>')
            {
              buffer->cur++;
              result->flags |= DIGRAPH;
              result->type = CPP_CLOSE_BRACE;
            }
        }
      break;

    case '.':
      result->type = CPP_DOT;
      /* A '.' followed by a digit starts a number.  */
      if (ISDIGIT (*buffer->cur))
        {
          struct normalize_state nst = INITIAL_NORMALIZE_STATE;
          result->type = CPP_NUMBER;
          lex_number (pfile, &result->val.str, &nst);
          warn_about_normalization (pfile, result, &nst);
        }
      else if (*buffer->cur == '.' && buffer->cur[1] == '.')
        buffer->cur += 2, result->type = CPP_ELLIPSIS;
      else if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
        buffer->cur++, result->type = CPP_DOT_STAR;
      break;

    case '+':
      result->type = CPP_PLUS;
      if (*buffer->cur == '+')
        buffer->cur++, result->type = CPP_PLUS_PLUS;
      else if (*buffer->cur == '=')
        buffer->cur++, result->type = CPP_PLUS_EQ;
      break;

    case '-':
      result->type = CPP_MINUS;
      if (*buffer->cur == '>')
        {
          buffer->cur++;
          result->type = CPP_DEREF;
          if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
            buffer->cur++, result->type = CPP_DEREF_STAR;
        }
      else if (*buffer->cur == '-')
        buffer->cur++, result->type = CPP_MINUS_MINUS;
      else if (*buffer->cur == '=')
        buffer->cur++, result->type = CPP_MINUS_EQ;
      break;

    case '&':
      result->type = CPP_AND;
      if (*buffer->cur == '&')
        buffer->cur++, result->type = CPP_AND_AND;
      else if (*buffer->cur == '=')
        buffer->cur++, result->type = CPP_AND_EQ;
      break;

    case '|':
      result->type = CPP_OR;
      if (*buffer->cur == '|')
        buffer->cur++, result->type = CPP_OR_OR;
      else if (*buffer->cur == '=')
        buffer->cur++, result->type = CPP_OR_EQ;
      break;

    case ':':
      result->type = CPP_COLON;
      if (*buffer->cur == ':' && CPP_OPTION (pfile, cplusplus))
        buffer->cur++, result->type = CPP_SCOPE;
      else if (*buffer->cur == '>' && CPP_OPTION (pfile, digraphs))
        {
          buffer->cur++;
          result->flags |= DIGRAPH;
          result->type = CPP_CLOSE_SQUARE;
        }
      break;

    case '*': IF_NEXT_IS ('=', CPP_MULT_EQ, CPP_MULT); break;
    case '=': IF_NEXT_IS ('=', CPP_EQ_EQ, CPP_EQ); break;
    case '!': IF_NEXT_IS ('=', CPP_NOT_EQ, CPP_NOT); break;
    case '^': IF_NEXT_IS ('=', CPP_XOR_EQ, CPP_XOR); break;
    case '#': IF_NEXT_IS ('#', CPP_PASTE, CPP_HASH); result->val.token_no = 0; break;

    case '?': result->type = CPP_QUERY; break;
    case '~': result->type = CPP_COMPL; break;
    case ',': result->type = CPP_COMMA; break;
    case '(': result->type = CPP_OPEN_PAREN; break;
    case ')': result->type = CPP_CLOSE_PAREN; break;
    case '[': result->type = CPP_OPEN_SQUARE; break;
    case ']': result->type = CPP_CLOSE_SQUARE; break;
    case '{': result->type = CPP_OPEN_BRACE; break;
    case '}': result->type = CPP_CLOSE_BRACE; break;
    case ';': result->type = CPP_SEMICOLON; break;

      /* @ is a punctuator in Objective-C.  */
    case '@': result->type = CPP_ATSIGN; break;

    case '$':
    case '\\':
      {
        /* Step back onto the character so that forms_identifier_p can
           examine it; if it does start an identifier, lex one from
           there.  */
        const uchar *base = --buffer->cur;
        struct normalize_state nst = INITIAL_NORMALIZE_STATE;

        if (forms_identifier_p (pfile, true, &nst))
          {
            result->type = CPP_NAME;
            result->val.node.node = lex_identifier (pfile, base, true, &nst);
            warn_about_normalization (pfile, result, &nst);
            break;
          }
        /* Not an identifier: step past the character again.  */
        buffer->cur++;
      }
      /* Fall through.  */

    default:
      /* Any other character becomes a one-character CPP_OTHER token.  */
      create_literal (pfile, result, buffer->cur - 1, 1, CPP_OTHER);
      break;
    }

  return result;
}
2463
2464 /* An upper bound on the number of bytes needed to spell TOKEN.
2465    Does not include preceding whitespace.  */
2466 unsigned int
2467 cpp_token_len (const cpp_token *token)
2468 {
2469   unsigned int len;
2470
2471   switch (TOKEN_SPELL (token))
2472     {
2473     default:            len = 6;                                break;
2474     case SPELL_LITERAL: len = token->val.str.len;               break;
2475     case SPELL_IDENT:   len = NODE_LEN (token->val.node.node) * 10;     break;
2476     }
2477
2478   return len;
2479 }
2480
/* Decode the UTF-8 sequence that starts at NAME and write it to BUFFER
   as a \U escape with eight lower-case hexadecimal digits.  Exactly 10
   bytes are written to BUFFER, with no terminating NUL.  Returns the
   number of bytes of NAME the sequence occupies.  Aborts if a
   continuation byte is ill-formed.  */

static size_t
utf8_to_ucn (unsigned char *buffer, const unsigned char *name)
{
  static const char hex_digits[] = "0123456789abcdef";
  unsigned int probe = *name;
  unsigned long code_point;
  int seq_len = 0;
  int consumed;
  int shift;

  /* The number of leading one bits in the first byte gives the length
     of the sequence.  */
  while (probe & 0x80)
    {
      seq_len++;
      probe <<= 1;
    }

  /* The payload of the first byte is whatever follows those bits.  */
  code_point = *name & (0x7F >> seq_len);

  /* Each continuation byte must look like 10xxxxxx and contributes its
     low six bits.  */
  for (consumed = 1; consumed < seq_len; consumed++)
    {
      unsigned char trail = name[consumed];

      if ((trail & 0xC0) != 0x80)
        abort ();

      code_point = (code_point << 6) | (trail & 0x3F);
    }

  buffer[0] = '\\';
  buffer[1] = 'U';
  buffer += 2;

  /* Emit the eight nibbles, most significant first.  */
  for (shift = 28; shift >= 0; shift -= 4)
    *buffer++ = hex_digits[(code_point >> shift) & 0xF];

  return seq_len;
}
2514
2515 /* Given a token TYPE corresponding to a digraph, return a pointer to
2516    the spelling of the digraph.  */
2517 static const unsigned char *
2518 cpp_digraph2name (enum cpp_ttype type)
2519 {
2520   return digraph_spellings[(int) type - (int) CPP_FIRST_DIGRAPH];
2521 }
2522
2523 /* Write the spelling of a token TOKEN to BUFFER.  The buffer must
2524    already contain the enough space to hold the token's spelling.
2525    Returns a pointer to the character after the last character written.
2526    FORSTRING is true if this is to be the spelling after translation
2527    phase 1 (this is different for UCNs).
2528    FIXME: Would be nice if we didn't need the PFILE argument.  */
2529 unsigned char *
2530 cpp_spell_token (cpp_reader *pfile, const cpp_token *token,
2531                  unsigned char *buffer, bool forstring)
2532 {
2533   switch (TOKEN_SPELL (token))
2534     {
2535     case SPELL_OPERATOR:
2536       {
2537         const unsigned char *spelling;
2538         unsigned char c;
2539
2540         if (token->flags & DIGRAPH)
2541           spelling = cpp_digraph2name (token->type);
2542         else if (token->flags & NAMED_OP)
2543           goto spell_ident;
2544         else
2545           spelling = TOKEN_NAME (token);
2546
2547         while ((c = *spelling++) != '\0')
2548           *buffer++ = c;
2549       }
2550       break;
2551
2552     spell_ident:
2553     case SPELL_IDENT:
2554       if (forstring)
2555         {
2556           memcpy (buffer, NODE_NAME (token->val.node.node),
2557                   NODE_LEN (token->val.node.node));
2558           buffer += NODE_LEN (token->val.node.node);
2559         }
2560       else
2561         {
2562           size_t i;
2563           const unsigned char * name = NODE_NAME (token->val.node.node);
2564
2565           for (i = 0; i < NODE_LEN (token->val.node.node); i++)
2566             if (name[i] & ~0x7F)
2567               {
2568                 i += utf8_to_ucn (buffer, name + i) - 1;
2569                 buffer += 10;
2570               }
2571             else
2572               *buffer++ = NODE_NAME (token->val.node.node)[i];
2573         }
2574       break;
2575
2576     case SPELL_LITERAL:
2577       memcpy (buffer, token->val.str.text, token->val.str.len);
2578       buffer += token->val.str.len;
2579       break;
2580
2581     case SPELL_NONE:
2582       cpp_error (pfile, CPP_DL_ICE,
2583                  "unspellable token %s", TOKEN_NAME (token));
2584       break;
2585     }
2586
2587   return buffer;
2588 }
2589
2590 /* Returns TOKEN spelt as a null-terminated string.  The string is
2591    freed when the reader is destroyed.  Useful for diagnostics.  */
2592 unsigned char *
2593 cpp_token_as_text (cpp_reader *pfile, const cpp_token *token)
2594 {
2595   unsigned int len = cpp_token_len (token) + 1;
2596   unsigned char *start = _cpp_unaligned_alloc (pfile, len), *end;
2597
2598   end = cpp_spell_token (pfile, token, start, false);
2599   end[0] = '\0';
2600
2601   return start;
2602 }
2603
2604 /* Returns a pointer to a string which spells the token defined by
2605    TYPE and FLAGS.  Used by C front ends, which really should move to
2606    using cpp_token_as_text.  */
2607 const char *
2608 cpp_type2name (enum cpp_ttype type, unsigned char flags)
2609 {
2610   if (flags & DIGRAPH)
2611     return (const char *) cpp_digraph2name (type);
2612   else if (flags & NAMED_OP)
2613     return cpp_named_operator2name (type);
2614
2615   return (const char *) token_spellings[type].name;
2616 }
2617
2618 /* Writes the spelling of token to FP, without any preceding space.
2619    Separated from cpp_spell_token for efficiency - to avoid stdio
2620    double-buffering.  */
2621 void
2622 cpp_output_token (const cpp_token *token, FILE *fp)
2623 {
2624   switch (TOKEN_SPELL (token))
2625     {
2626     case SPELL_OPERATOR:
2627       {
2628         const unsigned char *spelling;
2629         int c;
2630
2631         if (token->flags & DIGRAPH)
2632           spelling = cpp_digraph2name (token->type);
2633         else if (token->flags & NAMED_OP)
2634           goto spell_ident;
2635         else
2636           spelling = TOKEN_NAME (token);
2637
2638         c = *spelling;
2639         do
2640           putc (c, fp);
2641         while ((c = *++spelling) != '\0');
2642       }
2643       break;
2644
2645     spell_ident:
2646     case SPELL_IDENT:
2647       {
2648         size_t i;
2649         const unsigned char * name = NODE_NAME (token->val.node.node);
2650
2651         for (i = 0; i < NODE_LEN (token->val.node.node); i++)
2652           if (name[i] & ~0x7F)
2653             {
2654               unsigned char buffer[10];
2655               i += utf8_to_ucn (buffer, name + i) - 1;
2656               fwrite (buffer, 1, 10, fp);
2657             }
2658           else
2659             fputc (NODE_NAME (token->val.node.node)[i], fp);
2660       }
2661       break;
2662
2663     case SPELL_LITERAL:
2664       fwrite (token->val.str.text, 1, token->val.str.len, fp);
2665       break;
2666
2667     case SPELL_NONE:
2668       /* An error, most probably.  */
2669       break;
2670     }
2671 }
2672
2673 /* Compare two tokens.  */
2674 int
2675 _cpp_equiv_tokens (const cpp_token *a, const cpp_token *b)
2676 {
2677   if (a->type == b->type && a->flags == b->flags)
2678     switch (TOKEN_SPELL (a))
2679       {
2680       default:                  /* Keep compiler happy.  */
2681       case SPELL_OPERATOR:
2682         /* token_no is used to track where multiple consecutive ##
2683            tokens were originally located.  */
2684         return (a->type != CPP_PASTE || a->val.token_no == b->val.token_no);
2685       case SPELL_NONE:
2686         return (a->type != CPP_MACRO_ARG
2687                 || a->val.macro_arg.arg_no == b->val.macro_arg.arg_no);
2688       case SPELL_IDENT:
2689         return a->val.node.node == b->val.node.node;
2690       case SPELL_LITERAL:
2691         return (a->val.str.len == b->val.str.len
2692                 && !memcmp (a->val.str.text, b->val.str.text,
2693                             a->val.str.len));
2694       }
2695
2696   return 0;
2697 }
2698
2699 /* Returns nonzero if a space should be inserted to avoid an
2700    accidental token paste for output.  For simplicity, it is
2701    conservative, and occasionally advises a space where one is not
2702    needed, e.g. "." and ".2".  */
2703 int
2704 cpp_avoid_paste (cpp_reader *pfile, const cpp_token *token1,
2705                  const cpp_token *token2)
2706 {
2707   enum cpp_ttype a = token1->type, b = token2->type;
2708   cppchar_t c;
2709
2710   if (token1->flags & NAMED_OP)
2711     a = CPP_NAME;
2712   if (token2->flags & NAMED_OP)
2713     b = CPP_NAME;
2714
2715   c = EOF;
2716   if (token2->flags & DIGRAPH)
2717     c = digraph_spellings[(int) b - (int) CPP_FIRST_DIGRAPH][0];
2718   else if (token_spellings[b].category == SPELL_OPERATOR)
2719     c = token_spellings[b].name[0];
2720
2721   /* Quickly get everything that can paste with an '='.  */
2722   if ((int) a <= (int) CPP_LAST_EQ && c == '=')
2723     return 1;
2724
2725   switch (a)
2726     {
2727     case CPP_GREATER:   return c == '>';
2728     case CPP_LESS:      return c == '<' || c == '%' || c == ':';
2729     case CPP_PLUS:      return c == '+';
2730     case CPP_MINUS:     return c == '-' || c == '>';
2731     case CPP_DIV:       return c == '/' || c == '*'; /* Comments.  */
2732     case CPP_MOD:       return c == ':' || c == '>';
2733     case CPP_AND:       return c == '&';
2734     case CPP_OR:        return c == '|';
2735     case CPP_COLON:     return c == ':' || c == '>';
2736     case CPP_DEREF:     return c == '*';
2737     case CPP_DOT:       return c == '.' || c == '%' || b == CPP_NUMBER;
2738     case CPP_HASH:      return c == '#' || c == '%'; /* Digraph form.  */
2739     case CPP_NAME:      return ((b == CPP_NUMBER
2740                                  && name_p (pfile, &token2->val.str))
2741                                 || b == CPP_NAME
2742                                 || b == CPP_CHAR || b == CPP_STRING); /* L */
2743     case CPP_NUMBER:    return (b == CPP_NUMBER || b == CPP_NAME
2744                                 || c == '.' || c == '+' || c == '-');
2745                                       /* UCNs */
2746     case CPP_OTHER:     return ((token1->val.str.text[0] == '\\'
2747                                  && b == CPP_NAME)
2748                                 || (CPP_OPTION (pfile, objc)
2749                                     && token1->val.str.text[0] == '@'
2750                                     && (b == CPP_NAME || b == CPP_STRING)));
2751     default:            break;
2752     }
2753
2754   return 0;
2755 }
2756
2757 /* Output all the remaining tokens on the current line, and a newline
2758    character, to FP.  Leading whitespace is removed.  If there are
2759    macros, special token padding is not performed.  */
2760 void
2761 cpp_output_line (cpp_reader *pfile, FILE *fp)
2762 {
2763   const cpp_token *token;
2764
2765   token = cpp_get_token (pfile);
2766   while (token->type != CPP_EOF)
2767     {
2768       cpp_output_token (token, fp);
2769       token = cpp_get_token (pfile);
2770       if (token->flags & PREV_WHITE)
2771         putc (' ', fp);
2772     }
2773
2774   putc ('\n', fp);
2775 }
2776
2777 /* Return a string representation of all the remaining tokens on the
2778    current line.  The result is allocated using xmalloc and must be
2779    freed by the caller.  */
2780 unsigned char *
2781 cpp_output_line_to_string (cpp_reader *pfile, const unsigned char *dir_name)
2782 {
2783   const cpp_token *token;
2784   unsigned int out = dir_name ? ustrlen (dir_name) : 0;
2785   unsigned int alloced = 120 + out;
2786   unsigned char *result = (unsigned char *) xmalloc (alloced);
2787
2788   /* If DIR_NAME is empty, there are no initial contents.  */
2789   if (dir_name)
2790     {
2791       sprintf ((char *) result, "#%s ", dir_name);
2792       out += 2;
2793     }
2794
2795   token = cpp_get_token (pfile);
2796   while (token->type != CPP_EOF)
2797     {
2798       unsigned char *last;
2799       /* Include room for a possible space and the terminating nul.  */
2800       unsigned int len = cpp_token_len (token) + 2;
2801
2802       if (out + len > alloced)
2803         {
2804           alloced *= 2;
2805           if (out + len > alloced)
2806             alloced = out + len;
2807           result = (unsigned char *) xrealloc (result, alloced);
2808         }
2809
2810       last = cpp_spell_token (pfile, token, &result[out], 0);
2811       out = last - result;
2812
2813       token = cpp_get_token (pfile);
2814       if (token->flags & PREV_WHITE)
2815         result[out++] = ' ';
2816     }
2817
2818   result[out] = '\0';
2819   return result;
2820 }
2821
/* Memory buffers.  Changing these three constants can have a dramatic
   effect on performance.  The values here are reasonable defaults,
   but might be tuned.  If you adjust them, be sure to test across a
   range of uses of cpplib, including heavy nested function-like macro
   expansion.  Also check the change in peak memory usage (NJAMD is a
   good tool for this).

   MIN_BUFF_SIZE is the smallest payload a new buffer is given.
   BUFF_SIZE_UPPER_BOUND (MIN_SIZE) is the largest free buffer that is
   still considered a reasonable match for a request of MIN_SIZE.
   EXTENDED_BUFF_SIZE (BUFF, MIN_EXTRA) is the size requested when
   extending BUFF: MIN_EXTRA plus twice the room left between the cur
   and limit pointers of BUFF.  Every macro argument is parenthesized
   so that an expression argument expands as a single operand.  */
#define MIN_BUFF_SIZE 8000
#define BUFF_SIZE_UPPER_BOUND(MIN_SIZE) (MIN_BUFF_SIZE + (MIN_SIZE) * 3 / 2)
#define EXTENDED_BUFF_SIZE(BUFF, MIN_EXTRA) \
        ((MIN_EXTRA) + ((BUFF)->limit - (BUFF)->cur) * 2)

#if MIN_BUFF_SIZE > BUFF_SIZE_UPPER_BOUND (0)
  #error BUFF_SIZE_UPPER_BOUND must be at least as large as MIN_BUFF_SIZE!
#endif
2836
2837 /* Create a new allocation buffer.  Place the control block at the end
2838    of the buffer, so that buffer overflows will cause immediate chaos.  */
2839 static _cpp_buff *
2840 new_buff (size_t len)
2841 {
2842   _cpp_buff *result;
2843   unsigned char *base;
2844
2845   if (len < MIN_BUFF_SIZE)
2846     len = MIN_BUFF_SIZE;
2847   len = CPP_ALIGN (len);
2848
2849   base = XNEWVEC (unsigned char, len + sizeof (_cpp_buff));
2850   result = (_cpp_buff *) (base + len);
2851   result->base = base;
2852   result->cur = base;
2853   result->limit = base + len;
2854   result->next = NULL;
2855   return result;
2856 }
2857
2858 /* Place a chain of unwanted allocation buffers on the free list.  */
2859 void
2860 _cpp_release_buff (cpp_reader *pfile, _cpp_buff *buff)
2861 {
2862   _cpp_buff *end = buff;
2863
2864   while (end->next)
2865     end = end->next;
2866   end->next = pfile->free_buffs;
2867   pfile->free_buffs = buff;
2868 }
2869
2870 /* Return a free buffer of size at least MIN_SIZE.  */
2871 _cpp_buff *
2872 _cpp_get_buff (cpp_reader *pfile, size_t min_size)
2873 {
2874   _cpp_buff *result, **p;
2875
2876   for (p = &pfile->free_buffs;; p = &(*p)->next)
2877     {
2878       size_t size;
2879
2880       if (*p == NULL)
2881         return new_buff (min_size);
2882       result = *p;
2883       size = result->limit - result->base;
2884       /* Return a buffer that's big enough, but don't waste one that's
2885          way too big.  */
2886       if (size >= min_size && size <= BUFF_SIZE_UPPER_BOUND (min_size))
2887         break;
2888     }
2889
2890   *p = result->next;
2891   result->next = NULL;
2892   result->cur = result->base;
2893   return result;
2894 }
2895
2896 /* Creates a new buffer with enough space to hold the uncommitted
2897    remaining bytes of BUFF, and at least MIN_EXTRA more bytes.  Copies
2898    the excess bytes to the new buffer.  Chains the new buffer after
2899    BUFF, and returns the new buffer.  */
2900 _cpp_buff *
2901 _cpp_append_extend_buff (cpp_reader *pfile, _cpp_buff *buff, size_t min_extra)
2902 {
2903   size_t size = EXTENDED_BUFF_SIZE (buff, min_extra);
2904   _cpp_buff *new_buff = _cpp_get_buff (pfile, size);
2905
2906   buff->next = new_buff;
2907   memcpy (new_buff->base, buff->cur, BUFF_ROOM (buff));
2908   return new_buff;
2909 }
2910
2911 /* Creates a new buffer with enough space to hold the uncommitted
2912    remaining bytes of the buffer pointed to by BUFF, and at least
2913    MIN_EXTRA more bytes.  Copies the excess bytes to the new buffer.
2914    Chains the new buffer before the buffer pointed to by BUFF, and
2915    updates the pointer to point to the new buffer.  */
2916 void
2917 _cpp_extend_buff (cpp_reader *pfile, _cpp_buff **pbuff, size_t min_extra)
2918 {
2919   _cpp_buff *new_buff, *old_buff = *pbuff;
2920   size_t size = EXTENDED_BUFF_SIZE (old_buff, min_extra);
2921
2922   new_buff = _cpp_get_buff (pfile, size);
2923   memcpy (new_buff->base, old_buff->cur, BUFF_ROOM (old_buff));
2924   new_buff->next = old_buff;
2925   *pbuff = new_buff;
2926 }
2927
2928 /* Free a chain of buffers starting at BUFF.  */
2929 void
2930 _cpp_free_buff (_cpp_buff *buff)
2931 {
2932   _cpp_buff *next;
2933
2934   for (; buff; buff = next)
2935     {
2936       next = buff->next;
2937       free (buff->base);
2938     }
2939 }
2940
2941 /* Allocate permanent, unaligned storage of length LEN.  */
2942 unsigned char *
2943 _cpp_unaligned_alloc (cpp_reader *pfile, size_t len)
2944 {
2945   _cpp_buff *buff = pfile->u_buff;
2946   unsigned char *result = buff->cur;
2947
2948   if (len > (size_t) (buff->limit - result))
2949     {
2950       buff = _cpp_get_buff (pfile, len);
2951       buff->next = pfile->u_buff;
2952       pfile->u_buff = buff;
2953       result = buff->cur;
2954     }
2955
2956   buff->cur = result + len;
2957   return result;
2958 }
2959
2960 /* Allocate permanent, unaligned storage of length LEN from a_buff.
2961    That buffer is used for growing allocations when saving macro
2962    replacement lists in a #define, and when parsing an answer to an
2963    assertion in #assert, #unassert or #if (and therefore possibly
2964    whilst expanding macros).  It therefore must not be used by any
2965    code that they might call: specifically the lexer and the guts of
2966    the macro expander.
2967
2968    All existing other uses clearly fit this restriction: storing
2969    registered pragmas during initialization.  */
2970 unsigned char *
2971 _cpp_aligned_alloc (cpp_reader *pfile, size_t len)
2972 {
2973   _cpp_buff *buff = pfile->a_buff;
2974   unsigned char *result = buff->cur;
2975
2976   if (len > (size_t) (buff->limit - result))
2977     {
2978       buff = _cpp_get_buff (pfile, len);
2979       buff->next = pfile->a_buff;
2980       pfile->a_buff = buff;
2981       result = buff->cur;
2982     }
2983
2984   buff->cur = result + len;
2985   return result;
2986 }
2987
2988 /* Say which field of TOK is in use.  */
2989
2990 enum cpp_token_fld_kind
2991 cpp_token_val_index (cpp_token *tok)
2992 {
2993   switch (TOKEN_SPELL (tok))
2994     {
2995     case SPELL_IDENT:
2996       return CPP_TOKEN_FLD_NODE;
2997     case SPELL_LITERAL:
2998       return CPP_TOKEN_FLD_STR;
2999     case SPELL_OPERATOR:
3000       if (tok->type == CPP_PASTE)
3001         return CPP_TOKEN_FLD_TOKEN_NO;
3002       else
3003         return CPP_TOKEN_FLD_NONE;
3004     case SPELL_NONE:
3005       if (tok->type == CPP_MACRO_ARG)
3006         return CPP_TOKEN_FLD_ARG_NO;
3007       else if (tok->type == CPP_PADDING)
3008         return CPP_TOKEN_FLD_SOURCE;
3009       else if (tok->type == CPP_PRAGMA)
3010         return CPP_TOKEN_FLD_PRAGMA;
3011       /* else fall through */
3012     default:
3013       return CPP_TOKEN_FLD_NONE;
3014     }
3015 }
3016
3017 /* All tokens lexed in R after calling this function will be forced to have
3018    their source_location the same as the location referenced by P, until
3019    cpp_stop_forcing_token_locations is called for R.  */
3020
3021 void
3022 cpp_force_token_locations (cpp_reader *r, source_location *p)
3023 {
3024   r->forced_token_location_p = p;
3025 }
3026
3027 /* Go back to assigning locations naturally for lexed tokens.  */
3028
3029 void
3030 cpp_stop_forcing_token_locations (cpp_reader *r)
3031 {
3032   r->forced_token_location_p = NULL;
3033 }