libcpp/lex.c

   1 /* CPP Library - lexical analysis.
   2    Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2007, 2008, 2009, 2010,
   3    2011, 2012 Free Software Foundation, Inc.
   4    Contributed by Per Bothner, 1994-95.
   5    Based on CCCP program by Paul Rubin, June 1986
   6    Adapted to ANSI C, Richard Stallman, Jan 1987
   7    Broken out to separate file, Zack Weinberg, Mar 2000
   8
   9 This program is free software; you can redistribute it and/or modify it
  10 under the terms of the GNU General Public License as published by the
  11 Free Software Foundation; either version 3, or (at your option) any
  12 later version.
  13
  14 This program is distributed in the hope that it will be useful,
  15 but WITHOUT ANY WARRANTY; without even the implied warranty of
  16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  17 GNU General Public License for more details.
  18
  19 You should have received a copy of the GNU General Public License
  20 along with this program; see the file COPYING3.  If not see
  21 <http://www.gnu.org/licenses/>.  */
  22
  23 #include "config.h"
  24 #include "system.h"
  25 #include "cpplib.h"
  26 #include "internal.h"
  27
  28 enum spell_type
  29 {
  30   SPELL_OPERATOR = 0,
  31   SPELL_IDENT,
  32   SPELL_LITERAL,
  33   SPELL_NONE
  34 };
  35
  36 struct token_spelling
  37 {
  38   enum spell_type category;
  39   const unsigned char *name;
  40 };
  41
  42 static const unsigned char *const digraph_spellings[] =
  43 { UC"%:", UC"%:%:", UC"<:", UC":>", UC"<%", UC"%>" };
  44
  45 #define OP(e, s) { SPELL_OPERATOR, UC s  },
  46 #define TK(e, s) { SPELL_ ## s,    UC #e },
  47 static const struct token_spelling token_spellings[N_TTYPES] = { TTYPE_TABLE };
  48 #undef OP
  49 #undef TK
  50
  51 #define TOKEN_SPELL(token) (token_spellings[(token)->type].category)
  52 #define TOKEN_NAME(token) (token_spellings[(token)->type].name)
  53
  54 static void add_line_note (cpp_buffer *, const uchar *, unsigned int);
  55 static int skip_line_comment (cpp_reader *);
  56 static void skip_whitespace (cpp_reader *, cppchar_t);
  57 static void lex_string (cpp_reader *, cpp_token *, const uchar *);
  58 static void save_comment (cpp_reader *, cpp_token *, const uchar *, cppchar_t);
  59 static void store_comment (cpp_reader *, cpp_token *);
  60 static void create_literal (cpp_reader *, cpp_token *, const uchar *,
  61                             unsigned int, enum cpp_ttype);
  62 static bool warn_in_comment (cpp_reader *, _cpp_line_note *);
  63 static int name_p (cpp_reader *, const cpp_string *);
  64 static tokenrun *next_tokenrun (tokenrun *);
  65
  66 static _cpp_buff *new_buff (size_t);
  67
  68
  69 /* Utility routine:
  70
  71    Compares, the token TOKEN to the NUL-terminated string STRING.
  72    TOKEN must be a CPP_NAME.  Returns 1 for equal, 0 for unequal.  */
  73 int
  74 cpp_ideq (const cpp_token *token, const char *string)
  75 {
  76   if (token->type != CPP_NAME)
  77     return 0;
  78
  79   return !ustrcmp (NODE_NAME (token->val.node.node), (const uchar *) string);
  80 }
  81
  82 /* Record a note TYPE at byte POS into the current cleaned logical
  83    line.  */
  84 static void
  85 add_line_note (cpp_buffer *buffer, const uchar *pos, unsigned int type)
  86 {
  87   if (buffer->notes_used == buffer->notes_cap)
  88     {
  89       buffer->notes_cap = buffer->notes_cap * 2 + 200;
  90       buffer->notes = XRESIZEVEC (_cpp_line_note, buffer->notes,
  91                                   buffer->notes_cap);
  92     }
  93
  94   buffer->notes[buffer->notes_used].pos = pos;
  95   buffer->notes[buffer->notes_used].type = type;
  96   buffer->notes_used++;
  97 }
  98
  99 \f
 100 /* Fast path to find line special characters using optimized character
 101    scanning algorithms.  Anything complicated falls back to the slow
 102    path below.  Since this loop is very hot it's worth doing these kinds
 103    of optimizations.
 104
 105    One of the paths through the ifdefs should provide
 106
 107      const uchar *search_line_fast (const uchar *s, const uchar *end);
 108
 109    Between S and END, search for \n, \r, \\, ?.  Return a pointer to
 110    the found character.
 111
 112    Note that the last character of the buffer is *always* a newline,
 113    as forced by _cpp_convert_input.  This fact can be used to avoid
 114    explicitly looking for the end of the buffer.  */
 115
 116 /* Configure gives us an ifdef test.  */
 117 #ifndef WORDS_BIGENDIAN
 118 #define WORDS_BIGENDIAN 0
 119 #endif
 120
 121 /* We'd like the largest integer that fits into a register.  There's nothing
 122    in <stdint.h> that gives us that.  For most hosts this is unsigned long,
 123    but MS decided on an LLP64 model.  Thankfully when building with GCC we
 124    can get the "real" word size.  */
 125 #ifdef __GNUC__
 126 typedef unsigned int word_type __attribute__((__mode__(__word__)));
 127 #else
 128 typedef unsigned long word_type;
 129 #endif
 130
 131 /* The code below is only expecting sizes 4 or 8.
 132    Die at compile-time if this expectation is violated.  */
 133 typedef char check_word_type_size
 134   [(sizeof(word_type) == 8 || sizeof(word_type) == 4) * 2 - 1];
 135
 136 /* Return X with the first N bytes forced to values that won't match one
 137    of the interesting characters.  Note that NUL is not interesting.  */
 138
 139 static inline word_type
 140 acc_char_mask_misalign (word_type val, unsigned int n)
 141 {
 142   word_type mask = -1;
 143   if (WORDS_BIGENDIAN)
 144     mask >>= n * 8;
 145   else
 146     mask <<= n * 8;
 147   return val & mask;
 148 }
 149
 150 /* Return X replicated to all byte positions within WORD_TYPE.  */
 151
 152 static inline word_type
 153 acc_char_replicate (uchar x)
 154 {
 155   word_type ret;
 156
 157   ret = (x << 24) | (x << 16) | (x << 8) | x;
 158   if (sizeof(word_type) == 8)
 159     ret = (ret << 16 << 16) | ret;
 160   return ret;
 161 }
 162
 163 /* Return non-zero if some byte of VAL is (probably) C.  */
 164
 165 static inline word_type
 166 acc_char_cmp (word_type val, word_type c)
 167 {
 168 #if defined(__GNUC__) && defined(__alpha__)
 169   /* We can get exact results using a compare-bytes instruction.
 170      Get (val == c) via (0 >= (val ^ c)).  */
 171   return __builtin_alpha_cmpbge (0, val ^ c);
 172 #else
 173   word_type magic = 0x7efefefeU;
 174   if (sizeof(word_type) == 8)
 175     magic = (magic << 16 << 16) | 0xfefefefeU;
 176   magic |= 1;
 177
 178   val ^= c;
 179   return ((val + magic) ^ ~val) & ~magic;
 180 #endif
 181 }
 182
 183 /* Given the result of acc_char_cmp is non-zero, return the index of
 184    the found character.  If this was a false positive, return -1.  */
 185
 186 static inline int
 187 acc_char_index (word_type cmp ATTRIBUTE_UNUSED,
 188                 word_type val ATTRIBUTE_UNUSED)
 189 {
 190 #if defined(__GNUC__) && defined(__alpha__) && !WORDS_BIGENDIAN
 191   /* The cmpbge instruction sets *bits* of the result corresponding to
 192      matches in the bytes with no false positives.  */
 193   return __builtin_ctzl (cmp);
 194 #else
 195   unsigned int i;
 196
 197   /* ??? It would be nice to force unrolling here,
 198      and have all of these constants folded.  */
 199   for (i = 0; i < sizeof(word_type); ++i)
 200     {
 201       uchar c;
 202       if (WORDS_BIGENDIAN)
 203         c = (val >> (sizeof(word_type) - i - 1) * 8) & 0xff;
 204       else
 205         c = (val >> i * 8) & 0xff;
 206
 207       if (c == '\n' || c == '\r' || c == '\\' || c == '?')
 208         return i;
 209     }
 210
 211   return -1;
 212 #endif
 213 }
 214
 215 /* A version of the fast scanner using bit fiddling techniques.
 216
 217    For 32-bit words, one would normally perform 16 comparisons and
 218    16 branches.  With this algorithm one performs 24 arithmetic
 219    operations and one branch.  Whether this is faster with a 32-bit
 220    word size is going to be somewhat system dependent.
 221
 222    For 64-bit words, we eliminate twice the number of comparisons
 223    and branches without increasing the number of arithmetic operations.
 224    It's almost certainly going to be a win with 64-bit word size.  */
 225
 226 static const uchar * search_line_acc_char (const uchar *, const uchar *)
 227   ATTRIBUTE_UNUSED;
 228
 229 static const uchar *
 230 search_line_acc_char (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 231 {
 232   const word_type repl_nl = acc_char_replicate ('\n');
 233   const word_type repl_cr = acc_char_replicate ('\r');
 234   const word_type repl_bs = acc_char_replicate ('\\');
 235   const word_type repl_qm = acc_char_replicate ('?');
 236
 237   unsigned int misalign;
 238   const word_type *p;
 239   word_type val, t;
 240
 241   /* Align the buffer.  Mask out any bytes from before the beginning.  */
 242   p = (word_type *)((uintptr_t)s & -sizeof(word_type));
 243   val = *p;
 244   misalign = (uintptr_t)s & (sizeof(word_type) - 1);
 245   if (misalign)
 246     val = acc_char_mask_misalign (val, misalign);
 247
 248   /* Main loop.  */
 249   while (1)
 250     {
 251       t  = acc_char_cmp (val, repl_nl);
 252       t |= acc_char_cmp (val, repl_cr);
 253       t |= acc_char_cmp (val, repl_bs);
 254       t |= acc_char_cmp (val, repl_qm);
 255
 256       if (__builtin_expect (t != 0, 0))
 257         {
 258           int i = acc_char_index (t, val);
 259           if (i >= 0)
 260             return (const uchar *)p + i;
 261         }
 262
 263       val = *++p;
 264     }
 265 }
 266
 267 /* Disable on Solaris 2/x86 until the following problems can be properly
 268    autoconfed:
 269
 270    The Solaris 9 assembler cannot assemble SSE4.2 insns.
 271    Before Solaris 9 Update 6, SSE insns cannot be executed.
 272    The Solaris 10+ assembler tags objects with the instruction set
 273    extensions used, so SSE4.2 executables cannot run on machines that
 274    don't support that extension.  */
 275
 276 #if (GCC_VERSION >= 4005) && (defined(__i386__) || defined(__x86_64__)) && !(defined(__sun__) && defined(__svr4__))
 277
 278 /* Replicated character data to be shared between implementations.
 279    Recall that outside of a context with vector support we can't
 280    define compatible vector types, therefore these are all defined
 281    in terms of raw characters.  */
 282 static const char repl_chars[4][16] __attribute__((aligned(16))) = {
 283   { '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
 284     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n' },
 285   { '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
 286     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r' },
 287   { '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
 288     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\' },
 289   { '?', '?', '?', '?', '?', '?', '?', '?',
 290     '?', '?', '?', '?', '?', '?', '?', '?' },
 291 };
 292
 293 /* A version of the fast scanner using MMX vectorized byte compare insns.
 294
 295    This uses the PMOVMSKB instruction which was introduced with "MMX2",
 296    which was packaged into SSE1; it is also present in the AMD MMX
 297    extension.  Mark the function as using "sse" so that we emit a real
 298    "emms" instruction, rather than the 3dNOW "femms" instruction.  */
 299
 300 static const uchar *
 301 #ifndef __SSE__
 302 __attribute__((__target__("sse")))
 303 #endif
 304 search_line_mmx (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 305 {
 306   typedef char v8qi __attribute__ ((__vector_size__ (8)));
 307   typedef int __m64 __attribute__ ((__vector_size__ (8), __may_alias__));
 308
 309   const v8qi repl_nl = *(const v8qi *)repl_chars[0];
 310   const v8qi repl_cr = *(const v8qi *)repl_chars[1];
 311   const v8qi repl_bs = *(const v8qi *)repl_chars[2];
 312   const v8qi repl_qm = *(const v8qi *)repl_chars[3];
 313
 314   unsigned int misalign, found, mask;
 315   const v8qi *p;
 316   v8qi data, t, c;
 317
 318   /* Align the source pointer.  While MMX doesn't generate unaligned data
 319      faults, this allows us to safely scan to the end of the buffer without
 320      reading beyond the end of the last page.  */
 321   misalign = (uintptr_t)s & 7;
 322   p = (const v8qi *)((uintptr_t)s & -8);
 323   data = *p;
 324
 325   /* Create a mask for the bytes that are valid within the first
 326      16-byte block.  The Idea here is that the AND with the mask
 327      within the loop is "free", since we need some AND or TEST
 328      insn in order to set the flags for the branch anyway.  */
 329   mask = -1u << misalign;
 330
 331   /* Main loop processing 8 bytes at a time.  */
 332   goto start;
 333   do
 334     {
 335       data = *++p;
 336       mask = -1;
 337
 338     start:
 339       t = __builtin_ia32_pcmpeqb(data, repl_nl);
 340       c = __builtin_ia32_pcmpeqb(data, repl_cr);
 341       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
 342       c = __builtin_ia32_pcmpeqb(data, repl_bs);
 343       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
 344       c = __builtin_ia32_pcmpeqb(data, repl_qm);
 345       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
 346       found = __builtin_ia32_pmovmskb (t);
 347       found &= mask;
 348     }
 349   while (!found);
 350
 351   __builtin_ia32_emms ();
 352
 353   /* FOUND contains 1 in bits for which we matched a relevant
 354      character.  Conversion to the byte index is trivial.  */
 355   found = __builtin_ctz(found);
 356   return (const uchar *)p + found;
 357 }
 358
 359 /* A version of the fast scanner using SSE2 vectorized byte compare insns.  */
 360
 361 static const uchar *
 362 #ifndef __SSE2__
 363 __attribute__((__target__("sse2")))
 364 #endif
 365 search_line_sse2 (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 366 {
 367   typedef char v16qi __attribute__ ((__vector_size__ (16)));
 368
 369   const v16qi repl_nl = *(const v16qi *)repl_chars[0];
 370   const v16qi repl_cr = *(const v16qi *)repl_chars[1];
 371   const v16qi repl_bs = *(const v16qi *)repl_chars[2];
 372   const v16qi repl_qm = *(const v16qi *)repl_chars[3];
 373
 374   unsigned int misalign, found, mask;
 375   const v16qi *p;
 376   v16qi data, t;
 377
 378   /* Align the source pointer.  */
 379   misalign = (uintptr_t)s & 15;
 380   p = (const v16qi *)((uintptr_t)s & -16);
 381   data = *p;
 382
 383   /* Create a mask for the bytes that are valid within the first
 384      16-byte block.  The Idea here is that the AND with the mask
 385      within the loop is "free", since we need some AND or TEST
 386      insn in order to set the flags for the branch anyway.  */
 387   mask = -1u << misalign;
 388
 389   /* Main loop processing 16 bytes at a time.  */
 390   goto start;
 391   do
 392     {
 393       data = *++p;
 394       mask = -1;
 395
 396     start:
 397       t  = __builtin_ia32_pcmpeqb128(data, repl_nl);
 398       t |= __builtin_ia32_pcmpeqb128(data, repl_cr);
 399       t |= __builtin_ia32_pcmpeqb128(data, repl_bs);
 400       t |= __builtin_ia32_pcmpeqb128(data, repl_qm);
 401       found = __builtin_ia32_pmovmskb128 (t);
 402       found &= mask;
 403     }
 404   while (!found);
 405
 406   /* FOUND contains 1 in bits for which we matched a relevant
 407      character.  Conversion to the byte index is trivial.  */
 408   found = __builtin_ctz(found);
 409   return (const uchar *)p + found;
 410 }
 411
 412 #ifdef HAVE_SSE4
 413 /* A version of the fast scanner using SSE 4.2 vectorized string insns.  */
 414
 415 static const uchar *
 416 #ifndef __SSE4_2__
 417 __attribute__((__target__("sse4.2")))
 418 #endif
 419 search_line_sse42 (const uchar *s, const uchar *end)
 420 {
 421   typedef char v16qi __attribute__ ((__vector_size__ (16)));
 422   static const v16qi search = { '\n', '\r', '?', '\\' };
 423
 424   uintptr_t si = (uintptr_t)s;
 425   uintptr_t index;
 426
 427   /* Check for unaligned input.  */
 428   if (si & 15)
 429     {
 430       v16qi sv;
 431
 432       if (__builtin_expect (end - s < 16, 0)
 433           && __builtin_expect ((si & 0xfff) > 0xff0, 0))
 434         {
 435           /* There are less than 16 bytes left in the buffer, and less
 436              than 16 bytes left on the page.  Reading 16 bytes at this
 437              point might generate a spurious page fault.  Defer to the
 438              SSE2 implementation, which already handles alignment.  */
 439           return search_line_sse2 (s, end);
 440         }
 441
 442       /* ??? The builtin doesn't understand that the PCMPESTRI read from
 443          memory need not be aligned.  */
 444       sv = __builtin_ia32_loaddqu ((const char *) s);
 445       index = __builtin_ia32_pcmpestri128 (search, 4, sv, 16, 0);
 446
 447       if (__builtin_expect (index < 16, 0))
 448         goto found;
 449
 450       /* Advance the pointer to an aligned address.  We will re-scan a
 451          few bytes, but we no longer need care for reading past the
 452          end of a page, since we're guaranteed a match.  */
 453       s = (const uchar *)((si + 16) & -16);
 454     }
 455
 456   /* Main loop, processing 16 bytes at a time.  By doing the whole loop
 457      in inline assembly, we can make proper use of the flags set.  */
 458   __asm (      "sub $16, %1\n"
 459         "       .balign 16\n"
 460         "0:     add $16, %1\n"
 461         "       %vpcmpestri $0, (%1), %2\n"
 462         "       jnc 0b"
 463         : "=&c"(index), "+r"(s)
 464         : "x"(search), "a"(4), "d"(16));
 465
 466  found:
 467   return s + index;
 468 }
 469
 470 #else
 471 /* Work around out-dated assemblers without sse4 support.  */
 472 #define search_line_sse42 search_line_sse2
 473 #endif
 474
 475 /* Check the CPU capabilities.  */
 476
 477 #include "../gcc/config/i386/cpuid.h"
 478
 479 typedef const uchar * (*search_line_fast_type) (const uchar *, const uchar *);
 480 static search_line_fast_type search_line_fast;
 481
 482 #define HAVE_init_vectorized_lexer 1
 483 static inline void
 484 init_vectorized_lexer (void)
 485 {
 486   unsigned dummy, ecx = 0, edx = 0;
 487   search_line_fast_type impl = search_line_acc_char;
 488   int minimum = 0;
 489
 490 #if defined(__SSE4_2__)
 491   minimum = 3;
 492 #elif defined(__SSE2__)
 493   minimum = 2;
 494 #elif defined(__SSE__)
 495   minimum = 1;
 496 #endif
 497
 498   if (minimum == 3)
 499     impl = search_line_sse42;
 500   else if (__get_cpuid (1, &dummy, &dummy, &ecx, &edx) || minimum == 2)
 501     {
 502       if (minimum == 3 || (ecx & bit_SSE4_2))
 503         impl = search_line_sse42;
 504       else if (minimum == 2 || (edx & bit_SSE2))
 505         impl = search_line_sse2;
 506       else if (minimum == 1 || (edx & bit_SSE))
 507         impl = search_line_mmx;
 508     }
 509   else if (__get_cpuid (0x80000001, &dummy, &dummy, &dummy, &edx))
 510     {
 511       if (minimum == 1
 512           || (edx & (bit_MMXEXT | bit_CMOV)) == (bit_MMXEXT | bit_CMOV))
 513         impl = search_line_mmx;
 514     }
 515
 516   search_line_fast = impl;
 517 }
 518
 519 #elif (GCC_VERSION >= 4005) && defined(__ALTIVEC__)
 520
 521 /* A vection of the fast scanner using AltiVec vectorized byte compares.  */
 522 /* ??? Unfortunately, attribute(target("altivec")) is not yet supported,
 523    so we can't compile this function without -maltivec on the command line
 524    (or implied by some other switch).  */
 525
 526 static const uchar *
 527 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 528 {
 529   typedef __attribute__((altivec(vector))) unsigned char vc;
 530
 531   const vc repl_nl = {
 532     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
 533     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
 534   };
 535   const vc repl_cr = {
 536     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
 537     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
 538   };
 539   const vc repl_bs = {
 540     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
 541     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
 542   };
 543   const vc repl_qm = {
 544     '?', '?', '?', '?', '?', '?', '?', '?',
 545     '?', '?', '?', '?', '?', '?', '?', '?',
 546   };
 547   const vc ones = {
 548     -1, -1, -1, -1, -1, -1, -1, -1,
 549     -1, -1, -1, -1, -1, -1, -1, -1,
 550   };
 551   const vc zero = { 0 };
 552
 553   vc data, mask, t;
 554
 555   /* Altivec loads automatically mask addresses with -16.  This lets us
 556      issue the first load as early as possible.  */
 557   data = __builtin_vec_ld(0, (const vc *)s);
 558
 559   /* Discard bytes before the beginning of the buffer.  Do this by
 560      beginning with all ones and shifting in zeros according to the
 561      mis-alignment.  The LVSR instruction pulls the exact shift we
 562      want from the address.  */
 563   mask = __builtin_vec_lvsr(0, s);
 564   mask = __builtin_vec_perm(zero, ones, mask);
 565   data &= mask;
 566
 567   /* While altivec loads mask addresses, we still need to align S so
 568      that the offset we compute at the end is correct.  */
 569   s = (const uchar *)((uintptr_t)s & -16);
 570
 571   /* Main loop processing 16 bytes at a time.  */
 572   goto start;
 573   do
 574     {
 575       vc m_nl, m_cr, m_bs, m_qm;
 576
 577       s += 16;
 578       data = __builtin_vec_ld(0, (const vc *)s);
 579
 580     start:
 581       m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
 582       m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
 583       m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
 584       m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
 585       t = (m_nl | m_cr) | (m_bs | m_qm);
 586
 587       /* T now contains 0xff in bytes for which we matched one of the relevant
 588          characters.  We want to exit the loop if any byte in T is non-zero.
 589          Below is the expansion of vec_any_ne(t, zero).  */
 590     }
 591   while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));
 592
 593   {
 594 #define N  (sizeof(vc) / sizeof(long))
 595
 596     union {
 597       vc v;
 598       /* Statically assert that N is 2 or 4.  */
 599       unsigned long l[(N == 2 || N == 4) ? N : -1];
 600     } u;
 601     unsigned long l, i = 0;
 602
 603     u.v = t;
 604
 605     /* Find the first word of T that is non-zero.  */
 606     switch (N)
 607       {
 608       case 4:
 609         l = u.l[i++];
 610         if (l != 0)
 611           break;
 612         s += sizeof(unsigned long);
 613         l = u.l[i++];
 614         if (l != 0)
 615           break;
 616         s += sizeof(unsigned long);
 617       case 2:
 618         l = u.l[i++];
 619         if (l != 0)
 620           break;
 621         s += sizeof(unsigned long);
 622         l = u.l[i];
 623       }
 624
 625     /* L now contains 0xff in bytes for which we matched one of the
 626        relevant characters.  We can find the byte index by finding
 627        its bit index and dividing by 8.  */
 628     l = __builtin_clzl(l) >> 3;
 629     return s + l;
 630
 631 #undef N
 632   }
 633 }
 634
 635 #elif defined (__ARM_NEON__)
 636 #include "arm_neon.h"
 637
 638 static const uchar *
 639 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 640 {
 641   const uint8x16_t repl_nl = vdupq_n_u8 ('\n');
 642   const uint8x16_t repl_cr = vdupq_n_u8 ('\r');
 643   const uint8x16_t repl_bs = vdupq_n_u8 ('\\');
 644   const uint8x16_t repl_qm = vdupq_n_u8 ('?');
 645   const uint8x16_t xmask = (uint8x16_t) vdupq_n_u64 (0x8040201008040201ULL);
 646
 647   unsigned int misalign, found, mask;
 648   const uint8_t *p;
 649   uint8x16_t data;
 650
 651   /* Align the source pointer.  */
 652   misalign = (uintptr_t)s & 15;
 653   p = (const uint8_t *)((uintptr_t)s & -16);
 654   data = vld1q_u8 (p);
 655
 656   /* Create a mask for the bytes that are valid within the first
 657      16-byte block.  The Idea here is that the AND with the mask
 658      within the loop is "free", since we need some AND or TEST
 659      insn in order to set the flags for the branch anyway.  */
 660   mask = (-1u << misalign) & 0xffff;
 661
 662   /* Main loop, processing 16 bytes at a time.  */
 663   goto start;
 664
 665   do
 666     {
 667       uint8x8_t l;
 668       uint16x4_t m;
 669       uint32x2_t n;
 670       uint8x16_t t, u, v, w;
 671
 672       p += 16;
 673       data = vld1q_u8 (p);
 674       mask = 0xffff;
 675
 676     start:
 677       t = vceqq_u8 (data, repl_nl);
 678       u = vceqq_u8 (data, repl_cr);
 679       v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
 680       w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
 681       t = vandq_u8 (vorrq_u8 (v, w), xmask);
 682       l = vpadd_u8 (vget_low_u8 (t), vget_high_u8 (t));
 683       m = vpaddl_u8 (l);
 684       n = vpaddl_u16 (m);
 685
 686       found = vget_lane_u32 ((uint32x2_t) vorr_u64 ((uint64x1_t) n,
 687               vshr_n_u64 ((uint64x1_t) n, 24)), 0);
 688       found &= mask;
 689     }
 690   while (!found);
 691
 692   /* FOUND contains 1 in bits for which we matched a relevant
 693      character.  Conversion to the byte index is trivial.  */
 694   found = __builtin_ctz (found);
 695   return (const uchar *)p + found;
 696 }
 697
 698 #else
 699
  700 /* We only have one accelerated alternative.  Use a direct call so that
  701    we encourage inlining.  */
  702
  703 #define search_line_fast  search_line_acc_char
 704
 705 #endif
 706
 707 /* Initialize the lexer if needed.  */
 708
 709 void
 710 _cpp_init_lexer (void)
 711 {
 712 #ifdef HAVE_init_vectorized_lexer
 713   init_vectorized_lexer ();
 714 #endif
 715 }
 716
 717 /* Returns with a logical line that contains no escaped newlines or
 718    trigraphs.  This is a time-critical inner loop.  */
 719 void
 720 _cpp_clean_line (cpp_reader *pfile)
 721 {
 722   cpp_buffer *buffer;
 723   const uchar *s;
 724   uchar c, *d, *p;
 725
 726   buffer = pfile->buffer;
 727   buffer->cur_note = buffer->notes_used = 0;
 728   buffer->cur = buffer->line_base = buffer->next_line;
 729   buffer->need_line = false;
 730   s = buffer->next_line;
 731
 732   if (!buffer->from_stage3)
 733     {
 734       const uchar *pbackslash = NULL;
 735
 736       /* Fast path.  This is the common case of an un-escaped line with
 737          no trigraphs.  The primary win here is by not writing any
 738          data back to memory until we have to.  */
 739       while (1)
 740         {
 741           /* Perform an optimized search for \n, \r, \\, ?.  */
 742           s = search_line_fast (s, buffer->rlimit);
 743
 744           c = *s;
 745           if (c == '\\')
 746             {
 747               /* Record the location of the backslash and continue.  */
 748               pbackslash = s++;
 749             }
 750           else if (__builtin_expect (c == '?', 0))
 751             {
 752               if (__builtin_expect (s[1] == '?', false)
 753                    && _cpp_trigraph_map[s[2]])
 754                 {
 755                   /* Have a trigraph.  We may or may not have to convert
 756                      it.  Add a line note regardless, for -Wtrigraphs.  */
 757                   add_line_note (buffer, s, s[2]);
 758                   if (CPP_OPTION (pfile, trigraphs))
 759                     {
 760                       /* We do, and that means we have to switch to the
 761                          slow path.  */
 762                       d = (uchar *) s;
 763                       *d = _cpp_trigraph_map[s[2]];
 764                       s += 2;
 765                       goto slow_path;
 766                     }
 767                 }
 768               /* Not a trigraph.  Continue on fast-path.  */
 769               s++;
 770             }
 771           else
 772             break;
 773         }
 774
 775       /* This must be \r or \n.  We're either done, or we'll be forced
 776          to write back to the buffer and continue on the slow path.  */
 777       d = (uchar *) s;
 778
 779       if (__builtin_expect (s == buffer->rlimit, false))
 780         goto done;
 781
 782       /* DOS line ending? */
 783       if (__builtin_expect (c == '\r', false) && s[1] == '\n')
 784         {
 785           s++;
 786           if (s == buffer->rlimit)
 787             goto done;
 788         }
 789
 790       if (__builtin_expect (pbackslash == NULL, true))
 791         goto done;
 792
 793       /* Check for escaped newline.  */
 794       p = d;
 795       while (is_nvspace (p[-1]))
 796         p--;
 797       if (p - 1 != pbackslash)
 798         goto done;
 799
 800       /* Have an escaped newline; process it and proceed to
 801          the slow path.  */
 802       add_line_note (buffer, p - 1, p != d ? ' ' : '\\');
 803       d = p - 2;
 804       buffer->next_line = p - 1;
 805
 806     slow_path:
 807       while (1)
 808         {
 809           c = *++s;
 810           *++d = c;
 811
 812           if (c == '\n' || c == '\r')
 813             {
 814               /* Handle DOS line endings.  */
 815               if (c == '\r' && s != buffer->rlimit && s[1] == '\n')
 816                 s++;
 817               if (s == buffer->rlimit)
 818                 break;
 819
 820               /* Escaped?  */
 821               p = d;
 822               while (p != buffer->next_line && is_nvspace (p[-1]))
 823                 p--;
 824               if (p == buffer->next_line || p[-1] != '\\')
 825                 break;
 826
 827               add_line_note (buffer, p - 1, p != d ? ' ': '\\');
 828               d = p - 2;
 829               buffer->next_line = p - 1;
 830             }
 831           else if (c == '?' && s[1] == '?' && _cpp_trigraph_map[s[2]])
 832             {
 833               /* Add a note regardless, for the benefit of -Wtrigraphs.  */
 834               add_line_note (buffer, d, s[2]);
 835               if (CPP_OPTION (pfile, trigraphs))
 836                 {
 837                   *d = _cpp_trigraph_map[s[2]];
 838                   s += 2;
 839                 }
 840             }
 841         }
 842     }
 843   else
 844     {
 845       while (*s != '\n' && *s != '\r')
 846         s++;
 847       d = (uchar *) s;
 848
 849       /* Handle DOS line endings.  */
 850       if (*s == '\r' && s != buffer->rlimit && s[1] == '\n')
 851         s++;
 852     }
 853
 854  done:
 855   *d = '\n';
 856   /* A sentinel note that should never be processed.  */
 857   add_line_note (buffer, d + 1, '\n');
 858   buffer->next_line = s + 1;
 859 }
 860
 861 /* Return true if the trigraph indicated by NOTE should be warned
 862    about in a comment.  */
 863 static bool
 864 warn_in_comment (cpp_reader *pfile, _cpp_line_note *note)
 865 {
 866   const uchar *p;
 867
 868   /* Within comments we don't warn about trigraphs, unless the
 869      trigraph forms an escaped newline, as that may change
 870      behavior.  */
 871   if (note->type != '/')
 872     return false;
 873
 874   /* If -trigraphs, then this was an escaped newline iff the next note
 875      is coincident.  */
 876   if (CPP_OPTION (pfile, trigraphs))
 877     return note[1].pos == note->pos;
 878
 879   /* Otherwise, see if this forms an escaped newline.  */
 880   p = note->pos + 3;
 881   while (is_nvspace (*p))
 882     p++;
 883
 884   /* There might have been escaped newlines between the trigraph and the
 885      newline we found.  Hence the position test.  */
 886   return (*p == '\n' && p < note[1].pos);
 887 }
 888
 889 /* Process the notes created by add_line_note as far as the current
 890    location.  */
 891 void
 892 _cpp_process_line_notes (cpp_reader *pfile, int in_comment)
 893 {
 894   cpp_buffer *buffer = pfile->buffer;
 895
 896   for (;;)
 897     {
 898       _cpp_line_note *note = &buffer->notes[buffer->cur_note];
 899       unsigned int col;
 900
 901       if (note->pos > buffer->cur)
 902         break;
 903
 904       buffer->cur_note++;
 905       col = CPP_BUF_COLUMN (buffer, note->pos + 1);
 906
 907       if (note->type == '\\' || note->type == ' ')
 908         {
 909           if (note->type == ' ' && !in_comment)
 910             cpp_error_with_line (pfile, CPP_DL_WARNING, pfile->line_table->highest_line, col,
 911                                  "backslash and newline separated by space");
 912
 913           if (buffer->next_line > buffer->rlimit)
 914             {
 915               cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line, col,
 916                                    "backslash-newline at end of file");
 917               /* Prevent "no newline at end of file" warning.  */
 918               buffer->next_line = buffer->rlimit;
 919             }
 920
 921           buffer->line_base = note->pos;
 922           CPP_INCREMENT_LINE (pfile, 0);
 923         }
 924       else if (_cpp_trigraph_map[note->type])
 925         {
 926           if (CPP_OPTION (pfile, warn_trigraphs)
 927               && (!in_comment || warn_in_comment (pfile, note)))
 928             {
 929               if (CPP_OPTION (pfile, trigraphs))
 930                 cpp_warning_with_line (pfile, CPP_W_TRIGRAPHS,
 931                                        pfile->line_table->highest_line, col,
 932                                        "trigraph ??%c converted to %c",
 933                                        note->type,
 934                                        (int) _cpp_trigraph_map[note->type]);
 935               else
 936                 {
 937                   cpp_warning_with_line
 938                     (pfile, CPP_W_TRIGRAPHS,
 939                      pfile->line_table->highest_line, col,
 940                      "trigraph ??%c ignored, use -trigraphs to enable",
 941                      note->type);
 942                 }
 943             }
 944         }
 945       else if (note->type == 0)
 946         /* Already processed in lex_raw_string.  */;
 947       else
 948         abort ();
 949     }
 950 }
 951
 952 /* Skip a C-style block comment.  We find the end of the comment by
 953    seeing if an asterisk is before every '/' we encounter.  Returns
 954    nonzero if comment terminated by EOF, zero otherwise.
 955
 956    Buffer->cur points to the initial asterisk of the comment.  */
 957 bool
 958 _cpp_skip_block_comment (cpp_reader *pfile)
 959 {
 960   cpp_buffer *buffer = pfile->buffer;
 961   const uchar *cur = buffer->cur;
 962   uchar c;
 963
 964   cur++;
 965   if (*cur == '/')
 966     cur++;
 967
 968   for (;;)
 969     {
 970       /* People like decorating comments with '*', so check for '/'
 971          instead for efficiency.  */
 972       c = *cur++;
 973
 974       if (c == '/')
 975         {
 976           if (cur[-2] == '*')
 977             break;
 978
 979           /* Warn about potential nested comments, but not if the '/'
 980              comes immediately before the true comment delimiter.
 981              Don't bother to get it right across escaped newlines.  */
 982           if (CPP_OPTION (pfile, warn_comments)
 983               && cur[0] == '*' && cur[1] != '/')
 984             {
 985               buffer->cur = cur;
 986               cpp_warning_with_line (pfile, CPP_W_COMMENTS,
 987                                      pfile->line_table->highest_line,
 988                                      CPP_BUF_COL (buffer),
 989                                      "\"/*\" within comment");
 990             }
 991         }
 992       else if (c == '\n')
 993         {
 994           unsigned int cols;
 995           buffer->cur = cur - 1;
 996           _cpp_process_line_notes (pfile, true);
 997           if (buffer->next_line >= buffer->rlimit)
 998             return true;
 999           _cpp_clean_line (pfile);
1000
1001           cols = buffer->next_line - buffer->line_base;
1002           CPP_INCREMENT_LINE (pfile, cols);
1003
1004           cur = buffer->cur;
1005         }
1006     }
1007
1008   buffer->cur = cur;
1009   _cpp_process_line_notes (pfile, true);
1010   return false;
1011 }
1012
1013 /* Skip a C++ line comment, leaving buffer->cur pointing to the
1014    terminating newline.  Handles escaped newlines.  Returns nonzero
1015    if a multiline comment.  */
1016 static int
1017 skip_line_comment (cpp_reader *pfile)
1018 {
1019   cpp_buffer *buffer = pfile->buffer;
1020   source_location orig_line = pfile->line_table->highest_line;
1021
1022   while (*buffer->cur != '\n')
1023     buffer->cur++;
1024
1025   _cpp_process_line_notes (pfile, true);
1026   return orig_line != pfile->line_table->highest_line;
1027 }
1028
1029 /* Skips whitespace, saving the next non-whitespace character.  */
1030 static void
1031 skip_whitespace (cpp_reader *pfile, cppchar_t c)
1032 {
1033   cpp_buffer *buffer = pfile->buffer;
1034   bool saw_NUL = false;
1035
1036   do
1037     {
1038       /* Horizontal space always OK.  */
1039       if (c == ' ' || c == '\t')
1040         ;
1041       /* Just \f \v or \0 left.  */
1042       else if (c == '\0')
1043         saw_NUL = true;
1044       else if (pfile->state.in_directive && CPP_PEDANTIC (pfile))
1045         cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line,
1046                              CPP_BUF_COL (buffer),
1047                              "%s in preprocessing directive",
1048                              c == '\f' ? "form feed" : "vertical tab");
1049
1050       c = *buffer->cur++;
1051     }
1052   /* We only want non-vertical space, i.e. ' ' \t \f \v \0.  */
1053   while (is_nvspace (c));
1054
1055   if (saw_NUL)
1056     cpp_error (pfile, CPP_DL_WARNING, "null character(s) ignored");
1057
1058   buffer->cur--;
1059 }
1060
1061 /* See if the characters of a number token are valid in a name (no
1062    '.', '+' or '-').  */
1063 static int
1064 name_p (cpp_reader *pfile, const cpp_string *string)
1065 {
1066   unsigned int i;
1067
1068   for (i = 0; i < string->len; i++)
1069     if (!is_idchar (string->text[i]))
1070       return 0;
1071
1072   return 1;
1073 }
1074
1075 /* After parsing an identifier or other sequence, produce a warning about
1076    sequences not in NFC/NFKC.  */
1077 static void
1078 warn_about_normalization (cpp_reader *pfile,
1079                           const cpp_token *token,
1080                           const struct normalize_state *s)
1081 {
1082   if (CPP_OPTION (pfile, warn_normalize) < NORMALIZE_STATE_RESULT (s)
1083       && !pfile->state.skipping)
1084     {
1085       /* Make sure that the token is printed using UCNs, even
1086          if we'd otherwise happily print UTF-8.  */
1087       unsigned char *buf = XNEWVEC (unsigned char, cpp_token_len (token));
1088       size_t sz;
1089
1090       sz = cpp_spell_token (pfile, token, buf, false) - buf;
1091       if (NORMALIZE_STATE_RESULT (s) == normalized_C)
1092         cpp_warning_with_line (pfile, CPP_W_NORMALIZE, token->src_loc, 0,
1093                                "`%.*s' is not in NFKC", (int) sz, buf);
1094       else
1095         cpp_warning_with_line (pfile, CPP_W_NORMALIZE, token->src_loc, 0,
1096                                "`%.*s' is not in NFC", (int) sz, buf);
1097       free (buf);
1098     }
1099 }
1100
1101 /* Returns TRUE if the sequence starting at buffer->cur is invalid in
1102    an identifier.  FIRST is TRUE if this starts an identifier.  */
1103 static bool
1104 forms_identifier_p (cpp_reader *pfile, int first,
1105                     struct normalize_state *state)
1106 {
1107   cpp_buffer *buffer = pfile->buffer;
1108
1109   if (*buffer->cur == '$')
1110     {
1111       if (!CPP_OPTION (pfile, dollars_in_ident))
1112         return false;
1113
1114       buffer->cur++;
1115       if (CPP_OPTION (pfile, warn_dollars) && !pfile->state.skipping)
1116         {
1117           CPP_OPTION (pfile, warn_dollars) = 0;
1118           cpp_error (pfile, CPP_DL_PEDWARN, "'$' in identifier or number");
1119         }
1120
1121       return true;
1122     }
1123
1124   /* Is this a syntactically valid UCN?  */
1125   if (CPP_OPTION (pfile, extended_identifiers)
1126       && *buffer->cur == '\\'
1127       && (buffer->cur[1] == 'u' || buffer->cur[1] == 'U'))
1128     {
1129       buffer->cur += 2;
1130       if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
1131                           state))
1132         return true;
1133       buffer->cur -= 2;
1134     }
1135
1136   return false;
1137 }
1138
1139 /* Helper function to get the cpp_hashnode of the identifier BASE.  */
1140 static cpp_hashnode *
1141 lex_identifier_intern (cpp_reader *pfile, const uchar *base)
1142 {
1143   cpp_hashnode *result;
1144   const uchar *cur;
1145   unsigned int len;
1146   unsigned int hash = HT_HASHSTEP (0, *base);
1147
1148   cur = base + 1;
1149   while (ISIDNUM (*cur))
1150     {
1151       hash = HT_HASHSTEP (hash, *cur);
1152       cur++;
1153     }
1154   len = cur - base;
1155   hash = HT_HASHFINISH (hash, len);
1156   result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
1157                                               base, len, hash, HT_ALLOC));
1158
1159   /* Rarely, identifiers require diagnostics when lexed.  */
1160   if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
1161                         && !pfile->state.skipping, 0))
1162     {
1163       /* It is allowed to poison the same identifier twice.  */
1164       if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
1165         cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
1166                    NODE_NAME (result));
1167
1168       /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
1169          replacement list of a variadic macro.  */
1170       if (result == pfile->spec_nodes.n__VA_ARGS__
1171           && !pfile->state.va_args_ok)
1172         cpp_error (pfile, CPP_DL_PEDWARN,
1173                    "__VA_ARGS__ can only appear in the expansion"
1174                    " of a C99 variadic macro");
1175
1176       /* For -Wc++-compat, warn about use of C++ named operators.  */
1177       if (result->flags & NODE_WARN_OPERATOR)
1178         cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
1179                      "identifier \"%s\" is a special operator name in C++",
1180                      NODE_NAME (result));
1181     }
1182
1183   return result;
1184 }
1185
1186 /* Get the cpp_hashnode of an identifier specified by NAME in
1187    the current cpp_reader object.  If none is found, NULL is returned.  */
1188 cpp_hashnode *
1189 _cpp_lex_identifier (cpp_reader *pfile, const char *name)
1190 {
1191   cpp_hashnode *result;
1192   result = lex_identifier_intern (pfile, (uchar *) name);
1193   return result;
1194 }
1195
1196 /* Lex an identifier starting at BUFFER->CUR - 1.  */
1197 static cpp_hashnode *
1198 lex_identifier (cpp_reader *pfile, const uchar *base, bool starts_ucn,
1199                 struct normalize_state *nst)
1200 {
1201   cpp_hashnode *result;
1202   const uchar *cur;
1203   unsigned int len;
1204   unsigned int hash = HT_HASHSTEP (0, *base);
1205
1206   cur = pfile->buffer->cur;
1207   if (! starts_ucn)
1208     while (ISIDNUM (*cur))
1209       {
1210         hash = HT_HASHSTEP (hash, *cur);
1211         cur++;
1212       }
1213   pfile->buffer->cur = cur;
1214   if (starts_ucn || forms_identifier_p (pfile, false, nst))
1215     {
1216       /* Slower version for identifiers containing UCNs (or $).  */
1217       do {
1218         while (ISIDNUM (*pfile->buffer->cur))
1219           {
1220             pfile->buffer->cur++;
1221             NORMALIZE_STATE_UPDATE_IDNUM (nst);
1222           }
1223       } while (forms_identifier_p (pfile, false, nst));
1224       result = _cpp_interpret_identifier (pfile, base,
1225                                           pfile->buffer->cur - base);
1226     }
1227   else
1228     {
1229       len = cur - base;
1230       hash = HT_HASHFINISH (hash, len);
1231
1232       result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
1233                                                   base, len, hash, HT_ALLOC));
1234     }
1235
1236   /* Rarely, identifiers require diagnostics when lexed.  */
1237   if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
1238                         && !pfile->state.skipping, 0))
1239     {
1240       /* It is allowed to poison the same identifier twice.  */
1241       if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
1242         cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
1243                    NODE_NAME (result));
1244
1245       /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
1246          replacement list of a variadic macro.  */
1247       if (result == pfile->spec_nodes.n__VA_ARGS__
1248           && !pfile->state.va_args_ok)
1249         cpp_error (pfile, CPP_DL_PEDWARN,
1250                    "__VA_ARGS__ can only appear in the expansion"
1251                    " of a C99 variadic macro");
1252
1253       /* For -Wc++-compat, warn about use of C++ named operators.  */
1254       if (result->flags & NODE_WARN_OPERATOR)
1255         cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
1256                      "identifier \"%s\" is a special operator name in C++",
1257                      NODE_NAME (result));
1258     }
1259
1260   return result;
1261 }
1262
1263 /* Lex a number to NUMBER starting at BUFFER->CUR - 1.  */
1264 static void
1265 lex_number (cpp_reader *pfile, cpp_string *number,
1266             struct normalize_state *nst)
1267 {
1268   const uchar *cur;
1269   const uchar *base;
1270   uchar *dest;
1271
1272   base = pfile->buffer->cur - 1;
1273   do
1274     {
1275       cur = pfile->buffer->cur;
1276
1277       /* N.B. ISIDNUM does not include $.  */
1278       while (ISIDNUM (*cur) || *cur == '.' || VALID_SIGN (*cur, cur[-1]))
1279         {
1280           cur++;
1281           NORMALIZE_STATE_UPDATE_IDNUM (nst);
1282         }
1283
1284       pfile->buffer->cur = cur;
1285     }
1286   while (forms_identifier_p (pfile, false, nst));
1287
1288   number->len = cur - base;
1289   dest = _cpp_unaligned_alloc (pfile, number->len + 1);
1290   memcpy (dest, base, number->len);
1291   dest[number->len] = '\0';
1292   number->text = dest;
1293 }
1294
1295 /* Create a token of type TYPE with a literal spelling.  */
1296 static void
1297 create_literal (cpp_reader *pfile, cpp_token *token, const uchar *base,
1298                 unsigned int len, enum cpp_ttype type)
1299 {
1300   uchar *dest = _cpp_unaligned_alloc (pfile, len + 1);
1301
1302   memcpy (dest, base, len);
1303   dest[len] = '\0';
1304   token->type = type;
1305   token->val.str.len = len;
1306   token->val.str.text = dest;
1307 }
1308
1309 /* Subroutine of lex_raw_string: Append LEN chars from BASE to the buffer
1310    sequence from *FIRST_BUFF_P to LAST_BUFF_P.  */
1311
1312 static void
1313 bufring_append (cpp_reader *pfile, const uchar *base, size_t len,
1314                 _cpp_buff **first_buff_p, _cpp_buff **last_buff_p)
1315 {
1316   _cpp_buff *first_buff = *first_buff_p;
1317   _cpp_buff *last_buff = *last_buff_p;
1318
1319   if (first_buff == NULL)
1320     first_buff = last_buff = _cpp_get_buff (pfile, len);
1321   else if (len > BUFF_ROOM (last_buff))
1322     {
1323       size_t room = BUFF_ROOM (last_buff);
1324       memcpy (BUFF_FRONT (last_buff), base, room);
1325       BUFF_FRONT (last_buff) += room;
1326       base += room;
1327       len -= room;
1328       last_buff = _cpp_append_extend_buff (pfile, last_buff, len);
1329     }
1330
1331   memcpy (BUFF_FRONT (last_buff), base, len);
1332   BUFF_FRONT (last_buff) += len;
1333
1334   *first_buff_p = first_buff;
1335   *last_buff_p = last_buff;
1336 }
1337
1338 /* Lexes a raw string.  The stored string contains the spelling, including
1339    double quotes, delimiter string, '(' and ')', any leading
1340    'L', 'u', 'U' or 'u8' and 'R' modifier.  It returns the type of the
1341    literal, or CPP_OTHER if it was not properly terminated.
1342
1343    The spelling is NUL-terminated, but it is not guaranteed that this
1344    is the first NUL since embedded NULs are preserved.  */
1345
1346 static void
1347 lex_raw_string (cpp_reader *pfile, cpp_token *token, const uchar *base,
1348                 const uchar *cur)
1349 {
1350   const uchar *raw_prefix;
1351   unsigned int raw_prefix_len = 0;
1352   enum cpp_ttype type;
1353   size_t total_len = 0;
1354   _cpp_buff *first_buff = NULL, *last_buff = NULL;
1355   _cpp_line_note *note = &pfile->buffer->notes[pfile->buffer->cur_note];
1356
1357   type = (*base == 'L' ? CPP_WSTRING :
1358           *base == 'U' ? CPP_STRING32 :
1359           *base == 'u' ? (base[1] == '8' ? CPP_UTF8STRING : CPP_STRING16)
1360           : CPP_STRING);
1361
1362   raw_prefix = cur + 1;
1363   while (raw_prefix_len < 16)
1364     {
1365       switch (raw_prefix[raw_prefix_len])
1366         {
1367         case ' ': case '(': case ')': case '\\': case '\t':
1368         case '\v': case '\f': case '\n': default:
1369           break;
1370         /* Basic source charset except the above chars.  */
1371         case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
1372         case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
1373         case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
1374         case 's': case 't': case 'u': case 'v': case 'w': case 'x':
1375         case 'y': case 'z':
1376         case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
1377         case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
1378         case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
1379         case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
1380         case 'Y': case 'Z':
1381         case '0': case '1': case '2': case '3': case '4': case '5':
1382         case '6': case '7': case '8': case '9':
1383         case '_': case '{': case '}': case '#': case '[': case ']':
1384         case '<': case '>': case '%': case ':': case ';': case '.':
1385         case '?': case '*': case '+': case '-': case '/': case '^':
1386         case '&': case '|': case '~': case '!': case '=': case ',':
1387         case '"': case '\'':
1388           raw_prefix_len++;
1389           continue;
1390         }
1391       break;
1392     }
1393
1394   if (raw_prefix[raw_prefix_len] != '(')
1395     {
1396       int col = CPP_BUF_COLUMN (pfile->buffer, raw_prefix + raw_prefix_len)
1397                 + 1;
1398       if (raw_prefix_len == 16)
1399         cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, col,
1400                              "raw string delimiter longer than 16 characters");
1401       else
1402         cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, col,
1403                              "invalid character '%c' in raw string delimiter",
1404                              (int) raw_prefix[raw_prefix_len]);
1405       pfile->buffer->cur = raw_prefix - 1;
1406       create_literal (pfile, token, base, raw_prefix - 1 - base, CPP_OTHER);
1407       return;
1408     }
1409
1410   cur = raw_prefix + raw_prefix_len + 1;
1411   for (;;)
1412     {
1413 #define BUF_APPEND(STR,LEN)                                     \
1414       do {                                                      \
1415         bufring_append (pfile, (const uchar *)(STR), (LEN),     \
1416                         &first_buff, &last_buff);               \
1417         total_len += (LEN);                                     \
1418       } while (0);
1419
1420       cppchar_t c;
1421
1422       /* If we previously performed any trigraph or line splicing
1423          transformations, undo them within the body of the raw string.  */
1424       while (note->pos < cur)
1425         ++note;
1426       for (; note->pos == cur; ++note)
1427         {
1428           switch (note->type)
1429             {
1430             case '\\':
1431             case ' ':
1432               /* Restore backslash followed by newline.  */
1433               BUF_APPEND (base, cur - base);
1434               base = cur;
1435               BUF_APPEND ("\\", 1);
1436             after_backslash:
1437               if (note->type == ' ')
1438                 {
1439                   /* GNU backslash whitespace newline extension.  FIXME
1440                      could be any sequence of non-vertical space.  When we
1441                      can properly restore any such sequence, we should mark
1442                      this note as handled so _cpp_process_line_notes
1443                      doesn't warn.  */
1444                   BUF_APPEND (" ", 1);
1445                 }
1446
1447               BUF_APPEND ("\n", 1);
1448               break;
1449
1450             case 0:
1451               /* Already handled.  */
1452               break;
1453
1454             default:
1455               if (_cpp_trigraph_map[note->type])
1456                 {
1457                   /* Don't warn about this trigraph in
1458                      _cpp_process_line_notes, since trigraphs show up as
1459                      trigraphs in raw strings.  */
1460                   uchar type = note->type;
1461                   note->type = 0;
1462
1463                   if (!CPP_OPTION (pfile, trigraphs))
1464                     /* If we didn't convert the trigraph in the first
1465                        place, don't do anything now either.  */
1466                     break;
1467
1468                   BUF_APPEND (base, cur - base);
1469                   base = cur;
1470                   BUF_APPEND ("??", 2);
1471
1472                   /* ??/ followed by newline gets two line notes, one for
1473                      the trigraph and one for the backslash/newline.  */
1474                   if (type == '/' && note[1].pos == cur)
1475                     {
1476                       if (note[1].type != '\\'
1477                           && note[1].type != ' ')
1478                         abort ();
1479                       BUF_APPEND ("/", 1);
1480                       ++note;
1481                       goto after_backslash;
1482                     }
1483                   /* The ) from ??) could be part of the suffix.  */
1484                   else if (type == ')'
1485                            && strncmp ((const char *) cur+1,
1486                                        (const char *) raw_prefix,
1487                                        raw_prefix_len) == 0
1488                            && cur[raw_prefix_len+1] == '"')
1489                     {
1490                       BUF_APPEND (")", 1);
1491                       base++;
1492                       cur += raw_prefix_len + 2;
1493                       goto break_outer_loop;
1494                     }
1495                   else
1496                     {
1497                       /* Skip the replacement character.  */
1498                       base = ++cur;
1499                       BUF_APPEND (&type, 1);
1500                     }
1501                 }
1502               else
1503                 abort ();
1504               break;
1505             }
1506         }
1507       c = *cur++;
1508
1509       if (c == ')'
1510           && strncmp ((const char *) cur, (const char *) raw_prefix,
1511                       raw_prefix_len) == 0
1512           && cur[raw_prefix_len] == '"')
1513         {
1514           cur += raw_prefix_len + 1;
1515           break;
1516         }
1517       else if (c == '\n')
1518         {
1519           if (pfile->state.in_directive
1520               || pfile->state.parsing_args
1521               || pfile->state.in_deferred_pragma)
1522             {
1523               cur--;
1524               type = CPP_OTHER;
1525               cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, 0,
1526                                    "unterminated raw string");
1527               break;
1528             }
1529
1530           BUF_APPEND (base, cur - base);
1531
1532           if (pfile->buffer->cur < pfile->buffer->rlimit)
1533             CPP_INCREMENT_LINE (pfile, 0);
1534           pfile->buffer->need_line = true;
1535
1536           pfile->buffer->cur = cur-1;
1537           _cpp_process_line_notes (pfile, false);
1538           if (!_cpp_get_fresh_line (pfile))
1539             {
1540               source_location src_loc = token->src_loc;
1541               token->type = CPP_EOF;
1542               /* Tell the compiler the line number of the EOF token.  */
1543               token->src_loc = pfile->line_table->highest_line;
1544               token->flags = BOL;
1545               if (first_buff != NULL)
1546                 _cpp_release_buff (pfile, first_buff);
1547               cpp_error_with_line (pfile, CPP_DL_ERROR, src_loc, 0,
1548                                    "unterminated raw string");
1549               return;
1550             }
1551
1552           cur = base = pfile->buffer->cur;
1553           note = &pfile->buffer->notes[pfile->buffer->cur_note];
1554         }
1555     }
1556  break_outer_loop:
1557
1558   if (CPP_OPTION (pfile, user_literals))
1559     {
1560       /* According to C++11 [lex.ext]p10, a ud-suffix not starting with an
1561          underscore is ill-formed.  Since this breaks programs using macros
1562          from inttypes.h, we generate a warning and treat the ud-suffix as a
1563          separate preprocessing token.  This approach is under discussion by
1564          the standards committee, and has been adopted as a conforming
1565          extension by other front ends such as clang. */
1566       if (ISALPHA (*cur))
1567         {
1568           /* Raise a warning, but do not consume subsequent tokens.  */
1569           if (CPP_OPTION (pfile, warn_literal_suffix))
1570             cpp_warning_with_line (pfile, CPP_W_LITERAL_SUFFIX,
1571                                    token->src_loc, 0,
1572                                    "invalid suffix on literal; C++11 requires "
1573                                    "a space between literal and identifier");
1574         }
1575       /* Grab user defined literal suffix.  */
1576       else if (*cur == '_')
1577         {
1578           type = cpp_userdef_string_add_type (type);
1579           ++cur;
1580
1581           while (ISIDNUM (*cur))
1582             ++cur;
1583         }
1584     }
1585
1586   pfile->buffer->cur = cur;
1587   if (first_buff == NULL)
1588     create_literal (pfile, token, base, cur - base, type);
1589   else
1590     {
1591       uchar *dest = _cpp_unaligned_alloc (pfile, total_len + (cur - base) + 1);
1592
1593       token->type = type;
1594       token->val.str.len = total_len + (cur - base);
1595       token->val.str.text = dest;
1596       last_buff = first_buff;
1597       while (last_buff != NULL)
1598         {
1599           memcpy (dest, last_buff->base,
1600                   BUFF_FRONT (last_buff) - last_buff->base);
1601           dest += BUFF_FRONT (last_buff) - last_buff->base;
1602           last_buff = last_buff->next;
1603         }
1604       _cpp_release_buff (pfile, first_buff);
1605       memcpy (dest, base, cur - base);
1606       dest[cur - base] = '\0';
1607     }
1608 }
1609
1610 /* Lexes a string, character constant, or angle-bracketed header file
1611    name.  The stored string contains the spelling, including opening
1612    quote and any leading 'L', 'u', 'U' or 'u8' and optional
1613    'R' modifier.  It returns the type of the literal, or CPP_OTHER
1614    if it was not properly terminated, or CPP_LESS for an unterminated
1615    header name which must be relexed as normal tokens.
1616
1617    The spelling is NUL-terminated, but it is not guaranteed that this
1618    is the first NUL since embedded NULs are preserved.  */
1619 static void
1620 lex_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
1621 {
1622   bool saw_NUL = false;
1623   const uchar *cur;
1624   cppchar_t terminator;
1625   enum cpp_ttype type;
1626
1627   cur = base;
1628   terminator = *cur++;
1629   if (terminator == 'L' || terminator == 'U')
1630     terminator = *cur++;
1631   else if (terminator == 'u')
1632     {
1633       terminator = *cur++;
1634       if (terminator == '8')
1635         terminator = *cur++;
1636     }
1637   if (terminator == 'R')
1638     {
1639       lex_raw_string (pfile, token, base, cur);
1640       return;
1641     }
1642   if (terminator == '"')
1643     type = (*base == 'L' ? CPP_WSTRING :
1644             *base == 'U' ? CPP_STRING32 :
1645             *base == 'u' ? (base[1] == '8' ? CPP_UTF8STRING : CPP_STRING16)
1646                          : CPP_STRING);
1647   else if (terminator == '\'')
1648     type = (*base == 'L' ? CPP_WCHAR :
1649             *base == 'U' ? CPP_CHAR32 :
1650             *base == 'u' ? CPP_CHAR16 : CPP_CHAR);
1651   else
1652     terminator = '>', type = CPP_HEADER_NAME;
1653
1654   for (;;)
1655     {
1656       cppchar_t c = *cur++;
1657
1658       /* In #include-style directives, terminators are not escapable.  */
1659       if (c == '\\' && !pfile->state.angled_headers && *cur != '\n')
1660         cur++;
1661       else if (c == terminator)
1662         break;
1663       else if (c == '\n')
1664         {
1665           cur--;
1666           /* Unmatched quotes always yield undefined behavior, but
1667              greedy lexing means that what appears to be an unterminated
1668              header name may actually be a legitimate sequence of tokens.  */
1669           if (terminator == '>')
1670             {
1671               token->type = CPP_LESS;
1672               return;
1673             }
1674           type = CPP_OTHER;
1675           break;
1676         }
1677       else if (c == '\0')
1678         saw_NUL = true;
1679     }
1680
1681   if (saw_NUL && !pfile->state.skipping)
1682     cpp_error (pfile, CPP_DL_WARNING,
1683                "null character(s) preserved in literal");
1684
1685   if (type == CPP_OTHER && CPP_OPTION (pfile, lang) != CLK_ASM)
1686     cpp_error (pfile, CPP_DL_PEDWARN, "missing terminating %c character",
1687                (int) terminator);
1688
1689   if (CPP_OPTION (pfile, user_literals))
1690     {
1691       /* According to C++11 [lex.ext]p10, a ud-suffix not starting with an
1692          underscore is ill-formed.  Since this breaks programs using macros
1693          from inttypes.h, we generate a warning and treat the ud-suffix as a
1694          separate preprocessing token.  This approach is under discussion by
1695          the standards committee, and has been adopted as a conforming
1696          extension by other front ends such as clang. */
1697       if (ISALPHA (*cur))
1698         {
1699           /* Raise a warning, but do not consume subsequent tokens.  */
1700           if (CPP_OPTION (pfile, warn_literal_suffix))
1701             cpp_warning_with_line (pfile, CPP_W_LITERAL_SUFFIX,
1702                                    token->src_loc, 0,
1703                                    "invalid suffix on literal; C++11 requires "
1704                                    "a space between literal and identifier");
1705         }
1706       /* Grab user defined literal suffix.  */
1707       else if (*cur == '_')
1708         {
1709           type = cpp_userdef_char_add_type (type);
1710           type = cpp_userdef_string_add_type (type);
1711           ++cur;
1712
1713           while (ISIDNUM (*cur))
1714             ++cur;
1715         }
1716     }
1717
1718   pfile->buffer->cur = cur;
1719   create_literal (pfile, token, base, cur - base, type);
1720 }
1721
1722 /* Return the comment table. The client may not make any assumption
1723    about the ordering of the table.  */
1724 cpp_comment_table *
1725 cpp_get_comments (cpp_reader *pfile)
1726 {
1727   return &pfile->comments;
1728 }
1729
1730 /* Append a comment to the end of the comment table. */
1731 static void
1732 store_comment (cpp_reader *pfile, cpp_token *token)
1733 {
1734   int len;
1735
1736   if (pfile->comments.allocated == 0)
1737     {
1738       pfile->comments.allocated = 256;
1739       pfile->comments.entries = (cpp_comment *) xmalloc
1740         (pfile->comments.allocated * sizeof (cpp_comment));
1741     }
1742
1743   if (pfile->comments.count == pfile->comments.allocated)
1744     {
1745       pfile->comments.allocated *= 2;
1746       pfile->comments.entries = (cpp_comment *) xrealloc
1747         (pfile->comments.entries,
1748          pfile->comments.allocated * sizeof (cpp_comment));
1749     }
1750
1751   len = token->val.str.len;
1752
1753   /* Copy comment. Note, token may not be NULL terminated. */
1754   pfile->comments.entries[pfile->comments.count].comment =
1755     (char *) xmalloc (sizeof (char) * (len + 1));
1756   memcpy (pfile->comments.entries[pfile->comments.count].comment,
1757           token->val.str.text, len);
1758   pfile->comments.entries[pfile->comments.count].comment[len] = '\0';
1759
1760   /* Set source location. */
1761   pfile->comments.entries[pfile->comments.count].sloc = token->src_loc;
1762
1763   /* Increment the count of entries in the comment table. */
1764   pfile->comments.count++;
1765 }
1766
1767 /* The stored comment includes the comment start and any terminator.  */
1768 static void
1769 save_comment (cpp_reader *pfile, cpp_token *token, const unsigned char *from,
1770               cppchar_t type)
1771 {
1772   unsigned char *buffer;
1773   unsigned int len, clen, i;
1774
1775   len = pfile->buffer->cur - from + 1; /* + 1 for the initial '/'.  */
1776
1777   /* C++ comments probably (not definitely) have moved past a new
1778      line, which we don't want to save in the comment.  */
1779   if (is_vspace (pfile->buffer->cur[-1]))
1780     len--;
1781
1782   /* If we are currently in a directive or in argument parsing, then
1783      we need to store all C++ comments as C comments internally, and
1784      so we need to allocate a little extra space in that case.
1785
1786      Note that the only time we encounter a directive here is
1787      when we are saving comments in a "#define".  */
1788   clen = ((pfile->state.in_directive || pfile->state.parsing_args)
1789           && type == '/') ? len + 2 : len;
1790
1791   buffer = _cpp_unaligned_alloc (pfile, clen);
1792
1793   token->type = CPP_COMMENT;
1794   token->val.str.len = clen;
1795   token->val.str.text = buffer;
1796
1797   buffer[0] = '/';
1798   memcpy (buffer + 1, from, len - 1);
1799
1800   /* Finish conversion to a C comment, if necessary.  */
1801   if ((pfile->state.in_directive || pfile->state.parsing_args) && type == '/')
1802     {
1803       buffer[1] = '*';
1804       buffer[clen - 2] = '*';
1805       buffer[clen - 1] = '/';
1806       /* As there can be in a C++ comments illegal sequences for C comments
1807          we need to filter them out.  */
1808       for (i = 2; i < (clen - 2); i++)
1809         if (buffer[i] == '/' && (buffer[i - 1] == '*' || buffer[i + 1] == '*'))
1810           buffer[i] = '|';
1811     }
1812
1813   /* Finally store this comment for use by clients of libcpp. */
1814   store_comment (pfile, token);
1815 }
1816
1817 /* Allocate COUNT tokens for RUN.  */
1818 void
1819 _cpp_init_tokenrun (tokenrun *run, unsigned int count)
1820 {
1821   run->base = XNEWVEC (cpp_token, count);
1822   run->limit = run->base + count;
1823   run->next = NULL;
1824 }
1825
1826 /* Returns the next tokenrun, or creates one if there is none.  */
1827 static tokenrun *
1828 next_tokenrun (tokenrun *run)
1829 {
1830   if (run->next == NULL)
1831     {
1832       run->next = XNEW (tokenrun);
1833       run->next->prev = run;
1834       _cpp_init_tokenrun (run->next, 250);
1835     }
1836
1837   return run->next;
1838 }
1839
1840 /* Return the number of not yet processed token in a given
1841    context.  */
1842 int
1843 _cpp_remaining_tokens_num_in_context (cpp_context *context)
1844 {
1845   if (context->tokens_kind == TOKENS_KIND_DIRECT)
1846     return (LAST (context).token - FIRST (context).token);
1847   else if (context->tokens_kind == TOKENS_KIND_INDIRECT
1848            || context->tokens_kind == TOKENS_KIND_EXTENDED)
1849     return (LAST (context).ptoken - FIRST (context).ptoken);
1850   else
1851       abort ();
1852 }
1853
1854 /* Returns the token present at index INDEX in a given context.  If
1855    INDEX is zero, the next token to be processed is returned.  */
1856 static const cpp_token*
1857 _cpp_token_from_context_at (cpp_context *context, int index)
1858 {
1859   if (context->tokens_kind == TOKENS_KIND_DIRECT)
1860     return &(FIRST (context).token[index]);
1861   else if (context->tokens_kind == TOKENS_KIND_INDIRECT
1862            || context->tokens_kind == TOKENS_KIND_EXTENDED)
1863     return FIRST (context).ptoken[index];
1864  else
1865    abort ();
1866 }
1867
1868 /* Look ahead in the input stream.  */
1869 const cpp_token *
1870 cpp_peek_token (cpp_reader *pfile, int index)
1871 {
1872   cpp_context *context = pfile->context;
1873   const cpp_token *peektok;
1874   int count;
1875
1876   /* First, scan through any pending cpp_context objects.  */
1877   while (context->prev)
1878     {
1879       ptrdiff_t sz = _cpp_remaining_tokens_num_in_context (context);
1880
1881       if (index < (int) sz)
1882         return _cpp_token_from_context_at (context, index);
1883       index -= (int) sz;
1884       context = context->prev;
1885     }
1886
1887   /* We will have to read some new tokens after all (and do so
1888      without invalidating preceding tokens).  */
1889   count = index;
1890   pfile->keep_tokens++;
1891
1892   do
1893     {
1894       peektok = _cpp_lex_token (pfile);
1895       if (peektok->type == CPP_EOF)
1896         return peektok;
1897     }
1898   while (index--);
1899
1900   _cpp_backup_tokens_direct (pfile, count + 1);
1901   pfile->keep_tokens--;
1902
1903   return peektok;
1904 }
1905
1906 /* Allocate a single token that is invalidated at the same time as the
1907    rest of the tokens on the line.  Has its line and col set to the
1908    same as the last lexed token, so that diagnostics appear in the
1909    right place.  */
1910 cpp_token *
1911 _cpp_temp_token (cpp_reader *pfile)
1912 {
1913   cpp_token *old, *result;
1914   ptrdiff_t sz = pfile->cur_run->limit - pfile->cur_token;
1915   ptrdiff_t la = (ptrdiff_t) pfile->lookaheads;
1916
1917   old = pfile->cur_token - 1;
1918   /* Any pre-existing lookaheads must not be clobbered.  */
1919   if (la)
1920     {
1921       if (sz <= la)
1922         {
1923           tokenrun *next = next_tokenrun (pfile->cur_run);
1924
1925           if (sz < la)
1926             memmove (next->base + 1, next->base,
1927                      (la - sz) * sizeof (cpp_token));
1928
1929           next->base[0] = pfile->cur_run->limit[-1];
1930         }
1931
1932       if (sz > 1)
1933         memmove (pfile->cur_token + 1, pfile->cur_token,
1934                  MIN (la, sz - 1) * sizeof (cpp_token));
1935     }
1936
1937   if (!sz && pfile->cur_token == pfile->cur_run->limit)
1938     {
1939       pfile->cur_run = next_tokenrun (pfile->cur_run);
1940       pfile->cur_token = pfile->cur_run->base;
1941     }
1942
1943   result = pfile->cur_token++;
1944   result->src_loc = old->src_loc;
1945   return result;
1946 }
1947
1948 /* Lex a token into RESULT (external interface).  Takes care of issues
1949    like directive handling, token lookahead, multiple include
1950    optimization and skipping.  */
1951 const cpp_token *
1952 _cpp_lex_token (cpp_reader *pfile)
1953 {
1954   cpp_token *result;
1955
1956   for (;;)
1957     {
1958       if (pfile->cur_token == pfile->cur_run->limit)
1959         {
1960           pfile->cur_run = next_tokenrun (pfile->cur_run);
1961           pfile->cur_token = pfile->cur_run->base;
1962         }
1963       /* We assume that the current token is somewhere in the current
1964          run.  */
1965       if (pfile->cur_token < pfile->cur_run->base
1966           || pfile->cur_token >= pfile->cur_run->limit)
1967         abort ();
1968
1969       if (pfile->lookaheads)
1970         {
1971           pfile->lookaheads--;
1972           result = pfile->cur_token++;
1973         }
1974       else
1975         result = _cpp_lex_direct (pfile);
1976
1977       if (result->flags & BOL)
1978         {
1979           /* Is this a directive.  If _cpp_handle_directive returns
1980              false, it is an assembler #.  */
1981           if (result->type == CPP_HASH
1982               /* 6.10.3 p 11: Directives in a list of macro arguments
1983                  gives undefined behavior.  This implementation
1984                  handles the directive as normal.  */
1985               && pfile->state.parsing_args != 1)
1986             {
1987               if (_cpp_handle_directive (pfile, result->flags & PREV_WHITE))
1988                 {
1989                   if (pfile->directive_result.type == CPP_PADDING)
1990                     continue;
1991                   result = &pfile->directive_result;
1992                 }
1993             }
1994           else if (pfile->state.in_deferred_pragma)
1995             result = &pfile->directive_result;
1996
1997           if (pfile->cb.line_change && !pfile->state.skipping)
1998             pfile->cb.line_change (pfile, result, pfile->state.parsing_args);
1999         }
2000
2001       /* We don't skip tokens in directives.  */
2002       if (pfile->state.in_directive || pfile->state.in_deferred_pragma)
2003         break;
2004
2005       /* Outside a directive, invalidate controlling macros.  At file
2006          EOF, _cpp_lex_direct takes care of popping the buffer, so we never
2007          get here and MI optimization works.  */
2008       pfile->mi_valid = false;
2009
2010       if (!pfile->state.skipping || result->type == CPP_EOF)
2011         break;
2012     }
2013
2014   return result;
2015 }
2016
2017 /* Returns true if a fresh line has been loaded.  */
2018 bool
2019 _cpp_get_fresh_line (cpp_reader *pfile)
2020 {
2021   int return_at_eof;
2022
2023   /* We can't get a new line until we leave the current directive.  */
2024   if (pfile->state.in_directive)
2025     return false;
2026
2027   for (;;)
2028     {
2029       cpp_buffer *buffer = pfile->buffer;
2030
2031       if (!buffer->need_line)
2032         return true;
2033
2034       if (buffer->next_line < buffer->rlimit)
2035         {
2036           _cpp_clean_line (pfile);
2037           return true;
2038         }
2039
2040       /* First, get out of parsing arguments state.  */
2041       if (pfile->state.parsing_args)
2042         return false;
2043
2044       /* End of buffer.  Non-empty files should end in a newline.  */
2045       if (buffer->buf != buffer->rlimit
2046           && buffer->next_line > buffer->rlimit
2047           && !buffer->from_stage3)
2048         {
2049           /* Clip to buffer size.  */
2050           buffer->next_line = buffer->rlimit;
2051         }
2052
2053       return_at_eof = buffer->return_at_eof;
2054       _cpp_pop_buffer (pfile);
2055       if (pfile->buffer == NULL || return_at_eof)
2056         return false;
2057     }
2058 }
2059
/* Helper for two-character operators.  Expects `buffer' (whose `cur'
   points at the next unread character) and `result' (the token being
   built) to be in scope at the point of use.  If the next character is
   CHAR it is consumed and the token type becomes THEN_TYPE; otherwise
   nothing is consumed and the type becomes ELSE_TYPE.  The arguments
   are parenthesized so that expression arguments expand safely.  */
#define IF_NEXT_IS(CHAR, THEN_TYPE, ELSE_TYPE)          \
  do                                                    \
    {                                                   \
      result->type = (ELSE_TYPE);                       \
      if (*buffer->cur == (CHAR))                       \
        buffer->cur++, result->type = (THEN_TYPE);      \
    }                                                   \
  while (0)
2068
2069 /* Lex a token into pfile->cur_token, which is also incremented, to
2070    get diagnostics pointing to the correct location.
2071
2072    Does not handle issues such as token lookahead, multiple-include
2073    optimization, directives, skipping etc.  This function is only
2074    suitable for use by _cpp_lex_token, and in special cases like
2075    lex_expansion_token which doesn't care for any of these issues.
2076
2077    When meeting a newline, returns CPP_EOF if parsing a directive,
2078    otherwise returns to the start of the token buffer if permissible.
2079    Returns a pointer to the lexed token.  */
2080 cpp_token *
2081 _cpp_lex_direct (cpp_reader *pfile)
2082 {
2083   cppchar_t c;
2084   cpp_buffer *buffer;
2085   const unsigned char *comment_start;
       /* Claim the slot at cur_token for the token about to be lexed.  */
2086   cpp_token *result = pfile->cur_token++;
2087
       /* Entered at the top of the function and again after each
          newline has been consumed.  */
2088  fresh_line:
2089   result->flags = 0;
2090   buffer = pfile->buffer;
2091   if (buffer->need_line)
2092     {
           /* The end of the line closes a deferred pragma.  */
2093       if (pfile->state.in_deferred_pragma)
2094         {
2095           result->type = CPP_PRAGMA_EOL;
2096           pfile->state.in_deferred_pragma = false;
2097           if (!pfile->state.pragma_allow_expansion)
2098             pfile->state.prevent_expansion--;
2099           return result;
2100         }
           /* No further line could be obtained: report CPP_EOF.  */
2101       if (!_cpp_get_fresh_line (pfile))
2102         {
2103           result->type = CPP_EOF;
2104           if (!pfile->state.in_directive)
2105             {
2106               /* Tell the compiler the line number of the EOF token.  */
2107               result->src_loc = pfile->line_table->highest_line;
2108               result->flags = BOL;
2109             }
2110           return result;
2111         }
           /* Unless tokens are being kept, go back to the first slot
              of the base run and reuse it for this line.  */
2112       if (!pfile->keep_tokens)
2113         {
2114           pfile->cur_run = &pfile->base_run;
2115           result = pfile->base_run.base;
2116           pfile->cur_token = result + 1;
2117         }
           /* This token is the first on its line.  */
2118       result->flags = BOL;
2119       if (pfile->state.parsing_args == 2)
2120         result->flags |= PREV_WHITE;
2121     }
       /* Reload: the code above may have changed pfile->buffer.  */
2122   buffer = pfile->buffer;
       /* Also the re-entry point after a comment that is not saved.  */
2123  update_tokens_line:
2124   result->src_loc = pfile->line_table->highest_line;
2125
       /* Re-entry point after a run of horizontal whitespace.  */
2126  skipped_white:
2127   if (buffer->cur >= buffer->notes[buffer->cur_note].pos
2128       && !pfile->overlaid_buffer)
2129     {
2130       _cpp_process_line_notes (pfile, false);
2131       result->src_loc = pfile->line_table->highest_line;
2132     }
       /* Fetch the character to dispatch on.  buffer->cur now points
          one past it, hence the "buffer->cur - 1" expressions below.  */
2133   c = *buffer->cur++;
2134
2135   if (pfile->forced_token_location_p)
2136     result->src_loc = *pfile->forced_token_location_p;
2137   else
2138     result->src_loc = linemap_position_for_column (pfile->line_table,
2139                                           CPP_BUF_COLUMN (buffer, buffer->cur));
2140
2141   switch (c)
2142     {
2143     case ' ': case '\t': case '\f': case '\v': case '\0':
2144       result->flags |= PREV_WHITE;
2145       skip_whitespace (pfile, c);
2146       goto skipped_white;
2147
2148     case '\n':
2149       if (buffer->cur < buffer->rlimit)
2150         CPP_INCREMENT_LINE (pfile, 0);
2151       buffer->need_line = true;
2152       goto fresh_line;
2153
2154     case '0': case '1': case '2': case '3': case '4':
2155     case '5': case '6': case '7': case '8': case '9':
2156       {
2157         struct normalize_state nst = INITIAL_NORMALIZE_STATE;
2158         result->type = CPP_NUMBER;
2159         lex_number (pfile, &result->val.str, &nst);
2160         warn_about_normalization (pfile, result, &nst);
2161         break;
2162       }
2163
2164     case 'L':
2165     case 'u':
2166     case 'U':
2167     case 'R':
2168       /* 'L', 'u', 'U', 'u8' or 'R' may introduce wide characters,
2169          wide strings or raw strings.  */
2170       if (c == 'L' || CPP_OPTION (pfile, rliterals)
2171           || (c != 'R' && CPP_OPTION (pfile, uliterals)))
2172         {
               /* Accepted here: prefix followed by a quote (a single
                  quote is not accepted after 'R'), prefix followed by
                  R" when raw literals are enabled, and u8 followed by
                  " or by R" (the latter again only with raw literals).  */
2173           if ((*buffer->cur == '\'' && c != 'R')
2174               || *buffer->cur == '"'
2175               || (*buffer->cur == 'R'
2176                   && c != 'R'
2177                   && buffer->cur[1] == '"'
2178                   && CPP_OPTION (pfile, rliterals))
2179               || (*buffer->cur == '8'
2180                   && c == 'u'
2181                   && (buffer->cur[1] == '"'
2182                       || (buffer->cur[1] == 'R' && buffer->cur[2] == '"'
2183                           && CPP_OPTION (pfile, rliterals)))))
2184             {
2185               lex_string (pfile, result, buffer->cur - 1);
2186               break;
2187             }
2188         }
2189       /* Fall through.  */
2190
         /* Identifier start characters.  'L', 'R', 'U' and 'u' are
            absent from the lists below because they are handled above
            and fall through to here.  */
2191     case '_':
2192     case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
2193     case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
2194     case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
2195     case 's': case 't':           case 'v': case 'w': case 'x':
2196     case 'y': case 'z':
2197     case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
2198     case 'G': case 'H': case 'I': case 'J': case 'K':
2199     case 'M': case 'N': case 'O': case 'P': case 'Q':
2200     case 'S': case 'T':           case 'V': case 'W': case 'X':
2201     case 'Y': case 'Z':
2202       result->type = CPP_NAME;
2203       {
2204         struct normalize_state nst = INITIAL_NORMALIZE_STATE;
2205         result->val.node.node = lex_identifier (pfile, buffer->cur - 1, false,
2206                                                 &nst);
2207         warn_about_normalization (pfile, result, &nst);
2208       }
2209
2210       /* Convert named operators to their proper types.  */
2211       if (result->val.node.node->flags & NODE_OPERATOR)
2212         {
2213           result->flags |= NAMED_OP;
2214           result->type = (enum cpp_ttype) result->val.node.node->directive_index;
2215         }
2216       break;
2217
2218     case '\'':
2219     case '"':
2220       lex_string (pfile, result, buffer->cur - 1);
2221       break;
2222
2223     case '/':
2224       /* A potential block or line comment.  */
2225       comment_start = buffer->cur;
2226       c = *buffer->cur;
2227
2228       if (c == '*')
2229         {
2230           if (_cpp_skip_block_comment (pfile))
2231             cpp_error (pfile, CPP_DL_ERROR, "unterminated comment");
2232         }
2233       else if (c == '/' && (CPP_OPTION (pfile, cplusplus_comments)
2234                             || cpp_in_system_header (pfile)))
2235         {
2236           /* Warn about comments only if pedantically GNUC89, and not
2237              in system headers.  */
2238           if (CPP_OPTION (pfile, lang) == CLK_GNUC89 && CPP_PEDANTIC (pfile)
2239               && ! buffer->warned_cplusplus_comments)
2240             {
2241               cpp_error (pfile, CPP_DL_PEDWARN,
2242                          "C++ style comments are not allowed in ISO C90");
2243               cpp_error (pfile, CPP_DL_PEDWARN,
2244                          "(this will be reported only once per input file)");
2245               buffer->warned_cplusplus_comments = 1;
2246             }
2247
2248           if (skip_line_comment (pfile) && CPP_OPTION (pfile, warn_comments))
2249             cpp_warning (pfile, CPP_W_COMMENTS, "multi-line comment");
2250         }
2251       else if (c == '=')
2252         {
2253           buffer->cur++;
2254           result->type = CPP_DIV_EQ;
2255           break;
2256         }
2257       else
2258         {
2259           result->type = CPP_DIV;
2260           break;
2261         }
2262
           /* Only the two comment branches above reach this point.  An
              unsaved comment is treated like whitespace and lexing
              continues with the same token slot.  */
2263       if (!pfile->state.save_comments)
2264         {
2265           result->flags |= PREV_WHITE;
2266           goto update_tokens_line;
2267         }
2268
2269       /* Save the comment as a token in its own right.  */
           /* C is still '*' or '/' here and is passed on as the
              comment type.  */
2270       save_comment (pfile, result, comment_start, c);
2271       break;
2272
2273     case '<':
           /* NOTE(review): presumably lex_string leaves the type as
              CPP_LESS when the '<' does not start a header-name, in
              which case it is lexed as an operator below -- confirm
              against lex_string.  */
2274       if (pfile->state.angled_headers)
2275         {
2276           lex_string (pfile, result, buffer->cur - 1);
2277           if (result->type != CPP_LESS)
2278             break;
2279         }
2280
2281       result->type = CPP_LESS;
2282       if (*buffer->cur == '=')
2283         buffer->cur++, result->type = CPP_LESS_EQ;
2284       else if (*buffer->cur == '<')
2285         {
2286           buffer->cur++;
2287           IF_NEXT_IS ('=', CPP_LSHIFT_EQ, CPP_LSHIFT);
2288         }
2289       else if (CPP_OPTION (pfile, digraphs))
2290         {
               /* Digraphs "<:" and "<%".  */
2291           if (*buffer->cur == ':')
2292             {
2293               buffer->cur++;
2294               result->flags |= DIGRAPH;
2295               result->type = CPP_OPEN_SQUARE;
2296             }
2297           else if (*buffer->cur == '%')
2298             {
2299               buffer->cur++;
2300               result->flags |= DIGRAPH;
2301               result->type = CPP_OPEN_BRACE;
2302             }
2303         }
2304       break;
2305
2306     case '>':
2307       result->type = CPP_GREATER;
2308       if (*buffer->cur == '=')
2309         buffer->cur++, result->type = CPP_GREATER_EQ;
2310       else if (*buffer->cur == '>')
2311         {
2312           buffer->cur++;
2313           IF_NEXT_IS ('=', CPP_RSHIFT_EQ, CPP_RSHIFT);
2314         }
2315       break;
2316
2317     case '%':
2318       result->type = CPP_MOD;
2319       if (*buffer->cur == '=')
2320         buffer->cur++, result->type = CPP_MOD_EQ;
2321       else if (CPP_OPTION (pfile, digraphs))
2322         {
               /* Digraphs "%:", "%:%:" and "%>".  */
2323           if (*buffer->cur == ':')
2324             {
2325               buffer->cur++;
2326               result->flags |= DIGRAPH;
2327               result->type = CPP_HASH;
2328               if (*buffer->cur == '%' && buffer->cur[1] == ':')
2329                 buffer->cur += 2, result->type = CPP_PASTE, result->val.token_no = 0;
2330             }
2331           else if (*buffer->cur == '>')
2332             {
2333               buffer->cur++;
2334               result->flags |= DIGRAPH;
2335               result->type = CPP_CLOSE_BRACE;
2336             }
2337         }
2338       break;
2339
2340     case '.':
2341       result->type = CPP_DOT;
           /* A '.' followed by a digit starts a number.  */
2342       if (ISDIGIT (*buffer->cur))
2343         {
2344           struct normalize_state nst = INITIAL_NORMALIZE_STATE;
2345           result->type = CPP_NUMBER;
2346           lex_number (pfile, &result->val.str, &nst);
2347           warn_about_normalization (pfile, result, &nst);
2348         }
2349       else if (*buffer->cur == '.' && buffer->cur[1] == '.')
2350         buffer->cur += 2, result->type = CPP_ELLIPSIS;
2351       else if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
2352         buffer->cur++, result->type = CPP_DOT_STAR;
2353       break;
2354
2355     case '+':
2356       result->type = CPP_PLUS;
2357       if (*buffer->cur == '+')
2358         buffer->cur++, result->type = CPP_PLUS_PLUS;
2359       else if (*buffer->cur == '=')
2360         buffer->cur++, result->type = CPP_PLUS_EQ;
2361       break;
2362
2363     case '-':
2364       result->type = CPP_MINUS;
2365       if (*buffer->cur == '>')
2366         {
2367           buffer->cur++;
2368           result->type = CPP_DEREF;
2369           if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
2370             buffer->cur++, result->type = CPP_DEREF_STAR;
2371         }
2372       else if (*buffer->cur == '-')
2373         buffer->cur++, result->type = CPP_MINUS_MINUS;
2374       else if (*buffer->cur == '=')
2375         buffer->cur++, result->type = CPP_MINUS_EQ;
2376       break;
2377
2378     case '&':
2379       result->type = CPP_AND;
2380       if (*buffer->cur == '&')
2381         buffer->cur++, result->type = CPP_AND_AND;
2382       else if (*buffer->cur == '=')
2383         buffer->cur++, result->type = CPP_AND_EQ;
2384       break;
2385
2386     case '|':
2387       result->type = CPP_OR;
2388       if (*buffer->cur == '|')
2389         buffer->cur++, result->type = CPP_OR_OR;
2390       else if (*buffer->cur == '=')
2391         buffer->cur++, result->type = CPP_OR_EQ;
2392       break;
2393
2394     case ':':
2395       result->type = CPP_COLON;
2396       if (*buffer->cur == ':' && CPP_OPTION (pfile, cplusplus))
2397         buffer->cur++, result->type = CPP_SCOPE;
2398       else if (*buffer->cur == '>' && CPP_OPTION (pfile, digraphs))
2399         {
2400           buffer->cur++;
2401           result->flags |= DIGRAPH;
2402           result->type = CPP_CLOSE_SQUARE;
2403         }
2404       break;
2405
2406     case '*': IF_NEXT_IS ('=', CPP_MULT_EQ, CPP_MULT); break;
2407     case '=': IF_NEXT_IS ('=', CPP_EQ_EQ, CPP_EQ); break;
2408     case '!': IF_NEXT_IS ('=', CPP_NOT_EQ, CPP_NOT); break;
2409     case '^': IF_NEXT_IS ('=', CPP_XOR_EQ, CPP_XOR); break;
2410     case '#': IF_NEXT_IS ('#', CPP_PASTE, CPP_HASH); result->val.token_no = 0; break;
2411
2412     case '?': result->type = CPP_QUERY; break;
2413     case '~': result->type = CPP_COMPL; break;
2414     case ',': result->type = CPP_COMMA; break;
2415     case '(': result->type = CPP_OPEN_PAREN; break;
2416     case ')': result->type = CPP_CLOSE_PAREN; break;
2417     case '[': result->type = CPP_OPEN_SQUARE; break;
2418     case ']': result->type = CPP_CLOSE_SQUARE; break;
2419     case '{': result->type = CPP_OPEN_BRACE; break;
2420     case '}': result->type = CPP_CLOSE_BRACE; break;
2421     case ';': result->type = CPP_SEMICOLON; break;
2422
2423       /* @ is a punctuator in Objective-C.  */
2424     case '@': result->type = CPP_ATSIGN; break;
2425
2426     case '$':
2427     case '\\':
2428       {
             /* Step back onto the character so that forms_identifier_p
                examines it.  */
2429         const uchar *base = --buffer->cur;
2430         struct normalize_state nst = INITIAL_NORMALIZE_STATE;
2431
2432         if (forms_identifier_p (pfile, true, &nst))
2433           {
2434             result->type = CPP_NAME;
2435             result->val.node.node = lex_identifier (pfile, base, true, &nst);
2436             warn_about_normalization (pfile, result, &nst);
2437             break;
2438           }
             /* Not an identifier: step past the character again and
                treat it as a stray character below.  */
2439         buffer->cur++;
2440       }
           /* Fall through.  */
2441
2442     default:
           /* Any other character becomes a one-character CPP_OTHER
              token.  */
2443       create_literal (pfile, result, buffer->cur - 1, 1, CPP_OTHER);
2444       break;
2445     }
2446
2447   return result;
2448 }
2449
2450 /* An upper bound on the number of bytes needed to spell TOKEN.
2451    Does not include preceding whitespace.  */
2452 unsigned int
2453 cpp_token_len (const cpp_token *token)
2454 {
2455   unsigned int len;
2456
2457   switch (TOKEN_SPELL (token))
2458     {
2459     default:            len = 6;                                break;
2460     case SPELL_LITERAL: len = token->val.str.len;               break;
2461     case SPELL_IDENT:   len = NODE_LEN (token->val.node.node) * 10;     break;
2462     }
2463
2464   return len;
2465 }
2466
/* Decode the UTF-8 sequence that starts at NAME and write it to BUFFER
   as a \U escape with eight lower-case hexadecimal digits.  Exactly 10
   bytes are written to BUFFER (no terminating NUL).  Returns the number
   of bytes of NAME that made up the sequence.  Aborts if a continuation
   byte is ill-formed.  */

static size_t
utf8_to_ucn (unsigned char *buffer, const unsigned char *name)
{
  static const char hex_digits[] = "0123456789abcdef";
  unsigned long code_point;
  unsigned lead = *name;
  int seq_len = 0;
  int k;
  int shift;

  /* The number of leading one bits in the first byte gives the length
     of the sequence.  */
  while (lead & 0x80)
    {
      seq_len++;
      lead <<= 1;
    }

  /* Payload bits of the first byte, then six from each trailing byte.  */
  code_point = *name & (0x7F >> seq_len);
  for (k = 1; k < seq_len; k++)
    {
      unsigned char trail = *++name;

      /* Ill-formed UTF-8: a trailing byte must look like 10xxxxxx.  */
      if ((trail & 0xC0) != 0x80)
        abort ();

      code_point = (code_point << 6) | (trail & 0x3F);
    }

  *buffer++ = '\\';
  *buffer++ = 'U';
  for (shift = 28; shift >= 0; shift -= 4)
    *buffer++ = hex_digits[(code_point >> shift) & 0xF];

  return seq_len;
}
2500
2501 /* Given a token TYPE corresponding to a digraph, return a pointer to
2502    the spelling of the digraph.  */
2503 static const unsigned char *
2504 cpp_digraph2name (enum cpp_ttype type)
2505 {
2506   return digraph_spellings[(int) type - (int) CPP_FIRST_DIGRAPH];
2507 }
2508
2509 /* Write the spelling of a token TOKEN to BUFFER.  The buffer must
2510    already contain the enough space to hold the token's spelling.
2511    Returns a pointer to the character after the last character written.
2512    FORSTRING is true if this is to be the spelling after translation
2513    phase 1 (this is different for UCNs).
2514    FIXME: Would be nice if we didn't need the PFILE argument.  */
2515 unsigned char *
2516 cpp_spell_token (cpp_reader *pfile, const cpp_token *token,
2517                  unsigned char *buffer, bool forstring)
2518 {
2519   switch (TOKEN_SPELL (token))
2520     {
2521     case SPELL_OPERATOR:
2522       {
2523         const unsigned char *spelling;
2524         unsigned char c;
2525
2526         if (token->flags & DIGRAPH)
2527           spelling = cpp_digraph2name (token->type);
2528         else if (token->flags & NAMED_OP)
2529           goto spell_ident;
2530         else
2531           spelling = TOKEN_NAME (token);
2532
2533         while ((c = *spelling++) != '\0')
2534           *buffer++ = c;
2535       }
2536       break;
2537
2538     spell_ident:
2539     case SPELL_IDENT:
2540       if (forstring)
2541         {
2542           memcpy (buffer, NODE_NAME (token->val.node.node),
2543                   NODE_LEN (token->val.node.node));
2544           buffer += NODE_LEN (token->val.node.node);
2545         }
2546       else
2547         {
2548           size_t i;
2549           const unsigned char * name = NODE_NAME (token->val.node.node);
2550
2551           for (i = 0; i < NODE_LEN (token->val.node.node); i++)
2552             if (name[i] & ~0x7F)
2553               {
2554                 i += utf8_to_ucn (buffer, name + i) - 1;
2555                 buffer += 10;
2556               }
2557             else
2558               *buffer++ = NODE_NAME (token->val.node.node)[i];
2559         }
2560       break;
2561
2562     case SPELL_LITERAL:
2563       memcpy (buffer, token->val.str.text, token->val.str.len);
2564       buffer += token->val.str.len;
2565       break;
2566
2567     case SPELL_NONE:
2568       cpp_error (pfile, CPP_DL_ICE,
2569                  "unspellable token %s", TOKEN_NAME (token));
2570       break;
2571     }
2572
2573   return buffer;
2574 }
2575
2576 /* Returns TOKEN spelt as a null-terminated string.  The string is
2577    freed when the reader is destroyed.  Useful for diagnostics.  */
2578 unsigned char *
2579 cpp_token_as_text (cpp_reader *pfile, const cpp_token *token)
2580 {
2581   unsigned int len = cpp_token_len (token) + 1;
2582   unsigned char *start = _cpp_unaligned_alloc (pfile, len), *end;
2583
2584   end = cpp_spell_token (pfile, token, start, false);
2585   end[0] = '\0';
2586
2587   return start;
2588 }
2589
2590 /* Returns a pointer to a string which spells the token defined by
2591    TYPE and FLAGS.  Used by C front ends, which really should move to
2592    using cpp_token_as_text.  */
2593 const char *
2594 cpp_type2name (enum cpp_ttype type, unsigned char flags)
2595 {
2596   if (flags & DIGRAPH)
2597     return (const char *) cpp_digraph2name (type);
2598   else if (flags & NAMED_OP)
2599     return cpp_named_operator2name (type);
2600
2601   return (const char *) token_spellings[type].name;
2602 }
2603
2604 /* Writes the spelling of token to FP, without any preceding space.
2605    Separated from cpp_spell_token for efficiency - to avoid stdio
2606    double-buffering.  */
2607 void
2608 cpp_output_token (const cpp_token *token, FILE *fp)
2609 {
2610   switch (TOKEN_SPELL (token))
2611     {
2612     case SPELL_OPERATOR:
2613       {
2614         const unsigned char *spelling;
2615         int c;
2616
2617         if (token->flags & DIGRAPH)
2618           spelling = cpp_digraph2name (token->type);
2619         else if (token->flags & NAMED_OP)
2620           goto spell_ident;
2621         else
2622           spelling = TOKEN_NAME (token);
2623
2624         c = *spelling;
2625         do
2626           putc (c, fp);
2627         while ((c = *++spelling) != '\0');
2628       }
2629       break;
2630
2631     spell_ident:
2632     case SPELL_IDENT:
2633       {
2634         size_t i;
2635         const unsigned char * name = NODE_NAME (token->val.node.node);
2636
2637         for (i = 0; i < NODE_LEN (token->val.node.node); i++)
2638           if (name[i] & ~0x7F)
2639             {
2640               unsigned char buffer[10];
2641               i += utf8_to_ucn (buffer, name + i) - 1;
2642               fwrite (buffer, 1, 10, fp);
2643             }
2644           else
2645             fputc (NODE_NAME (token->val.node.node)[i], fp);
2646       }
2647       break;
2648
2649     case SPELL_LITERAL:
2650       fwrite (token->val.str.text, 1, token->val.str.len, fp);
2651       break;
2652
2653     case SPELL_NONE:
2654       /* An error, most probably.  */
2655       break;
2656     }
2657 }
2658
2659 /* Compare two tokens.  */
2660 int
2661 _cpp_equiv_tokens (const cpp_token *a, const cpp_token *b)
2662 {
2663   if (a->type == b->type && a->flags == b->flags)
2664     switch (TOKEN_SPELL (a))
2665       {
2666       default:                  /* Keep compiler happy.  */
2667       case SPELL_OPERATOR:
2668         /* token_no is used to track where multiple consecutive ##
2669            tokens were originally located.  */
2670         return (a->type != CPP_PASTE || a->val.token_no == b->val.token_no);
2671       case SPELL_NONE:
2672         return (a->type != CPP_MACRO_ARG
2673                 || a->val.macro_arg.arg_no == b->val.macro_arg.arg_no);
2674       case SPELL_IDENT:
2675         return a->val.node.node == b->val.node.node;
2676       case SPELL_LITERAL:
2677         return (a->val.str.len == b->val.str.len
2678                 && !memcmp (a->val.str.text, b->val.str.text,
2679                             a->val.str.len));
2680       }
2681
2682   return 0;
2683 }
2684
2685 /* Returns nonzero if a space should be inserted to avoid an
2686    accidental token paste for output.  For simplicity, it is
2687    conservative, and occasionally advises a space where one is not
2688    needed, e.g. "." and ".2".  */
2689 int
2690 cpp_avoid_paste (cpp_reader *pfile, const cpp_token *token1,
2691                  const cpp_token *token2)
2692 {
2693   enum cpp_ttype a = token1->type, b = token2->type;
2694   cppchar_t c;
2695
2696   if (token1->flags & NAMED_OP)
2697     a = CPP_NAME;
2698   if (token2->flags & NAMED_OP)
2699     b = CPP_NAME;
2700
2701   c = EOF;
2702   if (token2->flags & DIGRAPH)
2703     c = digraph_spellings[(int) b - (int) CPP_FIRST_DIGRAPH][0];
2704   else if (token_spellings[b].category == SPELL_OPERATOR)
2705     c = token_spellings[b].name[0];
2706
2707   /* Quickly get everything that can paste with an '='.  */
2708   if ((int) a <= (int) CPP_LAST_EQ && c == '=')
2709     return 1;
2710
2711   switch (a)
2712     {
2713     case CPP_GREATER:   return c == '>';
2714     case CPP_LESS:      return c == '<' || c == '%' || c == ':';
2715     case CPP_PLUS:      return c == '+';
2716     case CPP_MINUS:     return c == '-' || c == '>';
2717     case CPP_DIV:       return c == '/' || c == '*'; /* Comments.  */
2718     case CPP_MOD:       return c == ':' || c == '>';
2719     case CPP_AND:       return c == '&';
2720     case CPP_OR:        return c == '|';
2721     case CPP_COLON:     return c == ':' || c == '>';
2722     case CPP_DEREF:     return c == '*';
2723     case CPP_DOT:       return c == '.' || c == '%' || b == CPP_NUMBER;
2724     case CPP_HASH:      return c == '#' || c == '%'; /* Digraph form.  */
2725     case CPP_NAME:      return ((b == CPP_NUMBER
2726                                  && name_p (pfile, &token2->val.str))
2727                                 || b == CPP_NAME
2728                                 || b == CPP_CHAR || b == CPP_STRING); /* L */
2729     case CPP_NUMBER:    return (b == CPP_NUMBER || b == CPP_NAME
2730                                 || c == '.' || c == '+' || c == '-');
2731                                       /* UCNs */
2732     case CPP_OTHER:     return ((token1->val.str.text[0] == '\\'
2733                                  && b == CPP_NAME)
2734                                 || (CPP_OPTION (pfile, objc)
2735                                     && token1->val.str.text[0] == '@'
2736                                     && (b == CPP_NAME || b == CPP_STRING)));
2737     default:            break;
2738     }
2739
2740   return 0;
2741 }
2742
2743 /* Output all the remaining tokens on the current line, and a newline
2744    character, to FP.  Leading whitespace is removed.  If there are
2745    macros, special token padding is not performed.  */
2746 void
2747 cpp_output_line (cpp_reader *pfile, FILE *fp)
2748 {
2749   const cpp_token *token;
2750
2751   token = cpp_get_token (pfile);
2752   while (token->type != CPP_EOF)
2753     {
2754       cpp_output_token (token, fp);
2755       token = cpp_get_token (pfile);
2756       if (token->flags & PREV_WHITE)
2757         putc (' ', fp);
2758     }
2759
2760   putc ('\n', fp);
2761 }
2762
2763 /* Return a string representation of all the remaining tokens on the
2764    current line.  The result is allocated using xmalloc and must be
2765    freed by the caller.  */
2766 unsigned char *
2767 cpp_output_line_to_string (cpp_reader *pfile, const unsigned char *dir_name)
2768 {
2769   const cpp_token *token;
2770   unsigned int out = dir_name ? ustrlen (dir_name) : 0;
2771   unsigned int alloced = 120 + out;
2772   unsigned char *result = (unsigned char *) xmalloc (alloced);
2773
2774   /* If DIR_NAME is empty, there are no initial contents.  */
2775   if (dir_name)
2776     {
2777       sprintf ((char *) result, "#%s ", dir_name);
2778       out += 2;
2779     }
2780
2781   token = cpp_get_token (pfile);
2782   while (token->type != CPP_EOF)
2783     {
2784       unsigned char *last;
2785       /* Include room for a possible space and the terminating nul.  */
2786       unsigned int len = cpp_token_len (token) + 2;
2787
2788       if (out + len > alloced)
2789         {
2790           alloced *= 2;
2791           if (out + len > alloced)
2792             alloced = out + len;
2793           result = (unsigned char *) xrealloc (result, alloced);
2794         }
2795
2796       last = cpp_spell_token (pfile, token, &result[out], 0);
2797       out = last - result;
2798
2799       token = cpp_get_token (pfile);
2800       if (token->flags & PREV_WHITE)
2801         result[out++] = ' ';
2802     }
2803
2804   result[out] = '\0';
2805   return result;
2806 }
2807
/* Memory buffers.  Changing these three constants can have a dramatic
   effect on performance.  The values here are reasonable defaults,
   but might be tuned.  If you adjust them, be sure to test across a
   range of uses of cpplib, including heavy nested function-like macro
   expansion.  Also check the change in peak memory usage (NJAMD is a
   good tool for this).

   MIN_BUFF_SIZE is the smallest payload size new_buff will allocate.

   BUFF_SIZE_UPPER_BOUND (MIN_SIZE) is the largest free buffer that
   _cpp_get_buff will hand out for a request of MIN_SIZE bytes.

   EXTENDED_BUFF_SIZE (BUFF, MIN_EXTRA) is the size requested when
   extending BUFF: MIN_EXTRA plus twice the room left between
   BUFF->cur and BUFF->limit.  Every macro argument is parenthesized
   so that an argument expression containing a lower-precedence
   operator (e.g. a conditional expression) expands as intended.  */
#define MIN_BUFF_SIZE 8000
#define BUFF_SIZE_UPPER_BOUND(MIN_SIZE) (MIN_BUFF_SIZE + (MIN_SIZE) * 3 / 2)
#define EXTENDED_BUFF_SIZE(BUFF, MIN_EXTRA) \
        ((MIN_EXTRA) + ((BUFF)->limit - (BUFF)->cur) * 2)

#if MIN_BUFF_SIZE > BUFF_SIZE_UPPER_BOUND (0)
  #error BUFF_SIZE_UPPER_BOUND must be at least as large as MIN_BUFF_SIZE!
#endif
2822
2823 /* Create a new allocation buffer.  Place the control block at the end
2824    of the buffer, so that buffer overflows will cause immediate chaos.  */
2825 static _cpp_buff *
2826 new_buff (size_t len)
2827 {
2828   _cpp_buff *result;
2829   unsigned char *base;
2830
2831   if (len < MIN_BUFF_SIZE)
2832     len = MIN_BUFF_SIZE;
2833   len = CPP_ALIGN (len);
2834
2835   base = XNEWVEC (unsigned char, len + sizeof (_cpp_buff));
2836   result = (_cpp_buff *) (base + len);
2837   result->base = base;
2838   result->cur = base;
2839   result->limit = base + len;
2840   result->next = NULL;
2841   return result;
2842 }
2843
2844 /* Place a chain of unwanted allocation buffers on the free list.  */
2845 void
2846 _cpp_release_buff (cpp_reader *pfile, _cpp_buff *buff)
2847 {
2848   _cpp_buff *end = buff;
2849
2850   while (end->next)
2851     end = end->next;
2852   end->next = pfile->free_buffs;
2853   pfile->free_buffs = buff;
2854 }
2855
2856 /* Return a free buffer of size at least MIN_SIZE.  */
2857 _cpp_buff *
2858 _cpp_get_buff (cpp_reader *pfile, size_t min_size)
2859 {
2860   _cpp_buff *result, **p;
2861
2862   for (p = &pfile->free_buffs;; p = &(*p)->next)
2863     {
2864       size_t size;
2865
2866       if (*p == NULL)
2867         return new_buff (min_size);
2868       result = *p;
2869       size = result->limit - result->base;
2870       /* Return a buffer that's big enough, but don't waste one that's
2871          way too big.  */
2872       if (size >= min_size && size <= BUFF_SIZE_UPPER_BOUND (min_size))
2873         break;
2874     }
2875
2876   *p = result->next;
2877   result->next = NULL;
2878   result->cur = result->base;
2879   return result;
2880 }
2881
2882 /* Creates a new buffer with enough space to hold the uncommitted
2883    remaining bytes of BUFF, and at least MIN_EXTRA more bytes.  Copies
2884    the excess bytes to the new buffer.  Chains the new buffer after
2885    BUFF, and returns the new buffer.  */
2886 _cpp_buff *
2887 _cpp_append_extend_buff (cpp_reader *pfile, _cpp_buff *buff, size_t min_extra)
2888 {
2889   size_t size = EXTENDED_BUFF_SIZE (buff, min_extra);
2890   _cpp_buff *new_buff = _cpp_get_buff (pfile, size);
2891
2892   buff->next = new_buff;
2893   memcpy (new_buff->base, buff->cur, BUFF_ROOM (buff));
2894   return new_buff;
2895 }
2896
2897 /* Creates a new buffer with enough space to hold the uncommitted
2898    remaining bytes of the buffer pointed to by BUFF, and at least
2899    MIN_EXTRA more bytes.  Copies the excess bytes to the new buffer.
2900    Chains the new buffer before the buffer pointed to by BUFF, and
2901    updates the pointer to point to the new buffer.  */
2902 void
2903 _cpp_extend_buff (cpp_reader *pfile, _cpp_buff **pbuff, size_t min_extra)
2904 {
2905   _cpp_buff *new_buff, *old_buff = *pbuff;
2906   size_t size = EXTENDED_BUFF_SIZE (old_buff, min_extra);
2907
2908   new_buff = _cpp_get_buff (pfile, size);
2909   memcpy (new_buff->base, old_buff->cur, BUFF_ROOM (old_buff));
2910   new_buff->next = old_buff;
2911   *pbuff = new_buff;
2912 }
2913
2914 /* Free a chain of buffers starting at BUFF.  */
2915 void
2916 _cpp_free_buff (_cpp_buff *buff)
2917 {
2918   _cpp_buff *next;
2919
2920   for (; buff; buff = next)
2921     {
2922       next = buff->next;
2923       free (buff->base);
2924     }
2925 }
2926
2927 /* Allocate permanent, unaligned storage of length LEN.  */
2928 unsigned char *
2929 _cpp_unaligned_alloc (cpp_reader *pfile, size_t len)
2930 {
2931   _cpp_buff *buff = pfile->u_buff;
2932   unsigned char *result = buff->cur;
2933
2934   if (len > (size_t) (buff->limit - result))
2935     {
2936       buff = _cpp_get_buff (pfile, len);
2937       buff->next = pfile->u_buff;
2938       pfile->u_buff = buff;
2939       result = buff->cur;
2940     }
2941
2942   buff->cur = result + len;
2943   return result;
2944 }
2945
2946 /* Allocate permanent, unaligned storage of length LEN from a_buff.
2947    That buffer is used for growing allocations when saving macro
2948    replacement lists in a #define, and when parsing an answer to an
2949    assertion in #assert, #unassert or #if (and therefore possibly
2950    whilst expanding macros).  It therefore must not be used by any
2951    code that they might call: specifically the lexer and the guts of
2952    the macro expander.
2953
2954    All existing other uses clearly fit this restriction: storing
2955    registered pragmas during initialization.  */
2956 unsigned char *
2957 _cpp_aligned_alloc (cpp_reader *pfile, size_t len)
2958 {
2959   _cpp_buff *buff = pfile->a_buff;
2960   unsigned char *result = buff->cur;
2961
2962   if (len > (size_t) (buff->limit - result))
2963     {
2964       buff = _cpp_get_buff (pfile, len);
2965       buff->next = pfile->a_buff;
2966       pfile->a_buff = buff;
2967       result = buff->cur;
2968     }
2969
2970   buff->cur = result + len;
2971   return result;
2972 }
2973
2974 /* Say which field of TOK is in use.  */
2975
2976 enum cpp_token_fld_kind
2977 cpp_token_val_index (cpp_token *tok)
2978 {
2979   switch (TOKEN_SPELL (tok))
2980     {
2981     case SPELL_IDENT:
2982       return CPP_TOKEN_FLD_NODE;
2983     case SPELL_LITERAL:
2984       return CPP_TOKEN_FLD_STR;
2985     case SPELL_OPERATOR:
2986       if (tok->type == CPP_PASTE)
2987         return CPP_TOKEN_FLD_TOKEN_NO;
2988       else
2989         return CPP_TOKEN_FLD_NONE;
2990     case SPELL_NONE:
2991       if (tok->type == CPP_MACRO_ARG)
2992         return CPP_TOKEN_FLD_ARG_NO;
2993       else if (tok->type == CPP_PADDING)
2994         return CPP_TOKEN_FLD_SOURCE;
2995       else if (tok->type == CPP_PRAGMA)
2996         return CPP_TOKEN_FLD_PRAGMA;
2997       /* else fall through */
2998     default:
2999       return CPP_TOKEN_FLD_NONE;
3000     }
3001 }
3002
3003 /* All tokens lexed in R after calling this function will be forced to have
3004    their source_location the same as the location referenced by P, until
3005    cpp_stop_forcing_token_locations is called for R.  */
3006
3007 void
3008 cpp_force_token_locations (cpp_reader *r, source_location *p)
3009 {
3010   r->forced_token_location_p = p;
3011 }
3012
3013 /* Go back to assigning locations naturally for lexed tokens.  */
3014
3015 void
3016 cpp_stop_forcing_token_locations (cpp_reader *r)
3017 {
3018   r->forced_token_location_p = NULL;
3019 }