libcpp/lex.c

   1 /* CPP Library - lexical analysis.
   2    Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2007, 2008, 2009, 2010,
   3    2011, 2012 Free Software Foundation, Inc.
   4    Contributed by Per Bothner, 1994-95.
   5    Based on CCCP program by Paul Rubin, June 1986
   6    Adapted to ANSI C, Richard Stallman, Jan 1987
   7    Broken out to separate file, Zack Weinberg, Mar 2000
   8
   9 This program is free software; you can redistribute it and/or modify it
  10 under the terms of the GNU General Public License as published by the
  11 Free Software Foundation; either version 3, or (at your option) any
  12 later version.
  13
  14 This program is distributed in the hope that it will be useful,
  15 but WITHOUT ANY WARRANTY; without even the implied warranty of
  16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  17 GNU General Public License for more details.
  18
  19 You should have received a copy of the GNU General Public License
  20 along with this program; see the file COPYING3.  If not see
  21 <http://www.gnu.org/licenses/>.  */
  22
  23 #include "config.h"
  24 #include "system.h"
  25 #include "cpplib.h"
  26 #include "internal.h"
  27
/* Spelling category of a token type.  Every row of the token_spellings
   table below carries one of these values: rows generated by OP are
   tagged SPELL_OPERATOR, rows generated by TK are tagged with the
   SPELL_* value named in the token table.  The category is read back
   with TOKEN_SPELL.  */
enum spell_type
{
  SPELL_OPERATOR = 0,
  SPELL_IDENT,
  SPELL_LITERAL,
  SPELL_NONE
};
  35
/* One row of the token spelling table: the spelling category of a
   token type together with a fixed string associated with it.  */
struct token_spelling
{
  enum spell_type category;     /* Read through TOKEN_SPELL.  */
  const unsigned char *name;    /* Read through TOKEN_NAME.  */
};
  41
/* Spellings of the six digraphs.  NOTE(review): the code that indexes
   this array is outside the part of the file reviewed here; the order
   presumably matters to it -- confirm before reordering.  */
static const unsigned char *const digraph_spellings[] =
{ UC"%:", UC"%:%:", UC"<:", UC":>", UC"<%", UC"%>" };

/* Build token_spellings by expanding TTYPE_TABLE with temporary
   definitions of OP and TK.  An OP (e, s) entry yields a
   SPELL_OPERATOR row whose name is the string S; a TK (e, s) entry
   yields a row of category SPELL_<s> whose name is the stringified
   enumerator E.  Both helper macros are undefined again right after
   the table has been built.  */
#define OP(e, s) { SPELL_OPERATOR, UC s  },
#define TK(e, s) { SPELL_ ## s,    UC #e },
static const struct token_spelling token_spellings[N_TTYPES] = { TTYPE_TABLE };
#undef OP
#undef TK

/* Look up the spelling category, respectively the name, of TOKEN in
   token_spellings, indexed by the token's type.  */
#define TOKEN_SPELL(token) (token_spellings[(token)->type].category)
#define TOKEN_NAME(token) (token_spellings[(token)->type].name)
  53
/* Forward declarations of file-local helpers.  add_line_note and
   warn_in_comment are defined further down; NOTE(review): the other
   functions are presumably defined later in this file -- their
   definitions are outside the portion reviewed here.  */
static void add_line_note (cpp_buffer *, const uchar *, unsigned int);
static int skip_line_comment (cpp_reader *);
static void skip_whitespace (cpp_reader *, cppchar_t);
static void lex_string (cpp_reader *, cpp_token *, const uchar *);
static void save_comment (cpp_reader *, cpp_token *, const uchar *, cppchar_t);
static void store_comment (cpp_reader *, cpp_token *);
static void create_literal (cpp_reader *, cpp_token *, const uchar *,
                            unsigned int, enum cpp_ttype);
static bool warn_in_comment (cpp_reader *, _cpp_line_note *);
static int name_p (cpp_reader *, const cpp_string *);
static tokenrun *next_tokenrun (tokenrun *);

static _cpp_buff *new_buff (size_t);
  67
  68
  69 /* Utility routine:
  70
  71    Compares, the token TOKEN to the NUL-terminated string STRING.
  72    TOKEN must be a CPP_NAME.  Returns 1 for equal, 0 for unequal.  */
  73 int
  74 cpp_ideq (const cpp_token *token, const char *string)
  75 {
  76   if (token->type != CPP_NAME)
  77     return 0;
  78
  79   return !ustrcmp (NODE_NAME (token->val.node.node), (const uchar *) string);
  80 }
  81
  82 /* Record a note TYPE at byte POS into the current cleaned logical
  83    line.  */
  84 static void
  85 add_line_note (cpp_buffer *buffer, const uchar *pos, unsigned int type)
  86 {
  87   if (buffer->notes_used == buffer->notes_cap)
  88     {
  89       buffer->notes_cap = buffer->notes_cap * 2 + 200;
  90       buffer->notes = XRESIZEVEC (_cpp_line_note, buffer->notes,
  91                                   buffer->notes_cap);
  92     }
  93
  94   buffer->notes[buffer->notes_used].pos = pos;
  95   buffer->notes[buffer->notes_used].type = type;
  96   buffer->notes_used++;
  97 }
  98
  99 \f
 100 /* Fast path to find line special characters using optimized character
 101    scanning algorithms.  Anything complicated falls back to the slow
 102    path below.  Since this loop is very hot it's worth doing these kinds
 103    of optimizations.
 104
 105    One of the paths through the ifdefs should provide
 106
 107      const uchar *search_line_fast (const uchar *s, const uchar *end);
 108
 109    Between S and END, search for \n, \r, \\, ?.  Return a pointer to
 110    the found character.
 111
 112    Note that the last character of the buffer is *always* a newline,
 113    as forced by _cpp_convert_input.  This fact can be used to avoid
 114    explicitly looking for the end of the buffer.  */
 115
/* Configure gives us an ifdef test.  Give the macro a value in every
   configuration so that the helpers below can test it with a plain
   `if' and in #if expressions.  */
#ifndef WORDS_BIGENDIAN
#define WORDS_BIGENDIAN 0
#endif

/* We'd like the largest integer that fits into a register.  There's nothing
   in <stdint.h> that gives us that.  For most hosts this is unsigned long,
   but MS decided on an LLP64 model.  Thankfully when building with GCC we
   can get the "real" word size.  */
#ifdef __GNUC__
typedef unsigned int word_type __attribute__((__mode__(__word__)));
#else
typedef unsigned long word_type;
#endif

/* The code below is only expecting sizes 4 or 8.
   Die at compile-time if this expectation is violated: the array
   bound evaluates to 1 when the size is acceptable and to -1, an
   invalid bound, when it is not.  */
typedef char check_word_type_size
  [(sizeof(word_type) == 8 || sizeof(word_type) == 4) * 2 - 1];
 135
 136 /* Return X with the first N bytes forced to values that won't match one
 137    of the interesting characters.  Note that NUL is not interesting.  */
 138
 139 static inline word_type
 140 acc_char_mask_misalign (word_type val, unsigned int n)
 141 {
 142   word_type mask = -1;
 143   if (WORDS_BIGENDIAN)
 144     mask >>= n * 8;
 145   else
 146     mask <<= n * 8;
 147   return val & mask;
 148 }
 149
 150 /* Return X replicated to all byte positions within WORD_TYPE.  */
 151
 152 static inline word_type
 153 acc_char_replicate (uchar x)
 154 {
 155   word_type ret;
 156
 157   ret = (x << 24) | (x << 16) | (x << 8) | x;
 158   if (sizeof(word_type) == 8)
 159     ret = (ret << 16 << 16) | ret;
 160   return ret;
 161 }
 162
 163 /* Return non-zero if some byte of VAL is (probably) C.  */
 164
 165 static inline word_type
 166 acc_char_cmp (word_type val, word_type c)
 167 {
 168 #if defined(__GNUC__) && defined(__alpha__)
 169   /* We can get exact results using a compare-bytes instruction.
 170      Get (val == c) via (0 >= (val ^ c)).  */
 171   return __builtin_alpha_cmpbge (0, val ^ c);
 172 #else
 173   word_type magic = 0x7efefefeU;
 174   if (sizeof(word_type) == 8)
 175     magic = (magic << 16 << 16) | 0xfefefefeU;
 176   magic |= 1;
 177
 178   val ^= c;
 179   return ((val + magic) ^ ~val) & ~magic;
 180 #endif
 181 }
 182
 183 /* Given the result of acc_char_cmp is non-zero, return the index of
 184    the found character.  If this was a false positive, return -1.  */
 185
 186 static inline int
 187 acc_char_index (word_type cmp ATTRIBUTE_UNUSED,
 188                 word_type val ATTRIBUTE_UNUSED)
 189 {
 190 #if defined(__GNUC__) && defined(__alpha__) && !WORDS_BIGENDIAN
 191   /* The cmpbge instruction sets *bits* of the result corresponding to
 192      matches in the bytes with no false positives.  */
 193   return __builtin_ctzl (cmp);
 194 #else
 195   unsigned int i;
 196
 197   /* ??? It would be nice to force unrolling here,
 198      and have all of these constants folded.  */
 199   for (i = 0; i < sizeof(word_type); ++i)
 200     {
 201       uchar c;
 202       if (WORDS_BIGENDIAN)
 203         c = (val >> (sizeof(word_type) - i - 1) * 8) & 0xff;
 204       else
 205         c = (val >> i * 8) & 0xff;
 206
 207       if (c == '\n' || c == '\r' || c == '\\' || c == '?')
 208         return i;
 209     }
 210
 211   return -1;
 212 #endif
 213 }
 214
 215 /* A version of the fast scanner using bit fiddling techniques.
 216
 217    For 32-bit words, one would normally perform 16 comparisons and
 218    16 branches.  With this algorithm one performs 24 arithmetic
 219    operations and one branch.  Whether this is faster with a 32-bit
 220    word size is going to be somewhat system dependent.
 221
 222    For 64-bit words, we eliminate twice the number of comparisons
 223    and branches without increasing the number of arithmetic operations.
 224    It's almost certainly going to be a win with 64-bit word size.  */
 225
 226 static const uchar * search_line_acc_char (const uchar *, const uchar *)
 227   ATTRIBUTE_UNUSED;
 228
 229 static const uchar *
 230 search_line_acc_char (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 231 {
 232   const word_type repl_nl = acc_char_replicate ('\n');
 233   const word_type repl_cr = acc_char_replicate ('\r');
 234   const word_type repl_bs = acc_char_replicate ('\\');
 235   const word_type repl_qm = acc_char_replicate ('?');
 236
 237   unsigned int misalign;
 238   const word_type *p;
 239   word_type val, t;
 240
 241   /* Align the buffer.  Mask out any bytes from before the beginning.  */
 242   p = (word_type *)((uintptr_t)s & -sizeof(word_type));
 243   val = *p;
 244   misalign = (uintptr_t)s & (sizeof(word_type) - 1);
 245   if (misalign)
 246     val = acc_char_mask_misalign (val, misalign);
 247
 248   /* Main loop.  */
 249   while (1)
 250     {
 251       t  = acc_char_cmp (val, repl_nl);
 252       t |= acc_char_cmp (val, repl_cr);
 253       t |= acc_char_cmp (val, repl_bs);
 254       t |= acc_char_cmp (val, repl_qm);
 255
 256       if (__builtin_expect (t != 0, 0))
 257         {
 258           int i = acc_char_index (t, val);
 259           if (i >= 0)
 260             return (const uchar *)p + i;
 261         }
 262
 263       val = *++p;
 264     }
 265 }
 266
 267 /* Disable on Solaris 2/x86 until the following problems can be properly
 268    autoconfed:
 269
 270    The Solaris 9 assembler cannot assemble SSE4.2 insns.
 271    Before Solaris 9 Update 6, SSE insns cannot be executed.
 272    The Solaris 10+ assembler tags objects with the instruction set
 273    extensions used, so SSE4.2 executables cannot run on machines that
 274    don't support that extension.  */
 275
 276 #if (GCC_VERSION >= 4005) && (defined(__i386__) || defined(__x86_64__)) && !(defined(__sun__) && defined(__svr4__))
 277
/* Replicated character data to be shared between implementations.
   Recall that outside of a context with vector support we can't
   define compatible vector types, therefore these are all defined
   in terms of raw characters.

   Row order is significant: the MMX and SSE2 scanners read row 0 as
   newline, row 1 as carriage return, row 2 as backslash and row 3 as
   question mark.  Each row is 16 bytes long and the array is 16-byte
   aligned so that a row can be loaded as one vector; the MMX scanner
   uses only the first 8 bytes of a row.  */
static const char repl_chars[4][16] __attribute__((aligned(16))) = {
  { '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
    '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n' },
  { '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
    '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r' },
  { '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
    '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\' },
  { '?', '?', '?', '?', '?', '?', '?', '?',
    '?', '?', '?', '?', '?', '?', '?', '?' },
};
 292
/* A version of the fast scanner using MMX vectorized byte compare insns.

   This uses the PMOVMSKB instruction which was introduced with "MMX2",
   which was packaged into SSE1; it is also present in the AMD MMX
   extension.  Mark the function as using "sse" so that we emit a real
   "emms" instruction, rather than the 3dNOW "femms" instruction.

   S is where scanning starts; END is unused.  Returns a pointer to the
   first \n, \r, \\ or ? at or after S.  The loop has no end-of-buffer
   test; it only stops when such a character is found.  */

static const uchar *
#ifndef __SSE__
__attribute__((__target__("sse")))
#endif
search_line_mmx (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
{
  typedef char v8qi __attribute__ ((__vector_size__ (8)));
  typedef int __m64 __attribute__ ((__vector_size__ (8), __may_alias__));

  /* Only the first 8 bytes of each 16-byte row of repl_chars are
     loaded here.  */
  const v8qi repl_nl = *(const v8qi *)repl_chars[0];
  const v8qi repl_cr = *(const v8qi *)repl_chars[1];
  const v8qi repl_bs = *(const v8qi *)repl_chars[2];
  const v8qi repl_qm = *(const v8qi *)repl_chars[3];

  unsigned int misalign, found, mask;
  const v8qi *p;
  v8qi data, t, c;

  /* Align the source pointer.  While MMX doesn't generate unaligned data
     faults, this allows us to safely scan to the end of the buffer without
     reading beyond the end of the last page.  */
  misalign = (uintptr_t)s & 7;
  p = (const v8qi *)((uintptr_t)s & -8);
  data = *p;

  /* Create a mask for the bytes that are valid within the first
     8-byte block, i.e. clear the bits of the bytes that precede S.
     The idea here is that the AND with the mask within the loop is
     "free", since we need some AND or TEST insn in order to set the
     flags for the branch anyway.  */
  mask = -1u << misalign;

  /* Main loop processing 8 bytes at a time.  The first block has
     already been loaded, so jump past the load at the loop head.  */
  goto start;
  do
    {
      data = *++p;
      /* Every byte of the blocks after the first one is valid.  */
      mask = -1;

    start:
      /* OR together the byte-wise equality results for the four
         characters of interest.  */
      t = __builtin_ia32_pcmpeqb(data, repl_nl);
      c = __builtin_ia32_pcmpeqb(data, repl_cr);
      t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
      c = __builtin_ia32_pcmpeqb(data, repl_bs);
      t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
      c = __builtin_ia32_pcmpeqb(data, repl_qm);
      t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
      /* Collapse the per-byte results into one bit per byte.  */
      found = __builtin_ia32_pmovmskb (t);
      found &= mask;
    }
  while (!found);

  /* Leave MMX state before returning to ordinary code.  */
  __builtin_ia32_emms ();

  /* FOUND contains 1 in bits for which we matched a relevant
     character.  Conversion to the byte index is trivial.  */
  found = __builtin_ctz(found);
  return (const uchar *)p + found;
}
 358
/* A version of the fast scanner using SSE2 vectorized byte compare insns.

   S is where scanning starts; END is unused.  Returns a pointer to the
   first \n, \r, \\ or ? at or after S.  The loop has no end-of-buffer
   test; it only stops when such a character is found.  */

static const uchar *
#ifndef __SSE2__
__attribute__((__target__("sse2")))
#endif
search_line_sse2 (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
{
  typedef char v16qi __attribute__ ((__vector_size__ (16)));

  const v16qi repl_nl = *(const v16qi *)repl_chars[0];
  const v16qi repl_cr = *(const v16qi *)repl_chars[1];
  const v16qi repl_bs = *(const v16qi *)repl_chars[2];
  const v16qi repl_qm = *(const v16qi *)repl_chars[3];

  unsigned int misalign, found, mask;
  const v16qi *p;
  v16qi data, t;

  /* Align the source pointer, so that every load is a whole aligned
     16-byte block.  */
  misalign = (uintptr_t)s & 15;
  p = (const v16qi *)((uintptr_t)s & -16);
  data = *p;

  /* Create a mask for the bytes that are valid within the first
     16-byte block, i.e. clear the bits of the bytes that precede S.
     The Idea here is that the AND with the mask
     within the loop is "free", since we need some AND or TEST
     insn in order to set the flags for the branch anyway.  */
  mask = -1u << misalign;

  /* Main loop processing 16 bytes at a time.  The first block has
     already been loaded, so jump past the load at the loop head.  */
  goto start;
  do
    {
      data = *++p;
      /* Every byte of the blocks after the first one is valid.  */
      mask = -1;

    start:
      /* OR together the byte-wise equality results for the four
         characters of interest, then collapse them into one bit per
         byte.  */
      t  = __builtin_ia32_pcmpeqb128(data, repl_nl);
      t |= __builtin_ia32_pcmpeqb128(data, repl_cr);
      t |= __builtin_ia32_pcmpeqb128(data, repl_bs);
      t |= __builtin_ia32_pcmpeqb128(data, repl_qm);
      found = __builtin_ia32_pmovmskb128 (t);
      found &= mask;
    }
  while (!found);

  /* FOUND contains 1 in bits for which we matched a relevant
     character.  Conversion to the byte index is trivial.  */
  found = __builtin_ctz(found);
  return (const uchar *)p + found;
}
 411
#ifdef HAVE_SSE4
/* A version of the fast scanner using SSE 4.2 vectorized string insns.

   S is where scanning starts and END is the end of the buffer; END is
   only used to decide whether an unaligned 16-byte read at S is safe.
   Returns a pointer to the first \n, \r, \\ or ? at or after S.  */

static const uchar *
#ifndef __SSE4_2__
__attribute__((__target__("sse4.2")))
#endif
search_line_sse42 (const uchar *s, const uchar *end)
{
  typedef char v16qi __attribute__ ((__vector_size__ (16)));
  /* The set of characters to look for.  Only the first four elements
     are meaningful, which is why the asm statements below pass 4 in
     the "a" register.  */
  static const v16qi search = { '\n', '\r', '?', '\\' };

  uintptr_t si = (uintptr_t)s;
  uintptr_t index;

  /* Check for unaligned input.  */
  if (si & 15)
    {
      if (__builtin_expect (end - s < 16, 0)
          && __builtin_expect ((si & 0xfff) > 0xff0, 0))
        {
          /* There are less than 16 bytes left in the buffer, and less
             than 16 bytes left on the page.  Reading 16 bytes at this
             point might generate a spurious page fault.  Defer to the
             SSE2 implementation, which already handles alignment.  */
          return search_line_sse2 (s, end);
        }

      /* ??? The builtin doesn't understand that the PCMPESTRI read from
         memory need not be aligned.  */
      /* One unaligned probe of the 16 bytes at S.  The result is
         delivered in the "c" register; a value below 16 is treated as
         the offset of a match.  */
      __asm ("%vpcmpestri $0, (%1), %2"
             : "=c"(index) : "r"(s), "x"(search), "a"(4), "d"(16));
      if (__builtin_expect (index < 16, 0))
        goto found;

      /* Advance the pointer to an aligned address.  We will re-scan a
         few bytes, but we no longer need care for reading past the
         end of a page, since we're guaranteed a match.  */
      s = (const uchar *)((si + 16) & -16);
    }

  /* Main loop, processing 16 bytes at a time.  By doing the whole loop
     in inline assembly, we can make proper use of the flags set.
     S is decremented by 16 first because the loop body starts by
     adding 16; the loop repeats while the carry flag is clear.
     NOTE(review): this presumes the instruction sets carry exactly
     when a match is found -- confirm against the ISA reference.  */
  __asm (      "sub $16, %1\n"
        "       .balign 16\n"
        "0:     add $16, %1\n"
        "       %vpcmpestri $0, (%1), %2\n"
        "       jnc 0b"
        : "=&c"(index), "+r"(s)
        : "x"(search), "a"(4), "d"(16));

 found:
  /* INDEX is the offset of the match within the block at S.  */
  return s + index;
}

#else
/* Work around out-dated assemblers without sse4 support.  */
#define search_line_sse42 search_line_sse2
#endif
 471
 472 /* Check the CPU capabilities.  */
 473
 474 #include "../gcc/config/i386/cpuid.h"
 475
 476 typedef const uchar * (*search_line_fast_type) (const uchar *, const uchar *);
 477 static search_line_fast_type search_line_fast;
 478
 479 #define HAVE_init_vectorized_lexer 1
 480 static inline void
 481 init_vectorized_lexer (void)
 482 {
 483   unsigned dummy, ecx = 0, edx = 0;
 484   search_line_fast_type impl = search_line_acc_char;
 485   int minimum = 0;
 486
 487 #if defined(__SSE4_2__)
 488   minimum = 3;
 489 #elif defined(__SSE2__)
 490   minimum = 2;
 491 #elif defined(__SSE__)
 492   minimum = 1;
 493 #endif
 494
 495   if (minimum == 3)
 496     impl = search_line_sse42;
 497   else if (__get_cpuid (1, &dummy, &dummy, &ecx, &edx) || minimum == 2)
 498     {
 499       if (minimum == 3 || (ecx & bit_SSE4_2))
 500         impl = search_line_sse42;
 501       else if (minimum == 2 || (edx & bit_SSE2))
 502         impl = search_line_sse2;
 503       else if (minimum == 1 || (edx & bit_SSE))
 504         impl = search_line_mmx;
 505     }
 506   else if (__get_cpuid (0x80000001, &dummy, &dummy, &dummy, &edx))
 507     {
 508       if (minimum == 1
 509           || (edx & (bit_MMXEXT | bit_CMOV)) == (bit_MMXEXT | bit_CMOV))
 510         impl = search_line_mmx;
 511     }
 512
 513   search_line_fast = impl;
 514 }
 515
 516 #elif (GCC_VERSION >= 4005) && defined(__ALTIVEC__)
 517
/* A version of the fast scanner using AltiVec vectorized byte compares.

   S is where scanning starts; END is unused.  Returns a pointer to the
   first \n, \r, \\ or ? at or after S.  The loop has no end-of-buffer
   test; it only stops when such a character is found.  */
/* ??? Unfortunately, attribute(target("altivec")) is not yet supported,
   so we can't compile this function without -maltivec on the command line
   (or implied by some other switch).  */

static const uchar *
search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
{
  typedef __attribute__((altivec(vector))) unsigned char vc;

  const vc repl_nl = {
    '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
    '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
  };
  const vc repl_cr = {
    '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
    '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
  };
  const vc repl_bs = {
    '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
    '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
  };
  const vc repl_qm = {
    '?', '?', '?', '?', '?', '?', '?', '?',
    '?', '?', '?', '?', '?', '?', '?', '?',
  };
  const vc ones = {
    -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1,
  };
  const vc zero = { 0 };

  vc data, mask, t;

  /* Altivec loads automatically mask addresses with -16.  This lets us
     issue the first load as early as possible.  */
  data = __builtin_vec_ld(0, (const vc *)s);

  /* Discard bytes before the beginning of the buffer.  Do this by
     beginning with all ones and shifting in zeros according to the
     mis-alignment.  The LVSR instruction pulls the exact shift we
     want from the address.  */
  mask = __builtin_vec_lvsr(0, s);
  mask = __builtin_vec_perm(zero, ones, mask);
  data &= mask;

  /* While altivec loads mask addresses, we still need to align S so
     that the offset we compute at the end is correct.  */
  s = (const uchar *)((uintptr_t)s & -16);

  /* Main loop processing 16 bytes at a time.  The first block has
     already been loaded and masked, so jump past the load.  */
  goto start;
  do
    {
      vc m_nl, m_cr, m_bs, m_qm;

      s += 16;
      data = __builtin_vec_ld(0, (const vc *)s);

    start:
      m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
      m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
      m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
      m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
      t = (m_nl | m_cr) | (m_bs | m_qm);

      /* T now contains 0xff in bytes for which we matched one of the relevant
         characters.  We want to exit the loop if any byte in T is non-zero.
         Below is the expansion of vec_any_ne(t, zero).  */
    }
  while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));

  {
    /* Number of unsigned longs that make up one vector.  */
#define N  (sizeof(vc) / sizeof(long))

    /* Compile-time check: only 2 or 4 longs per vector are handled.  */
    typedef char check_count[(N == 2 || N == 4) * 2 - 1];
    /* View the match vector as an array of longs.  */
    union {
      vc v;
      unsigned long l[N];
    } u;
    unsigned long l, i = 0;

    u.v = t;

    /* Find the first word of T that is non-zero.  S is advanced past
       every all-zero word that is skipped.  For N == 4 the first two
       words are examined in case 4, which then deliberately falls
       through to case 2 for the remaining two.  The last word is not
       tested: the loop above guarantees that some word is non-zero.  */
    switch (N)
      {
      case 4:
        l = u.l[i++];
        if (l != 0)
          break;
        s += sizeof(unsigned long);
        l = u.l[i++];
        if (l != 0)
          break;
        s += sizeof(unsigned long);
      case 2:
        l = u.l[i++];
        if (l != 0)
          break;
        s += sizeof(unsigned long);
        l = u.l[i];
      }

    /* L now contains 0xff in bytes for which we matched one of the
       relevant characters.  We can find the byte index by finding
       its bit index and dividing by 8.  NOTE(review): counting from
       the most significant bit presumes that the first byte in memory
       is the most significant one (big-endian) -- confirm.  */
    l = __builtin_clzl(l) >> 3;
    return s + l;

#undef N
  }
}
 631
 632 #elif defined (__ARM_NEON__)
 633 #include "arm_neon.h"
 634
/* A version of the fast scanner using ARM NEON vectorized byte
   compares.

   S is where scanning starts; END is unused.  Returns a pointer to the
   first \n, \r, \\ or ? at or after S.  The loop has no end-of-buffer
   test; it only stops when such a character is found.  */
static const uchar *
search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
{
  const uint8x16_t repl_nl = vdupq_n_u8 ('\n');
  const uint8x16_t repl_cr = vdupq_n_u8 ('\r');
  const uint8x16_t repl_bs = vdupq_n_u8 ('\\');
  const uint8x16_t repl_qm = vdupq_n_u8 ('?');
  /* Gives byte lane I of each 8-byte half the single bit 1 << I, so
     that the compare results can be packed into a bit mask by
     addition.  */
  const uint8x16_t xmask = (uint8x16_t) vdupq_n_u64 (0x8040201008040201ULL);

  unsigned int misalign, found, mask;
  const uint8_t *p;
  uint8x16_t data;

  /* Align the source pointer.  */
  misalign = (uintptr_t)s & 15;
  p = (const uint8_t *)((uintptr_t)s & -16);
  data = vld1q_u8 (p);

  /* Create a mask for the bytes that are valid within the first
     16-byte block, i.e. clear the bits of the bytes that precede S.
     The Idea here is that the AND with the mask
     within the loop is "free", since we need some AND or TEST
     insn in order to set the flags for the branch anyway.  */
  mask = (-1u << misalign) & 0xffff;

  /* Main loop, processing 16 bytes at a time.  The first block has
     already been loaded, so jump past the load at the loop head.  */
  goto start;

  do
    {
      uint8x8_t l;
      uint16x4_t m;
      uint32x2_t n;
      uint8x16_t t, u, v, w;

      p += 16;
      data = vld1q_u8 (p);
      /* Every byte of the blocks after the first one is valid.  */
      mask = 0xffff;

    start:
      /* Byte-wise equality against the four characters, ORed
         together, then reduced to one distinct bit per lane.  */
      t = vceqq_u8 (data, repl_nl);
      u = vceqq_u8 (data, repl_cr);
      v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
      w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
      t = vandq_u8 (vorrq_u8 (v, w), xmask);
      /* Three rounds of pairwise addition sum the lane bits of each
         8-byte half into one 32-bit element: N[0] for the low half,
         N[1] for the high half.  */
      l = vpadd_u8 (vget_low_u8 (t), vget_high_u8 (t));
      m = vpaddl_u8 (l);
      n = vpaddl_u16 (m);

      /* Viewing N as one 64-bit value, the shift by 24 moves the
         high-half mask to bits 8..15 of the low element; the OR then
         yields a 16-bit mask with bit I standing for byte I.  */
      found = vget_lane_u32 ((uint32x2_t) vorr_u64 ((uint64x1_t) n,
              vshr_n_u64 ((uint64x1_t) n, 24)), 0);
      found &= mask;
    }
  while (!found);

  /* FOUND contains 1 in bits for which we matched a relevant
     character.  Conversion to the byte index is trivial.  */
  found = __builtin_ctz (found);
  return (const uchar *)p + found;
}
 694
 695 #else
 696
/* We only have one accelerated alternative.  Use a direct call so that
   we encourage inlining.  */

#define search_line_fast  search_line_acc_char
 701
 702 #endif
 703
 704 /* Initialize the lexer if needed.  */
 705
 706 void
 707 _cpp_init_lexer (void)
 708 {
 709 #ifdef HAVE_init_vectorized_lexer
 710   init_vectorized_lexer ();
 711 #endif
 712 }
 713
/* Returns with a logical line that contains no escaped newlines or
   trigraphs.  This is a time-critical inner loop.

   The line to clean starts at PFILE->buffer->next_line.  On return
   the buffer's cur and line_base point at the start of that line, the
   line notes have been reset and refilled (one note per trigraph seen
   and per escaped newline removed, followed by a sentinel note), the
   cleaned line is terminated by a '\n' stored into the buffer, and
   next_line points just past the line ending that was consumed.
   Trigraphs are only replaced when the trigraphs option is on, and
   buffers marked from_stage3 get neither treatment.  */
void
_cpp_clean_line (cpp_reader *pfile)
{
  cpp_buffer *buffer;
  /* S is the read position; D, once writing has started, is the write
     position, which lags behind S.  */
  const uchar *s;
  uchar c, *d, *p;

  buffer = pfile->buffer;
  buffer->cur_note = buffer->notes_used = 0;
  buffer->cur = buffer->line_base = buffer->next_line;
  buffer->need_line = false;
  s = buffer->next_line;

  if (!buffer->from_stage3)
    {
      /* Position of the most recent backslash seen on the fast path.  */
      const uchar *pbackslash = NULL;

      /* Fast path.  This is the common case of an un-escaped line with
         no trigraphs.  The primary win here is by not writing any
         data back to memory until we have to.  */
      while (1)
        {
          /* Perform an optimized search for \n, \r, \\, ?.  */
          s = search_line_fast (s, buffer->rlimit);

          c = *s;
          if (c == '\\')
            {
              /* Record the location of the backslash and continue.  */
              pbackslash = s++;
            }
          else if (__builtin_expect (c == '?', 0))
            {
              if (__builtin_expect (s[1] == '?', false)
                   && _cpp_trigraph_map[s[2]])
                {
                  /* Have a trigraph.  We may or may not have to convert
                     it.  Add a line note regardless, for -Wtrigraphs.
                     The note's type is the third character of the
                     trigraph.  */
                  add_line_note (buffer, s, s[2]);
                  if (CPP_OPTION (pfile, trigraphs))
                    {
                      /* We do, and that means we have to switch to the
                         slow path.  Store the replacement over the
                         first '?' and leave S on the trigraph's last
                         character.  */
                      d = (uchar *) s;
                      *d = _cpp_trigraph_map[s[2]];
                      s += 2;
                      goto slow_path;
                    }
                }
              /* Not a trigraph, or trigraph conversion is disabled.
                 Continue on fast-path.  */
              s++;
            }
          else
            break;
        }

      /* This must be \r or \n.  We're either done, or we'll be forced
         to write back to the buffer and continue on the slow path.  */
      d = (uchar *) s;

      if (__builtin_expect (s == buffer->rlimit, false))
        goto done;

      /* DOS line ending? */
      if (__builtin_expect (c == '\r', false) && s[1] == '\n')
        {
          s++;
          if (s == buffer->rlimit)
            goto done;
        }

      /* No backslash on the line, so the newline cannot be escaped.  */
      if (__builtin_expect (pbackslash == NULL, true))
        goto done;

      /* Check for escaped newline.  Step back over whitespace that
         precedes the line ending; the newline is escaped only if the
         character before that is the last backslash we saw.  */
      p = d;
      while (is_nvspace (p[-1]))
        p--;
      if (p - 1 != pbackslash)
        goto done;

      /* Have an escaped newline; process it and proceed to
         the slow path.  The note's type records whether whitespace
         separated the backslash from the newline (' ') or not
         ('\\').  D is set so that the next character written lands on
         the backslash.  next_line temporarily marks that position: it
         bounds the backward scan in the slow path below, and is given
         its final value at `done'.  */
      add_line_note (buffer, p - 1, p != d ? ' ' : '\\');
      d = p - 2;
      buffer->next_line = p - 1;

    slow_path:
      /* Slow path: copy the rest of the logical line down over the
         removed characters, one character at a time.  */
      while (1)
        {
          c = *++s;
          *++d = c;

          if (c == '\n' || c == '\r')
            {
              /* Handle DOS line endings.  */
              if (c == '\r' && s != buffer->rlimit && s[1] == '\n')
                s++;
              if (s == buffer->rlimit)
                break;

              /* Escaped?  Scan back over whitespace in the output,
                 but no further than the previous splice point.  */
              p = d;
              while (p != buffer->next_line && is_nvspace (p[-1]))
                p--;
              if (p == buffer->next_line || p[-1] != '\\')
                break;

              add_line_note (buffer, p - 1, p != d ? ' ': '\\');
              d = p - 2;
              buffer->next_line = p - 1;
            }
          else if (c == '?' && s[1] == '?' && _cpp_trigraph_map[s[2]])
            {
              /* Add a note regardless, for the benefit of -Wtrigraphs.  */
              add_line_note (buffer, d, s[2]);
              if (CPP_OPTION (pfile, trigraphs))
                {
                  /* Overwrite the '?' just copied with the
                     replacement and skip the rest of the trigraph.  */
                  *d = _cpp_trigraph_map[s[2]];
                  s += 2;
                }
            }
        }
    }
  else
    {
      /* from_stage3 buffer: only locate the end of the line.  */
      while (*s != '\n' && *s != '\r')
        s++;
      d = (uchar *) s;

      /* Handle DOS line endings.  */
      if (*s == '\r' && s != buffer->rlimit && s[1] == '\n')
        s++;
    }

 done:
  /* Terminate the cleaned line; this also turns a '\r' into '\n'.  */
  *d = '\n';
  /* A sentinel note that should never be processed.  */
  add_line_note (buffer, d + 1, '\n');
  buffer->next_line = s + 1;
}
 857
 858 /* Return true if the trigraph indicated by NOTE should be warned
 859    about in a comment.  */
 860 static bool
 861 warn_in_comment (cpp_reader *pfile, _cpp_line_note *note)
 862 {
 863   const uchar *p;
 864
 865   /* Within comments we don't warn about trigraphs, unless the
 866      trigraph forms an escaped newline, as that may change
 867      behavior.  */
 868   if (note->type != '/')
 869     return false;
 870
 871   /* If -trigraphs, then this was an escaped newline iff the next note
 872      is coincident.  */
 873   if (CPP_OPTION (pfile, trigraphs))
 874     return note[1].pos == note->pos;
 875
 876   /* Otherwise, see if this forms an escaped newline.  */
 877   p = note->pos + 3;
 878   while (is_nvspace (*p))
 879     p++;
 880
 881   /* There might have been escaped newlines between the trigraph and the
 882      newline we found.  Hence the position test.  */
 883   return (*p == '\n' && p < note[1].pos);
 884 }
 885
 886 /* Process the notes created by add_line_note as far as the current
 887    location.  */
 888 void
 889 _cpp_process_line_notes (cpp_reader *pfile, int in_comment)
 890 {
 891   cpp_buffer *buffer = pfile->buffer;
 892
 893   for (;;)
 894     {
 895       _cpp_line_note *note = &buffer->notes[buffer->cur_note];
 896       unsigned int col;
 897
 898       if (note->pos > buffer->cur)
 899         break;
 900
 901       buffer->cur_note++;
 902       col = CPP_BUF_COLUMN (buffer, note->pos + 1);
 903
 904       if (note->type == '\\' || note->type == ' ')
 905         {
 906           if (note->type == ' ' && !in_comment)
 907             cpp_error_with_line (pfile, CPP_DL_WARNING, pfile->line_table->highest_line, col,
 908                                  "backslash and newline separated by space");
 909
 910           if (buffer->next_line > buffer->rlimit)
 911             {
 912               cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line, col,
 913                                    "backslash-newline at end of file");
 914               /* Prevent "no newline at end of file" warning.  */
 915               buffer->next_line = buffer->rlimit;
 916             }
 917
 918           buffer->line_base = note->pos;
 919           CPP_INCREMENT_LINE (pfile, 0);
 920         }
 921       else if (_cpp_trigraph_map[note->type])
 922         {
 923           if (CPP_OPTION (pfile, warn_trigraphs)
 924               && (!in_comment || warn_in_comment (pfile, note)))
 925             {
 926               if (CPP_OPTION (pfile, trigraphs))
 927                 cpp_warning_with_line (pfile, CPP_W_TRIGRAPHS,
 928                                        pfile->line_table->highest_line, col,
 929                                        "trigraph ??%c converted to %c",
 930                                        note->type,
 931                                        (int) _cpp_trigraph_map[note->type]);
 932               else
 933                 {
 934                   cpp_warning_with_line
 935                     (pfile, CPP_W_TRIGRAPHS,
 936                      pfile->line_table->highest_line, col,
 937                      "trigraph ??%c ignored, use -trigraphs to enable",
 938                      note->type);
 939                 }
 940             }
 941         }
 942       else if (note->type == 0)
 943         /* Already processed in lex_raw_string.  */;
 944       else
 945         abort ();
 946     }
 947 }
 948
 949 /* Skip a C-style block comment.  We find the end of the comment by
 950    seeing if an asterisk is before every '/' we encounter.  Returns
 951    nonzero if comment terminated by EOF, zero otherwise.
 952
 953    Buffer->cur points to the initial asterisk of the comment.  */
 954 bool
 955 _cpp_skip_block_comment (cpp_reader *pfile)
 956 {
 957   cpp_buffer *buffer = pfile->buffer;
 958   const uchar *cur = buffer->cur;
 959   uchar c;
 960
 961   cur++;
 962   if (*cur == '/')
 963     cur++;
 964
 965   for (;;)
 966     {
 967       /* People like decorating comments with '*', so check for '/'
 968          instead for efficiency.  */
 969       c = *cur++;
 970
 971       if (c == '/')
 972         {
 973           if (cur[-2] == '*')
 974             break;
 975
 976           /* Warn about potential nested comments, but not if the '/'
 977              comes immediately before the true comment delimiter.
 978              Don't bother to get it right across escaped newlines.  */
 979           if (CPP_OPTION (pfile, warn_comments)
 980               && cur[0] == '*' && cur[1] != '/')
 981             {
 982               buffer->cur = cur;
 983               cpp_warning_with_line (pfile, CPP_W_COMMENTS,
 984                                      pfile->line_table->highest_line,
 985                                      CPP_BUF_COL (buffer),
 986                                      "\"/*\" within comment");
 987             }
 988         }
 989       else if (c == '\n')
 990         {
 991           unsigned int cols;
 992           buffer->cur = cur - 1;
 993           _cpp_process_line_notes (pfile, true);
 994           if (buffer->next_line >= buffer->rlimit)
 995             return true;
 996           _cpp_clean_line (pfile);
 997
 998           cols = buffer->next_line - buffer->line_base;
 999           CPP_INCREMENT_LINE (pfile, cols);
1000
1001           cur = buffer->cur;
1002         }
1003     }
1004
1005   buffer->cur = cur;
1006   _cpp_process_line_notes (pfile, true);
1007   return false;
1008 }
1009
1010 /* Skip a C++ line comment, leaving buffer->cur pointing to the
1011    terminating newline.  Handles escaped newlines.  Returns nonzero
1012    if a multiline comment.  */
1013 static int
1014 skip_line_comment (cpp_reader *pfile)
1015 {
1016   cpp_buffer *buffer = pfile->buffer;
1017   source_location orig_line = pfile->line_table->highest_line;
1018
1019   while (*buffer->cur != '\n')
1020     buffer->cur++;
1021
1022   _cpp_process_line_notes (pfile, true);
1023   return orig_line != pfile->line_table->highest_line;
1024 }
1025
1026 /* Skips whitespace, saving the next non-whitespace character.  */
1027 static void
1028 skip_whitespace (cpp_reader *pfile, cppchar_t c)
1029 {
1030   cpp_buffer *buffer = pfile->buffer;
1031   bool saw_NUL = false;
1032
1033   do
1034     {
1035       /* Horizontal space always OK.  */
1036       if (c == ' ' || c == '\t')
1037         ;
1038       /* Just \f \v or \0 left.  */
1039       else if (c == '\0')
1040         saw_NUL = true;
1041       else if (pfile->state.in_directive && CPP_PEDANTIC (pfile))
1042         cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line,
1043                              CPP_BUF_COL (buffer),
1044                              "%s in preprocessing directive",
1045                              c == '\f' ? "form feed" : "vertical tab");
1046
1047       c = *buffer->cur++;
1048     }
1049   /* We only want non-vertical space, i.e. ' ' \t \f \v \0.  */
1050   while (is_nvspace (c));
1051
1052   if (saw_NUL)
1053     cpp_error (pfile, CPP_DL_WARNING, "null character(s) ignored");
1054
1055   buffer->cur--;
1056 }
1057
1058 /* See if the characters of a number token are valid in a name (no
1059    '.', '+' or '-').  */
1060 static int
1061 name_p (cpp_reader *pfile, const cpp_string *string)
1062 {
1063   unsigned int i;
1064
1065   for (i = 0; i < string->len; i++)
1066     if (!is_idchar (string->text[i]))
1067       return 0;
1068
1069   return 1;
1070 }
1071
1072 /* After parsing an identifier or other sequence, produce a warning about
1073    sequences not in NFC/NFKC.  */
1074 static void
1075 warn_about_normalization (cpp_reader *pfile,
1076                           const cpp_token *token,
1077                           const struct normalize_state *s)
1078 {
1079   if (CPP_OPTION (pfile, warn_normalize) < NORMALIZE_STATE_RESULT (s)
1080       && !pfile->state.skipping)
1081     {
1082       /* Make sure that the token is printed using UCNs, even
1083          if we'd otherwise happily print UTF-8.  */
1084       unsigned char *buf = XNEWVEC (unsigned char, cpp_token_len (token));
1085       size_t sz;
1086
1087       sz = cpp_spell_token (pfile, token, buf, false) - buf;
1088       if (NORMALIZE_STATE_RESULT (s) == normalized_C)
1089         cpp_warning_with_line (pfile, CPP_W_NORMALIZE, token->src_loc, 0,
1090                                "`%.*s' is not in NFKC", (int) sz, buf);
1091       else
1092         cpp_warning_with_line (pfile, CPP_W_NORMALIZE, token->src_loc, 0,
1093                                "`%.*s' is not in NFC", (int) sz, buf);
1094     }
1095 }
1096
1097 /* Returns TRUE if the sequence starting at buffer->cur is invalid in
1098    an identifier.  FIRST is TRUE if this starts an identifier.  */
1099 static bool
1100 forms_identifier_p (cpp_reader *pfile, int first,
1101                     struct normalize_state *state)
1102 {
1103   cpp_buffer *buffer = pfile->buffer;
1104
1105   if (*buffer->cur == '$')
1106     {
1107       if (!CPP_OPTION (pfile, dollars_in_ident))
1108         return false;
1109
1110       buffer->cur++;
1111       if (CPP_OPTION (pfile, warn_dollars) && !pfile->state.skipping)
1112         {
1113           CPP_OPTION (pfile, warn_dollars) = 0;
1114           cpp_error (pfile, CPP_DL_PEDWARN, "'$' in identifier or number");
1115         }
1116
1117       return true;
1118     }
1119
1120   /* Is this a syntactically valid UCN?  */
1121   if (CPP_OPTION (pfile, extended_identifiers)
1122       && *buffer->cur == '\\'
1123       && (buffer->cur[1] == 'u' || buffer->cur[1] == 'U'))
1124     {
1125       buffer->cur += 2;
1126       if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
1127                           state))
1128         return true;
1129       buffer->cur -= 2;
1130     }
1131
1132   return false;
1133 }
1134
1135 /* Helper function to get the cpp_hashnode of the identifier BASE.  */
1136 static cpp_hashnode *
1137 lex_identifier_intern (cpp_reader *pfile, const uchar *base)
1138 {
1139   cpp_hashnode *result;
1140   const uchar *cur;
1141   unsigned int len;
1142   unsigned int hash = HT_HASHSTEP (0, *base);
1143
1144   cur = base + 1;
1145   while (ISIDNUM (*cur))
1146     {
1147       hash = HT_HASHSTEP (hash, *cur);
1148       cur++;
1149     }
1150   len = cur - base;
1151   hash = HT_HASHFINISH (hash, len);
1152   result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
1153                                               base, len, hash, HT_ALLOC));
1154
1155   /* Rarely, identifiers require diagnostics when lexed.  */
1156   if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
1157                         && !pfile->state.skipping, 0))
1158     {
1159       /* It is allowed to poison the same identifier twice.  */
1160       if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
1161         cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
1162                    NODE_NAME (result));
1163
1164       /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
1165          replacement list of a variadic macro.  */
1166       if (result == pfile->spec_nodes.n__VA_ARGS__
1167           && !pfile->state.va_args_ok)
1168         cpp_error (pfile, CPP_DL_PEDWARN,
1169                    "__VA_ARGS__ can only appear in the expansion"
1170                    " of a C99 variadic macro");
1171
1172       /* For -Wc++-compat, warn about use of C++ named operators.  */
1173       if (result->flags & NODE_WARN_OPERATOR)
1174         cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
1175                      "identifier \"%s\" is a special operator name in C++",
1176                      NODE_NAME (result));
1177     }
1178
1179   return result;
1180 }
1181
1182 /* Get the cpp_hashnode of an identifier specified by NAME in
1183    the current cpp_reader object.  If none is found, NULL is returned.  */
1184 cpp_hashnode *
1185 _cpp_lex_identifier (cpp_reader *pfile, const char *name)
1186 {
1187   cpp_hashnode *result;
1188   result = lex_identifier_intern (pfile, (uchar *) name);
1189   return result;
1190 }
1191
1192 /* Lex an identifier starting at BUFFER->CUR - 1.  */
1193 static cpp_hashnode *
1194 lex_identifier (cpp_reader *pfile, const uchar *base, bool starts_ucn,
1195                 struct normalize_state *nst)
1196 {
1197   cpp_hashnode *result;
1198   const uchar *cur;
1199   unsigned int len;
1200   unsigned int hash = HT_HASHSTEP (0, *base);
1201
1202   cur = pfile->buffer->cur;
1203   if (! starts_ucn)
1204     while (ISIDNUM (*cur))
1205       {
1206         hash = HT_HASHSTEP (hash, *cur);
1207         cur++;
1208       }
1209   pfile->buffer->cur = cur;
1210   if (starts_ucn || forms_identifier_p (pfile, false, nst))
1211     {
1212       /* Slower version for identifiers containing UCNs (or $).  */
1213       do {
1214         while (ISIDNUM (*pfile->buffer->cur))
1215           {
1216             pfile->buffer->cur++;
1217             NORMALIZE_STATE_UPDATE_IDNUM (nst);
1218           }
1219       } while (forms_identifier_p (pfile, false, nst));
1220       result = _cpp_interpret_identifier (pfile, base,
1221                                           pfile->buffer->cur - base);
1222     }
1223   else
1224     {
1225       len = cur - base;
1226       hash = HT_HASHFINISH (hash, len);
1227
1228       result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
1229                                                   base, len, hash, HT_ALLOC));
1230     }
1231
1232   /* Rarely, identifiers require diagnostics when lexed.  */
1233   if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
1234                         && !pfile->state.skipping, 0))
1235     {
1236       /* It is allowed to poison the same identifier twice.  */
1237       if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
1238         cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
1239                    NODE_NAME (result));
1240
1241       /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
1242          replacement list of a variadic macro.  */
1243       if (result == pfile->spec_nodes.n__VA_ARGS__
1244           && !pfile->state.va_args_ok)
1245         cpp_error (pfile, CPP_DL_PEDWARN,
1246                    "__VA_ARGS__ can only appear in the expansion"
1247                    " of a C99 variadic macro");
1248
1249       /* For -Wc++-compat, warn about use of C++ named operators.  */
1250       if (result->flags & NODE_WARN_OPERATOR)
1251         cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
1252                      "identifier \"%s\" is a special operator name in C++",
1253                      NODE_NAME (result));
1254     }
1255
1256   return result;
1257 }
1258
1259 /* Lex a number to NUMBER starting at BUFFER->CUR - 1.  */
1260 static void
1261 lex_number (cpp_reader *pfile, cpp_string *number,
1262             struct normalize_state *nst)
1263 {
1264   const uchar *cur;
1265   const uchar *base;
1266   uchar *dest;
1267
1268   base = pfile->buffer->cur - 1;
1269   do
1270     {
1271       cur = pfile->buffer->cur;
1272
1273       /* N.B. ISIDNUM does not include $.  */
1274       while (ISIDNUM (*cur) || *cur == '.' || VALID_SIGN (*cur, cur[-1]))
1275         {
1276           cur++;
1277           NORMALIZE_STATE_UPDATE_IDNUM (nst);
1278         }
1279
1280       pfile->buffer->cur = cur;
1281     }
1282   while (forms_identifier_p (pfile, false, nst));
1283
1284   number->len = cur - base;
1285   dest = _cpp_unaligned_alloc (pfile, number->len + 1);
1286   memcpy (dest, base, number->len);
1287   dest[number->len] = '\0';
1288   number->text = dest;
1289 }
1290
1291 /* Create a token of type TYPE with a literal spelling.  */
1292 static void
1293 create_literal (cpp_reader *pfile, cpp_token *token, const uchar *base,
1294                 unsigned int len, enum cpp_ttype type)
1295 {
1296   uchar *dest = _cpp_unaligned_alloc (pfile, len + 1);
1297
1298   memcpy (dest, base, len);
1299   dest[len] = '\0';
1300   token->type = type;
1301   token->val.str.len = len;
1302   token->val.str.text = dest;
1303 }
1304
1305 /* Subroutine of lex_raw_string: Append LEN chars from BASE to the buffer
1306    sequence from *FIRST_BUFF_P to LAST_BUFF_P.  */
1307
1308 static void
1309 bufring_append (cpp_reader *pfile, const uchar *base, size_t len,
1310                 _cpp_buff **first_buff_p, _cpp_buff **last_buff_p)
1311 {
1312   _cpp_buff *first_buff = *first_buff_p;
1313   _cpp_buff *last_buff = *last_buff_p;
1314
1315   if (first_buff == NULL)
1316     first_buff = last_buff = _cpp_get_buff (pfile, len);
1317   else if (len > BUFF_ROOM (last_buff))
1318     {
1319       size_t room = BUFF_ROOM (last_buff);
1320       memcpy (BUFF_FRONT (last_buff), base, room);
1321       BUFF_FRONT (last_buff) += room;
1322       base += room;
1323       len -= room;
1324       last_buff = _cpp_append_extend_buff (pfile, last_buff, len);
1325     }
1326
1327   memcpy (BUFF_FRONT (last_buff), base, len);
1328   BUFF_FRONT (last_buff) += len;
1329
1330   *first_buff_p = first_buff;
1331   *last_buff_p = last_buff;
1332 }
1333
1334 /* Lexes a raw string.  The stored string contains the spelling, including
1335    double quotes, delimiter string, '(' and ')', any leading
1336    'L', 'u', 'U' or 'u8' and 'R' modifier.  It returns the type of the
1337    literal, or CPP_OTHER if it was not properly terminated.
1338
1339    The spelling is NUL-terminated, but it is not guaranteed that this
1340    is the first NUL since embedded NULs are preserved.  */
1341
1342 static void
1343 lex_raw_string (cpp_reader *pfile, cpp_token *token, const uchar *base,
1344                 const uchar *cur)
1345 {
1346   const uchar *raw_prefix;
1347   unsigned int raw_prefix_len = 0;
1348   enum cpp_ttype type;
1349   size_t total_len = 0;
1350   _cpp_buff *first_buff = NULL, *last_buff = NULL;
1351   _cpp_line_note *note = &pfile->buffer->notes[pfile->buffer->cur_note];
1352
1353   type = (*base == 'L' ? CPP_WSTRING :
1354           *base == 'U' ? CPP_STRING32 :
1355           *base == 'u' ? (base[1] == '8' ? CPP_UTF8STRING : CPP_STRING16)
1356           : CPP_STRING);
1357
1358   raw_prefix = cur + 1;
1359   while (raw_prefix_len < 16)
1360     {
1361       switch (raw_prefix[raw_prefix_len])
1362         {
1363         case ' ': case '(': case ')': case '\\': case '\t':
1364         case '\v': case '\f': case '\n': default:
1365           break;
1366         /* Basic source charset except the above chars.  */
1367         case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
1368         case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
1369         case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
1370         case 's': case 't': case 'u': case 'v': case 'w': case 'x':
1371         case 'y': case 'z':
1372         case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
1373         case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
1374         case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
1375         case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
1376         case 'Y': case 'Z':
1377         case '0': case '1': case '2': case '3': case '4': case '5':
1378         case '6': case '7': case '8': case '9':
1379         case '_': case '{': case '}': case '#': case '[': case ']':
1380         case '<': case '>': case '%': case ':': case ';': case '.':
1381         case '?': case '*': case '+': case '-': case '/': case '^':
1382         case '&': case '|': case '~': case '!': case '=': case ',':
1383         case '"': case '\'':
1384           raw_prefix_len++;
1385           continue;
1386         }
1387       break;
1388     }
1389
1390   if (raw_prefix[raw_prefix_len] != '(')
1391     {
1392       int col = CPP_BUF_COLUMN (pfile->buffer, raw_prefix + raw_prefix_len)
1393                 + 1;
1394       if (raw_prefix_len == 16)
1395         cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, col,
1396                              "raw string delimiter longer than 16 characters");
1397       else
1398         cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, col,
1399                              "invalid character '%c' in raw string delimiter",
1400                              (int) raw_prefix[raw_prefix_len]);
1401       pfile->buffer->cur = raw_prefix - 1;
1402       create_literal (pfile, token, base, raw_prefix - 1 - base, CPP_OTHER);
1403       return;
1404     }
1405
1406   cur = raw_prefix + raw_prefix_len + 1;
1407   for (;;)
1408     {
1409 #define BUF_APPEND(STR,LEN)                                     \
1410       do {                                                      \
1411         bufring_append (pfile, (const uchar *)(STR), (LEN),     \
1412                         &first_buff, &last_buff);               \
1413         total_len += (LEN);                                     \
1414       } while (0);
1415
1416       cppchar_t c;
1417
1418       /* If we previously performed any trigraph or line splicing
1419          transformations, undo them within the body of the raw string.  */
1420       while (note->pos < cur)
1421         ++note;
1422       for (; note->pos == cur; ++note)
1423         {
1424           switch (note->type)
1425             {
1426             case '\\':
1427             case ' ':
1428               /* Restore backslash followed by newline.  */
1429               BUF_APPEND (base, cur - base);
1430               base = cur;
1431               BUF_APPEND ("\\", 1);
1432             after_backslash:
1433               if (note->type == ' ')
1434                 {
1435                   /* GNU backslash whitespace newline extension.  FIXME
1436                      could be any sequence of non-vertical space.  When we
1437                      can properly restore any such sequence, we should mark
1438                      this note as handled so _cpp_process_line_notes
1439                      doesn't warn.  */
1440                   BUF_APPEND (" ", 1);
1441                 }
1442
1443               BUF_APPEND ("\n", 1);
1444               break;
1445
1446             case 0:
1447               /* Already handled.  */
1448               break;
1449
1450             default:
1451               if (_cpp_trigraph_map[note->type])
1452                 {
1453                   /* Don't warn about this trigraph in
1454                      _cpp_process_line_notes, since trigraphs show up as
1455                      trigraphs in raw strings.  */
1456                   uchar type = note->type;
1457                   note->type = 0;
1458
1459                   if (!CPP_OPTION (pfile, trigraphs))
1460                     /* If we didn't convert the trigraph in the first
1461                        place, don't do anything now either.  */
1462                     break;
1463
1464                   BUF_APPEND (base, cur - base);
1465                   base = cur;
1466                   BUF_APPEND ("??", 2);
1467
1468                   /* ??/ followed by newline gets two line notes, one for
1469                      the trigraph and one for the backslash/newline.  */
1470                   if (type == '/' && note[1].pos == cur)
1471                     {
1472                       if (note[1].type != '\\'
1473                           && note[1].type != ' ')
1474                         abort ();
1475                       BUF_APPEND ("/", 1);
1476                       ++note;
1477                       goto after_backslash;
1478                     }
1479                   /* The ) from ??) could be part of the suffix.  */
1480                   else if (type == ')'
1481                            && strncmp ((const char *) cur+1,
1482                                        (const char *) raw_prefix,
1483                                        raw_prefix_len) == 0
1484                            && cur[raw_prefix_len+1] == '"')
1485                     {
1486                       BUF_APPEND (")", 1);
1487                       base++;
1488                       cur += raw_prefix_len + 2;
1489                       goto break_outer_loop;
1490                     }
1491                   else
1492                     {
1493                       /* Skip the replacement character.  */
1494                       base = ++cur;
1495                       BUF_APPEND (&type, 1);
1496                     }
1497                 }
1498               else
1499                 abort ();
1500               break;
1501             }
1502         }
1503       c = *cur++;
1504
1505       if (c == ')'
1506           && strncmp ((const char *) cur, (const char *) raw_prefix,
1507                       raw_prefix_len) == 0
1508           && cur[raw_prefix_len] == '"')
1509         {
1510           cur += raw_prefix_len + 1;
1511           break;
1512         }
1513       else if (c == '\n')
1514         {
1515           if (pfile->state.in_directive
1516               || pfile->state.parsing_args
1517               || pfile->state.in_deferred_pragma)
1518             {
1519               cur--;
1520               type = CPP_OTHER;
1521               cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, 0,
1522                                    "unterminated raw string");
1523               break;
1524             }
1525
1526           BUF_APPEND (base, cur - base);
1527
1528           if (pfile->buffer->cur < pfile->buffer->rlimit)
1529             CPP_INCREMENT_LINE (pfile, 0);
1530           pfile->buffer->need_line = true;
1531
1532           pfile->buffer->cur = cur-1;
1533           _cpp_process_line_notes (pfile, false);
1534           if (!_cpp_get_fresh_line (pfile))
1535             {
1536               source_location src_loc = token->src_loc;
1537               token->type = CPP_EOF;
1538               /* Tell the compiler the line number of the EOF token.  */
1539               token->src_loc = pfile->line_table->highest_line;
1540               token->flags = BOL;
1541               if (first_buff != NULL)
1542                 _cpp_release_buff (pfile, first_buff);
1543               cpp_error_with_line (pfile, CPP_DL_ERROR, src_loc, 0,
1544                                    "unterminated raw string");
1545               return;
1546             }
1547
1548           cur = base = pfile->buffer->cur;
1549           note = &pfile->buffer->notes[pfile->buffer->cur_note];
1550         }
1551     }
1552  break_outer_loop:
1553
1554   if (CPP_OPTION (pfile, user_literals))
1555     {
1556       /* Grab user defined literal suffix.  */
1557       if (ISIDST (*cur))
1558         {
1559           type = cpp_userdef_string_add_type (type);
1560           ++cur;
1561         }
1562       while (ISIDNUM (*cur))
1563         ++cur;
1564     }
1565
1566   pfile->buffer->cur = cur;
1567   if (first_buff == NULL)
1568     create_literal (pfile, token, base, cur - base, type);
1569   else
1570     {
1571       uchar *dest = _cpp_unaligned_alloc (pfile, total_len + (cur - base) + 1);
1572
1573       token->type = type;
1574       token->val.str.len = total_len + (cur - base);
1575       token->val.str.text = dest;
1576       last_buff = first_buff;
1577       while (last_buff != NULL)
1578         {
1579           memcpy (dest, last_buff->base,
1580                   BUFF_FRONT (last_buff) - last_buff->base);
1581           dest += BUFF_FRONT (last_buff) - last_buff->base;
1582           last_buff = last_buff->next;
1583         }
1584       _cpp_release_buff (pfile, first_buff);
1585       memcpy (dest, base, cur - base);
1586       dest[cur - base] = '\0';
1587     }
1588 }
1589
1590 /* Lexes a string, character constant, or angle-bracketed header file
1591    name.  The stored string contains the spelling, including opening
1592    quote and any leading 'L', 'u', 'U' or 'u8' and optional
1593    'R' modifier.  It returns the type of the literal, or CPP_OTHER
1594    if it was not properly terminated, or CPP_LESS for an unterminated
1595    header name which must be relexed as normal tokens.
1596
1597    The spelling is NUL-terminated, but it is not guaranteed that this
1598    is the first NUL since embedded NULs are preserved.  */
1599 static void
1600 lex_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
1601 {
1602   bool saw_NUL = false;
1603   const uchar *cur;
1604   cppchar_t terminator;
1605   enum cpp_ttype type;
1606
1607   cur = base;
1608   terminator = *cur++;
1609   if (terminator == 'L' || terminator == 'U')
1610     terminator = *cur++;
1611   else if (terminator == 'u')
1612     {
1613       terminator = *cur++;
1614       if (terminator == '8')
1615         terminator = *cur++;
1616     }
1617   if (terminator == 'R')
1618     {
1619       lex_raw_string (pfile, token, base, cur);
1620       return;
1621     }
1622   if (terminator == '"')
1623     type = (*base == 'L' ? CPP_WSTRING :
1624             *base == 'U' ? CPP_STRING32 :
1625             *base == 'u' ? (base[1] == '8' ? CPP_UTF8STRING : CPP_STRING16)
1626                          : CPP_STRING);
1627   else if (terminator == '\'')
1628     type = (*base == 'L' ? CPP_WCHAR :
1629             *base == 'U' ? CPP_CHAR32 :
1630             *base == 'u' ? CPP_CHAR16 : CPP_CHAR);
1631   else
1632     terminator = '>', type = CPP_HEADER_NAME;
1633
1634   for (;;)
1635     {
1636       cppchar_t c = *cur++;
1637
1638       /* In #include-style directives, terminators are not escapable.  */
1639       if (c == '\\' && !pfile->state.angled_headers && *cur != '\n')
1640         cur++;
1641       else if (c == terminator)
1642         break;
1643       else if (c == '\n')
1644         {
1645           cur--;
1646           /* Unmatched quotes always yield undefined behavior, but
1647              greedy lexing means that what appears to be an unterminated
1648              header name may actually be a legitimate sequence of tokens.  */
1649           if (terminator == '>')
1650             {
1651               token->type = CPP_LESS;
1652               return;
1653             }
1654           type = CPP_OTHER;
1655           break;
1656         }
1657       else if (c == '\0')
1658         saw_NUL = true;
1659     }
1660
1661   if (saw_NUL && !pfile->state.skipping)
1662     cpp_error (pfile, CPP_DL_WARNING,
1663                "null character(s) preserved in literal");
1664
1665   if (type == CPP_OTHER && CPP_OPTION (pfile, lang) != CLK_ASM)
1666     cpp_error (pfile, CPP_DL_PEDWARN, "missing terminating %c character",
1667                (int) terminator);
1668
1669   if (CPP_OPTION (pfile, user_literals))
1670     {
1671       /* Grab user defined literal suffix.  */
1672       if (ISIDST (*cur))
1673         {
1674           type = cpp_userdef_char_add_type (type);
1675           type = cpp_userdef_string_add_type (type);
1676           ++cur;
1677         }
1678       while (ISIDNUM (*cur))
1679         ++cur;
1680     }
1681
1682   pfile->buffer->cur = cur;
1683   create_literal (pfile, token, base, cur - base, type);
1684 }
1685
1686 /* Return the comment table. The client may not make any assumption
1687    about the ordering of the table.  */
1688 cpp_comment_table *
1689 cpp_get_comments (cpp_reader *pfile)
1690 {
1691   return &pfile->comments;
1692 }
1693
1694 /* Append a comment to the end of the comment table. */
1695 static void
1696 store_comment (cpp_reader *pfile, cpp_token *token)
1697 {
1698   int len;
1699
1700   if (pfile->comments.allocated == 0)
1701     {
1702       pfile->comments.allocated = 256;
1703       pfile->comments.entries = (cpp_comment *) xmalloc
1704         (pfile->comments.allocated * sizeof (cpp_comment));
1705     }
1706
1707   if (pfile->comments.count == pfile->comments.allocated)
1708     {
1709       pfile->comments.allocated *= 2;
1710       pfile->comments.entries = (cpp_comment *) xrealloc
1711         (pfile->comments.entries,
1712          pfile->comments.allocated * sizeof (cpp_comment));
1713     }
1714
1715   len = token->val.str.len;
1716
1717   /* Copy comment. Note, token may not be NULL terminated. */
1718   pfile->comments.entries[pfile->comments.count].comment =
1719     (char *) xmalloc (sizeof (char) * (len + 1));
1720   memcpy (pfile->comments.entries[pfile->comments.count].comment,
1721           token->val.str.text, len);
1722   pfile->comments.entries[pfile->comments.count].comment[len] = '\0';
1723
1724   /* Set source location. */
1725   pfile->comments.entries[pfile->comments.count].sloc = token->src_loc;
1726
1727   /* Increment the count of entries in the comment table. */
1728   pfile->comments.count++;
1729 }
1730
1731 /* The stored comment includes the comment start and any terminator.  */
1732 static void
1733 save_comment (cpp_reader *pfile, cpp_token *token, const unsigned char *from,
1734               cppchar_t type)
1735 {
1736   unsigned char *buffer;
1737   unsigned int len, clen, i;
1738
1739   len = pfile->buffer->cur - from + 1; /* + 1 for the initial '/'.  */
1740
1741   /* C++ comments probably (not definitely) have moved past a new
1742      line, which we don't want to save in the comment.  */
1743   if (is_vspace (pfile->buffer->cur[-1]))
1744     len--;
1745
1746   /* If we are currently in a directive or in argument parsing, then
1747      we need to store all C++ comments as C comments internally, and
1748      so we need to allocate a little extra space in that case.
1749
1750      Note that the only time we encounter a directive here is
1751      when we are saving comments in a "#define".  */
1752   clen = ((pfile->state.in_directive || pfile->state.parsing_args)
1753           && type == '/') ? len + 2 : len;
1754
1755   buffer = _cpp_unaligned_alloc (pfile, clen);
1756
1757   token->type = CPP_COMMENT;
1758   token->val.str.len = clen;
1759   token->val.str.text = buffer;
1760
1761   buffer[0] = '/';
1762   memcpy (buffer + 1, from, len - 1);
1763
1764   /* Finish conversion to a C comment, if necessary.  */
1765   if ((pfile->state.in_directive || pfile->state.parsing_args) && type == '/')
1766     {
1767       buffer[1] = '*';
1768       buffer[clen - 2] = '*';
1769       buffer[clen - 1] = '/';
1770       /* As there can be in a C++ comments illegal sequences for C comments
1771          we need to filter them out.  */
1772       for (i = 2; i < (clen - 2); i++)
1773         if (buffer[i] == '/' && (buffer[i - 1] == '*' || buffer[i + 1] == '*'))
1774           buffer[i] = '|';
1775     }
1776
1777   /* Finally store this comment for use by clients of libcpp. */
1778   store_comment (pfile, token);
1779 }
1780
1781 /* Allocate COUNT tokens for RUN.  */
1782 void
1783 _cpp_init_tokenrun (tokenrun *run, unsigned int count)
1784 {
1785   run->base = XNEWVEC (cpp_token, count);
1786   run->limit = run->base + count;
1787   run->next = NULL;
1788 }
1789
1790 /* Returns the next tokenrun, or creates one if there is none.  */
1791 static tokenrun *
1792 next_tokenrun (tokenrun *run)
1793 {
1794   if (run->next == NULL)
1795     {
1796       run->next = XNEW (tokenrun);
1797       run->next->prev = run;
1798       _cpp_init_tokenrun (run->next, 250);
1799     }
1800
1801   return run->next;
1802 }
1803
1804 /* Return the number of not yet processed token in a given
1805    context.  */
1806 int
1807 _cpp_remaining_tokens_num_in_context (cpp_context *context)
1808 {
1809   if (context->tokens_kind == TOKENS_KIND_DIRECT)
1810     return (LAST (context).token - FIRST (context).token);
1811   else if (context->tokens_kind == TOKENS_KIND_INDIRECT
1812            || context->tokens_kind == TOKENS_KIND_EXTENDED)
1813     return (LAST (context).ptoken - FIRST (context).ptoken);
1814   else
1815       abort ();
1816 }
1817
1818 /* Returns the token present at index INDEX in a given context.  If
1819    INDEX is zero, the next token to be processed is returned.  */
1820 static const cpp_token*
1821 _cpp_token_from_context_at (cpp_context *context, int index)
1822 {
1823   if (context->tokens_kind == TOKENS_KIND_DIRECT)
1824     return &(FIRST (context).token[index]);
1825   else if (context->tokens_kind == TOKENS_KIND_INDIRECT
1826            || context->tokens_kind == TOKENS_KIND_EXTENDED)
1827     return FIRST (context).ptoken[index];
1828  else
1829    abort ();
1830 }
1831
1832 /* Look ahead in the input stream.  */
1833 const cpp_token *
1834 cpp_peek_token (cpp_reader *pfile, int index)
1835 {
1836   cpp_context *context = pfile->context;
1837   const cpp_token *peektok;
1838   int count;
1839
1840   /* First, scan through any pending cpp_context objects.  */
1841   while (context->prev)
1842     {
1843       ptrdiff_t sz = _cpp_remaining_tokens_num_in_context (context);
1844
1845       if (index < (int) sz)
1846         return _cpp_token_from_context_at (context, index);
1847       index -= (int) sz;
1848       context = context->prev;
1849     }
1850
1851   /* We will have to read some new tokens after all (and do so
1852      without invalidating preceding tokens).  */
1853   count = index;
1854   pfile->keep_tokens++;
1855
1856   do
1857     {
1858       peektok = _cpp_lex_token (pfile);
1859       if (peektok->type == CPP_EOF)
1860         return peektok;
1861     }
1862   while (index--);
1863
1864   _cpp_backup_tokens_direct (pfile, count + 1);
1865   pfile->keep_tokens--;
1866
1867   return peektok;
1868 }
1869
1870 /* Allocate a single token that is invalidated at the same time as the
1871    rest of the tokens on the line.  Has its line and col set to the
1872    same as the last lexed token, so that diagnostics appear in the
1873    right place.  */
1874 cpp_token *
1875 _cpp_temp_token (cpp_reader *pfile)
1876 {
1877   cpp_token *old, *result;
1878   ptrdiff_t sz = pfile->cur_run->limit - pfile->cur_token;
1879   ptrdiff_t la = (ptrdiff_t) pfile->lookaheads;
1880
1881   old = pfile->cur_token - 1;
1882   /* Any pre-existing lookaheads must not be clobbered.  */
1883   if (la)
1884     {
1885       if (sz <= la)
1886         {
1887           tokenrun *next = next_tokenrun (pfile->cur_run);
1888
1889           if (sz < la)
1890             memmove (next->base + 1, next->base,
1891                      (la - sz) * sizeof (cpp_token));
1892
1893           next->base[0] = pfile->cur_run->limit[-1];
1894         }
1895
1896       if (sz > 1)
1897         memmove (pfile->cur_token + 1, pfile->cur_token,
1898                  MIN (la, sz - 1) * sizeof (cpp_token));
1899     }
1900
1901   if (!sz && pfile->cur_token == pfile->cur_run->limit)
1902     {
1903       pfile->cur_run = next_tokenrun (pfile->cur_run);
1904       pfile->cur_token = pfile->cur_run->base;
1905     }
1906
1907   result = pfile->cur_token++;
1908   result->src_loc = old->src_loc;
1909   return result;
1910 }
1911
1912 /* Lex a token into RESULT (external interface).  Takes care of issues
1913    like directive handling, token lookahead, multiple include
1914    optimization and skipping.  */
1915 const cpp_token *
1916 _cpp_lex_token (cpp_reader *pfile)
1917 {
1918   cpp_token *result;
1919
1920   for (;;)
1921     {
1922       if (pfile->cur_token == pfile->cur_run->limit)
1923         {
1924           pfile->cur_run = next_tokenrun (pfile->cur_run);
1925           pfile->cur_token = pfile->cur_run->base;
1926         }
1927       /* We assume that the current token is somewhere in the current
1928          run.  */
1929       if (pfile->cur_token < pfile->cur_run->base
1930           || pfile->cur_token >= pfile->cur_run->limit)
1931         abort ();
1932
1933       if (pfile->lookaheads)
1934         {
1935           pfile->lookaheads--;
1936           result = pfile->cur_token++;
1937         }
1938       else
1939         result = _cpp_lex_direct (pfile);
1940
1941       if (result->flags & BOL)
1942         {
1943           /* Is this a directive.  If _cpp_handle_directive returns
1944              false, it is an assembler #.  */
1945           if (result->type == CPP_HASH
1946               /* 6.10.3 p 11: Directives in a list of macro arguments
1947                  gives undefined behavior.  This implementation
1948                  handles the directive as normal.  */
1949               && pfile->state.parsing_args != 1)
1950             {
1951               if (_cpp_handle_directive (pfile, result->flags & PREV_WHITE))
1952                 {
1953                   if (pfile->directive_result.type == CPP_PADDING)
1954                     continue;
1955                   result = &pfile->directive_result;
1956                 }
1957             }
1958           else if (pfile->state.in_deferred_pragma)
1959             result = &pfile->directive_result;
1960
1961           if (pfile->cb.line_change && !pfile->state.skipping)
1962             pfile->cb.line_change (pfile, result, pfile->state.parsing_args);
1963         }
1964
1965       /* We don't skip tokens in directives.  */
1966       if (pfile->state.in_directive || pfile->state.in_deferred_pragma)
1967         break;
1968
1969       /* Outside a directive, invalidate controlling macros.  At file
1970          EOF, _cpp_lex_direct takes care of popping the buffer, so we never
1971          get here and MI optimization works.  */
1972       pfile->mi_valid = false;
1973
1974       if (!pfile->state.skipping || result->type == CPP_EOF)
1975         break;
1976     }
1977
1978   return result;
1979 }
1980
1981 /* Returns true if a fresh line has been loaded.  */
1982 bool
1983 _cpp_get_fresh_line (cpp_reader *pfile)
1984 {
1985   int return_at_eof;
1986
1987   /* We can't get a new line until we leave the current directive.  */
1988   if (pfile->state.in_directive)
1989     return false;
1990
1991   for (;;)
1992     {
1993       cpp_buffer *buffer = pfile->buffer;
1994
1995       if (!buffer->need_line)
1996         return true;
1997
1998       if (buffer->next_line < buffer->rlimit)
1999         {
2000           _cpp_clean_line (pfile);
2001           return true;
2002         }
2003
2004       /* First, get out of parsing arguments state.  */
2005       if (pfile->state.parsing_args)
2006         return false;
2007
2008       /* End of buffer.  Non-empty files should end in a newline.  */
2009       if (buffer->buf != buffer->rlimit
2010           && buffer->next_line > buffer->rlimit
2011           && !buffer->from_stage3)
2012         {
2013           /* Clip to buffer size.  */
2014           buffer->next_line = buffer->rlimit;
2015         }
2016
2017       return_at_eof = buffer->return_at_eof;
2018       _cpp_pop_buffer (pfile);
2019       if (pfile->buffer == NULL || return_at_eof)
2020         return false;
2021     }
2022 }
2023
/* If the next character in the buffer is CHAR, consume it and give
   the token the type THEN_TYPE; otherwise leave the buffer alone and
   give the token the type ELSE_TYPE.  Relies on variables named
   `buffer' and `result' being in scope where the macro is used.  */
#define IF_NEXT_IS(CHAR, THEN_TYPE, ELSE_TYPE)          \
  do                                                    \
    {                                                   \
      if (*buffer->cur == CHAR)                         \
        {                                               \
          buffer->cur++;                                \
          result->type = THEN_TYPE;                     \
        }                                               \
      else                                              \
        result->type = ELSE_TYPE;                       \
    }                                                   \
  while (0)
2032
2033 /* Lex a token into pfile->cur_token, which is also incremented, to
2034    get diagnostics pointing to the correct location.
2035
2036    Does not handle issues such as token lookahead, multiple-include
2037    optimization, directives, skipping etc.  This function is only
2038    suitable for use by _cpp_lex_token, and in special cases like
2039    lex_expansion_token which doesn't care for any of these issues.
2040
2041    When meeting a newline, returns CPP_EOF if parsing a directive,
2042    otherwise returns to the start of the token buffer if permissible.
2043    Returns a pointer to the lexed token.  */
2044 cpp_token *
2045 _cpp_lex_direct (cpp_reader *pfile)
2046 {
2047   cppchar_t c;
2048   cpp_buffer *buffer;
2049   const unsigned char *comment_start;
2050   cpp_token *result = pfile->cur_token++;
2051
       /* Control comes back here each time a newline has been consumed,
          so that the next line is fetched before lexing continues.  */
2052  fresh_line:
2053   result->flags = 0;
2054   buffer = pfile->buffer;
2055   if (buffer->need_line)
2056     {
2057       if (pfile->state.in_deferred_pragma)
2058         {
               /* The line of a deferred pragma has ended: emit the
                  end-of-pragma token and leave deferred-pragma mode.  */
2059           result->type = CPP_PRAGMA_EOL;
2060           pfile->state.in_deferred_pragma = false;
2061           if (!pfile->state.pragma_allow_expansion)
2062             pfile->state.prevent_expansion--;
2063           return result;
2064         }
2065       if (!_cpp_get_fresh_line (pfile))
2066         {
2067           result->type = CPP_EOF;
2068           if (!pfile->state.in_directive)
2069             {
2070               /* Tell the compiler the line number of the EOF token.  */
2071               result->src_loc = pfile->line_table->highest_line;
2072               result->flags = BOL;
2073             }
2074           return result;
2075         }
           /* Unless earlier tokens have to be kept, go back to the first
              slot of the base run and lex into that instead.  */
2076       if (!pfile->keep_tokens)
2077         {
2078           pfile->cur_run = &pfile->base_run;
2079           result = pfile->base_run.base;
2080           pfile->cur_token = result + 1;
2081         }
2082       result->flags = BOL;
2083       if (pfile->state.parsing_args == 2)
2084         result->flags |= PREV_WHITE;
2085     }
       /* Fetching a line may have switched buffers, so reload.  */
2086   buffer = pfile->buffer;
       /* Re-entered after a comment that is not being saved.  */
2087  update_tokens_line:
2088   result->src_loc = pfile->line_table->highest_line;
2089
       /* Re-entered after a run of horizontal whitespace.  */
2090  skipped_white:
2091   if (buffer->cur >= buffer->notes[buffer->cur_note].pos
2092       && !pfile->overlaid_buffer)
2093     {
2094       _cpp_process_line_notes (pfile, false);
2095       result->src_loc = pfile->line_table->highest_line;
2096     }
2097   c = *buffer->cur++;
2098
       /* A forced location, when one is set, takes precedence over the
          position derived from the current buffer column.  */
2099   if (pfile->forced_token_location_p)
2100     result->src_loc = *pfile->forced_token_location_p;
2101   else
2102     result->src_loc = linemap_position_for_column (pfile->line_table,
2103                                           CPP_BUF_COLUMN (buffer, buffer->cur));
2104
       /* Dispatch on the first character of the token.  */
2105   switch (c)
2106     {
2107     case ' ': case '\t': case '\f': case '\v': case '\0':
2108       result->flags |= PREV_WHITE;
2109       skip_whitespace (pfile, c);
2110       goto skipped_white;
2111
2112     case '\n':
2113       if (buffer->cur < buffer->rlimit)
2114         CPP_INCREMENT_LINE (pfile, 0);
2115       buffer->need_line = true;
2116       goto fresh_line;
2117
2118     case '0': case '1': case '2': case '3': case '4':
2119     case '5': case '6': case '7': case '8': case '9':
2120       {
2121         struct normalize_state nst = INITIAL_NORMALIZE_STATE;
2122         result->type = CPP_NUMBER;
2123         lex_number (pfile, &result->val.str, &nst);
2124         warn_about_normalization (pfile, result, &nst);
2125         break;
2126       }
2127
2128     case 'L':
2129     case 'u':
2130     case 'U':
2131     case 'R':
2132       /* 'L', 'u', 'U', 'u8' or 'R' may introduce wide characters,
2133          wide strings or raw strings.  */
2134       if (c == 'L' || CPP_OPTION (pfile, rliterals)
2135           || (c != 'R' && CPP_OPTION (pfile, uliterals)))
2136         {
2137           if ((*buffer->cur == '\'' && c != 'R')
2138               || *buffer->cur == '"'
2139               || (*buffer->cur == 'R'
2140                   && c != 'R'
2141                   && buffer->cur[1] == '"'
2142                   && CPP_OPTION (pfile, rliterals))
2143               || (*buffer->cur == '8'
2144                   && c == 'u'
2145                   && (buffer->cur[1] == '"'
2146                       || (buffer->cur[1] == 'R' && buffer->cur[2] == '"'
2147                           && CPP_OPTION (pfile, rliterals)))))
2148             {
2149               lex_string (pfile, result, buffer->cur - 1);
2150               break;
2151             }
2152         }
2153       /* Fall through.  */
2154
2155     case '_':
2156     case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
2157     case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
2158     case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
2159     case 's': case 't':           case 'v': case 'w': case 'x':
2160     case 'y': case 'z':
2161     case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
2162     case 'G': case 'H': case 'I': case 'J': case 'K':
2163     case 'M': case 'N': case 'O': case 'P': case 'Q':
2164     case 'S': case 'T':           case 'V': case 'W': case 'X':
2165     case 'Y': case 'Z':
2166       result->type = CPP_NAME;
2167       {
2168         struct normalize_state nst = INITIAL_NORMALIZE_STATE;
2169         result->val.node.node = lex_identifier (pfile, buffer->cur - 1, false,
2170                                                 &nst);
2171         warn_about_normalization (pfile, result, &nst);
2172       }
2173
2174       /* Convert named operators to their proper types.  */
2175       if (result->val.node.node->flags & NODE_OPERATOR)
2176         {
2177           result->flags |= NAMED_OP;
2178           result->type = (enum cpp_ttype) result->val.node.node->directive_index;
2179         }
2180       break;
2181
2182     case '\'':
2183     case '"':
2184       lex_string (pfile, result, buffer->cur - 1);
2185       break;
2186
2187     case '/':
2188       /* A potential block or line comment.  */
2189       comment_start = buffer->cur;
2190       c = *buffer->cur;
2191
2192       if (c == '*')
2193         {
2194           if (_cpp_skip_block_comment (pfile))
2195             cpp_error (pfile, CPP_DL_ERROR, "unterminated comment");
2196         }
2197       else if (c == '/' && (CPP_OPTION (pfile, cplusplus_comments)
2198                             || cpp_in_system_header (pfile)))
2199         {
2200           /* Warn about comments only if pedantically GNUC89, and not
2201              in system headers.  */
2202           if (CPP_OPTION (pfile, lang) == CLK_GNUC89 && CPP_PEDANTIC (pfile)
2203               && ! buffer->warned_cplusplus_comments)
2204             {
2205               cpp_error (pfile, CPP_DL_PEDWARN,
2206                          "C++ style comments are not allowed in ISO C90");
2207               cpp_error (pfile, CPP_DL_PEDWARN,
2208                          "(this will be reported only once per input file)");
2209               buffer->warned_cplusplus_comments = 1;
2210             }
2211
2212           if (skip_line_comment (pfile) && CPP_OPTION (pfile, warn_comments))
2213             cpp_warning (pfile, CPP_W_COMMENTS, "multi-line comment");
2214         }
2215       else if (c == '=')
2216         {
2217           buffer->cur++;
2218           result->type = CPP_DIV_EQ;
2219           break;
2220         }
2221       else
2222         {
2223           result->type = CPP_DIV;
2224           break;
2225         }
2226
           /* Only the two comment branches get this far.  A comment
              that is not being saved counts as whitespace in front of
              the token that follows it.  */
2227       if (!pfile->state.save_comments)
2228         {
2229           result->flags |= PREV_WHITE;
2230           goto update_tokens_line;
2231         }
2232
2233       /* Save the comment as a token in its own right.  */
2234       save_comment (pfile, result, comment_start, c);
2235       break;
2236
2237     case '<':
           /* NOTE(review): lex_string appears to leave the type as
              CPP_LESS when no header-name could be formed, in which
              case the '<' is lexed as an operator below -- confirm
              against lex_string.  */
2238       if (pfile->state.angled_headers)
2239         {
2240           lex_string (pfile, result, buffer->cur - 1);
2241           if (result->type != CPP_LESS)
2242             break;
2243         }
2244
2245       result->type = CPP_LESS;
2246       if (*buffer->cur == '=')
2247         buffer->cur++, result->type = CPP_LESS_EQ;
2248       else if (*buffer->cur == '<')
2249         {
2250           buffer->cur++;
2251           IF_NEXT_IS ('=', CPP_LSHIFT_EQ, CPP_LSHIFT);
2252         }
2253       else if (CPP_OPTION (pfile, digraphs))
2254         {
2255           if (*buffer->cur == ':')
2256             {
2257               buffer->cur++;
2258               result->flags |= DIGRAPH;
2259               result->type = CPP_OPEN_SQUARE;
2260             }
2261           else if (*buffer->cur == '%')
2262             {
2263               buffer->cur++;
2264               result->flags |= DIGRAPH;
2265               result->type = CPP_OPEN_BRACE;
2266             }
2267         }
2268       break;
2269
2270     case '>':
2271       result->type = CPP_GREATER;
2272       if (*buffer->cur == '=')
2273         buffer->cur++, result->type = CPP_GREATER_EQ;
2274       else if (*buffer->cur == '>')
2275         {
2276           buffer->cur++;
2277           IF_NEXT_IS ('=', CPP_RSHIFT_EQ, CPP_RSHIFT);
2278         }
2279       break;
2280
2281     case '%':
2282       result->type = CPP_MOD;
2283       if (*buffer->cur == '=')
2284         buffer->cur++, result->type = CPP_MOD_EQ;
2285       else if (CPP_OPTION (pfile, digraphs))
2286         {
2287           if (*buffer->cur == ':')
2288             {
2289               buffer->cur++;
2290               result->flags |= DIGRAPH;
2291               result->type = CPP_HASH;
                   /* "%:%:" is the digraph spelling of "##".  */
2292               if (*buffer->cur == '%' && buffer->cur[1] == ':')
2293                 buffer->cur += 2, result->type = CPP_PASTE, result->val.token_no = 0;
2294             }
2295           else if (*buffer->cur == '>')
2296             {
2297               buffer->cur++;
2298               result->flags |= DIGRAPH;
2299               result->type = CPP_CLOSE_BRACE;
2300             }
2301         }
2302       break;
2303
2304     case '.':
2305       result->type = CPP_DOT;
2306       if (ISDIGIT (*buffer->cur))
2307         {
2308           struct normalize_state nst = INITIAL_NORMALIZE_STATE;
2309           result->type = CPP_NUMBER;
2310           lex_number (pfile, &result->val.str, &nst);
2311           warn_about_normalization (pfile, result, &nst);
2312         }
2313       else if (*buffer->cur == '.' && buffer->cur[1] == '.')
2314         buffer->cur += 2, result->type = CPP_ELLIPSIS;
2315       else if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
2316         buffer->cur++, result->type = CPP_DOT_STAR;
2317       break;
2318
2319     case '+':
2320       result->type = CPP_PLUS;
2321       if (*buffer->cur == '+')
2322         buffer->cur++, result->type = CPP_PLUS_PLUS;
2323       else if (*buffer->cur == '=')
2324         buffer->cur++, result->type = CPP_PLUS_EQ;
2325       break;
2326
2327     case '-':
2328       result->type = CPP_MINUS;
2329       if (*buffer->cur == '>')
2330         {
2331           buffer->cur++;
2332           result->type = CPP_DEREF;
2333           if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
2334             buffer->cur++, result->type = CPP_DEREF_STAR;
2335         }
2336       else if (*buffer->cur == '-')
2337         buffer->cur++, result->type = CPP_MINUS_MINUS;
2338       else if (*buffer->cur == '=')
2339         buffer->cur++, result->type = CPP_MINUS_EQ;
2340       break;
2341
2342     case '&':
2343       result->type = CPP_AND;
2344       if (*buffer->cur == '&')
2345         buffer->cur++, result->type = CPP_AND_AND;
2346       else if (*buffer->cur == '=')
2347         buffer->cur++, result->type = CPP_AND_EQ;
2348       break;
2349
2350     case '|':
2351       result->type = CPP_OR;
2352       if (*buffer->cur == '|')
2353         buffer->cur++, result->type = CPP_OR_OR;
2354       else if (*buffer->cur == '=')
2355         buffer->cur++, result->type = CPP_OR_EQ;
2356       break;
2357
2358     case ':':
2359       result->type = CPP_COLON;
2360       if (*buffer->cur == ':' && CPP_OPTION (pfile, cplusplus))
2361         buffer->cur++, result->type = CPP_SCOPE;
2362       else if (*buffer->cur == '>' && CPP_OPTION (pfile, digraphs))
2363         {
2364           buffer->cur++;
2365           result->flags |= DIGRAPH;
2366           result->type = CPP_CLOSE_SQUARE;
2367         }
2368       break;
2369
2370     case '*': IF_NEXT_IS ('=', CPP_MULT_EQ, CPP_MULT); break;
2371     case '=': IF_NEXT_IS ('=', CPP_EQ_EQ, CPP_EQ); break;
2372     case '!': IF_NEXT_IS ('=', CPP_NOT_EQ, CPP_NOT); break;
2373     case '^': IF_NEXT_IS ('=', CPP_XOR_EQ, CPP_XOR); break;
2374     case '#': IF_NEXT_IS ('#', CPP_PASTE, CPP_HASH); result->val.token_no = 0; break;
2375
2376     case '?': result->type = CPP_QUERY; break;
2377     case '~': result->type = CPP_COMPL; break;
2378     case ',': result->type = CPP_COMMA; break;
2379     case '(': result->type = CPP_OPEN_PAREN; break;
2380     case ')': result->type = CPP_CLOSE_PAREN; break;
2381     case '[': result->type = CPP_OPEN_SQUARE; break;
2382     case ']': result->type = CPP_CLOSE_SQUARE; break;
2383     case '{': result->type = CPP_OPEN_BRACE; break;
2384     case '}': result->type = CPP_CLOSE_BRACE; break;
2385     case ';': result->type = CPP_SEMICOLON; break;
2386
2387       /* @ is a punctuator in Objective-C.  */
2388     case '@': result->type = CPP_ATSIGN; break;
2389
2390     case '$':
2391     case '\\':
2392       {
             /* Step back onto the character so that forms_identifier_p
                can examine it; step forward again if it turns out not
                to start an identifier.  */
2393         const uchar *base = --buffer->cur;
2394         struct normalize_state nst = INITIAL_NORMALIZE_STATE;
2395
2396         if (forms_identifier_p (pfile, true, &nst))
2397           {
2398             result->type = CPP_NAME;
2399             result->val.node.node = lex_identifier (pfile, base, true, &nst);
2400             warn_about_normalization (pfile, result, &nst);
2401             break;
2402           }
2403         buffer->cur++;
2404       }
           /* Fall through: the character is lexed as a stray one.  */
2405
2406     default:
2407       create_literal (pfile, result, buffer->cur - 1, 1, CPP_OTHER);
2408       break;
2409     }
2410
2411   return result;
2412 }
2413
2414 /* An upper bound on the number of bytes needed to spell TOKEN.
2415    Does not include preceding whitespace.  */
2416 unsigned int
2417 cpp_token_len (const cpp_token *token)
2418 {
2419   unsigned int len;
2420
2421   switch (TOKEN_SPELL (token))
2422     {
2423     default:            len = 6;                                break;
2424     case SPELL_LITERAL: len = token->val.str.len;               break;
2425     case SPELL_IDENT:   len = NODE_LEN (token->val.node.node) * 10;     break;
2426     }
2427
2428   return len;
2429 }
2430
/* Decode the UTF-8 sequence that starts at NAME and write it to
   BUFFER as a \U escape followed by eight lowercase hexadecimal
   digits.  Exactly 10 bytes are written to BUFFER, with no
   terminating NUL.  Returns the number of bytes read from NAME.
   Aborts if a continuation byte of the sequence is ill-formed.  */

static size_t
utf8_to_ucn (unsigned char *buffer, const unsigned char *name)
{
  static const char hex_digits[] = "0123456789abcdef";
  unsigned long code_point;
  unsigned int lead;
  int seq_len = 0;
  int k;

  /* The count of leading 1 bits in the first byte is the length of
     the sequence.  */
  lead = *name;
  while (lead & 0x80)
    {
      seq_len++;
      lead <<= 1;
    }

  /* The bits of the first byte below the length marker are the most
     significant payload bits.  */
  code_point = *name & (0x7F >> seq_len);

  /* Every following byte must look like 10xxxxxx and contributes six
     more bits.  */
  for (k = 1; k < seq_len; k++)
    {
      const unsigned char trail = *++name;

      /* Ill-formed UTF-8.  */
      if ((trail & ~0x3F) != 0x80)
        abort ();

      code_point = (code_point << 6) | (trail & 0x3F);
    }

  buffer[0] = '\\';
  buffer[1] = 'U';
  for (k = 0; k < 8; k++)
    buffer[2 + k] = hex_digits[(code_point >> (4 * (7 - k))) & 0xF];

  return seq_len;
}
2464
2465 /* Given a token TYPE corresponding to a digraph, return a pointer to
2466    the spelling of the digraph.  */
2467 static const unsigned char *
2468 cpp_digraph2name (enum cpp_ttype type)
2469 {
2470   return digraph_spellings[(int) type - (int) CPP_FIRST_DIGRAPH];
2471 }
2472
2473 /* Write the spelling of a token TOKEN to BUFFER.  The buffer must
2474    already contain the enough space to hold the token's spelling.
2475    Returns a pointer to the character after the last character written.
2476    FORSTRING is true if this is to be the spelling after translation
2477    phase 1 (this is different for UCNs).
2478    FIXME: Would be nice if we didn't need the PFILE argument.  */
2479 unsigned char *
2480 cpp_spell_token (cpp_reader *pfile, const cpp_token *token,
2481                  unsigned char *buffer, bool forstring)
2482 {
2483   switch (TOKEN_SPELL (token))
2484     {
2485     case SPELL_OPERATOR:
2486       {
2487         const unsigned char *spelling;
2488         unsigned char c;
2489
2490         if (token->flags & DIGRAPH)
2491           spelling = cpp_digraph2name (token->type);
2492         else if (token->flags & NAMED_OP)
2493           goto spell_ident;
2494         else
2495           spelling = TOKEN_NAME (token);
2496
2497         while ((c = *spelling++) != '\0')
2498           *buffer++ = c;
2499       }
2500       break;
2501
2502     spell_ident:
2503     case SPELL_IDENT:
2504       if (forstring)
2505         {
2506           memcpy (buffer, NODE_NAME (token->val.node.node),
2507                   NODE_LEN (token->val.node.node));
2508           buffer += NODE_LEN (token->val.node.node);
2509         }
2510       else
2511         {
2512           size_t i;
2513           const unsigned char * name = NODE_NAME (token->val.node.node);
2514
2515           for (i = 0; i < NODE_LEN (token->val.node.node); i++)
2516             if (name[i] & ~0x7F)
2517               {
2518                 i += utf8_to_ucn (buffer, name + i) - 1;
2519                 buffer += 10;
2520               }
2521             else
2522               *buffer++ = NODE_NAME (token->val.node.node)[i];
2523         }
2524       break;
2525
2526     case SPELL_LITERAL:
2527       memcpy (buffer, token->val.str.text, token->val.str.len);
2528       buffer += token->val.str.len;
2529       break;
2530
2531     case SPELL_NONE:
2532       cpp_error (pfile, CPP_DL_ICE,
2533                  "unspellable token %s", TOKEN_NAME (token));
2534       break;
2535     }
2536
2537   return buffer;
2538 }
2539
2540 /* Returns TOKEN spelt as a null-terminated string.  The string is
2541    freed when the reader is destroyed.  Useful for diagnostics.  */
2542 unsigned char *
2543 cpp_token_as_text (cpp_reader *pfile, const cpp_token *token)
2544 {
2545   unsigned int len = cpp_token_len (token) + 1;
2546   unsigned char *start = _cpp_unaligned_alloc (pfile, len), *end;
2547
2548   end = cpp_spell_token (pfile, token, start, false);
2549   end[0] = '\0';
2550
2551   return start;
2552 }
2553
2554 /* Returns a pointer to a string which spells the token defined by
2555    TYPE and FLAGS.  Used by C front ends, which really should move to
2556    using cpp_token_as_text.  */
2557 const char *
2558 cpp_type2name (enum cpp_ttype type, unsigned char flags)
2559 {
2560   if (flags & DIGRAPH)
2561     return (const char *) cpp_digraph2name (type);
2562   else if (flags & NAMED_OP)
2563     return cpp_named_operator2name (type);
2564
2565   return (const char *) token_spellings[type].name;
2566 }
2567
2568 /* Writes the spelling of token to FP, without any preceding space.
2569    Separated from cpp_spell_token for efficiency - to avoid stdio
2570    double-buffering.  */
2571 void
2572 cpp_output_token (const cpp_token *token, FILE *fp)
2573 {
2574   switch (TOKEN_SPELL (token))
2575     {
2576     case SPELL_OPERATOR:
2577       {
2578         const unsigned char *spelling;
2579         int c;
2580
2581         if (token->flags & DIGRAPH)
2582           spelling = cpp_digraph2name (token->type);
2583         else if (token->flags & NAMED_OP)
2584           goto spell_ident;
2585         else
2586           spelling = TOKEN_NAME (token);
2587
2588         c = *spelling;
2589         do
2590           putc (c, fp);
2591         while ((c = *++spelling) != '\0');
2592       }
2593       break;
2594
2595     spell_ident:
2596     case SPELL_IDENT:
2597       {
2598         size_t i;
2599         const unsigned char * name = NODE_NAME (token->val.node.node);
2600
2601         for (i = 0; i < NODE_LEN (token->val.node.node); i++)
2602           if (name[i] & ~0x7F)
2603             {
2604               unsigned char buffer[10];
2605               i += utf8_to_ucn (buffer, name + i) - 1;
2606               fwrite (buffer, 1, 10, fp);
2607             }
2608           else
2609             fputc (NODE_NAME (token->val.node.node)[i], fp);
2610       }
2611       break;
2612
2613     case SPELL_LITERAL:
2614       fwrite (token->val.str.text, 1, token->val.str.len, fp);
2615       break;
2616
2617     case SPELL_NONE:
2618       /* An error, most probably.  */
2619       break;
2620     }
2621 }
2622
2623 /* Compare two tokens.  */
2624 int
2625 _cpp_equiv_tokens (const cpp_token *a, const cpp_token *b)
2626 {
2627   if (a->type == b->type && a->flags == b->flags)
2628     switch (TOKEN_SPELL (a))
2629       {
2630       default:                  /* Keep compiler happy.  */
2631       case SPELL_OPERATOR:
2632         /* token_no is used to track where multiple consecutive ##
2633            tokens were originally located.  */
2634         return (a->type != CPP_PASTE || a->val.token_no == b->val.token_no);
2635       case SPELL_NONE:
2636         return (a->type != CPP_MACRO_ARG
2637                 || a->val.macro_arg.arg_no == b->val.macro_arg.arg_no);
2638       case SPELL_IDENT:
2639         return a->val.node.node == b->val.node.node;
2640       case SPELL_LITERAL:
2641         return (a->val.str.len == b->val.str.len
2642                 && !memcmp (a->val.str.text, b->val.str.text,
2643                             a->val.str.len));
2644       }
2645
2646   return 0;
2647 }
2648
2649 /* Returns nonzero if a space should be inserted to avoid an
2650    accidental token paste for output.  For simplicity, it is
2651    conservative, and occasionally advises a space where one is not
2652    needed, e.g. "." and ".2".  */
2653 int
2654 cpp_avoid_paste (cpp_reader *pfile, const cpp_token *token1,
2655                  const cpp_token *token2)
2656 {
2657   enum cpp_ttype a = token1->type, b = token2->type;
2658   cppchar_t c;
2659
2660   if (token1->flags & NAMED_OP)
2661     a = CPP_NAME;
2662   if (token2->flags & NAMED_OP)
2663     b = CPP_NAME;
2664
2665   c = EOF;
2666   if (token2->flags & DIGRAPH)
2667     c = digraph_spellings[(int) b - (int) CPP_FIRST_DIGRAPH][0];
2668   else if (token_spellings[b].category == SPELL_OPERATOR)
2669     c = token_spellings[b].name[0];
2670
2671   /* Quickly get everything that can paste with an '='.  */
2672   if ((int) a <= (int) CPP_LAST_EQ && c == '=')
2673     return 1;
2674
2675   switch (a)
2676     {
2677     case CPP_GREATER:   return c == '>';
2678     case CPP_LESS:      return c == '<' || c == '%' || c == ':';
2679     case CPP_PLUS:      return c == '+';
2680     case CPP_MINUS:     return c == '-' || c == '>';
2681     case CPP_DIV:       return c == '/' || c == '*'; /* Comments.  */
2682     case CPP_MOD:       return c == ':' || c == '>';
2683     case CPP_AND:       return c == '&';
2684     case CPP_OR:        return c == '|';
2685     case CPP_COLON:     return c == ':' || c == '>';
2686     case CPP_DEREF:     return c == '*';
2687     case CPP_DOT:       return c == '.' || c == '%' || b == CPP_NUMBER;
2688     case CPP_HASH:      return c == '#' || c == '%'; /* Digraph form.  */
2689     case CPP_NAME:      return ((b == CPP_NUMBER
2690                                  && name_p (pfile, &token2->val.str))
2691                                 || b == CPP_NAME
2692                                 || b == CPP_CHAR || b == CPP_STRING); /* L */
2693     case CPP_NUMBER:    return (b == CPP_NUMBER || b == CPP_NAME
2694                                 || c == '.' || c == '+' || c == '-');
2695                                       /* UCNs */
2696     case CPP_OTHER:     return ((token1->val.str.text[0] == '\\'
2697                                  && b == CPP_NAME)
2698                                 || (CPP_OPTION (pfile, objc)
2699                                     && token1->val.str.text[0] == '@'
2700                                     && (b == CPP_NAME || b == CPP_STRING)));
2701     default:            break;
2702     }
2703
2704   return 0;
2705 }
2706
2707 /* Output all the remaining tokens on the current line, and a newline
2708    character, to FP.  Leading whitespace is removed.  If there are
2709    macros, special token padding is not performed.  */
2710 void
2711 cpp_output_line (cpp_reader *pfile, FILE *fp)
2712 {
2713   const cpp_token *token;
2714
2715   token = cpp_get_token (pfile);
2716   while (token->type != CPP_EOF)
2717     {
2718       cpp_output_token (token, fp);
2719       token = cpp_get_token (pfile);
2720       if (token->flags & PREV_WHITE)
2721         putc (' ', fp);
2722     }
2723
2724   putc ('\n', fp);
2725 }
2726
2727 /* Return a string representation of all the remaining tokens on the
2728    current line.  The result is allocated using xmalloc and must be
2729    freed by the caller.  */
2730 unsigned char *
2731 cpp_output_line_to_string (cpp_reader *pfile, const unsigned char *dir_name)
2732 {
2733   const cpp_token *token;
2734   unsigned int out = dir_name ? ustrlen (dir_name) : 0;
2735   unsigned int alloced = 120 + out;
2736   unsigned char *result = (unsigned char *) xmalloc (alloced);
2737
2738   /* If DIR_NAME is empty, there are no initial contents.  */
2739   if (dir_name)
2740     {
2741       sprintf ((char *) result, "#%s ", dir_name);
2742       out += 2;
2743     }
2744
2745   token = cpp_get_token (pfile);
2746   while (token->type != CPP_EOF)
2747     {
2748       unsigned char *last;
2749       /* Include room for a possible space and the terminating nul.  */
2750       unsigned int len = cpp_token_len (token) + 2;
2751
2752       if (out + len > alloced)
2753         {
2754           alloced *= 2;
2755           if (out + len > alloced)
2756             alloced = out + len;
2757           result = (unsigned char *) xrealloc (result, alloced);
2758         }
2759
2760       last = cpp_spell_token (pfile, token, &result[out], 0);
2761       out = last - result;
2762
2763       token = cpp_get_token (pfile);
2764       if (token->flags & PREV_WHITE)
2765         result[out++] = ' ';
2766     }
2767
2768   result[out] = '\0';
2769   return result;
2770 }
2771
/* Memory buffers.  Changing these three constants can have a dramatic
   effect on performance.  The values here are reasonable defaults,
   but might be tuned.  If you adjust them, be sure to test across a
   range of uses of cpplib, including heavy nested function-like macro
   expansion.  Also check the change in peak memory usage (NJAMD is a
   good tool for this).

   MIN_BUFF_SIZE is the smallest payload new_buff will allocate.

   BUFF_SIZE_UPPER_BOUND (MIN_SIZE) is the largest free buffer that
   _cpp_get_buff will reuse for a request of MIN_SIZE bytes.

   EXTENDED_BUFF_SIZE (BUFF, MIN_EXTRA) is the size requested when
   BUFF is extended: MIN_EXTRA plus twice the room left in BUFF.
   Every macro argument is parenthesized so that an argument with a
   low-precedence operator (e.g. a conditional expression) cannot
   change how the expansion is parsed.  */
#define MIN_BUFF_SIZE 8000
#define BUFF_SIZE_UPPER_BOUND(MIN_SIZE) (MIN_BUFF_SIZE + (MIN_SIZE) * 3 / 2)
#define EXTENDED_BUFF_SIZE(BUFF, MIN_EXTRA) \
        ((MIN_EXTRA) + ((BUFF)->limit - (BUFF)->cur) * 2)

/* Compile-time sanity check of the two constants above.  */
#if MIN_BUFF_SIZE > BUFF_SIZE_UPPER_BOUND (0)
  #error BUFF_SIZE_UPPER_BOUND must be at least as large as MIN_BUFF_SIZE!
#endif
2786
2787 /* Create a new allocation buffer.  Place the control block at the end
2788    of the buffer, so that buffer overflows will cause immediate chaos.  */
2789 static _cpp_buff *
2790 new_buff (size_t len)
2791 {
2792   _cpp_buff *result;
2793   unsigned char *base;
2794
2795   if (len < MIN_BUFF_SIZE)
2796     len = MIN_BUFF_SIZE;
2797   len = CPP_ALIGN (len);
2798
2799   base = XNEWVEC (unsigned char, len + sizeof (_cpp_buff));
2800   result = (_cpp_buff *) (base + len);
2801   result->base = base;
2802   result->cur = base;
2803   result->limit = base + len;
2804   result->next = NULL;
2805   return result;
2806 }
2807
2808 /* Place a chain of unwanted allocation buffers on the free list.  */
2809 void
2810 _cpp_release_buff (cpp_reader *pfile, _cpp_buff *buff)
2811 {
2812   _cpp_buff *end = buff;
2813
2814   while (end->next)
2815     end = end->next;
2816   end->next = pfile->free_buffs;
2817   pfile->free_buffs = buff;
2818 }
2819
2820 /* Return a free buffer of size at least MIN_SIZE.  */
2821 _cpp_buff *
2822 _cpp_get_buff (cpp_reader *pfile, size_t min_size)
2823 {
2824   _cpp_buff *result, **p;
2825
2826   for (p = &pfile->free_buffs;; p = &(*p)->next)
2827     {
2828       size_t size;
2829
2830       if (*p == NULL)
2831         return new_buff (min_size);
2832       result = *p;
2833       size = result->limit - result->base;
2834       /* Return a buffer that's big enough, but don't waste one that's
2835          way too big.  */
2836       if (size >= min_size && size <= BUFF_SIZE_UPPER_BOUND (min_size))
2837         break;
2838     }
2839
2840   *p = result->next;
2841   result->next = NULL;
2842   result->cur = result->base;
2843   return result;
2844 }
2845
2846 /* Creates a new buffer with enough space to hold the uncommitted
2847    remaining bytes of BUFF, and at least MIN_EXTRA more bytes.  Copies
2848    the excess bytes to the new buffer.  Chains the new buffer after
2849    BUFF, and returns the new buffer.  */
2850 _cpp_buff *
2851 _cpp_append_extend_buff (cpp_reader *pfile, _cpp_buff *buff, size_t min_extra)
2852 {
2853   size_t size = EXTENDED_BUFF_SIZE (buff, min_extra);
2854   _cpp_buff *new_buff = _cpp_get_buff (pfile, size);
2855
2856   buff->next = new_buff;
2857   memcpy (new_buff->base, buff->cur, BUFF_ROOM (buff));
2858   return new_buff;
2859 }
2860
2861 /* Creates a new buffer with enough space to hold the uncommitted
2862    remaining bytes of the buffer pointed to by BUFF, and at least
2863    MIN_EXTRA more bytes.  Copies the excess bytes to the new buffer.
2864    Chains the new buffer before the buffer pointed to by BUFF, and
2865    updates the pointer to point to the new buffer.  */
2866 void
2867 _cpp_extend_buff (cpp_reader *pfile, _cpp_buff **pbuff, size_t min_extra)
2868 {
2869   _cpp_buff *new_buff, *old_buff = *pbuff;
2870   size_t size = EXTENDED_BUFF_SIZE (old_buff, min_extra);
2871
2872   new_buff = _cpp_get_buff (pfile, size);
2873   memcpy (new_buff->base, old_buff->cur, BUFF_ROOM (old_buff));
2874   new_buff->next = old_buff;
2875   *pbuff = new_buff;
2876 }
2877
2878 /* Free a chain of buffers starting at BUFF.  */
2879 void
2880 _cpp_free_buff (_cpp_buff *buff)
2881 {
2882   _cpp_buff *next;
2883
2884   for (; buff; buff = next)
2885     {
2886       next = buff->next;
2887       free (buff->base);
2888     }
2889 }
2890
2891 /* Allocate permanent, unaligned storage of length LEN.  */
2892 unsigned char *
2893 _cpp_unaligned_alloc (cpp_reader *pfile, size_t len)
2894 {
2895   _cpp_buff *buff = pfile->u_buff;
2896   unsigned char *result = buff->cur;
2897
2898   if (len > (size_t) (buff->limit - result))
2899     {
2900       buff = _cpp_get_buff (pfile, len);
2901       buff->next = pfile->u_buff;
2902       pfile->u_buff = buff;
2903       result = buff->cur;
2904     }
2905
2906   buff->cur = result + len;
2907   return result;
2908 }
2909
2910 /* Allocate permanent, unaligned storage of length LEN from a_buff.
2911    That buffer is used for growing allocations when saving macro
2912    replacement lists in a #define, and when parsing an answer to an
2913    assertion in #assert, #unassert or #if (and therefore possibly
2914    whilst expanding macros).  It therefore must not be used by any
2915    code that they might call: specifically the lexer and the guts of
2916    the macro expander.
2917
2918    All existing other uses clearly fit this restriction: storing
2919    registered pragmas during initialization.  */
2920 unsigned char *
2921 _cpp_aligned_alloc (cpp_reader *pfile, size_t len)
2922 {
2923   _cpp_buff *buff = pfile->a_buff;
2924   unsigned char *result = buff->cur;
2925
2926   if (len > (size_t) (buff->limit - result))
2927     {
2928       buff = _cpp_get_buff (pfile, len);
2929       buff->next = pfile->a_buff;
2930       pfile->a_buff = buff;
2931       result = buff->cur;
2932     }
2933
2934   buff->cur = result + len;
2935   return result;
2936 }
2937
2938 /* Say which field of TOK is in use.  */
2939
2940 enum cpp_token_fld_kind
2941 cpp_token_val_index (cpp_token *tok)
2942 {
2943   switch (TOKEN_SPELL (tok))
2944     {
2945     case SPELL_IDENT:
2946       return CPP_TOKEN_FLD_NODE;
2947     case SPELL_LITERAL:
2948       return CPP_TOKEN_FLD_STR;
2949     case SPELL_OPERATOR:
2950       if (tok->type == CPP_PASTE)
2951         return CPP_TOKEN_FLD_TOKEN_NO;
2952       else
2953         return CPP_TOKEN_FLD_NONE;
2954     case SPELL_NONE:
2955       if (tok->type == CPP_MACRO_ARG)
2956         return CPP_TOKEN_FLD_ARG_NO;
2957       else if (tok->type == CPP_PADDING)
2958         return CPP_TOKEN_FLD_SOURCE;
2959       else if (tok->type == CPP_PRAGMA)
2960         return CPP_TOKEN_FLD_PRAGMA;
2961       /* else fall through */
2962     default:
2963       return CPP_TOKEN_FLD_NONE;
2964     }
2965 }
2966
2967 /* All tokens lexed in R after calling this function will be forced to have
2968    their source_location the same as the location referenced by P, until
2969    cpp_stop_forcing_token_locations is called for R.  */
2970
2971 void
2972 cpp_force_token_locations (cpp_reader *r, source_location *p)
2973 {
2974   r->forced_token_location_p = p;
2975 }
2976
2977 /* Go back to assigning locations naturally for lexed tokens.  */
2978
2979 void
2980 cpp_stop_forcing_token_locations (cpp_reader *r)
2981 {
2982   r->forced_token_location_p = NULL;
2983 }