libcpp/lex.c

   1 /* CPP Library - lexical analysis.
   2    Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2007, 2008, 2009, 2010
   3    Free Software Foundation, Inc.
   4    Contributed by Per Bothner, 1994-95.
   5    Based on CCCP program by Paul Rubin, June 1986
   6    Adapted to ANSI C, Richard Stallman, Jan 1987
   7    Broken out to separate file, Zack Weinberg, Mar 2000
   8
   9 This program is free software; you can redistribute it and/or modify it
  10 under the terms of the GNU General Public License as published by the
  11 Free Software Foundation; either version 3, or (at your option) any
  12 later version.
  13
  14 This program is distributed in the hope that it will be useful,
  15 but WITHOUT ANY WARRANTY; without even the implied warranty of
  16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  17 GNU General Public License for more details.
  18
  19 You should have received a copy of the GNU General Public License
  20 along with this program; see the file COPYING3.  If not see
  21 <http://www.gnu.org/licenses/>.  */
  22
  23 #include "config.h"
  24 #include "system.h"
  25 #include "cpplib.h"
  26 #include "internal.h"
  27
   28 enum spell_type  /* Category stored in each row of token_spellings.  */
   29 {
   30   SPELL_OPERATOR = 0,  /* Category of every OP() row of TTYPE_TABLE.  */
   31   SPELL_IDENT,         /* The remaining categories are selected per TK() row:  */
   32   SPELL_LITERAL,       /* TK (e, s) pastes its second argument onto SPELL_.  */
   33   SPELL_NONE
   34 };
  35
   36 struct token_spelling  /* One row of the token_spellings table.  */
   37 {
   38   enum spell_type category;   /* Read through TOKEN_SPELL.  */
   39   const unsigned char *name;  /* Read through TOKEN_NAME: the literal spelling for OP() rows, the stringified enumerator name for TK() rows.  */
   40 };
  41
   42 static const unsigned char *const digraph_spellings[] =  /* Spellings of the six digraphs.  NOTE(review): the indexing scheme is set by the users of this table, which are outside this view -- confirm there.  */
   43 { UC"%:", UC"%:%:", UC"<:", UC":>", UC"<%", UC"%>" };
  44
   45 #define OP(e, s) { SPELL_OPERATOR, UC s  },  /* Operator row: the spelling is the literal S.  */
   46 #define TK(e, s) { SPELL_ ## s,    UC #e },  /* Other row: category SPELL_<s>, name is E stringified.  */
   47 static const struct token_spelling token_spellings[N_TTYPES] = { TTYPE_TABLE };  /* Built by expanding TTYPE_TABLE with the temporary OP/TK definitions above; indexed by token type.  */
   48 #undef OP
   49 #undef TK
   50
   51 #define TOKEN_SPELL(token) (token_spellings[(token)->type].category)  /* Spelling category for TOKEN's type.  */
   52 #define TOKEN_NAME(token) (token_spellings[(token)->type].name)  /* Spelling string for TOKEN's type.  */
  53
  54 static void add_line_note (cpp_buffer *, const uchar *, unsigned int);
  55 static int skip_line_comment (cpp_reader *);
  56 static void skip_whitespace (cpp_reader *, cppchar_t);
  57 static void lex_string (cpp_reader *, cpp_token *, const uchar *);
  58 static void save_comment (cpp_reader *, cpp_token *, const uchar *, cppchar_t);
  59 static void store_comment (cpp_reader *, cpp_token *);
  60 static void create_literal (cpp_reader *, cpp_token *, const uchar *,
  61                             unsigned int, enum cpp_ttype);
  62 static bool warn_in_comment (cpp_reader *, _cpp_line_note *);
  63 static int name_p (cpp_reader *, const cpp_string *);
  64 static tokenrun *next_tokenrun (tokenrun *);
  65
  66 static _cpp_buff *new_buff (size_t);
  67
  68
  69 /* Utility routine:
  70
  71    Compares, the token TOKEN to the NUL-terminated string STRING.
  72    TOKEN must be a CPP_NAME.  Returns 1 for equal, 0 for unequal.  */
  73 int
  74 cpp_ideq (const cpp_token *token, const char *string)
  75 {
  76   if (token->type != CPP_NAME)
  77     return 0;
  78
  79   return !ustrcmp (NODE_NAME (token->val.node.node), (const uchar *) string);
  80 }
  81
  82 /* Record a note TYPE at byte POS into the current cleaned logical
  83    line.  */
  84 static void
  85 add_line_note (cpp_buffer *buffer, const uchar *pos, unsigned int type)
  86 {
  87   if (buffer->notes_used == buffer->notes_cap)
  88     {
  89       buffer->notes_cap = buffer->notes_cap * 2 + 200;
  90       buffer->notes = XRESIZEVEC (_cpp_line_note, buffer->notes,
  91                                   buffer->notes_cap);
  92     }
  93
  94   buffer->notes[buffer->notes_used].pos = pos;
  95   buffer->notes[buffer->notes_used].type = type;
  96   buffer->notes_used++;
  97 }
  98
  99 \f
 100 /* Fast path to find line special characters using optimized character
 101    scanning algorithms.  Anything complicated falls back to the slow
 102    path below.  Since this loop is very hot it's worth doing these kinds
 103    of optimizations.
 104
 105    One of the paths through the ifdefs should provide
 106
 107      const uchar *search_line_fast (const uchar *s, const uchar *end);
 108
 109    Between S and END, search for \n, \r, \\, ?.  Return a pointer to
 110    the found character.
 111
 112    Note that the last character of the buffer is *always* a newline,
 113    as forced by _cpp_convert_input.  This fact can be used to avoid
 114    explicitly looking for the end of the buffer.  */
 115
 116 /* Configure gives us an ifdef test.  */
 117 #ifndef WORDS_BIGENDIAN
 118 #define WORDS_BIGENDIAN 0
 119 #endif
 120
 121 /* We'd like the largest integer that fits into a register.  There's nothing
 122    in <stdint.h> that gives us that.  For most hosts this is unsigned long,
 123    but MS decided on an LLP64 model.  Thankfully when building with GCC we
 124    can get the "real" word size.  */
 125 #ifdef __GNUC__
 126 typedef unsigned int word_type __attribute__((__mode__(__word__)));
 127 #else
 128 typedef unsigned long word_type;
 129 #endif
 130
 131 /* The code below is only expecting sizes 4 or 8.
 132    Die at compile-time if this expectation is violated.  */
 133 typedef char check_word_type_size
 134   [(sizeof(word_type) == 8 || sizeof(word_type) == 4) * 2 - 1];
 135
 136 /* Return X with the first N bytes forced to values that won't match one
 137    of the interesting characters.  Note that NUL is not interesting.  */
 138
 139 static inline word_type
 140 acc_char_mask_misalign (word_type val, unsigned int n)
 141 {
 142   word_type mask = -1;
 143   if (WORDS_BIGENDIAN)
 144     mask >>= n * 8;
 145   else
 146     mask <<= n * 8;
 147   return val & mask;
 148 }
 149
 150 /* Return X replicated to all byte positions within WORD_TYPE.  */
 151
 152 static inline word_type
 153 acc_char_replicate (uchar x)
 154 {
 155   word_type ret;
 156
 157   ret = (x << 24) | (x << 16) | (x << 8) | x;
 158   if (sizeof(word_type) == 8)
 159     ret = (ret << 16 << 16) | ret;
 160   return ret;
 161 }
 162
 163 /* Return non-zero if some byte of VAL is (probably) C.  */
 164
 165 static inline word_type
 166 acc_char_cmp (word_type val, word_type c)
 167 {
 168 #if defined(__GNUC__) && defined(__alpha__)
 169   /* We can get exact results using a compare-bytes instruction.
 170      Get (val == c) via (0 >= (val ^ c)).  */
 171   return __builtin_alpha_cmpbge (0, val ^ c);
 172 #else
 173   word_type magic = 0x7efefefeU;
 174   if (sizeof(word_type) == 8)
 175     magic = (magic << 16 << 16) | 0xfefefefeU;
 176   magic |= 1;
 177
 178   val ^= c;
 179   return ((val + magic) ^ ~val) & ~magic;
 180 #endif
 181 }
 182
 183 /* Given the result of acc_char_cmp is non-zero, return the index of
 184    the found character.  If this was a false positive, return -1.  */
 185
 186 static inline int
 187 acc_char_index (word_type cmp ATTRIBUTE_UNUSED,
 188                 word_type val ATTRIBUTE_UNUSED)
 189 {
 190 #if defined(__GNUC__) && defined(__alpha__) && !WORDS_BIGENDIAN
 191   /* The cmpbge instruction sets *bits* of the result corresponding to
 192      matches in the bytes with no false positives.  */
 193   return __builtin_ctzl (cmp);
 194 #else
 195   unsigned int i;
 196
 197   /* ??? It would be nice to force unrolling here,
 198      and have all of these constants folded.  */
 199   for (i = 0; i < sizeof(word_type); ++i)
 200     {
 201       uchar c;
 202       if (WORDS_BIGENDIAN)
 203         c = (val >> (sizeof(word_type) - i - 1) * 8) & 0xff;
 204       else
 205         c = (val >> i * 8) & 0xff;
 206
 207       if (c == '\n' || c == '\r' || c == '\\' || c == '?')
 208         return i;
 209     }
 210
 211   return -1;
 212 #endif
 213 }
 214
 215 /* A version of the fast scanner using bit fiddling techniques.
 216
 217    For 32-bit words, one would normally perform 16 comparisons and
 218    16 branches.  With this algorithm one performs 24 arithmetic
 219    operations and one branch.  Whether this is faster with a 32-bit
 220    word size is going to be somewhat system dependent.
 221
 222    For 64-bit words, we eliminate twice the number of comparisons
 223    and branches without increasing the number of arithmetic operations.
 224    It's almost certainly going to be a win with 64-bit word size.  */
 225
 226 static const uchar * search_line_acc_char (const uchar *, const uchar *)
 227   ATTRIBUTE_UNUSED;
 228
 229 static const uchar *
 230 search_line_acc_char (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 231 {
 232   const word_type repl_nl = acc_char_replicate ('\n');
 233   const word_type repl_cr = acc_char_replicate ('\r');
 234   const word_type repl_bs = acc_char_replicate ('\\');
 235   const word_type repl_qm = acc_char_replicate ('?');
 236
 237   unsigned int misalign;
 238   const word_type *p;
 239   word_type val, t;
 240
 241   /* Align the buffer.  Mask out any bytes from before the beginning.  */
 242   p = (word_type *)((uintptr_t)s & -sizeof(word_type));
 243   val = *p;
 244   misalign = (uintptr_t)s & (sizeof(word_type) - 1);
 245   if (misalign)
 246     val = acc_char_mask_misalign (val, misalign);
 247
 248   /* Main loop.  */
 249   while (1)
 250     {
 251       t  = acc_char_cmp (val, repl_nl);
 252       t |= acc_char_cmp (val, repl_cr);
 253       t |= acc_char_cmp (val, repl_bs);
 254       t |= acc_char_cmp (val, repl_qm);
 255
 256       if (__builtin_expect (t != 0, 0))
 257         {
 258           int i = acc_char_index (t, val);
 259           if (i >= 0)
 260             return (const uchar *)p + i;
 261         }
 262
 263       val = *++p;
 264     }
 265 }
 266
 267 /* Disable on Solaris 2/x86 until the following problems can be properly
 268    autoconfed:
 269
 270    The Solaris 8 assembler cannot assemble SSE2/SSE4.2 insns.
 271    The Solaris 9 assembler cannot assemble SSE4.2 insns.
 272    Before Solaris 9 Update 6, SSE insns cannot be executed.
 273    The Solaris 10+ assembler tags objects with the instruction set
 274    extensions used, so SSE4.2 executables cannot run on machines that
 275    don't support that extension.  */
 276
 277 #if (GCC_VERSION >= 4005) && (defined(__i386__) || defined(__x86_64__)) && !(defined(__sun__) && defined(__svr4__))
 278
  279 /* Replicated character data to be shared between implementations.
  280    Recall that outside of a context with vector support we can't
  281    define compatible vector types, therefore these are all defined
  282    in terms of raw characters.

         Row 0 is '\n', row 1 is '\r', row 2 is '\\' and row 3 is '?'.
         search_line_mmx loads the first 8 bytes of a row as one vector
         and search_line_sse2 loads all 16.  */
  283 static const char repl_chars[4][16] __attribute__((aligned(16))) = {
  284   { '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
  285     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n' },
  286   { '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
  287     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r' },
  288   { '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
  289     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\' },
  290   { '?', '?', '?', '?', '?', '?', '?', '?',
  291     '?', '?', '?', '?', '?', '?', '?', '?' },
  292 };
 293
  294 /* A version of the fast scanner using MMX vectorized byte compare insns.
  295
  296    This uses the PMOVMSKB instruction which was introduced with "MMX2",
  297    which was packaged into SSE1; it is also present in the AMD 3dNOW-A
  298    extension.  Mark the function as using "sse" so that we emit a real
  299    "emms" instruction, rather than the 3dNOW "femms" instruction.

         Returns a pointer to the first '\n', '\r', '\\' or '?' at or
         after S.  END is unused: the loop has no bound of its own and
         stops only when it finds a match.  */
  300
  301 static const uchar *
  302 #ifndef __SSE__
  303 __attribute__((__target__("sse")))
  304 #endif
  305 search_line_mmx (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
  306 {
  307   typedef char v8qi __attribute__ ((__vector_size__ (8)));
  308   typedef int __m64 __attribute__ ((__vector_size__ (8), __may_alias__));
  309
  310   const v8qi repl_nl = *(const v8qi *)repl_chars[0];
  311   const v8qi repl_cr = *(const v8qi *)repl_chars[1];
  312   const v8qi repl_bs = *(const v8qi *)repl_chars[2];
  313   const v8qi repl_qm = *(const v8qi *)repl_chars[3];
  314
  315   unsigned int misalign, found, mask;
  316   const v8qi *p;
  317   v8qi data, t, c;
  318
  319   /* Align the source pointer.  While MMX doesn't generate unaligned data
  320      faults, this allows us to safely scan to the end of the buffer without
  321      reading beyond the end of the last page.  */
  322   misalign = (uintptr_t)s & 7;  /* Offset of S within its 8-byte block.  */
  323   p = (const v8qi *)((uintptr_t)s & -8);
  324   data = *p;
  325
  326   /* Create a mask for the bytes that are valid within the first
  327      8-byte block.  The Idea here is that the AND with the mask
  328      within the loop is "free", since we need some AND or TEST
  329      insn in order to set the flags for the branch anyway.  */
  330   mask = -1u << misalign;
  331
  332   /* Main loop processing 8 bytes at a time.  The first block is already
         loaded and has its own mask, so enter the loop at the compare.  */
  333   goto start;
  334   do
  335     {
  336       data = *++p;
  337       mask = -1;  /* Every byte of a later block is valid.  */
  338
  339     start:
            /* OR together the per-byte equality results for the four
               characters, then collect one bit per byte into FOUND.  */
  340       t = __builtin_ia32_pcmpeqb(data, repl_nl);
  341       c = __builtin_ia32_pcmpeqb(data, repl_cr);
  342       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
  343       c = __builtin_ia32_pcmpeqb(data, repl_bs);
  344       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
  345       c = __builtin_ia32_pcmpeqb(data, repl_qm);
  346       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
  347       found = __builtin_ia32_pmovmskb (t);
  348       found &= mask;  /* Drop matches that lie before S.  */
  349     }
  350   while (!found);
  351
  352   __builtin_ia32_emms ();
  353
  354   /* FOUND contains 1 in bits for which we matched a relevant
  355      character.  Conversion to the byte index is trivial.  */
  356   found = __builtin_ctz(found);
  357   return (const uchar *)p + found;
  358 }
 359
  360 /* A version of the fast scanner using SSE2 vectorized byte compare insns.

         Returns a pointer to the first '\n', '\r', '\\' or '?' at or
         after S.  END is unused: the loop has no bound of its own and
         stops only when it finds a match.  */
  361
  362 static const uchar *
  363 #ifndef __SSE2__
  364 __attribute__((__target__("sse2")))
  365 #endif
  366 search_line_sse2 (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
  367 {
  368   typedef char v16qi __attribute__ ((__vector_size__ (16)));
  369
  370   const v16qi repl_nl = *(const v16qi *)repl_chars[0];
  371   const v16qi repl_cr = *(const v16qi *)repl_chars[1];
  372   const v16qi repl_bs = *(const v16qi *)repl_chars[2];
  373   const v16qi repl_qm = *(const v16qi *)repl_chars[3];
  374
  375   unsigned int misalign, found, mask;
  376   const v16qi *p;
  377   v16qi data, t;
  378
  379   /* Align the source pointer.  */
  380   misalign = (uintptr_t)s & 15;  /* Offset of S within its 16-byte block.  */
  381   p = (const v16qi *)((uintptr_t)s & -16);
  382   data = *p;
  383
  384   /* Create a mask for the bytes that are valid within the first
  385      16-byte block.  The Idea here is that the AND with the mask
  386      within the loop is "free", since we need some AND or TEST
  387      insn in order to set the flags for the branch anyway.  */
  388   mask = -1u << misalign;
  389
  390   /* Main loop processing 16 bytes at a time.  The first block is already
         loaded and has its own mask, so enter the loop at the compare.  */
  391   goto start;
  392   do
  393     {
  394       data = *++p;
  395       mask = -1;  /* Every byte of a later block is valid.  */
  396
  397     start:
  398       t  = __builtin_ia32_pcmpeqb128(data, repl_nl);
  399       t |= __builtin_ia32_pcmpeqb128(data, repl_cr);
  400       t |= __builtin_ia32_pcmpeqb128(data, repl_bs);
  401       t |= __builtin_ia32_pcmpeqb128(data, repl_qm);
  402       found = __builtin_ia32_pmovmskb128 (t);
  403       found &= mask;  /* Drop matches that lie before S.  */
  404     }
  405   while (!found);
  406
  407   /* FOUND contains 1 in bits for which we matched a relevant
  408      character.  Conversion to the byte index is trivial.  */
  409   found = __builtin_ctz(found);
  410   return (const uchar *)p + found;
  411 }
 412
  413 #ifdef HAVE_SSE4
  414 /* A version of the fast scanner using SSE 4.2 vectorized string insns.

         Returns a pointer to the first '\n', '\r', '\\' or '?' at or
         after S.  END is used only to decide whether an unaligned
         16-byte read at S is safe; the main loop stops only when it
         finds a match.

         NOTE(review): the asm operands below put 4 in EAX and 16 in
         EDX, presumably the lengths of the SEARCH set and of the data
         block, and take the result from ECX; the code treats a result
         below 16 as a match index.  Confirm the details against the
         PCMPESTRI instruction reference.  */
  415
  416 static const uchar *
  417 #ifndef __SSE4_2__
  418 __attribute__((__target__("sse4.2")))
  419 #endif
  420 search_line_sse42 (const uchar *s, const uchar *end)
  421 {
  422   typedef char v16qi __attribute__ ((__vector_size__ (16)));
  423   static const v16qi search = { '\n', '\r', '?', '\\' };  /* Only the first four elements are set explicitly.  */
  424
  425   uintptr_t si = (uintptr_t)s;
  426   uintptr_t index;
  427
  428   /* Check for unaligned input.  */
  429   if (si & 15)
  430     {
  431       if (__builtin_expect (end - s < 16, 0)
  432           && __builtin_expect ((si & 0xfff) > 0xff0, 0))
  433         {
  434           /* There are less than 16 bytes left in the buffer, and less
  435              than 16 bytes left on the page.  Reading 16 bytes at this
  436              point might generate a spurious page fault.  Defer to the
  437              SSE2 implementation, which already handles alignment.  */
  438           return search_line_sse2 (s, end);
  439         }
  440
  441       /* ??? The builtin doesn't understand that the PCMPESTRI read from
  442          memory need not be aligned.  */
  443       __asm ("%vpcmpestri $0, (%1), %2"
  444              : "=c"(index) : "r"(s), "x"(search), "a"(4), "d"(16));
  445       if (__builtin_expect (index < 16, 0))
  446         goto found;
  447
  448       /* Advance the pointer to an aligned address.  We will re-scan a
  449          few bytes, but we no longer need care for reading past the
  450          end of a page, since we're guaranteed a match.  */
  451       s = (const uchar *)((si + 16) & -16);
  452     }
  453
  454   /* Main loop, processing 16 bytes at a time.  By doing the whole loop
  455      in inline assembly, we can make proper use of the flags set.  The
         initial SUB cancels the first ADD, so the first block examined
         starts at S itself.  */
  456   __asm (      "sub $16, %1\n"
  457         "       .balign 16\n"
  458         "0:     add $16, %1\n"
  459         "       %vpcmpestri $0, (%1), %2\n"
  460         "       jnc 0b"
  461         : "=&c"(index), "+r"(s)
  462         : "x"(search), "a"(4), "d"(16));
  463
  464  found:
  465   return s + index;
  466 }
  467
  468 #else
  469 /* Work around out-dated assemblers without sse4 support: without
         HAVE_SSE4 the SSE4.2 name is simply an alias for the SSE2 scanner.  */
  470 #define search_line_sse42 search_line_sse2
  471 #endif
 472
 473 /* Check the CPU capabilities.  */
 474
 475 #include "../gcc/config/i386/cpuid.h"
 476
  477 typedef const uchar * (*search_line_fast_type) (const uchar *, const uchar *);  /* Signature shared by all scanner implementations.  */
  478 static search_line_fast_type search_line_fast;  /* Scanner in use; assigned by init_vectorized_lexer.  */
 479
 480 static void __attribute__((constructor))
 481 init_vectorized_lexer (void)
 482 {
 483   unsigned dummy, ecx = 0, edx = 0;
 484   search_line_fast_type impl = search_line_acc_char;
 485   int minimum = 0;
 486
 487 #if defined(__SSE4_2__)
 488   minimum = 3;
 489 #elif defined(__SSE2__)
 490   minimum = 2;
 491 #elif defined(__SSE__) || defined(__3dNOW_A__)
 492   minimum = 1;
 493 #endif
 494
 495   if (minimum == 3)
 496     impl = search_line_sse42;
 497   else if (__get_cpuid (1, &dummy, &dummy, &ecx, &edx) || minimum == 2)
 498     {
 499       if (minimum == 3 || (ecx & bit_SSE4_2))
 500         impl = search_line_sse42;
 501       else if (minimum == 2 || (edx & bit_SSE2))
 502         impl = search_line_sse2;
 503       else if (minimum == 1 || (edx & bit_SSE))
 504         impl = search_line_mmx;
 505     }
 506   else if (__get_cpuid (0x80000001, &dummy, &dummy, &dummy, &edx))
 507     {
 508       if (minimum == 1 || edx & bit_3DNOWP)
 509         impl = search_line_mmx;
 510     }
 511
 512   search_line_fast = impl;
 513 }
 514
 515 #elif (GCC_VERSION >= 4005) && defined(__ALTIVEC__)
 516
  517 /* A version of the fast scanner using AltiVec vectorized byte compares.

         Returns a pointer to the first '\n', '\r', '\\' or '?' at or
         after S.  END is unused: the loop has no bound of its own and
         stops only when it finds a match.  */
  518 /* ??? Unfortunately, attribute(target("altivec")) is not yet supported,
  519    so we can't compile this function without -maltivec on the command line
  520    (or implied by some other switch).  */
  521
  522 static const uchar *
  523 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
  524 {
  525   typedef __attribute__((altivec(vector))) unsigned char vc;
  526
  527   const vc repl_nl = {
  528     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
  529     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
  530   };
  531   const vc repl_cr = {
  532     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
  533     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
  534   };
  535   const vc repl_bs = {
  536     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
  537     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
  538   };
  539   const vc repl_qm = {
  540     '?', '?', '?', '?', '?', '?', '?', '?',
  541     '?', '?', '?', '?', '?', '?', '?', '?',
  542   };
  543   const vc ones = {
  544     -1, -1, -1, -1, -1, -1, -1, -1,
  545     -1, -1, -1, -1, -1, -1, -1, -1,
  546   };
  547   const vc zero = { 0 };
  548
  549   vc data, mask, t;
  550
  551   /* Altivec loads automatically mask addresses with -16.  This lets us
  552      issue the first load as early as possible.  */
  553   data = __builtin_vec_ld(0, (const vc *)s);
  554
  555   /* Discard bytes before the beginning of the buffer.  Do this by
  556      beginning with all ones and shifting in zeros according to the
  557      mis-alignment.  The LVSR instruction pulls the exact shift we
  558      want from the address.  */
  559   mask = __builtin_vec_lvsr(0, s);
  560   mask = __builtin_vec_perm(zero, ones, mask);
  561   data &= mask;
  562
  563   /* While altivec loads mask addresses, we still need to align S so
  564      that the offset we compute at the end is correct.  */
  565   s = (const uchar *)((uintptr_t)s & -16);
  566
  567   /* Main loop processing 16 bytes at a time.  The first block is already
         loaded and masked, so enter the loop at the compare.  */
  568   goto start;
  569   do
  570     {
  571       vc m_nl, m_cr, m_bs, m_qm;
  572
  573       s += 16;
  574       data = __builtin_vec_ld(0, (const vc *)s);
  575
  576     start:
  577       m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
  578       m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
  579       m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
  580       m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
  581       t = (m_nl | m_cr) | (m_bs | m_qm);
  582
  583       /* T now contains 0xff in bytes for which we matched one of the relevant
  584          characters.  We want to exit the loop if any byte in T is non-zero.
  585          Below is the expansion of vec_any_ne(t, zero).  */
  586     }
  587   while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));
  588
  589   {
  590 #define N  (sizeof(vc) / sizeof(long))  /* Number of longs in one vector.  */
  591
  592     typedef char check_count[(N == 2 || N == 4) * 2 - 1];  /* Compile-time check that N is 2 or 4.  */
  593     union {
  594       vc v;
  595       unsigned long l[N];
  596     } u;
  597     unsigned long l, i = 0;
  598
  599     u.v = t;
  600
  601     /* Find the first word of T that is non-zero.  S is advanced past
           every all-zero word.  Case 4 deliberately falls through into
           case 2 to examine the last two words.  */
  602     switch (N)
  603       {
  604       case 4:
  605         l = u.l[i++];
  606         if (l != 0)
  607           break;
  608         s += sizeof(unsigned long);
  609         l = u.l[i++];
  610         if (l != 0)
  611           break;
  612         s += sizeof(unsigned long);
  613       case 2:
  614         l = u.l[i++];
  615         if (l != 0)
  616           break;
  617         s += sizeof(unsigned long);
  618         l = u.l[i];  /* The loop exited on a match, so the last word must hold it.  */
  619       }
  620
  621     /* L now contains 0xff in bytes for which we matched one of the
  622        relevant characters.  We can find the byte index by finding
  623        its bit index and dividing by 8.  The count starts from the
           most significant bit.  NOTE(review): that presumes the first
           byte in memory is the most significant one -- confirm for
           the targets this path is built on.  */
  624     l = __builtin_clzl(l) >> 3;
  625     return s + l;
  626
  627 #undef N
  628   }
  629 }
 630
 631 #else
 632
  633 /* We only have one accelerated alternative.  Use a direct call so that
  634    we encourage inlining.  */
 635
 636 #define search_line_fast  search_line_acc_char
 637
 638 #endif
 639
  640 /* Returns with a logical line that contains no escaped newlines or
  641    trigraphs.  This is a time-critical inner loop.

         The line starts at PFILE->buffer->next_line.  On return the
         buffer's cur and line_base point at the start of the line, the
         cleaned line ends in a '\n' that this function stores, a
         sentinel note follows the notes recorded for trigraphs and
         escaped newlines, and next_line points just past the physical
         line ending that was consumed.

         Cleaning is done in place.  Nothing is written until the first
         escaped newline or converted trigraph is met; from then on the
         slow path copies each byte from S (read position) down to D
         (write position).  For a buffer marked from_stage3 only the end
         of the line is located.  */
  642 void
  643 _cpp_clean_line (cpp_reader *pfile)
  644 {
  645   cpp_buffer *buffer;
  646   const uchar *s;
  647   uchar c, *d, *p;
  648
  649   buffer = pfile->buffer;
  650   buffer->cur_note = buffer->notes_used = 0;
  651   buffer->cur = buffer->line_base = buffer->next_line;
  652   buffer->need_line = false;
  653   s = buffer->next_line;
  654
  655   if (!buffer->from_stage3)
  656     {
  657       const uchar *pbackslash = NULL;  /* Most recent backslash seen on the fast path.  */
  658
  659       /* Fast path.  This is the common case of an un-escaped line with
  660          no trigraphs.  The primary win here is by not writing any
  661          data back to memory until we have to.  */
  662       while (1)
  663         {
  664           /* Perform an optimized search for \n, \r, \\, ?.  */
  665           s = search_line_fast (s, buffer->rlimit);
  666
  667           c = *s;
  668           if (c == '\\')
  669             {
  670               /* Record the location of the backslash and continue.  */
  671               pbackslash = s++;
  672             }
  673           else if (__builtin_expect (c == '?', 0))
  674             {
  675               if (__builtin_expect (s[1] == '?', false)
  676                    && _cpp_trigraph_map[s[2]])
  677                 {
  678                   /* Have a trigraph.  We may or may not have to convert
  679                      it.  Add a line note regardless, for -Wtrigraphs.  */
  680                   add_line_note (buffer, s, s[2]);
  681                   if (CPP_OPTION (pfile, trigraphs))
  682                     {
  683                       /* We do, and that means we have to switch to the
  684                          slow path.  The replacement is stored over the
                             first '?', and S is left on the last character
                             of the trigraph so that the slow path resumes
                             with the byte after it.  */
  685                       d = (uchar *) s;
  686                       *d = _cpp_trigraph_map[s[2]];
  687                       s += 2;
  688                       goto slow_path;
  689                     }
  690                 }
  691               /* Not a trigraph.  Continue on fast-path.  */
  692               s++;
  693             }
  694           else
  695             break;
  696         }
  697
  698       /* This must be \r or \n.  We're either done, or we'll be forced
  699          to write back to the buffer and continue on the slow path.  */
  700       d = (uchar *) s;
  701
  702       if (__builtin_expect (s == buffer->rlimit, false))
  703         goto done;
  704
  705       /* DOS line ending? */
  706       if (__builtin_expect (c == '\r', false) && s[1] == '\n')
  707         {
  708           s++;
  709           if (s == buffer->rlimit)
  710             goto done;
  711         }
  712
  713       if (__builtin_expect (pbackslash == NULL, true))
  714         goto done;
  715
  716       /* Check for escaped newline: step back over horizontal space
             and see whether the last backslash sits right before it.  */
  717       p = d;
  718       while (is_nvspace (p[-1]))
  719         p--;
  720       if (p - 1 != pbackslash)
  721         goto done;
  722
  723       /* Have an escaped newline; process it and proceed to
  724          the slow path.  The note type is ' ' when space separated
             the backslash from the newline, '\\' otherwise.  D is set
             so that the next store lands on the backslash, and
             next_line is reused as the lower bound for later backward
             scans.  */
  725       add_line_note (buffer, p - 1, p != d ? ' ' : '\\');
  726       d = p - 2;
  727       buffer->next_line = p - 1;
  728
  729     slow_path:
  730       while (1)
  731         {
  732           c = *++s;
  733           *++d = c;
  734
  735           if (c == '\n' || c == '\r')
  736             {
  737               /* Handle DOS line endings.  */
  738               if (c == '\r' && s != buffer->rlimit && s[1] == '\n')
  739                 s++;
  740               if (s == buffer->rlimit)
  741                 break;
  742
  743               /* Escaped?  */
  744               p = d;
  745               while (p != buffer->next_line && is_nvspace (p[-1]))
  746                 p--;
  747               if (p == buffer->next_line || p[-1] != '\\')
  748                 break;
  749
  750               add_line_note (buffer, p - 1, p != d ? ' ': '\\');
  751               d = p - 2;
  752               buffer->next_line = p - 1;
  753             }
  754           else if (c == '?' && s[1] == '?' && _cpp_trigraph_map[s[2]])
  755             {
  756               /* Add a note regardless, for the benefit of -Wtrigraphs.  */
  757               add_line_note (buffer, d, s[2]);
  758               if (CPP_OPTION (pfile, trigraphs))
  759                 {
  760                   *d = _cpp_trigraph_map[s[2]];
  761                   s += 2;
  762                 }
  763             }
  764         }
  765     }
  766   else
  767     {
            /* No trigraph or escaped-newline processing here: just
               find the end of the line.  */
  768       while (*s != '\n' && *s != '\r')
  769         s++;
  770       d = (uchar *) s;
  771
  772       /* Handle DOS line endings.  */
  773       if (*s == '\r' && s != buffer->rlimit && s[1] == '\n')
  774         s++;
  775     }
  776
  777  done:
  778   *d = '\n';
  779   /* A sentinel note that should never be processed.  */
  780   add_line_note (buffer, d + 1, '\n');
  781   buffer->next_line = s + 1;
  782 }
 783
 784 /* Return true if the trigraph indicated by NOTE should be warned
 785    about in a comment.  */
 786 static bool
 787 warn_in_comment (cpp_reader *pfile, _cpp_line_note *note)
 788 {
 789   const uchar *p;
 790
 791   /* Within comments we don't warn about trigraphs, unless the
 792      trigraph forms an escaped newline, as that may change
 793      behavior.  */
 794   if (note->type != '/')
 795     return false;
 796
 797   /* If -trigraphs, then this was an escaped newline iff the next note
 798      is coincident.  */
 799   if (CPP_OPTION (pfile, trigraphs))
 800     return note[1].pos == note->pos;
 801
 802   /* Otherwise, see if this forms an escaped newline.  */
 803   p = note->pos + 3;
 804   while (is_nvspace (*p))
 805     p++;
 806
 807   /* There might have been escaped newlines between the trigraph and the
 808      newline we found.  Hence the position test.  */
 809   return (*p == '\n' && p < note[1].pos);
 810 }
 811
 812 /* Process the notes created by add_line_note as far as the current
 813    location.  */
 814 void
 815 _cpp_process_line_notes (cpp_reader *pfile, int in_comment)
 816 {
 817   cpp_buffer *buffer = pfile->buffer;
 818
 819   for (;;)
 820     {
 821       _cpp_line_note *note = &buffer->notes[buffer->cur_note];
 822       unsigned int col;
 823
 824       if (note->pos > buffer->cur)
 825         break;
 826
 827       buffer->cur_note++;
 828       col = CPP_BUF_COLUMN (buffer, note->pos + 1);
 829
 830       if (note->type == '\\' || note->type == ' ')
 831         {
 832           if (note->type == ' ' && !in_comment)
 833             cpp_error_with_line (pfile, CPP_DL_WARNING, pfile->line_table->highest_line, col,
 834                                  "backslash and newline separated by space");
 835
 836           if (buffer->next_line > buffer->rlimit)
 837             {
 838               cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line, col,
 839                                    "backslash-newline at end of file");
 840               /* Prevent "no newline at end of file" warning.  */
 841               buffer->next_line = buffer->rlimit;
 842             }
 843
 844           buffer->line_base = note->pos;
 845           CPP_INCREMENT_LINE (pfile, 0);
 846         }
 847       else if (_cpp_trigraph_map[note->type])
 848         {
 849           if (CPP_OPTION (pfile, warn_trigraphs)
 850               && (!in_comment || warn_in_comment (pfile, note)))
 851             {
 852               if (CPP_OPTION (pfile, trigraphs))
 853                 cpp_warning_with_line (pfile, CPP_W_TRIGRAPHS,
 854                                        pfile->line_table->highest_line, col,
 855                                        "trigraph ??%c converted to %c",
 856                                        note->type,
 857                                        (int) _cpp_trigraph_map[note->type]);
 858               else
 859                 {
 860                   cpp_warning_with_line
 861                     (pfile, CPP_W_TRIGRAPHS,
 862                      pfile->line_table->highest_line, col,
 863                      "trigraph ??%c ignored, use -trigraphs to enable",
 864                      note->type);
 865                 }
 866             }
 867         }
 868       else if (note->type == 0)
 869         /* Already processed in lex_raw_string.  */;
 870       else
 871         abort ();
 872     }
 873 }
 874
 875 /* Skip a C-style block comment.  We find the end of the comment by
 876    seeing if an asterisk is before every '/' we encounter.  Returns
 877    nonzero if comment terminated by EOF, zero otherwise.
 878
 879    Buffer->cur points to the initial asterisk of the comment.  */
 880 bool
 881 _cpp_skip_block_comment (cpp_reader *pfile)
 882 {
 883   cpp_buffer *buffer = pfile->buffer;
 884   const uchar *cur = buffer->cur;
 885   uchar c;
 886
 887   cur++;
 888   if (*cur == '/')
 889     cur++;
 890
 891   for (;;)
 892     {
 893       /* People like decorating comments with '*', so check for '/'
 894          instead for efficiency.  */
 895       c = *cur++;
 896
 897       if (c == '/')
 898         {
 899           if (cur[-2] == '*')
 900             break;
 901
 902           /* Warn about potential nested comments, but not if the '/'
 903              comes immediately before the true comment delimiter.
 904              Don't bother to get it right across escaped newlines.  */
 905           if (CPP_OPTION (pfile, warn_comments)
 906               && cur[0] == '*' && cur[1] != '/')
 907             {
 908               buffer->cur = cur;
 909               cpp_warning_with_line (pfile, CPP_W_COMMENTS,
 910                                      pfile->line_table->highest_line,
 911                                      CPP_BUF_COL (buffer),
 912                                      "\"/*\" within comment");
 913             }
 914         }
 915       else if (c == '\n')
 916         {
 917           unsigned int cols;
 918           buffer->cur = cur - 1;
 919           _cpp_process_line_notes (pfile, true);
 920           if (buffer->next_line >= buffer->rlimit)
 921             return true;
 922           _cpp_clean_line (pfile);
 923
 924           cols = buffer->next_line - buffer->line_base;
 925           CPP_INCREMENT_LINE (pfile, cols);
 926
 927           cur = buffer->cur;
 928         }
 929     }
 930
 931   buffer->cur = cur;
 932   _cpp_process_line_notes (pfile, true);
 933   return false;
 934 }
 935
 936 /* Skip a C++ line comment, leaving buffer->cur pointing to the
 937    terminating newline.  Handles escaped newlines.  Returns nonzero
 938    if a multiline comment.  */
 939 static int
 940 skip_line_comment (cpp_reader *pfile)
 941 {
 942   cpp_buffer *buffer = pfile->buffer;
 943   source_location orig_line = pfile->line_table->highest_line;
 944
 945   while (*buffer->cur != '\n')
 946     buffer->cur++;
 947
 948   _cpp_process_line_notes (pfile, true);
 949   return orig_line != pfile->line_table->highest_line;
 950 }
 951
 952 /* Skips whitespace, saving the next non-whitespace character.  */
 953 static void
 954 skip_whitespace (cpp_reader *pfile, cppchar_t c)
 955 {
 956   cpp_buffer *buffer = pfile->buffer;
 957   bool saw_NUL = false;
 958
 959   do
 960     {
 961       /* Horizontal space always OK.  */
 962       if (c == ' ' || c == '\t')
 963         ;
 964       /* Just \f \v or \0 left.  */
 965       else if (c == '\0')
 966         saw_NUL = true;
 967       else if (pfile->state.in_directive && CPP_PEDANTIC (pfile))
 968         cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line,
 969                              CPP_BUF_COL (buffer),
 970                              "%s in preprocessing directive",
 971                              c == '\f' ? "form feed" : "vertical tab");
 972
 973       c = *buffer->cur++;
 974     }
 975   /* We only want non-vertical space, i.e. ' ' \t \f \v \0.  */
 976   while (is_nvspace (c));
 977
 978   if (saw_NUL)
 979     cpp_error (pfile, CPP_DL_WARNING, "null character(s) ignored");
 980
 981   buffer->cur--;
 982 }
 983
 984 /* See if the characters of a number token are valid in a name (no
 985    '.', '+' or '-').  */
 986 static int
 987 name_p (cpp_reader *pfile, const cpp_string *string)
 988 {
 989   unsigned int i;
 990
 991   for (i = 0; i < string->len; i++)
 992     if (!is_idchar (string->text[i]))
 993       return 0;
 994
 995   return 1;
 996 }
 997
 998 /* After parsing an identifier or other sequence, produce a warning about
 999    sequences not in NFC/NFKC.  */
1000 static void
1001 warn_about_normalization (cpp_reader *pfile,
1002                           const cpp_token *token,
1003                           const struct normalize_state *s)
1004 {
1005   if (CPP_OPTION (pfile, warn_normalize) < NORMALIZE_STATE_RESULT (s)
1006       && !pfile->state.skipping)
1007     {
1008       /* Make sure that the token is printed using UCNs, even
1009          if we'd otherwise happily print UTF-8.  */
1010       unsigned char *buf = XNEWVEC (unsigned char, cpp_token_len (token));
1011       size_t sz;
1012
1013       sz = cpp_spell_token (pfile, token, buf, false) - buf;
1014       if (NORMALIZE_STATE_RESULT (s) == normalized_C)
1015         cpp_warning_with_line (pfile, CPP_W_NORMALIZE, token->src_loc, 0,
1016                                "`%.*s' is not in NFKC", (int) sz, buf);
1017       else
1018         cpp_warning_with_line (pfile, CPP_W_NORMALIZE, token->src_loc, 0,
1019                                "`%.*s' is not in NFC", (int) sz, buf);
1020     }
1021 }
1022
1023 /* Returns TRUE if the sequence starting at buffer->cur is invalid in
1024    an identifier.  FIRST is TRUE if this starts an identifier.  */
1025 static bool
1026 forms_identifier_p (cpp_reader *pfile, int first,
1027                     struct normalize_state *state)
1028 {
1029   cpp_buffer *buffer = pfile->buffer;
1030
1031   if (*buffer->cur == '$')
1032     {
1033       if (!CPP_OPTION (pfile, dollars_in_ident))
1034         return false;
1035
1036       buffer->cur++;
1037       if (CPP_OPTION (pfile, warn_dollars) && !pfile->state.skipping)
1038         {
1039           CPP_OPTION (pfile, warn_dollars) = 0;
1040           cpp_error (pfile, CPP_DL_PEDWARN, "'$' in identifier or number");
1041         }
1042
1043       return true;
1044     }
1045
1046   /* Is this a syntactically valid UCN?  */
1047   if (CPP_OPTION (pfile, extended_identifiers)
1048       && *buffer->cur == '\\'
1049       && (buffer->cur[1] == 'u' || buffer->cur[1] == 'U'))
1050     {
1051       buffer->cur += 2;
1052       if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
1053                           state))
1054         return true;
1055       buffer->cur -= 2;
1056     }
1057
1058   return false;
1059 }
1060
1061 /* Helper function to get the cpp_hashnode of the identifier BASE.  */
1062 static cpp_hashnode *
1063 lex_identifier_intern (cpp_reader *pfile, const uchar *base)
1064 {
1065   cpp_hashnode *result;
1066   const uchar *cur;
1067   unsigned int len;
1068   unsigned int hash = HT_HASHSTEP (0, *base);
1069
1070   cur = base + 1;
1071   while (ISIDNUM (*cur))
1072     {
1073       hash = HT_HASHSTEP (hash, *cur);
1074       cur++;
1075     }
1076   len = cur - base;
1077   hash = HT_HASHFINISH (hash, len);
1078   result = cpp_lookup_with_hash (pfile, base, len, hash);
1079
1080   /* Rarely, identifiers require diagnostics when lexed.  */
1081   if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
1082                         && !pfile->state.skipping, 0))
1083     {
1084       /* It is allowed to poison the same identifier twice.  */
1085       if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
1086         cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
1087                    NODE_NAME (result));
1088
1089       /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
1090          replacement list of a variadic macro.  */
1091       if (result == pfile->spec_nodes.n__VA_ARGS__
1092           && !pfile->state.va_args_ok)
1093         cpp_error (pfile, CPP_DL_PEDWARN,
1094                    "__VA_ARGS__ can only appear in the expansion"
1095                    " of a C99 variadic macro");
1096
1097       /* For -Wc++-compat, warn about use of C++ named operators.  */
1098       if (result->flags & NODE_WARN_OPERATOR)
1099         cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
1100                      "identifier \"%s\" is a special operator name in C++",
1101                      NODE_NAME (result));
1102     }
1103
1104   return result;
1105 }
1106
1107 /* Get the cpp_hashnode of an identifier specified by NAME in
1108    the current cpp_reader object.  If none is found, NULL is returned.  */
1109 cpp_hashnode *
1110 _cpp_lex_identifier (cpp_reader *pfile, const char *name)
1111 {
1112   cpp_hashnode *result;
1113   result = lex_identifier_intern (pfile, (uchar *) name);
1114   return result;
1115 }
1116
1117 /* Lex an identifier starting at BUFFER->CUR - 1.  */
1118 static cpp_hashnode *
1119 lex_identifier (cpp_reader *pfile, const uchar *base, bool starts_ucn,
1120                 struct normalize_state *nst)
1121 {
1122   cpp_hashnode *result;
1123   const uchar *cur;
1124   unsigned int len;
1125   unsigned int hash = HT_HASHSTEP (0, *base);
1126
1127   cur = pfile->buffer->cur;
1128   if (! starts_ucn)
1129     while (ISIDNUM (*cur))
1130       {
1131         hash = HT_HASHSTEP (hash, *cur);
1132         cur++;
1133       }
1134   pfile->buffer->cur = cur;
1135   if (starts_ucn || forms_identifier_p (pfile, false, nst))
1136     {
1137       /* Slower version for identifiers containing UCNs (or $).  */
1138       do {
1139         while (ISIDNUM (*pfile->buffer->cur))
1140           {
1141             pfile->buffer->cur++;
1142             NORMALIZE_STATE_UPDATE_IDNUM (nst);
1143           }
1144       } while (forms_identifier_p (pfile, false, nst));
1145       result = _cpp_interpret_identifier (pfile, base,
1146                                           pfile->buffer->cur - base);
1147     }
1148   else
1149     {
1150       len = cur - base;
1151       hash = HT_HASHFINISH (hash, len);
1152
1153       result = cpp_lookup_with_hash (pfile, base, len, hash);
1154     }
1155
1156   /* Rarely, identifiers require diagnostics when lexed.  */
1157   if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
1158                         && !pfile->state.skipping, 0))
1159     {
1160       /* It is allowed to poison the same identifier twice.  */
1161       if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
1162         cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
1163                    NODE_NAME (result));
1164
1165       /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
1166          replacement list of a variadic macro.  */
1167       if (result == pfile->spec_nodes.n__VA_ARGS__
1168           && !pfile->state.va_args_ok)
1169         cpp_error (pfile, CPP_DL_PEDWARN,
1170                    "__VA_ARGS__ can only appear in the expansion"
1171                    " of a C99 variadic macro");
1172
1173       /* For -Wc++-compat, warn about use of C++ named operators.  */
1174       if (result->flags & NODE_WARN_OPERATOR)
1175         cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
1176                      "identifier \"%s\" is a special operator name in C++",
1177                      NODE_NAME (result));
1178     }
1179
1180   return result;
1181 }
1182
1183 /* Lex a number to NUMBER starting at BUFFER->CUR - 1.  */
1184 static void
1185 lex_number (cpp_reader *pfile, cpp_string *number,
1186             struct normalize_state *nst)
1187 {
1188   const uchar *cur;
1189   const uchar *base;
1190   uchar *dest;
1191
1192   base = pfile->buffer->cur - 1;
1193   do
1194     {
1195       cur = pfile->buffer->cur;
1196
1197       /* N.B. ISIDNUM does not include $.  */
1198       while (ISIDNUM (*cur) || *cur == '.' || VALID_SIGN (*cur, cur[-1]))
1199         {
1200           cur++;
1201           NORMALIZE_STATE_UPDATE_IDNUM (nst);
1202         }
1203
1204       pfile->buffer->cur = cur;
1205     }
1206   while (forms_identifier_p (pfile, false, nst));
1207
1208   number->len = cur - base;
1209   dest = _cpp_unaligned_alloc (pfile, number->len + 1);
1210   memcpy (dest, base, number->len);
1211   dest[number->len] = '\0';
1212   number->text = dest;
1213 }
1214
1215 /* Create a token of type TYPE with a literal spelling.  */
1216 static void
1217 create_literal (cpp_reader *pfile, cpp_token *token, const uchar *base,
1218                 unsigned int len, enum cpp_ttype type)
1219 {
1220   uchar *dest = _cpp_unaligned_alloc (pfile, len + 1);
1221
1222   memcpy (dest, base, len);
1223   dest[len] = '\0';
1224   token->type = type;
1225   token->val.str.len = len;
1226   token->val.str.text = dest;
1227 }
1228
1229 /* Subroutine of lex_raw_string: Append LEN chars from BASE to the buffer
1230    sequence from *FIRST_BUFF_P to LAST_BUFF_P.  */
1231
1232 static void
1233 bufring_append (cpp_reader *pfile, const uchar *base, size_t len,
1234                 _cpp_buff **first_buff_p, _cpp_buff **last_buff_p)
1235 {
1236   _cpp_buff *first_buff = *first_buff_p;
1237   _cpp_buff *last_buff = *last_buff_p;
1238
1239   if (first_buff == NULL)
1240     first_buff = last_buff = _cpp_get_buff (pfile, len);
1241   else if (len > BUFF_ROOM (last_buff))
1242     {
1243       size_t room = BUFF_ROOM (last_buff);
1244       memcpy (BUFF_FRONT (last_buff), base, room);
1245       BUFF_FRONT (last_buff) += room;
1246       base += room;
1247       len -= room;
1248       last_buff = _cpp_append_extend_buff (pfile, last_buff, len);
1249     }
1250
1251   memcpy (BUFF_FRONT (last_buff), base, len);
1252   BUFF_FRONT (last_buff) += len;
1253
1254   *first_buff_p = first_buff;
1255   *last_buff_p = last_buff;
1256 }
1257
1258 /* Lexes a raw string.  The stored string contains the spelling, including
1259    double quotes, delimiter string, '(' and ')', any leading
1260    'L', 'u', 'U' or 'u8' and 'R' modifier.  It returns the type of the
1261    literal, or CPP_OTHER if it was not properly terminated.
1262
1263    The spelling is NUL-terminated, but it is not guaranteed that this
1264    is the first NUL since embedded NULs are preserved.  */
1265
1266 static void
1267 lex_raw_string (cpp_reader *pfile, cpp_token *token, const uchar *base,
1268                 const uchar *cur)
1269 {
1270   source_location saw_NUL = 0;
1271   const uchar *raw_prefix;
1272   unsigned int raw_prefix_len = 0;
1273   enum cpp_ttype type;
1274   size_t total_len = 0;
1275   _cpp_buff *first_buff = NULL, *last_buff = NULL;
1276   _cpp_line_note *note = &pfile->buffer->notes[pfile->buffer->cur_note];
1277
1278   type = (*base == 'L' ? CPP_WSTRING :
1279           *base == 'U' ? CPP_STRING32 :
1280           *base == 'u' ? (base[1] == '8' ? CPP_UTF8STRING : CPP_STRING16)
1281           : CPP_STRING);
1282
1283   raw_prefix = cur + 1;
1284   while (raw_prefix_len < 16)
1285     {
1286       switch (raw_prefix[raw_prefix_len])
1287         {
1288         case ' ': case '(': case ')': case '\\': case '\t':
1289         case '\v': case '\f': case '\n': default:
1290           break;
1291         /* Basic source charset except the above chars.  */
1292         case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
1293         case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
1294         case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
1295         case 's': case 't': case 'u': case 'v': case 'w': case 'x':
1296         case 'y': case 'z':
1297         case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
1298         case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
1299         case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
1300         case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
1301         case 'Y': case 'Z':
1302         case '0': case '1': case '2': case '3': case '4': case '5':
1303         case '6': case '7': case '8': case '9':
1304         case '_': case '{': case '}': case '#': case '[': case ']':
1305         case '<': case '>': case '%': case ':': case ';': case '.':
1306         case '?': case '*': case '+': case '-': case '/': case '^':
1307         case '&': case '|': case '~': case '!': case '=': case ',':
1308         case '"': case '\'':
1309           raw_prefix_len++;
1310           continue;
1311         }
1312       break;
1313     }
1314
1315   if (raw_prefix[raw_prefix_len] != '(')
1316     {
1317       int col = CPP_BUF_COLUMN (pfile->buffer, raw_prefix + raw_prefix_len)
1318                 + 1;
1319       if (raw_prefix_len == 16)
1320         cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, col,
1321                              "raw string delimiter longer than 16 characters");
1322       else
1323         cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, col,
1324                              "invalid character '%c' in raw string delimiter",
1325                              (int) raw_prefix[raw_prefix_len]);
1326       pfile->buffer->cur = raw_prefix - 1;
1327       create_literal (pfile, token, base, raw_prefix - 1 - base, CPP_OTHER);
1328       return;
1329     }
1330
1331   cur = raw_prefix + raw_prefix_len + 1;
1332   for (;;)
1333     {
1334 #define BUF_APPEND(STR,LEN)                                     \
1335       do {                                                      \
1336         bufring_append (pfile, (const uchar *)(STR), (LEN),     \
1337                         &first_buff, &last_buff);               \
1338         total_len += (LEN);                                     \
1339       } while (0);
1340
1341       cppchar_t c;
1342
1343       /* If we previously performed any trigraph or line splicing
1344          transformations, undo them within the body of the raw string.  */
1345       while (note->pos < cur)
1346         ++note;
1347       for (; note->pos == cur; ++note)
1348         {
1349           switch (note->type)
1350             {
1351             case '\\':
1352             case ' ':
1353               /* Restore backslash followed by newline.  */
1354               BUF_APPEND (base, cur - base);
1355               base = cur;
1356               BUF_APPEND ("\\", 1);
1357             after_backslash:
1358               if (note->type == ' ')
1359                 {
1360                   /* GNU backslash whitespace newline extension.  FIXME
1361                      could be any sequence of non-vertical space.  When we
1362                      can properly restore any such sequence, we should mark
1363                      this note as handled so _cpp_process_line_notes
1364                      doesn't warn.  */
1365                   BUF_APPEND (" ", 1);
1366                 }
1367
1368               BUF_APPEND ("\n", 1);
1369               break;
1370
1371             case 0:
1372               /* Already handled.  */
1373               break;
1374
1375             default:
1376               if (_cpp_trigraph_map[note->type])
1377                 {
1378                   /* Don't warn about this trigraph in
1379                      _cpp_process_line_notes, since trigraphs show up as
1380                      trigraphs in raw strings.  */
1381                   uchar type = note->type;
1382                   note->type = 0;
1383
1384                   if (!CPP_OPTION (pfile, trigraphs))
1385                     /* If we didn't convert the trigraph in the first
1386                        place, don't do anything now either.  */
1387                     break;
1388
1389                   BUF_APPEND (base, cur - base);
1390                   base = cur;
1391                   BUF_APPEND ("??", 2);
1392
1393                   /* ??/ followed by newline gets two line notes, one for
1394                      the trigraph and one for the backslash/newline.  */
1395                   if (type == '/' && note[1].pos == cur)
1396                     {
1397                       if (note[1].type != '\\'
1398                           && note[1].type != ' ')
1399                         abort ();
1400                       BUF_APPEND ("/", 1);
1401                       ++note;
1402                       goto after_backslash;
1403                     }
1404                   /* The ) from ??) could be part of the suffix.  */
1405                   else if (type == ')'
1406                            && strncmp ((const char *) cur+1,
1407                                        (const char *) raw_prefix,
1408                                        raw_prefix_len) == 0
1409                            && cur[raw_prefix_len+1] == '"')
1410                     {
1411                       cur += raw_prefix_len+2;
1412                       goto break_outer_loop;
1413                     }
1414                   else
1415                     {
1416                       /* Skip the replacement character.  */
1417                       base = ++cur;
1418                       BUF_APPEND (&type, 1);
1419                     }
1420                 }
1421               else
1422                 abort ();
1423               break;
1424             }
1425         }
1426       c = *cur++;
1427
1428       if (c == ')'
1429           && strncmp ((const char *) cur, (const char *) raw_prefix,
1430                       raw_prefix_len) == 0
1431           && cur[raw_prefix_len] == '"')
1432         {
1433           cur += raw_prefix_len + 1;
1434           break;
1435         }
1436       else if (c == '\n')
1437         {
1438           if (pfile->state.in_directive
1439               || pfile->state.parsing_args
1440               || pfile->state.in_deferred_pragma)
1441             {
1442               cur--;
1443               type = CPP_OTHER;
1444               cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, 0,
1445                                    "unterminated raw string");
1446               break;
1447             }
1448
1449           BUF_APPEND (base, cur - base);
1450
1451           if (pfile->buffer->cur < pfile->buffer->rlimit)
1452             CPP_INCREMENT_LINE (pfile, 0);
1453           pfile->buffer->need_line = true;
1454
1455           pfile->buffer->cur = cur-1;
1456           _cpp_process_line_notes (pfile, false);
1457           if (!_cpp_get_fresh_line (pfile))
1458             {
1459               source_location src_loc = token->src_loc;
1460               token->type = CPP_EOF;
1461               /* Tell the compiler the line number of the EOF token.  */
1462               token->src_loc = pfile->line_table->highest_line;
1463               token->flags = BOL;
1464               if (first_buff != NULL)
1465                 _cpp_release_buff (pfile, first_buff);
1466               cpp_error_with_line (pfile, CPP_DL_ERROR, src_loc, 0,
1467                                    "unterminated raw string");
1468               return;
1469             }
1470
1471           cur = base = pfile->buffer->cur;
1472           note = &pfile->buffer->notes[pfile->buffer->cur_note];
1473         }
1474       else if (c == '\0' && !saw_NUL)
1475         LINEMAP_POSITION_FOR_COLUMN (saw_NUL, pfile->line_table,
1476                                      CPP_BUF_COLUMN (pfile->buffer, cur));
1477     }
1478  break_outer_loop:
1479
1480   if (saw_NUL && !pfile->state.skipping)
1481     cpp_error_with_line (pfile, CPP_DL_WARNING, saw_NUL, 0,
1482                "null character(s) preserved in literal");
1483
1484   pfile->buffer->cur = cur;
1485   if (first_buff == NULL)
1486     create_literal (pfile, token, base, cur - base, type);
1487   else
1488     {
1489       uchar *dest = _cpp_unaligned_alloc (pfile, total_len + (cur - base) + 1);
1490
1491       token->type = type;
1492       token->val.str.len = total_len + (cur - base);
1493       token->val.str.text = dest;
1494       last_buff = first_buff;
1495       while (last_buff != NULL)
1496         {
1497           memcpy (dest, last_buff->base,
1498                   BUFF_FRONT (last_buff) - last_buff->base);
1499           dest += BUFF_FRONT (last_buff) - last_buff->base;
1500           last_buff = last_buff->next;
1501         }
1502       _cpp_release_buff (pfile, first_buff);
1503       memcpy (dest, base, cur - base);
1504       dest[cur - base] = '\0';
1505     }
1506 }
1507
1508 /* Lexes a string, character constant, or angle-bracketed header file
1509    name.  The stored string contains the spelling, including opening
1510    quote and any leading 'L', 'u', 'U' or 'u8' and optional
1511    'R' modifier.  It returns the type of the literal, or CPP_OTHER
1512    if it was not properly terminated, or CPP_LESS for an unterminated
1513    header name which must be relexed as normal tokens.
1514
1515    The spelling is NUL-terminated, but it is not guaranteed that this
1516    is the first NUL since embedded NULs are preserved.  */
1517 static void
1518 lex_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
1519 {
1520   bool saw_NUL = false;
1521   const uchar *cur;
1522   cppchar_t terminator;
1523   enum cpp_ttype type;
1524
1525   cur = base;
1526   terminator = *cur++;
1527   if (terminator == 'L' || terminator == 'U')
1528     terminator = *cur++;
1529   else if (terminator == 'u')
1530     {
1531       terminator = *cur++;
1532       if (terminator == '8')
1533         terminator = *cur++;
1534     }
1535   if (terminator == 'R')
1536     {
1537       lex_raw_string (pfile, token, base, cur);
1538       return;
1539     }
1540   if (terminator == '"')
1541     type = (*base == 'L' ? CPP_WSTRING :
1542             *base == 'U' ? CPP_STRING32 :
1543             *base == 'u' ? (base[1] == '8' ? CPP_UTF8STRING : CPP_STRING16)
1544                          : CPP_STRING);
1545   else if (terminator == '\'')
1546     type = (*base == 'L' ? CPP_WCHAR :
1547             *base == 'U' ? CPP_CHAR32 :
1548             *base == 'u' ? CPP_CHAR16 : CPP_CHAR);
1549   else
1550     terminator = '>', type = CPP_HEADER_NAME;
1551
1552   for (;;)
1553     {
1554       cppchar_t c = *cur++;
1555
1556       /* In #include-style directives, terminators are not escapable.  */
1557       if (c == '\\' && !pfile->state.angled_headers && *cur != '\n')
1558         cur++;
1559       else if (c == terminator)
1560         break;
1561       else if (c == '\n')
1562         {
1563           cur--;
1564           /* Unmatched quotes always yield undefined behavior, but
1565              greedy lexing means that what appears to be an unterminated
1566              header name may actually be a legitimate sequence of tokens.  */
1567           if (terminator == '>')
1568             {
1569               token->type = CPP_LESS;
1570               return;
1571             }
1572           type = CPP_OTHER;
1573           break;
1574         }
1575       else if (c == '\0')
1576         saw_NUL = true;
1577     }
1578
1579   if (saw_NUL && !pfile->state.skipping)
1580     cpp_error (pfile, CPP_DL_WARNING,
1581                "null character(s) preserved in literal");
1582
1583   if (type == CPP_OTHER && CPP_OPTION (pfile, lang) != CLK_ASM)
1584     cpp_error (pfile, CPP_DL_PEDWARN, "missing terminating %c character",
1585                (int) terminator);
1586
1587   pfile->buffer->cur = cur;
1588   create_literal (pfile, token, base, cur - base, type);
1589 }
1590
1591 /* Return the comment table. The client may not make any assumption
1592    about the ordering of the table.  */
1593 cpp_comment_table *
1594 cpp_get_comments (cpp_reader *pfile)
1595 {
1596   return &pfile->comments;
1597 }
1598
1599 /* Append a comment to the end of the comment table. */
1600 static void
1601 store_comment (cpp_reader *pfile, cpp_token *token)
1602 {
1603   int len;
1604
1605   if (pfile->comments.allocated == 0)
1606     {
1607       pfile->comments.allocated = 256;
1608       pfile->comments.entries = (cpp_comment *) xmalloc
1609         (pfile->comments.allocated * sizeof (cpp_comment));
1610     }
1611
1612   if (pfile->comments.count == pfile->comments.allocated)
1613     {
1614       pfile->comments.allocated *= 2;
1615       pfile->comments.entries = (cpp_comment *) xrealloc
1616         (pfile->comments.entries,
1617          pfile->comments.allocated * sizeof (cpp_comment));
1618     }
1619
1620   len = token->val.str.len;
1621
1622   /* Copy comment. Note, token may not be NULL terminated. */
1623   pfile->comments.entries[pfile->comments.count].comment =
1624     (char *) xmalloc (sizeof (char) * (len + 1));
1625   memcpy (pfile->comments.entries[pfile->comments.count].comment,
1626           token->val.str.text, len);
1627   pfile->comments.entries[pfile->comments.count].comment[len] = '\0';
1628
1629   /* Set source location. */
1630   pfile->comments.entries[pfile->comments.count].sloc = token->src_loc;
1631
1632   /* Increment the count of entries in the comment table. */
1633   pfile->comments.count++;
1634 }
1635
1636 /* The stored comment includes the comment start and any terminator.  */
1637 static void
1638 save_comment (cpp_reader *pfile, cpp_token *token, const unsigned char *from,
1639               cppchar_t type)
1640 {
1641   unsigned char *buffer;
1642   unsigned int len, clen, i;
1643
1644   len = pfile->buffer->cur - from + 1; /* + 1 for the initial '/'.  */
1645
1646   /* C++ comments probably (not definitely) have moved past a new
1647      line, which we don't want to save in the comment.  */
1648   if (is_vspace (pfile->buffer->cur[-1]))
1649     len--;
1650
1651   /* If we are currently in a directive or in argument parsing, then
1652      we need to store all C++ comments as C comments internally, and
1653      so we need to allocate a little extra space in that case.
1654
1655      Note that the only time we encounter a directive here is
1656      when we are saving comments in a "#define".  */
1657   clen = ((pfile->state.in_directive || pfile->state.parsing_args)
1658           && type == '/') ? len + 2 : len;
1659
1660   buffer = _cpp_unaligned_alloc (pfile, clen);
1661
1662   token->type = CPP_COMMENT;
1663   token->val.str.len = clen;
1664   token->val.str.text = buffer;
1665
1666   buffer[0] = '/';
1667   memcpy (buffer + 1, from, len - 1);
1668
1669   /* Finish conversion to a C comment, if necessary.  */
1670   if ((pfile->state.in_directive || pfile->state.parsing_args) && type == '/')
1671     {
1672       buffer[1] = '*';
1673       buffer[clen - 2] = '*';
1674       buffer[clen - 1] = '/';
1675       /* As there can be in a C++ comments illegal sequences for C comments
1676          we need to filter them out.  */
1677       for (i = 2; i < (clen - 2); i++)
1678         if (buffer[i] == '/' && (buffer[i - 1] == '*' || buffer[i + 1] == '*'))
1679           buffer[i] = '|';
1680     }
1681
1682   /* Finally store this comment for use by clients of libcpp. */
1683   store_comment (pfile, token);
1684 }
1685
1686 /* Allocate COUNT tokens for RUN.  */
1687 void
1688 _cpp_init_tokenrun (tokenrun *run, unsigned int count)
1689 {
1690   run->base = XNEWVEC (cpp_token, count);
1691   run->limit = run->base + count;
1692   run->next = NULL;
1693 }
1694
1695 /* Returns the next tokenrun, or creates one if there is none.  */
1696 static tokenrun *
1697 next_tokenrun (tokenrun *run)
1698 {
1699   if (run->next == NULL)
1700     {
1701       run->next = XNEW (tokenrun);
1702       run->next->prev = run;
1703       _cpp_init_tokenrun (run->next, 250);
1704     }
1705
1706   return run->next;
1707 }
1708
1709 /* Look ahead in the input stream.  */
1710 const cpp_token *
1711 cpp_peek_token (cpp_reader *pfile, int index)
1712 {
1713   cpp_context *context = pfile->context;
1714   const cpp_token *peektok;
1715   int count;
1716
1717   /* First, scan through any pending cpp_context objects.  */
1718   while (context->prev)
1719     {
1720       ptrdiff_t sz = (context->direct_p
1721                       ? LAST (context).token - FIRST (context).token
1722                       : LAST (context).ptoken - FIRST (context).ptoken);
1723
1724       if (index < (int) sz)
1725         return (context->direct_p
1726                 ? FIRST (context).token + index
1727                 : *(FIRST (context).ptoken + index));
1728
1729       index -= (int) sz;
1730       context = context->prev;
1731     }
1732
1733   /* We will have to read some new tokens after all (and do so
1734      without invalidating preceding tokens).  */
1735   count = index;
1736   pfile->keep_tokens++;
1737
1738   do
1739     {
1740       peektok = _cpp_lex_token (pfile);
1741       if (peektok->type == CPP_EOF)
1742         return peektok;
1743     }
1744   while (index--);
1745
1746   _cpp_backup_tokens_direct (pfile, count + 1);
1747   pfile->keep_tokens--;
1748
1749   return peektok;
1750 }
1751
1752 /* Allocate a single token that is invalidated at the same time as the
1753    rest of the tokens on the line.  Has its line and col set to the
1754    same as the last lexed token, so that diagnostics appear in the
1755    right place.  */
1756 cpp_token *
1757 _cpp_temp_token (cpp_reader *pfile)
1758 {
1759   cpp_token *old, *result;
1760   ptrdiff_t sz = pfile->cur_run->limit - pfile->cur_token;
1761   ptrdiff_t la = (ptrdiff_t) pfile->lookaheads;
1762
1763   old = pfile->cur_token - 1;
1764   /* Any pre-existing lookaheads must not be clobbered.  */
1765   if (la)
1766     {
1767       if (sz <= la)
1768         {
1769           tokenrun *next = next_tokenrun (pfile->cur_run);
1770
1771           if (sz < la)
1772             memmove (next->base + 1, next->base,
1773                      (la - sz) * sizeof (cpp_token));
1774
1775           next->base[0] = pfile->cur_run->limit[-1];
1776         }
1777
1778       if (sz > 1)
1779         memmove (pfile->cur_token + 1, pfile->cur_token,
1780                  MIN (la, sz - 1) * sizeof (cpp_token));
1781     }
1782
1783   if (!sz && pfile->cur_token == pfile->cur_run->limit)
1784     {
1785       pfile->cur_run = next_tokenrun (pfile->cur_run);
1786       pfile->cur_token = pfile->cur_run->base;
1787     }
1788
1789   result = pfile->cur_token++;
1790   result->src_loc = old->src_loc;
1791   return result;
1792 }
1793
1794 /* Lex a token into RESULT (external interface).  Takes care of issues
1795    like directive handling, token lookahead, multiple include
1796    optimization and skipping.  */
1797 const cpp_token *
1798 _cpp_lex_token (cpp_reader *pfile)
1799 {
1800   cpp_token *result;
1801
1802   result = NULL;
1803   for (;;)
1804     {
1805       if (pfile->cur_token == pfile->cur_run->limit)
1806         {
1807           pfile->cur_run = next_tokenrun (pfile->cur_run);
1808           pfile->cur_token = pfile->cur_run->base;
1809         }
1810       /* We assume that the current token is somewhere in the current
1811          run.  */
1812       if (pfile->cur_token < pfile->cur_run->base
1813           || pfile->cur_token >= pfile->cur_run->limit)
1814         abort ();
1815
1816       if (pfile->lookaheads)
1817         {
1818           pfile->lookaheads--;
1819           result = pfile->cur_token++;
1820         }
1821       else
1822         result = _cpp_lex_direct (pfile);
1823
1824       if (result->flags & BOL)
1825         {
1826           /* Is this a directive.  If _cpp_handle_directive returns
1827              false, it is an assembler #.  */
1828           if (result->type == CPP_HASH
1829               /* 6.10.3 p 11: Directives in a list of macro arguments
1830                  gives undefined behavior.  This implementation
1831                  handles the directive as normal.  */
1832               && pfile->state.parsing_args != 1)
1833             {
1834               if (_cpp_handle_directive (pfile, result->flags & PREV_WHITE))
1835                 {
1836                   if (pfile->directive_result.type == CPP_PADDING)
1837                     continue;
1838                   result = &pfile->directive_result;
1839                 }
1840             }
1841           else if (pfile->state.in_deferred_pragma)
1842             result = &pfile->directive_result;
1843
1844           if (pfile->cb.line_change && !pfile->state.skipping)
1845             pfile->cb.line_change (pfile, result, pfile->state.parsing_args);
1846         }
1847
1848       /* We don't skip tokens in directives.  */
1849       if (pfile->state.in_directive || pfile->state.in_deferred_pragma)
1850         break;
1851
1852       /* Outside a directive, invalidate controlling macros.  At file
1853          EOF, _cpp_lex_direct takes care of popping the buffer, so we never
1854          get here and MI optimization works.  */
1855       pfile->mi_valid = false;
1856
1857       if (!pfile->state.skipping || result->type == CPP_EOF)
1858         break;
1859     }
1860
1861   return result;
1862 }
1863
1864 /* Returns true if a fresh line has been loaded.  */
1865 bool
1866 _cpp_get_fresh_line (cpp_reader *pfile)
1867 {
1868   int return_at_eof;
1869
1870   /* We can't get a new line until we leave the current directive.  */
1871   if (pfile->state.in_directive)
1872     return false;
1873
1874   for (;;)
1875     {
1876       cpp_buffer *buffer = pfile->buffer;
1877
1878       if (!buffer->need_line)
1879         return true;
1880
1881       if (buffer->next_line < buffer->rlimit)
1882         {
1883           _cpp_clean_line (pfile);
1884           return true;
1885         }
1886
1887       /* First, get out of parsing arguments state.  */
1888       if (pfile->state.parsing_args)
1889         return false;
1890
1891       /* End of buffer.  Non-empty files should end in a newline.  */
1892       if (buffer->buf != buffer->rlimit
1893           && buffer->next_line > buffer->rlimit
1894           && !buffer->from_stage3)
1895         {
1896           /* Clip to buffer size.  */
1897           buffer->next_line = buffer->rlimit;
1898         }
1899
1900       return_at_eof = buffer->return_at_eof;
1901       _cpp_pop_buffer (pfile);
1902       if (pfile->buffer == NULL || return_at_eof)
1903         return false;
1904     }
1905 }
1906
/* Helper for two-character operators.  Sets result->type to
   ELSE_TYPE; if the next unread character (*buffer->cur) equals CHAR,
   that character is consumed and result->type becomes THEN_TYPE
   instead.  Relies on variables named `buffer' and `result' being in
   scope at the point of use.  */
#define IF_NEXT_IS(CHAR, THEN_TYPE, ELSE_TYPE)          \
  do                                                    \
    {                                                   \
      result->type = (ELSE_TYPE);                       \
      if (*buffer->cur == (CHAR))                       \
        {                                               \
          buffer->cur++;                                \
          result->type = (THEN_TYPE);                   \
        }                                               \
    }                                                   \
  while (0)
1915
1916 /* Lex a token into pfile->cur_token, which is also incremented, to
1917    get diagnostics pointing to the correct location.
1918
1919    Does not handle issues such as token lookahead, multiple-include
1920    optimization, directives, skipping etc.  This function is only
1921    suitable for use by _cpp_lex_token, and in special cases like
1922    lex_expansion_token which doesn't care for any of these issues.
1923
1924    When meeting a newline, returns CPP_EOF if parsing a directive,
1925    otherwise returns to the start of the token buffer if permissible.
1926    Returns a pointer to the lexed token.

        Three labels restart scanning for the same token:
          fresh_line          after a newline has been consumed;
          update_tokens_line  after a comment has been skipped;
          skipped_white       after horizontal whitespace has been skipped.  */
1927 cpp_token *
1928 _cpp_lex_direct (cpp_reader *pfile)
1929 {
1930   cppchar_t c;
1931   cpp_buffer *buffer;
1932   const unsigned char *comment_start;
1933   cpp_token *result = pfile->cur_token++;
1934
1935  fresh_line:
1936   result->flags = 0;
1937   buffer = pfile->buffer;
1938   if (buffer->need_line)
1939     {
1940       if (pfile->state.in_deferred_pragma)
1941         {
             /* The line of a deferred pragma has ended: report this as
                a CPP_PRAGMA_EOL token and leave deferred-pragma mode.
                NOTE(review): the decrement of prevent_expansion
                presumably undoes an increment made when the pragma
                started - confirm against the directive code.  */
1942           result->type = CPP_PRAGMA_EOL;
1943           pfile->state.in_deferred_pragma = false;
1944           if (!pfile->state.pragma_allow_expansion)
1945             pfile->state.prevent_expansion--;
1946           return result;
1947         }
1948       if (!_cpp_get_fresh_line (pfile))
1949         {
             /* No further line can be had: produce CPP_EOF.  */
1950           result->type = CPP_EOF;
1951           if (!pfile->state.in_directive)
1952             {
1953               /* Tell the compiler the line number of the EOF token.  */
1954               result->src_loc = pfile->line_table->highest_line;
1955               result->flags = BOL;
1956             }
1957           return result;
1958         }
1959       if (!pfile->keep_tokens)
1960         {
             /* Nobody asked for earlier tokens to be kept, so start the
                new line in the first slot of base_run again.  */
1961           pfile->cur_run = &pfile->base_run;
1962           result = pfile->base_run.base;
1963           pfile->cur_token = result + 1;
1964         }
1965       result->flags = BOL;
           /* NOTE(review): parsing_args == 2 presumably means macro
              arguments are being collected, where a line break counts
              as whitespace - confirm.  */
1966       if (pfile->state.parsing_args == 2)
1967         result->flags |= PREV_WHITE;
1968     }
1969   buffer = pfile->buffer;
1970  update_tokens_line:
1971   result->src_loc = pfile->line_table->highest_line;
1972
1973  skipped_white:
       /* Process the line notes whose position has been reached (not
          done while pfile->overlaid_buffer is set), then re-read the
          current line for the token's location.  */
1974   if (buffer->cur >= buffer->notes[buffer->cur_note].pos
1975       && !pfile->overlaid_buffer)
1976     {
1977       _cpp_process_line_notes (pfile, false);
1978       result->src_loc = pfile->line_table->highest_line;
1979     }
1980   c = *buffer->cur++;
1981
1982   LINEMAP_POSITION_FOR_COLUMN (result->src_loc, pfile->line_table,
1983                                CPP_BUF_COLUMN (buffer, buffer->cur));
1984
1985   switch (c)
1986     {
1987     case ' ': case '\t': case '\f': case '\v': case '\0':
           /* Whitespace only sets a flag on the token that follows.  */
1988       result->flags |= PREV_WHITE;
1989       skip_whitespace (pfile, c);
1990       goto skipped_white;
1991
1992     case '\n':
           /* The line is not incremented for a newline that ends the
              buffer.  */
1993       if (buffer->cur < buffer->rlimit)
1994         CPP_INCREMENT_LINE (pfile, 0);
1995       buffer->need_line = true;
1996       goto fresh_line;
1997
1998     case '0': case '1': case '2': case '3': case '4':
1999     case '5': case '6': case '7': case '8': case '9':
2000       {
2001         struct normalize_state nst = INITIAL_NORMALIZE_STATE;
2002         result->type = CPP_NUMBER;
2003         lex_number (pfile, &result->val.str, &nst);
2004         warn_about_normalization (pfile, result, &nst);
2005         break;
2006       }
2007
2008     case 'L':
2009     case 'u':
2010     case 'U':
2011     case 'R':
2012       /* 'L', 'u', 'U', 'u8' or 'R' may introduce wide characters,
2013          wide strings or raw strings.  */
2014       if (c == 'L' || CPP_OPTION (pfile, uliterals))
2015         {
2016           if ((*buffer->cur == '\'' && c != 'R')
2017               || *buffer->cur == '"'
2018               || (*buffer->cur == 'R'
2019                   && c != 'R'
2020                   && buffer->cur[1] == '"'
2021                   && CPP_OPTION (pfile, uliterals))
2022               || (*buffer->cur == '8'
2023                   && c == 'u'
2024                   && (buffer->cur[1] == '"'
2025                       || (buffer->cur[1] == 'R' && buffer->cur[2] == '"'))))
2026             {
2027               lex_string (pfile, result, buffer->cur - 1);
2028               break;
2029             }
2030         }
2031       /* Fall through.  */
2032
         /* 'L', 'u', 'U' and 'R' are missing from the lists below
            because they have their own case labels above and reach
            this point by falling through.  */
2033     case '_':
2034     case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
2035     case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
2036     case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
2037     case 's': case 't':           case 'v': case 'w': case 'x':
2038     case 'y': case 'z':
2039     case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
2040     case 'G': case 'H': case 'I': case 'J': case 'K':
2041     case 'M': case 'N': case 'O': case 'P': case 'Q':
2042     case 'S': case 'T':           case 'V': case 'W': case 'X':
2043     case 'Y': case 'Z':
2044       result->type = CPP_NAME;
2045       {
2046         struct normalize_state nst = INITIAL_NORMALIZE_STATE;
2047         result->val.node.node = lex_identifier (pfile, buffer->cur - 1, false,
2048                                                 &nst);
2049         warn_about_normalization (pfile, result, &nst);
2050       }
2051
2052       /* Convert named operators to their proper types.  */
2053       if (result->val.node.node->flags & NODE_OPERATOR)
2054         {
2055           result->flags |= NAMED_OP;
2056           result->type = (enum cpp_ttype) result->val.node.node->directive_index;
2057         }
2058       break;
2059
2060     case '\'':
2061     case '"':
2062       lex_string (pfile, result, buffer->cur - 1);
2063       break;
2064
2065     case '/':
2066       /* A potential block or line comment.  */
2067       comment_start = buffer->cur;
2068       c = *buffer->cur;
2069
2070       if (c == '*')
2071         {
2072           if (_cpp_skip_block_comment (pfile))
2073             cpp_error (pfile, CPP_DL_ERROR, "unterminated comment");
2074         }
2075       else if (c == '/' && (CPP_OPTION (pfile, cplusplus_comments)
2076                             || cpp_in_system_header (pfile)))
2077         {
2078           /* Warn about comments only if pedantically GNUC89, and not
2079              in system headers.  */
2080           if (CPP_OPTION (pfile, lang) == CLK_GNUC89 && CPP_PEDANTIC (pfile)
2081               && ! buffer->warned_cplusplus_comments)
2082             {
2083               cpp_error (pfile, CPP_DL_PEDWARN,
2084                          "C++ style comments are not allowed in ISO C90");
2085               cpp_error (pfile, CPP_DL_PEDWARN,
2086                          "(this will be reported only once per input file)");
2087               buffer->warned_cplusplus_comments = 1;
2088             }
2089
2090           if (skip_line_comment (pfile) && CPP_OPTION (pfile, warn_comments))
2091             cpp_warning (pfile, CPP_W_COMMENTS, "multi-line comment");
2092         }
2093       else if (c == '=')
2094         {
2095           buffer->cur++;
2096           result->type = CPP_DIV_EQ;
2097           break;
2098         }
2099       else
2100         {
2101           result->type = CPP_DIV;
2102           break;
2103         }
2104
           /* Only reached after a comment has been skipped.  When
              comments are not saved, the comment counts as whitespace
              and scanning continues with the next token.  */
2105       if (!pfile->state.save_comments)
2106         {
2107           result->flags |= PREV_WHITE;
2108           goto update_tokens_line;
2109         }
2110
2111       /* Save the comment as a token in its own right.  */
2112       save_comment (pfile, result, comment_start, c);
2113       break;
2114
2115     case '<':
           /* With angled_headers set, lex_string gets the first try,
              starting at the '<'.  Any resulting type other than
              CPP_LESS is returned as is; otherwise '<' is lexed as an
              operator below.  */
2116       if (pfile->state.angled_headers)
2117         {
2118           lex_string (pfile, result, buffer->cur - 1);
2119           if (result->type != CPP_LESS)
2120             break;
2121         }
2122
2123       result->type = CPP_LESS;
2124       if (*buffer->cur == '=')
2125         buffer->cur++, result->type = CPP_LESS_EQ;
2126       else if (*buffer->cur == '<')
2127         {
2128           buffer->cur++;
2129           IF_NEXT_IS ('=', CPP_LSHIFT_EQ, CPP_LSHIFT);
2130         }
2131       else if (CPP_OPTION (pfile, digraphs))
2132         {
               /* The digraphs "<:" and "<%".  */
2133           if (*buffer->cur == ':')
2134             {
2135               buffer->cur++;
2136               result->flags |= DIGRAPH;
2137               result->type = CPP_OPEN_SQUARE;
2138             }
2139           else if (*buffer->cur == '%')
2140             {
2141               buffer->cur++;
2142               result->flags |= DIGRAPH;
2143               result->type = CPP_OPEN_BRACE;
2144             }
2145         }
2146       break;
2147
2148     case '>':
2149       result->type = CPP_GREATER;
2150       if (*buffer->cur == '=')
2151         buffer->cur++, result->type = CPP_GREATER_EQ;
2152       else if (*buffer->cur == '>')
2153         {
2154           buffer->cur++;
2155           IF_NEXT_IS ('=', CPP_RSHIFT_EQ, CPP_RSHIFT);
2156         }
2157       break;
2158
2159     case '%':
2160       result->type = CPP_MOD;
2161       if (*buffer->cur == '=')
2162         buffer->cur++, result->type = CPP_MOD_EQ;
2163       else if (CPP_OPTION (pfile, digraphs))
2164         {
               /* The digraphs "%:" (CPP_HASH), "%:%:" (CPP_PASTE) and
                  "%>".  */
2165           if (*buffer->cur == ':')
2166             {
2167               buffer->cur++;
2168               result->flags |= DIGRAPH;
2169               result->type = CPP_HASH;
2170               if (*buffer->cur == '%' && buffer->cur[1] == ':')
2171                 buffer->cur += 2, result->type = CPP_PASTE, result->val.token_no = 0;
2172             }
2173           else if (*buffer->cur == '>')
2174             {
2175               buffer->cur++;
2176               result->flags |= DIGRAPH;
2177               result->type = CPP_CLOSE_BRACE;
2178             }
2179         }
2180       break;
2181
2182     case '.':
2183       result->type = CPP_DOT;
           /* A '.' followed by a digit starts a number.  */
2184       if (ISDIGIT (*buffer->cur))
2185         {
2186           struct normalize_state nst = INITIAL_NORMALIZE_STATE;
2187           result->type = CPP_NUMBER;
2188           lex_number (pfile, &result->val.str, &nst);
2189           warn_about_normalization (pfile, result, &nst);
2190         }
2191       else if (*buffer->cur == '.' && buffer->cur[1] == '.')
2192         buffer->cur += 2, result->type = CPP_ELLIPSIS;
2193       else if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
2194         buffer->cur++, result->type = CPP_DOT_STAR;
2195       break;
2196
2197     case '+':
2198       result->type = CPP_PLUS;
2199       if (*buffer->cur == '+')
2200         buffer->cur++, result->type = CPP_PLUS_PLUS;
2201       else if (*buffer->cur == '=')
2202         buffer->cur++, result->type = CPP_PLUS_EQ;
2203       break;
2204
2205     case '-':
2206       result->type = CPP_MINUS;
2207       if (*buffer->cur == '>')
2208         {
2209           buffer->cur++;
2210           result->type = CPP_DEREF;
2211           if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
2212             buffer->cur++, result->type = CPP_DEREF_STAR;
2213         }
2214       else if (*buffer->cur == '-')
2215         buffer->cur++, result->type = CPP_MINUS_MINUS;
2216       else if (*buffer->cur == '=')
2217         buffer->cur++, result->type = CPP_MINUS_EQ;
2218       break;
2219
2220     case '&':
2221       result->type = CPP_AND;
2222       if (*buffer->cur == '&')
2223         buffer->cur++, result->type = CPP_AND_AND;
2224       else if (*buffer->cur == '=')
2225         buffer->cur++, result->type = CPP_AND_EQ;
2226       break;
2227
2228     case '|':
2229       result->type = CPP_OR;
2230       if (*buffer->cur == '|')
2231         buffer->cur++, result->type = CPP_OR_OR;
2232       else if (*buffer->cur == '=')
2233         buffer->cur++, result->type = CPP_OR_EQ;
2234       break;
2235
2236     case ':':
2237       result->type = CPP_COLON;
2238       if (*buffer->cur == ':' && CPP_OPTION (pfile, cplusplus))
2239         buffer->cur++, result->type = CPP_SCOPE;
2240       else if (*buffer->cur == '>' && CPP_OPTION (pfile, digraphs))
2241         {
2242           buffer->cur++;
2243           result->flags |= DIGRAPH;
2244           result->type = CPP_CLOSE_SQUARE;
2245         }
2246       break;
2247
2248     case '*': IF_NEXT_IS ('=', CPP_MULT_EQ, CPP_MULT); break;
2249     case '=': IF_NEXT_IS ('=', CPP_EQ_EQ, CPP_EQ); break;
2250     case '!': IF_NEXT_IS ('=', CPP_NOT_EQ, CPP_NOT); break;
2251     case '^': IF_NEXT_IS ('=', CPP_XOR_EQ, CPP_XOR); break;
2252     case '#': IF_NEXT_IS ('#', CPP_PASTE, CPP_HASH); result->val.token_no = 0; break;
2253
2254     case '?': result->type = CPP_QUERY; break;
2255     case '~': result->type = CPP_COMPL; break;
2256     case ',': result->type = CPP_COMMA; break;
2257     case '(': result->type = CPP_OPEN_PAREN; break;
2258     case ')': result->type = CPP_CLOSE_PAREN; break;
2259     case '[': result->type = CPP_OPEN_SQUARE; break;
2260     case ']': result->type = CPP_CLOSE_SQUARE; break;
2261     case '{': result->type = CPP_OPEN_BRACE; break;
2262     case '}': result->type = CPP_CLOSE_BRACE; break;
2263     case ';': result->type = CPP_SEMICOLON; break;
2264
2265       /* @ is a punctuator in Objective-C.  */
2266     case '@': result->type = CPP_ATSIGN; break;
2267
2268     case '$':
2269     case '\\':
2270       {
             /* Step back onto the character so that forms_identifier_p
                examines it.  If no identifier starts here, step
                forward again and let the character become a CPP_OTHER
                token in the default case.  */
2271         const uchar *base = --buffer->cur;
2272         struct normalize_state nst = INITIAL_NORMALIZE_STATE;
2273
2274         if (forms_identifier_p (pfile, true, &nst))
2275           {
2276             result->type = CPP_NAME;
2277             result->val.node.node = lex_identifier (pfile, base, true, &nst);
2278             warn_about_normalization (pfile, result, &nst);
2279             break;
2280           }
2281         buffer->cur++;
2282       }
           /* Fall through.  */
2283
2284     default:
           /* Any other character becomes a one-character CPP_OTHER
              literal.  */
2285       create_literal (pfile, result, buffer->cur - 1, 1, CPP_OTHER);
2286       break;
2287     }
2288
2289   return result;
2290 }
2291
2292 /* An upper bound on the number of bytes needed to spell TOKEN.
2293    Does not include preceding whitespace.  */
2294 unsigned int
2295 cpp_token_len (const cpp_token *token)
2296 {
2297   unsigned int len;
2298
2299   switch (TOKEN_SPELL (token))
2300     {
2301     default:            len = 6;                                break;
2302     case SPELL_LITERAL: len = token->val.str.len;               break;
2303     case SPELL_IDENT:   len = NODE_LEN (token->val.node.node) * 10;     break;
2304     }
2305
2306   return len;
2307 }
2308
/* Decode the UTF-8 sequence that starts at NAME and store the
   matching \U escape (backslash, 'U', eight lowercase hex digits) in
   BUFFER.  Exactly 10 bytes are written to BUFFER, with no terminating
   NUL.  Returns the number of bytes of NAME that made up the
   sequence.  Aborts if a continuation byte is ill-formed.  */

static size_t
utf8_to_ucn (unsigned char *buffer, const unsigned char *name)
{
  static const char hex_digits[] = "0123456789abcdef";
  const unsigned char *p = name;
  unsigned long code_point;
  unsigned int mask;
  int seq_len = 0;
  int k;

  /* The count of leading 1 bits in the first byte is the length of
     the whole sequence.  */
  for (mask = 0x80; mask != 0 && (*p & mask) != 0; mask >>= 1)
    seq_len++;

  /* The bits of the first byte below the length marker are the top
     bits of the code point.  */
  code_point = *p & (0x7F >> seq_len);

  /* Every following byte must look like 10xxxxxx and supplies six
     more bits.  */
  for (k = 1; k < seq_len; k++)
    {
      unsigned char trail = *++p;

      /* Ill-formed UTF-8.  */
      if ((trail & 0xC0) != 0x80)
        abort ();

      code_point = (code_point << 6) | (trail & 0x3F);
    }

  buffer[0] = '\\';
  buffer[1] = 'U';
  for (k = 0; k < 8; k++)
    buffer[2 + k] = hex_digits[(code_point >> (4 * (7 - k))) & 0xF];

  return seq_len;
}
2342
2343 /* Given a token TYPE corresponding to a digraph, return a pointer to
2344    the spelling of the digraph.  */
2345 static const unsigned char *
2346 cpp_digraph2name (enum cpp_ttype type)
2347 {
2348   return digraph_spellings[(int) type - (int) CPP_FIRST_DIGRAPH];
2349 }
2350
2351 /* Write the spelling of a token TOKEN to BUFFER.  The buffer must
2352    already contain the enough space to hold the token's spelling.
2353    Returns a pointer to the character after the last character written.
2354    FORSTRING is true if this is to be the spelling after translation
2355    phase 1 (this is different for UCNs).
2356    FIXME: Would be nice if we didn't need the PFILE argument.  */
2357 unsigned char *
2358 cpp_spell_token (cpp_reader *pfile, const cpp_token *token,
2359                  unsigned char *buffer, bool forstring)
2360 {
2361   switch (TOKEN_SPELL (token))
2362     {
2363     case SPELL_OPERATOR:
2364       {
2365         const unsigned char *spelling;
2366         unsigned char c;
2367
2368         if (token->flags & DIGRAPH)
2369           spelling = cpp_digraph2name (token->type);
2370         else if (token->flags & NAMED_OP)
2371           goto spell_ident;
2372         else
2373           spelling = TOKEN_NAME (token);
2374
2375         while ((c = *spelling++) != '\0')
2376           *buffer++ = c;
2377       }
2378       break;
2379
2380     spell_ident:
2381     case SPELL_IDENT:
2382       if (forstring)
2383         {
2384           memcpy (buffer, NODE_NAME (token->val.node.node),
2385                   NODE_LEN (token->val.node.node));
2386           buffer += NODE_LEN (token->val.node.node);
2387         }
2388       else
2389         {
2390           size_t i;
2391           const unsigned char * name = NODE_NAME (token->val.node.node);
2392
2393           for (i = 0; i < NODE_LEN (token->val.node.node); i++)
2394             if (name[i] & ~0x7F)
2395               {
2396                 i += utf8_to_ucn (buffer, name + i) - 1;
2397                 buffer += 10;
2398               }
2399             else
2400               *buffer++ = NODE_NAME (token->val.node.node)[i];
2401         }
2402       break;
2403
2404     case SPELL_LITERAL:
2405       memcpy (buffer, token->val.str.text, token->val.str.len);
2406       buffer += token->val.str.len;
2407       break;
2408
2409     case SPELL_NONE:
2410       cpp_error (pfile, CPP_DL_ICE,
2411                  "unspellable token %s", TOKEN_NAME (token));
2412       break;
2413     }
2414
2415   return buffer;
2416 }
2417
2418 /* Returns TOKEN spelt as a null-terminated string.  The string is
2419    freed when the reader is destroyed.  Useful for diagnostics.  */
2420 unsigned char *
2421 cpp_token_as_text (cpp_reader *pfile, const cpp_token *token)
2422 {
2423   unsigned int len = cpp_token_len (token) + 1;
2424   unsigned char *start = _cpp_unaligned_alloc (pfile, len), *end;
2425
2426   end = cpp_spell_token (pfile, token, start, false);
2427   end[0] = '\0';
2428
2429   return start;
2430 }
2431
2432 /* Returns a pointer to a string which spells the token defined by
2433    TYPE and FLAGS.  Used by C front ends, which really should move to
2434    using cpp_token_as_text.  */
2435 const char *
2436 cpp_type2name (enum cpp_ttype type, unsigned char flags)
2437 {
2438   if (flags & DIGRAPH)
2439     return (const char *) cpp_digraph2name (type);
2440   else if (flags & NAMED_OP)
2441     return cpp_named_operator2name (type);
2442
2443   return (const char *) token_spellings[type].name;
2444 }
2445
2446 /* Writes the spelling of token to FP, without any preceding space.
2447    Separated from cpp_spell_token for efficiency - to avoid stdio
2448    double-buffering.  */
2449 void
2450 cpp_output_token (const cpp_token *token, FILE *fp)
2451 {
2452   switch (TOKEN_SPELL (token))
2453     {
2454     case SPELL_OPERATOR:
2455       {
2456         const unsigned char *spelling;
2457         int c;
2458
2459         if (token->flags & DIGRAPH)
2460           spelling = cpp_digraph2name (token->type);
2461         else if (token->flags & NAMED_OP)
2462           goto spell_ident;
2463         else
2464           spelling = TOKEN_NAME (token);
2465
2466         c = *spelling;
2467         do
2468           putc (c, fp);
2469         while ((c = *++spelling) != '\0');
2470       }
2471       break;
2472
2473     spell_ident:
2474     case SPELL_IDENT:
2475       {
2476         size_t i;
2477         const unsigned char * name = NODE_NAME (token->val.node.node);
2478
2479         for (i = 0; i < NODE_LEN (token->val.node.node); i++)
2480           if (name[i] & ~0x7F)
2481             {
2482               unsigned char buffer[10];
2483               i += utf8_to_ucn (buffer, name + i) - 1;
2484               fwrite (buffer, 1, 10, fp);
2485             }
2486           else
2487             fputc (NODE_NAME (token->val.node.node)[i], fp);
2488       }
2489       break;
2490
2491     case SPELL_LITERAL:
2492       fwrite (token->val.str.text, 1, token->val.str.len, fp);
2493       break;
2494
2495     case SPELL_NONE:
2496       /* An error, most probably.  */
2497       break;
2498     }
2499 }
2500
2501 /* Compare two tokens.  */
2502 int
2503 _cpp_equiv_tokens (const cpp_token *a, const cpp_token *b)
2504 {
2505   if (a->type == b->type && a->flags == b->flags)
2506     switch (TOKEN_SPELL (a))
2507       {
2508       default:                  /* Keep compiler happy.  */
2509       case SPELL_OPERATOR:
2510         /* token_no is used to track where multiple consecutive ##
2511            tokens were originally located.  */
2512         return (a->type != CPP_PASTE || a->val.token_no == b->val.token_no);
2513       case SPELL_NONE:
2514         return (a->type != CPP_MACRO_ARG
2515                 || a->val.macro_arg.arg_no == b->val.macro_arg.arg_no);
2516       case SPELL_IDENT:
2517         return a->val.node.node == b->val.node.node;
2518       case SPELL_LITERAL:
2519         return (a->val.str.len == b->val.str.len
2520                 && !memcmp (a->val.str.text, b->val.str.text,
2521                             a->val.str.len));
2522       }
2523
2524   return 0;
2525 }
2526
2527 /* Returns nonzero if a space should be inserted to avoid an
2528    accidental token paste for output.  For simplicity, it is
2529    conservative, and occasionally advises a space where one is not
2530    needed, e.g. "." and ".2".  */
2531 int
2532 cpp_avoid_paste (cpp_reader *pfile, const cpp_token *token1,
2533                  const cpp_token *token2)
2534 {
2535   enum cpp_ttype a = token1->type, b = token2->type;
2536   cppchar_t c;
2537
2538   if (token1->flags & NAMED_OP)
2539     a = CPP_NAME;
2540   if (token2->flags & NAMED_OP)
2541     b = CPP_NAME;
2542
2543   c = EOF;
2544   if (token2->flags & DIGRAPH)
2545     c = digraph_spellings[(int) b - (int) CPP_FIRST_DIGRAPH][0];
2546   else if (token_spellings[b].category == SPELL_OPERATOR)
2547     c = token_spellings[b].name[0];
2548
2549   /* Quickly get everything that can paste with an '='.  */
2550   if ((int) a <= (int) CPP_LAST_EQ && c == '=')
2551     return 1;
2552
2553   switch (a)
2554     {
2555     case CPP_GREATER:   return c == '>';
2556     case CPP_LESS:      return c == '<' || c == '%' || c == ':';
2557     case CPP_PLUS:      return c == '+';
2558     case CPP_MINUS:     return c == '-' || c == '>';
2559     case CPP_DIV:       return c == '/' || c == '*'; /* Comments.  */
2560     case CPP_MOD:       return c == ':' || c == '>';
2561     case CPP_AND:       return c == '&';
2562     case CPP_OR:        return c == '|';
2563     case CPP_COLON:     return c == ':' || c == '>';
2564     case CPP_DEREF:     return c == '*';
2565     case CPP_DOT:       return c == '.' || c == '%' || b == CPP_NUMBER;
2566     case CPP_HASH:      return c == '#' || c == '%'; /* Digraph form.  */
2567     case CPP_NAME:      return ((b == CPP_NUMBER
2568                                  && name_p (pfile, &token2->val.str))
2569                                 || b == CPP_NAME
2570                                 || b == CPP_CHAR || b == CPP_STRING); /* L */
2571     case CPP_NUMBER:    return (b == CPP_NUMBER || b == CPP_NAME
2572                                 || c == '.' || c == '+' || c == '-');
2573                                       /* UCNs */
2574     case CPP_OTHER:     return ((token1->val.str.text[0] == '\\'
2575                                  && b == CPP_NAME)
2576                                 || (CPP_OPTION (pfile, objc)
2577                                     && token1->val.str.text[0] == '@'
2578                                     && (b == CPP_NAME || b == CPP_STRING)));
2579     default:            break;
2580     }
2581
2582   return 0;
2583 }
2584
2585 /* Output all the remaining tokens on the current line, and a newline
2586    character, to FP.  Leading whitespace is removed.  If there are
2587    macros, special token padding is not performed.  */
2588 void
2589 cpp_output_line (cpp_reader *pfile, FILE *fp)
2590 {
2591   const cpp_token *token;
2592
2593   token = cpp_get_token (pfile);
2594   while (token->type != CPP_EOF)
2595     {
2596       cpp_output_token (token, fp);
2597       token = cpp_get_token (pfile);
2598       if (token->flags & PREV_WHITE)
2599         putc (' ', fp);
2600     }
2601
2602   putc ('\n', fp);
2603 }
2604
2605 /* Return a string representation of all the remaining tokens on the
2606    current line.  The result is allocated using xmalloc and must be
2607    freed by the caller.  */
2608 unsigned char *
2609 cpp_output_line_to_string (cpp_reader *pfile, const unsigned char *dir_name)
2610 {
2611   const cpp_token *token;
2612   unsigned int out = dir_name ? ustrlen (dir_name) : 0;
2613   unsigned int alloced = 120 + out;
2614   unsigned char *result = (unsigned char *) xmalloc (alloced);
2615
2616   /* If DIR_NAME is empty, there are no initial contents.  */
2617   if (dir_name)
2618     {
2619       sprintf ((char *) result, "#%s ", dir_name);
2620       out += 2;
2621     }
2622
2623   token = cpp_get_token (pfile);
2624   while (token->type != CPP_EOF)
2625     {
2626       unsigned char *last;
2627       /* Include room for a possible space and the terminating nul.  */
2628       unsigned int len = cpp_token_len (token) + 2;
2629
2630       if (out + len > alloced)
2631         {
2632           alloced *= 2;
2633           if (out + len > alloced)
2634             alloced = out + len;
2635           result = (unsigned char *) xrealloc (result, alloced);
2636         }
2637
2638       last = cpp_spell_token (pfile, token, &result[out], 0);
2639       out = last - result;
2640
2641       token = cpp_get_token (pfile);
2642       if (token->flags & PREV_WHITE)
2643         result[out++] = ' ';
2644     }
2645
2646   result[out] = '\0';
2647   return result;
2648 }
2649
/* Memory buffers.  Changing these three constants can have a dramatic
   effect on performance.  The values here are reasonable defaults,
   but might be tuned.  If you adjust them, be sure to test across a
   range of uses of cpplib, including heavy nested function-like macro
   expansion.  Also check the change in peak memory usage (NJAMD is a
   good tool for this).

   MIN_BUFF_SIZE is the smallest data area a new buffer is given.
   BUFF_SIZE_UPPER_BOUND (MIN_SIZE) is the largest free buffer that is
   still considered a reasonable match for a request of MIN_SIZE bytes.
   EXTENDED_BUFF_SIZE (BUFF, MIN_EXTRA) is the size to request when
   extending BUFF: MIN_EXTRA plus twice the room left in BUFF.

   Every macro argument is parenthesized in the expansion so that an
   argument containing low-precedence operators (for example ?:) is
   evaluated as a unit.  */
#define MIN_BUFF_SIZE 8000
#define BUFF_SIZE_UPPER_BOUND(MIN_SIZE) (MIN_BUFF_SIZE + (MIN_SIZE) * 3 / 2)
#define EXTENDED_BUFF_SIZE(BUFF, MIN_EXTRA) \
        ((MIN_EXTRA) + ((BUFF)->limit - (BUFF)->cur) * 2)

#if MIN_BUFF_SIZE > BUFF_SIZE_UPPER_BOUND (0)
  #error BUFF_SIZE_UPPER_BOUND must be at least as large as MIN_BUFF_SIZE!
#endif
2664
2665 /* Create a new allocation buffer.  Place the control block at the end
2666    of the buffer, so that buffer overflows will cause immediate chaos.  */
2667 static _cpp_buff *
2668 new_buff (size_t len)
2669 {
2670   _cpp_buff *result;
2671   unsigned char *base;
2672
2673   if (len < MIN_BUFF_SIZE)
2674     len = MIN_BUFF_SIZE;
2675   len = CPP_ALIGN (len);
2676
2677   base = XNEWVEC (unsigned char, len + sizeof (_cpp_buff));
2678   result = (_cpp_buff *) (base + len);
2679   result->base = base;
2680   result->cur = base;
2681   result->limit = base + len;
2682   result->next = NULL;
2683   return result;
2684 }
2685
2686 /* Place a chain of unwanted allocation buffers on the free list.  */
2687 void
2688 _cpp_release_buff (cpp_reader *pfile, _cpp_buff *buff)
2689 {
2690   _cpp_buff *end = buff;
2691
2692   while (end->next)
2693     end = end->next;
2694   end->next = pfile->free_buffs;
2695   pfile->free_buffs = buff;
2696 }
2697
2698 /* Return a free buffer of size at least MIN_SIZE.  */
2699 _cpp_buff *
2700 _cpp_get_buff (cpp_reader *pfile, size_t min_size)
2701 {
2702   _cpp_buff *result, **p;
2703
2704   for (p = &pfile->free_buffs;; p = &(*p)->next)
2705     {
2706       size_t size;
2707
2708       if (*p == NULL)
2709         return new_buff (min_size);
2710       result = *p;
2711       size = result->limit - result->base;
2712       /* Return a buffer that's big enough, but don't waste one that's
2713          way too big.  */
2714       if (size >= min_size && size <= BUFF_SIZE_UPPER_BOUND (min_size))
2715         break;
2716     }
2717
2718   *p = result->next;
2719   result->next = NULL;
2720   result->cur = result->base;
2721   return result;
2722 }
2723
2724 /* Creates a new buffer with enough space to hold the uncommitted
2725    remaining bytes of BUFF, and at least MIN_EXTRA more bytes.  Copies
2726    the excess bytes to the new buffer.  Chains the new buffer after
2727    BUFF, and returns the new buffer.  */
2728 _cpp_buff *
2729 _cpp_append_extend_buff (cpp_reader *pfile, _cpp_buff *buff, size_t min_extra)
2730 {
2731   size_t size = EXTENDED_BUFF_SIZE (buff, min_extra);
2732   _cpp_buff *new_buff = _cpp_get_buff (pfile, size);
2733
2734   buff->next = new_buff;
2735   memcpy (new_buff->base, buff->cur, BUFF_ROOM (buff));
2736   return new_buff;
2737 }
2738
2739 /* Creates a new buffer with enough space to hold the uncommitted
2740    remaining bytes of the buffer pointed to by BUFF, and at least
2741    MIN_EXTRA more bytes.  Copies the excess bytes to the new buffer.
2742    Chains the new buffer before the buffer pointed to by BUFF, and
2743    updates the pointer to point to the new buffer.  */
2744 void
2745 _cpp_extend_buff (cpp_reader *pfile, _cpp_buff **pbuff, size_t min_extra)
2746 {
2747   _cpp_buff *new_buff, *old_buff = *pbuff;
2748   size_t size = EXTENDED_BUFF_SIZE (old_buff, min_extra);
2749
2750   new_buff = _cpp_get_buff (pfile, size);
2751   memcpy (new_buff->base, old_buff->cur, BUFF_ROOM (old_buff));
2752   new_buff->next = old_buff;
2753   *pbuff = new_buff;
2754 }
2755
2756 /* Free a chain of buffers starting at BUFF.  */
2757 void
2758 _cpp_free_buff (_cpp_buff *buff)
2759 {
2760   _cpp_buff *next;
2761
2762   for (; buff; buff = next)
2763     {
2764       next = buff->next;
2765       free (buff->base);
2766     }
2767 }
2768
2769 /* Allocate permanent, unaligned storage of length LEN.  */
2770 unsigned char *
2771 _cpp_unaligned_alloc (cpp_reader *pfile, size_t len)
2772 {
2773   _cpp_buff *buff = pfile->u_buff;
2774   unsigned char *result = buff->cur;
2775
2776   if (len > (size_t) (buff->limit - result))
2777     {
2778       buff = _cpp_get_buff (pfile, len);
2779       buff->next = pfile->u_buff;
2780       pfile->u_buff = buff;
2781       result = buff->cur;
2782     }
2783
2784   buff->cur = result + len;
2785   return result;
2786 }
2787
2788 /* Allocate permanent, unaligned storage of length LEN from a_buff.
2789    That buffer is used for growing allocations when saving macro
2790    replacement lists in a #define, and when parsing an answer to an
2791    assertion in #assert, #unassert or #if (and therefore possibly
2792    whilst expanding macros).  It therefore must not be used by any
2793    code that they might call: specifically the lexer and the guts of
2794    the macro expander.
2795
2796    All existing other uses clearly fit this restriction: storing
2797    registered pragmas during initialization.  */
2798 unsigned char *
2799 _cpp_aligned_alloc (cpp_reader *pfile, size_t len)
2800 {
2801   _cpp_buff *buff = pfile->a_buff;
2802   unsigned char *result = buff->cur;
2803
2804   if (len > (size_t) (buff->limit - result))
2805     {
2806       buff = _cpp_get_buff (pfile, len);
2807       buff->next = pfile->a_buff;
2808       pfile->a_buff = buff;
2809       result = buff->cur;
2810     }
2811
2812   buff->cur = result + len;
2813   return result;
2814 }
2815
2816 /* Say which field of TOK is in use.  */
2817
2818 enum cpp_token_fld_kind
2819 cpp_token_val_index (cpp_token *tok)
2820 {
2821   switch (TOKEN_SPELL (tok))
2822     {
2823     case SPELL_IDENT:
2824       return CPP_TOKEN_FLD_NODE;
2825     case SPELL_LITERAL:
2826       return CPP_TOKEN_FLD_STR;
2827     case SPELL_OPERATOR:
2828       if (tok->type == CPP_PASTE)
2829         return CPP_TOKEN_FLD_TOKEN_NO;
2830       else
2831         return CPP_TOKEN_FLD_NONE;
2832     case SPELL_NONE:
2833       if (tok->type == CPP_MACRO_ARG)
2834         return CPP_TOKEN_FLD_ARG_NO;
2835       else if (tok->type == CPP_PADDING)
2836         return CPP_TOKEN_FLD_SOURCE;
2837       else if (tok->type == CPP_PRAGMA)
2838         return CPP_TOKEN_FLD_PRAGMA;
2839       /* else fall through */
2840     default:
2841       return CPP_TOKEN_FLD_NONE;
2842     }
2843 }