libcpp/lex.c

   1 /* CPP Library - lexical analysis.
   2    Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2007, 2008, 2009, 2010
   3    Free Software Foundation, Inc.
   4    Contributed by Per Bothner, 1994-95.
   5    Based on CCCP program by Paul Rubin, June 1986
   6    Adapted to ANSI C, Richard Stallman, Jan 1987
   7    Broken out to separate file, Zack Weinberg, Mar 2000
   8
   9 This program is free software; you can redistribute it and/or modify it
  10 under the terms of the GNU General Public License as published by the
  11 Free Software Foundation; either version 3, or (at your option) any
  12 later version.
  13
  14 This program is distributed in the hope that it will be useful,
  15 but WITHOUT ANY WARRANTY; without even the implied warranty of
  16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  17 GNU General Public License for more details.
  18
  19 You should have received a copy of the GNU General Public License
  20 along with this program; see the file COPYING3.  If not see
  21 <http://www.gnu.org/licenses/>.  */
  22
  23 #include "config.h"
  24 #include "system.h"
  25 #include "cpplib.h"
  26 #include "internal.h"
  27
/* The category of a token type's spelling, stored in the first field
   of each token_spellings entry.  OP entries of TTYPE_TABLE are always
   SPELL_OPERATOR; TK entries name their category explicitly.  */
enum spell_type
{
  SPELL_OPERATOR = 0,
  SPELL_IDENT,
  SPELL_LITERAL,
  SPELL_NONE
};

/* One row of the token_spellings table: the spelling category of a
   token type together with a string naming it.  */
struct token_spelling
{
  enum spell_type category;
  const unsigned char *name;
};

/* The spellings of the digraphs.  */
static const unsigned char *const digraph_spellings[] =
{ UC"%:", UC"%:%:", UC"<:", UC":>", UC"<%", UC"%>" };

/* Expand TTYPE_TABLE into one token_spelling initializer per token
   type.  OP (E, S) records the string S with category SPELL_OPERATOR;
   TK (E, S) records category SPELL_<S> and the stringized name E.  */
#define OP(e, s) { SPELL_OPERATOR, UC s  },
#define TK(e, s) { SPELL_ ## s,    UC #e },
static const struct token_spelling token_spellings[N_TTYPES] = { TTYPE_TABLE };
#undef OP
#undef TK

/* Look up the spelling category, respectively the name, recorded in
   token_spellings for the type of TOKEN.  */
#define TOKEN_SPELL(token) (token_spellings[(token)->type].category)
#define TOKEN_NAME(token) (token_spellings[(token)->type].name)
  53
/* Forward declarations of the helpers that are local to this file.  */
static void add_line_note (cpp_buffer *, const uchar *, unsigned int);
static int skip_line_comment (cpp_reader *);
static void skip_whitespace (cpp_reader *, cppchar_t);
static void lex_string (cpp_reader *, cpp_token *, const uchar *);
static void save_comment (cpp_reader *, cpp_token *, const uchar *, cppchar_t);
static void store_comment (cpp_reader *, cpp_token *);
static void create_literal (cpp_reader *, cpp_token *, const uchar *,
                            unsigned int, enum cpp_ttype);
static bool warn_in_comment (cpp_reader *, _cpp_line_note *);
static int name_p (cpp_reader *, const cpp_string *);
static tokenrun *next_tokenrun (tokenrun *);

static _cpp_buff *new_buff (size_t);
  67
  68
  69 /* Utility routine:
  70
  71    Compares, the token TOKEN to the NUL-terminated string STRING.
  72    TOKEN must be a CPP_NAME.  Returns 1 for equal, 0 for unequal.  */
  73 int
  74 cpp_ideq (const cpp_token *token, const char *string)
  75 {
  76   if (token->type != CPP_NAME)
  77     return 0;
  78
  79   return !ustrcmp (NODE_NAME (token->val.node.node), (const uchar *) string);
  80 }
  81
  82 /* Record a note TYPE at byte POS into the current cleaned logical
  83    line.  */
  84 static void
  85 add_line_note (cpp_buffer *buffer, const uchar *pos, unsigned int type)
  86 {
  87   if (buffer->notes_used == buffer->notes_cap)
  88     {
  89       buffer->notes_cap = buffer->notes_cap * 2 + 200;
  90       buffer->notes = XRESIZEVEC (_cpp_line_note, buffer->notes,
  91                                   buffer->notes_cap);
  92     }
  93
  94   buffer->notes[buffer->notes_used].pos = pos;
  95   buffer->notes[buffer->notes_used].type = type;
  96   buffer->notes_used++;
  97 }
  98
  99 \f
 100 /* Fast path to find line special characters using optimized character
 101    scanning algorithms.  Anything complicated falls back to the slow
 102    path below.  Since this loop is very hot it's worth doing these kinds
 103    of optimizations.
 104
 105    One of the paths through the ifdefs should provide
 106
 107      const uchar *search_line_fast (const uchar *s, const uchar *end);
 108
 109    Between S and END, search for \n, \r, \\, ?.  Return a pointer to
 110    the found character.
 111
 112    Note that the last character of the buffer is *always* a newline,
 113    as forced by _cpp_convert_input.  This fact can be used to avoid
 114    explicitly looking for the end of the buffer.  */
 115
/* Configure gives us an ifdef test.  Default the macro to 0 so that it
   can also be used as an ordinary value in `if' and `#if' tests.  */
#ifndef WORDS_BIGENDIAN
#define WORDS_BIGENDIAN 0
#endif

/* We'd like the largest integer that fits into a register.  There's nothing
   in <stdint.h> that gives us that.  For most hosts this is unsigned long,
   but MS decided on an LLP64 model.  Thankfully when building with GCC we
   can get the "real" word size.  */
#ifdef __GNUC__
typedef unsigned int word_type __attribute__((__mode__(__word__)));
#else
typedef unsigned long word_type;
#endif

/* The code below is only expecting sizes 4 or 8.
   Die at compile-time if this expectation is violated: for any other
   size the array bound below evaluates to -1.  */
typedef char check_word_type_size
  [(sizeof(word_type) == 8 || sizeof(word_type) == 4) * 2 - 1];
 135
 136 /* Return X with the first N bytes forced to values that won't match one
 137    of the interesting characters.  Note that NUL is not interesting.  */
 138
 139 static inline word_type
 140 acc_char_mask_misalign (word_type val, unsigned int n)
 141 {
 142   word_type mask = -1;
 143   if (WORDS_BIGENDIAN)
 144     mask >>= n * 8;
 145   else
 146     mask <<= n * 8;
 147   return val & mask;
 148 }
 149
 150 /* Return X replicated to all byte positions within WORD_TYPE.  */
 151
 152 static inline word_type
 153 acc_char_replicate (uchar x)
 154 {
 155   word_type ret;
 156
 157   ret = (x << 24) | (x << 16) | (x << 8) | x;
 158   if (sizeof(word_type) == 8)
 159     ret = (ret << 16 << 16) | ret;
 160   return ret;
 161 }
 162
 163 /* Return non-zero if some byte of VAL is (probably) C.  */
 164
 165 static inline word_type
 166 acc_char_cmp (word_type val, word_type c)
 167 {
 168 #if defined(__GNUC__) && defined(__alpha__)
 169   /* We can get exact results using a compare-bytes instruction.
 170      Get (val == c) via (0 >= (val ^ c)).  */
 171   return __builtin_alpha_cmpbge (0, val ^ c);
 172 #else
 173   word_type magic = 0x7efefefeU;
 174   if (sizeof(word_type) == 8)
 175     magic = (magic << 16 << 16) | 0xfefefefeU;
 176   magic |= 1;
 177
 178   val ^= c;
 179   return ((val + magic) ^ ~val) & ~magic;
 180 #endif
 181 }
 182
 183 /* Given the result of acc_char_cmp is non-zero, return the index of
 184    the found character.  If this was a false positive, return -1.  */
 185
 186 static inline int
 187 acc_char_index (word_type cmp ATTRIBUTE_UNUSED,
 188                 word_type val ATTRIBUTE_UNUSED)
 189 {
 190 #if defined(__GNUC__) && defined(__alpha__) && !WORDS_BIGENDIAN
 191   /* The cmpbge instruction sets *bits* of the result corresponding to
 192      matches in the bytes with no false positives.  */
 193   return __builtin_ctzl (cmp);
 194 #else
 195   unsigned int i;
 196
 197   /* ??? It would be nice to force unrolling here,
 198      and have all of these constants folded.  */
 199   for (i = 0; i < sizeof(word_type); ++i)
 200     {
 201       uchar c;
 202       if (WORDS_BIGENDIAN)
 203         c = (val >> (sizeof(word_type) - i - 1) * 8) & 0xff;
 204       else
 205         c = (val >> i * 8) & 0xff;
 206
 207       if (c == '\n' || c == '\r' || c == '\\' || c == '?')
 208         return i;
 209     }
 210
 211   return -1;
 212 #endif
 213 }
 214
 215 /* A version of the fast scanner using bit fiddling techniques.
 216
 217    For 32-bit words, one would normally perform 16 comparisons and
 218    16 branches.  With this algorithm one performs 24 arithmetic
 219    operations and one branch.  Whether this is faster with a 32-bit
 220    word size is going to be somewhat system dependent.
 221
 222    For 64-bit words, we eliminate twice the number of comparisons
 223    and branches without increasing the number of arithmetic operations.
 224    It's almost certainly going to be a win with 64-bit word size.  */
 225
 226 static const uchar * search_line_acc_char (const uchar *, const uchar *)
 227   ATTRIBUTE_UNUSED;
 228
 229 static const uchar *
 230 search_line_acc_char (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 231 {
 232   const word_type repl_nl = acc_char_replicate ('\n');
 233   const word_type repl_cr = acc_char_replicate ('\r');
 234   const word_type repl_bs = acc_char_replicate ('\\');
 235   const word_type repl_qm = acc_char_replicate ('?');
 236
 237   unsigned int misalign;
 238   const word_type *p;
 239   word_type val, t;
 240
 241   /* Align the buffer.  Mask out any bytes from before the beginning.  */
 242   p = (word_type *)((uintptr_t)s & -sizeof(word_type));
 243   val = *p;
 244   misalign = (uintptr_t)s & (sizeof(word_type) - 1);
 245   if (misalign)
 246     val = acc_char_mask_misalign (val, misalign);
 247
 248   /* Main loop.  */
 249   while (1)
 250     {
 251       t  = acc_char_cmp (val, repl_nl);
 252       t |= acc_char_cmp (val, repl_cr);
 253       t |= acc_char_cmp (val, repl_bs);
 254       t |= acc_char_cmp (val, repl_qm);
 255
 256       if (__builtin_expect (t != 0, 0))
 257         {
 258           int i = acc_char_index (t, val);
 259           if (i >= 0)
 260             return (const uchar *)p + i;
 261         }
 262
 263       val = *++p;
 264     }
 265 }
 266
/* Disable on Solaris 2/x86 until the following problems can be properly
   autoconfed:

   The Solaris 8 assembler cannot assemble SSE2/SSE4.2 insns.
   The Solaris 9 assembler cannot assemble SSE4.2 insns.
   Before Solaris 9 Update 6, SSE insns cannot be executed.
   The Solaris 10+ assembler tags objects with the instruction set
   extensions used, so SSE4.2 executables cannot run on machines that
   don't support that extension.  */

#if (GCC_VERSION >= 4005) && (defined(__i386__) || defined(__x86_64__)) && !(defined(__sun__) && defined(__svr4__))

/* Replicated character data to be shared between implementations.
   Recall that outside of a context with vector support we can't
   define compatible vector types, therefore these are all defined
   in terms of raw characters.

   Row 0 holds '\n', row 1 '\r', row 2 '\\' and row 3 '?', each
   repeated 16 times.  */
static const char repl_chars[4][16] __attribute__((aligned(16))) = {
  { '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
    '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n' },
  { '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
    '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r' },
  { '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
    '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\' },
  { '?', '?', '?', '?', '?', '?', '?', '?',
    '?', '?', '?', '?', '?', '?', '?', '?' },
};
 293
 294 /* A version of the fast scanner using MMX vectorized byte compare insns.
 295
 296    This uses the PMOVMSKB instruction which was introduced with "MMX2",
 297    which was packaged into SSE1; it is also present in the AMD MMX
 298    extension.  Mark the function as using "sse" so that we emit a real
 299    "emms" instruction, rather than the 3dNOW "femms" instruction.  */
 300
 301 static const uchar *
 302 #ifndef __SSE__
 303 __attribute__((__target__("sse")))
 304 #endif
 305 search_line_mmx (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 306 {
 307   typedef char v8qi __attribute__ ((__vector_size__ (8)));
 308   typedef int __m64 __attribute__ ((__vector_size__ (8), __may_alias__));
 309
 310   const v8qi repl_nl = *(const v8qi *)repl_chars[0];
 311   const v8qi repl_cr = *(const v8qi *)repl_chars[1];
 312   const v8qi repl_bs = *(const v8qi *)repl_chars[2];
 313   const v8qi repl_qm = *(const v8qi *)repl_chars[3];
 314
 315   unsigned int misalign, found, mask;
 316   const v8qi *p;
 317   v8qi data, t, c;
 318
 319   /* Align the source pointer.  While MMX doesn't generate unaligned data
 320      faults, this allows us to safely scan to the end of the buffer without
 321      reading beyond the end of the last page.  */
 322   misalign = (uintptr_t)s & 7;
 323   p = (const v8qi *)((uintptr_t)s & -8);
 324   data = *p;
 325
 326   /* Create a mask for the bytes that are valid within the first
 327      16-byte block.  The Idea here is that the AND with the mask
 328      within the loop is "free", since we need some AND or TEST
 329      insn in order to set the flags for the branch anyway.  */
 330   mask = -1u << misalign;
 331
 332   /* Main loop processing 8 bytes at a time.  */
 333   goto start;
 334   do
 335     {
 336       data = *++p;
 337       mask = -1;
 338
 339     start:
 340       t = __builtin_ia32_pcmpeqb(data, repl_nl);
 341       c = __builtin_ia32_pcmpeqb(data, repl_cr);
 342       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
 343       c = __builtin_ia32_pcmpeqb(data, repl_bs);
 344       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
 345       c = __builtin_ia32_pcmpeqb(data, repl_qm);
 346       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
 347       found = __builtin_ia32_pmovmskb (t);
 348       found &= mask;
 349     }
 350   while (!found);
 351
 352   __builtin_ia32_emms ();
 353
 354   /* FOUND contains 1 in bits for which we matched a relevant
 355      character.  Conversion to the byte index is trivial.  */
 356   found = __builtin_ctz(found);
 357   return (const uchar *)p + found;
 358 }
 359
 360 /* A version of the fast scanner using SSE2 vectorized byte compare insns.  */
 361
 362 static const uchar *
 363 #ifndef __SSE2__
 364 __attribute__((__target__("sse2")))
 365 #endif
 366 search_line_sse2 (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 367 {
 368   typedef char v16qi __attribute__ ((__vector_size__ (16)));
 369
 370   const v16qi repl_nl = *(const v16qi *)repl_chars[0];
 371   const v16qi repl_cr = *(const v16qi *)repl_chars[1];
 372   const v16qi repl_bs = *(const v16qi *)repl_chars[2];
 373   const v16qi repl_qm = *(const v16qi *)repl_chars[3];
 374
 375   unsigned int misalign, found, mask;
 376   const v16qi *p;
 377   v16qi data, t;
 378
 379   /* Align the source pointer.  */
 380   misalign = (uintptr_t)s & 15;
 381   p = (const v16qi *)((uintptr_t)s & -16);
 382   data = *p;
 383
 384   /* Create a mask for the bytes that are valid within the first
 385      16-byte block.  The Idea here is that the AND with the mask
 386      within the loop is "free", since we need some AND or TEST
 387      insn in order to set the flags for the branch anyway.  */
 388   mask = -1u << misalign;
 389
 390   /* Main loop processing 16 bytes at a time.  */
 391   goto start;
 392   do
 393     {
 394       data = *++p;
 395       mask = -1;
 396
 397     start:
 398       t  = __builtin_ia32_pcmpeqb128(data, repl_nl);
 399       t |= __builtin_ia32_pcmpeqb128(data, repl_cr);
 400       t |= __builtin_ia32_pcmpeqb128(data, repl_bs);
 401       t |= __builtin_ia32_pcmpeqb128(data, repl_qm);
 402       found = __builtin_ia32_pmovmskb128 (t);
 403       found &= mask;
 404     }
 405   while (!found);
 406
 407   /* FOUND contains 1 in bits for which we matched a relevant
 408      character.  Conversion to the byte index is trivial.  */
 409   found = __builtin_ctz(found);
 410   return (const uchar *)p + found;
 411 }
 412
#ifdef HAVE_SSE4
/* A version of the fast scanner using SSE 4.2 vectorized string insns.

   Starting at S, return a pointer to the first \n, \r, ? or \\.
   Unlike the other variants this one does look at END, but only to
   decide whether an unaligned 16-byte read at S is safe.  */

static const uchar *
#ifndef __SSE4_2__
__attribute__((__target__("sse4.2")))
#endif
search_line_sse42 (const uchar *s, const uchar *end)
{
  typedef char v16qi __attribute__ ((__vector_size__ (16)));
  /* The four characters searched for; the remaining elements of the
     vector are zero.  */
  static const v16qi search = { '\n', '\r', '?', '\\' };

  uintptr_t si = (uintptr_t)s;
  uintptr_t index;

  /* Check for unaligned input.  */
  if (si & 15)
    {
      if (__builtin_expect (end - s < 16, 0)
          && __builtin_expect ((si & 0xfff) > 0xff0, 0))
        {
          /* There are less than 16 bytes left in the buffer, and less
             than 16 bytes left on the page.  Reading 16 bytes at this
             point might generate a spurious page fault.  Defer to the
             SSE2 implementation, which already handles alignment.  */
          return search_line_sse2 (s, end);
        }

      /* ??? The builtin doesn't understand that the PCMPESTRI read from
         memory need not be aligned.  */
      /* The A register carries 4 and the D register 16, which look
         like the lengths of SEARCH and of the block at S; the result
         is returned in the C register.  See the PCMPESTRI reference
         for the exact operand semantics.  */
      __asm ("%vpcmpestri $0, (%1), %2"
             : "=c"(index) : "r"(s), "x"(search), "a"(4), "d"(16));
      /* An index below 16 is the offset of a match within the block.  */
      if (__builtin_expect (index < 16, 0))
        goto found;

      /* Advance the pointer to an aligned address.  We will re-scan a
         few bytes, but we no longer need care for reading past the
         end of a page, since we're guaranteed a match.  */
      s = (const uchar *)((si + 16) & -16);
    }

  /* Main loop, processing 16 bytes at a time.  By doing the whole loop
     in inline assembly, we can make proper use of the flags set.  */
  /* S is first moved back by 16 to compensate for the add at the top
     of the loop.  The loop repeats while the carry flag is clear,
     which per the instruction's definition means no match yet.  */
  __asm (      "sub $16, %1\n"
        "       .balign 16\n"
        "0:     add $16, %1\n"
        "       %vpcmpestri $0, (%1), %2\n"
        "       jnc 0b"
        : "=&c"(index), "+r"(s)
        : "x"(search), "a"(4), "d"(16));

 found:
  /* INDEX is the offset of the match within the block at S.  */
  return s + index;
}

#else
/* Work around out-dated assemblers without sse4 support.  */
#define search_line_sse42 search_line_sse2
#endif
 472
 473 /* Check the CPU capabilities.  */
 474
 475 #include "../gcc/config/i386/cpuid.h"
 476
 477 typedef const uchar * (*search_line_fast_type) (const uchar *, const uchar *);
 478 static search_line_fast_type search_line_fast;
 479
 480 static void __attribute__((constructor))
 481 init_vectorized_lexer (void)
 482 {
 483   unsigned dummy, ecx = 0, edx = 0;
 484   search_line_fast_type impl = search_line_acc_char;
 485   int minimum = 0;
 486
 487 #if defined(__SSE4_2__)
 488   minimum = 3;
 489 #elif defined(__SSE2__)
 490   minimum = 2;
 491 #elif defined(__SSE__)
 492   minimum = 1;
 493 #endif
 494
 495   if (minimum == 3)
 496     impl = search_line_sse42;
 497   else if (__get_cpuid (1, &dummy, &dummy, &ecx, &edx) || minimum == 2)
 498     {
 499       if (minimum == 3 || (ecx & bit_SSE4_2))
 500         impl = search_line_sse42;
 501       else if (minimum == 2 || (edx & bit_SSE2))
 502         impl = search_line_sse2;
 503       else if (minimum == 1 || (edx & bit_SSE))
 504         impl = search_line_mmx;
 505     }
 506   else if (__get_cpuid (0x80000001, &dummy, &dummy, &dummy, &edx))
 507     {
 508       if (minimum == 1
 509           || (edx & (bit_MMXEXT | bit_CMOV)) == (bit_MMXEXT | bit_CMOV))
 510         impl = search_line_mmx;
 511     }
 512
 513   search_line_fast = impl;
 514 }
 515
 516 #elif (GCC_VERSION >= 4005) && defined(__ALTIVEC__)
 517
 518 /* A vection of the fast scanner using AltiVec vectorized byte compares.  */
 519 /* ??? Unfortunately, attribute(target("altivec")) is not yet supported,
 520    so we can't compile this function without -maltivec on the command line
 521    (or implied by some other switch).  */
 522
 523 static const uchar *
 524 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 525 {
 526   typedef __attribute__((altivec(vector))) unsigned char vc;
 527
 528   const vc repl_nl = {
 529     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
 530     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
 531   };
 532   const vc repl_cr = {
 533     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
 534     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
 535   };
 536   const vc repl_bs = {
 537     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
 538     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
 539   };
 540   const vc repl_qm = {
 541     '?', '?', '?', '?', '?', '?', '?', '?',
 542     '?', '?', '?', '?', '?', '?', '?', '?',
 543   };
 544   const vc ones = {
 545     -1, -1, -1, -1, -1, -1, -1, -1,
 546     -1, -1, -1, -1, -1, -1, -1, -1,
 547   };
 548   const vc zero = { 0 };
 549
 550   vc data, mask, t;
 551
 552   /* Altivec loads automatically mask addresses with -16.  This lets us
 553      issue the first load as early as possible.  */
 554   data = __builtin_vec_ld(0, (const vc *)s);
 555
 556   /* Discard bytes before the beginning of the buffer.  Do this by
 557      beginning with all ones and shifting in zeros according to the
 558      mis-alignment.  The LVSR instruction pulls the exact shift we
 559      want from the address.  */
 560   mask = __builtin_vec_lvsr(0, s);
 561   mask = __builtin_vec_perm(zero, ones, mask);
 562   data &= mask;
 563
 564   /* While altivec loads mask addresses, we still need to align S so
 565      that the offset we compute at the end is correct.  */
 566   s = (const uchar *)((uintptr_t)s & -16);
 567
 568   /* Main loop processing 16 bytes at a time.  */
 569   goto start;
 570   do
 571     {
 572       vc m_nl, m_cr, m_bs, m_qm;
 573
 574       s += 16;
 575       data = __builtin_vec_ld(0, (const vc *)s);
 576
 577     start:
 578       m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
 579       m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
 580       m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
 581       m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
 582       t = (m_nl | m_cr) | (m_bs | m_qm);
 583
 584       /* T now contains 0xff in bytes for which we matched one of the relevant
 585          characters.  We want to exit the loop if any byte in T is non-zero.
 586          Below is the expansion of vec_any_ne(t, zero).  */
 587     }
 588   while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));
 589
 590   {
 591 #define N  (sizeof(vc) / sizeof(long))
 592
 593     typedef char check_count[(N == 2 || N == 4) * 2 - 1];
 594     union {
 595       vc v;
 596       unsigned long l[N];
 597     } u;
 598     unsigned long l, i = 0;
 599
 600     u.v = t;
 601
 602     /* Find the first word of T that is non-zero.  */
 603     switch (N)
 604       {
 605       case 4:
 606         l = u.l[i++];
 607         if (l != 0)
 608           break;
 609         s += sizeof(unsigned long);
 610         l = u.l[i++];
 611         if (l != 0)
 612           break;
 613         s += sizeof(unsigned long);
 614       case 2:
 615         l = u.l[i++];
 616         if (l != 0)
 617           break;
 618         s += sizeof(unsigned long);
 619         l = u.l[i];
 620       }
 621
 622     /* L now contains 0xff in bytes for which we matched one of the
 623        relevant characters.  We can find the byte index by finding
 624        its bit index and dividing by 8.  */
 625     l = __builtin_clzl(l) >> 3;
 626     return s + l;
 627
 628 #undef N
 629   }
 630 }
 631
#else

/* We only have one accelerated alternative.  Use a direct call so that
   we encourage inlining.  */

#define search_line_fast  search_line_acc_char

#endif
 640
/* Returns with a logical line that contains no escaped newlines or
   trigraphs.  This is a time-critical inner loop.

   The line starts at PFILE->buffer->next_line.  On return buffer->cur
   and buffer->line_base point at its start, a '\n' has been stored at
   its end, and buffer->next_line points just past the last character
   consumed.  The buffer's line notes are reset and then record every
   escaped newline and trigraph seen, followed by a sentinel note.  */
void
_cpp_clean_line (cpp_reader *pfile)
{
  cpp_buffer *buffer;
  const uchar *s;
  uchar c, *d, *p;

  buffer = pfile->buffer;
  buffer->cur_note = buffer->notes_used = 0;
  buffer->cur = buffer->line_base = buffer->next_line;
  buffer->need_line = false;
  s = buffer->next_line;

  if (!buffer->from_stage3)
    {
      /* The most recent backslash seen on the fast path, if any.  */
      const uchar *pbackslash = NULL;

      /* Fast path.  This is the common case of an un-escaped line with
         no trigraphs.  The primary win here is by not writing any
         data back to memory until we have to.  */
      while (1)
        {
          /* Perform an optimized search for \n, \r, \\, ?.  */
          s = search_line_fast (s, buffer->rlimit);

          c = *s;
          if (c == '\\')
            {
              /* Record the location of the backslash and continue.  */
              pbackslash = s++;
            }
          else if (__builtin_expect (c == '?', 0))
            {
              if (__builtin_expect (s[1] == '?', false)
                   && _cpp_trigraph_map[s[2]])
                {
                  /* Have a trigraph.  We may or may not have to convert
                     it.  Add a line note regardless, for -Wtrigraphs.  */
                  add_line_note (buffer, s, s[2]);
                  if (CPP_OPTION (pfile, trigraphs))
                    {
                      /* We do, and that means we have to switch to the
                         slow path.  Store the replacement character
                         over the first '?' and leave S on the last
                         character of the trigraph.  */
                      d = (uchar *) s;
                      *d = _cpp_trigraph_map[s[2]];
                      s += 2;
                      goto slow_path;
                    }
                }
              /* Not a trigraph.  Continue on fast-path.  */
              s++;
            }
          else
            break;
        }

      /* This must be \r or \n.  We're either done, or we'll be forced
         to write back to the buffer and continue on the slow path.  */
      d = (uchar *) s;

      if (__builtin_expect (s == buffer->rlimit, false))
        goto done;

      /* DOS line ending? */
      if (__builtin_expect (c == '\r', false) && s[1] == '\n')
        {
          s++;
          if (s == buffer->rlimit)
            goto done;
        }

      /* Without a backslash on the line the newline cannot be
         escaped.  */
      if (__builtin_expect (pbackslash == NULL, true))
        goto done;

      /* Check for escaped newline.  The newline is escaped only if the
         last backslash seen is followed by nothing but non-vertical
         whitespace.  */
      p = d;
      while (is_nvspace (p[-1]))
        p--;
      if (p - 1 != pbackslash)
        goto done;

      /* Have an escaped newline; process it and proceed to
         the slow path.  The note is of type ' ' if whitespace
         separated the backslash from the newline, and '\\' otherwise.
         D is backed up so that the next character written lands on the
         backslash, and next_line bounds the backward whitespace scan
         in the slow path.  */
      add_line_note (buffer, p - 1, p != d ? ' ' : '\\');
      d = p - 2;
      buffer->next_line = p - 1;

    slow_path:
      /* From here on every character is copied from the read position
         S to the write position D.  */
      while (1)
        {
          c = *++s;
          *++d = c;

          if (c == '\n' || c == '\r')
            {
              /* Handle DOS line endings.  */
              if (c == '\r' && s != buffer->rlimit && s[1] == '\n')
                s++;
              if (s == buffer->rlimit)
                break;

              /* Escaped?  */
              p = d;
              while (p != buffer->next_line && is_nvspace (p[-1]))
                p--;
              if (p == buffer->next_line || p[-1] != '\\')
                break;

              add_line_note (buffer, p - 1, p != d ? ' ': '\\');
              d = p - 2;
              buffer->next_line = p - 1;
            }
          else if (c == '?' && s[1] == '?' && _cpp_trigraph_map[s[2]])
            {
              /* Add a note regardless, for the benefit of -Wtrigraphs.  */
              add_line_note (buffer, d, s[2]);
              if (CPP_OPTION (pfile, trigraphs))
                {
                  /* Overwrite the '?' just copied with the replacement
                     character and skip the rest of the trigraph.  */
                  *d = _cpp_trigraph_map[s[2]];
                  s += 2;
                }
            }
        }
    }
  else
    {
      /* No trigraph or escaped-newline processing in this case; just
         find the end of the line.  */
      while (*s != '\n' && *s != '\r')
        s++;
      d = (uchar *) s;

      /* Handle DOS line endings.  */
      if (*s == '\r' && s != buffer->rlimit && s[1] == '\n')
        s++;
    }

 done:
  /* Terminate the cleaned line.  S is on the last character consumed
     from the input.  */
  *d = '\n';
  /* A sentinel note that should never be processed.  */
  add_line_note (buffer, d + 1, '\n');
  buffer->next_line = s + 1;
}
 784
 785 /* Return true if the trigraph indicated by NOTE should be warned
 786    about in a comment.  */
 787 static bool
 788 warn_in_comment (cpp_reader *pfile, _cpp_line_note *note)
 789 {
 790   const uchar *p;
 791
 792   /* Within comments we don't warn about trigraphs, unless the
 793      trigraph forms an escaped newline, as that may change
 794      behavior.  */
 795   if (note->type != '/')
 796     return false;
 797
 798   /* If -trigraphs, then this was an escaped newline iff the next note
 799      is coincident.  */
 800   if (CPP_OPTION (pfile, trigraphs))
 801     return note[1].pos == note->pos;
 802
 803   /* Otherwise, see if this forms an escaped newline.  */
 804   p = note->pos + 3;
 805   while (is_nvspace (*p))
 806     p++;
 807
 808   /* There might have been escaped newlines between the trigraph and the
 809      newline we found.  Hence the position test.  */
 810   return (*p == '\n' && p < note[1].pos);
 811 }
 812
 813 /* Process the notes created by add_line_note as far as the current
 814    location.  */
 815 void
 816 _cpp_process_line_notes (cpp_reader *pfile, int in_comment)
 817 {
 818   cpp_buffer *buffer = pfile->buffer;
 819
 820   for (;;)
 821     {
 822       _cpp_line_note *note = &buffer->notes[buffer->cur_note];
 823       unsigned int col;
 824
 825       if (note->pos > buffer->cur)
 826         break;
 827
 828       buffer->cur_note++;
 829       col = CPP_BUF_COLUMN (buffer, note->pos + 1);
 830
 831       if (note->type == '\\' || note->type == ' ')
 832         {
 833           if (note->type == ' ' && !in_comment)
 834             cpp_error_with_line (pfile, CPP_DL_WARNING, pfile->line_table->highest_line, col,
 835                                  "backslash and newline separated by space");
 836
 837           if (buffer->next_line > buffer->rlimit)
 838             {
 839               cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line, col,
 840                                    "backslash-newline at end of file");
 841               /* Prevent "no newline at end of file" warning.  */
 842               buffer->next_line = buffer->rlimit;
 843             }
 844
 845           buffer->line_base = note->pos;
 846           CPP_INCREMENT_LINE (pfile, 0);
 847         }
 848       else if (_cpp_trigraph_map[note->type])
 849         {
 850           if (CPP_OPTION (pfile, warn_trigraphs)
 851               && (!in_comment || warn_in_comment (pfile, note)))
 852             {
 853               if (CPP_OPTION (pfile, trigraphs))
 854                 cpp_warning_with_line (pfile, CPP_W_TRIGRAPHS,
 855                                        pfile->line_table->highest_line, col,
 856                                        "trigraph ??%c converted to %c",
 857                                        note->type,
 858                                        (int) _cpp_trigraph_map[note->type]);
 859               else
 860                 {
 861                   cpp_warning_with_line
 862                     (pfile, CPP_W_TRIGRAPHS,
 863                      pfile->line_table->highest_line, col,
 864                      "trigraph ??%c ignored, use -trigraphs to enable",
 865                      note->type);
 866                 }
 867             }
 868         }
 869       else if (note->type == 0)
 870         /* Already processed in lex_raw_string.  */;
 871       else
 872         abort ();
 873     }
 874 }
 875
 876 /* Skip a C-style block comment.  We find the end of the comment by
 877    seeing if an asterisk is before every '/' we encounter.  Returns
 878    nonzero if comment terminated by EOF, zero otherwise.
 879
 880    Buffer->cur points to the initial asterisk of the comment.  */
 881 bool
 882 _cpp_skip_block_comment (cpp_reader *pfile)
 883 {
 884   cpp_buffer *buffer = pfile->buffer;
 885   const uchar *cur = buffer->cur;
 886   uchar c;
 887
 888   cur++;
 889   if (*cur == '/')
 890     cur++;
 891
 892   for (;;)
 893     {
 894       /* People like decorating comments with '*', so check for '/'
 895          instead for efficiency.  */
 896       c = *cur++;
 897
 898       if (c == '/')
 899         {
 900           if (cur[-2] == '*')
 901             break;
 902
 903           /* Warn about potential nested comments, but not if the '/'
 904              comes immediately before the true comment delimiter.
 905              Don't bother to get it right across escaped newlines.  */
 906           if (CPP_OPTION (pfile, warn_comments)
 907               && cur[0] == '*' && cur[1] != '/')
 908             {
 909               buffer->cur = cur;
 910               cpp_warning_with_line (pfile, CPP_W_COMMENTS,
 911                                      pfile->line_table->highest_line,
 912                                      CPP_BUF_COL (buffer),
 913                                      "\"/*\" within comment");
 914             }
 915         }
 916       else if (c == '\n')
 917         {
 918           unsigned int cols;
 919           buffer->cur = cur - 1;
 920           _cpp_process_line_notes (pfile, true);
 921           if (buffer->next_line >= buffer->rlimit)
 922             return true;
 923           _cpp_clean_line (pfile);
 924
 925           cols = buffer->next_line - buffer->line_base;
 926           CPP_INCREMENT_LINE (pfile, cols);
 927
 928           cur = buffer->cur;
 929         }
 930     }
 931
 932   buffer->cur = cur;
 933   _cpp_process_line_notes (pfile, true);
 934   return false;
 935 }
 936
 937 /* Skip a C++ line comment, leaving buffer->cur pointing to the
 938    terminating newline.  Handles escaped newlines.  Returns nonzero
 939    if a multiline comment.  */
 940 static int
 941 skip_line_comment (cpp_reader *pfile)
 942 {
 943   cpp_buffer *buffer = pfile->buffer;
 944   source_location orig_line = pfile->line_table->highest_line;
 945
 946   while (*buffer->cur != '\n')
 947     buffer->cur++;
 948
 949   _cpp_process_line_notes (pfile, true);
 950   return orig_line != pfile->line_table->highest_line;
 951 }
 952
 953 /* Skips whitespace, saving the next non-whitespace character.  */
 954 static void
 955 skip_whitespace (cpp_reader *pfile, cppchar_t c)
 956 {
 957   cpp_buffer *buffer = pfile->buffer;
 958   bool saw_NUL = false;
 959
 960   do
 961     {
 962       /* Horizontal space always OK.  */
 963       if (c == ' ' || c == '\t')
 964         ;
 965       /* Just \f \v or \0 left.  */
 966       else if (c == '\0')
 967         saw_NUL = true;
 968       else if (pfile->state.in_directive && CPP_PEDANTIC (pfile))
 969         cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line,
 970                              CPP_BUF_COL (buffer),
 971                              "%s in preprocessing directive",
 972                              c == '\f' ? "form feed" : "vertical tab");
 973
 974       c = *buffer->cur++;
 975     }
 976   /* We only want non-vertical space, i.e. ' ' \t \f \v \0.  */
 977   while (is_nvspace (c));
 978
 979   if (saw_NUL)
 980     cpp_error (pfile, CPP_DL_WARNING, "null character(s) ignored");
 981
 982   buffer->cur--;
 983 }
 984
 985 /* See if the characters of a number token are valid in a name (no
 986    '.', '+' or '-').  */
 987 static int
 988 name_p (cpp_reader *pfile, const cpp_string *string)
 989 {
 990   unsigned int i;
 991
 992   for (i = 0; i < string->len; i++)
 993     if (!is_idchar (string->text[i]))
 994       return 0;
 995
 996   return 1;
 997 }
 998
 999 /* After parsing an identifier or other sequence, produce a warning about
1000    sequences not in NFC/NFKC.  */
1001 static void
1002 warn_about_normalization (cpp_reader *pfile,
1003                           const cpp_token *token,
1004                           const struct normalize_state *s)
1005 {
1006   if (CPP_OPTION (pfile, warn_normalize) < NORMALIZE_STATE_RESULT (s)
1007       && !pfile->state.skipping)
1008     {
1009       /* Make sure that the token is printed using UCNs, even
1010          if we'd otherwise happily print UTF-8.  */
1011       unsigned char *buf = XNEWVEC (unsigned char, cpp_token_len (token));
1012       size_t sz;
1013
1014       sz = cpp_spell_token (pfile, token, buf, false) - buf;
1015       if (NORMALIZE_STATE_RESULT (s) == normalized_C)
1016         cpp_warning_with_line (pfile, CPP_W_NORMALIZE, token->src_loc, 0,
1017                                "`%.*s' is not in NFKC", (int) sz, buf);
1018       else
1019         cpp_warning_with_line (pfile, CPP_W_NORMALIZE, token->src_loc, 0,
1020                                "`%.*s' is not in NFC", (int) sz, buf);
1021     }
1022 }
1023
1024 /* Returns TRUE if the sequence starting at buffer->cur is invalid in
1025    an identifier.  FIRST is TRUE if this starts an identifier.  */
1026 static bool
1027 forms_identifier_p (cpp_reader *pfile, int first,
1028                     struct normalize_state *state)
1029 {
1030   cpp_buffer *buffer = pfile->buffer;
1031
1032   if (*buffer->cur == '$')
1033     {
1034       if (!CPP_OPTION (pfile, dollars_in_ident))
1035         return false;
1036
1037       buffer->cur++;
1038       if (CPP_OPTION (pfile, warn_dollars) && !pfile->state.skipping)
1039         {
1040           CPP_OPTION (pfile, warn_dollars) = 0;
1041           cpp_error (pfile, CPP_DL_PEDWARN, "'$' in identifier or number");
1042         }
1043
1044       return true;
1045     }
1046
1047   /* Is this a syntactically valid UCN?  */
1048   if (CPP_OPTION (pfile, extended_identifiers)
1049       && *buffer->cur == '\\'
1050       && (buffer->cur[1] == 'u' || buffer->cur[1] == 'U'))
1051     {
1052       buffer->cur += 2;
1053       if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
1054                           state))
1055         return true;
1056       buffer->cur -= 2;
1057     }
1058
1059   return false;
1060 }
1061
1062 /* Helper function to get the cpp_hashnode of the identifier BASE.  */
1063 static cpp_hashnode *
1064 lex_identifier_intern (cpp_reader *pfile, const uchar *base)
1065 {
1066   cpp_hashnode *result;
1067   const uchar *cur;
1068   unsigned int len;
1069   unsigned int hash = HT_HASHSTEP (0, *base);
1070
1071   cur = base + 1;
1072   while (ISIDNUM (*cur))
1073     {
1074       hash = HT_HASHSTEP (hash, *cur);
1075       cur++;
1076     }
1077   len = cur - base;
1078   hash = HT_HASHFINISH (hash, len);
1079   result = cpp_lookup_with_hash (pfile, base, len, hash);
1080
1081   /* Rarely, identifiers require diagnostics when lexed.  */
1082   if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
1083                         && !pfile->state.skipping, 0))
1084     {
1085       /* It is allowed to poison the same identifier twice.  */
1086       if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
1087         cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
1088                    NODE_NAME (result));
1089
1090       /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
1091          replacement list of a variadic macro.  */
1092       if (result == pfile->spec_nodes.n__VA_ARGS__
1093           && !pfile->state.va_args_ok)
1094         cpp_error (pfile, CPP_DL_PEDWARN,
1095                    "__VA_ARGS__ can only appear in the expansion"
1096                    " of a C99 variadic macro");
1097
1098       /* For -Wc++-compat, warn about use of C++ named operators.  */
1099       if (result->flags & NODE_WARN_OPERATOR)
1100         cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
1101                      "identifier \"%s\" is a special operator name in C++",
1102                      NODE_NAME (result));
1103     }
1104
1105   return result;
1106 }
1107
1108 /* Get the cpp_hashnode of an identifier specified by NAME in
1109    the current cpp_reader object.  If none is found, NULL is returned.  */
1110 cpp_hashnode *
1111 _cpp_lex_identifier (cpp_reader *pfile, const char *name)
1112 {
1113   cpp_hashnode *result;
1114   result = lex_identifier_intern (pfile, (uchar *) name);
1115   return result;
1116 }
1117
1118 /* Lex an identifier starting at BUFFER->CUR - 1.  */
1119 static cpp_hashnode *
1120 lex_identifier (cpp_reader *pfile, const uchar *base, bool starts_ucn,
1121                 struct normalize_state *nst)
1122 {
1123   cpp_hashnode *result;
1124   const uchar *cur;
1125   unsigned int len;
1126   unsigned int hash = HT_HASHSTEP (0, *base);
1127
1128   cur = pfile->buffer->cur;
1129   if (! starts_ucn)
1130     while (ISIDNUM (*cur))
1131       {
1132         hash = HT_HASHSTEP (hash, *cur);
1133         cur++;
1134       }
1135   pfile->buffer->cur = cur;
1136   if (starts_ucn || forms_identifier_p (pfile, false, nst))
1137     {
1138       /* Slower version for identifiers containing UCNs (or $).  */
1139       do {
1140         while (ISIDNUM (*pfile->buffer->cur))
1141           {
1142             pfile->buffer->cur++;
1143             NORMALIZE_STATE_UPDATE_IDNUM (nst);
1144           }
1145       } while (forms_identifier_p (pfile, false, nst));
1146       result = _cpp_interpret_identifier (pfile, base,
1147                                           pfile->buffer->cur - base);
1148     }
1149   else
1150     {
1151       len = cur - base;
1152       hash = HT_HASHFINISH (hash, len);
1153
1154       result = cpp_lookup_with_hash (pfile, base, len, hash);
1155     }
1156
1157   /* Rarely, identifiers require diagnostics when lexed.  */
1158   if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
1159                         && !pfile->state.skipping, 0))
1160     {
1161       /* It is allowed to poison the same identifier twice.  */
1162       if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
1163         cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
1164                    NODE_NAME (result));
1165
1166       /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
1167          replacement list of a variadic macro.  */
1168       if (result == pfile->spec_nodes.n__VA_ARGS__
1169           && !pfile->state.va_args_ok)
1170         cpp_error (pfile, CPP_DL_PEDWARN,
1171                    "__VA_ARGS__ can only appear in the expansion"
1172                    " of a C99 variadic macro");
1173
1174       /* For -Wc++-compat, warn about use of C++ named operators.  */
1175       if (result->flags & NODE_WARN_OPERATOR)
1176         cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
1177                      "identifier \"%s\" is a special operator name in C++",
1178                      NODE_NAME (result));
1179     }
1180
1181   return result;
1182 }
1183
1184 /* Lex a number to NUMBER starting at BUFFER->CUR - 1.  */
1185 static void
1186 lex_number (cpp_reader *pfile, cpp_string *number,
1187             struct normalize_state *nst)
1188 {
1189   const uchar *cur;
1190   const uchar *base;
1191   uchar *dest;
1192
1193   base = pfile->buffer->cur - 1;
1194   do
1195     {
1196       cur = pfile->buffer->cur;
1197
1198       /* N.B. ISIDNUM does not include $.  */
1199       while (ISIDNUM (*cur) || *cur == '.' || VALID_SIGN (*cur, cur[-1]))
1200         {
1201           cur++;
1202           NORMALIZE_STATE_UPDATE_IDNUM (nst);
1203         }
1204
1205       pfile->buffer->cur = cur;
1206     }
1207   while (forms_identifier_p (pfile, false, nst));
1208
1209   number->len = cur - base;
1210   dest = _cpp_unaligned_alloc (pfile, number->len + 1);
1211   memcpy (dest, base, number->len);
1212   dest[number->len] = '\0';
1213   number->text = dest;
1214 }
1215
1216 /* Create a token of type TYPE with a literal spelling.  */
1217 static void
1218 create_literal (cpp_reader *pfile, cpp_token *token, const uchar *base,
1219                 unsigned int len, enum cpp_ttype type)
1220 {
1221   uchar *dest = _cpp_unaligned_alloc (pfile, len + 1);
1222
1223   memcpy (dest, base, len);
1224   dest[len] = '\0';
1225   token->type = type;
1226   token->val.str.len = len;
1227   token->val.str.text = dest;
1228 }
1229
1230 /* Subroutine of lex_raw_string: Append LEN chars from BASE to the buffer
1231    sequence from *FIRST_BUFF_P to LAST_BUFF_P.  */
1232
1233 static void
1234 bufring_append (cpp_reader *pfile, const uchar *base, size_t len,
1235                 _cpp_buff **first_buff_p, _cpp_buff **last_buff_p)
1236 {
1237   _cpp_buff *first_buff = *first_buff_p;
1238   _cpp_buff *last_buff = *last_buff_p;
1239
1240   if (first_buff == NULL)
1241     first_buff = last_buff = _cpp_get_buff (pfile, len);
1242   else if (len > BUFF_ROOM (last_buff))
1243     {
1244       size_t room = BUFF_ROOM (last_buff);
1245       memcpy (BUFF_FRONT (last_buff), base, room);
1246       BUFF_FRONT (last_buff) += room;
1247       base += room;
1248       len -= room;
1249       last_buff = _cpp_append_extend_buff (pfile, last_buff, len);
1250     }
1251
1252   memcpy (BUFF_FRONT (last_buff), base, len);
1253   BUFF_FRONT (last_buff) += len;
1254
1255   *first_buff_p = first_buff;
1256   *last_buff_p = last_buff;
1257 }
1258
1259 /* Lexes a raw string.  The stored string contains the spelling, including
1260    double quotes, delimiter string, '(' and ')', any leading
1261    'L', 'u', 'U' or 'u8' and 'R' modifier.  It returns the type of the
1262    literal, or CPP_OTHER if it was not properly terminated.
1263
1264    The spelling is NUL-terminated, but it is not guaranteed that this
1265    is the first NUL since embedded NULs are preserved.  */
1266
1267 static void
1268 lex_raw_string (cpp_reader *pfile, cpp_token *token, const uchar *base,
1269                 const uchar *cur)
1270 {
1271   source_location saw_NUL = 0;
1272   const uchar *raw_prefix;
1273   unsigned int raw_prefix_len = 0;
1274   enum cpp_ttype type;
1275   size_t total_len = 0;
1276   _cpp_buff *first_buff = NULL, *last_buff = NULL;
1277   _cpp_line_note *note = &pfile->buffer->notes[pfile->buffer->cur_note];
1278
1279   type = (*base == 'L' ? CPP_WSTRING :
1280           *base == 'U' ? CPP_STRING32 :
1281           *base == 'u' ? (base[1] == '8' ? CPP_UTF8STRING : CPP_STRING16)
1282           : CPP_STRING);
1283
1284   raw_prefix = cur + 1;
1285   while (raw_prefix_len < 16)
1286     {
1287       switch (raw_prefix[raw_prefix_len])
1288         {
1289         case ' ': case '(': case ')': case '\\': case '\t':
1290         case '\v': case '\f': case '\n': default:
1291           break;
1292         /* Basic source charset except the above chars.  */
1293         case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
1294         case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
1295         case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
1296         case 's': case 't': case 'u': case 'v': case 'w': case 'x':
1297         case 'y': case 'z':
1298         case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
1299         case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
1300         case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
1301         case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
1302         case 'Y': case 'Z':
1303         case '0': case '1': case '2': case '3': case '4': case '5':
1304         case '6': case '7': case '8': case '9':
1305         case '_': case '{': case '}': case '#': case '[': case ']':
1306         case '<': case '>': case '%': case ':': case ';': case '.':
1307         case '?': case '*': case '+': case '-': case '/': case '^':
1308         case '&': case '|': case '~': case '!': case '=': case ',':
1309         case '"': case '\'':
1310           raw_prefix_len++;
1311           continue;
1312         }
1313       break;
1314     }
1315
1316   if (raw_prefix[raw_prefix_len] != '(')
1317     {
1318       int col = CPP_BUF_COLUMN (pfile->buffer, raw_prefix + raw_prefix_len)
1319                 + 1;
1320       if (raw_prefix_len == 16)
1321         cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, col,
1322                              "raw string delimiter longer than 16 characters");
1323       else
1324         cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, col,
1325                              "invalid character '%c' in raw string delimiter",
1326                              (int) raw_prefix[raw_prefix_len]);
1327       pfile->buffer->cur = raw_prefix - 1;
1328       create_literal (pfile, token, base, raw_prefix - 1 - base, CPP_OTHER);
1329       return;
1330     }
1331
1332   cur = raw_prefix + raw_prefix_len + 1;
1333   for (;;)
1334     {
1335 #define BUF_APPEND(STR,LEN)                                     \
1336       do {                                                      \
1337         bufring_append (pfile, (const uchar *)(STR), (LEN),     \
1338                         &first_buff, &last_buff);               \
1339         total_len += (LEN);                                     \
1340       } while (0);
1341
1342       cppchar_t c;
1343
1344       /* If we previously performed any trigraph or line splicing
1345          transformations, undo them within the body of the raw string.  */
1346       while (note->pos < cur)
1347         ++note;
1348       for (; note->pos == cur; ++note)
1349         {
1350           switch (note->type)
1351             {
1352             case '\\':
1353             case ' ':
1354               /* Restore backslash followed by newline.  */
1355               BUF_APPEND (base, cur - base);
1356               base = cur;
1357               BUF_APPEND ("\\", 1);
1358             after_backslash:
1359               if (note->type == ' ')
1360                 {
1361                   /* GNU backslash whitespace newline extension.  FIXME
1362                      could be any sequence of non-vertical space.  When we
1363                      can properly restore any such sequence, we should mark
1364                      this note as handled so _cpp_process_line_notes
1365                      doesn't warn.  */
1366                   BUF_APPEND (" ", 1);
1367                 }
1368
1369               BUF_APPEND ("\n", 1);
1370               break;
1371
1372             case 0:
1373               /* Already handled.  */
1374               break;
1375
1376             default:
1377               if (_cpp_trigraph_map[note->type])
1378                 {
1379                   /* Don't warn about this trigraph in
1380                      _cpp_process_line_notes, since trigraphs show up as
1381                      trigraphs in raw strings.  */
1382                   uchar type = note->type;
1383                   note->type = 0;
1384
1385                   if (!CPP_OPTION (pfile, trigraphs))
1386                     /* If we didn't convert the trigraph in the first
1387                        place, don't do anything now either.  */
1388                     break;
1389
1390                   BUF_APPEND (base, cur - base);
1391                   base = cur;
1392                   BUF_APPEND ("??", 2);
1393
1394                   /* ??/ followed by newline gets two line notes, one for
1395                      the trigraph and one for the backslash/newline.  */
1396                   if (type == '/' && note[1].pos == cur)
1397                     {
1398                       if (note[1].type != '\\'
1399                           && note[1].type != ' ')
1400                         abort ();
1401                       BUF_APPEND ("/", 1);
1402                       ++note;
1403                       goto after_backslash;
1404                     }
1405                   /* The ) from ??) could be part of the suffix.  */
1406                   else if (type == ')'
1407                            && strncmp ((const char *) cur+1,
1408                                        (const char *) raw_prefix,
1409                                        raw_prefix_len) == 0
1410                            && cur[raw_prefix_len+1] == '"')
1411                     {
1412                       BUF_APPEND (")", 1);
1413                       base++;
1414                       cur += raw_prefix_len + 2;
1415                       goto break_outer_loop;
1416                     }
1417                   else
1418                     {
1419                       /* Skip the replacement character.  */
1420                       base = ++cur;
1421                       BUF_APPEND (&type, 1);
1422                     }
1423                 }
1424               else
1425                 abort ();
1426               break;
1427             }
1428         }
1429       c = *cur++;
1430
1431       if (c == ')'
1432           && strncmp ((const char *) cur, (const char *) raw_prefix,
1433                       raw_prefix_len) == 0
1434           && cur[raw_prefix_len] == '"')
1435         {
1436           cur += raw_prefix_len + 1;
1437           break;
1438         }
1439       else if (c == '\n')
1440         {
1441           if (pfile->state.in_directive
1442               || pfile->state.parsing_args
1443               || pfile->state.in_deferred_pragma)
1444             {
1445               cur--;
1446               type = CPP_OTHER;
1447               cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, 0,
1448                                    "unterminated raw string");
1449               break;
1450             }
1451
1452           BUF_APPEND (base, cur - base);
1453
1454           if (pfile->buffer->cur < pfile->buffer->rlimit)
1455             CPP_INCREMENT_LINE (pfile, 0);
1456           pfile->buffer->need_line = true;
1457
1458           pfile->buffer->cur = cur-1;
1459           _cpp_process_line_notes (pfile, false);
1460           if (!_cpp_get_fresh_line (pfile))
1461             {
1462               source_location src_loc = token->src_loc;
1463               token->type = CPP_EOF;
1464               /* Tell the compiler the line number of the EOF token.  */
1465               token->src_loc = pfile->line_table->highest_line;
1466               token->flags = BOL;
1467               if (first_buff != NULL)
1468                 _cpp_release_buff (pfile, first_buff);
1469               cpp_error_with_line (pfile, CPP_DL_ERROR, src_loc, 0,
1470                                    "unterminated raw string");
1471               return;
1472             }
1473
1474           cur = base = pfile->buffer->cur;
1475           note = &pfile->buffer->notes[pfile->buffer->cur_note];
1476         }
1477       else if (c == '\0' && !saw_NUL)
1478         LINEMAP_POSITION_FOR_COLUMN (saw_NUL, pfile->line_table,
1479                                      CPP_BUF_COLUMN (pfile->buffer, cur));
1480     }
1481  break_outer_loop:
1482
1483   if (saw_NUL && !pfile->state.skipping)
1484     cpp_error_with_line (pfile, CPP_DL_WARNING, saw_NUL, 0,
1485                "null character(s) preserved in literal");
1486
1487   pfile->buffer->cur = cur;
1488   if (first_buff == NULL)
1489     create_literal (pfile, token, base, cur - base, type);
1490   else
1491     {
1492       uchar *dest = _cpp_unaligned_alloc (pfile, total_len + (cur - base) + 1);
1493
1494       token->type = type;
1495       token->val.str.len = total_len + (cur - base);
1496       token->val.str.text = dest;
1497       last_buff = first_buff;
1498       while (last_buff != NULL)
1499         {
1500           memcpy (dest, last_buff->base,
1501                   BUFF_FRONT (last_buff) - last_buff->base);
1502           dest += BUFF_FRONT (last_buff) - last_buff->base;
1503           last_buff = last_buff->next;
1504         }
1505       _cpp_release_buff (pfile, first_buff);
1506       memcpy (dest, base, cur - base);
1507       dest[cur - base] = '\0';
1508     }
1509 }
1510
1511 /* Lexes a string, character constant, or angle-bracketed header file
1512    name.  The stored string contains the spelling, including opening
1513    quote and any leading 'L', 'u', 'U' or 'u8' and optional
1514    'R' modifier.  It returns the type of the literal, or CPP_OTHER
1515    if it was not properly terminated, or CPP_LESS for an unterminated
1516    header name which must be relexed as normal tokens.
1517
1518    The spelling is NUL-terminated, but it is not guaranteed that this
1519    is the first NUL since embedded NULs are preserved.  */
1520 static void
1521 lex_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
1522 {
1523   bool saw_NUL = false;
1524   const uchar *cur;
1525   cppchar_t terminator;
1526   enum cpp_ttype type;
1527
1528   cur = base;
1529   terminator = *cur++;
1530   if (terminator == 'L' || terminator == 'U')
1531     terminator = *cur++;
1532   else if (terminator == 'u')
1533     {
1534       terminator = *cur++;
1535       if (terminator == '8')
1536         terminator = *cur++;
1537     }
1538   if (terminator == 'R')
1539     {
1540       lex_raw_string (pfile, token, base, cur);
1541       return;
1542     }
1543   if (terminator == '"')
1544     type = (*base == 'L' ? CPP_WSTRING :
1545             *base == 'U' ? CPP_STRING32 :
1546             *base == 'u' ? (base[1] == '8' ? CPP_UTF8STRING : CPP_STRING16)
1547                          : CPP_STRING);
1548   else if (terminator == '\'')
1549     type = (*base == 'L' ? CPP_WCHAR :
1550             *base == 'U' ? CPP_CHAR32 :
1551             *base == 'u' ? CPP_CHAR16 : CPP_CHAR);
1552   else
1553     terminator = '>', type = CPP_HEADER_NAME;
1554
1555   for (;;)
1556     {
1557       cppchar_t c = *cur++;
1558
1559       /* In #include-style directives, terminators are not escapable.  */
1560       if (c == '\\' && !pfile->state.angled_headers && *cur != '\n')
1561         cur++;
1562       else if (c == terminator)
1563         break;
1564       else if (c == '\n')
1565         {
1566           cur--;
1567           /* Unmatched quotes always yield undefined behavior, but
1568              greedy lexing means that what appears to be an unterminated
1569              header name may actually be a legitimate sequence of tokens.  */
1570           if (terminator == '>')
1571             {
1572               token->type = CPP_LESS;
1573               return;
1574             }
1575           type = CPP_OTHER;
1576           break;
1577         }
1578       else if (c == '\0')
1579         saw_NUL = true;
1580     }
1581
1582   if (saw_NUL && !pfile->state.skipping)
1583     cpp_error (pfile, CPP_DL_WARNING,
1584                "null character(s) preserved in literal");
1585
1586   if (type == CPP_OTHER && CPP_OPTION (pfile, lang) != CLK_ASM)
1587     cpp_error (pfile, CPP_DL_PEDWARN, "missing terminating %c character",
1588                (int) terminator);
1589
1590   pfile->buffer->cur = cur;
1591   create_literal (pfile, token, base, cur - base, type);
1592 }
1593
1594 /* Return the comment table. The client may not make any assumption
1595    about the ordering of the table.  */
1596 cpp_comment_table *
1597 cpp_get_comments (cpp_reader *pfile)
1598 {
1599   return &pfile->comments;
1600 }
1601
1602 /* Append a comment to the end of the comment table. */
1603 static void
1604 store_comment (cpp_reader *pfile, cpp_token *token)
1605 {
1606   int len;
1607
1608   if (pfile->comments.allocated == 0)
1609     {
1610       pfile->comments.allocated = 256;
1611       pfile->comments.entries = (cpp_comment *) xmalloc
1612         (pfile->comments.allocated * sizeof (cpp_comment));
1613     }
1614
1615   if (pfile->comments.count == pfile->comments.allocated)
1616     {
1617       pfile->comments.allocated *= 2;
1618       pfile->comments.entries = (cpp_comment *) xrealloc
1619         (pfile->comments.entries,
1620          pfile->comments.allocated * sizeof (cpp_comment));
1621     }
1622
1623   len = token->val.str.len;
1624
1625   /* Copy comment. Note, token may not be NULL terminated. */
1626   pfile->comments.entries[pfile->comments.count].comment =
1627     (char *) xmalloc (sizeof (char) * (len + 1));
1628   memcpy (pfile->comments.entries[pfile->comments.count].comment,
1629           token->val.str.text, len);
1630   pfile->comments.entries[pfile->comments.count].comment[len] = '\0';
1631
1632   /* Set source location. */
1633   pfile->comments.entries[pfile->comments.count].sloc = token->src_loc;
1634
1635   /* Increment the count of entries in the comment table. */
1636   pfile->comments.count++;
1637 }
1638
1639 /* The stored comment includes the comment start and any terminator.  */
1640 static void
1641 save_comment (cpp_reader *pfile, cpp_token *token, const unsigned char *from,
1642               cppchar_t type)
1643 {
1644   unsigned char *buffer;
1645   unsigned int len, clen, i;
1646
1647   len = pfile->buffer->cur - from + 1; /* + 1 for the initial '/'.  */
1648
1649   /* C++ comments probably (not definitely) have moved past a new
1650      line, which we don't want to save in the comment.  */
1651   if (is_vspace (pfile->buffer->cur[-1]))
1652     len--;
1653
1654   /* If we are currently in a directive or in argument parsing, then
1655      we need to store all C++ comments as C comments internally, and
1656      so we need to allocate a little extra space in that case.
1657
1658      Note that the only time we encounter a directive here is
1659      when we are saving comments in a "#define".  */
1660   clen = ((pfile->state.in_directive || pfile->state.parsing_args)
1661           && type == '/') ? len + 2 : len;
1662
1663   buffer = _cpp_unaligned_alloc (pfile, clen);
1664
1665   token->type = CPP_COMMENT;
1666   token->val.str.len = clen;
1667   token->val.str.text = buffer;
1668
1669   buffer[0] = '/';
1670   memcpy (buffer + 1, from, len - 1);
1671
1672   /* Finish conversion to a C comment, if necessary.  */
1673   if ((pfile->state.in_directive || pfile->state.parsing_args) && type == '/')
1674     {
1675       buffer[1] = '*';
1676       buffer[clen - 2] = '*';
1677       buffer[clen - 1] = '/';
1678       /* As there can be in a C++ comments illegal sequences for C comments
1679          we need to filter them out.  */
1680       for (i = 2; i < (clen - 2); i++)
1681         if (buffer[i] == '/' && (buffer[i - 1] == '*' || buffer[i + 1] == '*'))
1682           buffer[i] = '|';
1683     }
1684
1685   /* Finally store this comment for use by clients of libcpp. */
1686   store_comment (pfile, token);
1687 }
1688
1689 /* Allocate COUNT tokens for RUN.  */
1690 void
1691 _cpp_init_tokenrun (tokenrun *run, unsigned int count)
1692 {
1693   run->base = XNEWVEC (cpp_token, count);
1694   run->limit = run->base + count;
1695   run->next = NULL;
1696 }
1697
1698 /* Returns the next tokenrun, or creates one if there is none.  */
1699 static tokenrun *
1700 next_tokenrun (tokenrun *run)
1701 {
1702   if (run->next == NULL)
1703     {
1704       run->next = XNEW (tokenrun);
1705       run->next->prev = run;
1706       _cpp_init_tokenrun (run->next, 250);
1707     }
1708
1709   return run->next;
1710 }
1711
1712 /* Look ahead in the input stream.  */
1713 const cpp_token *
1714 cpp_peek_token (cpp_reader *pfile, int index)
1715 {
1716   cpp_context *context = pfile->context;
1717   const cpp_token *peektok;
1718   int count;
1719
1720   /* First, scan through any pending cpp_context objects.  */
1721   while (context->prev)
1722     {
1723       ptrdiff_t sz = (context->direct_p
1724                       ? LAST (context).token - FIRST (context).token
1725                       : LAST (context).ptoken - FIRST (context).ptoken);
1726
1727       if (index < (int) sz)
1728         return (context->direct_p
1729                 ? FIRST (context).token + index
1730                 : *(FIRST (context).ptoken + index));
1731
1732       index -= (int) sz;
1733       context = context->prev;
1734     }
1735
1736   /* We will have to read some new tokens after all (and do so
1737      without invalidating preceding tokens).  */
1738   count = index;
1739   pfile->keep_tokens++;
1740
1741   do
1742     {
1743       peektok = _cpp_lex_token (pfile);
1744       if (peektok->type == CPP_EOF)
1745         return peektok;
1746     }
1747   while (index--);
1748
1749   _cpp_backup_tokens_direct (pfile, count + 1);
1750   pfile->keep_tokens--;
1751
1752   return peektok;
1753 }
1754
1755 /* Allocate a single token that is invalidated at the same time as the
1756    rest of the tokens on the line.  Has its line and col set to the
1757    same as the last lexed token, so that diagnostics appear in the
1758    right place.  */
1759 cpp_token *
1760 _cpp_temp_token (cpp_reader *pfile)
1761 {
1762   cpp_token *old, *result;
1763   ptrdiff_t sz = pfile->cur_run->limit - pfile->cur_token;
1764   ptrdiff_t la = (ptrdiff_t) pfile->lookaheads;
1765
1766   old = pfile->cur_token - 1;
1767   /* Any pre-existing lookaheads must not be clobbered.  */
1768   if (la)
1769     {
1770       if (sz <= la)
1771         {
1772           tokenrun *next = next_tokenrun (pfile->cur_run);
1773
1774           if (sz < la)
1775             memmove (next->base + 1, next->base,
1776                      (la - sz) * sizeof (cpp_token));
1777
1778           next->base[0] = pfile->cur_run->limit[-1];
1779         }
1780
1781       if (sz > 1)
1782         memmove (pfile->cur_token + 1, pfile->cur_token,
1783                  MIN (la, sz - 1) * sizeof (cpp_token));
1784     }
1785
1786   if (!sz && pfile->cur_token == pfile->cur_run->limit)
1787     {
1788       pfile->cur_run = next_tokenrun (pfile->cur_run);
1789       pfile->cur_token = pfile->cur_run->base;
1790     }
1791
1792   result = pfile->cur_token++;
1793   result->src_loc = old->src_loc;
1794   return result;
1795 }
1796
1797 /* Lex a token into RESULT (external interface).  Takes care of issues
1798    like directive handling, token lookahead, multiple include
1799    optimization and skipping.  */
1800 const cpp_token *
1801 _cpp_lex_token (cpp_reader *pfile)
1802 {
1803   cpp_token *result;
1804
1805   result = NULL;
1806   for (;;)
1807     {
1808       if (pfile->cur_token == pfile->cur_run->limit)
1809         {
1810           pfile->cur_run = next_tokenrun (pfile->cur_run);
1811           pfile->cur_token = pfile->cur_run->base;
1812         }
1813       /* We assume that the current token is somewhere in the current
1814          run.  */
1815       if (pfile->cur_token < pfile->cur_run->base
1816           || pfile->cur_token >= pfile->cur_run->limit)
1817         abort ();
1818
1819       if (pfile->lookaheads)
1820         {
1821           pfile->lookaheads--;
1822           result = pfile->cur_token++;
1823         }
1824       else
1825         result = _cpp_lex_direct (pfile);
1826
1827       if (result->flags & BOL)
1828         {
1829           /* Is this a directive.  If _cpp_handle_directive returns
1830              false, it is an assembler #.  */
1831           if (result->type == CPP_HASH
1832               /* 6.10.3 p 11: Directives in a list of macro arguments
1833                  gives undefined behavior.  This implementation
1834                  handles the directive as normal.  */
1835               && pfile->state.parsing_args != 1)
1836             {
1837               if (_cpp_handle_directive (pfile, result->flags & PREV_WHITE))
1838                 {
1839                   if (pfile->directive_result.type == CPP_PADDING)
1840                     continue;
1841                   result = &pfile->directive_result;
1842                 }
1843             }
1844           else if (pfile->state.in_deferred_pragma)
1845             result = &pfile->directive_result;
1846
1847           if (pfile->cb.line_change && !pfile->state.skipping)
1848             pfile->cb.line_change (pfile, result, pfile->state.parsing_args);
1849         }
1850
1851       /* We don't skip tokens in directives.  */
1852       if (pfile->state.in_directive || pfile->state.in_deferred_pragma)
1853         break;
1854
1855       /* Outside a directive, invalidate controlling macros.  At file
1856          EOF, _cpp_lex_direct takes care of popping the buffer, so we never
1857          get here and MI optimization works.  */
1858       pfile->mi_valid = false;
1859
1860       if (!pfile->state.skipping || result->type == CPP_EOF)
1861         break;
1862     }
1863
1864   return result;
1865 }
1866
1867 /* Returns true if a fresh line has been loaded.  */
1868 bool
1869 _cpp_get_fresh_line (cpp_reader *pfile)
1870 {
1871   int return_at_eof;
1872
1873   /* We can't get a new line until we leave the current directive.  */
1874   if (pfile->state.in_directive)
1875     return false;
1876
1877   for (;;)
1878     {
1879       cpp_buffer *buffer = pfile->buffer;
1880
1881       if (!buffer->need_line)
1882         return true;
1883
1884       if (buffer->next_line < buffer->rlimit)
1885         {
1886           _cpp_clean_line (pfile);
1887           return true;
1888         }
1889
1890       /* First, get out of parsing arguments state.  */
1891       if (pfile->state.parsing_args)
1892         return false;
1893
1894       /* End of buffer.  Non-empty files should end in a newline.  */
1895       if (buffer->buf != buffer->rlimit
1896           && buffer->next_line > buffer->rlimit
1897           && !buffer->from_stage3)
1898         {
1899           /* Clip to buffer size.  */
1900           buffer->next_line = buffer->rlimit;
1901         }
1902
1903       return_at_eof = buffer->return_at_eof;
1904       _cpp_pop_buffer (pfile);
1905       if (pfile->buffer == NULL || return_at_eof)
1906         return false;
1907     }
1908 }
1909
/* If the next character in the buffer is CHAR, consume it and give
   the token the type THEN_TYPE; otherwise consume nothing and give it
   the type ELSE_TYPE.  Expects `buffer' and `result' to be in scope
   at the point of use.  */
#define IF_NEXT_IS(CHAR, THEN_TYPE, ELSE_TYPE)          \
  do                                                    \
    {                                                   \
      if (*buffer->cur == (CHAR))                       \
        {                                               \
          buffer->cur++;                                \
          result->type = (THEN_TYPE);                   \
        }                                               \
      else                                              \
        result->type = (ELSE_TYPE);                     \
    }                                                   \
  while (0)
1918
1919 /* Lex a token into pfile->cur_token, which is also incremented, to
1920    get diagnostics pointing to the correct location.
1921
1922    Does not handle issues such as token lookahead, multiple-include
1923    optimization, directives, skipping etc.  This function is only
1924    suitable for use by _cpp_lex_token, and in special cases like
1925    lex_expansion_token which doesn't care for any of these issues.
1926
1927    When meeting a newline, returns CPP_EOF if parsing a directive,
1928    otherwise returns to the start of the token buffer if permissible.
1929    Returns the location of the lexed token.  */
1930 cpp_token *
1931 _cpp_lex_direct (cpp_reader *pfile)
1932 {
1933   cppchar_t c;
1934   cpp_buffer *buffer;
1935   const unsigned char *comment_start;
1936   cpp_token *result = pfile->cur_token++;
1937
1938  fresh_line:
1939   result->flags = 0;
1940   buffer = pfile->buffer;
1941   if (buffer->need_line)
1942     {
1943       if (pfile->state.in_deferred_pragma)
1944         {
1945           result->type = CPP_PRAGMA_EOL;
1946           pfile->state.in_deferred_pragma = false;
1947           if (!pfile->state.pragma_allow_expansion)
1948             pfile->state.prevent_expansion--;
1949           return result;
1950         }
1951       if (!_cpp_get_fresh_line (pfile))
1952         {
1953           result->type = CPP_EOF;
1954           if (!pfile->state.in_directive)
1955             {
1956               /* Tell the compiler the line number of the EOF token.  */
1957               result->src_loc = pfile->line_table->highest_line;
1958               result->flags = BOL;
1959             }
1960           return result;
1961         }
1962       if (!pfile->keep_tokens)
1963         {
1964           pfile->cur_run = &pfile->base_run;
1965           result = pfile->base_run.base;
1966           pfile->cur_token = result + 1;
1967         }
1968       result->flags = BOL;
1969       if (pfile->state.parsing_args == 2)
1970         result->flags |= PREV_WHITE;
1971     }
1972   buffer = pfile->buffer;
1973  update_tokens_line:
1974   result->src_loc = pfile->line_table->highest_line;
1975
1976  skipped_white:
1977   if (buffer->cur >= buffer->notes[buffer->cur_note].pos
1978       && !pfile->overlaid_buffer)
1979     {
1980       _cpp_process_line_notes (pfile, false);
1981       result->src_loc = pfile->line_table->highest_line;
1982     }
1983   c = *buffer->cur++;
1984
1985   LINEMAP_POSITION_FOR_COLUMN (result->src_loc, pfile->line_table,
1986                                CPP_BUF_COLUMN (buffer, buffer->cur));
1987
1988   switch (c)
1989     {
1990     case ' ': case '\t': case '\f': case '\v': case '\0':
1991       result->flags |= PREV_WHITE;
1992       skip_whitespace (pfile, c);
1993       goto skipped_white;
1994
1995     case '\n':
1996       if (buffer->cur < buffer->rlimit)
1997         CPP_INCREMENT_LINE (pfile, 0);
1998       buffer->need_line = true;
1999       goto fresh_line;
2000
2001     case '0': case '1': case '2': case '3': case '4':
2002     case '5': case '6': case '7': case '8': case '9':
2003       {
2004         struct normalize_state nst = INITIAL_NORMALIZE_STATE;
2005         result->type = CPP_NUMBER;
2006         lex_number (pfile, &result->val.str, &nst);
2007         warn_about_normalization (pfile, result, &nst);
2008         break;
2009       }
2010
2011     case 'L':
2012     case 'u':
2013     case 'U':
2014     case 'R':
2015       /* 'L', 'u', 'U', 'u8' or 'R' may introduce wide characters,
2016          wide strings or raw strings.  */
2017       if (c == 'L' || CPP_OPTION (pfile, uliterals))
2018         {
2019           if ((*buffer->cur == '\'' && c != 'R')
2020               || *buffer->cur == '"'
2021               || (*buffer->cur == 'R'
2022                   && c != 'R'
2023                   && buffer->cur[1] == '"'
2024                   && CPP_OPTION (pfile, uliterals))
2025               || (*buffer->cur == '8'
2026                   && c == 'u'
2027                   && (buffer->cur[1] == '"'
2028                       || (buffer->cur[1] == 'R' && buffer->cur[2] == '"'))))
2029             {
2030               lex_string (pfile, result, buffer->cur - 1);
2031               break;
2032             }
2033         }
2034       /* Fall through.  */
2035
2036     case '_':
2037     case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
2038     case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
2039     case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
2040     case 's': case 't':           case 'v': case 'w': case 'x':
2041     case 'y': case 'z':
2042     case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
2043     case 'G': case 'H': case 'I': case 'J': case 'K':
2044     case 'M': case 'N': case 'O': case 'P': case 'Q':
2045     case 'S': case 'T':           case 'V': case 'W': case 'X':
2046     case 'Y': case 'Z':
2047       result->type = CPP_NAME;
2048       {
2049         struct normalize_state nst = INITIAL_NORMALIZE_STATE;
2050         result->val.node.node = lex_identifier (pfile, buffer->cur - 1, false,
2051                                                 &nst);
2052         warn_about_normalization (pfile, result, &nst);
2053       }
2054
2055       /* Convert named operators to their proper types.  */
2056       if (result->val.node.node->flags & NODE_OPERATOR)
2057         {
2058           result->flags |= NAMED_OP;
2059           result->type = (enum cpp_ttype) result->val.node.node->directive_index;
2060         }
2061       break;
2062
2063     case '\'':
2064     case '"':
2065       lex_string (pfile, result, buffer->cur - 1);
2066       break;
2067
2068     case '/':
2069       /* A potential block or line comment.  */
2070       comment_start = buffer->cur;
2071       c = *buffer->cur;
2072
2073       if (c == '*')
2074         {
2075           if (_cpp_skip_block_comment (pfile))
2076             cpp_error (pfile, CPP_DL_ERROR, "unterminated comment");
2077         }
2078       else if (c == '/' && (CPP_OPTION (pfile, cplusplus_comments)
2079                             || cpp_in_system_header (pfile)))
2080         {
2081           /* Warn about comments only if pedantically GNUC89, and not
2082              in system headers.  */
2083           if (CPP_OPTION (pfile, lang) == CLK_GNUC89 && CPP_PEDANTIC (pfile)
2084               && ! buffer->warned_cplusplus_comments)
2085             {
2086               cpp_error (pfile, CPP_DL_PEDWARN,
2087                          "C++ style comments are not allowed in ISO C90");
2088               cpp_error (pfile, CPP_DL_PEDWARN,
2089                          "(this will be reported only once per input file)");
2090               buffer->warned_cplusplus_comments = 1;
2091             }
2092
2093           if (skip_line_comment (pfile) && CPP_OPTION (pfile, warn_comments))
2094             cpp_warning (pfile, CPP_W_COMMENTS, "multi-line comment");
2095         }
2096       else if (c == '=')
2097         {
2098           buffer->cur++;
2099           result->type = CPP_DIV_EQ;
2100           break;
2101         }
2102       else
2103         {
2104           result->type = CPP_DIV;
2105           break;
2106         }
2107
2108       if (!pfile->state.save_comments)
2109         {
2110           result->flags |= PREV_WHITE;
2111           goto update_tokens_line;
2112         }
2113
2114       /* Save the comment as a token in its own right.  */
2115       save_comment (pfile, result, comment_start, c);
2116       break;
2117
2118     case '<':
2119       if (pfile->state.angled_headers)
2120         {
2121           lex_string (pfile, result, buffer->cur - 1);
2122           if (result->type != CPP_LESS)
2123             break;
2124         }
2125
2126       result->type = CPP_LESS;
2127       if (*buffer->cur == '=')
2128         buffer->cur++, result->type = CPP_LESS_EQ;
2129       else if (*buffer->cur == '<')
2130         {
2131           buffer->cur++;
2132           IF_NEXT_IS ('=', CPP_LSHIFT_EQ, CPP_LSHIFT);
2133         }
2134       else if (CPP_OPTION (pfile, digraphs))
2135         {
2136           if (*buffer->cur == ':')
2137             {
2138               buffer->cur++;
2139               result->flags |= DIGRAPH;
2140               result->type = CPP_OPEN_SQUARE;
2141             }
2142           else if (*buffer->cur == '%')
2143             {
2144               buffer->cur++;
2145               result->flags |= DIGRAPH;
2146               result->type = CPP_OPEN_BRACE;
2147             }
2148         }
2149       break;
2150
2151     case '>':
2152       result->type = CPP_GREATER;
2153       if (*buffer->cur == '=')
2154         buffer->cur++, result->type = CPP_GREATER_EQ;
2155       else if (*buffer->cur == '>')
2156         {
2157           buffer->cur++;
2158           IF_NEXT_IS ('=', CPP_RSHIFT_EQ, CPP_RSHIFT);
2159         }
2160       break;
2161
2162     case '%':
2163       result->type = CPP_MOD;
2164       if (*buffer->cur == '=')
2165         buffer->cur++, result->type = CPP_MOD_EQ;
2166       else if (CPP_OPTION (pfile, digraphs))
2167         {
2168           if (*buffer->cur == ':')
2169             {
2170               buffer->cur++;
2171               result->flags |= DIGRAPH;
2172               result->type = CPP_HASH;
2173               if (*buffer->cur == '%' && buffer->cur[1] == ':')
2174                 buffer->cur += 2, result->type = CPP_PASTE, result->val.token_no = 0;
2175             }
2176           else if (*buffer->cur == '>')
2177             {
2178               buffer->cur++;
2179               result->flags |= DIGRAPH;
2180               result->type = CPP_CLOSE_BRACE;
2181             }
2182         }
2183       break;
2184
2185     case '.':
2186       result->type = CPP_DOT;
2187       if (ISDIGIT (*buffer->cur))
2188         {
2189           struct normalize_state nst = INITIAL_NORMALIZE_STATE;
2190           result->type = CPP_NUMBER;
2191           lex_number (pfile, &result->val.str, &nst);
2192           warn_about_normalization (pfile, result, &nst);
2193         }
2194       else if (*buffer->cur == '.' && buffer->cur[1] == '.')
2195         buffer->cur += 2, result->type = CPP_ELLIPSIS;
2196       else if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
2197         buffer->cur++, result->type = CPP_DOT_STAR;
2198       break;
2199
2200     case '+':
2201       result->type = CPP_PLUS;
2202       if (*buffer->cur == '+')
2203         buffer->cur++, result->type = CPP_PLUS_PLUS;
2204       else if (*buffer->cur == '=')
2205         buffer->cur++, result->type = CPP_PLUS_EQ;
2206       break;
2207
2208     case '-':
2209       result->type = CPP_MINUS;
2210       if (*buffer->cur == '>')
2211         {
2212           buffer->cur++;
2213           result->type = CPP_DEREF;
2214           if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
2215             buffer->cur++, result->type = CPP_DEREF_STAR;
2216         }
2217       else if (*buffer->cur == '-')
2218         buffer->cur++, result->type = CPP_MINUS_MINUS;
2219       else if (*buffer->cur == '=')
2220         buffer->cur++, result->type = CPP_MINUS_EQ;
2221       break;
2222
2223     case '&':
2224       result->type = CPP_AND;
2225       if (*buffer->cur == '&')
2226         buffer->cur++, result->type = CPP_AND_AND;
2227       else if (*buffer->cur == '=')
2228         buffer->cur++, result->type = CPP_AND_EQ;
2229       break;
2230
2231     case '|':
2232       result->type = CPP_OR;
2233       if (*buffer->cur == '|')
2234         buffer->cur++, result->type = CPP_OR_OR;
2235       else if (*buffer->cur == '=')
2236         buffer->cur++, result->type = CPP_OR_EQ;
2237       break;
2238
2239     case ':':
2240       result->type = CPP_COLON;
2241       if (*buffer->cur == ':' && CPP_OPTION (pfile, cplusplus))
2242         buffer->cur++, result->type = CPP_SCOPE;
2243       else if (*buffer->cur == '>' && CPP_OPTION (pfile, digraphs))
2244         {
2245           buffer->cur++;
2246           result->flags |= DIGRAPH;
2247           result->type = CPP_CLOSE_SQUARE;
2248         }
2249       break;
2250
2251     case '*': IF_NEXT_IS ('=', CPP_MULT_EQ, CPP_MULT); break;
2252     case '=': IF_NEXT_IS ('=', CPP_EQ_EQ, CPP_EQ); break;
2253     case '!': IF_NEXT_IS ('=', CPP_NOT_EQ, CPP_NOT); break;
2254     case '^': IF_NEXT_IS ('=', CPP_XOR_EQ, CPP_XOR); break;
2255     case '#': IF_NEXT_IS ('#', CPP_PASTE, CPP_HASH); result->val.token_no = 0; break;
2256
2257     case '?': result->type = CPP_QUERY; break;
2258     case '~': result->type = CPP_COMPL; break;
2259     case ',': result->type = CPP_COMMA; break;
2260     case '(': result->type = CPP_OPEN_PAREN; break;
2261     case ')': result->type = CPP_CLOSE_PAREN; break;
2262     case '[': result->type = CPP_OPEN_SQUARE; break;
2263     case ']': result->type = CPP_CLOSE_SQUARE; break;
2264     case '{': result->type = CPP_OPEN_BRACE; break;
2265     case '}': result->type = CPP_CLOSE_BRACE; break;
2266     case ';': result->type = CPP_SEMICOLON; break;
2267
2268       /* @ is a punctuator in Objective-C.  */
2269     case '@': result->type = CPP_ATSIGN; break;
2270
2271     case '$':
2272     case '\\':
2273       {
2274         const uchar *base = --buffer->cur;
2275         struct normalize_state nst = INITIAL_NORMALIZE_STATE;
2276
2277         if (forms_identifier_p (pfile, true, &nst))
2278           {
2279             result->type = CPP_NAME;
2280             result->val.node.node = lex_identifier (pfile, base, true, &nst);
2281             warn_about_normalization (pfile, result, &nst);
2282             break;
2283           }
2284         buffer->cur++;
2285       }
2286
2287     default:
2288       create_literal (pfile, result, buffer->cur - 1, 1, CPP_OTHER);
2289       break;
2290     }
2291
2292   return result;
2293 }
2294
2295 /* An upper bound on the number of bytes needed to spell TOKEN.
2296    Does not include preceding whitespace.  */
2297 unsigned int
2298 cpp_token_len (const cpp_token *token)
2299 {
2300   unsigned int len;
2301
2302   switch (TOKEN_SPELL (token))
2303     {
2304     default:            len = 6;                                break;
2305     case SPELL_LITERAL: len = token->val.str.len;               break;
2306     case SPELL_IDENT:   len = NODE_LEN (token->val.node.node) * 10;     break;
2307     }
2308
2309   return len;
2310 }
2311
/* Decode the UTF-8 sequence that starts at NAME and write it to
   BUFFER as a \U escape followed by eight lower-case hexadecimal
   digits.  Exactly 10 bytes are written and no terminating NUL is
   added.  Returns the number of bytes of NAME that were consumed.
   Aborts if a continuation byte is not of the form 10xxxxxx.  */

static size_t
utf8_to_ucn (unsigned char *buffer, const unsigned char *name)
{
  static const char hex_digits[] = "0123456789abcdef";
  const unsigned char *src = name;
  unsigned char *dst = buffer;
  unsigned long code_point;
  unsigned int lead = *src;
  int seq_len = 0;
  int k;
  int shift;

  /* The number of leading one bits in the first byte is the length
     of the whole sequence.  */
  while (lead & 0x80)
    {
      seq_len++;
      lead <<= 1;
    }

  /* Payload bits of the first byte, then six more from every
     continuation byte.  */
  code_point = *src & (0x7F >> seq_len);
  for (k = 1; k < seq_len; k++)
    {
      unsigned int trail = *++src;

      /* Ill-formed UTF-8.  */
      if ((trail & ~0x3F) != 0x80)
        abort ();

      code_point = (code_point << 6) | (trail & 0x3F);
    }

  *dst++ = '\\';
  *dst++ = 'U';
  for (shift = 28; shift >= 0; shift -= 4)
    *dst++ = hex_digits[(code_point >> shift) & 0xF];

  return seq_len;
}
2345
2346 /* Given a token TYPE corresponding to a digraph, return a pointer to
2347    the spelling of the digraph.  */
2348 static const unsigned char *
2349 cpp_digraph2name (enum cpp_ttype type)
2350 {
2351   return digraph_spellings[(int) type - (int) CPP_FIRST_DIGRAPH];
2352 }
2353
2354 /* Write the spelling of a token TOKEN to BUFFER.  The buffer must
2355    already contain the enough space to hold the token's spelling.
2356    Returns a pointer to the character after the last character written.
2357    FORSTRING is true if this is to be the spelling after translation
2358    phase 1 (this is different for UCNs).
2359    FIXME: Would be nice if we didn't need the PFILE argument.  */
2360 unsigned char *
2361 cpp_spell_token (cpp_reader *pfile, const cpp_token *token,
2362                  unsigned char *buffer, bool forstring)
2363 {
2364   switch (TOKEN_SPELL (token))
2365     {
2366     case SPELL_OPERATOR:
2367       {
2368         const unsigned char *spelling;
2369         unsigned char c;
2370
2371         if (token->flags & DIGRAPH)
2372           spelling = cpp_digraph2name (token->type);
2373         else if (token->flags & NAMED_OP)
2374           goto spell_ident;
2375         else
2376           spelling = TOKEN_NAME (token);
2377
2378         while ((c = *spelling++) != '\0')
2379           *buffer++ = c;
2380       }
2381       break;
2382
2383     spell_ident:
2384     case SPELL_IDENT:
2385       if (forstring)
2386         {
2387           memcpy (buffer, NODE_NAME (token->val.node.node),
2388                   NODE_LEN (token->val.node.node));
2389           buffer += NODE_LEN (token->val.node.node);
2390         }
2391       else
2392         {
2393           size_t i;
2394           const unsigned char * name = NODE_NAME (token->val.node.node);
2395
2396           for (i = 0; i < NODE_LEN (token->val.node.node); i++)
2397             if (name[i] & ~0x7F)
2398               {
2399                 i += utf8_to_ucn (buffer, name + i) - 1;
2400                 buffer += 10;
2401               }
2402             else
2403               *buffer++ = NODE_NAME (token->val.node.node)[i];
2404         }
2405       break;
2406
2407     case SPELL_LITERAL:
2408       memcpy (buffer, token->val.str.text, token->val.str.len);
2409       buffer += token->val.str.len;
2410       break;
2411
2412     case SPELL_NONE:
2413       cpp_error (pfile, CPP_DL_ICE,
2414                  "unspellable token %s", TOKEN_NAME (token));
2415       break;
2416     }
2417
2418   return buffer;
2419 }
2420
2421 /* Returns TOKEN spelt as a null-terminated string.  The string is
2422    freed when the reader is destroyed.  Useful for diagnostics.  */
2423 unsigned char *
2424 cpp_token_as_text (cpp_reader *pfile, const cpp_token *token)
2425 {
2426   unsigned int len = cpp_token_len (token) + 1;
2427   unsigned char *start = _cpp_unaligned_alloc (pfile, len), *end;
2428
2429   end = cpp_spell_token (pfile, token, start, false);
2430   end[0] = '\0';
2431
2432   return start;
2433 }
2434
2435 /* Returns a pointer to a string which spells the token defined by
2436    TYPE and FLAGS.  Used by C front ends, which really should move to
2437    using cpp_token_as_text.  */
2438 const char *
2439 cpp_type2name (enum cpp_ttype type, unsigned char flags)
2440 {
2441   if (flags & DIGRAPH)
2442     return (const char *) cpp_digraph2name (type);
2443   else if (flags & NAMED_OP)
2444     return cpp_named_operator2name (type);
2445
2446   return (const char *) token_spellings[type].name;
2447 }
2448
2449 /* Writes the spelling of token to FP, without any preceding space.
2450    Separated from cpp_spell_token for efficiency - to avoid stdio
2451    double-buffering.  */
2452 void
2453 cpp_output_token (const cpp_token *token, FILE *fp)
2454 {
2455   switch (TOKEN_SPELL (token))
2456     {
2457     case SPELL_OPERATOR:
2458       {
2459         const unsigned char *spelling;
2460         int c;
2461
2462         if (token->flags & DIGRAPH)
2463           spelling = cpp_digraph2name (token->type);
2464         else if (token->flags & NAMED_OP)
2465           goto spell_ident;
2466         else
2467           spelling = TOKEN_NAME (token);
2468
2469         c = *spelling;
2470         do
2471           putc (c, fp);
2472         while ((c = *++spelling) != '\0');
2473       }
2474       break;
2475
2476     spell_ident:
2477     case SPELL_IDENT:
2478       {
2479         size_t i;
2480         const unsigned char * name = NODE_NAME (token->val.node.node);
2481
2482         for (i = 0; i < NODE_LEN (token->val.node.node); i++)
2483           if (name[i] & ~0x7F)
2484             {
2485               unsigned char buffer[10];
2486               i += utf8_to_ucn (buffer, name + i) - 1;
2487               fwrite (buffer, 1, 10, fp);
2488             }
2489           else
2490             fputc (NODE_NAME (token->val.node.node)[i], fp);
2491       }
2492       break;
2493
2494     case SPELL_LITERAL:
2495       fwrite (token->val.str.text, 1, token->val.str.len, fp);
2496       break;
2497
2498     case SPELL_NONE:
2499       /* An error, most probably.  */
2500       break;
2501     }
2502 }
2503
2504 /* Compare two tokens.  */
2505 int
2506 _cpp_equiv_tokens (const cpp_token *a, const cpp_token *b)
2507 {
2508   if (a->type == b->type && a->flags == b->flags)
2509     switch (TOKEN_SPELL (a))
2510       {
2511       default:                  /* Keep compiler happy.  */
2512       case SPELL_OPERATOR:
2513         /* token_no is used to track where multiple consecutive ##
2514            tokens were originally located.  */
2515         return (a->type != CPP_PASTE || a->val.token_no == b->val.token_no);
2516       case SPELL_NONE:
2517         return (a->type != CPP_MACRO_ARG
2518                 || a->val.macro_arg.arg_no == b->val.macro_arg.arg_no);
2519       case SPELL_IDENT:
2520         return a->val.node.node == b->val.node.node;
2521       case SPELL_LITERAL:
2522         return (a->val.str.len == b->val.str.len
2523                 && !memcmp (a->val.str.text, b->val.str.text,
2524                             a->val.str.len));
2525       }
2526
2527   return 0;
2528 }
2529
2530 /* Returns nonzero if a space should be inserted to avoid an
2531    accidental token paste for output.  For simplicity, it is
2532    conservative, and occasionally advises a space where one is not
2533    needed, e.g. "." and ".2".  */
2534 int
2535 cpp_avoid_paste (cpp_reader *pfile, const cpp_token *token1,
2536                  const cpp_token *token2)
2537 {
2538   enum cpp_ttype a = token1->type, b = token2->type;
2539   cppchar_t c;
2540
2541   if (token1->flags & NAMED_OP)
2542     a = CPP_NAME;
2543   if (token2->flags & NAMED_OP)
2544     b = CPP_NAME;
2545
2546   c = EOF;
2547   if (token2->flags & DIGRAPH)
2548     c = digraph_spellings[(int) b - (int) CPP_FIRST_DIGRAPH][0];
2549   else if (token_spellings[b].category == SPELL_OPERATOR)
2550     c = token_spellings[b].name[0];
2551
2552   /* Quickly get everything that can paste with an '='.  */
2553   if ((int) a <= (int) CPP_LAST_EQ && c == '=')
2554     return 1;
2555
2556   switch (a)
2557     {
2558     case CPP_GREATER:   return c == '>';
2559     case CPP_LESS:      return c == '<' || c == '%' || c == ':';
2560     case CPP_PLUS:      return c == '+';
2561     case CPP_MINUS:     return c == '-' || c == '>';
2562     case CPP_DIV:       return c == '/' || c == '*'; /* Comments.  */
2563     case CPP_MOD:       return c == ':' || c == '>';
2564     case CPP_AND:       return c == '&';
2565     case CPP_OR:        return c == '|';
2566     case CPP_COLON:     return c == ':' || c == '>';
2567     case CPP_DEREF:     return c == '*';
2568     case CPP_DOT:       return c == '.' || c == '%' || b == CPP_NUMBER;
2569     case CPP_HASH:      return c == '#' || c == '%'; /* Digraph form.  */
2570     case CPP_NAME:      return ((b == CPP_NUMBER
2571                                  && name_p (pfile, &token2->val.str))
2572                                 || b == CPP_NAME
2573                                 || b == CPP_CHAR || b == CPP_STRING); /* L */
2574     case CPP_NUMBER:    return (b == CPP_NUMBER || b == CPP_NAME
2575                                 || c == '.' || c == '+' || c == '-');
2576                                       /* UCNs */
2577     case CPP_OTHER:     return ((token1->val.str.text[0] == '\\'
2578                                  && b == CPP_NAME)
2579                                 || (CPP_OPTION (pfile, objc)
2580                                     && token1->val.str.text[0] == '@'
2581                                     && (b == CPP_NAME || b == CPP_STRING)));
2582     default:            break;
2583     }
2584
2585   return 0;
2586 }
2587
2588 /* Output all the remaining tokens on the current line, and a newline
2589    character, to FP.  Leading whitespace is removed.  If there are
2590    macros, special token padding is not performed.  */
2591 void
2592 cpp_output_line (cpp_reader *pfile, FILE *fp)
2593 {
2594   const cpp_token *token;
2595
2596   token = cpp_get_token (pfile);
2597   while (token->type != CPP_EOF)
2598     {
2599       cpp_output_token (token, fp);
2600       token = cpp_get_token (pfile);
2601       if (token->flags & PREV_WHITE)
2602         putc (' ', fp);
2603     }
2604
2605   putc ('\n', fp);
2606 }
2607
2608 /* Return a string representation of all the remaining tokens on the
2609    current line.  The result is allocated using xmalloc and must be
2610    freed by the caller.  */
2611 unsigned char *
2612 cpp_output_line_to_string (cpp_reader *pfile, const unsigned char *dir_name)
2613 {
2614   const cpp_token *token;
2615   unsigned int out = dir_name ? ustrlen (dir_name) : 0;
2616   unsigned int alloced = 120 + out;
2617   unsigned char *result = (unsigned char *) xmalloc (alloced);
2618
2619   /* If DIR_NAME is empty, there are no initial contents.  */
2620   if (dir_name)
2621     {
2622       sprintf ((char *) result, "#%s ", dir_name);
2623       out += 2;
2624     }
2625
2626   token = cpp_get_token (pfile);
2627   while (token->type != CPP_EOF)
2628     {
2629       unsigned char *last;
2630       /* Include room for a possible space and the terminating nul.  */
2631       unsigned int len = cpp_token_len (token) + 2;
2632
2633       if (out + len > alloced)
2634         {
2635           alloced *= 2;
2636           if (out + len > alloced)
2637             alloced = out + len;
2638           result = (unsigned char *) xrealloc (result, alloced);
2639         }
2640
2641       last = cpp_spell_token (pfile, token, &result[out], 0);
2642       out = last - result;
2643
2644       token = cpp_get_token (pfile);
2645       if (token->flags & PREV_WHITE)
2646         result[out++] = ' ';
2647     }
2648
2649   result[out] = '\0';
2650   return result;
2651 }
2652
/* Memory buffers.  Changing these three constants can have a dramatic
   effect on performance.  The values here are reasonable defaults,
   but might be tuned.  If you adjust them, be sure to test across a
   range of uses of cpplib, including heavy nested function-like macro
   expansion.  Also check the change in peak memory usage (NJAMD is a
   good tool for this).

   MIN_BUFF_SIZE is the smallest payload new_buff will allocate.
   BUFF_SIZE_UPPER_BOUND (MIN_SIZE) is the largest free buffer that
   _cpp_get_buff will hand out for a request of MIN_SIZE bytes.
   EXTENDED_BUFF_SIZE (BUFF, MIN_EXTRA) is the size requested when
   extending BUFF: MIN_EXTRA plus twice the room left in BUFF.  Every
   macro argument is parenthesized so that an expression argument
   (e.g. a shift or a conditional) expands with the intended
   precedence.  */
#define MIN_BUFF_SIZE 8000
#define BUFF_SIZE_UPPER_BOUND(MIN_SIZE) (MIN_BUFF_SIZE + (MIN_SIZE) * 3 / 2)
#define EXTENDED_BUFF_SIZE(BUFF, MIN_EXTRA) \
	((MIN_EXTRA) + ((BUFF)->limit - (BUFF)->cur) * 2)

#if MIN_BUFF_SIZE > BUFF_SIZE_UPPER_BOUND (0)
  #error BUFF_SIZE_UPPER_BOUND must be at least as large as MIN_BUFF_SIZE!
#endif
2667
2668 /* Create a new allocation buffer.  Place the control block at the end
2669    of the buffer, so that buffer overflows will cause immediate chaos.  */
2670 static _cpp_buff *
2671 new_buff (size_t len)
2672 {
2673   _cpp_buff *result;
2674   unsigned char *base;
2675
2676   if (len < MIN_BUFF_SIZE)
2677     len = MIN_BUFF_SIZE;
2678   len = CPP_ALIGN (len);
2679
2680   base = XNEWVEC (unsigned char, len + sizeof (_cpp_buff));
2681   result = (_cpp_buff *) (base + len);
2682   result->base = base;
2683   result->cur = base;
2684   result->limit = base + len;
2685   result->next = NULL;
2686   return result;
2687 }
2688
2689 /* Place a chain of unwanted allocation buffers on the free list.  */
2690 void
2691 _cpp_release_buff (cpp_reader *pfile, _cpp_buff *buff)
2692 {
2693   _cpp_buff *end = buff;
2694
2695   while (end->next)
2696     end = end->next;
2697   end->next = pfile->free_buffs;
2698   pfile->free_buffs = buff;
2699 }
2700
2701 /* Return a free buffer of size at least MIN_SIZE.  */
2702 _cpp_buff *
2703 _cpp_get_buff (cpp_reader *pfile, size_t min_size)
2704 {
2705   _cpp_buff *result, **p;
2706
2707   for (p = &pfile->free_buffs;; p = &(*p)->next)
2708     {
2709       size_t size;
2710
2711       if (*p == NULL)
2712         return new_buff (min_size);
2713       result = *p;
2714       size = result->limit - result->base;
2715       /* Return a buffer that's big enough, but don't waste one that's
2716          way too big.  */
2717       if (size >= min_size && size <= BUFF_SIZE_UPPER_BOUND (min_size))
2718         break;
2719     }
2720
2721   *p = result->next;
2722   result->next = NULL;
2723   result->cur = result->base;
2724   return result;
2725 }
2726
2727 /* Creates a new buffer with enough space to hold the uncommitted
2728    remaining bytes of BUFF, and at least MIN_EXTRA more bytes.  Copies
2729    the excess bytes to the new buffer.  Chains the new buffer after
2730    BUFF, and returns the new buffer.  */
2731 _cpp_buff *
2732 _cpp_append_extend_buff (cpp_reader *pfile, _cpp_buff *buff, size_t min_extra)
2733 {
2734   size_t size = EXTENDED_BUFF_SIZE (buff, min_extra);
2735   _cpp_buff *new_buff = _cpp_get_buff (pfile, size);
2736
2737   buff->next = new_buff;
2738   memcpy (new_buff->base, buff->cur, BUFF_ROOM (buff));
2739   return new_buff;
2740 }
2741
2742 /* Creates a new buffer with enough space to hold the uncommitted
2743    remaining bytes of the buffer pointed to by BUFF, and at least
2744    MIN_EXTRA more bytes.  Copies the excess bytes to the new buffer.
2745    Chains the new buffer before the buffer pointed to by BUFF, and
2746    updates the pointer to point to the new buffer.  */
2747 void
2748 _cpp_extend_buff (cpp_reader *pfile, _cpp_buff **pbuff, size_t min_extra)
2749 {
2750   _cpp_buff *new_buff, *old_buff = *pbuff;
2751   size_t size = EXTENDED_BUFF_SIZE (old_buff, min_extra);
2752
2753   new_buff = _cpp_get_buff (pfile, size);
2754   memcpy (new_buff->base, old_buff->cur, BUFF_ROOM (old_buff));
2755   new_buff->next = old_buff;
2756   *pbuff = new_buff;
2757 }
2758
2759 /* Free a chain of buffers starting at BUFF.  */
2760 void
2761 _cpp_free_buff (_cpp_buff *buff)
2762 {
2763   _cpp_buff *next;
2764
2765   for (; buff; buff = next)
2766     {
2767       next = buff->next;
2768       free (buff->base);
2769     }
2770 }
2771
2772 /* Allocate permanent, unaligned storage of length LEN.  */
2773 unsigned char *
2774 _cpp_unaligned_alloc (cpp_reader *pfile, size_t len)
2775 {
2776   _cpp_buff *buff = pfile->u_buff;
2777   unsigned char *result = buff->cur;
2778
2779   if (len > (size_t) (buff->limit - result))
2780     {
2781       buff = _cpp_get_buff (pfile, len);
2782       buff->next = pfile->u_buff;
2783       pfile->u_buff = buff;
2784       result = buff->cur;
2785     }
2786
2787   buff->cur = result + len;
2788   return result;
2789 }
2790
2791 /* Allocate permanent, unaligned storage of length LEN from a_buff.
2792    That buffer is used for growing allocations when saving macro
2793    replacement lists in a #define, and when parsing an answer to an
2794    assertion in #assert, #unassert or #if (and therefore possibly
2795    whilst expanding macros).  It therefore must not be used by any
2796    code that they might call: specifically the lexer and the guts of
2797    the macro expander.
2798
2799    All existing other uses clearly fit this restriction: storing
2800    registered pragmas during initialization.  */
2801 unsigned char *
2802 _cpp_aligned_alloc (cpp_reader *pfile, size_t len)
2803 {
2804   _cpp_buff *buff = pfile->a_buff;
2805   unsigned char *result = buff->cur;
2806
2807   if (len > (size_t) (buff->limit - result))
2808     {
2809       buff = _cpp_get_buff (pfile, len);
2810       buff->next = pfile->a_buff;
2811       pfile->a_buff = buff;
2812       result = buff->cur;
2813     }
2814
2815   buff->cur = result + len;
2816   return result;
2817 }
2818
2819 /* Say which field of TOK is in use.  */
2820
2821 enum cpp_token_fld_kind
2822 cpp_token_val_index (cpp_token *tok)
2823 {
2824   switch (TOKEN_SPELL (tok))
2825     {
2826     case SPELL_IDENT:
2827       return CPP_TOKEN_FLD_NODE;
2828     case SPELL_LITERAL:
2829       return CPP_TOKEN_FLD_STR;
2830     case SPELL_OPERATOR:
2831       if (tok->type == CPP_PASTE)
2832         return CPP_TOKEN_FLD_TOKEN_NO;
2833       else
2834         return CPP_TOKEN_FLD_NONE;
2835     case SPELL_NONE:
2836       if (tok->type == CPP_MACRO_ARG)
2837         return CPP_TOKEN_FLD_ARG_NO;
2838       else if (tok->type == CPP_PADDING)
2839         return CPP_TOKEN_FLD_SOURCE;
2840       else if (tok->type == CPP_PRAGMA)
2841         return CPP_TOKEN_FLD_PRAGMA;
2842       /* else fall through */
2843     default:
2844       return CPP_TOKEN_FLD_NONE;
2845     }
2846 }