libcpp/lex.c

   1 /* CPP Library - lexical analysis.
   2    Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2007, 2008, 2009, 2010,
   3    2011 Free Software Foundation, Inc.
   4    Contributed by Per Bothner, 1994-95.
   5    Based on CCCP program by Paul Rubin, June 1986
   6    Adapted to ANSI C, Richard Stallman, Jan 1987
   7    Broken out to separate file, Zack Weinberg, Mar 2000
   8
   9 This program is free software; you can redistribute it and/or modify it
  10 under the terms of the GNU General Public License as published by the
  11 Free Software Foundation; either version 3, or (at your option) any
  12 later version.
  13
  14 This program is distributed in the hope that it will be useful,
  15 but WITHOUT ANY WARRANTY; without even the implied warranty of
  16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  17 GNU General Public License for more details.
  18
  19 You should have received a copy of the GNU General Public License
  20 along with this program; see the file COPYING3.  If not see
  21 <http://www.gnu.org/licenses/>.  */
  22
  23 #include "config.h"
  24 #include "system.h"
  25 #include "cpplib.h"
  26 #include "internal.h"
  27
  28 enum spell_type  /* Spelling category recorded for each token type.  */
  29 {
  30   SPELL_OPERATOR = 0,  /* Given to every OP entry of TTYPE_TABLE.  */
  31   SPELL_IDENT,         /* TK entries name their category: SPELL_ ## s.  */
  32   SPELL_LITERAL,
  33   SPELL_NONE
  34 };
  35 /* One row of token_spellings: a category plus a spelling or name string.  */
  36 struct token_spelling
  37 {
  38   enum spell_type category;
  39   const unsigned char *name;
  40 };
  41 /* Spellings of the digraphs.  */
  42 static const unsigned char *const digraph_spellings[] =
  43 { UC"%:", UC"%:%:", UC"<:", UC":>", UC"<%", UC"%>" };
  44 /* Build the table: OP stores the spelling S, TK the stringified name E.  */
  45 #define OP(e, s) { SPELL_OPERATOR, UC s  },
  46 #define TK(e, s) { SPELL_ ## s,    UC #e },
  47 static const struct token_spelling token_spellings[N_TTYPES] = { TTYPE_TABLE };
  48 #undef OP
  49 #undef TK
  50 /* Look up the spelling category and the name recorded for TOKEN's type.  */
  51 #define TOKEN_SPELL(token) (token_spellings[(token)->type].category)
  52 #define TOKEN_NAME(token) (token_spellings[(token)->type].name)
  53
  54 static void add_line_note (cpp_buffer *, const uchar *, unsigned int);
  55 static int skip_line_comment (cpp_reader *);
  56 static void skip_whitespace (cpp_reader *, cppchar_t);
  57 static void lex_string (cpp_reader *, cpp_token *, const uchar *);
  58 static void save_comment (cpp_reader *, cpp_token *, const uchar *, cppchar_t);
  59 static void store_comment (cpp_reader *, cpp_token *);
  60 static void create_literal (cpp_reader *, cpp_token *, const uchar *,
  61                             unsigned int, enum cpp_ttype);
  62 static bool warn_in_comment (cpp_reader *, _cpp_line_note *);
  63 static int name_p (cpp_reader *, const cpp_string *);
  64 static tokenrun *next_tokenrun (tokenrun *);
  65
  66 static _cpp_buff *new_buff (size_t);
  67
  68
  69 /* Utility routine:
  70
  71    Compare the spelling of TOKEN with the NUL-terminated string STRING.
  72    Returns 1 if TOKEN is a CPP_NAME spelled exactly STRING, 0 otherwise.  */
  73 int
  74 cpp_ideq (const cpp_token *token, const char *string)
  75 {
  76   if (token->type == CPP_NAME)
  77     return ustrcmp (NODE_NAME (token->val.node.node),
  78                     (const uchar *) string) == 0;
  79   return 0;
  80 }
  81
  82 /* Record a note of kind TYPE at byte POS in BUFFER's note array for the
  83    current cleaned logical line, growing the array first if it is full.  */
  84 static void
  85 add_line_note (cpp_buffer *buffer, const uchar *pos, unsigned int type)
  86 {
  87   if (buffer->notes_used == buffer->notes_cap)
  88     {  /* Full: grow to twice the capacity plus 200 entries.  */
  89       buffer->notes_cap = buffer->notes_cap * 2 + 200;
  90       buffer->notes = XRESIZEVEC (_cpp_line_note, buffer->notes,
  91                                   buffer->notes_cap);
  92     }
  93   /* Fill in the next free slot.  */
  94   buffer->notes[buffer->notes_used].pos = pos;
  95   buffer->notes[buffer->notes_used].type = type;
  96   buffer->notes_used++;
  97 }
  98
  99 \f
 100 /* Fast path to find line special characters using optimized character
 101    scanning algorithms.  Anything complicated falls back to the slow
 102    path below.  Since this loop is very hot it's worth doing these kinds
 103    of optimizations.
 104
 105    One of the paths through the ifdefs should provide
 106
 107      const uchar *search_line_fast (const uchar *s, const uchar *end);
 108
 109    Between S and END, search for \n, \r, \\, ?.  Return a pointer to
 110    the found character.
 111
 112    Note that the last character of the buffer is *always* a newline,
 113    as forced by _cpp_convert_input.  This fact can be used to avoid
 114    explicitly looking for the end of the buffer.  */
 115
 116 /* Configure gives us an ifdef test; default to 0 for use in plain `if'.  */
 117 #ifndef WORDS_BIGENDIAN
 118 #define WORDS_BIGENDIAN 0
 119 #endif
 120
 121 /* We'd like the largest integer that fits into a register.  There's nothing
 122    in <stdint.h> that gives us that.  For most hosts this is unsigned long,
 123    but MS decided on an LLP64 model.  Thankfully when building with GCC we
 124    can get the "real" word size.  */
 125 #ifdef __GNUC__
 126 typedef unsigned int word_type __attribute__((__mode__(__word__)));
 127 #else
 128 typedef unsigned long word_type;
 129 #endif
 130
 131 /* The code below is only expecting sizes 4 or 8.  Die at compile-time if
 132    this expectation is violated: the array size below then becomes -1.  */
 133 typedef char check_word_type_size
 134   [(sizeof(word_type) == 8 || sizeof(word_type) == 4) * 2 - 1];
 135
 136 /* Return VAL with its first N bytes (in memory order) cleared to NUL,
 137    which is not one of the characters being searched for.  */
 138
 139 static inline word_type
 140 acc_char_mask_misalign (word_type val, unsigned int n)
 141 {
 142   const word_type all_ones = ~(word_type) 0;
 143   const unsigned int shift = n * 8;
 144
 145   /* The first bytes in memory are the high-order ones on big-endian
 146      hosts and the low-order ones otherwise.  */
 147   return val & (WORDS_BIGENDIAN ? all_ones >> shift : all_ones << shift);
 148 }
 149
 150 /* Return X replicated to all byte positions within WORD_TYPE.  */
 151
 152 static inline word_type
 153 acc_char_replicate (uchar x)
 154 {
 155   /* Widen first: (int) X << 24 overflows a 32-bit int when X >= 0x80.  */
 156   word_type ret = x;
 157   ret = (ret << 24) | (ret << 16) | (ret << 8) | ret;
 158   if (sizeof(word_type) == 8)
 159     ret = (ret << 16 << 16) | ret;
 160   return ret;
 161 }
 162
 163 /* Return non-zero if some byte of VAL is (probably) the byte replicated in C.  */
 164 /* False positives are possible here; acc_char_index rejects them.  */
 165 static inline word_type
 166 acc_char_cmp (word_type val, word_type c)
 167 {
 168 #if defined(__GNUC__) && defined(__alpha__)
 169   /* We can get exact results using a compare-bytes instruction.
 170      Get (val == c) via (0 >= (val ^ c)).  */
 171   return __builtin_alpha_cmpbge (0, val ^ c);
 172 #else
 173   word_type magic = 0x7efefefeU;
 174   if (sizeof(word_type) == 8)
 175     magic = (magic << 16 << 16) | 0xfefefefeU;
 176   magic |= 1;  /* 0x7efefeff, or 0x7efefefefefefeff for 8-byte words.  */
 177   /* Bytes equal to C are zero after the XOR; the carry trick flags them.  */
 178   val ^= c;
 179   return ((val + magic) ^ ~val) & ~magic;
 180 #endif
 181 }
 182
 183 /* Given that CMP, the result of acc_char_cmp on VAL, is non-zero, return the
 184    index of the first interesting byte of VAL, or -1 for a false positive.  */
 185
 186 static inline int
 187 acc_char_index (word_type cmp ATTRIBUTE_UNUSED,
 188                 word_type val ATTRIBUTE_UNUSED)
 189 {
 190 #if defined(__GNUC__) && defined(__alpha__) && !WORDS_BIGENDIAN
 191   /* The cmpbge instruction sets *bits* of the result corresponding to
 192      matches in the bytes with no false positives.  */
 193   return __builtin_ctzl (cmp);
 194 #else
 195   unsigned int i;
 196
 197   /* ??? It would be nice to force unrolling here,
 198      and have all of these constants folded.  */
 199   for (i = 0; i < sizeof(word_type); ++i)
 200     {
 201       uchar c;  /* Byte I of VAL, counted in memory order.  */
 202       if (WORDS_BIGENDIAN)
 203         c = (val >> (sizeof(word_type) - i - 1) * 8) & 0xff;
 204       else
 205         c = (val >> i * 8) & 0xff;
 206
 207       if (c == '\n' || c == '\r' || c == '\\' || c == '?')
 208         return i;
 209     }
 210   /* No byte of VAL is interesting: CMP was a false positive.  */
 211   return -1;
 212 #endif
 213 }
 214
 215 /* A version of the fast scanner using bit fiddling techniques.
 216
 217    For 32-bit words, one would normally perform 16 comparisons and
 218    16 branches.  With this algorithm one performs 24 arithmetic
 219    operations and one branch.  Whether this is faster with a 32-bit
 220    word size is going to be somewhat system dependent.
 221
 222    For 64-bit words, we eliminate twice the number of comparisons
 223    and branches without increasing the number of arithmetic operations.
 224    It's almost certainly going to be a win with 64-bit word size.  */
 225 /* This separate declaration exists only to carry ATTRIBUTE_UNUSED.  */
 226 static const uchar * search_line_acc_char (const uchar *, const uchar *)
 227   ATTRIBUTE_UNUSED;
 228 /* Return the first \n, \r, backslash or ? at or after S.  END is unused.  */
 229 static const uchar *
 230 search_line_acc_char (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 231 {
 232   const word_type repl_nl = acc_char_replicate ('\n');
 233   const word_type repl_cr = acc_char_replicate ('\r');
 234   const word_type repl_bs = acc_char_replicate ('\\');
 235   const word_type repl_qm = acc_char_replicate ('?');
 236
 237   unsigned int misalign;
 238   const word_type *p;
 239   word_type val, t;
 240
 241   /* Align the buffer.  Mask out any bytes from before the beginning.  */
 242   p = (word_type *)((uintptr_t)s & -sizeof(word_type));
 243   val = *p;
 244   misalign = (uintptr_t)s & (sizeof(word_type) - 1);
 245   if (misalign)
 246     val = acc_char_mask_misalign (val, misalign);
 247
 248   /* Main loop: one word per iteration; it is left only by the return.  */
 249   while (1)
 250     {
 251       t  = acc_char_cmp (val, repl_nl);
 252       t |= acc_char_cmp (val, repl_cr);
 253       t |= acc_char_cmp (val, repl_bs);
 254       t |= acc_char_cmp (val, repl_qm);
 255
 256       if (__builtin_expect (t != 0, 0))
 257         {  /* Possible hit; acc_char_index weeds out false positives.  */
 258           int i = acc_char_index (t, val);
 259           if (i >= 0)
 260             return (const uchar *)p + i;
 261         }
 262
 263       val = *++p;
 264     }
 265 }
 266
 267 /* Disable on Solaris 2/x86 until the following problems can be properly
 268    autoconfed:
 269
 270    The Solaris 8 assembler cannot assemble SSE2/SSE4.2 insns.
 271    The Solaris 9 assembler cannot assemble SSE4.2 insns.
 272    Before Solaris 9 Update 6, SSE insns cannot be executed.
 273    The Solaris 10+ assembler tags objects with the instruction set
 274    extensions used, so SSE4.2 executables cannot run on machines that
 275    don't support that extension.  */
 276
 277 #if (GCC_VERSION >= 4005) && (defined(__i386__) || defined(__x86_64__)) && !(defined(__sun__) && defined(__svr4__))
 278
 279 /* Replicated character data to be shared between implementations: one
 280    16-byte row each for '\n', '\r', '\\' and '?'.  Recall that outside of a
 281    context with vector support we can't define compatible vector types,
 282    therefore these are all defined in terms of raw characters.  */
 283 static const char repl_chars[4][16] __attribute__((aligned(16))) = {
 284   { '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
 285     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n' },
 286   { '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
 287     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r' },
 288   { '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
 289     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\' },
 290   { '?', '?', '?', '?', '?', '?', '?', '?',
 291     '?', '?', '?', '?', '?', '?', '?', '?' },
 292 };
 293
 294 /* A version of the fast scanner using MMX vectorized byte compare insns.
 295
 296    This uses the PMOVMSKB instruction which was introduced with "MMX2",
 297    which was packaged into SSE1; it is also present in the AMD MMX
 298    extension.  Mark the function as using "sse" so that we emit a real
 299    "emms" instruction, rather than the 3dNOW "femms" instruction.  */
 300
 301 static const uchar *
 302 #ifndef __SSE__
 303 __attribute__((__target__("sse")))
 304 #endif
 305 search_line_mmx (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 306 {
 307   typedef char v8qi __attribute__ ((__vector_size__ (8)));
 308   typedef int __m64 __attribute__ ((__vector_size__ (8), __may_alias__));
 309
 310   const v8qi repl_nl = *(const v8qi *)repl_chars[0];
 311   const v8qi repl_cr = *(const v8qi *)repl_chars[1];
 312   const v8qi repl_bs = *(const v8qi *)repl_chars[2];
 313   const v8qi repl_qm = *(const v8qi *)repl_chars[3];
 314
 315   unsigned int misalign, found, mask;
 316   const v8qi *p;
 317   v8qi data, t, c;
 318
 319   /* Align the source pointer.  While MMX doesn't generate unaligned data
 320      faults, this allows us to safely scan to the end of the buffer without
 321      reading beyond the end of the last page.  */
 322   misalign = (uintptr_t)s & 7;
 323   p = (const v8qi *)((uintptr_t)s & -8);
 324   data = *p;
 325
 326   /* Create a mask for the bytes that are valid within the first
 327      8-byte block.  The idea here is that the AND with the mask
 328      within the loop is "free", since we need some AND or TEST
 329      insn in order to set the flags for the branch anyway.  */
 330   mask = -1u << misalign;
 331
 332   /* Main loop processing 8 bytes at a time.  */
 333   goto start;  /* The first block is already loaded.  */
 334   do
 335     {
 336       data = *++p;
 337       mask = -1;  /* Every byte of a later block is valid.  */
 338
 339     start:
 340       t = __builtin_ia32_pcmpeqb(data, repl_nl);
 341       c = __builtin_ia32_pcmpeqb(data, repl_cr);
 342       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
 343       c = __builtin_ia32_pcmpeqb(data, repl_bs);
 344       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
 345       c = __builtin_ia32_pcmpeqb(data, repl_qm);
 346       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
 347       found = __builtin_ia32_pmovmskb (t);
 348       found &= mask;
 349     }
 350   while (!found);
 351
 352   __builtin_ia32_emms ();
 353
 354   /* FOUND contains 1 in bits for which we matched a relevant
 355      character.  Conversion to the byte index is trivial.  */
 356   found = __builtin_ctz(found);
 357   return (const uchar *)p + found;
 358 }
 359
 360 /* A version of the fast scanner using SSE2 vectorized byte compare insns.  */
 361
 362 static const uchar *
 363 #ifndef __SSE2__
 364 __attribute__((__target__("sse2")))
 365 #endif
 366 search_line_sse2 (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 367 {
 368   typedef char v16qi __attribute__ ((__vector_size__ (16)));
 369
 370   const v16qi repl_nl = *(const v16qi *)repl_chars[0];
 371   const v16qi repl_cr = *(const v16qi *)repl_chars[1];
 372   const v16qi repl_bs = *(const v16qi *)repl_chars[2];
 373   const v16qi repl_qm = *(const v16qi *)repl_chars[3];
 374
 375   unsigned int misalign, found, mask;
 376   const v16qi *p;
 377   v16qi data, t;
 378
 379   /* Align the source pointer.  */
 380   misalign = (uintptr_t)s & 15;
 381   p = (const v16qi *)((uintptr_t)s & -16);
 382   data = *p;
 383
 384   /* Create a mask for the bytes that are valid within the first
 385      16-byte block.  The idea here is that the AND with the mask
 386      within the loop is "free", since we need some AND or TEST
 387      insn in order to set the flags for the branch anyway.  */
 388   mask = -1u << misalign;
 389
 390   /* Main loop processing 16 bytes at a time.  */
 391   goto start;  /* The first block is already loaded.  */
 392   do
 393     {
 394       data = *++p;
 395       mask = -1;  /* Every byte of a later block is valid.  */
 396
 397     start:
 398       t  = __builtin_ia32_pcmpeqb128(data, repl_nl);
 399       t |= __builtin_ia32_pcmpeqb128(data, repl_cr);
 400       t |= __builtin_ia32_pcmpeqb128(data, repl_bs);
 401       t |= __builtin_ia32_pcmpeqb128(data, repl_qm);
 402       found = __builtin_ia32_pmovmskb128 (t);
 403       found &= mask;
 404     }
 405   while (!found);
 406
 407   /* FOUND contains 1 in bits for which we matched a relevant
 408      character.  Conversion to the byte index is trivial.  */
 409   found = __builtin_ctz(found);
 410   return (const uchar *)p + found;
 411 }
 412
 413 #ifdef HAVE_SSE4
 414 /* A version of the fast scanner using SSE 4.2 vectorized string insns.  */
 415
 416 static const uchar *
 417 #ifndef __SSE4_2__
 418 __attribute__((__target__("sse4.2")))
 419 #endif
 420 search_line_sse42 (const uchar *s, const uchar *end)
 421 {
 422   typedef char v16qi __attribute__ ((__vector_size__ (16)));
 423   static const v16qi search = { '\n', '\r', '?', '\\' };
 424
 425   uintptr_t si = (uintptr_t)s;
 426   uintptr_t index;
 427
 428   /* Check for unaligned input.  */
 429   if (si & 15)
 430     {
 431       if (__builtin_expect (end - s < 16, 0)
 432           && __builtin_expect ((si & 0xfff) > 0xff0, 0))
 433         {
 434           /* There are fewer than 16 bytes left in the buffer, and fewer
 435              than 16 bytes left on the page.  Reading 16 bytes at this
 436              point might generate a spurious page fault.  Defer to the
 437              SSE2 implementation, which already handles alignment.  */
 438           return search_line_sse2 (s, end);
 439         }
 440
 441       /* ??? The builtin doesn't understand that the PCMPESTRI read from
 442          memory need not be aligned.  */
 443       __asm ("%vpcmpestri $0, (%1), %2"
 444              : "=c"(index) : "r"(s), "x"(search), "a"(4), "d"(16));
 445       if (__builtin_expect (index < 16, 0))  /* Match within this block.  */
 446         goto found;
 447
 448       /* Advance the pointer to an aligned address.  We will re-scan a
 449          few bytes, but we no longer need care for reading past the
 450          end of a page, since we're guaranteed a match.  */
 451       s = (const uchar *)((si + 16) & -16);
 452     }
 453
 454   /* Main loop, processing 16 bytes at a time.  By doing the whole loop
 455      in inline assembly, we can make proper use of the flags set.  */
 456   __asm (      "sub $16, %1\n"
 457         "       .balign 16\n"
 458         "0:     add $16, %1\n"
 459         "       %vpcmpestri $0, (%1), %2\n"
 460         "       jnc 0b"
 461         : "=&c"(index), "+r"(s)
 462         : "x"(search), "a"(4), "d"(16));
 463
 464  found:
 465   return s + index;
 466 }
 467
 468 #else
 469 /* Work around out-dated assemblers without sse4 support.  */
 470 #define search_line_sse42 search_line_sse2
 471 #endif
 472
 473 /* Check the CPU capabilities.  */
 474
 475 #include "../gcc/config/i386/cpuid.h"
 476
 477 typedef const uchar * (*search_line_fast_type) (const uchar *, const uchar *);
 478 static search_line_fast_type search_line_fast;
 479 /* Choose the search_line_fast implementation from ISA macros and CPUID.  */
 480 #define HAVE_init_vectorized_lexer 1
 481 static inline void
 482 init_vectorized_lexer (void)
 483 {
 484   unsigned dummy, ecx = 0, edx = 0;
 485   search_line_fast_type impl = search_line_acc_char;
 486   int minimum = 0;
 487   /* MINIMUM is the level known at compile time: 3 SSE4.2, 2 SSE2, 1 SSE.  */
 488 #if defined(__SSE4_2__)
 489   minimum = 3;
 490 #elif defined(__SSE2__)
 491   minimum = 2;
 492 #elif defined(__SSE__)
 493   minimum = 1;
 494 #endif
 495   /* Prefer the most capable scanner; IMPL stays portable if none applies.  */
 496   if (minimum == 3)
 497     impl = search_line_sse42;
 498   else if (__get_cpuid (1, &dummy, &dummy, &ecx, &edx) || minimum == 2)
 499     {
 500       if (minimum == 3 || (ecx & bit_SSE4_2))
 501         impl = search_line_sse42;
 502       else if (minimum == 2 || (edx & bit_SSE2))
 503         impl = search_line_sse2;
 504       else if (minimum == 1 || (edx & bit_SSE))
 505         impl = search_line_mmx;
 506     }
 507   else if (__get_cpuid (0x80000001, &dummy, &dummy, &dummy, &edx))
 508     {
 509       if (minimum == 1
 510           || (edx & (bit_MMXEXT | bit_CMOV)) == (bit_MMXEXT | bit_CMOV))
 511         impl = search_line_mmx;
 512     }
 513
 514   search_line_fast = impl;
 515 }
 516
 517 #elif (GCC_VERSION >= 4005) && defined(__ALTIVEC__)
 518
 519 /* A version of the fast scanner using AltiVec vectorized byte compares.  */
 520 /* ??? Unfortunately, attribute(target("altivec")) is not yet supported,
 521    so we can't compile this function without -maltivec on the command line
 522    (or implied by some other switch).  */
 523
 524 static const uchar *
 525 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 526 {
 527   typedef __attribute__((altivec(vector))) unsigned char vc;
 528
 529   const vc repl_nl = {
 530     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
 531     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
 532   };
 533   const vc repl_cr = {
 534     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
 535     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
 536   };
 537   const vc repl_bs = {
 538     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
 539     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
 540   };
 541   const vc repl_qm = {
 542     '?', '?', '?', '?', '?', '?', '?', '?',
 543     '?', '?', '?', '?', '?', '?', '?', '?',
 544   };
 545   const vc ones = {
 546     -1, -1, -1, -1, -1, -1, -1, -1,
 547     -1, -1, -1, -1, -1, -1, -1, -1,
 548   };
 549   const vc zero = { 0 };
 550
 551   vc data, mask, t;
 552
 553   /* Altivec loads automatically mask addresses with -16.  This lets us
 554      issue the first load as early as possible.  */
 555   data = __builtin_vec_ld(0, (const vc *)s);
 556
 557   /* Discard bytes before the beginning of the buffer.  Do this by
 558      beginning with all ones and shifting in zeros according to the
 559      mis-alignment.  The LVSR instruction pulls the exact shift we
 560      want from the address.  */
 561   mask = __builtin_vec_lvsr(0, s);
 562   mask = __builtin_vec_perm(zero, ones, mask);
 563   data &= mask;
 564
 565   /* While altivec loads mask addresses, we still need to align S so
 566      that the offset we compute at the end is correct.  */
 567   s = (const uchar *)((uintptr_t)s & -16);
 568
 569   /* Main loop processing 16 bytes at a time.  */
 570   goto start;  /* The first block is already loaded.  */
 571   do
 572     {
 573       vc m_nl, m_cr, m_bs, m_qm;
 574
 575       s += 16;
 576       data = __builtin_vec_ld(0, (const vc *)s);
 577
 578     start:
 579       m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
 580       m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
 581       m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
 582       m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
 583       t = (m_nl | m_cr) | (m_bs | m_qm);
 584
 585       /* T now contains 0xff in bytes for which we matched one of the relevant
 586          characters.  We want to exit the loop if any byte in T is non-zero.
 587          Below is the expansion of vec_any_ne(t, zero).  */
 588     }
 589   while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));
 590
 591   {
 592 #define N  (sizeof(vc) / sizeof(long))
 593
 594     typedef char check_count[(N == 2 || N == 4) * 2 - 1];
 595     union {
 596       vc v;
 597       unsigned long l[N];
 598     } u;
 599     unsigned long l, i = 0;
 600
 601     u.v = t;
 602
 603     /* Find the first word of T that is non-zero.  */
 604     switch (N)
 605       {
 606       case 4:
 607         l = u.l[i++];
 608         if (l != 0)
 609           break;
 610         s += sizeof(unsigned long);
 611         l = u.l[i++];
 612         if (l != 0)
 613           break;
 614         s += sizeof(unsigned long);
 615       case 2:  /* Reached directly, or by falling through from case 4.  */
 616         l = u.l[i++];
 617         if (l != 0)
 618           break;
 619         s += sizeof(unsigned long);
 620         l = u.l[i];
 621       }
 622     /* NOTE(review): the clz below presumably assumes big-endian -- confirm.  */
 623     /* L now contains 0xff in bytes for which we matched one of the
 624        relevant characters.  We can find the byte index by finding
 625        its bit index and dividing by 8.  */
 626     l = __builtin_clzl(l) >> 3;
 627     return s + l;
 628
 629 #undef N
 630   }
 631 }
 632
 633 #else
 634
 635 /* We only have one accelerated alternative.  Use a direct call so that
 636    we encourage inlining.  */
 637
 638 #define search_line_fast  search_line_acc_char
 639
 640 #endif
 641
 642 /* Initialize the lexer if needed: set up the vectorized scanner, if any.  */
 643
 644 void
 645 _cpp_init_lexer (void)
 646 {
 647 #ifdef HAVE_init_vectorized_lexer
 648   init_vectorized_lexer ();
 649 #endif
 650 }
 651
 652 /* Returns with a logical line that contains no escaped newlines or
 653    trigraphs (if enabled), cleaned in place.  A time-critical inner loop.  */
 654 void
 655 _cpp_clean_line (cpp_reader *pfile)
 656 {
 657   cpp_buffer *buffer;
 658   const uchar *s;
 659   uchar c, *d, *p;
 660
 661   buffer = pfile->buffer;
 662   buffer->cur_note = buffer->notes_used = 0;  /* Drop the old line's notes.  */
 663   buffer->cur = buffer->line_base = buffer->next_line;
 664   buffer->need_line = false;
 665   s = buffer->next_line;
 666
 667   if (!buffer->from_stage3)
 668     {
 669       const uchar *pbackslash = NULL;  /* Last backslash seen, if any.  */
 670
 671       /* Fast path.  This is the common case of an un-escaped line with
 672          no trigraphs.  The primary win here is by not writing any
 673          data back to memory until we have to.  */
 674       while (1)
 675         {
 676           /* Perform an optimized search for \n, \r, \\, ?.  */
 677           s = search_line_fast (s, buffer->rlimit);
 678
 679           c = *s;
 680           if (c == '\\')
 681             {
 682               /* Record the location of the backslash and continue.  */
 683               pbackslash = s++;
 684             }
 685           else if (__builtin_expect (c == '?', 0))
 686             {
 687               if (__builtin_expect (s[1] == '?', false)
 688                    && _cpp_trigraph_map[s[2]])
 689                 {
 690                   /* Have a trigraph.  We may or may not have to convert
 691                      it.  Add a line note regardless, for -Wtrigraphs.  */
 692                   add_line_note (buffer, s, s[2]);
 693                   if (CPP_OPTION (pfile, trigraphs))
 694                     {
 695                       /* We do, and that means we have to switch to the
 696                          slow path.  */
 697                       d = (uchar *) s;
 698                       *d = _cpp_trigraph_map[s[2]];
 699                       s += 2;
 700                       goto slow_path;
 701                     }
 702                 }
 703               /* Not a trigraph.  Continue on fast-path.  */
 704               s++;
 705             }
 706           else
 707             break;
 708         }
 709
 710       /* This must be \r or \n.  We're either done, or we'll be forced
 711          to write back to the buffer and continue on the slow path.  */
 712       d = (uchar *) s;
 713
 714       if (__builtin_expect (s == buffer->rlimit, false))
 715         goto done;
 716
 717       /* DOS line ending? */
 718       if (__builtin_expect (c == '\r', false) && s[1] == '\n')
 719         {
 720           s++;
 721           if (s == buffer->rlimit)
 722             goto done;
 723         }
 724
 725       if (__builtin_expect (pbackslash == NULL, true))
 726         goto done;
 727
 728       /* Check for escaped newline.  */
 729       p = d;
 730       while (is_nvspace (p[-1]))
 731         p--;
 732       if (p - 1 != pbackslash)
 733         goto done;
 734
 735       /* Have an escaped newline; process it and proceed to
 736          the slow path.  */
 737       add_line_note (buffer, p - 1, p != d ? ' ' : '\\');
 738       d = p - 2;  /* The next store, *++d, overwrites the backslash.  */
 739       buffer->next_line = p - 1;
 740
 741     slow_path:
 742       while (1)
 743         {
 744           c = *++s;
 745           *++d = c;
 746
 747           if (c == '\n' || c == '\r')
 748             {
 749               /* Handle DOS line endings.  */
 750               if (c == '\r' && s != buffer->rlimit && s[1] == '\n')
 751                 s++;
 752               if (s == buffer->rlimit)
 753                 break;
 754
 755               /* Escaped?  */
 756               p = d;
 757               while (p != buffer->next_line && is_nvspace (p[-1]))
 758                 p--;
 759               if (p == buffer->next_line || p[-1] != '\\')
 760                 break;
 761
 762               add_line_note (buffer, p - 1, p != d ? ' ': '\\');
 763               d = p - 2;
 764               buffer->next_line = p - 1;
 765             }
 766           else if (c == '?' && s[1] == '?' && _cpp_trigraph_map[s[2]])
 767             {
 768               /* Add a note regardless, for the benefit of -Wtrigraphs.  */
 769               add_line_note (buffer, d, s[2]);
 770               if (CPP_OPTION (pfile, trigraphs))
 771                 {
 772                   *d = _cpp_trigraph_map[s[2]];
 773                   s += 2;
 774                 }
 775             }
 776         }
 777     }
 778   else
 779     {  /* From stage 3: only locate the end of the line.  */
 780       while (*s != '\n' && *s != '\r')
 781         s++;
 782       d = (uchar *) s;
 783
 784       /* Handle DOS line endings.  */
 785       if (*s == '\r' && s != buffer->rlimit && s[1] == '\n')
 786         s++;
 787     }
 788
 789  done:
 790   *d = '\n';  /* Terminate the cleaned line.  */
 791   /* A sentinel note that should never be processed.  */
 792   add_line_note (buffer, d + 1, '\n');
 793   buffer->next_line = s + 1;
 794 }
 795
 796 /* Decide whether the trigraph recorded by NOTE, found inside a comment,
 797    deserves a warning.  Returns true if so.  */
 798 static bool
 799 warn_in_comment (cpp_reader *pfile, _cpp_line_note *note)
 800 {
 801   const _cpp_line_note *next = note + 1;
 802   const uchar *scan;
 803
 804   /* Only ??/ matters inside a comment: it can form an escaped newline,
 805      which may change behavior.  Other trigraphs are harmless there.  */
 806   if (note->type != '/')
 807     return false;
 808
 809   /* With -trigraphs, an escaped newline left a second note at the very
 810      same position.  */
 811   if (CPP_OPTION (pfile, trigraphs))
 812     return next->pos == note->pos;
 813
 814   /* Otherwise skip the trigraph and any horizontal space after it.  */
 815   for (scan = note->pos + 3; is_nvspace (*scan); scan++)
 816     continue;
 817
 818   /* It is an escaped newline only if we stopped at a newline that lies
 819      before the next note; escaped newlines handled in between would have
 820      left notes of their own.  */
 821   return *scan == '\n' && scan < next->pos;
 822 }
 823
 824 /* Process the notes created by add_line_note as far as the current location.
 825    A nonzero IN_COMMENT (inside a comment) suppresses some of the warnings.  */
 826 void
 827 _cpp_process_line_notes (cpp_reader *pfile, int in_comment)
 828 {
 829   cpp_buffer *buffer = pfile->buffer;
 830
 831   for (;;)
 832     {
 833       _cpp_line_note *note = &buffer->notes[buffer->cur_note];
 834       unsigned int col;
 835
 836       if (note->pos > buffer->cur)
 837         break;  /* This note lies beyond the current position.  */
 838
 839       buffer->cur_note++;
 840       col = CPP_BUF_COLUMN (buffer, note->pos + 1);
 841       /* Escaped newline; ' ' means space separated backslash and newline.  */
 842       if (note->type == '\\' || note->type == ' ')
 843         {
 844           if (note->type == ' ' && !in_comment)
 845             cpp_error_with_line (pfile, CPP_DL_WARNING, pfile->line_table->highest_line, col,
 846                                  "backslash and newline separated by space");
 847
 848           if (buffer->next_line > buffer->rlimit)
 849             {
 850               cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line, col,
 851                                    "backslash-newline at end of file");
 852               /* Prevent "no newline at end of file" warning.  */
 853               buffer->next_line = buffer->rlimit;
 854             }
 855
 856           buffer->line_base = note->pos;
 857           CPP_INCREMENT_LINE (pfile, 0);
 858         }
 859       else if (_cpp_trigraph_map[note->type])
 860         {  /* A trigraph note: its type is the trigraph's third character.  */
 861           if (CPP_OPTION (pfile, warn_trigraphs)
 862               && (!in_comment || warn_in_comment (pfile, note)))
 863             {
 864               if (CPP_OPTION (pfile, trigraphs))
 865                 cpp_warning_with_line (pfile, CPP_W_TRIGRAPHS,
 866                                        pfile->line_table->highest_line, col,
 867                                        "trigraph ??%c converted to %c",
 868                                        note->type,
 869                                        (int) _cpp_trigraph_map[note->type]);
 870               else
 871                 {
 872                   cpp_warning_with_line
 873                     (pfile, CPP_W_TRIGRAPHS,
 874                      pfile->line_table->highest_line, col,
 875                      "trigraph ??%c ignored, use -trigraphs to enable",
 876                      note->type);
 877                 }
 878             }
 879         }
 880       else if (note->type == 0)
 881         /* Already processed in lex_raw_string.  */;
 882       else
 883         abort ();
 884     }
 885 }
 886
 887 /* Skip a C-style block comment.  We find the end of the comment by
 888    seeing if an asterisk is before every '/' we encounter.  Returns
 889    true if comment terminated by EOF, false otherwise.
 890
 891    Buffer->cur points to the initial asterisk of the comment.  */
 892 bool
 893 _cpp_skip_block_comment (cpp_reader *pfile)
 894 {
 895   cpp_buffer *buffer = pfile->buffer;
 896   const uchar *cur = buffer->cur;
 897   uchar c;
 898   /* Skip the '*'; a '/' directly after it cannot close the comment.  */
 899   cur++;
 900   if (*cur == '/')
 901     cur++;
 902
 903   for (;;)
 904     {
 905       /* People like decorating comments with '*', so check for '/'
 906          instead for efficiency.  */
 907       c = *cur++;
 908
 909       if (c == '/')
 910         {
 911           if (cur[-2] == '*')
 912             break;
 913
 914           /* Warn about potential nested comments, but not if the '/'
 915              comes immediately before the true comment delimiter.
 916              Don't bother to get it right across escaped newlines.  */
 917           if (CPP_OPTION (pfile, warn_comments)
 918               && cur[0] == '*' && cur[1] != '/')
 919             {
 920               buffer->cur = cur;
 921               cpp_warning_with_line (pfile, CPP_W_COMMENTS,
 922                                      pfile->line_table->highest_line,
 923                                      CPP_BUF_COL (buffer),
 924                                      "\"/*\" within comment");
 925             }
 926         }
 927       else if (c == '\n')
 928         {  /* End of a line: handle its notes, then clean the next line.  */
 929           unsigned int cols;
 930           buffer->cur = cur - 1;
 931           _cpp_process_line_notes (pfile, true);
 932           if (buffer->next_line >= buffer->rlimit)
 933             return true;
 934           _cpp_clean_line (pfile);
 935
 936           cols = buffer->next_line - buffer->line_base;
 937           CPP_INCREMENT_LINE (pfile, cols);
 938
 939           cur = buffer->cur;
 940         }
 941     }
 942   /* Comment closed: leave CUR just past the final '/'.  */
 943   buffer->cur = cur;
 944   _cpp_process_line_notes (pfile, true);
 945   return false;
 946 }
 947
 948 /* Skip a C++ line comment, leaving buffer->cur pointing to the
 949    terminating newline.  Handles escaped newlines.  Returns nonzero
 950    if a multiline comment.  */
 951 static int
 952 skip_line_comment (cpp_reader *pfile)
 953 {
 954   cpp_buffer *buffer = pfile->buffer;
 955   source_location orig_line = pfile->line_table->highest_line;
 956
 957   while (*buffer->cur != '\n')
 958     buffer->cur++;
 959
 960   _cpp_process_line_notes (pfile, true);
 961   return orig_line != pfile->line_table->highest_line;
 962 }
 963
 964 /* Skips whitespace, saving the next non-whitespace character.  */
 965 static void
 966 skip_whitespace (cpp_reader *pfile, cppchar_t c)
 967 {
 968   cpp_buffer *buffer = pfile->buffer;
 969   bool saw_NUL = false;
 970
 971   do
 972     {
 973       /* Horizontal space always OK.  */
 974       if (c == ' ' || c == '\t')
 975         ;
 976       /* Just \f \v or \0 left.  */
 977       else if (c == '\0')
 978         saw_NUL = true;
 979       else if (pfile->state.in_directive && CPP_PEDANTIC (pfile))
 980         cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line,
 981                              CPP_BUF_COL (buffer),
 982                              "%s in preprocessing directive",
 983                              c == '\f' ? "form feed" : "vertical tab");
 984
 985       c = *buffer->cur++;
 986     }
 987   /* We only want non-vertical space, i.e. ' ' \t \f \v \0.  */
 988   while (is_nvspace (c));
 989
 990   if (saw_NUL)
 991     cpp_error (pfile, CPP_DL_WARNING, "null character(s) ignored");
 992
 993   buffer->cur--;
 994 }
 995
 996 /* See if the characters of a number token are valid in a name (no
 997    '.', '+' or '-').  */
 998 static int
 999 name_p (cpp_reader *pfile, const cpp_string *string)
1000 {
1001   unsigned int i;
1002
1003   for (i = 0; i < string->len; i++)
1004     if (!is_idchar (string->text[i]))
1005       return 0;
1006
1007   return 1;
1008 }
1009
1010 /* After parsing an identifier or other sequence, produce a warning about
1011    sequences not in NFC/NFKC.  */
1012 static void
1013 warn_about_normalization (cpp_reader *pfile,
1014                           const cpp_token *token,
1015                           const struct normalize_state *s)
1016 {
1017   if (CPP_OPTION (pfile, warn_normalize) < NORMALIZE_STATE_RESULT (s)
1018       && !pfile->state.skipping)
1019     {
1020       /* Make sure that the token is printed using UCNs, even
1021          if we'd otherwise happily print UTF-8.  */
1022       unsigned char *buf = XNEWVEC (unsigned char, cpp_token_len (token));
1023       size_t sz;
1024
1025       sz = cpp_spell_token (pfile, token, buf, false) - buf;
1026       if (NORMALIZE_STATE_RESULT (s) == normalized_C)
1027         cpp_warning_with_line (pfile, CPP_W_NORMALIZE, token->src_loc, 0,
1028                                "`%.*s' is not in NFKC", (int) sz, buf);
1029       else
1030         cpp_warning_with_line (pfile, CPP_W_NORMALIZE, token->src_loc, 0,
1031                                "`%.*s' is not in NFC", (int) sz, buf);
1032     }
1033 }
1034
1035 /* Returns TRUE if the sequence starting at buffer->cur is invalid in
1036    an identifier.  FIRST is TRUE if this starts an identifier.  */
1037 static bool
1038 forms_identifier_p (cpp_reader *pfile, int first,
1039                     struct normalize_state *state)
1040 {
1041   cpp_buffer *buffer = pfile->buffer;
1042
1043   if (*buffer->cur == '$')
1044     {
1045       if (!CPP_OPTION (pfile, dollars_in_ident))
1046         return false;
1047
1048       buffer->cur++;
1049       if (CPP_OPTION (pfile, warn_dollars) && !pfile->state.skipping)
1050         {
1051           CPP_OPTION (pfile, warn_dollars) = 0;
1052           cpp_error (pfile, CPP_DL_PEDWARN, "'$' in identifier or number");
1053         }
1054
1055       return true;
1056     }
1057
1058   /* Is this a syntactically valid UCN?  */
1059   if (CPP_OPTION (pfile, extended_identifiers)
1060       && *buffer->cur == '\\'
1061       && (buffer->cur[1] == 'u' || buffer->cur[1] == 'U'))
1062     {
1063       buffer->cur += 2;
1064       if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
1065                           state))
1066         return true;
1067       buffer->cur -= 2;
1068     }
1069
1070   return false;
1071 }
1072
1073 /* Helper function to get the cpp_hashnode of the identifier BASE.  */
1074 static cpp_hashnode *
1075 lex_identifier_intern (cpp_reader *pfile, const uchar *base)
1076 {
1077   cpp_hashnode *result;
1078   const uchar *cur;
1079   unsigned int len;
1080   unsigned int hash = HT_HASHSTEP (0, *base);
1081
1082   cur = base + 1;
1083   while (ISIDNUM (*cur))
1084     {
1085       hash = HT_HASHSTEP (hash, *cur);
1086       cur++;
1087     }
1088   len = cur - base;
1089   hash = HT_HASHFINISH (hash, len);
1090   result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
1091                                               base, len, hash, HT_ALLOC));
1092
1093   /* Rarely, identifiers require diagnostics when lexed.  */
1094   if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
1095                         && !pfile->state.skipping, 0))
1096     {
1097       /* It is allowed to poison the same identifier twice.  */
1098       if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
1099         cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
1100                    NODE_NAME (result));
1101
1102       /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
1103          replacement list of a variadic macro.  */
1104       if (result == pfile->spec_nodes.n__VA_ARGS__
1105           && !pfile->state.va_args_ok)
1106         cpp_error (pfile, CPP_DL_PEDWARN,
1107                    "__VA_ARGS__ can only appear in the expansion"
1108                    " of a C99 variadic macro");
1109
1110       /* For -Wc++-compat, warn about use of C++ named operators.  */
1111       if (result->flags & NODE_WARN_OPERATOR)
1112         cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
1113                      "identifier \"%s\" is a special operator name in C++",
1114                      NODE_NAME (result));
1115     }
1116
1117   return result;
1118 }
1119
1120 /* Get the cpp_hashnode of an identifier specified by NAME in
1121    the current cpp_reader object.  If none is found, NULL is returned.  */
1122 cpp_hashnode *
1123 _cpp_lex_identifier (cpp_reader *pfile, const char *name)
1124 {
1125   cpp_hashnode *result;
1126   result = lex_identifier_intern (pfile, (uchar *) name);
1127   return result;
1128 }
1129
1130 /* Lex an identifier starting at BUFFER->CUR - 1.  */
1131 static cpp_hashnode *
1132 lex_identifier (cpp_reader *pfile, const uchar *base, bool starts_ucn,
1133                 struct normalize_state *nst)
1134 {
1135   cpp_hashnode *result;
1136   const uchar *cur;
1137   unsigned int len;
1138   unsigned int hash = HT_HASHSTEP (0, *base);
1139
1140   cur = pfile->buffer->cur;
1141   if (! starts_ucn)
1142     while (ISIDNUM (*cur))
1143       {
1144         hash = HT_HASHSTEP (hash, *cur);
1145         cur++;
1146       }
1147   pfile->buffer->cur = cur;
1148   if (starts_ucn || forms_identifier_p (pfile, false, nst))
1149     {
1150       /* Slower version for identifiers containing UCNs (or $).  */
1151       do {
1152         while (ISIDNUM (*pfile->buffer->cur))
1153           {
1154             pfile->buffer->cur++;
1155             NORMALIZE_STATE_UPDATE_IDNUM (nst);
1156           }
1157       } while (forms_identifier_p (pfile, false, nst));
1158       result = _cpp_interpret_identifier (pfile, base,
1159                                           pfile->buffer->cur - base);
1160     }
1161   else
1162     {
1163       len = cur - base;
1164       hash = HT_HASHFINISH (hash, len);
1165
1166       result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
1167                                                   base, len, hash, HT_ALLOC));
1168     }
1169
1170   /* Rarely, identifiers require diagnostics when lexed.  */
1171   if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
1172                         && !pfile->state.skipping, 0))
1173     {
1174       /* It is allowed to poison the same identifier twice.  */
1175       if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
1176         cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
1177                    NODE_NAME (result));
1178
1179       /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
1180          replacement list of a variadic macro.  */
1181       if (result == pfile->spec_nodes.n__VA_ARGS__
1182           && !pfile->state.va_args_ok)
1183         cpp_error (pfile, CPP_DL_PEDWARN,
1184                    "__VA_ARGS__ can only appear in the expansion"
1185                    " of a C99 variadic macro");
1186
1187       /* For -Wc++-compat, warn about use of C++ named operators.  */
1188       if (result->flags & NODE_WARN_OPERATOR)
1189         cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
1190                      "identifier \"%s\" is a special operator name in C++",
1191                      NODE_NAME (result));
1192     }
1193
1194   return result;
1195 }
1196
1197 /* Lex a number to NUMBER starting at BUFFER->CUR - 1.  */
1198 static void
1199 lex_number (cpp_reader *pfile, cpp_string *number,
1200             struct normalize_state *nst)
1201 {
1202   const uchar *cur;
1203   const uchar *base;
1204   uchar *dest;
1205
1206   base = pfile->buffer->cur - 1;
1207   do
1208     {
1209       cur = pfile->buffer->cur;
1210
1211       /* N.B. ISIDNUM does not include $.  */
1212       while (ISIDNUM (*cur) || *cur == '.' || VALID_SIGN (*cur, cur[-1]))
1213         {
1214           cur++;
1215           NORMALIZE_STATE_UPDATE_IDNUM (nst);
1216         }
1217
1218       pfile->buffer->cur = cur;
1219     }
1220   while (forms_identifier_p (pfile, false, nst));
1221
1222   number->len = cur - base;
1223   dest = _cpp_unaligned_alloc (pfile, number->len + 1);
1224   memcpy (dest, base, number->len);
1225   dest[number->len] = '\0';
1226   number->text = dest;
1227 }
1228
1229 /* Create a token of type TYPE with a literal spelling.  */
1230 static void
1231 create_literal (cpp_reader *pfile, cpp_token *token, const uchar *base,
1232                 unsigned int len, enum cpp_ttype type)
1233 {
1234   uchar *dest = _cpp_unaligned_alloc (pfile, len + 1);
1235
1236   memcpy (dest, base, len);
1237   dest[len] = '\0';
1238   token->type = type;
1239   token->val.str.len = len;
1240   token->val.str.text = dest;
1241 }
1242
1243 /* Subroutine of lex_raw_string: Append LEN chars from BASE to the buffer
1244    sequence from *FIRST_BUFF_P to LAST_BUFF_P.  */
1245
1246 static void
1247 bufring_append (cpp_reader *pfile, const uchar *base, size_t len,
1248                 _cpp_buff **first_buff_p, _cpp_buff **last_buff_p)
1249 {
1250   _cpp_buff *first_buff = *first_buff_p;
1251   _cpp_buff *last_buff = *last_buff_p;
1252
1253   if (first_buff == NULL)
1254     first_buff = last_buff = _cpp_get_buff (pfile, len);
1255   else if (len > BUFF_ROOM (last_buff))
1256     {
1257       size_t room = BUFF_ROOM (last_buff);
1258       memcpy (BUFF_FRONT (last_buff), base, room);
1259       BUFF_FRONT (last_buff) += room;
1260       base += room;
1261       len -= room;
1262       last_buff = _cpp_append_extend_buff (pfile, last_buff, len);
1263     }
1264
1265   memcpy (BUFF_FRONT (last_buff), base, len);
1266   BUFF_FRONT (last_buff) += len;
1267
1268   *first_buff_p = first_buff;
1269   *last_buff_p = last_buff;
1270 }
1271
1272 /* Lexes a raw string.  The stored string contains the spelling, including
1273    double quotes, delimiter string, '(' and ')', any leading
1274    'L', 'u', 'U' or 'u8' and 'R' modifier.  It returns the type of the
1275    literal, or CPP_OTHER if it was not properly terminated.
1276
1277    The spelling is NUL-terminated, but it is not guaranteed that this
1278    is the first NUL since embedded NULs are preserved.  */
1279
1280 static void
1281 lex_raw_string (cpp_reader *pfile, cpp_token *token, const uchar *base,
1282                 const uchar *cur)
1283 {
1284   const uchar *raw_prefix;
1285   unsigned int raw_prefix_len = 0;
1286   enum cpp_ttype type;
1287   size_t total_len = 0;
1288   _cpp_buff *first_buff = NULL, *last_buff = NULL;
1289   _cpp_line_note *note = &pfile->buffer->notes[pfile->buffer->cur_note];
1290
1291   type = (*base == 'L' ? CPP_WSTRING :
1292           *base == 'U' ? CPP_STRING32 :
1293           *base == 'u' ? (base[1] == '8' ? CPP_UTF8STRING : CPP_STRING16)
1294           : CPP_STRING);
1295
1296   raw_prefix = cur + 1;
1297   while (raw_prefix_len < 16)
1298     {
1299       switch (raw_prefix[raw_prefix_len])
1300         {
1301         case ' ': case '(': case ')': case '\\': case '\t':
1302         case '\v': case '\f': case '\n': default:
1303           break;
1304         /* Basic source charset except the above chars.  */
1305         case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
1306         case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
1307         case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
1308         case 's': case 't': case 'u': case 'v': case 'w': case 'x':
1309         case 'y': case 'z':
1310         case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
1311         case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
1312         case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
1313         case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
1314         case 'Y': case 'Z':
1315         case '0': case '1': case '2': case '3': case '4': case '5':
1316         case '6': case '7': case '8': case '9':
1317         case '_': case '{': case '}': case '#': case '[': case ']':
1318         case '<': case '>': case '%': case ':': case ';': case '.':
1319         case '?': case '*': case '+': case '-': case '/': case '^':
1320         case '&': case '|': case '~': case '!': case '=': case ',':
1321         case '"': case '\'':
1322           raw_prefix_len++;
1323           continue;
1324         }
1325       break;
1326     }
1327
1328   if (raw_prefix[raw_prefix_len] != '(')
1329     {
1330       int col = CPP_BUF_COLUMN (pfile->buffer, raw_prefix + raw_prefix_len)
1331                 + 1;
1332       if (raw_prefix_len == 16)
1333         cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, col,
1334                              "raw string delimiter longer than 16 characters");
1335       else
1336         cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, col,
1337                              "invalid character '%c' in raw string delimiter",
1338                              (int) raw_prefix[raw_prefix_len]);
1339       pfile->buffer->cur = raw_prefix - 1;
1340       create_literal (pfile, token, base, raw_prefix - 1 - base, CPP_OTHER);
1341       return;
1342     }
1343
1344   cur = raw_prefix + raw_prefix_len + 1;
1345   for (;;)
1346     {
1347 #define BUF_APPEND(STR,LEN)                                     \
1348       do {                                                      \
1349         bufring_append (pfile, (const uchar *)(STR), (LEN),     \
1350                         &first_buff, &last_buff);               \
1351         total_len += (LEN);                                     \
1352       } while (0);
1353
1354       cppchar_t c;
1355
1356       /* If we previously performed any trigraph or line splicing
1357          transformations, undo them within the body of the raw string.  */
1358       while (note->pos < cur)
1359         ++note;
1360       for (; note->pos == cur; ++note)
1361         {
1362           switch (note->type)
1363             {
1364             case '\\':
1365             case ' ':
1366               /* Restore backslash followed by newline.  */
1367               BUF_APPEND (base, cur - base);
1368               base = cur;
1369               BUF_APPEND ("\\", 1);
1370             after_backslash:
1371               if (note->type == ' ')
1372                 {
1373                   /* GNU backslash whitespace newline extension.  FIXME
1374                      could be any sequence of non-vertical space.  When we
1375                      can properly restore any such sequence, we should mark
1376                      this note as handled so _cpp_process_line_notes
1377                      doesn't warn.  */
1378                   BUF_APPEND (" ", 1);
1379                 }
1380
1381               BUF_APPEND ("\n", 1);
1382               break;
1383
1384             case 0:
1385               /* Already handled.  */
1386               break;
1387
1388             default:
1389               if (_cpp_trigraph_map[note->type])
1390                 {
1391                   /* Don't warn about this trigraph in
1392                      _cpp_process_line_notes, since trigraphs show up as
1393                      trigraphs in raw strings.  */
1394                   uchar type = note->type;
1395                   note->type = 0;
1396
1397                   if (!CPP_OPTION (pfile, trigraphs))
1398                     /* If we didn't convert the trigraph in the first
1399                        place, don't do anything now either.  */
1400                     break;
1401
1402                   BUF_APPEND (base, cur - base);
1403                   base = cur;
1404                   BUF_APPEND ("??", 2);
1405
1406                   /* ??/ followed by newline gets two line notes, one for
1407                      the trigraph and one for the backslash/newline.  */
1408                   if (type == '/' && note[1].pos == cur)
1409                     {
1410                       if (note[1].type != '\\'
1411                           && note[1].type != ' ')
1412                         abort ();
1413                       BUF_APPEND ("/", 1);
1414                       ++note;
1415                       goto after_backslash;
1416                     }
1417                   /* The ) from ??) could be part of the suffix.  */
1418                   else if (type == ')'
1419                            && strncmp ((const char *) cur+1,
1420                                        (const char *) raw_prefix,
1421                                        raw_prefix_len) == 0
1422                            && cur[raw_prefix_len+1] == '"')
1423                     {
1424                       BUF_APPEND (")", 1);
1425                       base++;
1426                       cur += raw_prefix_len + 2;
1427                       goto break_outer_loop;
1428                     }
1429                   else
1430                     {
1431                       /* Skip the replacement character.  */
1432                       base = ++cur;
1433                       BUF_APPEND (&type, 1);
1434                     }
1435                 }
1436               else
1437                 abort ();
1438               break;
1439             }
1440         }
1441       c = *cur++;
1442
1443       if (c == ')'
1444           && strncmp ((const char *) cur, (const char *) raw_prefix,
1445                       raw_prefix_len) == 0
1446           && cur[raw_prefix_len] == '"')
1447         {
1448           cur += raw_prefix_len + 1;
1449           break;
1450         }
1451       else if (c == '\n')
1452         {
1453           if (pfile->state.in_directive
1454               || pfile->state.parsing_args
1455               || pfile->state.in_deferred_pragma)
1456             {
1457               cur--;
1458               type = CPP_OTHER;
1459               cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, 0,
1460                                    "unterminated raw string");
1461               break;
1462             }
1463
1464           BUF_APPEND (base, cur - base);
1465
1466           if (pfile->buffer->cur < pfile->buffer->rlimit)
1467             CPP_INCREMENT_LINE (pfile, 0);
1468           pfile->buffer->need_line = true;
1469
1470           pfile->buffer->cur = cur-1;
1471           _cpp_process_line_notes (pfile, false);
1472           if (!_cpp_get_fresh_line (pfile))
1473             {
1474               source_location src_loc = token->src_loc;
1475               token->type = CPP_EOF;
1476               /* Tell the compiler the line number of the EOF token.  */
1477               token->src_loc = pfile->line_table->highest_line;
1478               token->flags = BOL;
1479               if (first_buff != NULL)
1480                 _cpp_release_buff (pfile, first_buff);
1481               cpp_error_with_line (pfile, CPP_DL_ERROR, src_loc, 0,
1482                                    "unterminated raw string");
1483               return;
1484             }
1485
1486           cur = base = pfile->buffer->cur;
1487           note = &pfile->buffer->notes[pfile->buffer->cur_note];
1488         }
1489     }
1490  break_outer_loop:
1491
1492   if (CPP_OPTION (pfile, user_literals))
1493     {
1494       /* Grab user defined literal suffix.  */
1495       if (ISIDST (*cur))
1496         {
1497           type = cpp_userdef_string_add_type (type);
1498           ++cur;
1499         }
1500       while (ISIDNUM (*cur))
1501         ++cur;
1502     }
1503
1504   pfile->buffer->cur = cur;
1505   if (first_buff == NULL)
1506     create_literal (pfile, token, base, cur - base, type);
1507   else
1508     {
1509       uchar *dest = _cpp_unaligned_alloc (pfile, total_len + (cur - base) + 1);
1510
1511       token->type = type;
1512       token->val.str.len = total_len + (cur - base);
1513       token->val.str.text = dest;
1514       last_buff = first_buff;
1515       while (last_buff != NULL)
1516         {
1517           memcpy (dest, last_buff->base,
1518                   BUFF_FRONT (last_buff) - last_buff->base);
1519           dest += BUFF_FRONT (last_buff) - last_buff->base;
1520           last_buff = last_buff->next;
1521         }
1522       _cpp_release_buff (pfile, first_buff);
1523       memcpy (dest, base, cur - base);
1524       dest[cur - base] = '\0';
1525     }
1526 }
1527
1528 /* Lexes a string, character constant, or angle-bracketed header file
1529    name.  The stored string contains the spelling, including opening
1530    quote and any leading 'L', 'u', 'U' or 'u8' and optional
1531    'R' modifier.  It returns the type of the literal, or CPP_OTHER
1532    if it was not properly terminated, or CPP_LESS for an unterminated
1533    header name which must be relexed as normal tokens.
1534
1535    The spelling is NUL-terminated, but it is not guaranteed that this
1536    is the first NUL since embedded NULs are preserved.  */
1537 static void
1538 lex_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
1539 {
1540   bool saw_NUL = false;
1541   const uchar *cur;
1542   cppchar_t terminator;
1543   enum cpp_ttype type;
1544
1545   cur = base;
1546   terminator = *cur++;
1547   if (terminator == 'L' || terminator == 'U')
1548     terminator = *cur++;
1549   else if (terminator == 'u')
1550     {
1551       terminator = *cur++;
1552       if (terminator == '8')
1553         terminator = *cur++;
1554     }
1555   if (terminator == 'R')
1556     {
1557       lex_raw_string (pfile, token, base, cur);
1558       return;
1559     }
1560   if (terminator == '"')
1561     type = (*base == 'L' ? CPP_WSTRING :
1562             *base == 'U' ? CPP_STRING32 :
1563             *base == 'u' ? (base[1] == '8' ? CPP_UTF8STRING : CPP_STRING16)
1564                          : CPP_STRING);
1565   else if (terminator == '\'')
1566     type = (*base == 'L' ? CPP_WCHAR :
1567             *base == 'U' ? CPP_CHAR32 :
1568             *base == 'u' ? CPP_CHAR16 : CPP_CHAR);
1569   else
1570     terminator = '>', type = CPP_HEADER_NAME;
1571
1572   for (;;)
1573     {
1574       cppchar_t c = *cur++;
1575
1576       /* In #include-style directives, terminators are not escapable.  */
1577       if (c == '\\' && !pfile->state.angled_headers && *cur != '\n')
1578         cur++;
1579       else if (c == terminator)
1580         break;
1581       else if (c == '\n')
1582         {
1583           cur--;
1584           /* Unmatched quotes always yield undefined behavior, but
1585              greedy lexing means that what appears to be an unterminated
1586              header name may actually be a legitimate sequence of tokens.  */
1587           if (terminator == '>')
1588             {
1589               token->type = CPP_LESS;
1590               return;
1591             }
1592           type = CPP_OTHER;
1593           break;
1594         }
1595       else if (c == '\0')
1596         saw_NUL = true;
1597     }
1598
1599   if (saw_NUL && !pfile->state.skipping)
1600     cpp_error (pfile, CPP_DL_WARNING,
1601                "null character(s) preserved in literal");
1602
1603   if (type == CPP_OTHER && CPP_OPTION (pfile, lang) != CLK_ASM)
1604     cpp_error (pfile, CPP_DL_PEDWARN, "missing terminating %c character",
1605                (int) terminator);
1606
1607   if (CPP_OPTION (pfile, user_literals))
1608     {
1609       /* Grab user defined literal suffix.  */
1610       if (ISIDST (*cur))
1611         {
1612           type = cpp_userdef_char_add_type (type);
1613           type = cpp_userdef_string_add_type (type);
1614           ++cur;
1615         }
1616       while (ISIDNUM (*cur))
1617         ++cur;
1618     }
1619
1620   pfile->buffer->cur = cur;
1621   create_literal (pfile, token, base, cur - base, type);
1622 }
1623
1624 /* Return the comment table. The client may not make any assumption
1625    about the ordering of the table.  */
1626 cpp_comment_table *
1627 cpp_get_comments (cpp_reader *pfile)
1628 {
1629   return &pfile->comments;
1630 }
1631
1632 /* Append a comment to the end of the comment table. */
1633 static void
1634 store_comment (cpp_reader *pfile, cpp_token *token)
1635 {
1636   int len;
1637
1638   if (pfile->comments.allocated == 0)
1639     {
1640       pfile->comments.allocated = 256;
1641       pfile->comments.entries = (cpp_comment *) xmalloc
1642         (pfile->comments.allocated * sizeof (cpp_comment));
1643     }
1644
1645   if (pfile->comments.count == pfile->comments.allocated)
1646     {
1647       pfile->comments.allocated *= 2;
1648       pfile->comments.entries = (cpp_comment *) xrealloc
1649         (pfile->comments.entries,
1650          pfile->comments.allocated * sizeof (cpp_comment));
1651     }
1652
1653   len = token->val.str.len;
1654
1655   /* Copy comment. Note, token may not be NULL terminated. */
1656   pfile->comments.entries[pfile->comments.count].comment =
1657     (char *) xmalloc (sizeof (char) * (len + 1));
1658   memcpy (pfile->comments.entries[pfile->comments.count].comment,
1659           token->val.str.text, len);
1660   pfile->comments.entries[pfile->comments.count].comment[len] = '\0';
1661
1662   /* Set source location. */
1663   pfile->comments.entries[pfile->comments.count].sloc = token->src_loc;
1664
1665   /* Increment the count of entries in the comment table. */
1666   pfile->comments.count++;
1667 }
1668
1669 /* The stored comment includes the comment start and any terminator.  */
1670 static void
1671 save_comment (cpp_reader *pfile, cpp_token *token, const unsigned char *from,
1672               cppchar_t type)
1673 {
1674   unsigned char *buffer;
1675   unsigned int len, clen, i;
1676
1677   len = pfile->buffer->cur - from + 1; /* + 1 for the initial '/'.  */
1678
1679   /* C++ comments probably (not definitely) have moved past a new
1680      line, which we don't want to save in the comment.  */
1681   if (is_vspace (pfile->buffer->cur[-1]))
1682     len--;
1683
1684   /* If we are currently in a directive or in argument parsing, then
1685      we need to store all C++ comments as C comments internally, and
1686      so we need to allocate a little extra space in that case.
1687
1688      Note that the only time we encounter a directive here is
1689      when we are saving comments in a "#define".  */
1690   clen = ((pfile->state.in_directive || pfile->state.parsing_args)
1691           && type == '/') ? len + 2 : len;
1692
1693   buffer = _cpp_unaligned_alloc (pfile, clen);
1694
1695   token->type = CPP_COMMENT;
1696   token->val.str.len = clen;
1697   token->val.str.text = buffer;
1698
1699   buffer[0] = '/';
1700   memcpy (buffer + 1, from, len - 1);
1701
1702   /* Finish conversion to a C comment, if necessary.  */
1703   if ((pfile->state.in_directive || pfile->state.parsing_args) && type == '/')
1704     {
1705       buffer[1] = '*';
1706       buffer[clen - 2] = '*';
1707       buffer[clen - 1] = '/';
1708       /* As there can be in a C++ comments illegal sequences for C comments
1709          we need to filter them out.  */
1710       for (i = 2; i < (clen - 2); i++)
1711         if (buffer[i] == '/' && (buffer[i - 1] == '*' || buffer[i + 1] == '*'))
1712           buffer[i] = '|';
1713     }
1714
1715   /* Finally store this comment for use by clients of libcpp. */
1716   store_comment (pfile, token);
1717 }
1718
1719 /* Allocate COUNT tokens for RUN.  */
1720 void
1721 _cpp_init_tokenrun (tokenrun *run, unsigned int count)
1722 {
1723   run->base = XNEWVEC (cpp_token, count);
1724   run->limit = run->base + count;
1725   run->next = NULL;
1726 }
1727
1728 /* Returns the next tokenrun, or creates one if there is none.  */
1729 static tokenrun *
1730 next_tokenrun (tokenrun *run)
1731 {
1732   if (run->next == NULL)
1733     {
1734       run->next = XNEW (tokenrun);
1735       run->next->prev = run;
1736       _cpp_init_tokenrun (run->next, 250);
1737     }
1738
1739   return run->next;
1740 }
1741
1742 /* Return the number of not yet processed token in a given
1743    context.  */
1744 int
1745 _cpp_remaining_tokens_num_in_context (cpp_context *context)
1746 {
1747   if (context->tokens_kind == TOKENS_KIND_DIRECT)
1748     return (LAST (context).token - FIRST (context).token);
1749   else if (context->tokens_kind == TOKENS_KIND_INDIRECT
1750            || context->tokens_kind == TOKENS_KIND_EXTENDED)
1751     return (LAST (context).ptoken - FIRST (context).ptoken);
1752   else
1753       abort ();
1754 }
1755
1756 /* Returns the token present at index INDEX in a given context.  If
1757    INDEX is zero, the next token to be processed is returned.  */
1758 static const cpp_token*
1759 _cpp_token_from_context_at (cpp_context *context, int index)
1760 {
1761   if (context->tokens_kind == TOKENS_KIND_DIRECT)
1762     return &(FIRST (context).token[index]);
1763   else if (context->tokens_kind == TOKENS_KIND_INDIRECT
1764            || context->tokens_kind == TOKENS_KIND_EXTENDED)
1765     return FIRST (context).ptoken[index];
1766  else
1767    abort ();
1768 }
1769
1770 /* Look ahead in the input stream.  */
1771 const cpp_token *
1772 cpp_peek_token (cpp_reader *pfile, int index)
1773 {
1774   cpp_context *context = pfile->context;
1775   const cpp_token *peektok;
1776   int count;
1777
1778   /* First, scan through any pending cpp_context objects.  */
1779   while (context->prev)
1780     {
1781       ptrdiff_t sz = _cpp_remaining_tokens_num_in_context (context);
1782
1783       if (index < (int) sz)
1784         return _cpp_token_from_context_at (context, index);
1785       index -= (int) sz;
1786       context = context->prev;
1787     }
1788
1789   /* We will have to read some new tokens after all (and do so
1790      without invalidating preceding tokens).  */
1791   count = index;
1792   pfile->keep_tokens++;
1793
1794   do
1795     {
1796       peektok = _cpp_lex_token (pfile);
1797       if (peektok->type == CPP_EOF)
1798         return peektok;
1799     }
1800   while (index--);
1801
1802   _cpp_backup_tokens_direct (pfile, count + 1);
1803   pfile->keep_tokens--;
1804
1805   return peektok;
1806 }
1807
1808 /* Allocate a single token that is invalidated at the same time as the
1809    rest of the tokens on the line.  Has its line and col set to the
1810    same as the last lexed token, so that diagnostics appear in the
1811    right place.  */
1812 cpp_token *
1813 _cpp_temp_token (cpp_reader *pfile)
1814 {
1815   cpp_token *old, *result;
1816   ptrdiff_t sz = pfile->cur_run->limit - pfile->cur_token;
1817   ptrdiff_t la = (ptrdiff_t) pfile->lookaheads;
1818
1819   old = pfile->cur_token - 1;
1820   /* Any pre-existing lookaheads must not be clobbered.  */
1821   if (la)
1822     {
1823       if (sz <= la)
1824         {
1825           tokenrun *next = next_tokenrun (pfile->cur_run);
1826
1827           if (sz < la)
1828             memmove (next->base + 1, next->base,
1829                      (la - sz) * sizeof (cpp_token));
1830
1831           next->base[0] = pfile->cur_run->limit[-1];
1832         }
1833
1834       if (sz > 1)
1835         memmove (pfile->cur_token + 1, pfile->cur_token,
1836                  MIN (la, sz - 1) * sizeof (cpp_token));
1837     }
1838
1839   if (!sz && pfile->cur_token == pfile->cur_run->limit)
1840     {
1841       pfile->cur_run = next_tokenrun (pfile->cur_run);
1842       pfile->cur_token = pfile->cur_run->base;
1843     }
1844
1845   result = pfile->cur_token++;
1846   result->src_loc = old->src_loc;
1847   return result;
1848 }
1849
1850 /* Lex a token into RESULT (external interface).  Takes care of issues
1851    like directive handling, token lookahead, multiple include
1852    optimization and skipping.  */
1853 const cpp_token *
1854 _cpp_lex_token (cpp_reader *pfile)
1855 {
1856   cpp_token *result;
1857
1858   for (;;)
1859     {
1860       if (pfile->cur_token == pfile->cur_run->limit)
1861         {
1862           pfile->cur_run = next_tokenrun (pfile->cur_run);
1863           pfile->cur_token = pfile->cur_run->base;
1864         }
1865       /* We assume that the current token is somewhere in the current
1866          run.  */
1867       if (pfile->cur_token < pfile->cur_run->base
1868           || pfile->cur_token >= pfile->cur_run->limit)
1869         abort ();
1870
1871       if (pfile->lookaheads)
1872         {
1873           pfile->lookaheads--;
1874           result = pfile->cur_token++;
1875         }
1876       else
1877         result = _cpp_lex_direct (pfile);
1878
1879       if (result->flags & BOL)
1880         {
1881           /* Is this a directive.  If _cpp_handle_directive returns
1882              false, it is an assembler #.  */
1883           if (result->type == CPP_HASH
1884               /* 6.10.3 p 11: Directives in a list of macro arguments
1885                  gives undefined behavior.  This implementation
1886                  handles the directive as normal.  */
1887               && pfile->state.parsing_args != 1)
1888             {
1889               if (_cpp_handle_directive (pfile, result->flags & PREV_WHITE))
1890                 {
1891                   if (pfile->directive_result.type == CPP_PADDING)
1892                     continue;
1893                   result = &pfile->directive_result;
1894                 }
1895             }
1896           else if (pfile->state.in_deferred_pragma)
1897             result = &pfile->directive_result;
1898
1899           if (pfile->cb.line_change && !pfile->state.skipping)
1900             pfile->cb.line_change (pfile, result, pfile->state.parsing_args);
1901         }
1902
1903       /* We don't skip tokens in directives.  */
1904       if (pfile->state.in_directive || pfile->state.in_deferred_pragma)
1905         break;
1906
1907       /* Outside a directive, invalidate controlling macros.  At file
1908          EOF, _cpp_lex_direct takes care of popping the buffer, so we never
1909          get here and MI optimization works.  */
1910       pfile->mi_valid = false;
1911
1912       if (!pfile->state.skipping || result->type == CPP_EOF)
1913         break;
1914     }
1915
1916   return result;
1917 }
1918
1919 /* Returns true if a fresh line has been loaded.  */
1920 bool
1921 _cpp_get_fresh_line (cpp_reader *pfile)
1922 {
1923   int return_at_eof;
1924
1925   /* We can't get a new line until we leave the current directive.  */
1926   if (pfile->state.in_directive)
1927     return false;
1928
1929   for (;;)
1930     {
1931       cpp_buffer *buffer = pfile->buffer;
1932
1933       if (!buffer->need_line)
1934         return true;
1935
1936       if (buffer->next_line < buffer->rlimit)
1937         {
1938           _cpp_clean_line (pfile);
1939           return true;
1940         }
1941
1942       /* First, get out of parsing arguments state.  */
1943       if (pfile->state.parsing_args)
1944         return false;
1945
1946       /* End of buffer.  Non-empty files should end in a newline.  */
1947       if (buffer->buf != buffer->rlimit
1948           && buffer->next_line > buffer->rlimit
1949           && !buffer->from_stage3)
1950         {
1951           /* Clip to buffer size.  */
1952           buffer->next_line = buffer->rlimit;
1953         }
1954
1955       return_at_eof = buffer->return_at_eof;
1956       _cpp_pop_buffer (pfile);
1957       if (pfile->buffer == NULL || return_at_eof)
1958         return false;
1959     }
1960 }
1961
/* Helper for _cpp_lex_direct: if the next character in BUFFER is
   CHAR, consume it and give RESULT the type THEN_TYPE; otherwise
   give it ELSE_TYPE.  Relies on variables named `buffer' and
   `result' being in scope at the point of use.  */
#define IF_NEXT_IS(CHAR, THEN_TYPE, ELSE_TYPE)          \
  do                                                    \
    {                                                   \
      if (*buffer->cur == CHAR)                         \
        {                                               \
          buffer->cur++;                                \
          result->type = THEN_TYPE;                     \
        }                                               \
      else                                              \
        result->type = ELSE_TYPE;                       \
    }                                                   \
  while (0)
1970
1971 /* Lex a token into pfile->cur_token, which is also incremented, to
1972    get diagnostics pointing to the correct location.
1973
1974    Does not handle issues such as token lookahead, multiple-include
1975    optimization, directives, skipping etc.  This function is only
1976    suitable for use by _cpp_lex_token, and in special cases like
1977    lex_expansion_token which doesn't care for any of these issues.
1978
1979    When meeting a newline, returns CPP_EOF if parsing a directive,
1980    otherwise returns to the start of the token buffer if permissible.
1981    Returns the location of the lexed token.  */
1982 cpp_token *
1983 _cpp_lex_direct (cpp_reader *pfile)
1984 {
1985   cppchar_t c;
1986   cpp_buffer *buffer;
1987   const unsigned char *comment_start;
1988   cpp_token *result = pfile->cur_token++;
1989
1990  fresh_line:
1991   result->flags = 0;
1992   buffer = pfile->buffer;
1993   if (buffer->need_line)
1994     {
1995       if (pfile->state.in_deferred_pragma)
1996         {
1997           result->type = CPP_PRAGMA_EOL;
1998           pfile->state.in_deferred_pragma = false;
1999           if (!pfile->state.pragma_allow_expansion)
2000             pfile->state.prevent_expansion--;
2001           return result;
2002         }
2003       if (!_cpp_get_fresh_line (pfile))
2004         {
2005           result->type = CPP_EOF;
2006           if (!pfile->state.in_directive)
2007             {
2008               /* Tell the compiler the line number of the EOF token.  */
2009               result->src_loc = pfile->line_table->highest_line;
2010               result->flags = BOL;
2011             }
2012           return result;
2013         }
2014       if (!pfile->keep_tokens)
2015         {
2016           pfile->cur_run = &pfile->base_run;
2017           result = pfile->base_run.base;
2018           pfile->cur_token = result + 1;
2019         }
2020       result->flags = BOL;
2021       if (pfile->state.parsing_args == 2)
2022         result->flags |= PREV_WHITE;
2023     }
2024   buffer = pfile->buffer;
2025  update_tokens_line:
2026   result->src_loc = pfile->line_table->highest_line;
2027
2028  skipped_white:
2029   if (buffer->cur >= buffer->notes[buffer->cur_note].pos
2030       && !pfile->overlaid_buffer)
2031     {
2032       _cpp_process_line_notes (pfile, false);
2033       result->src_loc = pfile->line_table->highest_line;
2034     }
2035   c = *buffer->cur++;
2036
2037   if (pfile->forced_token_location_p)
2038     result->src_loc = *pfile->forced_token_location_p;
2039   else
2040     result->src_loc = linemap_position_for_column (pfile->line_table,
2041                                           CPP_BUF_COLUMN (buffer, buffer->cur));
2042
2043   switch (c)
2044     {
2045     case ' ': case '\t': case '\f': case '\v': case '\0':
2046       result->flags |= PREV_WHITE;
2047       skip_whitespace (pfile, c);
2048       goto skipped_white;
2049
2050     case '\n':
2051       if (buffer->cur < buffer->rlimit)
2052         CPP_INCREMENT_LINE (pfile, 0);
2053       buffer->need_line = true;
2054       goto fresh_line;
2055
2056     case '0': case '1': case '2': case '3': case '4':
2057     case '5': case '6': case '7': case '8': case '9':
2058       {
2059         struct normalize_state nst = INITIAL_NORMALIZE_STATE;
2060         result->type = CPP_NUMBER;
2061         lex_number (pfile, &result->val.str, &nst);
2062         warn_about_normalization (pfile, result, &nst);
2063         break;
2064       }
2065
2066     case 'L':
2067     case 'u':
2068     case 'U':
2069     case 'R':
2070       /* 'L', 'u', 'U', 'u8' or 'R' may introduce wide characters,
2071          wide strings or raw strings.  */
2072       if (c == 'L' || CPP_OPTION (pfile, rliterals)
2073           || (c != 'R' && CPP_OPTION (pfile, uliterals)))
2074         {
2075           if ((*buffer->cur == '\'' && c != 'R')
2076               || *buffer->cur == '"'
2077               || (*buffer->cur == 'R'
2078                   && c != 'R'
2079                   && buffer->cur[1] == '"'
2080                   && CPP_OPTION (pfile, rliterals))
2081               || (*buffer->cur == '8'
2082                   && c == 'u'
2083                   && (buffer->cur[1] == '"'
2084                       || (buffer->cur[1] == 'R' && buffer->cur[2] == '"'
2085                           && CPP_OPTION (pfile, rliterals)))))
2086             {
2087               lex_string (pfile, result, buffer->cur - 1);
2088               break;
2089             }
2090         }
2091       /* Fall through.  */
2092
2093     case '_':
2094     case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
2095     case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
2096     case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
2097     case 's': case 't':           case 'v': case 'w': case 'x':
2098     case 'y': case 'z':
2099     case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
2100     case 'G': case 'H': case 'I': case 'J': case 'K':
2101     case 'M': case 'N': case 'O': case 'P': case 'Q':
2102     case 'S': case 'T':           case 'V': case 'W': case 'X':
2103     case 'Y': case 'Z':
2104       result->type = CPP_NAME;
2105       {
2106         struct normalize_state nst = INITIAL_NORMALIZE_STATE;
2107         result->val.node.node = lex_identifier (pfile, buffer->cur - 1, false,
2108                                                 &nst);
2109         warn_about_normalization (pfile, result, &nst);
2110       }
2111
2112       /* Convert named operators to their proper types.  */
2113       if (result->val.node.node->flags & NODE_OPERATOR)
2114         {
2115           result->flags |= NAMED_OP;
2116           result->type = (enum cpp_ttype) result->val.node.node->directive_index;
2117         }
2118       break;
2119
2120     case '\'':
2121     case '"':
2122       lex_string (pfile, result, buffer->cur - 1);
2123       break;
2124
2125     case '/':
2126       /* A potential block or line comment.  */
2127       comment_start = buffer->cur;
2128       c = *buffer->cur;
2129
2130       if (c == '*')
2131         {
2132           if (_cpp_skip_block_comment (pfile))
2133             cpp_error (pfile, CPP_DL_ERROR, "unterminated comment");
2134         }
2135       else if (c == '/' && (CPP_OPTION (pfile, cplusplus_comments)
2136                             || cpp_in_system_header (pfile)))
2137         {
2138           /* Warn about comments only if pedantically GNUC89, and not
2139              in system headers.  */
2140           if (CPP_OPTION (pfile, lang) == CLK_GNUC89 && CPP_PEDANTIC (pfile)
2141               && ! buffer->warned_cplusplus_comments)
2142             {
2143               cpp_error (pfile, CPP_DL_PEDWARN,
2144                          "C++ style comments are not allowed in ISO C90");
2145               cpp_error (pfile, CPP_DL_PEDWARN,
2146                          "(this will be reported only once per input file)");
2147               buffer->warned_cplusplus_comments = 1;
2148             }
2149
2150           if (skip_line_comment (pfile) && CPP_OPTION (pfile, warn_comments))
2151             cpp_warning (pfile, CPP_W_COMMENTS, "multi-line comment");
2152         }
2153       else if (c == '=')
2154         {
2155           buffer->cur++;
2156           result->type = CPP_DIV_EQ;
2157           break;
2158         }
2159       else
2160         {
2161           result->type = CPP_DIV;
2162           break;
2163         }
2164
2165       if (!pfile->state.save_comments)
2166         {
2167           result->flags |= PREV_WHITE;
2168           goto update_tokens_line;
2169         }
2170
2171       /* Save the comment as a token in its own right.  */
2172       save_comment (pfile, result, comment_start, c);
2173       break;
2174
2175     case '<':
2176       if (pfile->state.angled_headers)
2177         {
2178           lex_string (pfile, result, buffer->cur - 1);
2179           if (result->type != CPP_LESS)
2180             break;
2181         }
2182
2183       result->type = CPP_LESS;
2184       if (*buffer->cur == '=')
2185         buffer->cur++, result->type = CPP_LESS_EQ;
2186       else if (*buffer->cur == '<')
2187         {
2188           buffer->cur++;
2189           IF_NEXT_IS ('=', CPP_LSHIFT_EQ, CPP_LSHIFT);
2190         }
2191       else if (CPP_OPTION (pfile, digraphs))
2192         {
2193           if (*buffer->cur == ':')
2194             {
2195               buffer->cur++;
2196               result->flags |= DIGRAPH;
2197               result->type = CPP_OPEN_SQUARE;
2198             }
2199           else if (*buffer->cur == '%')
2200             {
2201               buffer->cur++;
2202               result->flags |= DIGRAPH;
2203               result->type = CPP_OPEN_BRACE;
2204             }
2205         }
2206       break;
2207
2208     case '>':
2209       result->type = CPP_GREATER;
2210       if (*buffer->cur == '=')
2211         buffer->cur++, result->type = CPP_GREATER_EQ;
2212       else if (*buffer->cur == '>')
2213         {
2214           buffer->cur++;
2215           IF_NEXT_IS ('=', CPP_RSHIFT_EQ, CPP_RSHIFT);
2216         }
2217       break;
2218
2219     case '%':
2220       result->type = CPP_MOD;
2221       if (*buffer->cur == '=')
2222         buffer->cur++, result->type = CPP_MOD_EQ;
2223       else if (CPP_OPTION (pfile, digraphs))
2224         {
2225           if (*buffer->cur == ':')
2226             {
2227               buffer->cur++;
2228               result->flags |= DIGRAPH;
2229               result->type = CPP_HASH;
2230               if (*buffer->cur == '%' && buffer->cur[1] == ':')
2231                 buffer->cur += 2, result->type = CPP_PASTE, result->val.token_no = 0;
2232             }
2233           else if (*buffer->cur == '>')
2234             {
2235               buffer->cur++;
2236               result->flags |= DIGRAPH;
2237               result->type = CPP_CLOSE_BRACE;
2238             }
2239         }
2240       break;
2241
2242     case '.':
2243       result->type = CPP_DOT;
2244       if (ISDIGIT (*buffer->cur))
2245         {
2246           struct normalize_state nst = INITIAL_NORMALIZE_STATE;
2247           result->type = CPP_NUMBER;
2248           lex_number (pfile, &result->val.str, &nst);
2249           warn_about_normalization (pfile, result, &nst);
2250         }
2251       else if (*buffer->cur == '.' && buffer->cur[1] == '.')
2252         buffer->cur += 2, result->type = CPP_ELLIPSIS;
2253       else if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
2254         buffer->cur++, result->type = CPP_DOT_STAR;
2255       break;
2256
2257     case '+':
2258       result->type = CPP_PLUS;
2259       if (*buffer->cur == '+')
2260         buffer->cur++, result->type = CPP_PLUS_PLUS;
2261       else if (*buffer->cur == '=')
2262         buffer->cur++, result->type = CPP_PLUS_EQ;
2263       break;
2264
2265     case '-':
2266       result->type = CPP_MINUS;
2267       if (*buffer->cur == '>')
2268         {
2269           buffer->cur++;
2270           result->type = CPP_DEREF;
2271           if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
2272             buffer->cur++, result->type = CPP_DEREF_STAR;
2273         }
2274       else if (*buffer->cur == '-')
2275         buffer->cur++, result->type = CPP_MINUS_MINUS;
2276       else if (*buffer->cur == '=')
2277         buffer->cur++, result->type = CPP_MINUS_EQ;
2278       break;
2279
2280     case '&':
2281       result->type = CPP_AND;
2282       if (*buffer->cur == '&')
2283         buffer->cur++, result->type = CPP_AND_AND;
2284       else if (*buffer->cur == '=')
2285         buffer->cur++, result->type = CPP_AND_EQ;
2286       break;
2287
2288     case '|':
2289       result->type = CPP_OR;
2290       if (*buffer->cur == '|')
2291         buffer->cur++, result->type = CPP_OR_OR;
2292       else if (*buffer->cur == '=')
2293         buffer->cur++, result->type = CPP_OR_EQ;
2294       break;
2295
2296     case ':':
2297       result->type = CPP_COLON;
2298       if (*buffer->cur == ':' && CPP_OPTION (pfile, cplusplus))
2299         buffer->cur++, result->type = CPP_SCOPE;
2300       else if (*buffer->cur == '>' && CPP_OPTION (pfile, digraphs))
2301         {
2302           buffer->cur++;
2303           result->flags |= DIGRAPH;
2304           result->type = CPP_CLOSE_SQUARE;
2305         }
2306       break;
2307
2308     case '*': IF_NEXT_IS ('=', CPP_MULT_EQ, CPP_MULT); break;
2309     case '=': IF_NEXT_IS ('=', CPP_EQ_EQ, CPP_EQ); break;
2310     case '!': IF_NEXT_IS ('=', CPP_NOT_EQ, CPP_NOT); break;
2311     case '^': IF_NEXT_IS ('=', CPP_XOR_EQ, CPP_XOR); break;
2312     case '#': IF_NEXT_IS ('#', CPP_PASTE, CPP_HASH); result->val.token_no = 0; break;
2313
2314     case '?': result->type = CPP_QUERY; break;
2315     case '~': result->type = CPP_COMPL; break;
2316     case ',': result->type = CPP_COMMA; break;
2317     case '(': result->type = CPP_OPEN_PAREN; break;
2318     case ')': result->type = CPP_CLOSE_PAREN; break;
2319     case '[': result->type = CPP_OPEN_SQUARE; break;
2320     case ']': result->type = CPP_CLOSE_SQUARE; break;
2321     case '{': result->type = CPP_OPEN_BRACE; break;
2322     case '}': result->type = CPP_CLOSE_BRACE; break;
2323     case ';': result->type = CPP_SEMICOLON; break;
2324
2325       /* @ is a punctuator in Objective-C.  */
2326     case '@': result->type = CPP_ATSIGN; break;
2327
2328     case '$':
2329     case '\\':
2330       {
2331         const uchar *base = --buffer->cur;
2332         struct normalize_state nst = INITIAL_NORMALIZE_STATE;
2333
2334         if (forms_identifier_p (pfile, true, &nst))
2335           {
2336             result->type = CPP_NAME;
2337             result->val.node.node = lex_identifier (pfile, base, true, &nst);
2338             warn_about_normalization (pfile, result, &nst);
2339             break;
2340           }
2341         buffer->cur++;
2342       }
2343
2344     default:
2345       create_literal (pfile, result, buffer->cur - 1, 1, CPP_OTHER);
2346       break;
2347     }
2348
2349   return result;
2350 }
2351
2352 /* An upper bound on the number of bytes needed to spell TOKEN.
2353    Does not include preceding whitespace.  */
2354 unsigned int
2355 cpp_token_len (const cpp_token *token)
2356 {
2357   unsigned int len;
2358
2359   switch (TOKEN_SPELL (token))
2360     {
2361     default:            len = 6;                                break;
2362     case SPELL_LITERAL: len = token->val.str.len;               break;
2363     case SPELL_IDENT:   len = NODE_LEN (token->val.node.node) * 10;     break;
2364     }
2365
2366   return len;
2367 }
2368
/* Decode the UTF-8 sequence starting at NAME and write the
   corresponding \U escape (a backslash, 'U' and eight lowercase
   hexadecimal digits, ten bytes in all, no terminating NUL) to
   BUFFER.  Returns the number of bytes of NAME that were consumed.
   Aborts if a continuation byte is ill-formed.  */

static size_t
utf8_to_ucn (unsigned char *buffer, const unsigned char *name)
{
  static const char hex_digits[] = "0123456789abcdef";
  const unsigned char *src = name;
  unsigned long code_point;
  unsigned lead;
  int seq_len = 0;
  int k;

  /* The number of leading one bits of the first byte gives the
     length of the UTF-8 sequence.  */
  lead = *src;
  while (lead & 0x80)
    {
      seq_len++;
      lead <<= 1;
    }

  /* The remaining low bits of the first byte are payload.  */
  code_point = *src & (0x7F >> seq_len);

  /* Each continuation byte contributes six more bits.  */
  for (k = 1; k < seq_len; k++)
    {
      src++;

      /* Ill-formed UTF-8.  */
      if ((*src & ~0x3F) != 0x80)
        abort ();

      code_point = (code_point << 6) | (*src & 0x3F);
    }

  buffer[0] = '\\';
  buffer[1] = 'U';
  for (k = 0; k < 8; k++)
    buffer[2 + k] = hex_digits[(code_point >> (4 * (7 - k))) & 0xF];

  return seq_len;
}
2402
2403 /* Given a token TYPE corresponding to a digraph, return a pointer to
2404    the spelling of the digraph.  */
2405 static const unsigned char *
2406 cpp_digraph2name (enum cpp_ttype type)
2407 {
2408   return digraph_spellings[(int) type - (int) CPP_FIRST_DIGRAPH];
2409 }
2410
2411 /* Write the spelling of a token TOKEN to BUFFER.  The buffer must
2412    already contain the enough space to hold the token's spelling.
2413    Returns a pointer to the character after the last character written.
2414    FORSTRING is true if this is to be the spelling after translation
2415    phase 1 (this is different for UCNs).
2416    FIXME: Would be nice if we didn't need the PFILE argument.  */
2417 unsigned char *
2418 cpp_spell_token (cpp_reader *pfile, const cpp_token *token,
2419                  unsigned char *buffer, bool forstring)
2420 {
2421   switch (TOKEN_SPELL (token))
2422     {
2423     case SPELL_OPERATOR:
2424       {
2425         const unsigned char *spelling;
2426         unsigned char c;
2427
2428         if (token->flags & DIGRAPH)
2429           spelling = cpp_digraph2name (token->type);
2430         else if (token->flags & NAMED_OP)
2431           goto spell_ident;
2432         else
2433           spelling = TOKEN_NAME (token);
2434
2435         while ((c = *spelling++) != '\0')
2436           *buffer++ = c;
2437       }
2438       break;
2439
2440     spell_ident:
2441     case SPELL_IDENT:
2442       if (forstring)
2443         {
2444           memcpy (buffer, NODE_NAME (token->val.node.node),
2445                   NODE_LEN (token->val.node.node));
2446           buffer += NODE_LEN (token->val.node.node);
2447         }
2448       else
2449         {
2450           size_t i;
2451           const unsigned char * name = NODE_NAME (token->val.node.node);
2452
2453           for (i = 0; i < NODE_LEN (token->val.node.node); i++)
2454             if (name[i] & ~0x7F)
2455               {
2456                 i += utf8_to_ucn (buffer, name + i) - 1;
2457                 buffer += 10;
2458               }
2459             else
2460               *buffer++ = NODE_NAME (token->val.node.node)[i];
2461         }
2462       break;
2463
2464     case SPELL_LITERAL:
2465       memcpy (buffer, token->val.str.text, token->val.str.len);
2466       buffer += token->val.str.len;
2467       break;
2468
2469     case SPELL_NONE:
2470       cpp_error (pfile, CPP_DL_ICE,
2471                  "unspellable token %s", TOKEN_NAME (token));
2472       break;
2473     }
2474
2475   return buffer;
2476 }
2477
2478 /* Returns TOKEN spelt as a null-terminated string.  The string is
2479    freed when the reader is destroyed.  Useful for diagnostics.  */
2480 unsigned char *
2481 cpp_token_as_text (cpp_reader *pfile, const cpp_token *token)
2482 {
2483   unsigned int len = cpp_token_len (token) + 1;
2484   unsigned char *start = _cpp_unaligned_alloc (pfile, len), *end;
2485
2486   end = cpp_spell_token (pfile, token, start, false);
2487   end[0] = '\0';
2488
2489   return start;
2490 }
2491
2492 /* Returns a pointer to a string which spells the token defined by
2493    TYPE and FLAGS.  Used by C front ends, which really should move to
2494    using cpp_token_as_text.  */
2495 const char *
2496 cpp_type2name (enum cpp_ttype type, unsigned char flags)
2497 {
2498   if (flags & DIGRAPH)
2499     return (const char *) cpp_digraph2name (type);
2500   else if (flags & NAMED_OP)
2501     return cpp_named_operator2name (type);
2502
2503   return (const char *) token_spellings[type].name;
2504 }
2505
2506 /* Writes the spelling of token to FP, without any preceding space.
2507    Separated from cpp_spell_token for efficiency - to avoid stdio
2508    double-buffering.  */
2509 void
2510 cpp_output_token (const cpp_token *token, FILE *fp)
2511 {
2512   switch (TOKEN_SPELL (token))
2513     {
2514     case SPELL_OPERATOR:
2515       {
2516         const unsigned char *spelling;
2517         int c;
2518
2519         if (token->flags & DIGRAPH)
2520           spelling = cpp_digraph2name (token->type);
2521         else if (token->flags & NAMED_OP)
2522           goto spell_ident;
2523         else
2524           spelling = TOKEN_NAME (token);
2525
2526         c = *spelling;
2527         do
2528           putc (c, fp);
2529         while ((c = *++spelling) != '\0');
2530       }
2531       break;
2532
2533     spell_ident:
2534     case SPELL_IDENT:
2535       {
2536         size_t i;
2537         const unsigned char * name = NODE_NAME (token->val.node.node);
2538
2539         for (i = 0; i < NODE_LEN (token->val.node.node); i++)
2540           if (name[i] & ~0x7F)
2541             {
2542               unsigned char buffer[10];
2543               i += utf8_to_ucn (buffer, name + i) - 1;
2544               fwrite (buffer, 1, 10, fp);
2545             }
2546           else
2547             fputc (NODE_NAME (token->val.node.node)[i], fp);
2548       }
2549       break;
2550
2551     case SPELL_LITERAL:
2552       fwrite (token->val.str.text, 1, token->val.str.len, fp);
2553       break;
2554
2555     case SPELL_NONE:
2556       /* An error, most probably.  */
2557       break;
2558     }
2559 }
2560
2561 /* Compare two tokens.  */
2562 int
2563 _cpp_equiv_tokens (const cpp_token *a, const cpp_token *b)
2564 {
2565   if (a->type == b->type && a->flags == b->flags)
2566     switch (TOKEN_SPELL (a))
2567       {
2568       default:                  /* Keep compiler happy.  */
2569       case SPELL_OPERATOR:
2570         /* token_no is used to track where multiple consecutive ##
2571            tokens were originally located.  */
2572         return (a->type != CPP_PASTE || a->val.token_no == b->val.token_no);
2573       case SPELL_NONE:
2574         return (a->type != CPP_MACRO_ARG
2575                 || a->val.macro_arg.arg_no == b->val.macro_arg.arg_no);
2576       case SPELL_IDENT:
2577         return a->val.node.node == b->val.node.node;
2578       case SPELL_LITERAL:
2579         return (a->val.str.len == b->val.str.len
2580                 && !memcmp (a->val.str.text, b->val.str.text,
2581                             a->val.str.len));
2582       }
2583
2584   return 0;
2585 }
2586
2587 /* Returns nonzero if a space should be inserted to avoid an
2588    accidental token paste for output.  For simplicity, it is
2589    conservative, and occasionally advises a space where one is not
2590    needed, e.g. "." and ".2".  */
2591 int
2592 cpp_avoid_paste (cpp_reader *pfile, const cpp_token *token1,
2593                  const cpp_token *token2)
2594 {
2595   enum cpp_ttype a = token1->type, b = token2->type;
2596   cppchar_t c;
2597
2598   if (token1->flags & NAMED_OP)
2599     a = CPP_NAME;
2600   if (token2->flags & NAMED_OP)
2601     b = CPP_NAME;
2602
2603   c = EOF;
2604   if (token2->flags & DIGRAPH)
2605     c = digraph_spellings[(int) b - (int) CPP_FIRST_DIGRAPH][0];
2606   else if (token_spellings[b].category == SPELL_OPERATOR)
2607     c = token_spellings[b].name[0];
2608
2609   /* Quickly get everything that can paste with an '='.  */
2610   if ((int) a <= (int) CPP_LAST_EQ && c == '=')
2611     return 1;
2612
2613   switch (a)
2614     {
2615     case CPP_GREATER:   return c == '>';
2616     case CPP_LESS:      return c == '<' || c == '%' || c == ':';
2617     case CPP_PLUS:      return c == '+';
2618     case CPP_MINUS:     return c == '-' || c == '>';
2619     case CPP_DIV:       return c == '/' || c == '*'; /* Comments.  */
2620     case CPP_MOD:       return c == ':' || c == '>';
2621     case CPP_AND:       return c == '&';
2622     case CPP_OR:        return c == '|';
2623     case CPP_COLON:     return c == ':' || c == '>';
2624     case CPP_DEREF:     return c == '*';
2625     case CPP_DOT:       return c == '.' || c == '%' || b == CPP_NUMBER;
2626     case CPP_HASH:      return c == '#' || c == '%'; /* Digraph form.  */
2627     case CPP_NAME:      return ((b == CPP_NUMBER
2628                                  && name_p (pfile, &token2->val.str))
2629                                 || b == CPP_NAME
2630                                 || b == CPP_CHAR || b == CPP_STRING); /* L */
2631     case CPP_NUMBER:    return (b == CPP_NUMBER || b == CPP_NAME
2632                                 || c == '.' || c == '+' || c == '-');
2633                                       /* UCNs */
2634     case CPP_OTHER:     return ((token1->val.str.text[0] == '\\'
2635                                  && b == CPP_NAME)
2636                                 || (CPP_OPTION (pfile, objc)
2637                                     && token1->val.str.text[0] == '@'
2638                                     && (b == CPP_NAME || b == CPP_STRING)));
2639     default:            break;
2640     }
2641
2642   return 0;
2643 }
2644
2645 /* Output all the remaining tokens on the current line, and a newline
2646    character, to FP.  Leading whitespace is removed.  If there are
2647    macros, special token padding is not performed.  */
2648 void
2649 cpp_output_line (cpp_reader *pfile, FILE *fp)
2650 {
2651   const cpp_token *token;
2652
2653   token = cpp_get_token (pfile);
2654   while (token->type != CPP_EOF)
2655     {
2656       cpp_output_token (token, fp);
2657       token = cpp_get_token (pfile);
2658       if (token->flags & PREV_WHITE)
2659         putc (' ', fp);
2660     }
2661
2662   putc ('\n', fp);
2663 }
2664
2665 /* Return a string representation of all the remaining tokens on the
2666    current line.  The result is allocated using xmalloc and must be
2667    freed by the caller.  */
2668 unsigned char *
2669 cpp_output_line_to_string (cpp_reader *pfile, const unsigned char *dir_name)
2670 {
2671   const cpp_token *token;
2672   unsigned int out = dir_name ? ustrlen (dir_name) : 0;
2673   unsigned int alloced = 120 + out;
2674   unsigned char *result = (unsigned char *) xmalloc (alloced);
2675
2676   /* If DIR_NAME is empty, there are no initial contents.  */
2677   if (dir_name)
2678     {
2679       sprintf ((char *) result, "#%s ", dir_name);
2680       out += 2;
2681     }
2682
2683   token = cpp_get_token (pfile);
2684   while (token->type != CPP_EOF)
2685     {
2686       unsigned char *last;
2687       /* Include room for a possible space and the terminating nul.  */
2688       unsigned int len = cpp_token_len (token) + 2;
2689
2690       if (out + len > alloced)
2691         {
2692           alloced *= 2;
2693           if (out + len > alloced)
2694             alloced = out + len;
2695           result = (unsigned char *) xrealloc (result, alloced);
2696         }
2697
2698       last = cpp_spell_token (pfile, token, &result[out], 0);
2699       out = last - result;
2700
2701       token = cpp_get_token (pfile);
2702       if (token->flags & PREV_WHITE)
2703         result[out++] = ' ';
2704     }
2705
2706   result[out] = '\0';
2707   return result;
2708 }
2709
/* Memory buffers.  Changing these three constants can have a dramatic
   effect on performance.  The values here are reasonable defaults,
   but might be tuned.  If you adjust them, be sure to test across a
   range of uses of cpplib, including heavy nested function-like macro
   expansion.  Also check the change in peak memory usage (NJAMD is a
   good tool for this).  */

/* Smallest data area, in bytes, that a newly created buffer gets.  */
#define MIN_BUFF_SIZE 8000

/* Largest free buffer that is still acceptable for a request of
   MIN_SIZE bytes; anything bigger is left for a larger request.  */
#define BUFF_SIZE_UPPER_BOUND(MIN_SIZE) (MIN_BUFF_SIZE + (MIN_SIZE) * 3 / 2)

/* Size to request when extending BUFF: MIN_EXTRA bytes plus twice the
   room currently left between BUFF's cur and limit.  Both arguments
   are parenthesized so that any expression may be passed safely.  */
#define EXTENDED_BUFF_SIZE(BUFF, MIN_EXTRA) \
        ((MIN_EXTRA) + ((BUFF)->limit - (BUFF)->cur) * 2)

#if MIN_BUFF_SIZE > BUFF_SIZE_UPPER_BOUND (0)
  #error BUFF_SIZE_UPPER_BOUND must be at least as large as MIN_BUFF_SIZE!
#endif
2724
2725 /* Create a new allocation buffer.  Place the control block at the end
2726    of the buffer, so that buffer overflows will cause immediate chaos.  */
2727 static _cpp_buff *
2728 new_buff (size_t len)
2729 {
2730   _cpp_buff *result;
2731   unsigned char *base;
2732
2733   if (len < MIN_BUFF_SIZE)
2734     len = MIN_BUFF_SIZE;
2735   len = CPP_ALIGN (len);
2736
2737   base = XNEWVEC (unsigned char, len + sizeof (_cpp_buff));
2738   result = (_cpp_buff *) (base + len);
2739   result->base = base;
2740   result->cur = base;
2741   result->limit = base + len;
2742   result->next = NULL;
2743   return result;
2744 }
2745
2746 /* Place a chain of unwanted allocation buffers on the free list.  */
2747 void
2748 _cpp_release_buff (cpp_reader *pfile, _cpp_buff *buff)
2749 {
2750   _cpp_buff *end = buff;
2751
2752   while (end->next)
2753     end = end->next;
2754   end->next = pfile->free_buffs;
2755   pfile->free_buffs = buff;
2756 }
2757
2758 /* Return a free buffer of size at least MIN_SIZE.  */
2759 _cpp_buff *
2760 _cpp_get_buff (cpp_reader *pfile, size_t min_size)
2761 {
2762   _cpp_buff *result, **p;
2763
2764   for (p = &pfile->free_buffs;; p = &(*p)->next)
2765     {
2766       size_t size;
2767
2768       if (*p == NULL)
2769         return new_buff (min_size);
2770       result = *p;
2771       size = result->limit - result->base;
2772       /* Return a buffer that's big enough, but don't waste one that's
2773          way too big.  */
2774       if (size >= min_size && size <= BUFF_SIZE_UPPER_BOUND (min_size))
2775         break;
2776     }
2777
2778   *p = result->next;
2779   result->next = NULL;
2780   result->cur = result->base;
2781   return result;
2782 }
2783
2784 /* Creates a new buffer with enough space to hold the uncommitted
2785    remaining bytes of BUFF, and at least MIN_EXTRA more bytes.  Copies
2786    the excess bytes to the new buffer.  Chains the new buffer after
2787    BUFF, and returns the new buffer.  */
2788 _cpp_buff *
2789 _cpp_append_extend_buff (cpp_reader *pfile, _cpp_buff *buff, size_t min_extra)
2790 {
2791   size_t size = EXTENDED_BUFF_SIZE (buff, min_extra);
2792   _cpp_buff *new_buff = _cpp_get_buff (pfile, size);
2793
2794   buff->next = new_buff;
2795   memcpy (new_buff->base, buff->cur, BUFF_ROOM (buff));
2796   return new_buff;
2797 }
2798
2799 /* Creates a new buffer with enough space to hold the uncommitted
2800    remaining bytes of the buffer pointed to by BUFF, and at least
2801    MIN_EXTRA more bytes.  Copies the excess bytes to the new buffer.
2802    Chains the new buffer before the buffer pointed to by BUFF, and
2803    updates the pointer to point to the new buffer.  */
2804 void
2805 _cpp_extend_buff (cpp_reader *pfile, _cpp_buff **pbuff, size_t min_extra)
2806 {
2807   _cpp_buff *new_buff, *old_buff = *pbuff;
2808   size_t size = EXTENDED_BUFF_SIZE (old_buff, min_extra);
2809
2810   new_buff = _cpp_get_buff (pfile, size);
2811   memcpy (new_buff->base, old_buff->cur, BUFF_ROOM (old_buff));
2812   new_buff->next = old_buff;
2813   *pbuff = new_buff;
2814 }
2815
2816 /* Free a chain of buffers starting at BUFF.  */
2817 void
2818 _cpp_free_buff (_cpp_buff *buff)
2819 {
2820   _cpp_buff *next;
2821
2822   for (; buff; buff = next)
2823     {
2824       next = buff->next;
2825       free (buff->base);
2826     }
2827 }
2828
2829 /* Allocate permanent, unaligned storage of length LEN.  */
2830 unsigned char *
2831 _cpp_unaligned_alloc (cpp_reader *pfile, size_t len)
2832 {
2833   _cpp_buff *buff = pfile->u_buff;
2834   unsigned char *result = buff->cur;
2835
2836   if (len > (size_t) (buff->limit - result))
2837     {
2838       buff = _cpp_get_buff (pfile, len);
2839       buff->next = pfile->u_buff;
2840       pfile->u_buff = buff;
2841       result = buff->cur;
2842     }
2843
2844   buff->cur = result + len;
2845   return result;
2846 }
2847
2848 /* Allocate permanent, unaligned storage of length LEN from a_buff.
2849    That buffer is used for growing allocations when saving macro
2850    replacement lists in a #define, and when parsing an answer to an
2851    assertion in #assert, #unassert or #if (and therefore possibly
2852    whilst expanding macros).  It therefore must not be used by any
2853    code that they might call: specifically the lexer and the guts of
2854    the macro expander.
2855
2856    All existing other uses clearly fit this restriction: storing
2857    registered pragmas during initialization.  */
2858 unsigned char *
2859 _cpp_aligned_alloc (cpp_reader *pfile, size_t len)
2860 {
2861   _cpp_buff *buff = pfile->a_buff;
2862   unsigned char *result = buff->cur;
2863
2864   if (len > (size_t) (buff->limit - result))
2865     {
2866       buff = _cpp_get_buff (pfile, len);
2867       buff->next = pfile->a_buff;
2868       pfile->a_buff = buff;
2869       result = buff->cur;
2870     }
2871
2872   buff->cur = result + len;
2873   return result;
2874 }
2875
2876 /* Say which field of TOK is in use.  */
2877
2878 enum cpp_token_fld_kind
2879 cpp_token_val_index (cpp_token *tok)
2880 {
2881   switch (TOKEN_SPELL (tok))
2882     {
2883     case SPELL_IDENT:
2884       return CPP_TOKEN_FLD_NODE;
2885     case SPELL_LITERAL:
2886       return CPP_TOKEN_FLD_STR;
2887     case SPELL_OPERATOR:
2888       if (tok->type == CPP_PASTE)
2889         return CPP_TOKEN_FLD_TOKEN_NO;
2890       else
2891         return CPP_TOKEN_FLD_NONE;
2892     case SPELL_NONE:
2893       if (tok->type == CPP_MACRO_ARG)
2894         return CPP_TOKEN_FLD_ARG_NO;
2895       else if (tok->type == CPP_PADDING)
2896         return CPP_TOKEN_FLD_SOURCE;
2897       else if (tok->type == CPP_PRAGMA)
2898         return CPP_TOKEN_FLD_PRAGMA;
2899       /* else fall through */
2900     default:
2901       return CPP_TOKEN_FLD_NONE;
2902     }
2903 }
2904
2905 /* All tokens lexed in R after calling this function will be forced to have
2906    their source_location the same as the location referenced by P, until
2907    cpp_stop_forcing_token_locations is called for R.  */
2908
2909 void
2910 cpp_force_token_locations (cpp_reader *r, source_location *p)
2911 {
2912   r->forced_token_location_p = p;
2913 }
2914
2915 /* Go back to assigning locations naturally for lexed tokens.  */
2916
2917 void
2918 cpp_stop_forcing_token_locations (cpp_reader *r)
2919 {
2920   r->forced_token_location_p = NULL;
2921 }