libcpp/lex.c

   1 /* CPP Library - lexical analysis.
   2    Copyright (C) 2000-2013 Free Software Foundation, Inc.
   3    Contributed by Per Bothner, 1994-95.
   4    Based on CCCP program by Paul Rubin, June 1986
   5    Adapted to ANSI C, Richard Stallman, Jan 1987
   6    Broken out to separate file, Zack Weinberg, Mar 2000
   7
   8 This program is free software; you can redistribute it and/or modify it
   9 under the terms of the GNU General Public License as published by the
  10 Free Software Foundation; either version 3, or (at your option) any
  11 later version.
  12
  13 This program is distributed in the hope that it will be useful,
  14 but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16 GNU General Public License for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with this program; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "cpplib.h"
  25 #include "internal.h"
  26
  27 enum spell_type
  28 {
  29   SPELL_OPERATOR = 0,
  30   SPELL_IDENT,
  31   SPELL_LITERAL,
  32   SPELL_NONE
  33 };
  34
  35 struct token_spelling
  36 {
  37   enum spell_type category;
  38   const unsigned char *name;
  39 };
  40
  41 static const unsigned char *const digraph_spellings[] =
  42 { UC"%:", UC"%:%:", UC"<:", UC":>", UC"<%", UC"%>" };
  43
  44 #define OP(e, s) { SPELL_OPERATOR, UC s  },
  45 #define TK(e, s) { SPELL_ ## s,    UC #e },
  46 static const struct token_spelling token_spellings[N_TTYPES] = { TTYPE_TABLE };
  47 #undef OP
  48 #undef TK
  49
  50 #define TOKEN_SPELL(token) (token_spellings[(token)->type].category)
  51 #define TOKEN_NAME(token) (token_spellings[(token)->type].name)
  52
  53 static void add_line_note (cpp_buffer *, const uchar *, unsigned int);
  54 static int skip_line_comment (cpp_reader *);
  55 static void skip_whitespace (cpp_reader *, cppchar_t);
  56 static void lex_string (cpp_reader *, cpp_token *, const uchar *);
  57 static void save_comment (cpp_reader *, cpp_token *, const uchar *, cppchar_t);
  58 static void store_comment (cpp_reader *, cpp_token *);
  59 static void create_literal (cpp_reader *, cpp_token *, const uchar *,
  60                             unsigned int, enum cpp_ttype);
  61 static bool warn_in_comment (cpp_reader *, _cpp_line_note *);
  62 static int name_p (cpp_reader *, const cpp_string *);
  63 static tokenrun *next_tokenrun (tokenrun *);
  64
  65 static _cpp_buff *new_buff (size_t);
  66
  67
  68 /* Utility routine:
  69
  70    Compares, the token TOKEN to the NUL-terminated string STRING.
  71    TOKEN must be a CPP_NAME.  Returns 1 for equal, 0 for unequal.  */
  72 int
  73 cpp_ideq (const cpp_token *token, const char *string)
  74 {
  75   if (token->type != CPP_NAME)
  76     return 0;
  77
  78   return !ustrcmp (NODE_NAME (token->val.node.node), (const uchar *) string);
  79 }
  80
  81 /* Record a note TYPE at byte POS into the current cleaned logical
  82    line.  */
  83 static void
  84 add_line_note (cpp_buffer *buffer, const uchar *pos, unsigned int type)
  85 {
  86   if (buffer->notes_used == buffer->notes_cap)
  87     {
  88       buffer->notes_cap = buffer->notes_cap * 2 + 200;
  89       buffer->notes = XRESIZEVEC (_cpp_line_note, buffer->notes,
  90                                   buffer->notes_cap);
  91     }
  92
  93   buffer->notes[buffer->notes_used].pos = pos;
  94   buffer->notes[buffer->notes_used].type = type;
  95   buffer->notes_used++;
  96 }
  97
  98 \f
  99 /* Fast path to find line special characters using optimized character
 100    scanning algorithms.  Anything complicated falls back to the slow
 101    path below.  Since this loop is very hot it's worth doing these kinds
 102    of optimizations.
 103
 104    One of the paths through the ifdefs should provide
 105
 106      const uchar *search_line_fast (const uchar *s, const uchar *end);
 107
 108    Between S and END, search for \n, \r, \\, ?.  Return a pointer to
 109    the found character.
 110
 111    Note that the last character of the buffer is *always* a newline,
 112    as forced by _cpp_convert_input.  This fact can be used to avoid
 113    explicitly looking for the end of the buffer.  */
 114
 115 /* Configure gives us an ifdef test.  */
 116 #ifndef WORDS_BIGENDIAN
 117 #define WORDS_BIGENDIAN 0
 118 #endif
 119
 120 /* We'd like the largest integer that fits into a register.  There's nothing
 121    in <stdint.h> that gives us that.  For most hosts this is unsigned long,
 122    but MS decided on an LLP64 model.  Thankfully when building with GCC we
 123    can get the "real" word size.  */
 124 #ifdef __GNUC__
 125 typedef unsigned int word_type __attribute__((__mode__(__word__)));
 126 #else
 127 typedef unsigned long word_type;
 128 #endif
 129
 130 /* The code below is only expecting sizes 4 or 8.
 131    Die at compile-time if this expectation is violated.  */
 132 typedef char check_word_type_size
 133   [(sizeof(word_type) == 8 || sizeof(word_type) == 4) * 2 - 1];
 134
 135 /* Return X with the first N bytes forced to values that won't match one
 136    of the interesting characters.  Note that NUL is not interesting.  */
 137
 138 static inline word_type
 139 acc_char_mask_misalign (word_type val, unsigned int n)
 140 {
 141   word_type mask = -1;
 142   if (WORDS_BIGENDIAN)
 143     mask >>= n * 8;
 144   else
 145     mask <<= n * 8;
 146   return val & mask;
 147 }
 148
 149 /* Return X replicated to all byte positions within WORD_TYPE.  */
 150
 151 static inline word_type
 152 acc_char_replicate (uchar x)
 153 {
 154   word_type ret;
 155
 156   ret = (x << 24) | (x << 16) | (x << 8) | x;
 157   if (sizeof(word_type) == 8)
 158     ret = (ret << 16 << 16) | ret;
 159   return ret;
 160 }
 161
 162 /* Return non-zero if some byte of VAL is (probably) C.  */
 163
 164 static inline word_type
 165 acc_char_cmp (word_type val, word_type c)
 166 {
 167 #if defined(__GNUC__) && defined(__alpha__)
 168   /* We can get exact results using a compare-bytes instruction.
 169      Get (val == c) via (0 >= (val ^ c)).  */
 170   return __builtin_alpha_cmpbge (0, val ^ c);
 171 #else
 172   word_type magic = 0x7efefefeU;
 173   if (sizeof(word_type) == 8)
 174     magic = (magic << 16 << 16) | 0xfefefefeU;
 175   magic |= 1;
 176
 177   val ^= c;
 178   return ((val + magic) ^ ~val) & ~magic;
 179 #endif
 180 }
 181
 182 /* Given the result of acc_char_cmp is non-zero, return the index of
 183    the found character.  If this was a false positive, return -1.  */
 184
 185 static inline int
 186 acc_char_index (word_type cmp ATTRIBUTE_UNUSED,
 187                 word_type val ATTRIBUTE_UNUSED)
 188 {
 189 #if defined(__GNUC__) && defined(__alpha__) && !WORDS_BIGENDIAN
 190   /* The cmpbge instruction sets *bits* of the result corresponding to
 191      matches in the bytes with no false positives.  */
 192   return __builtin_ctzl (cmp);
 193 #else
 194   unsigned int i;
 195
 196   /* ??? It would be nice to force unrolling here,
 197      and have all of these constants folded.  */
 198   for (i = 0; i < sizeof(word_type); ++i)
 199     {
 200       uchar c;
 201       if (WORDS_BIGENDIAN)
 202         c = (val >> (sizeof(word_type) - i - 1) * 8) & 0xff;
 203       else
 204         c = (val >> i * 8) & 0xff;
 205
 206       if (c == '\n' || c == '\r' || c == '\\' || c == '?')
 207         return i;
 208     }
 209
 210   return -1;
 211 #endif
 212 }
 213
 214 /* A version of the fast scanner using bit fiddling techniques.
 215
 216    For 32-bit words, one would normally perform 16 comparisons and
 217    16 branches.  With this algorithm one performs 24 arithmetic
 218    operations and one branch.  Whether this is faster with a 32-bit
 219    word size is going to be somewhat system dependent.
 220
 221    For 64-bit words, we eliminate twice the number of comparisons
 222    and branches without increasing the number of arithmetic operations.
 223    It's almost certainly going to be a win with 64-bit word size.  */
 224
 225 static const uchar * search_line_acc_char (const uchar *, const uchar *)
 226   ATTRIBUTE_UNUSED;
 227
 228 static const uchar *
 229 search_line_acc_char (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 230 {
 231   const word_type repl_nl = acc_char_replicate ('\n');
 232   const word_type repl_cr = acc_char_replicate ('\r');
 233   const word_type repl_bs = acc_char_replicate ('\\');
 234   const word_type repl_qm = acc_char_replicate ('?');
 235
 236   unsigned int misalign;
 237   const word_type *p;
 238   word_type val, t;
 239
 240   /* Align the buffer.  Mask out any bytes from before the beginning.  */
 241   p = (word_type *)((uintptr_t)s & -sizeof(word_type));
 242   val = *p;
 243   misalign = (uintptr_t)s & (sizeof(word_type) - 1);
 244   if (misalign)
 245     val = acc_char_mask_misalign (val, misalign);
 246
 247   /* Main loop.  */
 248   while (1)
 249     {
 250       t  = acc_char_cmp (val, repl_nl);
 251       t |= acc_char_cmp (val, repl_cr);
 252       t |= acc_char_cmp (val, repl_bs);
 253       t |= acc_char_cmp (val, repl_qm);
 254
 255       if (__builtin_expect (t != 0, 0))
 256         {
 257           int i = acc_char_index (t, val);
 258           if (i >= 0)
 259             return (const uchar *)p + i;
 260         }
 261
 262       val = *++p;
 263     }
 264 }
 265
 266 /* Disable on Solaris 2/x86 until the following problems can be properly
 267    autoconfed:
 268
 269    The Solaris 9 assembler cannot assemble SSE4.2 insns.
 270    Before Solaris 9 Update 6, SSE insns cannot be executed.
 271    The Solaris 10+ assembler tags objects with the instruction set
 272    extensions used, so SSE4.2 executables cannot run on machines that
 273    don't support that extension.  */
 274
 275 #if (GCC_VERSION >= 4005) && (defined(__i386__) || defined(__x86_64__)) && !(defined(__sun__) && defined(__svr4__))
 276
 277 /* Replicated character data to be shared between implementations.
 278    Recall that outside of a context with vector support we can't
 279    define compatible vector types, therefore these are all defined
 280    in terms of raw characters.  */
 281 static const char repl_chars[4][16] __attribute__((aligned(16))) = {
 282   { '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
 283     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n' },
 284   { '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
 285     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r' },
 286   { '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
 287     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\' },
 288   { '?', '?', '?', '?', '?', '?', '?', '?',
 289     '?', '?', '?', '?', '?', '?', '?', '?' },
 290 };
 291
 292 /* A version of the fast scanner using MMX vectorized byte compare insns.
 293
 294    This uses the PMOVMSKB instruction which was introduced with "MMX2",
 295    which was packaged into SSE1; it is also present in the AMD MMX
 296    extension.  Mark the function as using "sse" so that we emit a real
 297    "emms" instruction, rather than the 3dNOW "femms" instruction.  */
 298
 299 static const uchar *
 300 #ifndef __SSE__
 301 __attribute__((__target__("sse")))
 302 #endif
 303 search_line_mmx (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 304 {
 305   typedef char v8qi __attribute__ ((__vector_size__ (8)));
 306   typedef int __m64 __attribute__ ((__vector_size__ (8), __may_alias__));
 307
 308   const v8qi repl_nl = *(const v8qi *)repl_chars[0];
 309   const v8qi repl_cr = *(const v8qi *)repl_chars[1];
 310   const v8qi repl_bs = *(const v8qi *)repl_chars[2];
 311   const v8qi repl_qm = *(const v8qi *)repl_chars[3];
 312
 313   unsigned int misalign, found, mask;
 314   const v8qi *p;
 315   v8qi data, t, c;
 316
 317   /* Align the source pointer.  While MMX doesn't generate unaligned data
 318      faults, this allows us to safely scan to the end of the buffer without
 319      reading beyond the end of the last page.  */
 320   misalign = (uintptr_t)s & 7;
 321   p = (const v8qi *)((uintptr_t)s & -8);
 322   data = *p;
 323
 324   /* Create a mask for the bytes that are valid within the first
 325      16-byte block.  The Idea here is that the AND with the mask
 326      within the loop is "free", since we need some AND or TEST
 327      insn in order to set the flags for the branch anyway.  */
 328   mask = -1u << misalign;
 329
 330   /* Main loop processing 8 bytes at a time.  */
 331   goto start;
 332   do
 333     {
 334       data = *++p;
 335       mask = -1;
 336
 337     start:
 338       t = __builtin_ia32_pcmpeqb(data, repl_nl);
 339       c = __builtin_ia32_pcmpeqb(data, repl_cr);
 340       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
 341       c = __builtin_ia32_pcmpeqb(data, repl_bs);
 342       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
 343       c = __builtin_ia32_pcmpeqb(data, repl_qm);
 344       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
 345       found = __builtin_ia32_pmovmskb (t);
 346       found &= mask;
 347     }
 348   while (!found);
 349
 350   __builtin_ia32_emms ();
 351
 352   /* FOUND contains 1 in bits for which we matched a relevant
 353      character.  Conversion to the byte index is trivial.  */
 354   found = __builtin_ctz(found);
 355   return (const uchar *)p + found;
 356 }
 357
 358 /* A version of the fast scanner using SSE2 vectorized byte compare insns.  */
 359
 360 static const uchar *
 361 #ifndef __SSE2__
 362 __attribute__((__target__("sse2")))
 363 #endif
 364 search_line_sse2 (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 365 {
 366   typedef char v16qi __attribute__ ((__vector_size__ (16)));
 367
 368   const v16qi repl_nl = *(const v16qi *)repl_chars[0];
 369   const v16qi repl_cr = *(const v16qi *)repl_chars[1];
 370   const v16qi repl_bs = *(const v16qi *)repl_chars[2];
 371   const v16qi repl_qm = *(const v16qi *)repl_chars[3];
 372
 373   unsigned int misalign, found, mask;
 374   const v16qi *p;
 375   v16qi data, t;
 376
 377   /* Align the source pointer.  */
 378   misalign = (uintptr_t)s & 15;
 379   p = (const v16qi *)((uintptr_t)s & -16);
 380   data = *p;
 381
 382   /* Create a mask for the bytes that are valid within the first
 383      16-byte block.  The Idea here is that the AND with the mask
 384      within the loop is "free", since we need some AND or TEST
 385      insn in order to set the flags for the branch anyway.  */
 386   mask = -1u << misalign;
 387
 388   /* Main loop processing 16 bytes at a time.  */
 389   goto start;
 390   do
 391     {
 392       data = *++p;
 393       mask = -1;
 394
 395     start:
 396       t  = __builtin_ia32_pcmpeqb128(data, repl_nl);
 397       t |= __builtin_ia32_pcmpeqb128(data, repl_cr);
 398       t |= __builtin_ia32_pcmpeqb128(data, repl_bs);
 399       t |= __builtin_ia32_pcmpeqb128(data, repl_qm);
 400       found = __builtin_ia32_pmovmskb128 (t);
 401       found &= mask;
 402     }
 403   while (!found);
 404
 405   /* FOUND contains 1 in bits for which we matched a relevant
 406      character.  Conversion to the byte index is trivial.  */
 407   found = __builtin_ctz(found);
 408   return (const uchar *)p + found;
 409 }
 410
 411 #ifdef HAVE_SSE4
 412 /* A version of the fast scanner using SSE 4.2 vectorized string insns.  */
 413
 414 static const uchar *
 415 #ifndef __SSE4_2__
 416 __attribute__((__target__("sse4.2")))
 417 #endif
 418 search_line_sse42 (const uchar *s, const uchar *end)
 419 {
 420   typedef char v16qi __attribute__ ((__vector_size__ (16)));
 421   static const v16qi search = { '\n', '\r', '?', '\\' };
 422
 423   uintptr_t si = (uintptr_t)s;
 424   uintptr_t index;
 425
 426   /* Check for unaligned input.  */
 427   if (si & 15)
 428     {
 429       v16qi sv;
 430
 431       if (__builtin_expect (end - s < 16, 0)
 432           && __builtin_expect ((si & 0xfff) > 0xff0, 0))
 433         {
 434           /* There are less than 16 bytes left in the buffer, and less
 435              than 16 bytes left on the page.  Reading 16 bytes at this
 436              point might generate a spurious page fault.  Defer to the
 437              SSE2 implementation, which already handles alignment.  */
 438           return search_line_sse2 (s, end);
 439         }
 440
 441       /* ??? The builtin doesn't understand that the PCMPESTRI read from
 442          memory need not be aligned.  */
 443       sv = __builtin_ia32_loaddqu ((const char *) s);
 444       index = __builtin_ia32_pcmpestri128 (search, 4, sv, 16, 0);
 445
 446       if (__builtin_expect (index < 16, 0))
 447         goto found;
 448
 449       /* Advance the pointer to an aligned address.  We will re-scan a
 450          few bytes, but we no longer need care for reading past the
 451          end of a page, since we're guaranteed a match.  */
 452       s = (const uchar *)((si + 16) & -16);
 453     }
 454
 455   /* Main loop, processing 16 bytes at a time.  By doing the whole loop
 456      in inline assembly, we can make proper use of the flags set.  */
 457   __asm (      "sub $16, %1\n"
 458         "       .balign 16\n"
 459         "0:     add $16, %1\n"
 460         "       %vpcmpestri $0, (%1), %2\n"
 461         "       jnc 0b"
 462         : "=&c"(index), "+r"(s)
 463         : "x"(search), "a"(4), "d"(16));
 464
 465  found:
 466   return s + index;
 467 }
 468
 469 #else
 470 /* Work around out-dated assemblers without sse4 support.  */
 471 #define search_line_sse42 search_line_sse2
 472 #endif
 473
 474 /* Check the CPU capabilities.  */
 475
 476 #include "../gcc/config/i386/cpuid.h"
 477
 478 typedef const uchar * (*search_line_fast_type) (const uchar *, const uchar *);
 479 static search_line_fast_type search_line_fast;
 480
 481 #define HAVE_init_vectorized_lexer 1
 482 static inline void
 483 init_vectorized_lexer (void)
 484 {
 485   unsigned dummy, ecx = 0, edx = 0;
 486   search_line_fast_type impl = search_line_acc_char;
 487   int minimum = 0;
 488
 489 #if defined(__SSE4_2__)
 490   minimum = 3;
 491 #elif defined(__SSE2__)
 492   minimum = 2;
 493 #elif defined(__SSE__)
 494   minimum = 1;
 495 #endif
 496
 497   if (minimum == 3)
 498     impl = search_line_sse42;
 499   else if (__get_cpuid (1, &dummy, &dummy, &ecx, &edx) || minimum == 2)
 500     {
 501       if (minimum == 3 || (ecx & bit_SSE4_2))
 502         impl = search_line_sse42;
 503       else if (minimum == 2 || (edx & bit_SSE2))
 504         impl = search_line_sse2;
 505       else if (minimum == 1 || (edx & bit_SSE))
 506         impl = search_line_mmx;
 507     }
 508   else if (__get_cpuid (0x80000001, &dummy, &dummy, &dummy, &edx))
 509     {
 510       if (minimum == 1
 511           || (edx & (bit_MMXEXT | bit_CMOV)) == (bit_MMXEXT | bit_CMOV))
 512         impl = search_line_mmx;
 513     }
 514
 515   search_line_fast = impl;
 516 }
 517
 518 #elif (GCC_VERSION >= 4005) && defined(__ALTIVEC__)
 519
 520 /* A vection of the fast scanner using AltiVec vectorized byte compares.  */
 521 /* ??? Unfortunately, attribute(target("altivec")) is not yet supported,
 522    so we can't compile this function without -maltivec on the command line
 523    (or implied by some other switch).  */
 524
 525 static const uchar *
 526 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 527 {
 528   typedef __attribute__((altivec(vector))) unsigned char vc;
 529
 530   const vc repl_nl = {
 531     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
 532     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
 533   };
 534   const vc repl_cr = {
 535     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
 536     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
 537   };
 538   const vc repl_bs = {
 539     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
 540     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
 541   };
 542   const vc repl_qm = {
 543     '?', '?', '?', '?', '?', '?', '?', '?',
 544     '?', '?', '?', '?', '?', '?', '?', '?',
 545   };
 546   const vc ones = {
 547     -1, -1, -1, -1, -1, -1, -1, -1,
 548     -1, -1, -1, -1, -1, -1, -1, -1,
 549   };
 550   const vc zero = { 0 };
 551
 552   vc data, mask, t;
 553
 554   /* Altivec loads automatically mask addresses with -16.  This lets us
 555      issue the first load as early as possible.  */
 556   data = __builtin_vec_ld(0, (const vc *)s);
 557
 558   /* Discard bytes before the beginning of the buffer.  Do this by
 559      beginning with all ones and shifting in zeros according to the
 560      mis-alignment.  The LVSR instruction pulls the exact shift we
 561      want from the address.  */
 562   mask = __builtin_vec_lvsr(0, s);
 563   mask = __builtin_vec_perm(zero, ones, mask);
 564   data &= mask;
 565
 566   /* While altivec loads mask addresses, we still need to align S so
 567      that the offset we compute at the end is correct.  */
 568   s = (const uchar *)((uintptr_t)s & -16);
 569
 570   /* Main loop processing 16 bytes at a time.  */
 571   goto start;
 572   do
 573     {
 574       vc m_nl, m_cr, m_bs, m_qm;
 575
 576       s += 16;
 577       data = __builtin_vec_ld(0, (const vc *)s);
 578
 579     start:
 580       m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
 581       m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
 582       m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
 583       m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
 584       t = (m_nl | m_cr) | (m_bs | m_qm);
 585
 586       /* T now contains 0xff in bytes for which we matched one of the relevant
 587          characters.  We want to exit the loop if any byte in T is non-zero.
 588          Below is the expansion of vec_any_ne(t, zero).  */
 589     }
 590   while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));
 591
 592   {
 593 #define N  (sizeof(vc) / sizeof(long))
 594
 595     union {
 596       vc v;
 597       /* Statically assert that N is 2 or 4.  */
 598       unsigned long l[(N == 2 || N == 4) ? N : -1];
 599     } u;
 600     unsigned long l, i = 0;
 601
 602     u.v = t;
 603
 604     /* Find the first word of T that is non-zero.  */
 605     switch (N)
 606       {
 607       case 4:
 608         l = u.l[i++];
 609         if (l != 0)
 610           break;
 611         s += sizeof(unsigned long);
 612         l = u.l[i++];
 613         if (l != 0)
 614           break;
 615         s += sizeof(unsigned long);
 616       case 2:
 617         l = u.l[i++];
 618         if (l != 0)
 619           break;
 620         s += sizeof(unsigned long);
 621         l = u.l[i];
 622       }
 623
 624     /* L now contains 0xff in bytes for which we matched one of the
 625        relevant characters.  We can find the byte index by finding
 626        its bit index and dividing by 8.  */
 627     l = __builtin_clzl(l) >> 3;
 628     return s + l;
 629
 630 #undef N
 631   }
 632 }
 633
 634 #elif defined (__ARM_NEON__)
 635 #include "arm_neon.h"
 636
 637 static const uchar *
 638 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 639 {
 640   const uint8x16_t repl_nl = vdupq_n_u8 ('\n');
 641   const uint8x16_t repl_cr = vdupq_n_u8 ('\r');
 642   const uint8x16_t repl_bs = vdupq_n_u8 ('\\');
 643   const uint8x16_t repl_qm = vdupq_n_u8 ('?');
 644   const uint8x16_t xmask = (uint8x16_t) vdupq_n_u64 (0x8040201008040201ULL);
 645
 646   unsigned int misalign, found, mask;
 647   const uint8_t *p;
 648   uint8x16_t data;
 649
 650   /* Align the source pointer.  */
 651   misalign = (uintptr_t)s & 15;
 652   p = (const uint8_t *)((uintptr_t)s & -16);
 653   data = vld1q_u8 (p);
 654
 655   /* Create a mask for the bytes that are valid within the first
 656      16-byte block.  The Idea here is that the AND with the mask
 657      within the loop is "free", since we need some AND or TEST
 658      insn in order to set the flags for the branch anyway.  */
 659   mask = (-1u << misalign) & 0xffff;
 660
 661   /* Main loop, processing 16 bytes at a time.  */
 662   goto start;
 663
 664   do
 665     {
 666       uint8x8_t l;
 667       uint16x4_t m;
 668       uint32x2_t n;
 669       uint8x16_t t, u, v, w;
 670
 671       p += 16;
 672       data = vld1q_u8 (p);
 673       mask = 0xffff;
 674
 675     start:
 676       t = vceqq_u8 (data, repl_nl);
 677       u = vceqq_u8 (data, repl_cr);
 678       v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
 679       w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
 680       t = vandq_u8 (vorrq_u8 (v, w), xmask);
 681       l = vpadd_u8 (vget_low_u8 (t), vget_high_u8 (t));
 682       m = vpaddl_u8 (l);
 683       n = vpaddl_u16 (m);
 684
 685       found = vget_lane_u32 ((uint32x2_t) vorr_u64 ((uint64x1_t) n,
 686               vshr_n_u64 ((uint64x1_t) n, 24)), 0);
 687       found &= mask;
 688     }
 689   while (!found);
 690
 691   /* FOUND contains 1 in bits for which we matched a relevant
 692      character.  Conversion to the byte index is trivial.  */
 693   found = __builtin_ctz (found);
 694   return (const uchar *)p + found;
 695 }
 696
 697 #else
 698
 699 /* We only have one accellerated alternative.  Use a direct call so that
 700    we encourage inlining.  */
 701
 702 #define search_line_fast  search_line_acc_char
 703
 704 #endif
 705
 706 /* Initialize the lexer if needed.  */
 707
 708 void
 709 _cpp_init_lexer (void)
 710 {
 711 #ifdef HAVE_init_vectorized_lexer
 712   init_vectorized_lexer ();
 713 #endif
 714 }
 715
 716 /* Returns with a logical line that contains no escaped newlines or
 717    trigraphs.  This is a time-critical inner loop.  */
 718 void
 719 _cpp_clean_line (cpp_reader *pfile)
 720 {
 721   cpp_buffer *buffer;
 722   const uchar *s;
 723   uchar c, *d, *p;
 724
 725   buffer = pfile->buffer;
 726   buffer->cur_note = buffer->notes_used = 0;
 727   buffer->cur = buffer->line_base = buffer->next_line;
 728   buffer->need_line = false;
 729   s = buffer->next_line;
 730
 731   if (!buffer->from_stage3)
 732     {
 733       const uchar *pbackslash = NULL;
 734
 735       /* Fast path.  This is the common case of an un-escaped line with
 736          no trigraphs.  The primary win here is by not writing any
 737          data back to memory until we have to.  */
 738       while (1)
 739         {
 740           /* Perform an optimized search for \n, \r, \\, ?.  */
 741           s = search_line_fast (s, buffer->rlimit);
 742
 743           c = *s;
 744           if (c == '\\')
 745             {
 746               /* Record the location of the backslash and continue.  */
 747               pbackslash = s++;
 748             }
 749           else if (__builtin_expect (c == '?', 0))
 750             {
 751               if (__builtin_expect (s[1] == '?', false)
 752                    && _cpp_trigraph_map[s[2]])
 753                 {
 754                   /* Have a trigraph.  We may or may not have to convert
 755                      it.  Add a line note regardless, for -Wtrigraphs.  */
 756                   add_line_note (buffer, s, s[2]);
 757                   if (CPP_OPTION (pfile, trigraphs))
 758                     {
 759                       /* We do, and that means we have to switch to the
 760                          slow path.  */
 761                       d = (uchar *) s;
 762                       *d = _cpp_trigraph_map[s[2]];
 763                       s += 2;
 764                       goto slow_path;
 765                     }
 766                 }
 767               /* Not a trigraph.  Continue on fast-path.  */
 768               s++;
 769             }
 770           else
 771             break;
 772         }
 773
 774       /* This must be \r or \n.  We're either done, or we'll be forced
 775          to write back to the buffer and continue on the slow path.  */
 776       d = (uchar *) s;
 777
 778       if (__builtin_expect (s == buffer->rlimit, false))
 779         goto done;
 780
 781       /* DOS line ending? */
 782       if (__builtin_expect (c == '\r', false) && s[1] == '\n')
 783         {
 784           s++;
 785           if (s == buffer->rlimit)
 786             goto done;
 787         }
 788
 789       if (__builtin_expect (pbackslash == NULL, true))
 790         goto done;
 791
 792       /* Check for escaped newline.  */
 793       p = d;
 794       while (is_nvspace (p[-1]))
 795         p--;
 796       if (p - 1 != pbackslash)
 797         goto done;
 798
 799       /* Have an escaped newline; process it and proceed to
 800          the slow path.  */
 801       add_line_note (buffer, p - 1, p != d ? ' ' : '\\');
 802       d = p - 2;
 803       buffer->next_line = p - 1;
 804
 805     slow_path:
 806       while (1)
 807         {
 808           c = *++s;
 809           *++d = c;
 810
 811           if (c == '\n' || c == '\r')
 812             {
 813               /* Handle DOS line endings.  */
 814               if (c == '\r' && s != buffer->rlimit && s[1] == '\n')
 815                 s++;
 816               if (s == buffer->rlimit)
 817                 break;
 818
 819               /* Escaped?  */
 820               p = d;
 821               while (p != buffer->next_line && is_nvspace (p[-1]))
 822                 p--;
 823               if (p == buffer->next_line || p[-1] != '\\')
 824                 break;
 825
 826               add_line_note (buffer, p - 1, p != d ? ' ': '\\');
 827               d = p - 2;
 828               buffer->next_line = p - 1;
 829             }
 830           else if (c == '?' && s[1] == '?' && _cpp_trigraph_map[s[2]])
 831             {
 832               /* Add a note regardless, for the benefit of -Wtrigraphs.  */
 833               add_line_note (buffer, d, s[2]);
 834               if (CPP_OPTION (pfile, trigraphs))
 835                 {
 836                   *d = _cpp_trigraph_map[s[2]];
 837                   s += 2;
 838                 }
 839             }
 840         }
 841     }
 842   else
 843     {
 844       while (*s != '\n' && *s != '\r')
 845         s++;
 846       d = (uchar *) s;
 847
 848       /* Handle DOS line endings.  */
 849       if (*s == '\r' && s != buffer->rlimit && s[1] == '\n')
 850         s++;
 851     }
 852
 853  done:
 854   *d = '\n';
 855   /* A sentinel note that should never be processed.  */
 856   add_line_note (buffer, d + 1, '\n');
 857   buffer->next_line = s + 1;
 858 }
 859
 860 /* Return true if the trigraph indicated by NOTE should be warned
 861    about in a comment.  */
 862 static bool
 863 warn_in_comment (cpp_reader *pfile, _cpp_line_note *note)
 864 {
 865   const uchar *p;
 866
 867   /* Within comments we don't warn about trigraphs, unless the
 868      trigraph forms an escaped newline, as that may change
 869      behavior.  */
 870   if (note->type != '/')
 871     return false;
 872
 873   /* If -trigraphs, then this was an escaped newline iff the next note
 874      is coincident.  */
 875   if (CPP_OPTION (pfile, trigraphs))
 876     return note[1].pos == note->pos;
 877
 878   /* Otherwise, see if this forms an escaped newline.  */
 879   p = note->pos + 3;
 880   while (is_nvspace (*p))
 881     p++;
 882
 883   /* There might have been escaped newlines between the trigraph and the
 884      newline we found.  Hence the position test.  */
 885   return (*p == '\n' && p < note[1].pos);
 886 }
 887
 888 /* Process the notes created by add_line_note as far as the current
 889    location.  */
 890 void
 891 _cpp_process_line_notes (cpp_reader *pfile, int in_comment)
 892 {
 893   cpp_buffer *buffer = pfile->buffer;
 894
 895   for (;;)
 896     {
 897       _cpp_line_note *note = &buffer->notes[buffer->cur_note];
 898       unsigned int col;
 899
 900       if (note->pos > buffer->cur)
 901         break;
 902
 903       buffer->cur_note++;
 904       col = CPP_BUF_COLUMN (buffer, note->pos + 1);
 905
 906       if (note->type == '\\' || note->type == ' ')
 907         {
 908           if (note->type == ' ' && !in_comment)
 909             cpp_error_with_line (pfile, CPP_DL_WARNING, pfile->line_table->highest_line, col,
 910                                  "backslash and newline separated by space");
 911
 912           if (buffer->next_line > buffer->rlimit)
 913             {
 914               cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line, col,
 915                                    "backslash-newline at end of file");
 916               /* Prevent "no newline at end of file" warning.  */
 917               buffer->next_line = buffer->rlimit;
 918             }
 919
 920           buffer->line_base = note->pos;
 921           CPP_INCREMENT_LINE (pfile, 0);
 922         }
 923       else if (_cpp_trigraph_map[note->type])
 924         {
 925           if (CPP_OPTION (pfile, warn_trigraphs)
 926               && (!in_comment || warn_in_comment (pfile, note)))
 927             {
 928               if (CPP_OPTION (pfile, trigraphs))
 929                 cpp_warning_with_line (pfile, CPP_W_TRIGRAPHS,
 930                                        pfile->line_table->highest_line, col,
 931                                        "trigraph ??%c converted to %c",
 932                                        note->type,
 933                                        (int) _cpp_trigraph_map[note->type]);
 934               else
 935                 {
 936                   cpp_warning_with_line
 937                     (pfile, CPP_W_TRIGRAPHS,
 938                      pfile->line_table->highest_line, col,
 939                      "trigraph ??%c ignored, use -trigraphs to enable",
 940                      note->type);
 941                 }
 942             }
 943         }
 944       else if (note->type == 0)
 945         /* Already processed in lex_raw_string.  */;
 946       else
 947         abort ();
 948     }
 949 }
 950
 951 /* Skip a C-style block comment.  We find the end of the comment by
 952    seeing if an asterisk is before every '/' we encounter.  Returns
 953    nonzero if comment terminated by EOF, zero otherwise.
 954
 955    Buffer->cur points to the initial asterisk of the comment.  */
 956 bool
 957 _cpp_skip_block_comment (cpp_reader *pfile)
 958 {
 959   cpp_buffer *buffer = pfile->buffer;
 960   const uchar *cur = buffer->cur;
 961   uchar c;
 962
 963   cur++;
 964   if (*cur == '/')
 965     cur++;
 966
 967   for (;;)
 968     {
 969       /* People like decorating comments with '*', so check for '/'
 970          instead for efficiency.  */
 971       c = *cur++;
 972
 973       if (c == '/')
 974         {
 975           if (cur[-2] == '*')
 976             break;
 977
 978           /* Warn about potential nested comments, but not if the '/'
 979              comes immediately before the true comment delimiter.
 980              Don't bother to get it right across escaped newlines.  */
 981           if (CPP_OPTION (pfile, warn_comments)
 982               && cur[0] == '*' && cur[1] != '/')
 983             {
 984               buffer->cur = cur;
 985               cpp_warning_with_line (pfile, CPP_W_COMMENTS,
 986                                      pfile->line_table->highest_line,
 987                                      CPP_BUF_COL (buffer),
 988                                      "\"/*\" within comment");
 989             }
 990         }
 991       else if (c == '\n')
 992         {
 993           unsigned int cols;
 994           buffer->cur = cur - 1;
 995           _cpp_process_line_notes (pfile, true);
 996           if (buffer->next_line >= buffer->rlimit)
 997             return true;
 998           _cpp_clean_line (pfile);
 999
1000           cols = buffer->next_line - buffer->line_base;
1001           CPP_INCREMENT_LINE (pfile, cols);
1002
1003           cur = buffer->cur;
1004         }
1005     }
1006
1007   buffer->cur = cur;
1008   _cpp_process_line_notes (pfile, true);
1009   return false;
1010 }
1011
1012 /* Skip a C++ line comment, leaving buffer->cur pointing to the
1013    terminating newline.  Handles escaped newlines.  Returns nonzero
1014    if a multiline comment.  */
1015 static int
1016 skip_line_comment (cpp_reader *pfile)
1017 {
1018   cpp_buffer *buffer = pfile->buffer;
1019   source_location orig_line = pfile->line_table->highest_line;
1020
1021   while (*buffer->cur != '\n')
1022     buffer->cur++;
1023
1024   _cpp_process_line_notes (pfile, true);
1025   return orig_line != pfile->line_table->highest_line;
1026 }
1027
1028 /* Skips whitespace, saving the next non-whitespace character.  */
1029 static void
1030 skip_whitespace (cpp_reader *pfile, cppchar_t c)
1031 {
1032   cpp_buffer *buffer = pfile->buffer;
1033   bool saw_NUL = false;
1034
1035   do
1036     {
1037       /* Horizontal space always OK.  */
1038       if (c == ' ' || c == '\t')
1039         ;
1040       /* Just \f \v or \0 left.  */
1041       else if (c == '\0')
1042         saw_NUL = true;
1043       else if (pfile->state.in_directive && CPP_PEDANTIC (pfile))
1044         cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line,
1045                              CPP_BUF_COL (buffer),
1046                              "%s in preprocessing directive",
1047                              c == '\f' ? "form feed" : "vertical tab");
1048
1049       c = *buffer->cur++;
1050     }
1051   /* We only want non-vertical space, i.e. ' ' \t \f \v \0.  */
1052   while (is_nvspace (c));
1053
1054   if (saw_NUL)
1055     cpp_error (pfile, CPP_DL_WARNING, "null character(s) ignored");
1056
1057   buffer->cur--;
1058 }
1059
1060 /* See if the characters of a number token are valid in a name (no
1061    '.', '+' or '-').  */
1062 static int
1063 name_p (cpp_reader *pfile, const cpp_string *string)
1064 {
1065   unsigned int i;
1066
1067   for (i = 0; i < string->len; i++)
1068     if (!is_idchar (string->text[i]))
1069       return 0;
1070
1071   return 1;
1072 }
1073
1074 /* After parsing an identifier or other sequence, produce a warning about
1075    sequences not in NFC/NFKC.  */
1076 static void
1077 warn_about_normalization (cpp_reader *pfile,
1078                           const cpp_token *token,
1079                           const struct normalize_state *s)
1080 {
1081   if (CPP_OPTION (pfile, warn_normalize) < NORMALIZE_STATE_RESULT (s)
1082       && !pfile->state.skipping)
1083     {
1084       /* Make sure that the token is printed using UCNs, even
1085          if we'd otherwise happily print UTF-8.  */
1086       unsigned char *buf = XNEWVEC (unsigned char, cpp_token_len (token));
1087       size_t sz;
1088
1089       sz = cpp_spell_token (pfile, token, buf, false) - buf;
1090       if (NORMALIZE_STATE_RESULT (s) == normalized_C)
1091         cpp_warning_with_line (pfile, CPP_W_NORMALIZE, token->src_loc, 0,
1092                                "`%.*s' is not in NFKC", (int) sz, buf);
1093       else
1094         cpp_warning_with_line (pfile, CPP_W_NORMALIZE, token->src_loc, 0,
1095                                "`%.*s' is not in NFC", (int) sz, buf);
1096       free (buf);
1097     }
1098 }
1099
1100 /* Returns TRUE if the sequence starting at buffer->cur is invalid in
1101    an identifier.  FIRST is TRUE if this starts an identifier.  */
1102 static bool
1103 forms_identifier_p (cpp_reader *pfile, int first,
1104                     struct normalize_state *state)
1105 {
1106   cpp_buffer *buffer = pfile->buffer;
1107
1108   if (*buffer->cur == '$')
1109     {
1110       if (!CPP_OPTION (pfile, dollars_in_ident))
1111         return false;
1112
1113       buffer->cur++;
1114       if (CPP_OPTION (pfile, warn_dollars) && !pfile->state.skipping)
1115         {
1116           CPP_OPTION (pfile, warn_dollars) = 0;
1117           cpp_error (pfile, CPP_DL_PEDWARN, "'$' in identifier or number");
1118         }
1119
1120       return true;
1121     }
1122
1123   /* Is this a syntactically valid UCN?  */
1124   if (CPP_OPTION (pfile, extended_identifiers)
1125       && *buffer->cur == '\\'
1126       && (buffer->cur[1] == 'u' || buffer->cur[1] == 'U'))
1127     {
1128       buffer->cur += 2;
1129       if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
1130                           state))
1131         return true;
1132       buffer->cur -= 2;
1133     }
1134
1135   return false;
1136 }
1137
1138 /* Helper function to get the cpp_hashnode of the identifier BASE.  */
1139 static cpp_hashnode *
1140 lex_identifier_intern (cpp_reader *pfile, const uchar *base)
1141 {
1142   cpp_hashnode *result;
1143   const uchar *cur;
1144   unsigned int len;
1145   unsigned int hash = HT_HASHSTEP (0, *base);
1146
1147   cur = base + 1;
1148   while (ISIDNUM (*cur))
1149     {
1150       hash = HT_HASHSTEP (hash, *cur);
1151       cur++;
1152     }
1153   len = cur - base;
1154   hash = HT_HASHFINISH (hash, len);
1155   result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
1156                                               base, len, hash, HT_ALLOC));
1157
1158   /* Rarely, identifiers require diagnostics when lexed.  */
1159   if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
1160                         && !pfile->state.skipping, 0))
1161     {
1162       /* It is allowed to poison the same identifier twice.  */
1163       if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
1164         cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
1165                    NODE_NAME (result));
1166
1167       /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
1168          replacement list of a variadic macro.  */
1169       if (result == pfile->spec_nodes.n__VA_ARGS__
1170           && !pfile->state.va_args_ok)
1171         cpp_error (pfile, CPP_DL_PEDWARN,
1172                    "__VA_ARGS__ can only appear in the expansion"
1173                    " of a C99 variadic macro");
1174
1175       /* For -Wc++-compat, warn about use of C++ named operators.  */
1176       if (result->flags & NODE_WARN_OPERATOR)
1177         cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
1178                      "identifier \"%s\" is a special operator name in C++",
1179                      NODE_NAME (result));
1180     }
1181
1182   return result;
1183 }
1184
1185 /* Get the cpp_hashnode of an identifier specified by NAME in
1186    the current cpp_reader object.  If none is found, NULL is returned.  */
1187 cpp_hashnode *
1188 _cpp_lex_identifier (cpp_reader *pfile, const char *name)
1189 {
1190   cpp_hashnode *result;
1191   result = lex_identifier_intern (pfile, (uchar *) name);
1192   return result;
1193 }
1194
1195 /* Lex an identifier starting at BUFFER->CUR - 1.  */
1196 static cpp_hashnode *
1197 lex_identifier (cpp_reader *pfile, const uchar *base, bool starts_ucn,
1198                 struct normalize_state *nst)
1199 {
1200   cpp_hashnode *result;
1201   const uchar *cur;
1202   unsigned int len;
1203   unsigned int hash = HT_HASHSTEP (0, *base);
1204
1205   cur = pfile->buffer->cur;
1206   if (! starts_ucn)
1207     while (ISIDNUM (*cur))
1208       {
1209         hash = HT_HASHSTEP (hash, *cur);
1210         cur++;
1211       }
1212   pfile->buffer->cur = cur;
1213   if (starts_ucn || forms_identifier_p (pfile, false, nst))
1214     {
1215       /* Slower version for identifiers containing UCNs (or $).  */
1216       do {
1217         while (ISIDNUM (*pfile->buffer->cur))
1218           {
1219             pfile->buffer->cur++;
1220             NORMALIZE_STATE_UPDATE_IDNUM (nst);
1221           }
1222       } while (forms_identifier_p (pfile, false, nst));
1223       result = _cpp_interpret_identifier (pfile, base,
1224                                           pfile->buffer->cur - base);
1225     }
1226   else
1227     {
1228       len = cur - base;
1229       hash = HT_HASHFINISH (hash, len);
1230
1231       result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
1232                                                   base, len, hash, HT_ALLOC));
1233     }
1234
1235   /* Rarely, identifiers require diagnostics when lexed.  */
1236   if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
1237                         && !pfile->state.skipping, 0))
1238     {
1239       /* It is allowed to poison the same identifier twice.  */
1240       if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
1241         cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
1242                    NODE_NAME (result));
1243
1244       /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
1245          replacement list of a variadic macro.  */
1246       if (result == pfile->spec_nodes.n__VA_ARGS__
1247           && !pfile->state.va_args_ok)
1248         cpp_error (pfile, CPP_DL_PEDWARN,
1249                    "__VA_ARGS__ can only appear in the expansion"
1250                    " of a C99 variadic macro");
1251
1252       /* For -Wc++-compat, warn about use of C++ named operators.  */
1253       if (result->flags & NODE_WARN_OPERATOR)
1254         cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
1255                      "identifier \"%s\" is a special operator name in C++",
1256                      NODE_NAME (result));
1257     }
1258
1259   return result;
1260 }
1261
1262 /* Lex a number to NUMBER starting at BUFFER->CUR - 1.  */
1263 static void
1264 lex_number (cpp_reader *pfile, cpp_string *number,
1265             struct normalize_state *nst)
1266 {
1267   const uchar *cur;
1268   const uchar *base;
1269   uchar *dest;
1270
1271   base = pfile->buffer->cur - 1;
1272   do
1273     {
1274       cur = pfile->buffer->cur;
1275
1276       /* N.B. ISIDNUM does not include $.  */
1277       while (ISIDNUM (*cur) || *cur == '.' || VALID_SIGN (*cur, cur[-1]))
1278         {
1279           cur++;
1280           NORMALIZE_STATE_UPDATE_IDNUM (nst);
1281         }
1282
1283       pfile->buffer->cur = cur;
1284     }
1285   while (forms_identifier_p (pfile, false, nst));
1286
1287   number->len = cur - base;
1288   dest = _cpp_unaligned_alloc (pfile, number->len + 1);
1289   memcpy (dest, base, number->len);
1290   dest[number->len] = '\0';
1291   number->text = dest;
1292 }
1293
1294 /* Create a token of type TYPE with a literal spelling.  */
1295 static void
1296 create_literal (cpp_reader *pfile, cpp_token *token, const uchar *base,
1297                 unsigned int len, enum cpp_ttype type)
1298 {
1299   uchar *dest = _cpp_unaligned_alloc (pfile, len + 1);
1300
1301   memcpy (dest, base, len);
1302   dest[len] = '\0';
1303   token->type = type;
1304   token->val.str.len = len;
1305   token->val.str.text = dest;
1306 }
1307
1308 /* Subroutine of lex_raw_string: Append LEN chars from BASE to the buffer
1309    sequence from *FIRST_BUFF_P to LAST_BUFF_P.  */
1310
1311 static void
1312 bufring_append (cpp_reader *pfile, const uchar *base, size_t len,
1313                 _cpp_buff **first_buff_p, _cpp_buff **last_buff_p)
1314 {
1315   _cpp_buff *first_buff = *first_buff_p;
1316   _cpp_buff *last_buff = *last_buff_p;
1317
1318   if (first_buff == NULL)
1319     first_buff = last_buff = _cpp_get_buff (pfile, len);
1320   else if (len > BUFF_ROOM (last_buff))
1321     {
1322       size_t room = BUFF_ROOM (last_buff);
1323       memcpy (BUFF_FRONT (last_buff), base, room);
1324       BUFF_FRONT (last_buff) += room;
1325       base += room;
1326       len -= room;
1327       last_buff = _cpp_append_extend_buff (pfile, last_buff, len);
1328     }
1329
1330   memcpy (BUFF_FRONT (last_buff), base, len);
1331   BUFF_FRONT (last_buff) += len;
1332
1333   *first_buff_p = first_buff;
1334   *last_buff_p = last_buff;
1335 }
1336
1337 /* Lexes a raw string.  The stored string contains the spelling, including
1338    double quotes, delimiter string, '(' and ')', any leading
1339    'L', 'u', 'U' or 'u8' and 'R' modifier.  It returns the type of the
1340    literal, or CPP_OTHER if it was not properly terminated.
1341
1342    The spelling is NUL-terminated, but it is not guaranteed that this
1343    is the first NUL since embedded NULs are preserved.  */
1344
1345 static void
1346 lex_raw_string (cpp_reader *pfile, cpp_token *token, const uchar *base,
1347                 const uchar *cur)
1348 {
1349   const uchar *raw_prefix;
1350   unsigned int raw_prefix_len = 0;
1351   enum cpp_ttype type;
1352   size_t total_len = 0;
1353   _cpp_buff *first_buff = NULL, *last_buff = NULL;
1354   _cpp_line_note *note = &pfile->buffer->notes[pfile->buffer->cur_note];
1355
1356   type = (*base == 'L' ? CPP_WSTRING :
1357           *base == 'U' ? CPP_STRING32 :
1358           *base == 'u' ? (base[1] == '8' ? CPP_UTF8STRING : CPP_STRING16)
1359           : CPP_STRING);
1360
1361   raw_prefix = cur + 1;
1362   while (raw_prefix_len < 16)
1363     {
1364       switch (raw_prefix[raw_prefix_len])
1365         {
1366         case ' ': case '(': case ')': case '\\': case '\t':
1367         case '\v': case '\f': case '\n': default:
1368           break;
1369         /* Basic source charset except the above chars.  */
1370         case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
1371         case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
1372         case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
1373         case 's': case 't': case 'u': case 'v': case 'w': case 'x':
1374         case 'y': case 'z':
1375         case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
1376         case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
1377         case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
1378         case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
1379         case 'Y': case 'Z':
1380         case '0': case '1': case '2': case '3': case '4': case '5':
1381         case '6': case '7': case '8': case '9':
1382         case '_': case '{': case '}': case '#': case '[': case ']':
1383         case '<': case '>': case '%': case ':': case ';': case '.':
1384         case '?': case '*': case '+': case '-': case '/': case '^':
1385         case '&': case '|': case '~': case '!': case '=': case ',':
1386         case '"': case '\'':
1387           raw_prefix_len++;
1388           continue;
1389         }
1390       break;
1391     }
1392
1393   if (raw_prefix[raw_prefix_len] != '(')
1394     {
1395       int col = CPP_BUF_COLUMN (pfile->buffer, raw_prefix + raw_prefix_len)
1396                 + 1;
1397       if (raw_prefix_len == 16)
1398         cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, col,
1399                              "raw string delimiter longer than 16 characters");
1400       else
1401         cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, col,
1402                              "invalid character '%c' in raw string delimiter",
1403                              (int) raw_prefix[raw_prefix_len]);
1404       pfile->buffer->cur = raw_prefix - 1;
1405       create_literal (pfile, token, base, raw_prefix - 1 - base, CPP_OTHER);
1406       return;
1407     }
1408
1409   cur = raw_prefix + raw_prefix_len + 1;
1410   for (;;)
1411     {
1412 #define BUF_APPEND(STR,LEN)                                     \
1413       do {                                                      \
1414         bufring_append (pfile, (const uchar *)(STR), (LEN),     \
1415                         &first_buff, &last_buff);               \
1416         total_len += (LEN);                                     \
1417       } while (0);
1418
1419       cppchar_t c;
1420
1421       /* If we previously performed any trigraph or line splicing
1422          transformations, undo them within the body of the raw string.  */
1423       while (note->pos < cur)
1424         ++note;
1425       for (; note->pos == cur; ++note)
1426         {
1427           switch (note->type)
1428             {
1429             case '\\':
1430             case ' ':
1431               /* Restore backslash followed by newline.  */
1432               BUF_APPEND (base, cur - base);
1433               base = cur;
1434               BUF_APPEND ("\\", 1);
1435             after_backslash:
1436               if (note->type == ' ')
1437                 {
1438                   /* GNU backslash whitespace newline extension.  FIXME
1439                      could be any sequence of non-vertical space.  When we
1440                      can properly restore any such sequence, we should mark
1441                      this note as handled so _cpp_process_line_notes
1442                      doesn't warn.  */
1443                   BUF_APPEND (" ", 1);
1444                 }
1445
1446               BUF_APPEND ("\n", 1);
1447               break;
1448
1449             case 0:
1450               /* Already handled.  */
1451               break;
1452
1453             default:
1454               if (_cpp_trigraph_map[note->type])
1455                 {
1456                   /* Don't warn about this trigraph in
1457                      _cpp_process_line_notes, since trigraphs show up as
1458                      trigraphs in raw strings.  */
1459                   uchar type = note->type;
1460                   note->type = 0;
1461
1462                   if (!CPP_OPTION (pfile, trigraphs))
1463                     /* If we didn't convert the trigraph in the first
1464                        place, don't do anything now either.  */
1465                     break;
1466
1467                   BUF_APPEND (base, cur - base);
1468                   base = cur;
1469                   BUF_APPEND ("??", 2);
1470
1471                   /* ??/ followed by newline gets two line notes, one for
1472                      the trigraph and one for the backslash/newline.  */
1473                   if (type == '/' && note[1].pos == cur)
1474                     {
1475                       if (note[1].type != '\\'
1476                           && note[1].type != ' ')
1477                         abort ();
1478                       BUF_APPEND ("/", 1);
1479                       ++note;
1480                       goto after_backslash;
1481                     }
1482                   /* The ) from ??) could be part of the suffix.  */
1483                   else if (type == ')'
1484                            && strncmp ((const char *) cur+1,
1485                                        (const char *) raw_prefix,
1486                                        raw_prefix_len) == 0
1487                            && cur[raw_prefix_len+1] == '"')
1488                     {
1489                       BUF_APPEND (")", 1);
1490                       base++;
1491                       cur += raw_prefix_len + 2;
1492                       goto break_outer_loop;
1493                     }
1494                   else
1495                     {
1496                       /* Skip the replacement character.  */
1497                       base = ++cur;
1498                       BUF_APPEND (&type, 1);
1499                     }
1500                 }
1501               else
1502                 abort ();
1503               break;
1504             }
1505         }
1506       c = *cur++;
1507
1508       if (c == ')'
1509           && strncmp ((const char *) cur, (const char *) raw_prefix,
1510                       raw_prefix_len) == 0
1511           && cur[raw_prefix_len] == '"')
1512         {
1513           cur += raw_prefix_len + 1;
1514           break;
1515         }
1516       else if (c == '\n')
1517         {
1518           if (pfile->state.in_directive
1519               || pfile->state.parsing_args
1520               || pfile->state.in_deferred_pragma)
1521             {
1522               cur--;
1523               type = CPP_OTHER;
1524               cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, 0,
1525                                    "unterminated raw string");
1526               break;
1527             }
1528
1529           BUF_APPEND (base, cur - base);
1530
1531           if (pfile->buffer->cur < pfile->buffer->rlimit)
1532             CPP_INCREMENT_LINE (pfile, 0);
1533           pfile->buffer->need_line = true;
1534
1535           pfile->buffer->cur = cur-1;
1536           _cpp_process_line_notes (pfile, false);
1537           if (!_cpp_get_fresh_line (pfile))
1538             {
1539               source_location src_loc = token->src_loc;
1540               token->type = CPP_EOF;
1541               /* Tell the compiler the line number of the EOF token.  */
1542               token->src_loc = pfile->line_table->highest_line;
1543               token->flags = BOL;
1544               if (first_buff != NULL)
1545                 _cpp_release_buff (pfile, first_buff);
1546               cpp_error_with_line (pfile, CPP_DL_ERROR, src_loc, 0,
1547                                    "unterminated raw string");
1548               return;
1549             }
1550
1551           cur = base = pfile->buffer->cur;
1552           note = &pfile->buffer->notes[pfile->buffer->cur_note];
1553         }
1554     }
1555  break_outer_loop:
1556
1557   if (CPP_OPTION (pfile, user_literals))
1558     {
1559       /* According to C++11 [lex.ext]p10, a ud-suffix not starting with an
1560          underscore is ill-formed.  Since this breaks programs using macros
1561          from inttypes.h, we generate a warning and treat the ud-suffix as a
1562          separate preprocessing token.  This approach is under discussion by
1563          the standards committee, and has been adopted as a conforming
1564          extension by other front ends such as clang. */
1565       if (ISALPHA (*cur))
1566         {
1567           /* Raise a warning, but do not consume subsequent tokens.  */
1568           if (CPP_OPTION (pfile, warn_literal_suffix))
1569             cpp_warning_with_line (pfile, CPP_W_LITERAL_SUFFIX,
1570                                    token->src_loc, 0,
1571                                    "invalid suffix on literal; C++11 requires "
1572                                    "a space between literal and identifier");
1573         }
1574       /* Grab user defined literal suffix.  */
1575       else if (*cur == '_')
1576         {
1577           type = cpp_userdef_string_add_type (type);
1578           ++cur;
1579
1580           while (ISIDNUM (*cur))
1581             ++cur;
1582         }
1583     }
1584
1585   pfile->buffer->cur = cur;
1586   if (first_buff == NULL)
1587     create_literal (pfile, token, base, cur - base, type);
1588   else
1589     {
1590       uchar *dest = _cpp_unaligned_alloc (pfile, total_len + (cur - base) + 1);
1591
1592       token->type = type;
1593       token->val.str.len = total_len + (cur - base);
1594       token->val.str.text = dest;
1595       last_buff = first_buff;
1596       while (last_buff != NULL)
1597         {
1598           memcpy (dest, last_buff->base,
1599                   BUFF_FRONT (last_buff) - last_buff->base);
1600           dest += BUFF_FRONT (last_buff) - last_buff->base;
1601           last_buff = last_buff->next;
1602         }
1603       _cpp_release_buff (pfile, first_buff);
1604       memcpy (dest, base, cur - base);
1605       dest[cur - base] = '\0';
1606     }
1607 }
1608
1609 /* Lexes a string, character constant, or angle-bracketed header file
1610    name.  The stored string contains the spelling, including opening
1611    quote and any leading 'L', 'u', 'U' or 'u8' and optional
1612    'R' modifier.  It returns the type of the literal, or CPP_OTHER
1613    if it was not properly terminated, or CPP_LESS for an unterminated
1614    header name which must be relexed as normal tokens.
1615
1616    The spelling is NUL-terminated, but it is not guaranteed that this
1617    is the first NUL since embedded NULs are preserved.  */
1618 static void
1619 lex_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
1620 {
1621   bool saw_NUL = false;
1622   const uchar *cur;
1623   cppchar_t terminator;
1624   enum cpp_ttype type;
1625
1626   cur = base;
1627   terminator = *cur++;
1628   if (terminator == 'L' || terminator == 'U')
1629     terminator = *cur++;
1630   else if (terminator == 'u')
1631     {
1632       terminator = *cur++;
1633       if (terminator == '8')
1634         terminator = *cur++;
1635     }
1636   if (terminator == 'R')
1637     {
1638       lex_raw_string (pfile, token, base, cur);
1639       return;
1640     }
1641   if (terminator == '"')
1642     type = (*base == 'L' ? CPP_WSTRING :
1643             *base == 'U' ? CPP_STRING32 :
1644             *base == 'u' ? (base[1] == '8' ? CPP_UTF8STRING : CPP_STRING16)
1645                          : CPP_STRING);
1646   else if (terminator == '\'')
1647     type = (*base == 'L' ? CPP_WCHAR :
1648             *base == 'U' ? CPP_CHAR32 :
1649             *base == 'u' ? CPP_CHAR16 : CPP_CHAR);
1650   else
1651     terminator = '>', type = CPP_HEADER_NAME;
1652
1653   for (;;)
1654     {
1655       cppchar_t c = *cur++;
1656
1657       /* In #include-style directives, terminators are not escapable.  */
1658       if (c == '\\' && !pfile->state.angled_headers && *cur != '\n')
1659         cur++;
1660       else if (c == terminator)
1661         break;
1662       else if (c == '\n')
1663         {
1664           cur--;
1665           /* Unmatched quotes always yield undefined behavior, but
1666              greedy lexing means that what appears to be an unterminated
1667              header name may actually be a legitimate sequence of tokens.  */
1668           if (terminator == '>')
1669             {
1670               token->type = CPP_LESS;
1671               return;
1672             }
1673           type = CPP_OTHER;
1674           break;
1675         }
1676       else if (c == '\0')
1677         saw_NUL = true;
1678     }
1679
1680   if (saw_NUL && !pfile->state.skipping)
1681     cpp_error (pfile, CPP_DL_WARNING,
1682                "null character(s) preserved in literal");
1683
1684   if (type == CPP_OTHER && CPP_OPTION (pfile, lang) != CLK_ASM)
1685     cpp_error (pfile, CPP_DL_PEDWARN, "missing terminating %c character",
1686                (int) terminator);
1687
1688   if (CPP_OPTION (pfile, user_literals))
1689     {
1690       /* According to C++11 [lex.ext]p10, a ud-suffix not starting with an
1691          underscore is ill-formed.  Since this breaks programs using macros
1692          from inttypes.h, we generate a warning and treat the ud-suffix as a
1693          separate preprocessing token.  This approach is under discussion by
1694          the standards committee, and has been adopted as a conforming
1695          extension by other front ends such as clang. */
1696       if (ISALPHA (*cur))
1697         {
1698           /* Raise a warning, but do not consume subsequent tokens.  */
1699           if (CPP_OPTION (pfile, warn_literal_suffix))
1700             cpp_warning_with_line (pfile, CPP_W_LITERAL_SUFFIX,
1701                                    token->src_loc, 0,
1702                                    "invalid suffix on literal; C++11 requires "
1703                                    "a space between literal and identifier");
1704         }
1705       /* Grab user defined literal suffix.  */
1706       else if (*cur == '_')
1707         {
1708           type = cpp_userdef_char_add_type (type);
1709           type = cpp_userdef_string_add_type (type);
1710           ++cur;
1711
1712           while (ISIDNUM (*cur))
1713             ++cur;
1714         }
1715     }
1716
1717   pfile->buffer->cur = cur;
1718   create_literal (pfile, token, base, cur - base, type);
1719 }
1720
1721 /* Return the comment table. The client may not make any assumption
1722    about the ordering of the table.  */
1723 cpp_comment_table *
1724 cpp_get_comments (cpp_reader *pfile)
1725 {
1726   return &pfile->comments;
1727 }
1728
1729 /* Append a comment to the end of the comment table. */
1730 static void
1731 store_comment (cpp_reader *pfile, cpp_token *token)
1732 {
1733   int len;
1734
1735   if (pfile->comments.allocated == 0)
1736     {
1737       pfile->comments.allocated = 256;
1738       pfile->comments.entries = (cpp_comment *) xmalloc
1739         (pfile->comments.allocated * sizeof (cpp_comment));
1740     }
1741
1742   if (pfile->comments.count == pfile->comments.allocated)
1743     {
1744       pfile->comments.allocated *= 2;
1745       pfile->comments.entries = (cpp_comment *) xrealloc
1746         (pfile->comments.entries,
1747          pfile->comments.allocated * sizeof (cpp_comment));
1748     }
1749
1750   len = token->val.str.len;
1751
1752   /* Copy comment. Note, token may not be NULL terminated. */
1753   pfile->comments.entries[pfile->comments.count].comment =
1754     (char *) xmalloc (sizeof (char) * (len + 1));
1755   memcpy (pfile->comments.entries[pfile->comments.count].comment,
1756           token->val.str.text, len);
1757   pfile->comments.entries[pfile->comments.count].comment[len] = '\0';
1758
1759   /* Set source location. */
1760   pfile->comments.entries[pfile->comments.count].sloc = token->src_loc;
1761
1762   /* Increment the count of entries in the comment table. */
1763   pfile->comments.count++;
1764 }
1765
1766 /* The stored comment includes the comment start and any terminator.  */
1767 static void
1768 save_comment (cpp_reader *pfile, cpp_token *token, const unsigned char *from,
1769               cppchar_t type)
1770 {
1771   unsigned char *buffer;
1772   unsigned int len, clen, i;
1773
1774   len = pfile->buffer->cur - from + 1; /* + 1 for the initial '/'.  */
1775
1776   /* C++ comments probably (not definitely) have moved past a new
1777      line, which we don't want to save in the comment.  */
1778   if (is_vspace (pfile->buffer->cur[-1]))
1779     len--;
1780
1781   /* If we are currently in a directive or in argument parsing, then
1782      we need to store all C++ comments as C comments internally, and
1783      so we need to allocate a little extra space in that case.
1784
1785      Note that the only time we encounter a directive here is
1786      when we are saving comments in a "#define".  */
1787   clen = ((pfile->state.in_directive || pfile->state.parsing_args)
1788           && type == '/') ? len + 2 : len;
1789
1790   buffer = _cpp_unaligned_alloc (pfile, clen);
1791
1792   token->type = CPP_COMMENT;
1793   token->val.str.len = clen;
1794   token->val.str.text = buffer;
1795
1796   buffer[0] = '/';
1797   memcpy (buffer + 1, from, len - 1);
1798
1799   /* Finish conversion to a C comment, if necessary.  */
1800   if ((pfile->state.in_directive || pfile->state.parsing_args) && type == '/')
1801     {
1802       buffer[1] = '*';
1803       buffer[clen - 2] = '*';
1804       buffer[clen - 1] = '/';
1805       /* As there can be in a C++ comments illegal sequences for C comments
1806          we need to filter them out.  */
1807       for (i = 2; i < (clen - 2); i++)
1808         if (buffer[i] == '/' && (buffer[i - 1] == '*' || buffer[i + 1] == '*'))
1809           buffer[i] = '|';
1810     }
1811
1812   /* Finally store this comment for use by clients of libcpp. */
1813   store_comment (pfile, token);
1814 }
1815
1816 /* Allocate COUNT tokens for RUN.  */
1817 void
1818 _cpp_init_tokenrun (tokenrun *run, unsigned int count)
1819 {
1820   run->base = XNEWVEC (cpp_token, count);
1821   run->limit = run->base + count;
1822   run->next = NULL;
1823 }
1824
1825 /* Returns the next tokenrun, or creates one if there is none.  */
1826 static tokenrun *
1827 next_tokenrun (tokenrun *run)
1828 {
1829   if (run->next == NULL)
1830     {
1831       run->next = XNEW (tokenrun);
1832       run->next->prev = run;
1833       _cpp_init_tokenrun (run->next, 250);
1834     }
1835
1836   return run->next;
1837 }
1838
1839 /* Return the number of not yet processed token in a given
1840    context.  */
1841 int
1842 _cpp_remaining_tokens_num_in_context (cpp_context *context)
1843 {
1844   if (context->tokens_kind == TOKENS_KIND_DIRECT)
1845     return (LAST (context).token - FIRST (context).token);
1846   else if (context->tokens_kind == TOKENS_KIND_INDIRECT
1847            || context->tokens_kind == TOKENS_KIND_EXTENDED)
1848     return (LAST (context).ptoken - FIRST (context).ptoken);
1849   else
1850       abort ();
1851 }
1852
1853 /* Returns the token present at index INDEX in a given context.  If
1854    INDEX is zero, the next token to be processed is returned.  */
1855 static const cpp_token*
1856 _cpp_token_from_context_at (cpp_context *context, int index)
1857 {
1858   if (context->tokens_kind == TOKENS_KIND_DIRECT)
1859     return &(FIRST (context).token[index]);
1860   else if (context->tokens_kind == TOKENS_KIND_INDIRECT
1861            || context->tokens_kind == TOKENS_KIND_EXTENDED)
1862     return FIRST (context).ptoken[index];
1863  else
1864    abort ();
1865 }
1866
1867 /* Look ahead in the input stream.  */
1868 const cpp_token *
1869 cpp_peek_token (cpp_reader *pfile, int index)
1870 {
1871   cpp_context *context = pfile->context;
1872   const cpp_token *peektok;
1873   int count;
1874
1875   /* First, scan through any pending cpp_context objects.  */
1876   while (context->prev)
1877     {
1878       ptrdiff_t sz = _cpp_remaining_tokens_num_in_context (context);
1879
1880       if (index < (int) sz)
1881         return _cpp_token_from_context_at (context, index);
1882       index -= (int) sz;
1883       context = context->prev;
1884     }
1885
1886   /* We will have to read some new tokens after all (and do so
1887      without invalidating preceding tokens).  */
1888   count = index;
1889   pfile->keep_tokens++;
1890
1891   do
1892     {
1893       peektok = _cpp_lex_token (pfile);
1894       if (peektok->type == CPP_EOF)
1895         return peektok;
1896     }
1897   while (index--);
1898
1899   _cpp_backup_tokens_direct (pfile, count + 1);
1900   pfile->keep_tokens--;
1901
1902   return peektok;
1903 }
1904
/* Allocate a single token that is invalidated at the same time as the
   rest of the tokens on the line.  Has its line and col set to the
   same as the last lexed token, so that diagnostics appear in the
   right place.

   The new token takes the slot at pfile->cur_token.  Any lookahead
   tokens already stored from that position onwards are moved up by
   one slot first, spilling into the next token run when the current
   one has no room left.  Returns the new token; pfile->cur_token is
   advanced past it.  */
cpp_token *
_cpp_temp_token (cpp_reader *pfile)
{
  cpp_token *old, *result;
  /* Number of token slots from cur_token to the end of the current run.  */
  ptrdiff_t sz = pfile->cur_run->limit - pfile->cur_token;
  /* Number of pending lookahead tokens.  */
  ptrdiff_t la = (ptrdiff_t) pfile->lookaheads;

  /* The most recently lexed token; its location is reused below.  */
  old = pfile->cur_token - 1;
  /* Any pre-existing lookaheads must not be clobbered.  */
  if (la)
    {
      /* The lookaheads fill (or overflow) the rest of this run, so
         shifting them by one slot needs the next run.  */
      if (sz <= la)
        {
          tokenrun *next = next_tokenrun (pfile->cur_run);

          /* Lookaheads already living in the next run move up first,
             freeing its first slot.  */
          if (sz < la)
            memmove (next->base + 1, next->base,
                     (la - sz) * sizeof (cpp_token));

          /* The last slot of this run carries over into the next run.  */
          next->base[0] = pfile->cur_run->limit[-1];
        }

      /* Shift the lookaheads that stay inside the current run.  */
      if (sz > 1)
        memmove (pfile->cur_token + 1, pfile->cur_token,
                 MIN (la, sz - 1) * sizeof (cpp_token));
    }

  /* No room left in this run: the new token goes at the start of the
     next one.  */
  if (!sz && pfile->cur_token == pfile->cur_run->limit)
    {
      pfile->cur_run = next_tokenrun (pfile->cur_run);
      pfile->cur_token = pfile->cur_run->base;
    }

  result = pfile->cur_token++;
  result->src_loc = old->src_loc;
  return result;
}
1946
1947 /* Lex a token into RESULT (external interface).  Takes care of issues
1948    like directive handling, token lookahead, multiple include
1949    optimization and skipping.  */
1950 const cpp_token *
1951 _cpp_lex_token (cpp_reader *pfile)
1952 {
1953   cpp_token *result;
1954
1955   for (;;)
1956     {
1957       if (pfile->cur_token == pfile->cur_run->limit)
1958         {
1959           pfile->cur_run = next_tokenrun (pfile->cur_run);
1960           pfile->cur_token = pfile->cur_run->base;
1961         }
1962       /* We assume that the current token is somewhere in the current
1963          run.  */
1964       if (pfile->cur_token < pfile->cur_run->base
1965           || pfile->cur_token >= pfile->cur_run->limit)
1966         abort ();
1967
1968       if (pfile->lookaheads)
1969         {
1970           pfile->lookaheads--;
1971           result = pfile->cur_token++;
1972         }
1973       else
1974         result = _cpp_lex_direct (pfile);
1975
1976       if (result->flags & BOL)
1977         {
1978           /* Is this a directive.  If _cpp_handle_directive returns
1979              false, it is an assembler #.  */
1980           if (result->type == CPP_HASH
1981               /* 6.10.3 p 11: Directives in a list of macro arguments
1982                  gives undefined behavior.  This implementation
1983                  handles the directive as normal.  */
1984               && pfile->state.parsing_args != 1)
1985             {
1986               if (_cpp_handle_directive (pfile, result->flags & PREV_WHITE))
1987                 {
1988                   if (pfile->directive_result.type == CPP_PADDING)
1989                     continue;
1990                   result = &pfile->directive_result;
1991                 }
1992             }
1993           else if (pfile->state.in_deferred_pragma)
1994             result = &pfile->directive_result;
1995
1996           if (pfile->cb.line_change && !pfile->state.skipping)
1997             pfile->cb.line_change (pfile, result, pfile->state.parsing_args);
1998         }
1999
2000       /* We don't skip tokens in directives.  */
2001       if (pfile->state.in_directive || pfile->state.in_deferred_pragma)
2002         break;
2003
2004       /* Outside a directive, invalidate controlling macros.  At file
2005          EOF, _cpp_lex_direct takes care of popping the buffer, so we never
2006          get here and MI optimization works.  */
2007       pfile->mi_valid = false;
2008
2009       if (!pfile->state.skipping || result->type == CPP_EOF)
2010         break;
2011     }
2012
2013   return result;
2014 }
2015
2016 /* Returns true if a fresh line has been loaded.  */
2017 bool
2018 _cpp_get_fresh_line (cpp_reader *pfile)
2019 {
2020   int return_at_eof;
2021
2022   /* We can't get a new line until we leave the current directive.  */
2023   if (pfile->state.in_directive)
2024     return false;
2025
2026   for (;;)
2027     {
2028       cpp_buffer *buffer = pfile->buffer;
2029
2030       if (!buffer->need_line)
2031         return true;
2032
2033       if (buffer->next_line < buffer->rlimit)
2034         {
2035           _cpp_clean_line (pfile);
2036           return true;
2037         }
2038
2039       /* First, get out of parsing arguments state.  */
2040       if (pfile->state.parsing_args)
2041         return false;
2042
2043       /* End of buffer.  Non-empty files should end in a newline.  */
2044       if (buffer->buf != buffer->rlimit
2045           && buffer->next_line > buffer->rlimit
2046           && !buffer->from_stage3)
2047         {
2048           /* Clip to buffer size.  */
2049           buffer->next_line = buffer->rlimit;
2050         }
2051
2052       return_at_eof = buffer->return_at_eof;
2053       _cpp_pop_buffer (pfile);
2054       if (pfile->buffer == NULL || return_at_eof)
2055         return false;
2056     }
2057 }
2058
/* Helper for _cpp_lex_direct.  If the next character in BUFFER is
   CHAR, consume it and give RESULT the type THEN_TYPE; otherwise
   leave the buffer alone and give RESULT the type ELSE_TYPE.  Relies
   on variables named `buffer' and `result' at the point of use.  */
#define IF_NEXT_IS(CHAR, THEN_TYPE, ELSE_TYPE)          \
  do                                                    \
    {                                                   \
      if (*buffer->cur == CHAR)                         \
        {                                               \
          buffer->cur++;                                \
          result->type = THEN_TYPE;                     \
        }                                               \
      else                                              \
        result->type = ELSE_TYPE;                       \
    }                                                   \
  while (0)
2067
/* Lex a token into pfile->cur_token, which is also incremented, to
   get diagnostics pointing to the correct location.

   Does not handle issues such as token lookahead, multiple-include
   optimization, directives, skipping etc.  This function is only
   suitable for use by _cpp_lex_token, and in special cases like
   lex_expansion_token which doesn't care for any of these issues.

   When meeting a newline, returns CPP_EOF if parsing a directive,
   otherwise returns to the start of the token buffer if permissible.
   Returns a pointer to the lexed token.  */
cpp_token *
_cpp_lex_direct (cpp_reader *pfile)
{
  cppchar_t c;
  cpp_buffer *buffer;
  const unsigned char *comment_start;
  cpp_token *result = pfile->cur_token++;

  /* Entered at the start of lexing and again after every newline.  */
 fresh_line:
  result->flags = 0;
  buffer = pfile->buffer;
  if (buffer->need_line)
    {
      /* The end of the line ends a deferred pragma.  */
      if (pfile->state.in_deferred_pragma)
        {
          result->type = CPP_PRAGMA_EOL;
          pfile->state.in_deferred_pragma = false;
          if (!pfile->state.pragma_allow_expansion)
            pfile->state.prevent_expansion--;
          return result;
        }
      /* No further line is available: report end of input.  */
      if (!_cpp_get_fresh_line (pfile))
        {
          result->type = CPP_EOF;
          if (!pfile->state.in_directive)
            {
              /* Tell the compiler the line number of the EOF token.  */
              result->src_loc = pfile->line_table->highest_line;
              result->flags = BOL;
            }
          return result;
        }
      /* Unless earlier tokens must be preserved, start again at the
         beginning of the base token run.  */
      if (!pfile->keep_tokens)
        {
          pfile->cur_run = &pfile->base_run;
          result = pfile->base_run.base;
          pfile->cur_token = result + 1;
        }
      /* This token is the first one on its line.  */
      result->flags = BOL;
      if (pfile->state.parsing_args == 2)
        result->flags |= PREV_WHITE;
    }
  /* Reload: _cpp_get_fresh_line may have popped buffers.  */
  buffer = pfile->buffer;
  /* Re-entered after a skipped comment.  */
 update_tokens_line:
  result->src_loc = pfile->line_table->highest_line;

  /* Re-entered after skipping horizontal whitespace.  */
 skipped_white:
  if (buffer->cur >= buffer->notes[buffer->cur_note].pos
      && !pfile->overlaid_buffer)
    {
      _cpp_process_line_notes (pfile, false);
      result->src_loc = pfile->line_table->highest_line;
    }
  /* Fetch the character that starts the token and step past it.  */
  c = *buffer->cur++;

  if (pfile->forced_token_location_p)
    result->src_loc = *pfile->forced_token_location_p;
  else
    result->src_loc = linemap_position_for_column (pfile->line_table,
                                          CPP_BUF_COLUMN (buffer, buffer->cur));

  switch (c)
    {
    case ' ': case '\t': case '\f': case '\v': case '\0':
      result->flags |= PREV_WHITE;
      skip_whitespace (pfile, c);
      goto skipped_white;

    case '\n':
      if (buffer->cur < buffer->rlimit)
        CPP_INCREMENT_LINE (pfile, 0);
      buffer->need_line = true;
      goto fresh_line;

    case '0': case '1': case '2': case '3': case '4':
    case '5': case '6': case '7': case '8': case '9':
      {
        struct normalize_state nst = INITIAL_NORMALIZE_STATE;
        result->type = CPP_NUMBER;
        lex_number (pfile, &result->val.str, &nst);
        warn_about_normalization (pfile, result, &nst);
        break;
      }

    case 'L':
    case 'u':
    case 'U':
    case 'R':
      /* 'L', 'u', 'U', 'u8' or 'R' may introduce wide characters,
         wide strings or raw strings.  */
      if (c == 'L' || CPP_OPTION (pfile, rliterals)
          || (c != 'R' && CPP_OPTION (pfile, uliterals)))
        {
          /* Accepted forms: a prefix other than R followed by a quote
             character, any prefix followed by a double quote, a
             prefix other than R followed by R" when raw literals are
             enabled, and u8" or (with raw literals) u8R".  */
          if ((*buffer->cur == '\'' && c != 'R')
              || *buffer->cur == '"'
              || (*buffer->cur == 'R'
                  && c != 'R'
                  && buffer->cur[1] == '"'
                  && CPP_OPTION (pfile, rliterals))
              || (*buffer->cur == '8'
                  && c == 'u'
                  && (buffer->cur[1] == '"'
                      || (buffer->cur[1] == 'R' && buffer->cur[2] == '"'
                          && CPP_OPTION (pfile, rliterals)))))
            {
              lex_string (pfile, result, buffer->cur - 1);
              break;
            }
        }
      /* Fall through.  */

    case '_':
    case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
    case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
    case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
    case 's': case 't':           case 'v': case 'w': case 'x':
    case 'y': case 'z':
    case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
    case 'G': case 'H': case 'I': case 'J': case 'K':
    case 'M': case 'N': case 'O': case 'P': case 'Q':
    case 'S': case 'T':           case 'V': case 'W': case 'X':
    case 'Y': case 'Z':
      result->type = CPP_NAME;
      {
        struct normalize_state nst = INITIAL_NORMALIZE_STATE;
        result->val.node.node = lex_identifier (pfile, buffer->cur - 1, false,
                                                &nst);
        warn_about_normalization (pfile, result, &nst);
      }

      /* Convert named operators to their proper types.  */
      if (result->val.node.node->flags & NODE_OPERATOR)
        {
          result->flags |= NAMED_OP;
          result->type = (enum cpp_ttype) result->val.node.node->directive_index;
        }
      break;

    case '\'':
    case '"':
      lex_string (pfile, result, buffer->cur - 1);
      break;

    case '/':
      /* A potential block or line comment.  */
      comment_start = buffer->cur;
      c = *buffer->cur;

      if (c == '*')
        {
          if (_cpp_skip_block_comment (pfile))
            cpp_error (pfile, CPP_DL_ERROR, "unterminated comment");
        }
      else if (c == '/' && (CPP_OPTION (pfile, cplusplus_comments)
                            || cpp_in_system_header (pfile)))
        {
          /* Warn about comments only if pedantically GNUC89, and not
             in system headers.  */
          if (CPP_OPTION (pfile, lang) == CLK_GNUC89 && CPP_PEDANTIC (pfile)
              && ! buffer->warned_cplusplus_comments)
            {
              cpp_error (pfile, CPP_DL_PEDWARN,
                         "C++ style comments are not allowed in ISO C90");
              cpp_error (pfile, CPP_DL_PEDWARN,
                         "(this will be reported only once per input file)");
              buffer->warned_cplusplus_comments = 1;
            }

          if (skip_line_comment (pfile) && CPP_OPTION (pfile, warn_comments))
            cpp_warning (pfile, CPP_W_COMMENTS, "multi-line comment");
        }
      else if (c == '=')
        {
          buffer->cur++;
          result->type = CPP_DIV_EQ;
          break;
        }
      else
        {
          result->type = CPP_DIV;
          break;
        }

      /* Only the two comment branches above reach this point.  When
         comments are not being saved, the comment counts as
         whitespace and lexing continues with whatever follows it.  */
      if (!pfile->state.save_comments)
        {
          result->flags |= PREV_WHITE;
          goto update_tokens_line;
        }

      /* Save the comment as a token in its own right.  */
      save_comment (pfile, result, comment_start, c);
      break;

    case '<':
      /* When angled header names are expected, lex_string gets the
         first try; this token is finished unless it left the type as
         CPP_LESS.  */
      if (pfile->state.angled_headers)
        {
          lex_string (pfile, result, buffer->cur - 1);
          if (result->type != CPP_LESS)
            break;
        }

      result->type = CPP_LESS;
      if (*buffer->cur == '=')
        buffer->cur++, result->type = CPP_LESS_EQ;
      else if (*buffer->cur == '<')
        {
          buffer->cur++;
          IF_NEXT_IS ('=', CPP_LSHIFT_EQ, CPP_LSHIFT);
        }
      else if (CPP_OPTION (pfile, digraphs))
        {
          if (*buffer->cur == ':')
            {
              /* C++11 [2.5/3 lex.pptoken], "Otherwise, if the next
                 three characters are <:: and the subsequent character
                 is neither : nor >, the < is treated as a preprocessor
                 token by itself".  */
              if (CPP_OPTION (pfile, cplusplus)
                  && (CPP_OPTION (pfile, lang) == CLK_CXX11
                      || CPP_OPTION (pfile, lang) == CLK_GNUCXX11)
                  && buffer->cur[1] == ':'
                  && buffer->cur[2] != ':' && buffer->cur[2] != '>')
                break;

              /* The digraph "<:" spells '['.  */
              buffer->cur++;
              result->flags |= DIGRAPH;
              result->type = CPP_OPEN_SQUARE;
            }
          else if (*buffer->cur == '%')
            {
              /* The digraph "<%" spells '{'.  */
              buffer->cur++;
              result->flags |= DIGRAPH;
              result->type = CPP_OPEN_BRACE;
            }
        }
      break;

    case '>':
      result->type = CPP_GREATER;
      if (*buffer->cur == '=')
        buffer->cur++, result->type = CPP_GREATER_EQ;
      else if (*buffer->cur == '>')
        {
          buffer->cur++;
          IF_NEXT_IS ('=', CPP_RSHIFT_EQ, CPP_RSHIFT);
        }
      break;

    case '%':
      result->type = CPP_MOD;
      if (*buffer->cur == '=')
        buffer->cur++, result->type = CPP_MOD_EQ;
      else if (CPP_OPTION (pfile, digraphs))
        {
          if (*buffer->cur == ':')
            {
              /* The digraph "%:" spells '#', and "%:%:" spells "##".  */
              buffer->cur++;
              result->flags |= DIGRAPH;
              result->type = CPP_HASH;
              if (*buffer->cur == '%' && buffer->cur[1] == ':')
                buffer->cur += 2, result->type = CPP_PASTE, result->val.token_no = 0;
            }
          else if (*buffer->cur == '>')
            {
              /* The digraph "%>" spells '}'.  */
              buffer->cur++;
              result->flags |= DIGRAPH;
              result->type = CPP_CLOSE_BRACE;
            }
        }
      break;

    case '.':
      result->type = CPP_DOT;
      /* A '.' followed by a digit starts a number.  */
      if (ISDIGIT (*buffer->cur))
        {
          struct normalize_state nst = INITIAL_NORMALIZE_STATE;
          result->type = CPP_NUMBER;
          lex_number (pfile, &result->val.str, &nst);
          warn_about_normalization (pfile, result, &nst);
        }
      else if (*buffer->cur == '.' && buffer->cur[1] == '.')
        buffer->cur += 2, result->type = CPP_ELLIPSIS;
      else if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
        buffer->cur++, result->type = CPP_DOT_STAR;
      break;

    case '+':
      result->type = CPP_PLUS;
      if (*buffer->cur == '+')
        buffer->cur++, result->type = CPP_PLUS_PLUS;
      else if (*buffer->cur == '=')
        buffer->cur++, result->type = CPP_PLUS_EQ;
      break;

    case '-':
      result->type = CPP_MINUS;
      if (*buffer->cur == '>')
        {
          buffer->cur++;
          result->type = CPP_DEREF;
          if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
            buffer->cur++, result->type = CPP_DEREF_STAR;
        }
      else if (*buffer->cur == '-')
        buffer->cur++, result->type = CPP_MINUS_MINUS;
      else if (*buffer->cur == '=')
        buffer->cur++, result->type = CPP_MINUS_EQ;
      break;

    case '&':
      result->type = CPP_AND;
      if (*buffer->cur == '&')
        buffer->cur++, result->type = CPP_AND_AND;
      else if (*buffer->cur == '=')
        buffer->cur++, result->type = CPP_AND_EQ;
      break;

    case '|':
      result->type = CPP_OR;
      if (*buffer->cur == '|')
        buffer->cur++, result->type = CPP_OR_OR;
      else if (*buffer->cur == '=')
        buffer->cur++, result->type = CPP_OR_EQ;
      break;

    case ':':
      result->type = CPP_COLON;
      if (*buffer->cur == ':' && CPP_OPTION (pfile, cplusplus))
        buffer->cur++, result->type = CPP_SCOPE;
      else if (*buffer->cur == '>' && CPP_OPTION (pfile, digraphs))
        {
          /* The digraph ":>" spells ']'.  */
          buffer->cur++;
          result->flags |= DIGRAPH;
          result->type = CPP_CLOSE_SQUARE;
        }
      break;

    case '*': IF_NEXT_IS ('=', CPP_MULT_EQ, CPP_MULT); break;
    case '=': IF_NEXT_IS ('=', CPP_EQ_EQ, CPP_EQ); break;
    case '!': IF_NEXT_IS ('=', CPP_NOT_EQ, CPP_NOT); break;
    case '^': IF_NEXT_IS ('=', CPP_XOR_EQ, CPP_XOR); break;
    case '#': IF_NEXT_IS ('#', CPP_PASTE, CPP_HASH); result->val.token_no = 0; break;

    case '?': result->type = CPP_QUERY; break;
    case '~': result->type = CPP_COMPL; break;
    case ',': result->type = CPP_COMMA; break;
    case '(': result->type = CPP_OPEN_PAREN; break;
    case ')': result->type = CPP_CLOSE_PAREN; break;
    case '[': result->type = CPP_OPEN_SQUARE; break;
    case ']': result->type = CPP_CLOSE_SQUARE; break;
    case '{': result->type = CPP_OPEN_BRACE; break;
    case '}': result->type = CPP_CLOSE_BRACE; break;
    case ';': result->type = CPP_SEMICOLON; break;

      /* @ is a punctuator in Objective-C.  */
    case '@': result->type = CPP_ATSIGN; break;

    case '$':
    case '\\':
      {
        /* Step back onto the character so that forms_identifier_p
           can examine it.  */
        const uchar *base = --buffer->cur;
        struct normalize_state nst = INITIAL_NORMALIZE_STATE;

        if (forms_identifier_p (pfile, true, &nst))
          {
            result->type = CPP_NAME;
            result->val.node.node = lex_identifier (pfile, base, true, &nst);
            warn_about_normalization (pfile, result, &nst);
            break;
          }
        /* Not the start of an identifier: step over the character
           again and treat it as a stray character below.  */
        buffer->cur++;
      }
      /* Fall through.  */

    default:
      /* Any other character becomes a one-character CPP_OTHER token.  */
      create_literal (pfile, result, buffer->cur - 1, 1, CPP_OTHER);
      break;
    }

  return result;
}
2459
2460 /* An upper bound on the number of bytes needed to spell TOKEN.
2461    Does not include preceding whitespace.  */
2462 unsigned int
2463 cpp_token_len (const cpp_token *token)
2464 {
2465   unsigned int len;
2466
2467   switch (TOKEN_SPELL (token))
2468     {
2469     default:            len = 6;                                break;
2470     case SPELL_LITERAL: len = token->val.str.len;               break;
2471     case SPELL_IDENT:   len = NODE_LEN (token->val.node.node) * 10;     break;
2472     }
2473
2474   return len;
2475 }
2476
/* Decode the UTF-8 sequence starting at NAME and write the matching
   universal character name, in the form \UXXXXXXXX with lower-case
   hexadecimal digits, to BUFFER.  Exactly 10 bytes are written and no
   terminating NUL is added.  Returns the number of bytes of NAME that
   made up the sequence.  Aborts if a continuation byte is
   ill-formed.  */

static size_t
utf8_to_ucn (unsigned char *buffer, const unsigned char *name)
{
  static const char hex_digits[] = "0123456789abcdef";
  unsigned lead = *name;
  unsigned long code_point;
  int seq_len = 0;
  int shift;
  int k;

  /* The number of leading one bits in the first byte is the length
     of the whole sequence.  */
  while (lead & 0x80)
    {
      seq_len++;
      lead <<= 1;
    }

  /* The bits after the length marker are the top of the value.  */
  code_point = *name & (0x7F >> seq_len);

  /* Every continuation byte supplies six more bits.  */
  for (k = 1; k < seq_len; k++)
    {
      name++;
      code_point = (code_point << 6) | (*name & 0x3F);

      /* Ill-formed UTF-8: continuation bytes must look like 10xxxxxx.  */
      if ((*name & 0xC0) != 0x80)
        abort ();
    }

  buffer[0] = '\\';
  buffer[1] = 'U';
  /* Eight hex digits, most significant first.  */
  for (k = 2, shift = 28; k < 10; k++, shift -= 4)
    buffer[k] = hex_digits[(code_point >> shift) & 0xF];

  return seq_len;
}
2510
2511 /* Given a token TYPE corresponding to a digraph, return a pointer to
2512    the spelling of the digraph.  */
2513 static const unsigned char *
2514 cpp_digraph2name (enum cpp_ttype type)
2515 {
2516   return digraph_spellings[(int) type - (int) CPP_FIRST_DIGRAPH];
2517 }
2518
2519 /* Write the spelling of a token TOKEN to BUFFER.  The buffer must
2520    already contain the enough space to hold the token's spelling.
2521    Returns a pointer to the character after the last character written.
2522    FORSTRING is true if this is to be the spelling after translation
2523    phase 1 (this is different for UCNs).
2524    FIXME: Would be nice if we didn't need the PFILE argument.  */
2525 unsigned char *
2526 cpp_spell_token (cpp_reader *pfile, const cpp_token *token,
2527                  unsigned char *buffer, bool forstring)
2528 {
2529   switch (TOKEN_SPELL (token))
2530     {
2531     case SPELL_OPERATOR:
2532       {
2533         const unsigned char *spelling;
2534         unsigned char c;
2535
2536         if (token->flags & DIGRAPH)
2537           spelling = cpp_digraph2name (token->type);
2538         else if (token->flags & NAMED_OP)
2539           goto spell_ident;
2540         else
2541           spelling = TOKEN_NAME (token);
2542
2543         while ((c = *spelling++) != '\0')
2544           *buffer++ = c;
2545       }
2546       break;
2547
2548     spell_ident:
2549     case SPELL_IDENT:
2550       if (forstring)
2551         {
2552           memcpy (buffer, NODE_NAME (token->val.node.node),
2553                   NODE_LEN (token->val.node.node));
2554           buffer += NODE_LEN (token->val.node.node);
2555         }
2556       else
2557         {
2558           size_t i;
2559           const unsigned char * name = NODE_NAME (token->val.node.node);
2560
2561           for (i = 0; i < NODE_LEN (token->val.node.node); i++)
2562             if (name[i] & ~0x7F)
2563               {
2564                 i += utf8_to_ucn (buffer, name + i) - 1;
2565                 buffer += 10;
2566               }
2567             else
2568               *buffer++ = NODE_NAME (token->val.node.node)[i];
2569         }
2570       break;
2571
2572     case SPELL_LITERAL:
2573       memcpy (buffer, token->val.str.text, token->val.str.len);
2574       buffer += token->val.str.len;
2575       break;
2576
2577     case SPELL_NONE:
2578       cpp_error (pfile, CPP_DL_ICE,
2579                  "unspellable token %s", TOKEN_NAME (token));
2580       break;
2581     }
2582
2583   return buffer;
2584 }
2585
2586 /* Returns TOKEN spelt as a null-terminated string.  The string is
2587    freed when the reader is destroyed.  Useful for diagnostics.  */
2588 unsigned char *
2589 cpp_token_as_text (cpp_reader *pfile, const cpp_token *token)
2590 {
2591   unsigned int len = cpp_token_len (token) + 1;
2592   unsigned char *start = _cpp_unaligned_alloc (pfile, len), *end;
2593
2594   end = cpp_spell_token (pfile, token, start, false);
2595   end[0] = '\0';
2596
2597   return start;
2598 }
2599
2600 /* Returns a pointer to a string which spells the token defined by
2601    TYPE and FLAGS.  Used by C front ends, which really should move to
2602    using cpp_token_as_text.  */
2603 const char *
2604 cpp_type2name (enum cpp_ttype type, unsigned char flags)
2605 {
2606   if (flags & DIGRAPH)
2607     return (const char *) cpp_digraph2name (type);
2608   else if (flags & NAMED_OP)
2609     return cpp_named_operator2name (type);
2610
2611   return (const char *) token_spellings[type].name;
2612 }
2613
2614 /* Writes the spelling of token to FP, without any preceding space.
2615    Separated from cpp_spell_token for efficiency - to avoid stdio
2616    double-buffering.  */
2617 void
2618 cpp_output_token (const cpp_token *token, FILE *fp)
2619 {
2620   switch (TOKEN_SPELL (token))
2621     {
2622     case SPELL_OPERATOR:
2623       {
2624         const unsigned char *spelling;
2625         int c;
2626
2627         if (token->flags & DIGRAPH)
2628           spelling = cpp_digraph2name (token->type);
2629         else if (token->flags & NAMED_OP)
2630           goto spell_ident;
2631         else
2632           spelling = TOKEN_NAME (token);
2633
2634         c = *spelling;
2635         do
2636           putc (c, fp);
2637         while ((c = *++spelling) != '\0');
2638       }
2639       break;
2640
2641     spell_ident:
2642     case SPELL_IDENT:
2643       {
2644         size_t i;
2645         const unsigned char * name = NODE_NAME (token->val.node.node);
2646
2647         for (i = 0; i < NODE_LEN (token->val.node.node); i++)
2648           if (name[i] & ~0x7F)
2649             {
2650               unsigned char buffer[10];
2651               i += utf8_to_ucn (buffer, name + i) - 1;
2652               fwrite (buffer, 1, 10, fp);
2653             }
2654           else
2655             fputc (NODE_NAME (token->val.node.node)[i], fp);
2656       }
2657       break;
2658
2659     case SPELL_LITERAL:
2660       fwrite (token->val.str.text, 1, token->val.str.len, fp);
2661       break;
2662
2663     case SPELL_NONE:
2664       /* An error, most probably.  */
2665       break;
2666     }
2667 }
2668
2669 /* Compare two tokens.  */
2670 int
2671 _cpp_equiv_tokens (const cpp_token *a, const cpp_token *b)
2672 {
2673   if (a->type == b->type && a->flags == b->flags)
2674     switch (TOKEN_SPELL (a))
2675       {
2676       default:                  /* Keep compiler happy.  */
2677       case SPELL_OPERATOR:
2678         /* token_no is used to track where multiple consecutive ##
2679            tokens were originally located.  */
2680         return (a->type != CPP_PASTE || a->val.token_no == b->val.token_no);
2681       case SPELL_NONE:
2682         return (a->type != CPP_MACRO_ARG
2683                 || a->val.macro_arg.arg_no == b->val.macro_arg.arg_no);
2684       case SPELL_IDENT:
2685         return a->val.node.node == b->val.node.node;
2686       case SPELL_LITERAL:
2687         return (a->val.str.len == b->val.str.len
2688                 && !memcmp (a->val.str.text, b->val.str.text,
2689                             a->val.str.len));
2690       }
2691
2692   return 0;
2693 }
2694
2695 /* Returns nonzero if a space should be inserted to avoid an
2696    accidental token paste for output.  For simplicity, it is
2697    conservative, and occasionally advises a space where one is not
2698    needed, e.g. "." and ".2".  */
2699 int
2700 cpp_avoid_paste (cpp_reader *pfile, const cpp_token *token1,
2701                  const cpp_token *token2)
2702 {
2703   enum cpp_ttype a = token1->type, b = token2->type;
2704   cppchar_t c;
2705
2706   if (token1->flags & NAMED_OP)
2707     a = CPP_NAME;
2708   if (token2->flags & NAMED_OP)
2709     b = CPP_NAME;
2710
2711   c = EOF;
2712   if (token2->flags & DIGRAPH)
2713     c = digraph_spellings[(int) b - (int) CPP_FIRST_DIGRAPH][0];
2714   else if (token_spellings[b].category == SPELL_OPERATOR)
2715     c = token_spellings[b].name[0];
2716
2717   /* Quickly get everything that can paste with an '='.  */
2718   if ((int) a <= (int) CPP_LAST_EQ && c == '=')
2719     return 1;
2720
2721   switch (a)
2722     {
2723     case CPP_GREATER:   return c == '>';
2724     case CPP_LESS:      return c == '<' || c == '%' || c == ':';
2725     case CPP_PLUS:      return c == '+';
2726     case CPP_MINUS:     return c == '-' || c == '>';
2727     case CPP_DIV:       return c == '/' || c == '*'; /* Comments.  */
2728     case CPP_MOD:       return c == ':' || c == '>';
2729     case CPP_AND:       return c == '&';
2730     case CPP_OR:        return c == '|';
2731     case CPP_COLON:     return c == ':' || c == '>';
2732     case CPP_DEREF:     return c == '*';
2733     case CPP_DOT:       return c == '.' || c == '%' || b == CPP_NUMBER;
2734     case CPP_HASH:      return c == '#' || c == '%'; /* Digraph form.  */
2735     case CPP_NAME:      return ((b == CPP_NUMBER
2736                                  && name_p (pfile, &token2->val.str))
2737                                 || b == CPP_NAME
2738                                 || b == CPP_CHAR || b == CPP_STRING); /* L */
2739     case CPP_NUMBER:    return (b == CPP_NUMBER || b == CPP_NAME
2740                                 || c == '.' || c == '+' || c == '-');
2741                                       /* UCNs */
2742     case CPP_OTHER:     return ((token1->val.str.text[0] == '\\'
2743                                  && b == CPP_NAME)
2744                                 || (CPP_OPTION (pfile, objc)
2745                                     && token1->val.str.text[0] == '@'
2746                                     && (b == CPP_NAME || b == CPP_STRING)));
2747     default:            break;
2748     }
2749
2750   return 0;
2751 }
2752
2753 /* Output all the remaining tokens on the current line, and a newline
2754    character, to FP.  Leading whitespace is removed.  If there are
2755    macros, special token padding is not performed.  */
2756 void
2757 cpp_output_line (cpp_reader *pfile, FILE *fp)
2758 {
2759   const cpp_token *token;
2760
2761   token = cpp_get_token (pfile);
2762   while (token->type != CPP_EOF)
2763     {
2764       cpp_output_token (token, fp);
2765       token = cpp_get_token (pfile);
2766       if (token->flags & PREV_WHITE)
2767         putc (' ', fp);
2768     }
2769
2770   putc ('\n', fp);
2771 }
2772
2773 /* Return a string representation of all the remaining tokens on the
2774    current line.  The result is allocated using xmalloc and must be
2775    freed by the caller.  */
2776 unsigned char *
2777 cpp_output_line_to_string (cpp_reader *pfile, const unsigned char *dir_name)
2778 {
2779   const cpp_token *token;
2780   unsigned int out = dir_name ? ustrlen (dir_name) : 0;
2781   unsigned int alloced = 120 + out;
2782   unsigned char *result = (unsigned char *) xmalloc (alloced);
2783
2784   /* If DIR_NAME is empty, there are no initial contents.  */
2785   if (dir_name)
2786     {
2787       sprintf ((char *) result, "#%s ", dir_name);
2788       out += 2;
2789     }
2790
2791   token = cpp_get_token (pfile);
2792   while (token->type != CPP_EOF)
2793     {
2794       unsigned char *last;
2795       /* Include room for a possible space and the terminating nul.  */
2796       unsigned int len = cpp_token_len (token) + 2;
2797
2798       if (out + len > alloced)
2799         {
2800           alloced *= 2;
2801           if (out + len > alloced)
2802             alloced = out + len;
2803           result = (unsigned char *) xrealloc (result, alloced);
2804         }
2805
2806       last = cpp_spell_token (pfile, token, &result[out], 0);
2807       out = last - result;
2808
2809       token = cpp_get_token (pfile);
2810       if (token->flags & PREV_WHITE)
2811         result[out++] = ' ';
2812     }
2813
2814   result[out] = '\0';
2815   return result;
2816 }
2817
/* Memory buffers.  Changing these three constants can have a dramatic
   effect on performance.  The values here are reasonable defaults,
   but might be tuned.  If you adjust them, be sure to test across a
   range of uses of cpplib, including heavy nested function-like macro
   expansion.  Also check the change in peak memory usage (NJAMD is a
   good tool for this).

   MIN_BUFF_SIZE is the smallest buffer new_buff will create.
   BUFF_SIZE_UPPER_BOUND (MIN_SIZE) is the largest free buffer that is
   still considered a reasonable match for a request of MIN_SIZE bytes.
   EXTENDED_BUFF_SIZE (BUFF, MIN_EXTRA) is the size requested when
   extending BUFF: MIN_EXTRA plus twice the room left in BUFF.  Every
   macro argument is parenthesized so that arbitrary expressions may
   be passed safely.  */
#define MIN_BUFF_SIZE 8000
#define BUFF_SIZE_UPPER_BOUND(MIN_SIZE) (MIN_BUFF_SIZE + (MIN_SIZE) * 3 / 2)
#define EXTENDED_BUFF_SIZE(BUFF, MIN_EXTRA) \
        ((MIN_EXTRA) + ((BUFF)->limit - (BUFF)->cur) * 2)

#if MIN_BUFF_SIZE > BUFF_SIZE_UPPER_BOUND (0)
  #error BUFF_SIZE_UPPER_BOUND must be at least as large as MIN_BUFF_SIZE!
#endif
2832
2833 /* Create a new allocation buffer.  Place the control block at the end
2834    of the buffer, so that buffer overflows will cause immediate chaos.  */
2835 static _cpp_buff *
2836 new_buff (size_t len)
2837 {
2838   _cpp_buff *result;
2839   unsigned char *base;
2840
2841   if (len < MIN_BUFF_SIZE)
2842     len = MIN_BUFF_SIZE;
2843   len = CPP_ALIGN (len);
2844
2845   base = XNEWVEC (unsigned char, len + sizeof (_cpp_buff));
2846   result = (_cpp_buff *) (base + len);
2847   result->base = base;
2848   result->cur = base;
2849   result->limit = base + len;
2850   result->next = NULL;
2851   return result;
2852 }
2853
2854 /* Place a chain of unwanted allocation buffers on the free list.  */
2855 void
2856 _cpp_release_buff (cpp_reader *pfile, _cpp_buff *buff)
2857 {
2858   _cpp_buff *end = buff;
2859
2860   while (end->next)
2861     end = end->next;
2862   end->next = pfile->free_buffs;
2863   pfile->free_buffs = buff;
2864 }
2865
2866 /* Return a free buffer of size at least MIN_SIZE.  */
2867 _cpp_buff *
2868 _cpp_get_buff (cpp_reader *pfile, size_t min_size)
2869 {
2870   _cpp_buff *result, **p;
2871
2872   for (p = &pfile->free_buffs;; p = &(*p)->next)
2873     {
2874       size_t size;
2875
2876       if (*p == NULL)
2877         return new_buff (min_size);
2878       result = *p;
2879       size = result->limit - result->base;
2880       /* Return a buffer that's big enough, but don't waste one that's
2881          way too big.  */
2882       if (size >= min_size && size <= BUFF_SIZE_UPPER_BOUND (min_size))
2883         break;
2884     }
2885
2886   *p = result->next;
2887   result->next = NULL;
2888   result->cur = result->base;
2889   return result;
2890 }
2891
2892 /* Creates a new buffer with enough space to hold the uncommitted
2893    remaining bytes of BUFF, and at least MIN_EXTRA more bytes.  Copies
2894    the excess bytes to the new buffer.  Chains the new buffer after
2895    BUFF, and returns the new buffer.  */
2896 _cpp_buff *
2897 _cpp_append_extend_buff (cpp_reader *pfile, _cpp_buff *buff, size_t min_extra)
2898 {
2899   size_t size = EXTENDED_BUFF_SIZE (buff, min_extra);
2900   _cpp_buff *new_buff = _cpp_get_buff (pfile, size);
2901
2902   buff->next = new_buff;
2903   memcpy (new_buff->base, buff->cur, BUFF_ROOM (buff));
2904   return new_buff;
2905 }
2906
2907 /* Creates a new buffer with enough space to hold the uncommitted
2908    remaining bytes of the buffer pointed to by BUFF, and at least
2909    MIN_EXTRA more bytes.  Copies the excess bytes to the new buffer.
2910    Chains the new buffer before the buffer pointed to by BUFF, and
2911    updates the pointer to point to the new buffer.  */
2912 void
2913 _cpp_extend_buff (cpp_reader *pfile, _cpp_buff **pbuff, size_t min_extra)
2914 {
2915   _cpp_buff *new_buff, *old_buff = *pbuff;
2916   size_t size = EXTENDED_BUFF_SIZE (old_buff, min_extra);
2917
2918   new_buff = _cpp_get_buff (pfile, size);
2919   memcpy (new_buff->base, old_buff->cur, BUFF_ROOM (old_buff));
2920   new_buff->next = old_buff;
2921   *pbuff = new_buff;
2922 }
2923
2924 /* Free a chain of buffers starting at BUFF.  */
2925 void
2926 _cpp_free_buff (_cpp_buff *buff)
2927 {
2928   _cpp_buff *next;
2929
2930   for (; buff; buff = next)
2931     {
2932       next = buff->next;
2933       free (buff->base);
2934     }
2935 }
2936
2937 /* Allocate permanent, unaligned storage of length LEN.  */
2938 unsigned char *
2939 _cpp_unaligned_alloc (cpp_reader *pfile, size_t len)
2940 {
2941   _cpp_buff *buff = pfile->u_buff;
2942   unsigned char *result = buff->cur;
2943
2944   if (len > (size_t) (buff->limit - result))
2945     {
2946       buff = _cpp_get_buff (pfile, len);
2947       buff->next = pfile->u_buff;
2948       pfile->u_buff = buff;
2949       result = buff->cur;
2950     }
2951
2952   buff->cur = result + len;
2953   return result;
2954 }
2955
2956 /* Allocate permanent, unaligned storage of length LEN from a_buff.
2957    That buffer is used for growing allocations when saving macro
2958    replacement lists in a #define, and when parsing an answer to an
2959    assertion in #assert, #unassert or #if (and therefore possibly
2960    whilst expanding macros).  It therefore must not be used by any
2961    code that they might call: specifically the lexer and the guts of
2962    the macro expander.
2963
2964    All existing other uses clearly fit this restriction: storing
2965    registered pragmas during initialization.  */
2966 unsigned char *
2967 _cpp_aligned_alloc (cpp_reader *pfile, size_t len)
2968 {
2969   _cpp_buff *buff = pfile->a_buff;
2970   unsigned char *result = buff->cur;
2971
2972   if (len > (size_t) (buff->limit - result))
2973     {
2974       buff = _cpp_get_buff (pfile, len);
2975       buff->next = pfile->a_buff;
2976       pfile->a_buff = buff;
2977       result = buff->cur;
2978     }
2979
2980   buff->cur = result + len;
2981   return result;
2982 }
2983
2984 /* Say which field of TOK is in use.  */
2985
2986 enum cpp_token_fld_kind
2987 cpp_token_val_index (cpp_token *tok)
2988 {
2989   switch (TOKEN_SPELL (tok))
2990     {
2991     case SPELL_IDENT:
2992       return CPP_TOKEN_FLD_NODE;
2993     case SPELL_LITERAL:
2994       return CPP_TOKEN_FLD_STR;
2995     case SPELL_OPERATOR:
2996       if (tok->type == CPP_PASTE)
2997         return CPP_TOKEN_FLD_TOKEN_NO;
2998       else
2999         return CPP_TOKEN_FLD_NONE;
3000     case SPELL_NONE:
3001       if (tok->type == CPP_MACRO_ARG)
3002         return CPP_TOKEN_FLD_ARG_NO;
3003       else if (tok->type == CPP_PADDING)
3004         return CPP_TOKEN_FLD_SOURCE;
3005       else if (tok->type == CPP_PRAGMA)
3006         return CPP_TOKEN_FLD_PRAGMA;
3007       /* else fall through */
3008     default:
3009       return CPP_TOKEN_FLD_NONE;
3010     }
3011 }
3012
3013 /* All tokens lexed in R after calling this function will be forced to have
3014    their source_location the same as the location referenced by P, until
3015    cpp_stop_forcing_token_locations is called for R.  */
3016
3017 void
3018 cpp_force_token_locations (cpp_reader *r, source_location *p)
3019 {
3020   r->forced_token_location_p = p;
3021 }
3022
3023 /* Go back to assigning locations naturally for lexed tokens.  */
3024
3025 void
3026 cpp_stop_forcing_token_locations (cpp_reader *r)
3027 {
3028   r->forced_token_location_p = NULL;
3029 }