Mark ChangeLog
[official-gcc.git] / libcpp / lex.c
blob6d69b591fec8aba1853be1d644ed5821da12184a
1 /* CPP Library - lexical analysis.
2 Copyright (C) 2000-2014 Free Software Foundation, Inc.
3 Contributed by Per Bothner, 1994-95.
4 Based on CCCP program by Paul Rubin, June 1986
5 Adapted to ANSI C, Richard Stallman, Jan 1987
6 Broken out to separate file, Zack Weinberg, Mar 2000
8 This program is free software; you can redistribute it and/or modify it
9 under the terms of the GNU General Public License as published by the
10 Free Software Foundation; either version 3, or (at your option) any
11 later version.
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
18 You should have received a copy of the GNU General Public License
19 along with this program; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "cpplib.h"
25 #include "internal.h"
/* Spelling category stored for every token type in the
   token_spellings table.  */
enum spell_type
{
  SPELL_OPERATOR = 0,
  SPELL_IDENT,
  SPELL_LITERAL,
  SPELL_NONE
};
35 struct token_spelling
37 enum spell_type category;
38 const unsigned char *name;
41 static const unsigned char *const digraph_spellings[] =
42 { UC"%:", UC"%:%:", UC"<:", UC":>", UC"<%", UC"%>" };
44 #define OP(e, s) { SPELL_OPERATOR, UC s },
45 #define TK(e, s) { SPELL_ ## s, UC #e },
46 static const struct token_spelling token_spellings[N_TTYPES] = { TTYPE_TABLE };
47 #undef OP
48 #undef TK
50 #define TOKEN_SPELL(token) (token_spellings[(token)->type].category)
51 #define TOKEN_NAME(token) (token_spellings[(token)->type].name)
53 static void add_line_note (cpp_buffer *, const uchar *, unsigned int);
54 static int skip_line_comment (cpp_reader *);
55 static void skip_whitespace (cpp_reader *, cppchar_t);
56 static void lex_string (cpp_reader *, cpp_token *, const uchar *);
57 static void save_comment (cpp_reader *, cpp_token *, const uchar *, cppchar_t);
58 static void store_comment (cpp_reader *, cpp_token *);
59 static void create_literal (cpp_reader *, cpp_token *, const uchar *,
60 unsigned int, enum cpp_ttype);
61 static bool warn_in_comment (cpp_reader *, _cpp_line_note *);
62 static int name_p (cpp_reader *, const cpp_string *);
63 static tokenrun *next_tokenrun (tokenrun *);
65 static _cpp_buff *new_buff (size_t);
68 /* Utility routine:
70 Compares, the token TOKEN to the NUL-terminated string STRING.
71 TOKEN must be a CPP_NAME. Returns 1 for equal, 0 for unequal. */
72 int
73 cpp_ideq (const cpp_token *token, const char *string)
75 if (token->type != CPP_NAME)
76 return 0;
78 return !ustrcmp (NODE_NAME (token->val.node.node), (const uchar *) string);
81 /* Record a note TYPE at byte POS into the current cleaned logical
82 line. */
83 static void
84 add_line_note (cpp_buffer *buffer, const uchar *pos, unsigned int type)
86 if (buffer->notes_used == buffer->notes_cap)
88 buffer->notes_cap = buffer->notes_cap * 2 + 200;
89 buffer->notes = XRESIZEVEC (_cpp_line_note, buffer->notes,
90 buffer->notes_cap);
93 buffer->notes[buffer->notes_used].pos = pos;
94 buffer->notes[buffer->notes_used].type = type;
95 buffer->notes_used++;
99 /* Fast path to find line special characters using optimized character
100 scanning algorithms. Anything complicated falls back to the slow
101 path below. Since this loop is very hot it's worth doing these kinds
102 of optimizations.
104 One of the paths through the ifdefs should provide
106 const uchar *search_line_fast (const uchar *s, const uchar *end);
108 Between S and END, search for \n, \r, \\, ?. Return a pointer to
109 the found character.
111 Note that the last character of the buffer is *always* a newline,
112 as forced by _cpp_convert_input. This fact can be used to avoid
113 explicitly looking for the end of the buffer. */
115 /* Configure gives us an ifdef test. */
116 #ifndef WORDS_BIGENDIAN
117 #define WORDS_BIGENDIAN 0
118 #endif
/* word_type should be the widest integer that still fits in a single
   register.  <stdint.h> offers no such type.  unsigned long is right
   on most hosts but not under the LLP64 model chosen by MS; when
   building with GCC the "real" word size can be requested directly.  */
#if defined (__GNUC__)
typedef unsigned int word_type __attribute__((__mode__(__word__)));
#else
typedef unsigned long word_type;
#endif

/* The word-at-a-time code only handles 4- and 8-byte words.  Any other
   size gives this array a negative bound and so fails to compile.  */
typedef char check_word_type_size
  [(sizeof (word_type) == 8 || sizeof (word_type) == 4) ? 1 : -1];
135 /* Return X with the first N bytes forced to values that won't match one
136 of the interesting characters. Note that NUL is not interesting. */
138 static inline word_type
139 acc_char_mask_misalign (word_type val, unsigned int n)
141 word_type mask = -1;
142 if (WORDS_BIGENDIAN)
143 mask >>= n * 8;
144 else
145 mask <<= n * 8;
146 return val & mask;
149 /* Return X replicated to all byte positions within WORD_TYPE. */
151 static inline word_type
152 acc_char_replicate (uchar x)
154 word_type ret;
156 ret = (x << 24) | (x << 16) | (x << 8) | x;
157 if (sizeof(word_type) == 8)
158 ret = (ret << 16 << 16) | ret;
159 return ret;
162 /* Return non-zero if some byte of VAL is (probably) C. */
164 static inline word_type
165 acc_char_cmp (word_type val, word_type c)
167 #if defined(__GNUC__) && defined(__alpha__)
168 /* We can get exact results using a compare-bytes instruction.
169 Get (val == c) via (0 >= (val ^ c)). */
170 return __builtin_alpha_cmpbge (0, val ^ c);
171 #else
172 word_type magic = 0x7efefefeU;
173 if (sizeof(word_type) == 8)
174 magic = (magic << 16 << 16) | 0xfefefefeU;
175 magic |= 1;
177 val ^= c;
178 return ((val + magic) ^ ~val) & ~magic;
179 #endif
182 /* Given the result of acc_char_cmp is non-zero, return the index of
183 the found character. If this was a false positive, return -1. */
185 static inline int
186 acc_char_index (word_type cmp ATTRIBUTE_UNUSED,
187 word_type val ATTRIBUTE_UNUSED)
189 #if defined(__GNUC__) && defined(__alpha__) && !WORDS_BIGENDIAN
190 /* The cmpbge instruction sets *bits* of the result corresponding to
191 matches in the bytes with no false positives. */
192 return __builtin_ctzl (cmp);
193 #else
194 unsigned int i;
196 /* ??? It would be nice to force unrolling here,
197 and have all of these constants folded. */
198 for (i = 0; i < sizeof(word_type); ++i)
200 uchar c;
201 if (WORDS_BIGENDIAN)
202 c = (val >> (sizeof(word_type) - i - 1) * 8) & 0xff;
203 else
204 c = (val >> i * 8) & 0xff;
206 if (c == '\n' || c == '\r' || c == '\\' || c == '?')
207 return i;
210 return -1;
211 #endif
214 /* A version of the fast scanner using bit fiddling techniques.
216 For 32-bit words, one would normally perform 16 comparisons and
217 16 branches. With this algorithm one performs 24 arithmetic
218 operations and one branch. Whether this is faster with a 32-bit
219 word size is going to be somewhat system dependent.
221 For 64-bit words, we eliminate twice the number of comparisons
222 and branches without increasing the number of arithmetic operations.
223 It's almost certainly going to be a win with 64-bit word size. */
225 static const uchar * search_line_acc_char (const uchar *, const uchar *)
226 ATTRIBUTE_UNUSED;
228 static const uchar *
229 search_line_acc_char (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
231 const word_type repl_nl = acc_char_replicate ('\n');
232 const word_type repl_cr = acc_char_replicate ('\r');
233 const word_type repl_bs = acc_char_replicate ('\\');
234 const word_type repl_qm = acc_char_replicate ('?');
236 unsigned int misalign;
237 const word_type *p;
238 word_type val, t;
240 /* Align the buffer. Mask out any bytes from before the beginning. */
241 p = (word_type *)((uintptr_t)s & -sizeof(word_type));
242 val = *p;
243 misalign = (uintptr_t)s & (sizeof(word_type) - 1);
244 if (misalign)
245 val = acc_char_mask_misalign (val, misalign);
247 /* Main loop. */
248 while (1)
250 t = acc_char_cmp (val, repl_nl);
251 t |= acc_char_cmp (val, repl_cr);
252 t |= acc_char_cmp (val, repl_bs);
253 t |= acc_char_cmp (val, repl_qm);
255 if (__builtin_expect (t != 0, 0))
257 int i = acc_char_index (t, val);
258 if (i >= 0)
259 return (const uchar *)p + i;
262 val = *++p;
266 /* Disable on Solaris 2/x86 until the following problems can be properly
267 autoconfed:
269 The Solaris 9 assembler cannot assemble SSE4.2 insns.
270 Before Solaris 9 Update 6, SSE insns cannot be executed.
271 The Solaris 10+ assembler tags objects with the instruction set
272 extensions used, so SSE4.2 executables cannot run on machines that
273 don't support that extension. */
275 #if (GCC_VERSION >= 4005) && (defined(__i386__) || defined(__x86_64__)) && !(defined(__sun__) && defined(__svr4__))
/* Replicated character data to be shared between implementations.
   Recall that outside of a context with vector support we can't
   define compatible vector types, therefore these are all defined
   in terms of raw characters.  Row 0 is \n, row 1 is \r, row 2 is \\
   and row 3 is ?.  */
static const char repl_chars[4][16] __attribute__((aligned(16))) = {
  { '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
    '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n' },
  { '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
    '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r' },
  { '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
    '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\' },
  { '?', '?', '?', '?', '?', '?', '?', '?',
    '?', '?', '?', '?', '?', '?', '?', '?' },
};
292 /* A version of the fast scanner using MMX vectorized byte compare insns.
294 This uses the PMOVMSKB instruction which was introduced with "MMX2",
295 which was packaged into SSE1; it is also present in the AMD MMX
296 extension. Mark the function as using "sse" so that we emit a real
297 "emms" instruction, rather than the 3dNOW "femms" instruction. */
299 static const uchar *
300 #ifndef __SSE__
301 __attribute__((__target__("sse")))
302 #endif
303 search_line_mmx (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
305 typedef char v8qi __attribute__ ((__vector_size__ (8)));
306 typedef int __m64 __attribute__ ((__vector_size__ (8), __may_alias__));
308 const v8qi repl_nl = *(const v8qi *)repl_chars[0];
309 const v8qi repl_cr = *(const v8qi *)repl_chars[1];
310 const v8qi repl_bs = *(const v8qi *)repl_chars[2];
311 const v8qi repl_qm = *(const v8qi *)repl_chars[3];
313 unsigned int misalign, found, mask;
314 const v8qi *p;
315 v8qi data, t, c;
317 /* Align the source pointer. While MMX doesn't generate unaligned data
318 faults, this allows us to safely scan to the end of the buffer without
319 reading beyond the end of the last page. */
320 misalign = (uintptr_t)s & 7;
321 p = (const v8qi *)((uintptr_t)s & -8);
322 data = *p;
324 /* Create a mask for the bytes that are valid within the first
325 16-byte block. The Idea here is that the AND with the mask
326 within the loop is "free", since we need some AND or TEST
327 insn in order to set the flags for the branch anyway. */
328 mask = -1u << misalign;
330 /* Main loop processing 8 bytes at a time. */
331 goto start;
334 data = *++p;
335 mask = -1;
337 start:
338 t = __builtin_ia32_pcmpeqb(data, repl_nl);
339 c = __builtin_ia32_pcmpeqb(data, repl_cr);
340 t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
341 c = __builtin_ia32_pcmpeqb(data, repl_bs);
342 t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
343 c = __builtin_ia32_pcmpeqb(data, repl_qm);
344 t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
345 found = __builtin_ia32_pmovmskb (t);
346 found &= mask;
348 while (!found);
350 __builtin_ia32_emms ();
352 /* FOUND contains 1 in bits for which we matched a relevant
353 character. Conversion to the byte index is trivial. */
354 found = __builtin_ctz(found);
355 return (const uchar *)p + found;
358 /* A version of the fast scanner using SSE2 vectorized byte compare insns. */
360 static const uchar *
361 #ifndef __SSE2__
362 __attribute__((__target__("sse2")))
363 #endif
364 search_line_sse2 (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
366 typedef char v16qi __attribute__ ((__vector_size__ (16)));
368 const v16qi repl_nl = *(const v16qi *)repl_chars[0];
369 const v16qi repl_cr = *(const v16qi *)repl_chars[1];
370 const v16qi repl_bs = *(const v16qi *)repl_chars[2];
371 const v16qi repl_qm = *(const v16qi *)repl_chars[3];
373 unsigned int misalign, found, mask;
374 const v16qi *p;
375 v16qi data, t;
377 /* Align the source pointer. */
378 misalign = (uintptr_t)s & 15;
379 p = (const v16qi *)((uintptr_t)s & -16);
380 data = *p;
382 /* Create a mask for the bytes that are valid within the first
383 16-byte block. The Idea here is that the AND with the mask
384 within the loop is "free", since we need some AND or TEST
385 insn in order to set the flags for the branch anyway. */
386 mask = -1u << misalign;
388 /* Main loop processing 16 bytes at a time. */
389 goto start;
392 data = *++p;
393 mask = -1;
395 start:
396 t = __builtin_ia32_pcmpeqb128(data, repl_nl);
397 t |= __builtin_ia32_pcmpeqb128(data, repl_cr);
398 t |= __builtin_ia32_pcmpeqb128(data, repl_bs);
399 t |= __builtin_ia32_pcmpeqb128(data, repl_qm);
400 found = __builtin_ia32_pmovmskb128 (t);
401 found &= mask;
403 while (!found);
405 /* FOUND contains 1 in bits for which we matched a relevant
406 character. Conversion to the byte index is trivial. */
407 found = __builtin_ctz(found);
408 return (const uchar *)p + found;
#ifdef HAVE_SSE4
/* A version of the fast scanner using SSE 4.2 vectorized string insns.
   Returns a pointer to the first \n, \r, \\ or ? at or after S.  END
   is only used to decide whether an unaligned 16-byte read at S is
   safe.  */

static const uchar *
#ifndef __SSE4_2__
__attribute__((__target__("sse4.2")))
#endif
search_line_sse42 (const uchar *s, const uchar *end)
{
  typedef char v16qi __attribute__ ((__vector_size__ (16)));
  static const v16qi search = { '\n', '\r', '?', '\\' };

  uintptr_t si = (uintptr_t)s;
  uintptr_t index;

  /* Check for unaligned input.  */
  if (si & 15)
    {
      v16qi sv;

      if (__builtin_expect (end - s < 16, 0)
          && __builtin_expect ((si & 0xfff) > 0xff0, 0))
        {
          /* There are less than 16 bytes left in the buffer, and less
             than 16 bytes left on the page.  Reading 16 bytes at this
             point might generate a spurious page fault.  Defer to the
             SSE2 implementation, which already handles alignment.  */
          return search_line_sse2 (s, end);
        }

      /* ??? The builtin doesn't understand that the PCMPESTRI read from
         memory need not be aligned.  */
      sv = __builtin_ia32_loaddqu ((const char *) s);
      index = __builtin_ia32_pcmpestri128 (search, 4, sv, 16, 0);

      if (__builtin_expect (index < 16, 0))
        goto found;

      /* Advance the pointer to an aligned address.  We will re-scan a
         few bytes, but we no longer need care for reading past the
         end of a page, since we're guaranteed a match.  */
      s = (const uchar *)((si + 16) & -16);
    }

  /* Main loop, processing 16 bytes at a time.  By doing the whole loop
     in inline assembly, we can make proper use of the flags set.  */
  __asm (      "sub $16, %1\n"
        " .balign 16\n"
        "0: add $16, %1\n"
        " %vpcmpestri $0, (%1), %2\n"
        " jnc 0b"
        : "=&c"(index), "+r"(s)
        : "x"(search), "a"(4), "d"(16));

 found:
  return s + index;
}

#else
/* Work around out-dated assemblers without sse4 support.  */
#define search_line_sse42 search_line_sse2
#endif
474 /* Check the CPU capabilities. */
476 #include "../gcc/config/i386/cpuid.h"
478 typedef const uchar * (*search_line_fast_type) (const uchar *, const uchar *);
479 static search_line_fast_type search_line_fast;
481 #define HAVE_init_vectorized_lexer 1
482 static inline void
483 init_vectorized_lexer (void)
485 unsigned dummy, ecx = 0, edx = 0;
486 search_line_fast_type impl = search_line_acc_char;
487 int minimum = 0;
489 #if defined(__SSE4_2__)
490 minimum = 3;
491 #elif defined(__SSE2__)
492 minimum = 2;
493 #elif defined(__SSE__)
494 minimum = 1;
495 #endif
497 if (minimum == 3)
498 impl = search_line_sse42;
499 else if (__get_cpuid (1, &dummy, &dummy, &ecx, &edx) || minimum == 2)
501 if (minimum == 3 || (ecx & bit_SSE4_2))
502 impl = search_line_sse42;
503 else if (minimum == 2 || (edx & bit_SSE2))
504 impl = search_line_sse2;
505 else if (minimum == 1 || (edx & bit_SSE))
506 impl = search_line_mmx;
508 else if (__get_cpuid (0x80000001, &dummy, &dummy, &dummy, &edx))
510 if (minimum == 1
511 || (edx & (bit_MMXEXT | bit_CMOV)) == (bit_MMXEXT | bit_CMOV))
512 impl = search_line_mmx;
515 search_line_fast = impl;
518 #elif (GCC_VERSION >= 4005) && defined(__ALTIVEC__)
520 /* A vection of the fast scanner using AltiVec vectorized byte compares. */
521 /* ??? Unfortunately, attribute(target("altivec")) is not yet supported,
522 so we can't compile this function without -maltivec on the command line
523 (or implied by some other switch). */
525 static const uchar *
526 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
528 typedef __attribute__((altivec(vector))) unsigned char vc;
530 const vc repl_nl = {
531 '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
532 '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
534 const vc repl_cr = {
535 '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
536 '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
538 const vc repl_bs = {
539 '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
540 '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
542 const vc repl_qm = {
543 '?', '?', '?', '?', '?', '?', '?', '?',
544 '?', '?', '?', '?', '?', '?', '?', '?',
546 const vc ones = {
547 -1, -1, -1, -1, -1, -1, -1, -1,
548 -1, -1, -1, -1, -1, -1, -1, -1,
550 const vc zero = { 0 };
552 vc data, mask, t;
554 /* Altivec loads automatically mask addresses with -16. This lets us
555 issue the first load as early as possible. */
556 data = __builtin_vec_ld(0, (const vc *)s);
558 /* Discard bytes before the beginning of the buffer. Do this by
559 beginning with all ones and shifting in zeros according to the
560 mis-alignment. The LVSR instruction pulls the exact shift we
561 want from the address. */
562 #ifdef __BIG_ENDIAN__
563 mask = __builtin_vec_lvsr(0, s);
564 mask = __builtin_vec_perm(zero, ones, mask);
565 #else
566 mask = __builtin_vec_lvsl(0, s);
567 mask = __builtin_vec_perm(ones, zero, mask);
568 #endif
569 data &= mask;
571 /* While altivec loads mask addresses, we still need to align S so
572 that the offset we compute at the end is correct. */
573 s = (const uchar *)((uintptr_t)s & -16);
575 /* Main loop processing 16 bytes at a time. */
576 goto start;
579 vc m_nl, m_cr, m_bs, m_qm;
581 s += 16;
582 data = __builtin_vec_ld(0, (const vc *)s);
584 start:
585 m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
586 m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
587 m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
588 m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
589 t = (m_nl | m_cr) | (m_bs | m_qm);
591 /* T now contains 0xff in bytes for which we matched one of the relevant
592 characters. We want to exit the loop if any byte in T is non-zero.
593 Below is the expansion of vec_any_ne(t, zero). */
595 while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));
598 #define N (sizeof(vc) / sizeof(long))
600 union {
601 vc v;
602 /* Statically assert that N is 2 or 4. */
603 unsigned long l[(N == 2 || N == 4) ? N : -1];
604 } u;
605 unsigned long l, i = 0;
607 u.v = t;
609 /* Find the first word of T that is non-zero. */
610 switch (N)
612 case 4:
613 l = u.l[i++];
614 if (l != 0)
615 break;
616 s += sizeof(unsigned long);
617 l = u.l[i++];
618 if (l != 0)
619 break;
620 s += sizeof(unsigned long);
621 case 2:
622 l = u.l[i++];
623 if (l != 0)
624 break;
625 s += sizeof(unsigned long);
626 l = u.l[i];
629 /* L now contains 0xff in bytes for which we matched one of the
630 relevant characters. We can find the byte index by finding
631 its bit index and dividing by 8. */
632 #ifdef __BIG_ENDIAN__
633 l = __builtin_clzl(l) >> 3;
634 #else
635 l = __builtin_ctzl(l) >> 3;
636 #endif
637 return s + l;
639 #undef N
643 #elif defined (__ARM_NEON__)
644 #include "arm_neon.h"
646 static const uchar *
647 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
649 const uint8x16_t repl_nl = vdupq_n_u8 ('\n');
650 const uint8x16_t repl_cr = vdupq_n_u8 ('\r');
651 const uint8x16_t repl_bs = vdupq_n_u8 ('\\');
652 const uint8x16_t repl_qm = vdupq_n_u8 ('?');
653 const uint8x16_t xmask = (uint8x16_t) vdupq_n_u64 (0x8040201008040201ULL);
655 unsigned int misalign, found, mask;
656 const uint8_t *p;
657 uint8x16_t data;
659 /* Align the source pointer. */
660 misalign = (uintptr_t)s & 15;
661 p = (const uint8_t *)((uintptr_t)s & -16);
662 data = vld1q_u8 (p);
664 /* Create a mask for the bytes that are valid within the first
665 16-byte block. The Idea here is that the AND with the mask
666 within the loop is "free", since we need some AND or TEST
667 insn in order to set the flags for the branch anyway. */
668 mask = (-1u << misalign) & 0xffff;
670 /* Main loop, processing 16 bytes at a time. */
671 goto start;
675 uint8x8_t l;
676 uint16x4_t m;
677 uint32x2_t n;
678 uint8x16_t t, u, v, w;
680 p += 16;
681 data = vld1q_u8 (p);
682 mask = 0xffff;
684 start:
685 t = vceqq_u8 (data, repl_nl);
686 u = vceqq_u8 (data, repl_cr);
687 v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
688 w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
689 t = vandq_u8 (vorrq_u8 (v, w), xmask);
690 l = vpadd_u8 (vget_low_u8 (t), vget_high_u8 (t));
691 m = vpaddl_u8 (l);
692 n = vpaddl_u16 (m);
694 found = vget_lane_u32 ((uint32x2_t) vorr_u64 ((uint64x1_t) n,
695 vshr_n_u64 ((uint64x1_t) n, 24)), 0);
696 found &= mask;
698 while (!found);
700 /* FOUND contains 1 in bits for which we matched a relevant
701 character. Conversion to the byte index is trivial. */
702 found = __builtin_ctz (found);
703 return (const uchar *)p + found;
706 #else
/* We only have one accelerated alternative.  Use a direct call so that
   we encourage inlining.  */
711 #define search_line_fast search_line_acc_char
713 #endif
/* Initialize the lexer if needed.  Only hosts that define
   HAVE_init_vectorized_lexer have anything to set up; elsewhere this
   is a no-op.  */

void
_cpp_init_lexer (void)
{
#ifdef HAVE_init_vectorized_lexer
  init_vectorized_lexer ();
#endif
}
725 /* Returns with a logical line that contains no escaped newlines or
726 trigraphs. This is a time-critical inner loop. */
727 void
728 _cpp_clean_line (cpp_reader *pfile)
730 cpp_buffer *buffer;
731 const uchar *s;
732 uchar c, *d, *p;
734 buffer = pfile->buffer;
735 buffer->cur_note = buffer->notes_used = 0;
736 buffer->cur = buffer->line_base = buffer->next_line;
737 buffer->need_line = false;
738 s = buffer->next_line;
740 if (!buffer->from_stage3)
742 const uchar *pbackslash = NULL;
744 /* Fast path. This is the common case of an un-escaped line with
745 no trigraphs. The primary win here is by not writing any
746 data back to memory until we have to. */
747 while (1)
749 /* Perform an optimized search for \n, \r, \\, ?. */
750 s = search_line_fast (s, buffer->rlimit);
752 c = *s;
753 if (c == '\\')
755 /* Record the location of the backslash and continue. */
756 pbackslash = s++;
758 else if (__builtin_expect (c == '?', 0))
760 if (__builtin_expect (s[1] == '?', false)
761 && _cpp_trigraph_map[s[2]])
763 /* Have a trigraph. We may or may not have to convert
764 it. Add a line note regardless, for -Wtrigraphs. */
765 add_line_note (buffer, s, s[2]);
766 if (CPP_OPTION (pfile, trigraphs))
768 /* We do, and that means we have to switch to the
769 slow path. */
770 d = (uchar *) s;
771 *d = _cpp_trigraph_map[s[2]];
772 s += 2;
773 goto slow_path;
776 /* Not a trigraph. Continue on fast-path. */
777 s++;
779 else
780 break;
783 /* This must be \r or \n. We're either done, or we'll be forced
784 to write back to the buffer and continue on the slow path. */
785 d = (uchar *) s;
787 if (__builtin_expect (s == buffer->rlimit, false))
788 goto done;
790 /* DOS line ending? */
791 if (__builtin_expect (c == '\r', false) && s[1] == '\n')
793 s++;
794 if (s == buffer->rlimit)
795 goto done;
798 if (__builtin_expect (pbackslash == NULL, true))
799 goto done;
801 /* Check for escaped newline. */
802 p = d;
803 while (is_nvspace (p[-1]))
804 p--;
805 if (p - 1 != pbackslash)
806 goto done;
808 /* Have an escaped newline; process it and proceed to
809 the slow path. */
810 add_line_note (buffer, p - 1, p != d ? ' ' : '\\');
811 d = p - 2;
812 buffer->next_line = p - 1;
814 slow_path:
815 while (1)
817 c = *++s;
818 *++d = c;
820 if (c == '\n' || c == '\r')
822 /* Handle DOS line endings. */
823 if (c == '\r' && s != buffer->rlimit && s[1] == '\n')
824 s++;
825 if (s == buffer->rlimit)
826 break;
828 /* Escaped? */
829 p = d;
830 while (p != buffer->next_line && is_nvspace (p[-1]))
831 p--;
832 if (p == buffer->next_line || p[-1] != '\\')
833 break;
835 add_line_note (buffer, p - 1, p != d ? ' ': '\\');
836 d = p - 2;
837 buffer->next_line = p - 1;
839 else if (c == '?' && s[1] == '?' && _cpp_trigraph_map[s[2]])
841 /* Add a note regardless, for the benefit of -Wtrigraphs. */
842 add_line_note (buffer, d, s[2]);
843 if (CPP_OPTION (pfile, trigraphs))
845 *d = _cpp_trigraph_map[s[2]];
846 s += 2;
851 else
853 while (*s != '\n' && *s != '\r')
854 s++;
855 d = (uchar *) s;
857 /* Handle DOS line endings. */
858 if (*s == '\r' && s != buffer->rlimit && s[1] == '\n')
859 s++;
862 done:
863 *d = '\n';
864 /* A sentinel note that should never be processed. */
865 add_line_note (buffer, d + 1, '\n');
866 buffer->next_line = s + 1;
869 /* Return true if the trigraph indicated by NOTE should be warned
870 about in a comment. */
871 static bool
872 warn_in_comment (cpp_reader *pfile, _cpp_line_note *note)
874 const uchar *p;
876 /* Within comments we don't warn about trigraphs, unless the
877 trigraph forms an escaped newline, as that may change
878 behavior. */
879 if (note->type != '/')
880 return false;
882 /* If -trigraphs, then this was an escaped newline iff the next note
883 is coincident. */
884 if (CPP_OPTION (pfile, trigraphs))
885 return note[1].pos == note->pos;
887 /* Otherwise, see if this forms an escaped newline. */
888 p = note->pos + 3;
889 while (is_nvspace (*p))
890 p++;
892 /* There might have been escaped newlines between the trigraph and the
893 newline we found. Hence the position test. */
894 return (*p == '\n' && p < note[1].pos);
897 /* Process the notes created by add_line_note as far as the current
898 location. */
899 void
900 _cpp_process_line_notes (cpp_reader *pfile, int in_comment)
902 cpp_buffer *buffer = pfile->buffer;
904 for (;;)
906 _cpp_line_note *note = &buffer->notes[buffer->cur_note];
907 unsigned int col;
909 if (note->pos > buffer->cur)
910 break;
912 buffer->cur_note++;
913 col = CPP_BUF_COLUMN (buffer, note->pos + 1);
915 if (note->type == '\\' || note->type == ' ')
917 if (note->type == ' ' && !in_comment)
918 cpp_error_with_line (pfile, CPP_DL_WARNING, pfile->line_table->highest_line, col,
919 "backslash and newline separated by space");
921 if (buffer->next_line > buffer->rlimit)
923 cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line, col,
924 "backslash-newline at end of file");
925 /* Prevent "no newline at end of file" warning. */
926 buffer->next_line = buffer->rlimit;
929 buffer->line_base = note->pos;
930 CPP_INCREMENT_LINE (pfile, 0);
932 else if (_cpp_trigraph_map[note->type])
934 if (CPP_OPTION (pfile, warn_trigraphs)
935 && (!in_comment || warn_in_comment (pfile, note)))
937 if (CPP_OPTION (pfile, trigraphs))
938 cpp_warning_with_line (pfile, CPP_W_TRIGRAPHS,
939 pfile->line_table->highest_line, col,
940 "trigraph ??%c converted to %c",
941 note->type,
942 (int) _cpp_trigraph_map[note->type]);
943 else
945 cpp_warning_with_line
946 (pfile, CPP_W_TRIGRAPHS,
947 pfile->line_table->highest_line, col,
948 "trigraph ??%c ignored, use -trigraphs to enable",
949 note->type);
953 else if (note->type == 0)
954 /* Already processed in lex_raw_string. */;
955 else
956 abort ();
960 /* Skip a C-style block comment. We find the end of the comment by
961 seeing if an asterisk is before every '/' we encounter. Returns
962 nonzero if comment terminated by EOF, zero otherwise.
964 Buffer->cur points to the initial asterisk of the comment. */
965 bool
966 _cpp_skip_block_comment (cpp_reader *pfile)
968 cpp_buffer *buffer = pfile->buffer;
969 const uchar *cur = buffer->cur;
970 uchar c;
972 cur++;
973 if (*cur == '/')
974 cur++;
976 for (;;)
978 /* People like decorating comments with '*', so check for '/'
979 instead for efficiency. */
980 c = *cur++;
982 if (c == '/')
984 if (cur[-2] == '*')
985 break;
987 /* Warn about potential nested comments, but not if the '/'
988 comes immediately before the true comment delimiter.
989 Don't bother to get it right across escaped newlines. */
990 if (CPP_OPTION (pfile, warn_comments)
991 && cur[0] == '*' && cur[1] != '/')
993 buffer->cur = cur;
994 cpp_warning_with_line (pfile, CPP_W_COMMENTS,
995 pfile->line_table->highest_line,
996 CPP_BUF_COL (buffer),
997 "\"/*\" within comment");
1000 else if (c == '\n')
1002 unsigned int cols;
1003 buffer->cur = cur - 1;
1004 _cpp_process_line_notes (pfile, true);
1005 if (buffer->next_line >= buffer->rlimit)
1006 return true;
1007 _cpp_clean_line (pfile);
1009 cols = buffer->next_line - buffer->line_base;
1010 CPP_INCREMENT_LINE (pfile, cols);
1012 cur = buffer->cur;
1016 buffer->cur = cur;
1017 _cpp_process_line_notes (pfile, true);
1018 return false;
1021 /* Skip a C++ line comment, leaving buffer->cur pointing to the
1022 terminating newline. Handles escaped newlines. Returns nonzero
1023 if a multiline comment. */
1024 static int
1025 skip_line_comment (cpp_reader *pfile)
1027 cpp_buffer *buffer = pfile->buffer;
1028 source_location orig_line = pfile->line_table->highest_line;
1030 while (*buffer->cur != '\n')
1031 buffer->cur++;
1033 _cpp_process_line_notes (pfile, true);
1034 return orig_line != pfile->line_table->highest_line;
1037 /* Skips whitespace, saving the next non-whitespace character. */
1038 static void
1039 skip_whitespace (cpp_reader *pfile, cppchar_t c)
1041 cpp_buffer *buffer = pfile->buffer;
1042 bool saw_NUL = false;
1046 /* Horizontal space always OK. */
1047 if (c == ' ' || c == '\t')
1049 /* Just \f \v or \0 left. */
1050 else if (c == '\0')
1051 saw_NUL = true;
1052 else if (pfile->state.in_directive && CPP_PEDANTIC (pfile))
1053 cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line,
1054 CPP_BUF_COL (buffer),
1055 "%s in preprocessing directive",
1056 c == '\f' ? "form feed" : "vertical tab");
1058 c = *buffer->cur++;
1060 /* We only want non-vertical space, i.e. ' ' \t \f \v \0. */
1061 while (is_nvspace (c));
1063 if (saw_NUL)
1064 cpp_error (pfile, CPP_DL_WARNING, "null character(s) ignored");
1066 buffer->cur--;
1069 /* See if the characters of a number token are valid in a name (no
1070 '.', '+' or '-'). */
1071 static int
1072 name_p (cpp_reader *pfile, const cpp_string *string)
1074 unsigned int i;
1076 for (i = 0; i < string->len; i++)
1077 if (!is_idchar (string->text[i]))
1078 return 0;
1080 return 1;
1083 /* After parsing an identifier or other sequence, produce a warning about
1084 sequences not in NFC/NFKC. */
1085 static void
1086 warn_about_normalization (cpp_reader *pfile,
1087 const cpp_token *token,
1088 const struct normalize_state *s)
1090 if (CPP_OPTION (pfile, warn_normalize) < NORMALIZE_STATE_RESULT (s)
1091 && !pfile->state.skipping)
1093 /* Make sure that the token is printed using UCNs, even
1094 if we'd otherwise happily print UTF-8. */
1095 unsigned char *buf = XNEWVEC (unsigned char, cpp_token_len (token));
1096 size_t sz;
1098 sz = cpp_spell_token (pfile, token, buf, false) - buf;
1099 if (NORMALIZE_STATE_RESULT (s) == normalized_C)
1100 cpp_warning_with_line (pfile, CPP_W_NORMALIZE, token->src_loc, 0,
1101 "`%.*s' is not in NFKC", (int) sz, buf);
1102 else
1103 cpp_warning_with_line (pfile, CPP_W_NORMALIZE, token->src_loc, 0,
1104 "`%.*s' is not in NFC", (int) sz, buf);
1105 free (buf);
1109 /* Returns TRUE if the sequence starting at buffer->cur is invalid in
1110 an identifier. FIRST is TRUE if this starts an identifier. */
1111 static bool
1112 forms_identifier_p (cpp_reader *pfile, int first,
1113 struct normalize_state *state)
1115 cpp_buffer *buffer = pfile->buffer;
1117 if (*buffer->cur == '$')
1119 if (!CPP_OPTION (pfile, dollars_in_ident))
1120 return false;
1122 buffer->cur++;
1123 if (CPP_OPTION (pfile, warn_dollars) && !pfile->state.skipping)
1125 CPP_OPTION (pfile, warn_dollars) = 0;
1126 cpp_error (pfile, CPP_DL_PEDWARN, "'$' in identifier or number");
1129 return true;
1132 /* Is this a syntactically valid UCN? */
1133 if (CPP_OPTION (pfile, extended_identifiers)
1134 && *buffer->cur == '\\'
1135 && (buffer->cur[1] == 'u' || buffer->cur[1] == 'U'))
1137 buffer->cur += 2;
1138 if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
1139 state))
1140 return true;
1141 buffer->cur -= 2;
1144 return false;
1147 /* Helper function to get the cpp_hashnode of the identifier BASE. */
1148 static cpp_hashnode *
1149 lex_identifier_intern (cpp_reader *pfile, const uchar *base)
1151 cpp_hashnode *result;
1152 const uchar *cur;
1153 unsigned int len;
1154 unsigned int hash = HT_HASHSTEP (0, *base);
1156 cur = base + 1;
1157 while (ISIDNUM (*cur))
1159 hash = HT_HASHSTEP (hash, *cur);
1160 cur++;
1162 len = cur - base;
1163 hash = HT_HASHFINISH (hash, len);
1164 result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
1165 base, len, hash, HT_ALLOC));
1167 /* Rarely, identifiers require diagnostics when lexed. */
1168 if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
1169 && !pfile->state.skipping, 0))
1171 /* It is allowed to poison the same identifier twice. */
1172 if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
1173 cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
1174 NODE_NAME (result));
1176 /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
1177 replacement list of a variadic macro. */
1178 if (result == pfile->spec_nodes.n__VA_ARGS__
1179 && !pfile->state.va_args_ok)
1180 cpp_error (pfile, CPP_DL_PEDWARN,
1181 "__VA_ARGS__ can only appear in the expansion"
1182 " of a C99 variadic macro");
1184 /* For -Wc++-compat, warn about use of C++ named operators. */
1185 if (result->flags & NODE_WARN_OPERATOR)
1186 cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
1187 "identifier \"%s\" is a special operator name in C++",
1188 NODE_NAME (result));
1191 return result;
1194 /* Get the cpp_hashnode of an identifier specified by NAME in
1195 the current cpp_reader object. If none is found, NULL is returned. */
1196 cpp_hashnode *
1197 _cpp_lex_identifier (cpp_reader *pfile, const char *name)
1199 cpp_hashnode *result;
1200 result = lex_identifier_intern (pfile, (uchar *) name);
1201 return result;
1204 /* Lex an identifier starting at BUFFER->CUR - 1. */
1205 static cpp_hashnode *
1206 lex_identifier (cpp_reader *pfile, const uchar *base, bool starts_ucn,
1207 struct normalize_state *nst)
1209 cpp_hashnode *result;
1210 const uchar *cur;
1211 unsigned int len;
1212 unsigned int hash = HT_HASHSTEP (0, *base);
1214 cur = pfile->buffer->cur;
1215 if (! starts_ucn)
1217 while (ISIDNUM (*cur))
1219 hash = HT_HASHSTEP (hash, *cur);
1220 cur++;
1222 NORMALIZE_STATE_UPDATE_IDNUM (nst, *(cur - 1));
1224 pfile->buffer->cur = cur;
1225 if (starts_ucn || forms_identifier_p (pfile, false, nst))
1227 /* Slower version for identifiers containing UCNs (or $). */
1228 do {
1229 while (ISIDNUM (*pfile->buffer->cur))
1231 NORMALIZE_STATE_UPDATE_IDNUM (nst, *pfile->buffer->cur);
1232 pfile->buffer->cur++;
1234 } while (forms_identifier_p (pfile, false, nst));
1235 result = _cpp_interpret_identifier (pfile, base,
1236 pfile->buffer->cur - base);
1238 else
1240 len = cur - base;
1241 hash = HT_HASHFINISH (hash, len);
1243 result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
1244 base, len, hash, HT_ALLOC));
1247 /* Rarely, identifiers require diagnostics when lexed. */
1248 if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
1249 && !pfile->state.skipping, 0))
1251 /* It is allowed to poison the same identifier twice. */
1252 if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
1253 cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
1254 NODE_NAME (result));
1256 /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
1257 replacement list of a variadic macro. */
1258 if (result == pfile->spec_nodes.n__VA_ARGS__
1259 && !pfile->state.va_args_ok)
1260 cpp_error (pfile, CPP_DL_PEDWARN,
1261 "__VA_ARGS__ can only appear in the expansion"
1262 " of a C99 variadic macro");
1264 /* For -Wc++-compat, warn about use of C++ named operators. */
1265 if (result->flags & NODE_WARN_OPERATOR)
1266 cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
1267 "identifier \"%s\" is a special operator name in C++",
1268 NODE_NAME (result));
1271 return result;
1274 /* Lex a number to NUMBER starting at BUFFER->CUR - 1. */
1275 static void
1276 lex_number (cpp_reader *pfile, cpp_string *number,
1277 struct normalize_state *nst)
1279 const uchar *cur;
1280 const uchar *base;
1281 uchar *dest;
1283 base = pfile->buffer->cur - 1;
1286 cur = pfile->buffer->cur;
1288 /* N.B. ISIDNUM does not include $. */
1289 while (ISIDNUM (*cur) || *cur == '.' || DIGIT_SEP (*cur)
1290 || VALID_SIGN (*cur, cur[-1]))
1292 NORMALIZE_STATE_UPDATE_IDNUM (nst, *cur);
1293 cur++;
1296 pfile->buffer->cur = cur;
1298 while (forms_identifier_p (pfile, false, nst));
1300 number->len = cur - base;
1301 dest = _cpp_unaligned_alloc (pfile, number->len + 1);
1302 memcpy (dest, base, number->len);
1303 dest[number->len] = '\0';
1304 number->text = dest;
1307 /* Create a token of type TYPE with a literal spelling. */
1308 static void
1309 create_literal (cpp_reader *pfile, cpp_token *token, const uchar *base,
1310 unsigned int len, enum cpp_ttype type)
1312 uchar *dest = _cpp_unaligned_alloc (pfile, len + 1);
1314 memcpy (dest, base, len);
1315 dest[len] = '\0';
1316 token->type = type;
1317 token->val.str.len = len;
1318 token->val.str.text = dest;
1321 /* Subroutine of lex_raw_string: Append LEN chars from BASE to the buffer
1322 sequence from *FIRST_BUFF_P to LAST_BUFF_P. */
1324 static void
1325 bufring_append (cpp_reader *pfile, const uchar *base, size_t len,
1326 _cpp_buff **first_buff_p, _cpp_buff **last_buff_p)
1328 _cpp_buff *first_buff = *first_buff_p;
1329 _cpp_buff *last_buff = *last_buff_p;
1331 if (first_buff == NULL)
1332 first_buff = last_buff = _cpp_get_buff (pfile, len);
1333 else if (len > BUFF_ROOM (last_buff))
1335 size_t room = BUFF_ROOM (last_buff);
1336 memcpy (BUFF_FRONT (last_buff), base, room);
1337 BUFF_FRONT (last_buff) += room;
1338 base += room;
1339 len -= room;
1340 last_buff = _cpp_append_extend_buff (pfile, last_buff, len);
1343 memcpy (BUFF_FRONT (last_buff), base, len);
1344 BUFF_FRONT (last_buff) += len;
1346 *first_buff_p = first_buff;
1347 *last_buff_p = last_buff;
1351 /* Returns true if a macro has been defined.
1352 This might not work if compile with -save-temps,
1353 or preprocess separately from compilation. */
1355 static bool
1356 is_macro(cpp_reader *pfile, const uchar *base)
1358 const uchar *cur = base;
1359 if (! ISIDST (*cur))
1360 return false;
1361 unsigned int hash = HT_HASHSTEP (0, *cur);
1362 ++cur;
1363 while (ISIDNUM (*cur))
1365 hash = HT_HASHSTEP (hash, *cur);
1366 ++cur;
1368 hash = HT_HASHFINISH (hash, cur - base);
1370 cpp_hashnode *result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
1371 base, cur - base, hash, HT_NO_INSERT));
1373 return !result ? false : (result->type == NT_MACRO);
1377 /* Lexes a raw string. The stored string contains the spelling, including
double quotes, delimiter string, '(' and ')', any leading
'L', 'u', 'U' or 'u8' and 'R' modifier.

Nothing is returned; the outcome is recorded in TOKEN. Its type is the
literal's type on success. It is CPP_OTHER when the delimiter is
malformed (only the leading part of the token is then emitted and the
buffer is rewound) or when the string is unterminated inside a
directive or at the end of the buffer while parsing macro arguments.
It is CPP_EOF when no further line can be fetched while still inside
the string.

BASE is the start of the token. CUR is advanced by one before scanning
begins.

The spelling is NUL-terminated, but it is not guaranteed that this
is the first NUL since embedded NULs are preserved. */
1385 static void
1386 lex_raw_string (cpp_reader *pfile, cpp_token *token, const uchar *base,
1387 const uchar *cur)
1389 uchar raw_prefix[17];
1390 uchar temp_buffer[18];
1391 const uchar *orig_base;
1392 unsigned int raw_prefix_len = 0, raw_suffix_len = 0;
1393 enum raw_str_phase { RAW_STR_PREFIX, RAW_STR, RAW_STR_SUFFIX };
1394 raw_str_phase phase = RAW_STR_PREFIX;
1395 enum cpp_ttype type;
1396 size_t total_len = 0;
1397 /* Index into temp_buffer during phases other than RAW_STR,
1398 during RAW_STR phase 17 to tell BUF_APPEND that nothing should
1399 be appended to temp_buffer. */
1400 size_t temp_buffer_len = 0;
1401 _cpp_buff *first_buff = NULL, *last_buff = NULL;
1402 size_t raw_prefix_start;
1403 _cpp_line_note *note = &pfile->buffer->notes[pfile->buffer->cur_note];
/* Pick the base string type from the encoding prefix at BASE. */
1405 type = (*base == 'L' ? CPP_WSTRING :
1406 *base == 'U' ? CPP_STRING32 :
1407 *base == 'u' ? (base[1] == '8' ? CPP_UTF8STRING : CPP_STRING16)
1408 : CPP_STRING);
/* BUF_APPEND saves LEN chars at STR in the first_buff..last_buff chain
and counts them in total_len. Short pieces (at most 2 chars) that do
not come straight from BASE are also mirrored into temp_buffer while
temp_buffer_len is below 17. */
1410 #define BUF_APPEND(STR,LEN) \
1411 do { \
1412 bufring_append (pfile, (const uchar *)(STR), (LEN), \
1413 &first_buff, &last_buff); \
1414 total_len += (LEN); \
1415 if (__builtin_expect (temp_buffer_len < 17, 0) \
1416 && (const uchar *)(STR) != base \
1417 && (LEN) <= 2) \
1419 memcpy (temp_buffer + temp_buffer_len, \
1420 (const uchar *)(STR), (LEN)); \
1421 temp_buffer_len += (LEN); \
1423 } while (0);
/* BASE moves forward as pieces are flushed; ORIG_BASE keeps the token
start for the malformed-delimiter recovery path. */
1425 orig_base = base;
1426 ++cur;
1427 raw_prefix_start = cur - base;
1428 for (;;)
1430 cppchar_t c;
1432 /* If we previously performed any trigraph or line splicing
1433 transformations, undo them in between the opening and closing
1434 double quote. */
1435 while (note->pos < cur)
1436 ++note;
1437 for (; note->pos == cur; ++note)
1439 switch (note->type)
1441 case '\\':
1442 case ' ':
1443 /* Restore backslash followed by newline. */
1444 BUF_APPEND (base, cur - base);
1445 base = cur;
1446 BUF_APPEND ("\\", 1);
1447 after_backslash:
1448 if (note->type == ' ')
1450 /* GNU backslash whitespace newline extension. FIXME
1451 could be any sequence of non-vertical space. When we
1452 can properly restore any such sequence, we should mark
1453 this note as handled so _cpp_process_line_notes
1454 doesn't warn. */
1455 BUF_APPEND (" ", 1);
1458 BUF_APPEND ("\n", 1);
1459 break;
1461 case 0:
1462 /* Already handled. */
1463 break;
1465 default:
1466 if (_cpp_trigraph_map[note->type])
1468 /* Don't warn about this trigraph in
1469 _cpp_process_line_notes, since trigraphs show up as
1470 trigraphs in raw strings. */
1471 uchar type = note->type;
1472 note->type = 0;
1474 if (!CPP_OPTION (pfile, trigraphs))
1475 /* If we didn't convert the trigraph in the first
1476 place, don't do anything now either. */
1477 break;
1479 BUF_APPEND (base, cur - base);
1480 base = cur;
1481 BUF_APPEND ("??", 2);
1483 /* ??/ followed by newline gets two line notes, one for
1484 the trigraph and one for the backslash/newline. */
1485 if (type == '/' && note[1].pos == cur)
1487 if (note[1].type != '\\'
1488 && note[1].type != ' ')
1489 abort ();
1490 BUF_APPEND ("/", 1);
1491 ++note;
1492 goto after_backslash;
1494 else
1496 /* Skip the replacement character. */
1497 base = ++cur;
1498 BUF_APPEND (&type, 1);
1499 c = type;
1500 goto check_c;
1503 else
1504 abort ();
1505 break;
/* Fetch the next character; outside the RAW_STR phase (where
temp_buffer_len is pinned at 17) also record it in temp_buffer. */
1508 c = *cur++;
1509 if (__builtin_expect (temp_buffer_len < 17, 0))
1510 temp_buffer[temp_buffer_len++] = c;
1512 check_c:
1513 if (phase == RAW_STR_PREFIX)
1515 while (raw_prefix_len < temp_buffer_len)
1517 raw_prefix[raw_prefix_len] = temp_buffer[raw_prefix_len];
1518 switch (raw_prefix[raw_prefix_len])
1520 case ' ': case '(': case ')': case '\\': case '\t':
1521 case '\v': case '\f': case '\n': default:
1522 break;
1523 /* Basic source charset except the above chars. */
1524 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
1525 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
1526 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
1527 case 's': case 't': case 'u': case 'v': case 'w': case 'x':
1528 case 'y': case 'z':
1529 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
1530 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
1531 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
1532 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
1533 case 'Y': case 'Z':
1534 case '0': case '1': case '2': case '3': case '4': case '5':
1535 case '6': case '7': case '8': case '9':
1536 case '_': case '{': case '}': case '#': case '[': case ']':
1537 case '<': case '>': case '%': case ':': case ';': case '.':
1538 case '?': case '*': case '+': case '-': case '/': case '^':
1539 case '&': case '|': case '~': case '!': case '=': case ',':
1540 case '"': case '\'':
1541 if (raw_prefix_len < 16)
1543 raw_prefix_len++;
1544 continue;
1546 break;
1549 if (raw_prefix[raw_prefix_len] != '(')
1551 int col = CPP_BUF_COLUMN (pfile->buffer, cur) + 1;
1552 if (raw_prefix_len == 16)
1553 cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
1554 col, "raw string delimiter longer "
1555 "than 16 characters");
1556 else if (raw_prefix[raw_prefix_len] == '\n')
1557 cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
1558 col, "invalid new-line in raw "
1559 "string delimiter");
1560 else
1561 cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
1562 col, "invalid character '%c' in "
1563 "raw string delimiter",
1564 (int) raw_prefix[raw_prefix_len]);
/* Give up on the raw string: rewind the buffer and emit only the first
raw_prefix_start - 1 characters of the token as CPP_OTHER. */
1565 pfile->buffer->cur = orig_base + raw_prefix_start - 1;
1566 create_literal (pfile, token, orig_base,
1567 raw_prefix_start - 1, CPP_OTHER);
1568 if (first_buff)
1569 _cpp_release_buff (pfile, first_buff);
1570 return;
1572 raw_prefix[raw_prefix_len] = '"';
1573 phase = RAW_STR;
1574 /* Nothing should be appended to temp_buffer during
1575 RAW_STR phase. */
1576 temp_buffer_len = 17;
1577 break;
1579 continue;
1581 else if (phase == RAW_STR_SUFFIX)
1583 while (raw_suffix_len <= raw_prefix_len
1584 && raw_suffix_len < temp_buffer_len
1585 && temp_buffer[raw_suffix_len] == raw_prefix[raw_suffix_len])
1586 raw_suffix_len++;
1587 if (raw_suffix_len > raw_prefix_len)
1588 break;
1589 if (raw_suffix_len == temp_buffer_len)
1590 continue;
1591 phase = RAW_STR;
1592 /* Nothing should be appended to temp_buffer during
1593 RAW_STR phase. */
1594 temp_buffer_len = 17;
1596 if (c == ')')
1598 phase = RAW_STR_SUFFIX;
1599 raw_suffix_len = 0;
1600 temp_buffer_len = 0;
1602 else if (c == '\n')
1604 if (pfile->state.in_directive
1605 || (pfile->state.parsing_args
1606 && pfile->buffer->next_line >= pfile->buffer->rlimit))
1608 cur--;
1609 type = CPP_OTHER;
1610 cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, 0,
1611 "unterminated raw string");
1612 break;
1615 BUF_APPEND (base, cur - base);
1617 if (pfile->buffer->cur < pfile->buffer->rlimit)
1618 CPP_INCREMENT_LINE (pfile, 0);
1619 pfile->buffer->need_line = true;
1621 pfile->buffer->cur = cur-1;
1622 _cpp_process_line_notes (pfile, false);
1623 if (!_cpp_get_fresh_line (pfile))
1625 source_location src_loc = token->src_loc;
1626 token->type = CPP_EOF;
1627 /* Tell the compiler the line number of the EOF token. */
1628 token->src_loc = pfile->line_table->highest_line;
1629 token->flags = BOL;
1630 if (first_buff != NULL)
1631 _cpp_release_buff (pfile, first_buff);
1632 cpp_error_with_line (pfile, CPP_DL_ERROR, src_loc, 0,
1633 "unterminated raw string");
1634 return;
1637 cur = base = pfile->buffer->cur;
1638 note = &pfile->buffer->notes[pfile->buffer->cur_note];
1642 if (CPP_OPTION (pfile, user_literals))
1644 /* If a string format macro, say from inttypes.h, is placed touching
1645 a string literal it could be parsed as a C++11 user-defined string
1646 literal thus breaking the program.
1647 Try to identify macros with is_macro. A warning is issued. */
1648 if (is_macro (pfile, cur))
1650 /* Raise a warning, but do not consume subsequent tokens. */
1651 if (CPP_OPTION (pfile, warn_literal_suffix) && !pfile->state.skipping)
1652 cpp_warning_with_line (pfile, CPP_W_LITERAL_SUFFIX,
1653 token->src_loc, 0,
1654 "invalid suffix on literal; C++11 requires "
1655 "a space between literal and string macro");
1657 /* Grab user defined literal suffix. */
1658 else if (ISIDST (*cur))
1660 type = cpp_userdef_string_add_type (type);
1661 ++cur;
1663 while (ISIDNUM (*cur))
1664 ++cur;
1668 pfile->buffer->cur = cur;
/* first_buff is still NULL when BUF_APPEND never ran, in which case the
whole spelling is the span from BASE to CUR. */
1669 if (first_buff == NULL)
1670 create_literal (pfile, token, base, cur - base, type);
1671 else
/* Otherwise concatenate the saved chunks, then the unflushed tail that
starts at BASE, and NUL-terminate the result. */
1673 uchar *dest = _cpp_unaligned_alloc (pfile, total_len + (cur - base) + 1);
1675 token->type = type;
1676 token->val.str.len = total_len + (cur - base);
1677 token->val.str.text = dest;
1678 last_buff = first_buff;
1679 while (last_buff != NULL)
1681 memcpy (dest, last_buff->base,
1682 BUFF_FRONT (last_buff) - last_buff->base);
1683 dest += BUFF_FRONT (last_buff) - last_buff->base;
1684 last_buff = last_buff->next;
1686 _cpp_release_buff (pfile, first_buff);
1687 memcpy (dest, base, cur - base);
1688 dest[cur - base] = '\0';
1692 /* Lexes a string, character constant, or angle-bracketed header file
1693 name. The stored string contains the spelling, including opening
1694 quote and any leading 'L', 'u', 'U' or 'u8' and optional
1695 'R' modifier. It returns the type of the literal, or CPP_OTHER
1696 if it was not properly terminated, or CPP_LESS for an unterminated
1697 header name which must be relexed as normal tokens.
1699 The spelling is NUL-terminated, but it is not guaranteed that this
1700 is the first NUL since embedded NULs are preserved. */
1701 static void
1702 lex_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
1704 bool saw_NUL = false;
1705 const uchar *cur;
1706 cppchar_t terminator;
1707 enum cpp_ttype type;
1709 cur = base;
1710 terminator = *cur++;
1711 if (terminator == 'L' || terminator == 'U')
1712 terminator = *cur++;
1713 else if (terminator == 'u')
1715 terminator = *cur++;
1716 if (terminator == '8')
1717 terminator = *cur++;
1719 if (terminator == 'R')
1721 lex_raw_string (pfile, token, base, cur);
1722 return;
1724 if (terminator == '"')
1725 type = (*base == 'L' ? CPP_WSTRING :
1726 *base == 'U' ? CPP_STRING32 :
1727 *base == 'u' ? (base[1] == '8' ? CPP_UTF8STRING : CPP_STRING16)
1728 : CPP_STRING);
1729 else if (terminator == '\'')
1730 type = (*base == 'L' ? CPP_WCHAR :
1731 *base == 'U' ? CPP_CHAR32 :
1732 *base == 'u' ? CPP_CHAR16 : CPP_CHAR);
1733 else
1734 terminator = '>', type = CPP_HEADER_NAME;
1736 for (;;)
1738 cppchar_t c = *cur++;
1740 /* In #include-style directives, terminators are not escapable. */
1741 if (c == '\\' && !pfile->state.angled_headers && *cur != '\n')
1742 cur++;
1743 else if (c == terminator)
1744 break;
1745 else if (c == '\n')
1747 cur--;
1748 /* Unmatched quotes always yield undefined behavior, but
1749 greedy lexing means that what appears to be an unterminated
1750 header name may actually be a legitimate sequence of tokens. */
1751 if (terminator == '>')
1753 token->type = CPP_LESS;
1754 return;
1756 type = CPP_OTHER;
1757 break;
1759 else if (c == '\0')
1760 saw_NUL = true;
1763 if (saw_NUL && !pfile->state.skipping)
1764 cpp_error (pfile, CPP_DL_WARNING,
1765 "null character(s) preserved in literal");
1767 if (type == CPP_OTHER && CPP_OPTION (pfile, lang) != CLK_ASM)
1768 cpp_error (pfile, CPP_DL_PEDWARN, "missing terminating %c character",
1769 (int) terminator);
1771 if (CPP_OPTION (pfile, user_literals))
1773 /* If a string format macro, say from inttypes.h, is placed touching
1774 a string literal it could be parsed as a C++11 user-defined string
1775 literal thus breaking the program.
1776 Try to identify macros with is_macro. A warning is issued. */
1777 if (is_macro (pfile, cur))
1779 /* Raise a warning, but do not consume subsequent tokens. */
1780 if (CPP_OPTION (pfile, warn_literal_suffix) && !pfile->state.skipping)
1781 cpp_warning_with_line (pfile, CPP_W_LITERAL_SUFFIX,
1782 token->src_loc, 0,
1783 "invalid suffix on literal; C++11 requires "
1784 "a space between literal and string macro");
1786 /* Grab user defined literal suffix. */
1787 else if (ISIDST (*cur))
1789 type = cpp_userdef_char_add_type (type);
1790 type = cpp_userdef_string_add_type (type);
1791 ++cur;
1793 while (ISIDNUM (*cur))
1794 ++cur;
1798 pfile->buffer->cur = cur;
1799 create_literal (pfile, token, base, cur - base, type);
1802 /* Return the comment table. The client may not make any assumption
1803 about the ordering of the table. */
1804 cpp_comment_table *
1805 cpp_get_comments (cpp_reader *pfile)
1807 return &pfile->comments;
1810 /* Append a comment to the end of the comment table. */
1811 static void
1812 store_comment (cpp_reader *pfile, cpp_token *token)
1814 int len;
1816 if (pfile->comments.allocated == 0)
1818 pfile->comments.allocated = 256;
1819 pfile->comments.entries = (cpp_comment *) xmalloc
1820 (pfile->comments.allocated * sizeof (cpp_comment));
1823 if (pfile->comments.count == pfile->comments.allocated)
1825 pfile->comments.allocated *= 2;
1826 pfile->comments.entries = (cpp_comment *) xrealloc
1827 (pfile->comments.entries,
1828 pfile->comments.allocated * sizeof (cpp_comment));
1831 len = token->val.str.len;
1833 /* Copy comment. Note, token may not be NULL terminated. */
1834 pfile->comments.entries[pfile->comments.count].comment =
1835 (char *) xmalloc (sizeof (char) * (len + 1));
1836 memcpy (pfile->comments.entries[pfile->comments.count].comment,
1837 token->val.str.text, len);
1838 pfile->comments.entries[pfile->comments.count].comment[len] = '\0';
1840 /* Set source location. */
1841 pfile->comments.entries[pfile->comments.count].sloc = token->src_loc;
1843 /* Increment the count of entries in the comment table. */
1844 pfile->comments.count++;
1847 /* The stored comment includes the comment start and any terminator. */
1848 static void
1849 save_comment (cpp_reader *pfile, cpp_token *token, const unsigned char *from,
1850 cppchar_t type)
1852 unsigned char *buffer;
1853 unsigned int len, clen, i;
1855 len = pfile->buffer->cur - from + 1; /* + 1 for the initial '/'. */
1857 /* C++ comments probably (not definitely) have moved past a new
1858 line, which we don't want to save in the comment. */
1859 if (is_vspace (pfile->buffer->cur[-1]))
1860 len--;
1862 /* If we are currently in a directive or in argument parsing, then
1863 we need to store all C++ comments as C comments internally, and
1864 so we need to allocate a little extra space in that case.
1866 Note that the only time we encounter a directive here is
1867 when we are saving comments in a "#define". */
1868 clen = ((pfile->state.in_directive || pfile->state.parsing_args)
1869 && type == '/') ? len + 2 : len;
1871 buffer = _cpp_unaligned_alloc (pfile, clen);
1873 token->type = CPP_COMMENT;
1874 token->val.str.len = clen;
1875 token->val.str.text = buffer;
1877 buffer[0] = '/';
1878 memcpy (buffer + 1, from, len - 1);
1880 /* Finish conversion to a C comment, if necessary. */
1881 if ((pfile->state.in_directive || pfile->state.parsing_args) && type == '/')
1883 buffer[1] = '*';
1884 buffer[clen - 2] = '*';
1885 buffer[clen - 1] = '/';
1886 /* As there can be in a C++ comments illegal sequences for C comments
1887 we need to filter them out. */
1888 for (i = 2; i < (clen - 2); i++)
1889 if (buffer[i] == '/' && (buffer[i - 1] == '*' || buffer[i + 1] == '*'))
1890 buffer[i] = '|';
1893 /* Finally store this comment for use by clients of libcpp. */
1894 store_comment (pfile, token);
1897 /* Allocate COUNT tokens for RUN. */
1898 void
1899 _cpp_init_tokenrun (tokenrun *run, unsigned int count)
1901 run->base = XNEWVEC (cpp_token, count);
1902 run->limit = run->base + count;
1903 run->next = NULL;
1906 /* Returns the next tokenrun, or creates one if there is none. */
1907 static tokenrun *
1908 next_tokenrun (tokenrun *run)
1910 if (run->next == NULL)
1912 run->next = XNEW (tokenrun);
1913 run->next->prev = run;
1914 _cpp_init_tokenrun (run->next, 250);
1917 return run->next;
1920 /* Return the number of not yet processed token in a given
1921 context. */
1923 _cpp_remaining_tokens_num_in_context (cpp_context *context)
1925 if (context->tokens_kind == TOKENS_KIND_DIRECT)
1926 return (LAST (context).token - FIRST (context).token);
1927 else if (context->tokens_kind == TOKENS_KIND_INDIRECT
1928 || context->tokens_kind == TOKENS_KIND_EXTENDED)
1929 return (LAST (context).ptoken - FIRST (context).ptoken);
1930 else
1931 abort ();
1934 /* Returns the token present at index INDEX in a given context. If
1935 INDEX is zero, the next token to be processed is returned. */
1936 static const cpp_token*
1937 _cpp_token_from_context_at (cpp_context *context, int index)
1939 if (context->tokens_kind == TOKENS_KIND_DIRECT)
1940 return &(FIRST (context).token[index]);
1941 else if (context->tokens_kind == TOKENS_KIND_INDIRECT
1942 || context->tokens_kind == TOKENS_KIND_EXTENDED)
1943 return FIRST (context).ptoken[index];
1944 else
1945 abort ();
1948 /* Look ahead in the input stream. */
1949 const cpp_token *
1950 cpp_peek_token (cpp_reader *pfile, int index)
1952 cpp_context *context = pfile->context;
1953 const cpp_token *peektok;
1954 int count;
1956 /* First, scan through any pending cpp_context objects. */
1957 while (context->prev)
1959 ptrdiff_t sz = _cpp_remaining_tokens_num_in_context (context);
1961 if (index < (int) sz)
1962 return _cpp_token_from_context_at (context, index);
1963 index -= (int) sz;
1964 context = context->prev;
1967 /* We will have to read some new tokens after all (and do so
1968 without invalidating preceding tokens). */
1969 count = index;
1970 pfile->keep_tokens++;
1974 peektok = _cpp_lex_token (pfile);
1975 if (peektok->type == CPP_EOF)
1976 return peektok;
1978 while (index--);
1980 _cpp_backup_tokens_direct (pfile, count + 1);
1981 pfile->keep_tokens--;
1983 return peektok;
1986 /* Allocate a single token that is invalidated at the same time as the
1987 rest of the tokens on the line. Has its line and col set to the
1988 same as the last lexed token, so that diagnostics appear in the
1989 right place. */
1990 cpp_token *
1991 _cpp_temp_token (cpp_reader *pfile)
1993 cpp_token *old, *result;
1994 ptrdiff_t sz = pfile->cur_run->limit - pfile->cur_token;
1995 ptrdiff_t la = (ptrdiff_t) pfile->lookaheads;
1997 old = pfile->cur_token - 1;
1998 /* Any pre-existing lookaheads must not be clobbered. */
1999 if (la)
2001 if (sz <= la)
2003 tokenrun *next = next_tokenrun (pfile->cur_run);
2005 if (sz < la)
2006 memmove (next->base + 1, next->base,
2007 (la - sz) * sizeof (cpp_token));
2009 next->base[0] = pfile->cur_run->limit[-1];
2012 if (sz > 1)
2013 memmove (pfile->cur_token + 1, pfile->cur_token,
2014 MIN (la, sz - 1) * sizeof (cpp_token));
2017 if (!sz && pfile->cur_token == pfile->cur_run->limit)
2019 pfile->cur_run = next_tokenrun (pfile->cur_run);
2020 pfile->cur_token = pfile->cur_run->base;
2023 result = pfile->cur_token++;
2024 result->src_loc = old->src_loc;
2025 return result;
2028 /* Lex a token into RESULT (external interface). Takes care of issues
2029 like directive handling, token lookahead, multiple include
2030 optimization and skipping. */
2031 const cpp_token *
2032 _cpp_lex_token (cpp_reader *pfile)
2034 cpp_token *result;
2036 for (;;)
2038 if (pfile->cur_token == pfile->cur_run->limit)
2040 pfile->cur_run = next_tokenrun (pfile->cur_run);
2041 pfile->cur_token = pfile->cur_run->base;
2043 /* We assume that the current token is somewhere in the current
2044 run. */
2045 if (pfile->cur_token < pfile->cur_run->base
2046 || pfile->cur_token >= pfile->cur_run->limit)
2047 abort ();
2049 if (pfile->lookaheads)
2051 pfile->lookaheads--;
2052 result = pfile->cur_token++;
2054 else
2055 result = _cpp_lex_direct (pfile);
2057 if (result->flags & BOL)
2059 /* Is this a directive. If _cpp_handle_directive returns
2060 false, it is an assembler #. */
2061 if (result->type == CPP_HASH
2062 /* 6.10.3 p 11: Directives in a list of macro arguments
2063 gives undefined behavior. This implementation
2064 handles the directive as normal. */
2065 && pfile->state.parsing_args != 1)
2067 if (_cpp_handle_directive (pfile, result->flags & PREV_WHITE))
2069 if (pfile->directive_result.type == CPP_PADDING)
2070 continue;
2071 result = &pfile->directive_result;
2074 else if (pfile->state.in_deferred_pragma)
2075 result = &pfile->directive_result;
2077 if (pfile->cb.line_change && !pfile->state.skipping)
2078 pfile->cb.line_change (pfile, result, pfile->state.parsing_args);
2081 /* We don't skip tokens in directives. */
2082 if (pfile->state.in_directive || pfile->state.in_deferred_pragma)
2083 break;
2085 /* Outside a directive, invalidate controlling macros. At file
2086 EOF, _cpp_lex_direct takes care of popping the buffer, so we never
2087 get here and MI optimization works. */
2088 pfile->mi_valid = false;
2090 if (!pfile->state.skipping || result->type == CPP_EOF)
2091 break;
2094 return result;
2097 /* Returns true if a fresh line has been loaded. */
2098 bool
2099 _cpp_get_fresh_line (cpp_reader *pfile)
2101 int return_at_eof;
2103 /* We can't get a new line until we leave the current directive. */
2104 if (pfile->state.in_directive)
2105 return false;
2107 for (;;)
2109 cpp_buffer *buffer = pfile->buffer;
2111 if (!buffer->need_line)
2112 return true;
2114 if (buffer->next_line < buffer->rlimit)
2116 _cpp_clean_line (pfile);
2117 return true;
2120 /* First, get out of parsing arguments state. */
2121 if (pfile->state.parsing_args)
2122 return false;
2124 /* End of buffer. Non-empty files should end in a newline. */
2125 if (buffer->buf != buffer->rlimit
2126 && buffer->next_line > buffer->rlimit
2127 && !buffer->from_stage3)
2129 /* Clip to buffer size. */
2130 buffer->next_line = buffer->rlimit;
2133 return_at_eof = buffer->return_at_eof;
2134 _cpp_pop_buffer (pfile);
2135 if (pfile->buffer == NULL || return_at_eof)
2136 return false;
/* Set result->type to THEN_TYPE and consume the next character if it
   is CHAR; otherwise set result->type to ELSE_TYPE and consume
   nothing.  Relies on `buffer' and `result' being in scope at the
   point of use.  */
#define IF_NEXT_IS(CHAR, THEN_TYPE, ELSE_TYPE)  \
  do                                            \
    {                                           \
      if (*buffer->cur == CHAR)                 \
        {                                       \
          buffer->cur++;                        \
          result->type = THEN_TYPE;             \
        }                                       \
      else                                      \
        result->type = ELSE_TYPE;               \
    }                                           \
  while (0)
2149 /* Lex a token into pfile->cur_token, which is also incremented, to
2150 get diagnostics pointing to the correct location.
2152 Does not handle issues such as token lookahead, multiple-include
2153 optimization, directives, skipping etc. This function is only
2154 suitable for use by _cpp_lex_token, and in special cases like
2155 lex_expansion_token which doesn't care for any of these issues.
2157 When meeting a newline, returns CPP_EOF if parsing a directive,
2158 otherwise returns to the start of the token buffer if permissible.
2159 Returns the location of the lexed token. */
/* NOTE(review): the value actually handed back is RESULT, a pointer to
   the token slot that was filled in; its src_loc field carries the
   location.  */
2160 cpp_token *
2161 _cpp_lex_direct (cpp_reader *pfile)
2163 cppchar_t c;
2164 cpp_buffer *buffer;
2165 const unsigned char *comment_start;
/* Take the current token slot and advance pfile->cur_token past it.  */
2166 cpp_token *result = pfile->cur_token++;
/* Entered on the first pass and again after every newline: clear the
   flags and, if the buffer needs a new line, obtain one before lexing.  */
2168 fresh_line:
2169 result->flags = 0;
2170 buffer = pfile->buffer;
2171 if (buffer->need_line)
/* A line ending inside a deferred pragma yields CPP_PRAGMA_EOL and
   clears the deferred-pragma state.  */
2173 if (pfile->state.in_deferred_pragma)
2175 result->type = CPP_PRAGMA_EOL;
2176 pfile->state.in_deferred_pragma = false;
2177 if (!pfile->state.pragma_allow_expansion)
2178 pfile->state.prevent_expansion--;
2179 return result;
/* No further line could be obtained: yield CPP_EOF.  */
2181 if (!_cpp_get_fresh_line (pfile))
2183 result->type = CPP_EOF;
2184 if (!pfile->state.in_directive)
2186 /* Tell the compiler the line number of the EOF token. */
2187 result->src_loc = pfile->line_table->highest_line;
2188 result->flags = BOL;
2190 return result;
/* Unless tokens are being kept, go back to the first slot of base_run
   and lex into that slot instead.  */
2192 if (!pfile->keep_tokens)
2194 pfile->cur_run = &pfile->base_run;
2195 result = pfile->base_run.base;
2196 pfile->cur_token = result + 1;
/* The first token of a fresh line is flagged BOL.  */
2198 result->flags = BOL;
/* NOTE(review): parsing_args == 2 presumably means macro arguments are
   being collected, so the line break counts as whitespace -- confirm
   against the lexer state definition.  */
2199 if (pfile->state.parsing_args == 2)
2200 result->flags |= PREV_WHITE;
2202 buffer = pfile->buffer;
2203 update_tokens_line:
2204 result->src_loc = pfile->line_table->highest_line;
/* Re-entered after whitespace.  Once the cursor has reached the
   position of the next line note, process the pending notes (not done
   while a buffer is overlaid) and refresh the location.  */
2206 skipped_white:
2207 if (buffer->cur >= buffer->notes[buffer->cur_note].pos
2208 && !pfile->overlaid_buffer)
2210 _cpp_process_line_notes (pfile, false);
2211 result->src_loc = pfile->line_table->highest_line;
/* Consume the character that decides the token kind.  */
2213 c = *buffer->cur++;
/* A forced location, when set, overrides the column-based location.  */
2215 if (pfile->forced_token_location_p)
2216 result->src_loc = *pfile->forced_token_location_p;
2217 else
2218 result->src_loc = linemap_position_for_column (pfile->line_table,
2219 CPP_BUF_COLUMN (buffer, buffer->cur));
2221 switch (c)
/* Horizontal whitespace and NUL: record PREV_WHITE, skip it, retry.  */
2223 case ' ': case '\t': case '\f': case '\v': case '\0':
2224 result->flags |= PREV_WHITE;
2225 skip_whitespace (pfile, c);
2226 goto skipped_white;
/* Newline: the line count is bumped only when the cursor is still
   before the buffer's rlimit; then a fresh line is requested.  */
2228 case '\n':
2229 if (buffer->cur < buffer->rlimit)
2230 CPP_INCREMENT_LINE (pfile, 0);
2231 buffer->need_line = true;
2232 goto fresh_line;
2234 case '0': case '1': case '2': case '3': case '4':
2235 case '5': case '6': case '7': case '8': case '9':
2237 struct normalize_state nst = INITIAL_NORMALIZE_STATE;
2238 result->type = CPP_NUMBER;
2239 lex_number (pfile, &result->val.str, &nst);
2240 warn_about_normalization (pfile, result, &nst);
2241 break;
2244 case 'L':
2245 case 'u':
2246 case 'U':
2247 case 'R':
2248 /* 'L', 'u', 'U', 'u8' or 'R' may introduce wide characters,
2249 wide strings or raw strings. */
2250 if (c == 'L' || CPP_OPTION (pfile, rliterals)
2251 || (c != 'R' && CPP_OPTION (pfile, uliterals)))
2253 if ((*buffer->cur == '\'' && c != 'R')
2254 || *buffer->cur == '"'
2255 || (*buffer->cur == 'R'
2256 && c != 'R'
2257 && buffer->cur[1] == '"'
2258 && CPP_OPTION (pfile, rliterals))
2259 || (*buffer->cur == '8'
2260 && c == 'u'
2261 && (buffer->cur[1] == '"'
2262 || (buffer->cur[1] == 'R' && buffer->cur[2] == '"'
2263 && CPP_OPTION (pfile, rliterals)))))
/* The prefix is followed by a quote (directly, or via R / 8 / 8R), so
   lex a string or character literal starting at the prefix itself.  */
2265 lex_string (pfile, result, buffer->cur - 1);
2266 break;
2269 /* Fall through. */
2271 case '_':
2272 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
2273 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
2274 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
2275 case 's': case 't': case 'v': case 'w': case 'x':
2276 case 'y': case 'z':
2277 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
2278 case 'G': case 'H': case 'I': case 'J': case 'K':
2279 case 'M': case 'N': case 'O': case 'P': case 'Q':
2280 case 'S': case 'T': case 'V': case 'W': case 'X':
2281 case 'Y': case 'Z':
2282 result->type = CPP_NAME;
2284 struct normalize_state nst = INITIAL_NORMALIZE_STATE;
2285 result->val.node.node = lex_identifier (pfile, buffer->cur - 1, false,
2286 &nst);
2287 warn_about_normalization (pfile, result, &nst);
2290 /* Convert named operators to their proper types. */
2291 if (result->val.node.node->flags & NODE_OPERATOR)
2293 result->flags |= NAMED_OP;
2294 result->type = (enum cpp_ttype) result->val.node.node->directive_index;
2296 break;
2298 case '\'':
2299 case '"':
2300 lex_string (pfile, result, buffer->cur - 1);
2301 break;
2303 case '/':
2304 /* A potential block or line comment. */
2305 comment_start = buffer->cur;
2306 c = *buffer->cur;
2308 if (c == '*')
2310 if (_cpp_skip_block_comment (pfile))
2311 cpp_error (pfile, CPP_DL_ERROR, "unterminated comment");
2313 else if (c == '/' && (CPP_OPTION (pfile, cplusplus_comments)
2314 || cpp_in_system_header (pfile)))
2316 /* Warn about comments only if pedantically GNUC89, and not
2317 in system headers. */
2318 if (CPP_OPTION (pfile, lang) == CLK_GNUC89 && CPP_PEDANTIC (pfile)
2319 && ! buffer->warned_cplusplus_comments)
2321 cpp_error (pfile, CPP_DL_PEDWARN,
2322 "C++ style comments are not allowed in ISO C90");
2323 cpp_error (pfile, CPP_DL_PEDWARN,
2324 "(this will be reported only once per input file)");
2325 buffer->warned_cplusplus_comments = 1;
2328 if (skip_line_comment (pfile) && CPP_OPTION (pfile, warn_comments))
2329 cpp_warning (pfile, CPP_W_COMMENTS, "multi-line comment");
2331 else if (c == '=')
2333 buffer->cur++;
2334 result->type = CPP_DIV_EQ;
2335 break;
2337 else
2339 result->type = CPP_DIV;
2340 break;
/* Only the two comment branches reach this point; the '/=' and '/'
   branches have already left the switch.  When comments are not being
   saved, the comment counts as whitespace and lexing resumes.  */
2343 if (!pfile->state.save_comments)
2345 result->flags |= PREV_WHITE;
2346 goto update_tokens_line;
2349 /* Save the comment as a token in its own right. */
2350 save_comment (pfile, result, comment_start, c);
2351 break;
2353 case '<':
/* When angled headers are expected, try lex_string first; if it leaves
   the type as CPP_LESS, carry on and lex '<' as an operator.  */
2354 if (pfile->state.angled_headers)
2356 lex_string (pfile, result, buffer->cur - 1);
2357 if (result->type != CPP_LESS)
2358 break;
2361 result->type = CPP_LESS;
2362 if (*buffer->cur == '=')
2363 buffer->cur++, result->type = CPP_LESS_EQ;
2364 else if (*buffer->cur == '<')
2366 buffer->cur++;
2367 IF_NEXT_IS ('=', CPP_LSHIFT_EQ, CPP_LSHIFT);
2369 else if (CPP_OPTION (pfile, digraphs))
2371 if (*buffer->cur == ':')
2373 /* C++11 [2.5/3 lex.pptoken], "Otherwise, if the next
2374 three characters are <:: and the subsequent character
2375 is neither : nor >, the < is treated as a preprocessor
2376 token by itself". */
2377 if (CPP_OPTION (pfile, cplusplus)
2378 && CPP_OPTION (pfile, lang) != CLK_CXX98
2379 && CPP_OPTION (pfile, lang) != CLK_GNUCXX
2380 && buffer->cur[1] == ':'
2381 && buffer->cur[2] != ':' && buffer->cur[2] != '>')
2382 break;
2384 buffer->cur++;
2385 result->flags |= DIGRAPH;
2386 result->type = CPP_OPEN_SQUARE;
2388 else if (*buffer->cur == '%')
2390 buffer->cur++;
2391 result->flags |= DIGRAPH;
2392 result->type = CPP_OPEN_BRACE;
2395 break;
2397 case '>':
2398 result->type = CPP_GREATER;
2399 if (*buffer->cur == '=')
2400 buffer->cur++, result->type = CPP_GREATER_EQ;
2401 else if (*buffer->cur == '>')
2403 buffer->cur++;
2404 IF_NEXT_IS ('=', CPP_RSHIFT_EQ, CPP_RSHIFT);
2406 break;
2408 case '%':
2409 result->type = CPP_MOD;
2410 if (*buffer->cur == '=')
2411 buffer->cur++, result->type = CPP_MOD_EQ;
2412 else if (CPP_OPTION (pfile, digraphs))
2414 if (*buffer->cur == ':')
2416 buffer->cur++;
2417 result->flags |= DIGRAPH;
2418 result->type = CPP_HASH;
2419 if (*buffer->cur == '%' && buffer->cur[1] == ':')
2420 buffer->cur += 2, result->type = CPP_PASTE, result->val.token_no = 0;
2422 else if (*buffer->cur == '>')
2424 buffer->cur++;
2425 result->flags |= DIGRAPH;
2426 result->type = CPP_CLOSE_BRACE;
2429 break;
2431 case '.':
2432 result->type = CPP_DOT;
2433 if (ISDIGIT (*buffer->cur))
2435 struct normalize_state nst = INITIAL_NORMALIZE_STATE;
2436 result->type = CPP_NUMBER;
2437 lex_number (pfile, &result->val.str, &nst);
2438 warn_about_normalization (pfile, result, &nst);
2440 else if (*buffer->cur == '.' && buffer->cur[1] == '.')
2441 buffer->cur += 2, result->type = CPP_ELLIPSIS;
2442 else if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
2443 buffer->cur++, result->type = CPP_DOT_STAR;
2444 break;
2446 case '+':
2447 result->type = CPP_PLUS;
2448 if (*buffer->cur == '+')
2449 buffer->cur++, result->type = CPP_PLUS_PLUS;
2450 else if (*buffer->cur == '=')
2451 buffer->cur++, result->type = CPP_PLUS_EQ;
2452 break;
2454 case '-':
2455 result->type = CPP_MINUS;
2456 if (*buffer->cur == '>')
2458 buffer->cur++;
2459 result->type = CPP_DEREF;
2460 if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
2461 buffer->cur++, result->type = CPP_DEREF_STAR;
2463 else if (*buffer->cur == '-')
2464 buffer->cur++, result->type = CPP_MINUS_MINUS;
2465 else if (*buffer->cur == '=')
2466 buffer->cur++, result->type = CPP_MINUS_EQ;
2467 break;
2469 case '&':
2470 result->type = CPP_AND;
2471 if (*buffer->cur == '&')
2472 buffer->cur++, result->type = CPP_AND_AND;
2473 else if (*buffer->cur == '=')
2474 buffer->cur++, result->type = CPP_AND_EQ;
2475 break;
2477 case '|':
2478 result->type = CPP_OR;
2479 if (*buffer->cur == '|')
2480 buffer->cur++, result->type = CPP_OR_OR;
2481 else if (*buffer->cur == '=')
2482 buffer->cur++, result->type = CPP_OR_EQ;
2483 break;
2485 case ':':
2486 result->type = CPP_COLON;
2487 if (*buffer->cur == ':' && CPP_OPTION (pfile, cplusplus))
2488 buffer->cur++, result->type = CPP_SCOPE;
2489 else if (*buffer->cur == '>' && CPP_OPTION (pfile, digraphs))
2491 buffer->cur++;
2492 result->flags |= DIGRAPH;
2493 result->type = CPP_CLOSE_SQUARE;
2495 break;
2497 case '*': IF_NEXT_IS ('=', CPP_MULT_EQ, CPP_MULT); break;
2498 case '=': IF_NEXT_IS ('=', CPP_EQ_EQ, CPP_EQ); break;
2499 case '!': IF_NEXT_IS ('=', CPP_NOT_EQ, CPP_NOT); break;
2500 case '^': IF_NEXT_IS ('=', CPP_XOR_EQ, CPP_XOR); break;
2501 case '#': IF_NEXT_IS ('#', CPP_PASTE, CPP_HASH); result->val.token_no = 0; break;
2503 case '?': result->type = CPP_QUERY; break;
2504 case '~': result->type = CPP_COMPL; break;
2505 case ',': result->type = CPP_COMMA; break;
2506 case '(': result->type = CPP_OPEN_PAREN; break;
2507 case ')': result->type = CPP_CLOSE_PAREN; break;
2508 case '[': result->type = CPP_OPEN_SQUARE; break;
2509 case ']': result->type = CPP_CLOSE_SQUARE; break;
2510 case '{': result->type = CPP_OPEN_BRACE; break;
2511 case '}': result->type = CPP_CLOSE_BRACE; break;
2512 case ';': result->type = CPP_SEMICOLON; break;
2514 /* @ is a punctuator in Objective-C. */
2515 case '@': result->type = CPP_ATSIGN; break;
2517 case '$':
2518 case '\\':
/* Step back onto the character and ask whether it begins an
   identifier.  */
2520 const uchar *base = --buffer->cur;
2521 struct normalize_state nst = INITIAL_NORMALIZE_STATE;
2523 if (forms_identifier_p (pfile, true, &nst))
2525 result->type = CPP_NAME;
2526 result->val.node.node = lex_identifier (pfile, base, true, &nst);
2527 warn_about_normalization (pfile, result, &nst);
2528 break;
/* Not an identifier start: consume the character again and fall
   through to the default case.  */
2530 buffer->cur++;
/* Any other character becomes a one-character CPP_OTHER literal.  */
2533 default:
2534 create_literal (pfile, result, buffer->cur - 1, 1, CPP_OTHER);
2535 break;
2538 return result;
2541 /* An upper bound on the number of bytes needed to spell TOKEN.
2542 Does not include preceding whitespace. */
2543 unsigned int
2544 cpp_token_len (const cpp_token *token)
2546 unsigned int len;
2548 switch (TOKEN_SPELL (token))
2550 default: len = 6; break;
2551 case SPELL_LITERAL: len = token->val.str.len; break;
2552 case SPELL_IDENT: len = NODE_LEN (token->val.node.node) * 10; break;
2555 return len;
/* Decode the UTF-8 sequence that starts at NAME and store the matching
   "\UXXXXXXXX" escape (backslash, 'U', eight lowercase hex digits) in
   BUFFER.  Exactly 10 bytes are written and no terminating NUL is
   added.  Returns the number of bytes of NAME that form the sequence.
   Aborts when a continuation byte is ill-formed.  */

static size_t
utf8_to_ucn (unsigned char *buffer, const unsigned char *name)
{
  static const char hex_digits[] = "0123456789abcdef";
  const unsigned char *src = name;
  unsigned char *out = buffer;
  unsigned lead_bits = *src;
  unsigned long code_point;
  int seq_len = 0;
  int consumed;
  int shift;

  /* The number of leading one bits in the first byte gives the length
     of the whole sequence.  */
  while (lead_bits & 0x80)
    {
      seq_len++;
      lead_bits <<= 1;
    }

  /* Start with the payload bits of the lead byte, then append six bits
     from every continuation byte.  */
  code_point = *src & (0x7F >> seq_len);
  for (consumed = 1; consumed < seq_len; consumed++)
    {
      src++;
      code_point = (code_point << 6) | (*src & 0x3F);

      /* Ill-formed UTF-8.  */
      if ((*src & ~0x3F) != 0x80)
        abort ();
    }

  *out++ = '\\';
  *out++ = 'U';

  /* Emit the eight hex digits, most significant nibble first.  */
  for (shift = 28; shift >= 0; shift -= 4)
    *out++ = hex_digits[(code_point >> shift) & 0xF];

  return seq_len;
}
2592 /* Given a token TYPE corresponding to a digraph, return a pointer to
2593 the spelling of the digraph. */
2594 static const unsigned char *
2595 cpp_digraph2name (enum cpp_ttype type)
2597 return digraph_spellings[(int) type - (int) CPP_FIRST_DIGRAPH];
2600 /* Write the spelling of a token TOKEN to BUFFER. The buffer must
2601 already contain the enough space to hold the token's spelling.
2602 Returns a pointer to the character after the last character written.
2603 FORSTRING is true if this is to be the spelling after translation
2604 phase 1 (this is different for UCNs).
2605 FIXME: Would be nice if we didn't need the PFILE argument. */
2606 unsigned char *
2607 cpp_spell_token (cpp_reader *pfile, const cpp_token *token,
2608 unsigned char *buffer, bool forstring)
2610 switch (TOKEN_SPELL (token))
2612 case SPELL_OPERATOR:
2614 const unsigned char *spelling;
2615 unsigned char c;
2617 if (token->flags & DIGRAPH)
2618 spelling = cpp_digraph2name (token->type);
2619 else if (token->flags & NAMED_OP)
2620 goto spell_ident;
2621 else
2622 spelling = TOKEN_NAME (token);
2624 while ((c = *spelling++) != '\0')
2625 *buffer++ = c;
2627 break;
2629 spell_ident:
2630 case SPELL_IDENT:
2631 if (forstring)
2633 memcpy (buffer, NODE_NAME (token->val.node.node),
2634 NODE_LEN (token->val.node.node));
2635 buffer += NODE_LEN (token->val.node.node);
2637 else
2639 size_t i;
2640 const unsigned char * name = NODE_NAME (token->val.node.node);
2642 for (i = 0; i < NODE_LEN (token->val.node.node); i++)
2643 if (name[i] & ~0x7F)
2645 i += utf8_to_ucn (buffer, name + i) - 1;
2646 buffer += 10;
2648 else
2649 *buffer++ = NODE_NAME (token->val.node.node)[i];
2651 break;
2653 case SPELL_LITERAL:
2654 memcpy (buffer, token->val.str.text, token->val.str.len);
2655 buffer += token->val.str.len;
2656 break;
2658 case SPELL_NONE:
2659 cpp_error (pfile, CPP_DL_ICE,
2660 "unspellable token %s", TOKEN_NAME (token));
2661 break;
2664 return buffer;
2667 /* Returns TOKEN spelt as a null-terminated string. The string is
2668 freed when the reader is destroyed. Useful for diagnostics. */
2669 unsigned char *
2670 cpp_token_as_text (cpp_reader *pfile, const cpp_token *token)
2672 unsigned int len = cpp_token_len (token) + 1;
2673 unsigned char *start = _cpp_unaligned_alloc (pfile, len), *end;
2675 end = cpp_spell_token (pfile, token, start, false);
2676 end[0] = '\0';
2678 return start;
2681 /* Returns a pointer to a string which spells the token defined by
2682 TYPE and FLAGS. Used by C front ends, which really should move to
2683 using cpp_token_as_text. */
2684 const char *
2685 cpp_type2name (enum cpp_ttype type, unsigned char flags)
2687 if (flags & DIGRAPH)
2688 return (const char *) cpp_digraph2name (type);
2689 else if (flags & NAMED_OP)
2690 return cpp_named_operator2name (type);
2692 return (const char *) token_spellings[type].name;
2695 /* Writes the spelling of token to FP, without any preceding space.
2696 Separated from cpp_spell_token for efficiency - to avoid stdio
2697 double-buffering. */
2698 void
2699 cpp_output_token (const cpp_token *token, FILE *fp)
2701 switch (TOKEN_SPELL (token))
2703 case SPELL_OPERATOR:
2705 const unsigned char *spelling;
2706 int c;
2708 if (token->flags & DIGRAPH)
2709 spelling = cpp_digraph2name (token->type);
2710 else if (token->flags & NAMED_OP)
2711 goto spell_ident;
2712 else
2713 spelling = TOKEN_NAME (token);
2715 c = *spelling;
2717 putc (c, fp);
2718 while ((c = *++spelling) != '\0');
2720 break;
2722 spell_ident:
2723 case SPELL_IDENT:
2725 size_t i;
2726 const unsigned char * name = NODE_NAME (token->val.node.node);
2728 for (i = 0; i < NODE_LEN (token->val.node.node); i++)
2729 if (name[i] & ~0x7F)
2731 unsigned char buffer[10];
2732 i += utf8_to_ucn (buffer, name + i) - 1;
2733 fwrite (buffer, 1, 10, fp);
2735 else
2736 fputc (NODE_NAME (token->val.node.node)[i], fp);
2738 break;
2740 case SPELL_LITERAL:
2741 fwrite (token->val.str.text, 1, token->val.str.len, fp);
2742 break;
2744 case SPELL_NONE:
2745 /* An error, most probably. */
2746 break;
2750 /* Compare two tokens. */
2752 _cpp_equiv_tokens (const cpp_token *a, const cpp_token *b)
2754 if (a->type == b->type && a->flags == b->flags)
2755 switch (TOKEN_SPELL (a))
2757 default: /* Keep compiler happy. */
2758 case SPELL_OPERATOR:
2759 /* token_no is used to track where multiple consecutive ##
2760 tokens were originally located. */
2761 return (a->type != CPP_PASTE || a->val.token_no == b->val.token_no);
2762 case SPELL_NONE:
2763 return (a->type != CPP_MACRO_ARG
2764 || a->val.macro_arg.arg_no == b->val.macro_arg.arg_no);
2765 case SPELL_IDENT:
2766 return a->val.node.node == b->val.node.node;
2767 case SPELL_LITERAL:
2768 return (a->val.str.len == b->val.str.len
2769 && !memcmp (a->val.str.text, b->val.str.text,
2770 a->val.str.len));
2773 return 0;
2776 /* Returns nonzero if a space should be inserted to avoid an
2777 accidental token paste for output. For simplicity, it is
2778 conservative, and occasionally advises a space where one is not
2779 needed, e.g. "." and ".2". */
2781 cpp_avoid_paste (cpp_reader *pfile, const cpp_token *token1,
2782 const cpp_token *token2)
2784 enum cpp_ttype a = token1->type, b = token2->type;
2785 cppchar_t c;
2787 if (token1->flags & NAMED_OP)
2788 a = CPP_NAME;
2789 if (token2->flags & NAMED_OP)
2790 b = CPP_NAME;
2792 c = EOF;
2793 if (token2->flags & DIGRAPH)
2794 c = digraph_spellings[(int) b - (int) CPP_FIRST_DIGRAPH][0];
2795 else if (token_spellings[b].category == SPELL_OPERATOR)
2796 c = token_spellings[b].name[0];
2798 /* Quickly get everything that can paste with an '='. */
2799 if ((int) a <= (int) CPP_LAST_EQ && c == '=')
2800 return 1;
2802 switch (a)
2804 case CPP_GREATER: return c == '>';
2805 case CPP_LESS: return c == '<' || c == '%' || c == ':';
2806 case CPP_PLUS: return c == '+';
2807 case CPP_MINUS: return c == '-' || c == '>';
2808 case CPP_DIV: return c == '/' || c == '*'; /* Comments. */
2809 case CPP_MOD: return c == ':' || c == '>';
2810 case CPP_AND: return c == '&';
2811 case CPP_OR: return c == '|';
2812 case CPP_COLON: return c == ':' || c == '>';
2813 case CPP_DEREF: return c == '*';
2814 case CPP_DOT: return c == '.' || c == '%' || b == CPP_NUMBER;
2815 case CPP_HASH: return c == '#' || c == '%'; /* Digraph form. */
2816 case CPP_NAME: return ((b == CPP_NUMBER
2817 && name_p (pfile, &token2->val.str))
2818 || b == CPP_NAME
2819 || b == CPP_CHAR || b == CPP_STRING); /* L */
2820 case CPP_NUMBER: return (b == CPP_NUMBER || b == CPP_NAME
2821 || c == '.' || c == '+' || c == '-');
2822 /* UCNs */
2823 case CPP_OTHER: return ((token1->val.str.text[0] == '\\'
2824 && b == CPP_NAME)
2825 || (CPP_OPTION (pfile, objc)
2826 && token1->val.str.text[0] == '@'
2827 && (b == CPP_NAME || b == CPP_STRING)));
2828 case CPP_STRING:
2829 case CPP_WSTRING:
2830 case CPP_UTF8STRING:
2831 case CPP_STRING16:
2832 case CPP_STRING32: return (CPP_OPTION (pfile, user_literals)
2833 && (b == CPP_NAME
2834 || (TOKEN_SPELL (token2) == SPELL_LITERAL
2835 && ISIDST (token2->val.str.text[0]))));
2837 default: break;
2840 return 0;
2843 /* Output all the remaining tokens on the current line, and a newline
2844 character, to FP. Leading whitespace is removed. If there are
2845 macros, special token padding is not performed. */
2846 void
2847 cpp_output_line (cpp_reader *pfile, FILE *fp)
2849 const cpp_token *token;
2851 token = cpp_get_token (pfile);
2852 while (token->type != CPP_EOF)
2854 cpp_output_token (token, fp);
2855 token = cpp_get_token (pfile);
2856 if (token->flags & PREV_WHITE)
2857 putc (' ', fp);
2860 putc ('\n', fp);
2863 /* Return a string representation of all the remaining tokens on the
2864 current line. The result is allocated using xmalloc and must be
2865 freed by the caller. */
2866 unsigned char *
2867 cpp_output_line_to_string (cpp_reader *pfile, const unsigned char *dir_name)
2869 const cpp_token *token;
2870 unsigned int out = dir_name ? ustrlen (dir_name) : 0;
2871 unsigned int alloced = 120 + out;
2872 unsigned char *result = (unsigned char *) xmalloc (alloced);
2874 /* If DIR_NAME is empty, there are no initial contents. */
2875 if (dir_name)
2877 sprintf ((char *) result, "#%s ", dir_name);
2878 out += 2;
2881 token = cpp_get_token (pfile);
2882 while (token->type != CPP_EOF)
2884 unsigned char *last;
2885 /* Include room for a possible space and the terminating nul. */
2886 unsigned int len = cpp_token_len (token) + 2;
2888 if (out + len > alloced)
2890 alloced *= 2;
2891 if (out + len > alloced)
2892 alloced = out + len;
2893 result = (unsigned char *) xrealloc (result, alloced);
2896 last = cpp_spell_token (pfile, token, &result[out], 0);
2897 out = last - result;
2899 token = cpp_get_token (pfile);
2900 if (token->flags & PREV_WHITE)
2901 result[out++] = ' ';
2904 result[out] = '\0';
2905 return result;
/* Memory buffers.  Changing these three constants can have a dramatic
   effect on performance.  The values here are reasonable defaults,
   but might be tuned.  If you adjust them, be sure to test across a
   range of uses of cpplib, including heavy nested function-like macro
   expansion.  Also check the change in peak memory usage (NJAMD is a
   good tool for this).

   MIN_BUFF_SIZE is the smallest buffer ever allocated.
   BUFF_SIZE_UPPER_BOUND (MIN_SIZE) is the largest free buffer that may
   be reused for a request of MIN_SIZE bytes.
   EXTENDED_BUFF_SIZE (BUFF, MIN_EXTRA) is the size requested when
   extending BUFF: MIN_EXTRA plus twice the room left in BUFF.  Every
   macro argument is parenthesized so that an expression argument such
   as "n << 1" or "a ? b : c" cannot rebind with the surrounding
   arithmetic.  */
#define MIN_BUFF_SIZE 8000
#define BUFF_SIZE_UPPER_BOUND(MIN_SIZE) (MIN_BUFF_SIZE + (MIN_SIZE) * 3 / 2)
#define EXTENDED_BUFF_SIZE(BUFF, MIN_EXTRA) \
  ((MIN_EXTRA) + ((BUFF)->limit - (BUFF)->cur) * 2)

#if MIN_BUFF_SIZE > BUFF_SIZE_UPPER_BOUND (0)
#error BUFF_SIZE_UPPER_BOUND must be at least as large as MIN_BUFF_SIZE!
#endif
2923 /* Create a new allocation buffer. Place the control block at the end
2924 of the buffer, so that buffer overflows will cause immediate chaos. */
2925 static _cpp_buff *
2926 new_buff (size_t len)
2928 _cpp_buff *result;
2929 unsigned char *base;
2931 if (len < MIN_BUFF_SIZE)
2932 len = MIN_BUFF_SIZE;
2933 len = CPP_ALIGN (len);
2935 #ifdef ENABLE_VALGRIND_CHECKING
2936 /* Valgrind warns about uses of interior pointers, so put _cpp_buff
2937 struct first. */
2938 size_t slen = CPP_ALIGN2 (sizeof (_cpp_buff), 2 * DEFAULT_ALIGNMENT);
2939 base = XNEWVEC (unsigned char, len + slen);
2940 result = (_cpp_buff *) base;
2941 base += slen;
2942 #else
2943 base = XNEWVEC (unsigned char, len + sizeof (_cpp_buff));
2944 result = (_cpp_buff *) (base + len);
2945 #endif
2946 result->base = base;
2947 result->cur = base;
2948 result->limit = base + len;
2949 result->next = NULL;
2950 return result;
2953 /* Place a chain of unwanted allocation buffers on the free list. */
2954 void
2955 _cpp_release_buff (cpp_reader *pfile, _cpp_buff *buff)
2957 _cpp_buff *end = buff;
2959 while (end->next)
2960 end = end->next;
2961 end->next = pfile->free_buffs;
2962 pfile->free_buffs = buff;
2965 /* Return a free buffer of size at least MIN_SIZE. */
2966 _cpp_buff *
2967 _cpp_get_buff (cpp_reader *pfile, size_t min_size)
2969 _cpp_buff *result, **p;
2971 for (p = &pfile->free_buffs;; p = &(*p)->next)
2973 size_t size;
2975 if (*p == NULL)
2976 return new_buff (min_size);
2977 result = *p;
2978 size = result->limit - result->base;
2979 /* Return a buffer that's big enough, but don't waste one that's
2980 way too big. */
2981 if (size >= min_size && size <= BUFF_SIZE_UPPER_BOUND (min_size))
2982 break;
2985 *p = result->next;
2986 result->next = NULL;
2987 result->cur = result->base;
2988 return result;
2991 /* Creates a new buffer with enough space to hold the uncommitted
2992 remaining bytes of BUFF, and at least MIN_EXTRA more bytes. Copies
2993 the excess bytes to the new buffer. Chains the new buffer after
2994 BUFF, and returns the new buffer. */
2995 _cpp_buff *
2996 _cpp_append_extend_buff (cpp_reader *pfile, _cpp_buff *buff, size_t min_extra)
2998 size_t size = EXTENDED_BUFF_SIZE (buff, min_extra);
2999 _cpp_buff *new_buff = _cpp_get_buff (pfile, size);
3001 buff->next = new_buff;
3002 memcpy (new_buff->base, buff->cur, BUFF_ROOM (buff));
3003 return new_buff;
3006 /* Creates a new buffer with enough space to hold the uncommitted
3007 remaining bytes of the buffer pointed to by BUFF, and at least
3008 MIN_EXTRA more bytes. Copies the excess bytes to the new buffer.
3009 Chains the new buffer before the buffer pointed to by BUFF, and
3010 updates the pointer to point to the new buffer. */
3011 void
3012 _cpp_extend_buff (cpp_reader *pfile, _cpp_buff **pbuff, size_t min_extra)
3014 _cpp_buff *new_buff, *old_buff = *pbuff;
3015 size_t size = EXTENDED_BUFF_SIZE (old_buff, min_extra);
3017 new_buff = _cpp_get_buff (pfile, size);
3018 memcpy (new_buff->base, old_buff->cur, BUFF_ROOM (old_buff));
3019 new_buff->next = old_buff;
3020 *pbuff = new_buff;
3023 /* Free a chain of buffers starting at BUFF. */
3024 void
3025 _cpp_free_buff (_cpp_buff *buff)
3027 _cpp_buff *next;
3029 for (; buff; buff = next)
3031 next = buff->next;
3032 #ifdef ENABLE_VALGRIND_CHECKING
3033 free (buff);
3034 #else
3035 free (buff->base);
3036 #endif
3040 /* Allocate permanent, unaligned storage of length LEN. */
3041 unsigned char *
3042 _cpp_unaligned_alloc (cpp_reader *pfile, size_t len)
3044 _cpp_buff *buff = pfile->u_buff;
3045 unsigned char *result = buff->cur;
3047 if (len > (size_t) (buff->limit - result))
3049 buff = _cpp_get_buff (pfile, len);
3050 buff->next = pfile->u_buff;
3051 pfile->u_buff = buff;
3052 result = buff->cur;
3055 buff->cur = result + len;
3056 return result;
3059 /* Allocate permanent, unaligned storage of length LEN from a_buff.
3060 That buffer is used for growing allocations when saving macro
3061 replacement lists in a #define, and when parsing an answer to an
3062 assertion in #assert, #unassert or #if (and therefore possibly
3063 whilst expanding macros). It therefore must not be used by any
3064 code that they might call: specifically the lexer and the guts of
3065 the macro expander.
3067 All existing other uses clearly fit this restriction: storing
3068 registered pragmas during initialization. */
3069 unsigned char *
3070 _cpp_aligned_alloc (cpp_reader *pfile, size_t len)
3072 _cpp_buff *buff = pfile->a_buff;
3073 unsigned char *result = buff->cur;
3075 if (len > (size_t) (buff->limit - result))
3077 buff = _cpp_get_buff (pfile, len);
3078 buff->next = pfile->a_buff;
3079 pfile->a_buff = buff;
3080 result = buff->cur;
3083 buff->cur = result + len;
3084 return result;
3087 /* Say which field of TOK is in use. */
3089 enum cpp_token_fld_kind
3090 cpp_token_val_index (const cpp_token *tok)
3092 switch (TOKEN_SPELL (tok))
3094 case SPELL_IDENT:
3095 return CPP_TOKEN_FLD_NODE;
3096 case SPELL_LITERAL:
3097 return CPP_TOKEN_FLD_STR;
3098 case SPELL_OPERATOR:
3099 if (tok->type == CPP_PASTE)
3100 return CPP_TOKEN_FLD_TOKEN_NO;
3101 else
3102 return CPP_TOKEN_FLD_NONE;
3103 case SPELL_NONE:
3104 if (tok->type == CPP_MACRO_ARG)
3105 return CPP_TOKEN_FLD_ARG_NO;
3106 else if (tok->type == CPP_PADDING)
3107 return CPP_TOKEN_FLD_SOURCE;
3108 else if (tok->type == CPP_PRAGMA)
3109 return CPP_TOKEN_FLD_PRAGMA;
3110 /* else fall through */
3111 default:
3112 return CPP_TOKEN_FLD_NONE;
3116 /* All tokens lexed in R after calling this function will be forced to have
3117 their source_location the same as the location referenced by P, until
3118 cpp_stop_forcing_token_locations is called for R. */
3120 void
3121 cpp_force_token_locations (cpp_reader *r, source_location *p)
3123 r->forced_token_location_p = p;
3126 /* Go back to assigning locations naturally for lexed tokens. */
3128 void
3129 cpp_stop_forcing_token_locations (cpp_reader *r)
3131 r->forced_token_location_p = NULL;