add dbgcnt support for devirt
[official-gcc.git] / libcpp / lex.c
blobb7836225332bdba04f4ab123cfb84624fbeb7d65
1 /* CPP Library - lexical analysis.
2 Copyright (C) 2000-2014 Free Software Foundation, Inc.
3 Contributed by Per Bothner, 1994-95.
4 Based on CCCP program by Paul Rubin, June 1986
5 Adapted to ANSI C, Richard Stallman, Jan 1987
6 Broken out to separate file, Zack Weinberg, Mar 2000
8 This program is free software; you can redistribute it and/or modify it
9 under the terms of the GNU General Public License as published by the
10 Free Software Foundation; either version 3, or (at your option) any
11 later version.
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
18 You should have received a copy of the GNU General Public License
19 along with this program; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "cpplib.h"
25 #include "internal.h"
27 enum spell_type
29 SPELL_OPERATOR = 0,
30 SPELL_IDENT,
31 SPELL_LITERAL,
32 SPELL_NONE
35 struct token_spelling
37 enum spell_type category;
38 const unsigned char *name;
41 static const unsigned char *const digraph_spellings[] =
42 { UC"%:", UC"%:%:", UC"<:", UC":>", UC"<%", UC"%>" };
44 #define OP(e, s) { SPELL_OPERATOR, UC s },
45 #define TK(e, s) { SPELL_ ## s, UC #e },
46 static const struct token_spelling token_spellings[N_TTYPES] = { TTYPE_TABLE };
47 #undef OP
48 #undef TK
50 #define TOKEN_SPELL(token) (token_spellings[(token)->type].category)
51 #define TOKEN_NAME(token) (token_spellings[(token)->type].name)
53 static void add_line_note (cpp_buffer *, const uchar *, unsigned int);
54 static int skip_line_comment (cpp_reader *);
55 static void skip_whitespace (cpp_reader *, cppchar_t);
56 static void lex_string (cpp_reader *, cpp_token *, const uchar *);
57 static void save_comment (cpp_reader *, cpp_token *, const uchar *, cppchar_t);
58 static void store_comment (cpp_reader *, cpp_token *);
59 static void create_literal (cpp_reader *, cpp_token *, const uchar *,
60 unsigned int, enum cpp_ttype);
61 static bool warn_in_comment (cpp_reader *, _cpp_line_note *);
62 static int name_p (cpp_reader *, const cpp_string *);
63 static tokenrun *next_tokenrun (tokenrun *);
65 static _cpp_buff *new_buff (size_t);
68 /* Utility routine:
70 Compares, the token TOKEN to the NUL-terminated string STRING.
71 TOKEN must be a CPP_NAME. Returns 1 for equal, 0 for unequal. */
72 int
73 cpp_ideq (const cpp_token *token, const char *string)
75 if (token->type != CPP_NAME)
76 return 0;
78 return !ustrcmp (NODE_NAME (token->val.node.node), (const uchar *) string);
81 /* Record a note TYPE at byte POS into the current cleaned logical
82 line. */
83 static void
84 add_line_note (cpp_buffer *buffer, const uchar *pos, unsigned int type)
86 if (buffer->notes_used == buffer->notes_cap)
88 buffer->notes_cap = buffer->notes_cap * 2 + 200;
89 buffer->notes = XRESIZEVEC (_cpp_line_note, buffer->notes,
90 buffer->notes_cap);
93 buffer->notes[buffer->notes_used].pos = pos;
94 buffer->notes[buffer->notes_used].type = type;
95 buffer->notes_used++;
99 /* Fast path to find line special characters using optimized character
100 scanning algorithms. Anything complicated falls back to the slow
101 path below. Since this loop is very hot it's worth doing these kinds
102 of optimizations.
104 One of the paths through the ifdefs should provide
106 const uchar *search_line_fast (const uchar *s, const uchar *end);
108 Between S and END, search for \n, \r, \\, ?. Return a pointer to
109 the found character.
111 Note that the last character of the buffer is *always* a newline,
112 as forced by _cpp_convert_input. This fact can be used to avoid
113 explicitly looking for the end of the buffer. */
/* Configure only gives us an ifdef test for WORDS_BIGENDIAN; supply an
   explicit zero when it is absent so the macro can be used in ordinary
   `if' conditions.  */
#ifndef WORDS_BIGENDIAN
#define WORDS_BIGENDIAN 0
#endif

/* WORD_TYPE should be the largest integer that fits into a register.
   <stdint.h> has nothing suitable.  On most hosts unsigned long does
   the job, but MS chose an LLP64 model; when building with GCC we can
   ask for the "real" word size through the mode attribute.  */
#ifdef __GNUC__
typedef unsigned int word_type __attribute__((__mode__(__word__)));
#else
typedef unsigned long word_type;
#endif

/* The code below only expects word sizes of 4 or 8.  Any other size
   gives this array type a negative bound, which fails to compile.  */
typedef char check_word_type_size
  [(sizeof (word_type) == 8 || sizeof (word_type) == 4) ? 1 : -1];
135 /* Return X with the first N bytes forced to values that won't match one
136 of the interesting characters. Note that NUL is not interesting. */
138 static inline word_type
139 acc_char_mask_misalign (word_type val, unsigned int n)
141 word_type mask = -1;
142 if (WORDS_BIGENDIAN)
143 mask >>= n * 8;
144 else
145 mask <<= n * 8;
146 return val & mask;
149 /* Return X replicated to all byte positions within WORD_TYPE. */
151 static inline word_type
152 acc_char_replicate (uchar x)
154 word_type ret;
156 ret = (x << 24) | (x << 16) | (x << 8) | x;
157 if (sizeof(word_type) == 8)
158 ret = (ret << 16 << 16) | ret;
159 return ret;
162 /* Return non-zero if some byte of VAL is (probably) C. */
164 static inline word_type
165 acc_char_cmp (word_type val, word_type c)
167 #if defined(__GNUC__) && defined(__alpha__)
168 /* We can get exact results using a compare-bytes instruction.
169 Get (val == c) via (0 >= (val ^ c)). */
170 return __builtin_alpha_cmpbge (0, val ^ c);
171 #else
172 word_type magic = 0x7efefefeU;
173 if (sizeof(word_type) == 8)
174 magic = (magic << 16 << 16) | 0xfefefefeU;
175 magic |= 1;
177 val ^= c;
178 return ((val + magic) ^ ~val) & ~magic;
179 #endif
182 /* Given the result of acc_char_cmp is non-zero, return the index of
183 the found character. If this was a false positive, return -1. */
185 static inline int
186 acc_char_index (word_type cmp ATTRIBUTE_UNUSED,
187 word_type val ATTRIBUTE_UNUSED)
189 #if defined(__GNUC__) && defined(__alpha__) && !WORDS_BIGENDIAN
190 /* The cmpbge instruction sets *bits* of the result corresponding to
191 matches in the bytes with no false positives. */
192 return __builtin_ctzl (cmp);
193 #else
194 unsigned int i;
196 /* ??? It would be nice to force unrolling here,
197 and have all of these constants folded. */
198 for (i = 0; i < sizeof(word_type); ++i)
200 uchar c;
201 if (WORDS_BIGENDIAN)
202 c = (val >> (sizeof(word_type) - i - 1) * 8) & 0xff;
203 else
204 c = (val >> i * 8) & 0xff;
206 if (c == '\n' || c == '\r' || c == '\\' || c == '?')
207 return i;
210 return -1;
211 #endif
214 /* A version of the fast scanner using bit fiddling techniques.
216 For 32-bit words, one would normally perform 16 comparisons and
217 16 branches. With this algorithm one performs 24 arithmetic
218 operations and one branch. Whether this is faster with a 32-bit
219 word size is going to be somewhat system dependent.
221 For 64-bit words, we eliminate twice the number of comparisons
222 and branches without increasing the number of arithmetic operations.
223 It's almost certainly going to be a win with 64-bit word size. */
225 static const uchar * search_line_acc_char (const uchar *, const uchar *)
226 ATTRIBUTE_UNUSED;
228 static const uchar *
229 search_line_acc_char (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
231 const word_type repl_nl = acc_char_replicate ('\n');
232 const word_type repl_cr = acc_char_replicate ('\r');
233 const word_type repl_bs = acc_char_replicate ('\\');
234 const word_type repl_qm = acc_char_replicate ('?');
236 unsigned int misalign;
237 const word_type *p;
238 word_type val, t;
240 /* Align the buffer. Mask out any bytes from before the beginning. */
241 p = (word_type *)((uintptr_t)s & -sizeof(word_type));
242 val = *p;
243 misalign = (uintptr_t)s & (sizeof(word_type) - 1);
244 if (misalign)
245 val = acc_char_mask_misalign (val, misalign);
247 /* Main loop. */
248 while (1)
250 t = acc_char_cmp (val, repl_nl);
251 t |= acc_char_cmp (val, repl_cr);
252 t |= acc_char_cmp (val, repl_bs);
253 t |= acc_char_cmp (val, repl_qm);
255 if (__builtin_expect (t != 0, 0))
257 int i = acc_char_index (t, val);
258 if (i >= 0)
259 return (const uchar *)p + i;
262 val = *++p;
266 /* Disable on Solaris 2/x86 until the following problem can be properly
267 autoconfed:
269 The Solaris 10+ assembler tags objects with the instruction set
270 extensions used, so SSE4.2 executables cannot run on machines that
271 don't support that extension. */
273 #if (GCC_VERSION >= 4005) && (defined(__i386__) || defined(__x86_64__)) && !(defined(__sun__) && defined(__svr4__))
/* Replicated character data shared between the implementations below:
   row 0 is \n, row 1 is \r, row 2 is \\ and row 3 is ?.  Outside of a
   context with vector support we cannot define compatible vector
   types, so the table is expressed in terms of raw characters.  */
#define REPL16(C) { C, C, C, C, C, C, C, C, C, C, C, C, C, C, C, C }
static const char repl_chars[4][16] __attribute__((aligned(16))) = {
  REPL16 ('\n'),
  REPL16 ('\r'),
  REPL16 ('\\'),
  REPL16 ('?')
};
#undef REPL16
290 /* A version of the fast scanner using MMX vectorized byte compare insns.
292 This uses the PMOVMSKB instruction which was introduced with "MMX2",
293 which was packaged into SSE1; it is also present in the AMD MMX
294 extension. Mark the function as using "sse" so that we emit a real
295 "emms" instruction, rather than the 3dNOW "femms" instruction. */
297 static const uchar *
298 #ifndef __SSE__
299 __attribute__((__target__("sse")))
300 #endif
301 search_line_mmx (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
303 typedef char v8qi __attribute__ ((__vector_size__ (8)));
304 typedef int __m64 __attribute__ ((__vector_size__ (8), __may_alias__));
306 const v8qi repl_nl = *(const v8qi *)repl_chars[0];
307 const v8qi repl_cr = *(const v8qi *)repl_chars[1];
308 const v8qi repl_bs = *(const v8qi *)repl_chars[2];
309 const v8qi repl_qm = *(const v8qi *)repl_chars[3];
311 unsigned int misalign, found, mask;
312 const v8qi *p;
313 v8qi data, t, c;
315 /* Align the source pointer. While MMX doesn't generate unaligned data
316 faults, this allows us to safely scan to the end of the buffer without
317 reading beyond the end of the last page. */
318 misalign = (uintptr_t)s & 7;
319 p = (const v8qi *)((uintptr_t)s & -8);
320 data = *p;
322 /* Create a mask for the bytes that are valid within the first
323 16-byte block. The Idea here is that the AND with the mask
324 within the loop is "free", since we need some AND or TEST
325 insn in order to set the flags for the branch anyway. */
326 mask = -1u << misalign;
328 /* Main loop processing 8 bytes at a time. */
329 goto start;
332 data = *++p;
333 mask = -1;
335 start:
336 t = __builtin_ia32_pcmpeqb(data, repl_nl);
337 c = __builtin_ia32_pcmpeqb(data, repl_cr);
338 t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
339 c = __builtin_ia32_pcmpeqb(data, repl_bs);
340 t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
341 c = __builtin_ia32_pcmpeqb(data, repl_qm);
342 t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
343 found = __builtin_ia32_pmovmskb (t);
344 found &= mask;
346 while (!found);
348 __builtin_ia32_emms ();
350 /* FOUND contains 1 in bits for which we matched a relevant
351 character. Conversion to the byte index is trivial. */
352 found = __builtin_ctz(found);
353 return (const uchar *)p + found;
356 /* A version of the fast scanner using SSE2 vectorized byte compare insns. */
358 static const uchar *
359 #ifndef __SSE2__
360 __attribute__((__target__("sse2")))
361 #endif
362 search_line_sse2 (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
364 typedef char v16qi __attribute__ ((__vector_size__ (16)));
366 const v16qi repl_nl = *(const v16qi *)repl_chars[0];
367 const v16qi repl_cr = *(const v16qi *)repl_chars[1];
368 const v16qi repl_bs = *(const v16qi *)repl_chars[2];
369 const v16qi repl_qm = *(const v16qi *)repl_chars[3];
371 unsigned int misalign, found, mask;
372 const v16qi *p;
373 v16qi data, t;
375 /* Align the source pointer. */
376 misalign = (uintptr_t)s & 15;
377 p = (const v16qi *)((uintptr_t)s & -16);
378 data = *p;
380 /* Create a mask for the bytes that are valid within the first
381 16-byte block. The Idea here is that the AND with the mask
382 within the loop is "free", since we need some AND or TEST
383 insn in order to set the flags for the branch anyway. */
384 mask = -1u << misalign;
386 /* Main loop processing 16 bytes at a time. */
387 goto start;
390 data = *++p;
391 mask = -1;
393 start:
394 t = __builtin_ia32_pcmpeqb128(data, repl_nl);
395 t |= __builtin_ia32_pcmpeqb128(data, repl_cr);
396 t |= __builtin_ia32_pcmpeqb128(data, repl_bs);
397 t |= __builtin_ia32_pcmpeqb128(data, repl_qm);
398 found = __builtin_ia32_pmovmskb128 (t);
399 found &= mask;
401 while (!found);
403 /* FOUND contains 1 in bits for which we matched a relevant
404 character. Conversion to the byte index is trivial. */
405 found = __builtin_ctz(found);
406 return (const uchar *)p + found;
409 #ifdef HAVE_SSE4
410 /* A version of the fast scanner using SSE 4.2 vectorized string insns. */
412 static const uchar *
413 #ifndef __SSE4_2__
414 __attribute__((__target__("sse4.2")))
415 #endif
416 search_line_sse42 (const uchar *s, const uchar *end)
418 typedef char v16qi __attribute__ ((__vector_size__ (16)));
419 static const v16qi search = { '\n', '\r', '?', '\\' };
421 uintptr_t si = (uintptr_t)s;
422 uintptr_t index;
424 /* Check for unaligned input. */
425 if (si & 15)
427 v16qi sv;
429 if (__builtin_expect (end - s < 16, 0)
430 && __builtin_expect ((si & 0xfff) > 0xff0, 0))
432 /* There are less than 16 bytes left in the buffer, and less
433 than 16 bytes left on the page. Reading 16 bytes at this
434 point might generate a spurious page fault. Defer to the
435 SSE2 implementation, which already handles alignment. */
436 return search_line_sse2 (s, end);
439 /* ??? The builtin doesn't understand that the PCMPESTRI read from
440 memory need not be aligned. */
441 sv = __builtin_ia32_loaddqu ((const char *) s);
442 index = __builtin_ia32_pcmpestri128 (search, 4, sv, 16, 0);
444 if (__builtin_expect (index < 16, 0))
445 goto found;
447 /* Advance the pointer to an aligned address. We will re-scan a
448 few bytes, but we no longer need care for reading past the
449 end of a page, since we're guaranteed a match. */
450 s = (const uchar *)((si + 16) & -16);
453 /* Main loop, processing 16 bytes at a time. By doing the whole loop
454 in inline assembly, we can make proper use of the flags set. */
455 __asm ( "sub $16, %1\n"
456 " .balign 16\n"
457 "0: add $16, %1\n"
458 " %vpcmpestri $0, (%1), %2\n"
459 " jnc 0b"
460 : "=&c"(index), "+r"(s)
461 : "x"(search), "a"(4), "d"(16));
463 found:
464 return s + index;
467 #else
468 /* Work around out-dated assemblers without sse4 support. */
469 #define search_line_sse42 search_line_sse2
470 #endif
472 /* Check the CPU capabilities. */
474 #include "../gcc/config/i386/cpuid.h"
476 typedef const uchar * (*search_line_fast_type) (const uchar *, const uchar *);
477 static search_line_fast_type search_line_fast;
479 #define HAVE_init_vectorized_lexer 1
480 static inline void
481 init_vectorized_lexer (void)
483 unsigned dummy, ecx = 0, edx = 0;
484 search_line_fast_type impl = search_line_acc_char;
485 int minimum = 0;
487 #if defined(__SSE4_2__)
488 minimum = 3;
489 #elif defined(__SSE2__)
490 minimum = 2;
491 #elif defined(__SSE__)
492 minimum = 1;
493 #endif
495 if (minimum == 3)
496 impl = search_line_sse42;
497 else if (__get_cpuid (1, &dummy, &dummy, &ecx, &edx) || minimum == 2)
499 if (minimum == 3 || (ecx & bit_SSE4_2))
500 impl = search_line_sse42;
501 else if (minimum == 2 || (edx & bit_SSE2))
502 impl = search_line_sse2;
503 else if (minimum == 1 || (edx & bit_SSE))
504 impl = search_line_mmx;
506 else if (__get_cpuid (0x80000001, &dummy, &dummy, &dummy, &edx))
508 if (minimum == 1
509 || (edx & (bit_MMXEXT | bit_CMOV)) == (bit_MMXEXT | bit_CMOV))
510 impl = search_line_mmx;
513 search_line_fast = impl;
516 #elif (GCC_VERSION >= 4005) && defined(__ALTIVEC__)
/* A version of the fast scanner using AltiVec vectorized byte compares.  */
519 /* ??? Unfortunately, attribute(target("altivec")) is not yet supported,
520 so we can't compile this function without -maltivec on the command line
521 (or implied by some other switch). */
523 static const uchar *
524 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
526 typedef __attribute__((altivec(vector))) unsigned char vc;
528 const vc repl_nl = {
529 '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
530 '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
532 const vc repl_cr = {
533 '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
534 '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
536 const vc repl_bs = {
537 '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
538 '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
540 const vc repl_qm = {
541 '?', '?', '?', '?', '?', '?', '?', '?',
542 '?', '?', '?', '?', '?', '?', '?', '?',
544 const vc ones = {
545 -1, -1, -1, -1, -1, -1, -1, -1,
546 -1, -1, -1, -1, -1, -1, -1, -1,
548 const vc zero = { 0 };
550 vc data, mask, t;
552 /* Altivec loads automatically mask addresses with -16. This lets us
553 issue the first load as early as possible. */
554 data = __builtin_vec_ld(0, (const vc *)s);
556 /* Discard bytes before the beginning of the buffer. Do this by
557 beginning with all ones and shifting in zeros according to the
558 mis-alignment. The LVSR instruction pulls the exact shift we
559 want from the address. */
560 #ifdef __BIG_ENDIAN__
561 mask = __builtin_vec_lvsr(0, s);
562 mask = __builtin_vec_perm(zero, ones, mask);
563 #else
564 mask = __builtin_vec_lvsl(0, s);
565 mask = __builtin_vec_perm(ones, zero, mask);
566 #endif
567 data &= mask;
569 /* While altivec loads mask addresses, we still need to align S so
570 that the offset we compute at the end is correct. */
571 s = (const uchar *)((uintptr_t)s & -16);
573 /* Main loop processing 16 bytes at a time. */
574 goto start;
577 vc m_nl, m_cr, m_bs, m_qm;
579 s += 16;
580 data = __builtin_vec_ld(0, (const vc *)s);
582 start:
583 m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
584 m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
585 m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
586 m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
587 t = (m_nl | m_cr) | (m_bs | m_qm);
589 /* T now contains 0xff in bytes for which we matched one of the relevant
590 characters. We want to exit the loop if any byte in T is non-zero.
591 Below is the expansion of vec_any_ne(t, zero). */
593 while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));
596 #define N (sizeof(vc) / sizeof(long))
598 union {
599 vc v;
600 /* Statically assert that N is 2 or 4. */
601 unsigned long l[(N == 2 || N == 4) ? N : -1];
602 } u;
603 unsigned long l, i = 0;
605 u.v = t;
607 /* Find the first word of T that is non-zero. */
608 switch (N)
610 case 4:
611 l = u.l[i++];
612 if (l != 0)
613 break;
614 s += sizeof(unsigned long);
615 l = u.l[i++];
616 if (l != 0)
617 break;
618 s += sizeof(unsigned long);
619 case 2:
620 l = u.l[i++];
621 if (l != 0)
622 break;
623 s += sizeof(unsigned long);
624 l = u.l[i];
627 /* L now contains 0xff in bytes for which we matched one of the
628 relevant characters. We can find the byte index by finding
629 its bit index and dividing by 8. */
630 #ifdef __BIG_ENDIAN__
631 l = __builtin_clzl(l) >> 3;
632 #else
633 l = __builtin_ctzl(l) >> 3;
634 #endif
635 return s + l;
637 #undef N
641 #elif defined (__ARM_NEON__)
642 #include "arm_neon.h"
644 static const uchar *
645 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
647 const uint8x16_t repl_nl = vdupq_n_u8 ('\n');
648 const uint8x16_t repl_cr = vdupq_n_u8 ('\r');
649 const uint8x16_t repl_bs = vdupq_n_u8 ('\\');
650 const uint8x16_t repl_qm = vdupq_n_u8 ('?');
651 const uint8x16_t xmask = (uint8x16_t) vdupq_n_u64 (0x8040201008040201ULL);
653 unsigned int misalign, found, mask;
654 const uint8_t *p;
655 uint8x16_t data;
657 /* Align the source pointer. */
658 misalign = (uintptr_t)s & 15;
659 p = (const uint8_t *)((uintptr_t)s & -16);
660 data = vld1q_u8 (p);
662 /* Create a mask for the bytes that are valid within the first
663 16-byte block. The Idea here is that the AND with the mask
664 within the loop is "free", since we need some AND or TEST
665 insn in order to set the flags for the branch anyway. */
666 mask = (-1u << misalign) & 0xffff;
668 /* Main loop, processing 16 bytes at a time. */
669 goto start;
673 uint8x8_t l;
674 uint16x4_t m;
675 uint32x2_t n;
676 uint8x16_t t, u, v, w;
678 p += 16;
679 data = vld1q_u8 (p);
680 mask = 0xffff;
682 start:
683 t = vceqq_u8 (data, repl_nl);
684 u = vceqq_u8 (data, repl_cr);
685 v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
686 w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
687 t = vandq_u8 (vorrq_u8 (v, w), xmask);
688 l = vpadd_u8 (vget_low_u8 (t), vget_high_u8 (t));
689 m = vpaddl_u8 (l);
690 n = vpaddl_u16 (m);
692 found = vget_lane_u32 ((uint32x2_t) vorr_u64 ((uint64x1_t) n,
693 vshr_n_u64 ((uint64x1_t) n, 24)), 0);
694 found &= mask;
696 while (!found);
698 /* FOUND contains 1 in bits for which we matched a relevant
699 character. Conversion to the byte index is trivial. */
700 found = __builtin_ctz (found);
701 return (const uchar *)p + found;
704 #else
/* We only have one accelerated alternative.  Use a direct call so that
   we encourage inlining.  */
709 #define search_line_fast search_line_acc_char
711 #endif
/* One-time lexer initialization.  This does work only on hosts that
   define HAVE_init_vectorized_lexer, i.e. where the fast line scanner
   is selected at run time; elsewhere it is a no-op.  */

void
_cpp_init_lexer (void)
{
#ifdef HAVE_init_vectorized_lexer
  init_vectorized_lexer ();
#endif
}
723 /* Returns with a logical line that contains no escaped newlines or
724 trigraphs. This is a time-critical inner loop. */
725 void
726 _cpp_clean_line (cpp_reader *pfile)
728 cpp_buffer *buffer;
729 const uchar *s;
730 uchar c, *d, *p;
732 buffer = pfile->buffer;
733 buffer->cur_note = buffer->notes_used = 0;
734 buffer->cur = buffer->line_base = buffer->next_line;
735 buffer->need_line = false;
736 s = buffer->next_line;
738 if (!buffer->from_stage3)
740 const uchar *pbackslash = NULL;
742 /* Fast path. This is the common case of an un-escaped line with
743 no trigraphs. The primary win here is by not writing any
744 data back to memory until we have to. */
745 while (1)
747 /* Perform an optimized search for \n, \r, \\, ?. */
748 s = search_line_fast (s, buffer->rlimit);
750 c = *s;
751 if (c == '\\')
753 /* Record the location of the backslash and continue. */
754 pbackslash = s++;
756 else if (__builtin_expect (c == '?', 0))
758 if (__builtin_expect (s[1] == '?', false)
759 && _cpp_trigraph_map[s[2]])
761 /* Have a trigraph. We may or may not have to convert
762 it. Add a line note regardless, for -Wtrigraphs. */
763 add_line_note (buffer, s, s[2]);
764 if (CPP_OPTION (pfile, trigraphs))
766 /* We do, and that means we have to switch to the
767 slow path. */
768 d = (uchar *) s;
769 *d = _cpp_trigraph_map[s[2]];
770 s += 2;
771 goto slow_path;
774 /* Not a trigraph. Continue on fast-path. */
775 s++;
777 else
778 break;
781 /* This must be \r or \n. We're either done, or we'll be forced
782 to write back to the buffer and continue on the slow path. */
783 d = (uchar *) s;
785 if (__builtin_expect (s == buffer->rlimit, false))
786 goto done;
788 /* DOS line ending? */
789 if (__builtin_expect (c == '\r', false) && s[1] == '\n')
791 s++;
792 if (s == buffer->rlimit)
793 goto done;
796 if (__builtin_expect (pbackslash == NULL, true))
797 goto done;
799 /* Check for escaped newline. */
800 p = d;
801 while (is_nvspace (p[-1]))
802 p--;
803 if (p - 1 != pbackslash)
804 goto done;
806 /* Have an escaped newline; process it and proceed to
807 the slow path. */
808 add_line_note (buffer, p - 1, p != d ? ' ' : '\\');
809 d = p - 2;
810 buffer->next_line = p - 1;
812 slow_path:
813 while (1)
815 c = *++s;
816 *++d = c;
818 if (c == '\n' || c == '\r')
820 /* Handle DOS line endings. */
821 if (c == '\r' && s != buffer->rlimit && s[1] == '\n')
822 s++;
823 if (s == buffer->rlimit)
824 break;
826 /* Escaped? */
827 p = d;
828 while (p != buffer->next_line && is_nvspace (p[-1]))
829 p--;
830 if (p == buffer->next_line || p[-1] != '\\')
831 break;
833 add_line_note (buffer, p - 1, p != d ? ' ': '\\');
834 d = p - 2;
835 buffer->next_line = p - 1;
837 else if (c == '?' && s[1] == '?' && _cpp_trigraph_map[s[2]])
839 /* Add a note regardless, for the benefit of -Wtrigraphs. */
840 add_line_note (buffer, d, s[2]);
841 if (CPP_OPTION (pfile, trigraphs))
843 *d = _cpp_trigraph_map[s[2]];
844 s += 2;
849 else
851 while (*s != '\n' && *s != '\r')
852 s++;
853 d = (uchar *) s;
855 /* Handle DOS line endings. */
856 if (*s == '\r' && s != buffer->rlimit && s[1] == '\n')
857 s++;
860 done:
861 *d = '\n';
862 /* A sentinel note that should never be processed. */
863 add_line_note (buffer, d + 1, '\n');
864 buffer->next_line = s + 1;
867 /* Return true if the trigraph indicated by NOTE should be warned
868 about in a comment. */
869 static bool
870 warn_in_comment (cpp_reader *pfile, _cpp_line_note *note)
872 const uchar *p;
874 /* Within comments we don't warn about trigraphs, unless the
875 trigraph forms an escaped newline, as that may change
876 behavior. */
877 if (note->type != '/')
878 return false;
880 /* If -trigraphs, then this was an escaped newline iff the next note
881 is coincident. */
882 if (CPP_OPTION (pfile, trigraphs))
883 return note[1].pos == note->pos;
885 /* Otherwise, see if this forms an escaped newline. */
886 p = note->pos + 3;
887 while (is_nvspace (*p))
888 p++;
890 /* There might have been escaped newlines between the trigraph and the
891 newline we found. Hence the position test. */
892 return (*p == '\n' && p < note[1].pos);
895 /* Process the notes created by add_line_note as far as the current
896 location. */
897 void
898 _cpp_process_line_notes (cpp_reader *pfile, int in_comment)
900 cpp_buffer *buffer = pfile->buffer;
902 for (;;)
904 _cpp_line_note *note = &buffer->notes[buffer->cur_note];
905 unsigned int col;
907 if (note->pos > buffer->cur)
908 break;
910 buffer->cur_note++;
911 col = CPP_BUF_COLUMN (buffer, note->pos + 1);
913 if (note->type == '\\' || note->type == ' ')
915 if (note->type == ' ' && !in_comment)
916 cpp_error_with_line (pfile, CPP_DL_WARNING, pfile->line_table->highest_line, col,
917 "backslash and newline separated by space");
919 if (buffer->next_line > buffer->rlimit)
921 cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line, col,
922 "backslash-newline at end of file");
923 /* Prevent "no newline at end of file" warning. */
924 buffer->next_line = buffer->rlimit;
927 buffer->line_base = note->pos;
928 CPP_INCREMENT_LINE (pfile, 0);
930 else if (_cpp_trigraph_map[note->type])
932 if (CPP_OPTION (pfile, warn_trigraphs)
933 && (!in_comment || warn_in_comment (pfile, note)))
935 if (CPP_OPTION (pfile, trigraphs))
936 cpp_warning_with_line (pfile, CPP_W_TRIGRAPHS,
937 pfile->line_table->highest_line, col,
938 "trigraph ??%c converted to %c",
939 note->type,
940 (int) _cpp_trigraph_map[note->type]);
941 else
943 cpp_warning_with_line
944 (pfile, CPP_W_TRIGRAPHS,
945 pfile->line_table->highest_line, col,
946 "trigraph ??%c ignored, use -trigraphs to enable",
947 note->type);
951 else if (note->type == 0)
952 /* Already processed in lex_raw_string. */;
953 else
954 abort ();
958 /* Skip a C-style block comment. We find the end of the comment by
959 seeing if an asterisk is before every '/' we encounter. Returns
960 nonzero if comment terminated by EOF, zero otherwise.
962 Buffer->cur points to the initial asterisk of the comment. */
963 bool
964 _cpp_skip_block_comment (cpp_reader *pfile)
966 cpp_buffer *buffer = pfile->buffer;
967 const uchar *cur = buffer->cur;
968 uchar c;
970 cur++;
971 if (*cur == '/')
972 cur++;
974 for (;;)
976 /* People like decorating comments with '*', so check for '/'
977 instead for efficiency. */
978 c = *cur++;
980 if (c == '/')
982 if (cur[-2] == '*')
983 break;
985 /* Warn about potential nested comments, but not if the '/'
986 comes immediately before the true comment delimiter.
987 Don't bother to get it right across escaped newlines. */
988 if (CPP_OPTION (pfile, warn_comments)
989 && cur[0] == '*' && cur[1] != '/')
991 buffer->cur = cur;
992 cpp_warning_with_line (pfile, CPP_W_COMMENTS,
993 pfile->line_table->highest_line,
994 CPP_BUF_COL (buffer),
995 "\"/*\" within comment");
998 else if (c == '\n')
1000 unsigned int cols;
1001 buffer->cur = cur - 1;
1002 _cpp_process_line_notes (pfile, true);
1003 if (buffer->next_line >= buffer->rlimit)
1004 return true;
1005 _cpp_clean_line (pfile);
1007 cols = buffer->next_line - buffer->line_base;
1008 CPP_INCREMENT_LINE (pfile, cols);
1010 cur = buffer->cur;
1014 buffer->cur = cur;
1015 _cpp_process_line_notes (pfile, true);
1016 return false;
1019 /* Skip a C++ line comment, leaving buffer->cur pointing to the
1020 terminating newline. Handles escaped newlines. Returns nonzero
1021 if a multiline comment. */
1022 static int
1023 skip_line_comment (cpp_reader *pfile)
1025 cpp_buffer *buffer = pfile->buffer;
1026 source_location orig_line = pfile->line_table->highest_line;
1028 while (*buffer->cur != '\n')
1029 buffer->cur++;
1031 _cpp_process_line_notes (pfile, true);
1032 return orig_line != pfile->line_table->highest_line;
1035 /* Skips whitespace, saving the next non-whitespace character. */
1036 static void
1037 skip_whitespace (cpp_reader *pfile, cppchar_t c)
1039 cpp_buffer *buffer = pfile->buffer;
1040 bool saw_NUL = false;
1044 /* Horizontal space always OK. */
1045 if (c == ' ' || c == '\t')
1047 /* Just \f \v or \0 left. */
1048 else if (c == '\0')
1049 saw_NUL = true;
1050 else if (pfile->state.in_directive && CPP_PEDANTIC (pfile))
1051 cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line,
1052 CPP_BUF_COL (buffer),
1053 "%s in preprocessing directive",
1054 c == '\f' ? "form feed" : "vertical tab");
1056 c = *buffer->cur++;
1058 /* We only want non-vertical space, i.e. ' ' \t \f \v \0. */
1059 while (is_nvspace (c));
1061 if (saw_NUL)
1062 cpp_error (pfile, CPP_DL_WARNING, "null character(s) ignored");
1064 buffer->cur--;
1067 /* See if the characters of a number token are valid in a name (no
1068 '.', '+' or '-'). */
1069 static int
1070 name_p (cpp_reader *pfile, const cpp_string *string)
1072 unsigned int i;
1074 for (i = 0; i < string->len; i++)
1075 if (!is_idchar (string->text[i]))
1076 return 0;
1078 return 1;
1081 /* After parsing an identifier or other sequence, produce a warning about
1082 sequences not in NFC/NFKC. */
1083 static void
1084 warn_about_normalization (cpp_reader *pfile,
1085 const cpp_token *token,
1086 const struct normalize_state *s)
1088 if (CPP_OPTION (pfile, warn_normalize) < NORMALIZE_STATE_RESULT (s)
1089 && !pfile->state.skipping)
1091 /* Make sure that the token is printed using UCNs, even
1092 if we'd otherwise happily print UTF-8. */
1093 unsigned char *buf = XNEWVEC (unsigned char, cpp_token_len (token));
1094 size_t sz;
1096 sz = cpp_spell_token (pfile, token, buf, false) - buf;
1097 if (NORMALIZE_STATE_RESULT (s) == normalized_C)
1098 cpp_warning_with_line (pfile, CPP_W_NORMALIZE, token->src_loc, 0,
1099 "`%.*s' is not in NFKC", (int) sz, buf);
1100 else
1101 cpp_warning_with_line (pfile, CPP_W_NORMALIZE, token->src_loc, 0,
1102 "`%.*s' is not in NFC", (int) sz, buf);
1103 free (buf);
1107 /* Returns TRUE if the sequence starting at buffer->cur is invalid in
1108 an identifier. FIRST is TRUE if this starts an identifier. */
1109 static bool
1110 forms_identifier_p (cpp_reader *pfile, int first,
1111 struct normalize_state *state)
1113 cpp_buffer *buffer = pfile->buffer;
1115 if (*buffer->cur == '$')
1117 if (!CPP_OPTION (pfile, dollars_in_ident))
1118 return false;
1120 buffer->cur++;
1121 if (CPP_OPTION (pfile, warn_dollars) && !pfile->state.skipping)
1123 CPP_OPTION (pfile, warn_dollars) = 0;
1124 cpp_error (pfile, CPP_DL_PEDWARN, "'$' in identifier or number");
1127 return true;
1130 /* Is this a syntactically valid UCN? */
1131 if (CPP_OPTION (pfile, extended_identifiers)
1132 && *buffer->cur == '\\'
1133 && (buffer->cur[1] == 'u' || buffer->cur[1] == 'U'))
1135 buffer->cur += 2;
1136 if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
1137 state))
1138 return true;
1139 buffer->cur -= 2;
1142 return false;
1145 /* Helper function to get the cpp_hashnode of the identifier BASE. */
1146 static cpp_hashnode *
1147 lex_identifier_intern (cpp_reader *pfile, const uchar *base)
1149 cpp_hashnode *result;
1150 const uchar *cur;
1151 unsigned int len;
1152 unsigned int hash = HT_HASHSTEP (0, *base);
1154 cur = base + 1;
1155 while (ISIDNUM (*cur))
1157 hash = HT_HASHSTEP (hash, *cur);
1158 cur++;
1160 len = cur - base;
1161 hash = HT_HASHFINISH (hash, len);
1162 result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
1163 base, len, hash, HT_ALLOC));
1165 /* Rarely, identifiers require diagnostics when lexed. */
1166 if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
1167 && !pfile->state.skipping, 0))
1169 /* It is allowed to poison the same identifier twice. */
1170 if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
1171 cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
1172 NODE_NAME (result));
1174 /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
1175 replacement list of a variadic macro. */
1176 if (result == pfile->spec_nodes.n__VA_ARGS__
1177 && !pfile->state.va_args_ok)
1178 cpp_error (pfile, CPP_DL_PEDWARN,
1179 "__VA_ARGS__ can only appear in the expansion"
1180 " of a C99 variadic macro");
1182 /* For -Wc++-compat, warn about use of C++ named operators. */
1183 if (result->flags & NODE_WARN_OPERATOR)
1184 cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
1185 "identifier \"%s\" is a special operator name in C++",
1186 NODE_NAME (result));
1189 return result;
1192 /* Get the cpp_hashnode of an identifier specified by NAME in
1193 the current cpp_reader object. If none is found, NULL is returned. */
1194 cpp_hashnode *
1195 _cpp_lex_identifier (cpp_reader *pfile, const char *name)
1197 cpp_hashnode *result;
1198 result = lex_identifier_intern (pfile, (uchar *) name);
1199 return result;
1202 /* Lex an identifier starting at BUFFER->CUR - 1. */
1203 static cpp_hashnode *
1204 lex_identifier (cpp_reader *pfile, const uchar *base, bool starts_ucn,
1205 struct normalize_state *nst)
1207 cpp_hashnode *result;
1208 const uchar *cur;
1209 unsigned int len;
1210 unsigned int hash = HT_HASHSTEP (0, *base);
1212 cur = pfile->buffer->cur;
1213 if (! starts_ucn)
1215 while (ISIDNUM (*cur))
1217 hash = HT_HASHSTEP (hash, *cur);
1218 cur++;
1220 NORMALIZE_STATE_UPDATE_IDNUM (nst, *(cur - 1));
1222 pfile->buffer->cur = cur;
1223 if (starts_ucn || forms_identifier_p (pfile, false, nst))
1225 /* Slower version for identifiers containing UCNs (or $). */
1226 do {
1227 while (ISIDNUM (*pfile->buffer->cur))
1229 NORMALIZE_STATE_UPDATE_IDNUM (nst, *pfile->buffer->cur);
1230 pfile->buffer->cur++;
1232 } while (forms_identifier_p (pfile, false, nst));
1233 result = _cpp_interpret_identifier (pfile, base,
1234 pfile->buffer->cur - base);
1236 else
1238 len = cur - base;
1239 hash = HT_HASHFINISH (hash, len);
1241 result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
1242 base, len, hash, HT_ALLOC));
1245 /* Rarely, identifiers require diagnostics when lexed. */
1246 if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
1247 && !pfile->state.skipping, 0))
1249 /* It is allowed to poison the same identifier twice. */
1250 if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
1251 cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
1252 NODE_NAME (result));
1254 /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
1255 replacement list of a variadic macro. */
1256 if (result == pfile->spec_nodes.n__VA_ARGS__
1257 && !pfile->state.va_args_ok)
1258 cpp_error (pfile, CPP_DL_PEDWARN,
1259 "__VA_ARGS__ can only appear in the expansion"
1260 " of a C99 variadic macro");
1262 /* For -Wc++-compat, warn about use of C++ named operators. */
1263 if (result->flags & NODE_WARN_OPERATOR)
1264 cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
1265 "identifier \"%s\" is a special operator name in C++",
1266 NODE_NAME (result));
1269 return result;
1272 /* Lex a number to NUMBER starting at BUFFER->CUR - 1. */
1273 static void
1274 lex_number (cpp_reader *pfile, cpp_string *number,
1275 struct normalize_state *nst)
1277 const uchar *cur;
1278 const uchar *base;
1279 uchar *dest;
1281 base = pfile->buffer->cur - 1;
1284 cur = pfile->buffer->cur;
1286 /* N.B. ISIDNUM does not include $. */
1287 while (ISIDNUM (*cur) || *cur == '.' || DIGIT_SEP (*cur)
1288 || VALID_SIGN (*cur, cur[-1]))
1290 NORMALIZE_STATE_UPDATE_IDNUM (nst, *cur);
1291 cur++;
1294 pfile->buffer->cur = cur;
1296 while (forms_identifier_p (pfile, false, nst));
1298 number->len = cur - base;
1299 dest = _cpp_unaligned_alloc (pfile, number->len + 1);
1300 memcpy (dest, base, number->len);
1301 dest[number->len] = '\0';
1302 number->text = dest;
1305 /* Create a token of type TYPE with a literal spelling. */
1306 static void
1307 create_literal (cpp_reader *pfile, cpp_token *token, const uchar *base,
1308 unsigned int len, enum cpp_ttype type)
1310 uchar *dest = _cpp_unaligned_alloc (pfile, len + 1);
1312 memcpy (dest, base, len);
1313 dest[len] = '\0';
1314 token->type = type;
1315 token->val.str.len = len;
1316 token->val.str.text = dest;
1319 /* Subroutine of lex_raw_string: Append LEN chars from BASE to the buffer
1320 sequence from *FIRST_BUFF_P to LAST_BUFF_P. */
1322 static void
1323 bufring_append (cpp_reader *pfile, const uchar *base, size_t len,
1324 _cpp_buff **first_buff_p, _cpp_buff **last_buff_p)
1326 _cpp_buff *first_buff = *first_buff_p;
1327 _cpp_buff *last_buff = *last_buff_p;
1329 if (first_buff == NULL)
1330 first_buff = last_buff = _cpp_get_buff (pfile, len);
1331 else if (len > BUFF_ROOM (last_buff))
1333 size_t room = BUFF_ROOM (last_buff);
1334 memcpy (BUFF_FRONT (last_buff), base, room);
1335 BUFF_FRONT (last_buff) += room;
1336 base += room;
1337 len -= room;
1338 last_buff = _cpp_append_extend_buff (pfile, last_buff, len);
1341 memcpy (BUFF_FRONT (last_buff), base, len);
1342 BUFF_FRONT (last_buff) += len;
1344 *first_buff_p = first_buff;
1345 *last_buff_p = last_buff;
1349 /* Returns true if a macro has been defined.
1350 This might not work if compile with -save-temps,
1351 or preprocess separately from compilation. */
1353 static bool
1354 is_macro(cpp_reader *pfile, const uchar *base)
1356 const uchar *cur = base;
1357 if (! ISIDST (*cur))
1358 return false;
1359 unsigned int hash = HT_HASHSTEP (0, *cur);
1360 ++cur;
1361 while (ISIDNUM (*cur))
1363 hash = HT_HASHSTEP (hash, *cur);
1364 ++cur;
1366 hash = HT_HASHFINISH (hash, cur - base);
1368 cpp_hashnode *result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
1369 base, cur - base, hash, HT_NO_INSERT));
1371 return !result ? false : (result->type == NT_MACRO);
1375 /* Lexes a raw string. The stored string contains the spelling, including
1376 double quotes, delimiter string, '(' and ')', any leading
1377 'L', 'u', 'U' or 'u8' and 'R' modifier. The type of the literal is
1378 stored in TOKEN, or CPP_OTHER if it was not properly terminated.
1380 The spelling is NUL-terminated, but it is not guaranteed that this
1381 is the first NUL since embedded NULs are preserved. */
/* BASE points at the first character of the literal, prefix included,
   and selects the string type.  CUR is stepped forward once before
   the delimiter is scanned.  Nothing is returned; the outcome is left
   in TOKEN:
     - an invalid or over-long delimiter yields a CPP_OTHER token
       covering RAW_PREFIX_START - 1 characters from the original BASE,
       and the buffer position is reset to just after them;
     - a newline met while in a directive, or while parsing macro
       arguments with no further line in the buffer, reports
       "unterminated raw string" and yields CPP_OTHER;
     - failing to obtain a fresh line turns TOKEN into CPP_EOF.  */
1383 static void
1384 lex_raw_string (cpp_reader *pfile, cpp_token *token, const uchar *base,
1385 const uchar *cur)
1387 uchar raw_prefix[17];
1388 uchar temp_buffer[18];
1389 const uchar *orig_base;
1390 unsigned int raw_prefix_len = 0, raw_suffix_len = 0;
1391 enum raw_str_phase { RAW_STR_PREFIX, RAW_STR, RAW_STR_SUFFIX };
1392 raw_str_phase phase = RAW_STR_PREFIX;
1393 enum cpp_ttype type;
1394 size_t total_len = 0;
1395 /* Index into temp_buffer during phases other than RAW_STR,
1396 during RAW_STR phase 17 to tell BUF_APPEND that nothing should
1397 be appended to temp_buffer. */
1398 size_t temp_buffer_len = 0;
1399 _cpp_buff *first_buff = NULL, *last_buff = NULL;
1400 size_t raw_prefix_start;
1401 _cpp_line_note *note = &pfile->buffer->notes[pfile->buffer->cur_note];
1403 type = (*base == 'L' ? CPP_WSTRING :
1404 *base == 'U' ? CPP_STRING32 :
1405 *base == 'u' ? (base[1] == '8' ? CPP_UTF8STRING : CPP_STRING16)
1406 : CPP_STRING);
/* BUF_APPEND adds LEN bytes at STR to the buffer chain and to
   TOTAL_LEN.  Short pieces (at most 2 bytes) that do not come from
   BASE are also mirrored into TEMP_BUFFER while TEMP_BUFFER_LEN is
   below 17.  */
1408 #define BUF_APPEND(STR,LEN) \
1409 do { \
1410 bufring_append (pfile, (const uchar *)(STR), (LEN), \
1411 &first_buff, &last_buff); \
1412 total_len += (LEN); \
1413 if (__builtin_expect (temp_buffer_len < 17, 0) \
1414 && (const uchar *)(STR) != base \
1415 && (LEN) <= 2) \
1417 memcpy (temp_buffer + temp_buffer_len, \
1418 (const uchar *)(STR), (LEN)); \
1419 temp_buffer_len += (LEN); \
1421 } while (0);
/* Remember where the literal began; BASE is advanced whenever a piece
   is flushed with BUF_APPEND, but the delimiter error path needs the
   original start.  */
1423 orig_base = base;
1424 ++cur;
1425 raw_prefix_start = cur - base;
1426 for (;;)
1428 cppchar_t c;
1430 /* If we previously performed any trigraph or line splicing
1431 transformations, undo them in between the opening and closing
1432 double quote. */
1433 while (note->pos < cur)
1434 ++note;
1435 for (; note->pos == cur; ++note)
1437 switch (note->type)
1439 case '\\':
1440 case ' ':
1441 /* Restore backslash followed by newline. */
1442 BUF_APPEND (base, cur - base);
1443 base = cur;
1444 BUF_APPEND ("\\", 1);
1445 after_backslash:
1446 if (note->type == ' ')
1448 /* GNU backslash whitespace newline extension. FIXME
1449 could be any sequence of non-vertical space. When we
1450 can properly restore any such sequence, we should mark
1451 this note as handled so _cpp_process_line_notes
1452 doesn't warn. */
1453 BUF_APPEND (" ", 1);
1456 BUF_APPEND ("\n", 1);
1457 break;
1459 case 0:
1460 /* Already handled. */
1461 break;
1463 default:
1464 if (_cpp_trigraph_map[note->type])
1466 /* Don't warn about this trigraph in
1467 _cpp_process_line_notes, since trigraphs show up as
1468 trigraphs in raw strings. */
1469 uchar type = note->type;
1470 note->type = 0;
1472 if (!CPP_OPTION (pfile, trigraphs))
1473 /* If we didn't convert the trigraph in the first
1474 place, don't do anything now either. */
1475 break;
1477 BUF_APPEND (base, cur - base);
1478 base = cur;
1479 BUF_APPEND ("??", 2);
1481 /* ??/ followed by newline gets two line notes, one for
1482 the trigraph and one for the backslash/newline. */
1483 if (type == '/' && note[1].pos == cur)
1485 if (note[1].type != '\\'
1486 && note[1].type != ' ')
1487 abort ();
1488 BUF_APPEND ("/", 1);
1489 ++note;
1490 goto after_backslash;
1492 else
1494 /* Skip the replacement character. */
1495 base = ++cur;
1496 BUF_APPEND (&type, 1);
1497 c = type;
1498 goto check_c;
1501 else
1502 abort ();
1503 break;
/* Fetch the next character.  While TEMP_BUFFER_LEN is below 17 it is
   also recorded in TEMP_BUFFER, which the prefix and suffix phases
   below compare against.  */
1506 c = *cur++;
1507 if (__builtin_expect (temp_buffer_len < 17, 0))
1508 temp_buffer[temp_buffer_len++] = c;
1510 check_c:
1511 if (phase == RAW_STR_PREFIX)
1513 while (raw_prefix_len < temp_buffer_len)
1515 raw_prefix[raw_prefix_len] = temp_buffer[raw_prefix_len];
1516 switch (raw_prefix[raw_prefix_len])
1518 case ' ': case '(': case ')': case '\\': case '\t':
1519 case '\v': case '\f': case '\n': default:
1520 break;
1521 /* Basic source charset except the above chars. */
1522 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
1523 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
1524 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
1525 case 's': case 't': case 'u': case 'v': case 'w': case 'x':
1526 case 'y': case 'z':
1527 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
1528 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
1529 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
1530 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
1531 case 'Y': case 'Z':
1532 case '0': case '1': case '2': case '3': case '4': case '5':
1533 case '6': case '7': case '8': case '9':
1534 case '_': case '{': case '}': case '#': case '[': case ']':
1535 case '<': case '>': case '%': case ':': case ';': case '.':
1536 case '?': case '*': case '+': case '-': case '/': case '^':
1537 case '&': case '|': case '~': case '!': case '=': case ',':
1538 case '"': case '\'':
1539 if (raw_prefix_len < 16)
1541 raw_prefix_len++;
1542 continue;
1544 break;
/* The delimiter must be closed by '('; anything else is diagnosed and
   only the characters before the opening quote are kept as a
   CPP_OTHER token.  */
1547 if (raw_prefix[raw_prefix_len] != '(')
1549 int col = CPP_BUF_COLUMN (pfile->buffer, cur) + 1;
1550 if (raw_prefix_len == 16)
1551 cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
1552 col, "raw string delimiter longer "
1553 "than 16 characters");
1554 else if (raw_prefix[raw_prefix_len] == '\n')
1555 cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
1556 col, "invalid new-line in raw "
1557 "string delimiter");
1558 else
1559 cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
1560 col, "invalid character '%c' in "
1561 "raw string delimiter",
1562 (int) raw_prefix[raw_prefix_len]);
1563 pfile->buffer->cur = orig_base + raw_prefix_start - 1;
1564 create_literal (pfile, token, orig_base,
1565 raw_prefix_start - 1, CPP_OTHER);
1566 if (first_buff)
1567 _cpp_release_buff (pfile, first_buff);
1568 return;
/* Store '"' after the delimiter so that the suffix phase can match
   the delimiter and the closing quote in a single comparison.  */
1570 raw_prefix[raw_prefix_len] = '"';
1571 phase = RAW_STR;
1572 /* Nothing should be appended to temp_buffer during
1573 RAW_STR phase. */
1574 temp_buffer_len = 17;
1575 break;
1577 continue;
1579 else if (phase == RAW_STR_SUFFIX)
1581 while (raw_suffix_len <= raw_prefix_len
1582 && raw_suffix_len < temp_buffer_len
1583 && temp_buffer[raw_suffix_len] == raw_prefix[raw_suffix_len])
1584 raw_suffix_len++;
1585 if (raw_suffix_len > raw_prefix_len)
1586 break;
1587 if (raw_suffix_len == temp_buffer_len)
1588 continue;
1589 phase = RAW_STR;
1590 /* Nothing should be appended to temp_buffer during
1591 RAW_STR phase. */
1592 temp_buffer_len = 17;
/* A ')' may begin the closing delimiter: restart suffix matching
   with an empty TEMP_BUFFER.  */
1594 if (c == ')')
1596 phase = RAW_STR_SUFFIX;
1597 raw_suffix_len = 0;
1598 temp_buffer_len = 0;
1600 else if (c == '\n')
1602 if (pfile->state.in_directive
1603 || (pfile->state.parsing_args
1604 && pfile->buffer->next_line >= pfile->buffer->rlimit))
1606 cur--;
1607 type = CPP_OTHER;
1608 cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, 0,
1609 "unterminated raw string");
1610 break;
/* Flush what has been scanned so far, then try to continue on the
   next source line.  */
1613 BUF_APPEND (base, cur - base);
1615 if (pfile->buffer->cur < pfile->buffer->rlimit)
1616 CPP_INCREMENT_LINE (pfile, 0);
1617 pfile->buffer->need_line = true;
1619 pfile->buffer->cur = cur-1;
1620 _cpp_process_line_notes (pfile, false);
1621 if (!_cpp_get_fresh_line (pfile))
1623 source_location src_loc = token->src_loc;
1624 token->type = CPP_EOF;
1625 /* Tell the compiler the line number of the EOF token. */
1626 token->src_loc = pfile->line_table->highest_line;
1627 token->flags = BOL;
1628 if (first_buff != NULL)
1629 _cpp_release_buff (pfile, first_buff);
1630 cpp_error_with_line (pfile, CPP_DL_ERROR, src_loc, 0,
1631 "unterminated raw string");
1632 return;
1635 cur = base = pfile->buffer->cur;
1636 note = &pfile->buffer->notes[pfile->buffer->cur_note];
1640 if (CPP_OPTION (pfile, user_literals))
1642 /* If a string format macro, say from inttypes.h, is placed touching
1643 a string literal it could be parsed as a C++11 user-defined string
1644 literal thus breaking the program.
1645 Try to identify macros with is_macro. A warning is issued. */
1646 if (is_macro (pfile, cur))
1648 /* Raise a warning, but do not consume subsequent tokens. */
1649 if (CPP_OPTION (pfile, warn_literal_suffix))
1650 cpp_warning_with_line (pfile, CPP_W_LITERAL_SUFFIX,
1651 token->src_loc, 0,
1652 "invalid suffix on literal; C++11 requires "
1653 "a space between literal and string macro");
1655 /* Grab user defined literal suffix. */
1656 else if (ISIDST (*cur))
1658 type = cpp_userdef_string_add_type (type);
1659 ++cur;
1661 while (ISIDNUM (*cur))
1662 ++cur;
1666 pfile->buffer->cur = cur;
/* If nothing was spilled into the buffer chain, the spelling is the
   contiguous range [BASE, CUR).  */
1667 if (first_buff == NULL)
1668 create_literal (pfile, token, base, cur - base, type);
1669 else
/* Otherwise concatenate every chained buffer, followed by the
   unflushed tail [BASE, CUR), into one NUL-terminated allocation.  */
1671 uchar *dest = _cpp_unaligned_alloc (pfile, total_len + (cur - base) + 1);
1673 token->type = type;
1674 token->val.str.len = total_len + (cur - base);
1675 token->val.str.text = dest;
1676 last_buff = first_buff;
1677 while (last_buff != NULL)
1679 memcpy (dest, last_buff->base,
1680 BUFF_FRONT (last_buff) - last_buff->base);
1681 dest += BUFF_FRONT (last_buff) - last_buff->base;
1682 last_buff = last_buff->next;
1684 _cpp_release_buff (pfile, first_buff);
1685 memcpy (dest, base, cur - base);
1686 dest[cur - base] = '\0';
1690 /* Lexes a string, character constant, or angle-bracketed header file
1691 name. The stored string contains the spelling, including opening
1692 quote and any leading 'L', 'u', 'U' or 'u8' and optional
1693 'R' modifier. It returns the type of the literal, or CPP_OTHER
1694 if it was not properly terminated, or CPP_LESS for an unterminated
1695 header name which must be relexed as normal tokens.
1697 The spelling is NUL-terminated, but it is not guaranteed that this
1698 is the first NUL since embedded NULs are preserved. */
1699 static void
1700 lex_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
1702 bool saw_NUL = false;
1703 const uchar *cur;
1704 cppchar_t terminator;
1705 enum cpp_ttype type;
1707 cur = base;
1708 terminator = *cur++;
1709 if (terminator == 'L' || terminator == 'U')
1710 terminator = *cur++;
1711 else if (terminator == 'u')
1713 terminator = *cur++;
1714 if (terminator == '8')
1715 terminator = *cur++;
1717 if (terminator == 'R')
1719 lex_raw_string (pfile, token, base, cur);
1720 return;
1722 if (terminator == '"')
1723 type = (*base == 'L' ? CPP_WSTRING :
1724 *base == 'U' ? CPP_STRING32 :
1725 *base == 'u' ? (base[1] == '8' ? CPP_UTF8STRING : CPP_STRING16)
1726 : CPP_STRING);
1727 else if (terminator == '\'')
1728 type = (*base == 'L' ? CPP_WCHAR :
1729 *base == 'U' ? CPP_CHAR32 :
1730 *base == 'u' ? CPP_CHAR16 : CPP_CHAR);
1731 else
1732 terminator = '>', type = CPP_HEADER_NAME;
1734 for (;;)
1736 cppchar_t c = *cur++;
1738 /* In #include-style directives, terminators are not escapable. */
1739 if (c == '\\' && !pfile->state.angled_headers && *cur != '\n')
1740 cur++;
1741 else if (c == terminator)
1742 break;
1743 else if (c == '\n')
1745 cur--;
1746 /* Unmatched quotes always yield undefined behavior, but
1747 greedy lexing means that what appears to be an unterminated
1748 header name may actually be a legitimate sequence of tokens. */
1749 if (terminator == '>')
1751 token->type = CPP_LESS;
1752 return;
1754 type = CPP_OTHER;
1755 break;
1757 else if (c == '\0')
1758 saw_NUL = true;
1761 if (saw_NUL && !pfile->state.skipping)
1762 cpp_error (pfile, CPP_DL_WARNING,
1763 "null character(s) preserved in literal");
1765 if (type == CPP_OTHER && CPP_OPTION (pfile, lang) != CLK_ASM)
1766 cpp_error (pfile, CPP_DL_PEDWARN, "missing terminating %c character",
1767 (int) terminator);
1769 if (CPP_OPTION (pfile, user_literals))
1771 /* If a string format macro, say from inttypes.h, is placed touching
1772 a string literal it could be parsed as a C++11 user-defined string
1773 literal thus breaking the program.
1774 Try to identify macros with is_macro. A warning is issued. */
1775 if (is_macro (pfile, cur))
1777 /* Raise a warning, but do not consume subsequent tokens. */
1778 if (CPP_OPTION (pfile, warn_literal_suffix))
1779 cpp_warning_with_line (pfile, CPP_W_LITERAL_SUFFIX,
1780 token->src_loc, 0,
1781 "invalid suffix on literal; C++11 requires "
1782 "a space between literal and string macro");
1784 /* Grab user defined literal suffix. */
1785 else if (ISIDST (*cur))
1787 type = cpp_userdef_char_add_type (type);
1788 type = cpp_userdef_string_add_type (type);
1789 ++cur;
1791 while (ISIDNUM (*cur))
1792 ++cur;
1796 pfile->buffer->cur = cur;
1797 create_literal (pfile, token, base, cur - base, type);
1800 /* Return the comment table. The client may not make any assumption
1801 about the ordering of the table. */
1802 cpp_comment_table *
1803 cpp_get_comments (cpp_reader *pfile)
1805 return &pfile->comments;
1808 /* Append a comment to the end of the comment table. */
1809 static void
1810 store_comment (cpp_reader *pfile, cpp_token *token)
1812 int len;
1814 if (pfile->comments.allocated == 0)
1816 pfile->comments.allocated = 256;
1817 pfile->comments.entries = (cpp_comment *) xmalloc
1818 (pfile->comments.allocated * sizeof (cpp_comment));
1821 if (pfile->comments.count == pfile->comments.allocated)
1823 pfile->comments.allocated *= 2;
1824 pfile->comments.entries = (cpp_comment *) xrealloc
1825 (pfile->comments.entries,
1826 pfile->comments.allocated * sizeof (cpp_comment));
1829 len = token->val.str.len;
1831 /* Copy comment. Note, token may not be NULL terminated. */
1832 pfile->comments.entries[pfile->comments.count].comment =
1833 (char *) xmalloc (sizeof (char) * (len + 1));
1834 memcpy (pfile->comments.entries[pfile->comments.count].comment,
1835 token->val.str.text, len);
1836 pfile->comments.entries[pfile->comments.count].comment[len] = '\0';
1838 /* Set source location. */
1839 pfile->comments.entries[pfile->comments.count].sloc = token->src_loc;
1841 /* Increment the count of entries in the comment table. */
1842 pfile->comments.count++;
1845 /* The stored comment includes the comment start and any terminator. */
1846 static void
1847 save_comment (cpp_reader *pfile, cpp_token *token, const unsigned char *from,
1848 cppchar_t type)
1850 unsigned char *buffer;
1851 unsigned int len, clen, i;
1853 len = pfile->buffer->cur - from + 1; /* + 1 for the initial '/'. */
1855 /* C++ comments probably (not definitely) have moved past a new
1856 line, which we don't want to save in the comment. */
1857 if (is_vspace (pfile->buffer->cur[-1]))
1858 len--;
1860 /* If we are currently in a directive or in argument parsing, then
1861 we need to store all C++ comments as C comments internally, and
1862 so we need to allocate a little extra space in that case.
1864 Note that the only time we encounter a directive here is
1865 when we are saving comments in a "#define". */
1866 clen = ((pfile->state.in_directive || pfile->state.parsing_args)
1867 && type == '/') ? len + 2 : len;
1869 buffer = _cpp_unaligned_alloc (pfile, clen);
1871 token->type = CPP_COMMENT;
1872 token->val.str.len = clen;
1873 token->val.str.text = buffer;
1875 buffer[0] = '/';
1876 memcpy (buffer + 1, from, len - 1);
1878 /* Finish conversion to a C comment, if necessary. */
1879 if ((pfile->state.in_directive || pfile->state.parsing_args) && type == '/')
1881 buffer[1] = '*';
1882 buffer[clen - 2] = '*';
1883 buffer[clen - 1] = '/';
1884 /* As there can be in a C++ comments illegal sequences for C comments
1885 we need to filter them out. */
1886 for (i = 2; i < (clen - 2); i++)
1887 if (buffer[i] == '/' && (buffer[i - 1] == '*' || buffer[i + 1] == '*'))
1888 buffer[i] = '|';
1891 /* Finally store this comment for use by clients of libcpp. */
1892 store_comment (pfile, token);
1895 /* Allocate COUNT tokens for RUN. */
1896 void
1897 _cpp_init_tokenrun (tokenrun *run, unsigned int count)
1899 run->base = XNEWVEC (cpp_token, count);
1900 run->limit = run->base + count;
1901 run->next = NULL;
1904 /* Returns the next tokenrun, or creates one if there is none. */
1905 static tokenrun *
1906 next_tokenrun (tokenrun *run)
1908 if (run->next == NULL)
1910 run->next = XNEW (tokenrun);
1911 run->next->prev = run;
1912 _cpp_init_tokenrun (run->next, 250);
1915 return run->next;
1918 /* Return the number of not yet processed token in a given
1919 context. */
1921 _cpp_remaining_tokens_num_in_context (cpp_context *context)
1923 if (context->tokens_kind == TOKENS_KIND_DIRECT)
1924 return (LAST (context).token - FIRST (context).token);
1925 else if (context->tokens_kind == TOKENS_KIND_INDIRECT
1926 || context->tokens_kind == TOKENS_KIND_EXTENDED)
1927 return (LAST (context).ptoken - FIRST (context).ptoken);
1928 else
1929 abort ();
1932 /* Returns the token present at index INDEX in a given context. If
1933 INDEX is zero, the next token to be processed is returned. */
1934 static const cpp_token*
1935 _cpp_token_from_context_at (cpp_context *context, int index)
1937 if (context->tokens_kind == TOKENS_KIND_DIRECT)
1938 return &(FIRST (context).token[index]);
1939 else if (context->tokens_kind == TOKENS_KIND_INDIRECT
1940 || context->tokens_kind == TOKENS_KIND_EXTENDED)
1941 return FIRST (context).ptoken[index];
1942 else
1943 abort ();
1946 /* Look ahead in the input stream. */
1947 const cpp_token *
1948 cpp_peek_token (cpp_reader *pfile, int index)
1950 cpp_context *context = pfile->context;
1951 const cpp_token *peektok;
1952 int count;
1954 /* First, scan through any pending cpp_context objects. */
1955 while (context->prev)
1957 ptrdiff_t sz = _cpp_remaining_tokens_num_in_context (context);
1959 if (index < (int) sz)
1960 return _cpp_token_from_context_at (context, index);
1961 index -= (int) sz;
1962 context = context->prev;
1965 /* We will have to read some new tokens after all (and do so
1966 without invalidating preceding tokens). */
1967 count = index;
1968 pfile->keep_tokens++;
1972 peektok = _cpp_lex_token (pfile);
1973 if (peektok->type == CPP_EOF)
1974 return peektok;
1976 while (index--);
1978 _cpp_backup_tokens_direct (pfile, count + 1);
1979 pfile->keep_tokens--;
1981 return peektok;
1984 /* Allocate a single token that is invalidated at the same time as the
1985 rest of the tokens on the line. Has its line and col set to the
1986 same as the last lexed token, so that diagnostics appear in the
1987 right place. */
1988 cpp_token *
1989 _cpp_temp_token (cpp_reader *pfile)
1991 cpp_token *old, *result;
1992 ptrdiff_t sz = pfile->cur_run->limit - pfile->cur_token;
1993 ptrdiff_t la = (ptrdiff_t) pfile->lookaheads;
1995 old = pfile->cur_token - 1;
1996 /* Any pre-existing lookaheads must not be clobbered. */
1997 if (la)
1999 if (sz <= la)
2001 tokenrun *next = next_tokenrun (pfile->cur_run);
2003 if (sz < la)
2004 memmove (next->base + 1, next->base,
2005 (la - sz) * sizeof (cpp_token));
2007 next->base[0] = pfile->cur_run->limit[-1];
2010 if (sz > 1)
2011 memmove (pfile->cur_token + 1, pfile->cur_token,
2012 MIN (la, sz - 1) * sizeof (cpp_token));
2015 if (!sz && pfile->cur_token == pfile->cur_run->limit)
2017 pfile->cur_run = next_tokenrun (pfile->cur_run);
2018 pfile->cur_token = pfile->cur_run->base;
2021 result = pfile->cur_token++;
2022 result->src_loc = old->src_loc;
2023 return result;
2026 /* Lex a token into RESULT (external interface). Takes care of issues
2027 like directive handling, token lookahead, multiple include
2028 optimization and skipping. */
2029 const cpp_token *
2030 _cpp_lex_token (cpp_reader *pfile)
2032 cpp_token *result;
2034 for (;;)
2036 if (pfile->cur_token == pfile->cur_run->limit)
2038 pfile->cur_run = next_tokenrun (pfile->cur_run);
2039 pfile->cur_token = pfile->cur_run->base;
2041 /* We assume that the current token is somewhere in the current
2042 run. */
2043 if (pfile->cur_token < pfile->cur_run->base
2044 || pfile->cur_token >= pfile->cur_run->limit)
2045 abort ();
2047 if (pfile->lookaheads)
2049 pfile->lookaheads--;
2050 result = pfile->cur_token++;
2052 else
2053 result = _cpp_lex_direct (pfile);
2055 if (result->flags & BOL)
2057 /* Is this a directive. If _cpp_handle_directive returns
2058 false, it is an assembler #. */
2059 if (result->type == CPP_HASH
2060 /* 6.10.3 p 11: Directives in a list of macro arguments
2061 gives undefined behavior. This implementation
2062 handles the directive as normal. */
2063 && pfile->state.parsing_args != 1)
2065 if (_cpp_handle_directive (pfile, result->flags & PREV_WHITE))
2067 if (pfile->directive_result.type == CPP_PADDING)
2068 continue;
2069 result = &pfile->directive_result;
2072 else if (pfile->state.in_deferred_pragma)
2073 result = &pfile->directive_result;
2075 if (pfile->cb.line_change && !pfile->state.skipping)
2076 pfile->cb.line_change (pfile, result, pfile->state.parsing_args);
2079 /* We don't skip tokens in directives. */
2080 if (pfile->state.in_directive || pfile->state.in_deferred_pragma)
2081 break;
2083 /* Outside a directive, invalidate controlling macros. At file
2084 EOF, _cpp_lex_direct takes care of popping the buffer, so we never
2085 get here and MI optimization works. */
2086 pfile->mi_valid = false;
2088 if (!pfile->state.skipping || result->type == CPP_EOF)
2089 break;
2092 return result;
2095 /* Returns true if a fresh line has been loaded. */
2096 bool
2097 _cpp_get_fresh_line (cpp_reader *pfile)
2099 int return_at_eof;
2101 /* We can't get a new line until we leave the current directive. */
2102 if (pfile->state.in_directive)
2103 return false;
2105 for (;;)
2107 cpp_buffer *buffer = pfile->buffer;
2109 if (!buffer->need_line)
2110 return true;
2112 if (buffer->next_line < buffer->rlimit)
2114 _cpp_clean_line (pfile);
2115 return true;
2118 /* First, get out of parsing arguments state. */
2119 if (pfile->state.parsing_args)
2120 return false;
2122 /* End of buffer. Non-empty files should end in a newline. */
2123 if (buffer->buf != buffer->rlimit
2124 && buffer->next_line > buffer->rlimit
2125 && !buffer->from_stage3)
2127 /* Clip to buffer size. */
2128 buffer->next_line = buffer->rlimit;
2131 return_at_eof = buffer->return_at_eof;
2132 _cpp_pop_buffer (pfile);
2133 if (pfile->buffer == NULL || return_at_eof)
2134 return false;
/* Set result->type to ELSE_TYPE, unless the next character in the
   buffer is CHAR, in which case that character is consumed and
   result->type becomes THEN_TYPE.  Relies on variables named `result'
   and `buffer' being in scope at the point of use.  Every parameter
   is parenthesized so that an argument which is itself an expression
   (e.g. a conditional) is compared or assigned as a whole.  */
#define IF_NEXT_IS(CHAR, THEN_TYPE, ELSE_TYPE)          \
  do                                                    \
    {                                                   \
      result->type = (ELSE_TYPE);                       \
      if (*buffer->cur == (CHAR))                       \
        buffer->cur++, result->type = (THEN_TYPE);      \
    }                                                   \
  while (0)
2147 /* Lex a token into pfile->cur_token, which is also incremented, to
2148 get diagnostics pointing to the correct location.
2150 Does not handle issues such as token lookahead, multiple-include
2151 optimization, directives, skipping etc. This function is only
2152 suitable for use by _cpp_lex_token, and in special cases like
2153 lex_expansion_token which doesn't care for any of these issues.
2155 When meeting a newline, returns CPP_EOF if parsing a directive,
2156 otherwise returns to the start of the token buffer if permissible.
2157 Returns the location of the lexed token. */
/* Lex a single token from PFILE's current buffer into the slot at
   pfile->cur_token (which is advanced) and return a pointer to it.
   The body is a label-driven loop: fresh_line fetches a new line when
   one is needed, update_tokens_line / skipped_white re-enter after
   comments and whitespace, and the switch dispatches on the first
   character of the token.
   NOTE(review): in this listing each line carries a numeric prefix and
   brace-only lines are absent, so block nesting has to be read from
   the gaps in that numbering.  */
2158 cpp_token *
2159 _cpp_lex_direct (cpp_reader *pfile)
2161 cppchar_t c;
2162 cpp_buffer *buffer;
2163 const unsigned char *comment_start;
/* Claim the next token slot.  */
2164 cpp_token *result = pfile->cur_token++;
/* Entered for the first token and again after every newline.  */
2166 fresh_line:
2167 result->flags = 0;
2168 buffer = pfile->buffer;
2169 if (buffer->need_line)
/* The end of a deferred pragma's line is returned as a
   CPP_PRAGMA_EOL token before any new line is fetched.  */
2171 if (pfile->state.in_deferred_pragma)
2173 result->type = CPP_PRAGMA_EOL;
2174 pfile->state.in_deferred_pragma = false;
2175 if (!pfile->state.pragma_allow_expansion)
2176 pfile->state.prevent_expansion--;
2177 return result;
/* No further line could be obtained: hand back CPP_EOF.  */
2179 if (!_cpp_get_fresh_line (pfile))
2181 result->type = CPP_EOF;
2182 if (!pfile->state.in_directive)
2184 /* Tell the compiler the line number of the EOF token. */
2185 result->src_loc = pfile->line_table->highest_line;
2186 result->flags = BOL;
2188 return result;
/* Unless tokens are being kept, restart at the beginning of the
   base token run and reuse its storage.  */
2190 if (!pfile->keep_tokens)
2192 pfile->cur_run = &pfile->base_run;
2193 result = pfile->base_run.base;
2194 pfile->cur_token = result + 1;
/* First token of a fresh line.  */
2196 result->flags = BOL;
2197 if (pfile->state.parsing_args == 2)
2198 result->flags |= PREV_WHITE;
2200 buffer = pfile->buffer;
/* Re-entered after a discarded comment.  */
2201 update_tokens_line:
2202 result->src_loc = pfile->line_table->highest_line;
/* Re-entered after horizontal whitespace has been skipped.  */
2204 skipped_white:
/* Handle any line notes recorded at or before the current position,
   except when the buffer has been overlaid.  */
2205 if (buffer->cur >= buffer->notes[buffer->cur_note].pos
2206 && !pfile->overlaid_buffer)
2208 _cpp_process_line_notes (pfile, false);
2209 result->src_loc = pfile->line_table->highest_line;
2211 c = *buffer->cur++;
/* A forced location, when set, overrides the column-based one.  */
2213 if (pfile->forced_token_location_p)
2214 result->src_loc = *pfile->forced_token_location_p;
2215 else
2216 result->src_loc = linemap_position_for_column (pfile->line_table,
2217 CPP_BUF_COLUMN (buffer, buffer->cur));
2219 switch (c)
2221 case ' ': case '\t': case '\f': case '\v': case '\0':
2222 result->flags |= PREV_WHITE;
2223 skip_whitespace (pfile, c);
2224 goto skipped_white;
2226 case '\n':
/* Only count the line if the newline is not the last byte of the
   buffer; either way a new line must be fetched next.  */
2227 if (buffer->cur < buffer->rlimit)
2228 CPP_INCREMENT_LINE (pfile, 0);
2229 buffer->need_line = true;
2230 goto fresh_line;
2232 case '0': case '1': case '2': case '3': case '4':
2233 case '5': case '6': case '7': case '8': case '9':
2235 struct normalize_state nst = INITIAL_NORMALIZE_STATE;
2236 result->type = CPP_NUMBER;
2237 lex_number (pfile, &result->val.str, &nst);
2238 warn_about_normalization (pfile, result, &nst);
2239 break;
2242 case 'L':
2243 case 'u':
2244 case 'U':
2245 case 'R':
2246 /* 'L', 'u', 'U', 'u8' or 'R' may introduce wide characters,
2247 wide strings or raw strings. */
2248 if (c == 'L' || CPP_OPTION (pfile, rliterals)
2249 || (c != 'R' && CPP_OPTION (pfile, uliterals)))
2251 if ((*buffer->cur == '\'' && c != 'R')
2252 || *buffer->cur == '"'
2253 || (*buffer->cur == 'R'
2254 && c != 'R'
2255 && buffer->cur[1] == '"'
2256 && CPP_OPTION (pfile, rliterals))
2257 || (*buffer->cur == '8'
2258 && c == 'u'
2259 && (buffer->cur[1] == '"'
2260 || (buffer->cur[1] == 'R' && buffer->cur[2] == '"'
2261 && CPP_OPTION (pfile, rliterals)))))
2263 lex_string (pfile, result, buffer->cur - 1);
2264 break;
2267 /* Fall through. */
/* Identifier start characters.  'L', 'u', 'U' and 'R' are not listed
   below because they are handled above and fall through to here when
   they do not introduce a literal.  */
2269 case '_':
2270 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
2271 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
2272 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
2273 case 's': case 't': case 'v': case 'w': case 'x':
2274 case 'y': case 'z':
2275 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
2276 case 'G': case 'H': case 'I': case 'J': case 'K':
2277 case 'M': case 'N': case 'O': case 'P': case 'Q':
2278 case 'S': case 'T': case 'V': case 'W': case 'X':
2279 case 'Y': case 'Z':
2280 result->type = CPP_NAME;
2282 struct normalize_state nst = INITIAL_NORMALIZE_STATE;
2283 result->val.node.node = lex_identifier (pfile, buffer->cur - 1, false,
2284 &nst);
2285 warn_about_normalization (pfile, result, &nst);
2288 /* Convert named operators to their proper types. */
2289 if (result->val.node.node->flags & NODE_OPERATOR)
2291 result->flags |= NAMED_OP;
2292 result->type = (enum cpp_ttype) result->val.node.node->directive_index;
2294 break;
2296 case '\'':
2297 case '"':
2298 lex_string (pfile, result, buffer->cur - 1);
2299 break;
2301 case '/':
2302 /* A potential block or line comment. */
2303 comment_start = buffer->cur;
2304 c = *buffer->cur;
2306 if (c == '*')
2308 if (_cpp_skip_block_comment (pfile))
2309 cpp_error (pfile, CPP_DL_ERROR, "unterminated comment");
2311 else if (c == '/' && (CPP_OPTION (pfile, cplusplus_comments)
2312 || cpp_in_system_header (pfile)))
2314 /* Warn about comments only if pedantically GNUC89, and not
2315 in system headers. */
2316 if (CPP_OPTION (pfile, lang) == CLK_GNUC89 && CPP_PEDANTIC (pfile)
2317 && ! buffer->warned_cplusplus_comments)
2319 cpp_error (pfile, CPP_DL_PEDWARN,
2320 "C++ style comments are not allowed in ISO C90");
2321 cpp_error (pfile, CPP_DL_PEDWARN,
2322 "(this will be reported only once per input file)");
2323 buffer->warned_cplusplus_comments = 1;
2326 if (skip_line_comment (pfile) && CPP_OPTION (pfile, warn_comments))
2327 cpp_warning (pfile, CPP_W_COMMENTS, "multi-line comment");
2329 else if (c == '=')
2331 buffer->cur++;
2332 result->type = CPP_DIV_EQ;
2333 break;
2335 else
2337 result->type = CPP_DIV;
2338 break;
/* Only the two comment branches reach this point.  A comment that
   is not being saved is treated as whitespace before the token that
   follows it, which is lexed into the same slot.  */
2341 if (!pfile->state.save_comments)
2343 result->flags |= PREV_WHITE;
2344 goto update_tokens_line;
2347 /* Save the comment as a token in its own right. */
2348 save_comment (pfile, result, comment_start, c);
2349 break;
2351 case '<':
/* When angled headers are expected, try lex_string first; any
   resulting type other than CPP_LESS ends the token here.  */
2352 if (pfile->state.angled_headers)
2354 lex_string (pfile, result, buffer->cur - 1);
2355 if (result->type != CPP_LESS)
2356 break;
2359 result->type = CPP_LESS;
2360 if (*buffer->cur == '=')
2361 buffer->cur++, result->type = CPP_LESS_EQ;
2362 else if (*buffer->cur == '<')
2364 buffer->cur++;
2365 IF_NEXT_IS ('=', CPP_LSHIFT_EQ, CPP_LSHIFT);
2367 else if (CPP_OPTION (pfile, digraphs))
2369 if (*buffer->cur == ':')
2371 /* C++11 [2.5/3 lex.pptoken], "Otherwise, if the next
2372 three characters are <:: and the subsequent character
2373 is neither : nor >, the < is treated as a preprocessor
2374 token by itself". */
2375 if (CPP_OPTION (pfile, cplusplus)
2376 && CPP_OPTION (pfile, lang) != CLK_CXX98
2377 && CPP_OPTION (pfile, lang) != CLK_GNUCXX
2378 && buffer->cur[1] == ':'
2379 && buffer->cur[2] != ':' && buffer->cur[2] != '>')
2380 break;
2382 buffer->cur++;
2383 result->flags |= DIGRAPH;
2384 result->type = CPP_OPEN_SQUARE;
2386 else if (*buffer->cur == '%')
2388 buffer->cur++;
2389 result->flags |= DIGRAPH;
2390 result->type = CPP_OPEN_BRACE;
2393 break;
2395 case '>':
2396 result->type = CPP_GREATER;
2397 if (*buffer->cur == '=')
2398 buffer->cur++, result->type = CPP_GREATER_EQ;
2399 else if (*buffer->cur == '>')
2401 buffer->cur++;
2402 IF_NEXT_IS ('=', CPP_RSHIFT_EQ, CPP_RSHIFT);
2404 break;
2406 case '%':
2407 result->type = CPP_MOD;
2408 if (*buffer->cur == '=')
2409 buffer->cur++, result->type = CPP_MOD_EQ;
2410 else if (CPP_OPTION (pfile, digraphs))
2412 if (*buffer->cur == ':')
2414 buffer->cur++;
2415 result->flags |= DIGRAPH;
2416 result->type = CPP_HASH;
2417 if (*buffer->cur == '%' && buffer->cur[1] == ':')
2418 buffer->cur += 2, result->type = CPP_PASTE, result->val.token_no = 0;
2420 else if (*buffer->cur == '>')
2422 buffer->cur++;
2423 result->flags |= DIGRAPH;
2424 result->type = CPP_CLOSE_BRACE;
2427 break;
2429 case '.':
2430 result->type = CPP_DOT;
2431 if (ISDIGIT (*buffer->cur))
2433 struct normalize_state nst = INITIAL_NORMALIZE_STATE;
2434 result->type = CPP_NUMBER;
2435 lex_number (pfile, &result->val.str, &nst);
2436 warn_about_normalization (pfile, result, &nst);
2438 else if (*buffer->cur == '.' && buffer->cur[1] == '.')
2439 buffer->cur += 2, result->type = CPP_ELLIPSIS;
2440 else if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
2441 buffer->cur++, result->type = CPP_DOT_STAR;
2442 break;
2444 case '+':
2445 result->type = CPP_PLUS;
2446 if (*buffer->cur == '+')
2447 buffer->cur++, result->type = CPP_PLUS_PLUS;
2448 else if (*buffer->cur == '=')
2449 buffer->cur++, result->type = CPP_PLUS_EQ;
2450 break;
2452 case '-':
2453 result->type = CPP_MINUS;
2454 if (*buffer->cur == '>')
2456 buffer->cur++;
2457 result->type = CPP_DEREF;
2458 if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
2459 buffer->cur++, result->type = CPP_DEREF_STAR;
2461 else if (*buffer->cur == '-')
2462 buffer->cur++, result->type = CPP_MINUS_MINUS;
2463 else if (*buffer->cur == '=')
2464 buffer->cur++, result->type = CPP_MINUS_EQ;
2465 break;
2467 case '&':
2468 result->type = CPP_AND;
2469 if (*buffer->cur == '&')
2470 buffer->cur++, result->type = CPP_AND_AND;
2471 else if (*buffer->cur == '=')
2472 buffer->cur++, result->type = CPP_AND_EQ;
2473 break;
2475 case '|':
2476 result->type = CPP_OR;
2477 if (*buffer->cur == '|')
2478 buffer->cur++, result->type = CPP_OR_OR;
2479 else if (*buffer->cur == '=')
2480 buffer->cur++, result->type = CPP_OR_EQ;
2481 break;
2483 case ':':
2484 result->type = CPP_COLON;
2485 if (*buffer->cur == ':' && CPP_OPTION (pfile, cplusplus))
2486 buffer->cur++, result->type = CPP_SCOPE;
2487 else if (*buffer->cur == '>' && CPP_OPTION (pfile, digraphs))
2489 buffer->cur++;
2490 result->flags |= DIGRAPH;
2491 result->type = CPP_CLOSE_SQUARE;
2493 break;
2495 case '*': IF_NEXT_IS ('=', CPP_MULT_EQ, CPP_MULT); break;
2496 case '=': IF_NEXT_IS ('=', CPP_EQ_EQ, CPP_EQ); break;
2497 case '!': IF_NEXT_IS ('=', CPP_NOT_EQ, CPP_NOT); break;
2498 case '^': IF_NEXT_IS ('=', CPP_XOR_EQ, CPP_XOR); break;
2499 case '#': IF_NEXT_IS ('#', CPP_PASTE, CPP_HASH); result->val.token_no = 0; break;
2501 case '?': result->type = CPP_QUERY; break;
2502 case '~': result->type = CPP_COMPL; break;
2503 case ',': result->type = CPP_COMMA; break;
2504 case '(': result->type = CPP_OPEN_PAREN; break;
2505 case ')': result->type = CPP_CLOSE_PAREN; break;
2506 case '[': result->type = CPP_OPEN_SQUARE; break;
2507 case ']': result->type = CPP_CLOSE_SQUARE; break;
2508 case '{': result->type = CPP_OPEN_BRACE; break;
2509 case '}': result->type = CPP_CLOSE_BRACE; break;
2510 case ';': result->type = CPP_SEMICOLON; break;
2512 /* @ is a punctuator in Objective-C. */
2513 case '@': result->type = CPP_ATSIGN; break;
/* '$' and '\\' begin an identifier only when forms_identifier_p
   accepts them.  Otherwise the character is consumed again and
   control falls into the default case, which makes it a one-byte
   CPP_OTHER literal.  */
2515 case '$':
2516 case '\\':
2518 const uchar *base = --buffer->cur;
2519 struct normalize_state nst = INITIAL_NORMALIZE_STATE;
2521 if (forms_identifier_p (pfile, true, &nst))
2523 result->type = CPP_NAME;
2524 result->val.node.node = lex_identifier (pfile, base, true, &nst);
2525 warn_about_normalization (pfile, result, &nst);
2526 break;
2528 buffer->cur++;
2531 default:
2532 create_literal (pfile, result, buffer->cur - 1, 1, CPP_OTHER);
2533 break;
2536 return result;
2539 /* An upper bound on the number of bytes needed to spell TOKEN.
2540 Does not include preceding whitespace. */
2541 unsigned int
2542 cpp_token_len (const cpp_token *token)
2544 unsigned int len;
2546 switch (TOKEN_SPELL (token))
2548 default: len = 6; break;
2549 case SPELL_LITERAL: len = token->val.str.len; break;
2550 case SPELL_IDENT: len = NODE_LEN (token->val.node.node) * 10; break;
2553 return len;
/* Decode the UTF-8 sequence that starts at NAME and store it in BUFFER
   as a backslash, 'U' and eight lowercase hexadecimal digits.  Exactly
   10 bytes are written and no terminating NUL is added.

   Returns the number of bytes consumed from NAME, which is the number
   of leading one bits in the first byte.  Calls abort () if any byte
   after the first is not of the form 10xxxxxx.  */

static size_t
utf8_to_ucn (unsigned char *buffer, const unsigned char *name)
{
  static const char hex_digits[] = "0123456789abcdef";
  int seq_len = 0;
  int consumed;
  int shift;
  unsigned lead;
  unsigned long utf32;

  /* Each leading one bit of the first byte stands for one byte of the
     sequence.  */
  for (lead = *name; lead & 0x80; lead <<= 1)
    seq_len++;

  /* The payload of the first byte is whatever lies below its length
     marker; each following byte contributes its low six bits.  */
  utf32 = *name & (0x7F >> seq_len);
  for (consumed = 1; consumed < seq_len; consumed++)
    {
      utf32 = (utf32 << 6) | (*++name & 0x3F);

      /* Ill-formed UTF-8.  */
      if ((*name & ~0x3F) != 0x80)
        abort ();
    }

  *buffer++ = '\\';
  *buffer++ = 'U';
  for (shift = 28; shift >= 0; shift -= 4)
    *buffer++ = hex_digits[(utf32 >> shift) & 0xF];
  return seq_len;
}
2590 /* Given a token TYPE corresponding to a digraph, return a pointer to
2591 the spelling of the digraph. */
2592 static const unsigned char *
2593 cpp_digraph2name (enum cpp_ttype type)
2595 return digraph_spellings[(int) type - (int) CPP_FIRST_DIGRAPH];
2598 /* Write the spelling of a token TOKEN to BUFFER. The buffer must
2599 already contain the enough space to hold the token's spelling.
2600 Returns a pointer to the character after the last character written.
2601 FORSTRING is true if this is to be the spelling after translation
2602 phase 1 (this is different for UCNs).
2603 FIXME: Would be nice if we didn't need the PFILE argument. */
2604 unsigned char *
2605 cpp_spell_token (cpp_reader *pfile, const cpp_token *token,
2606 unsigned char *buffer, bool forstring)
2608 switch (TOKEN_SPELL (token))
2610 case SPELL_OPERATOR:
2612 const unsigned char *spelling;
2613 unsigned char c;
2615 if (token->flags & DIGRAPH)
2616 spelling = cpp_digraph2name (token->type);
2617 else if (token->flags & NAMED_OP)
2618 goto spell_ident;
2619 else
2620 spelling = TOKEN_NAME (token);
2622 while ((c = *spelling++) != '\0')
2623 *buffer++ = c;
2625 break;
2627 spell_ident:
2628 case SPELL_IDENT:
2629 if (forstring)
2631 memcpy (buffer, NODE_NAME (token->val.node.node),
2632 NODE_LEN (token->val.node.node));
2633 buffer += NODE_LEN (token->val.node.node);
2635 else
2637 size_t i;
2638 const unsigned char * name = NODE_NAME (token->val.node.node);
2640 for (i = 0; i < NODE_LEN (token->val.node.node); i++)
2641 if (name[i] & ~0x7F)
2643 i += utf8_to_ucn (buffer, name + i) - 1;
2644 buffer += 10;
2646 else
2647 *buffer++ = NODE_NAME (token->val.node.node)[i];
2649 break;
2651 case SPELL_LITERAL:
2652 memcpy (buffer, token->val.str.text, token->val.str.len);
2653 buffer += token->val.str.len;
2654 break;
2656 case SPELL_NONE:
2657 cpp_error (pfile, CPP_DL_ICE,
2658 "unspellable token %s", TOKEN_NAME (token));
2659 break;
2662 return buffer;
2665 /* Returns TOKEN spelt as a null-terminated string. The string is
2666 freed when the reader is destroyed. Useful for diagnostics. */
2667 unsigned char *
2668 cpp_token_as_text (cpp_reader *pfile, const cpp_token *token)
2670 unsigned int len = cpp_token_len (token) + 1;
2671 unsigned char *start = _cpp_unaligned_alloc (pfile, len), *end;
2673 end = cpp_spell_token (pfile, token, start, false);
2674 end[0] = '\0';
2676 return start;
2679 /* Returns a pointer to a string which spells the token defined by
2680 TYPE and FLAGS. Used by C front ends, which really should move to
2681 using cpp_token_as_text. */
2682 const char *
2683 cpp_type2name (enum cpp_ttype type, unsigned char flags)
2685 if (flags & DIGRAPH)
2686 return (const char *) cpp_digraph2name (type);
2687 else if (flags & NAMED_OP)
2688 return cpp_named_operator2name (type);
2690 return (const char *) token_spellings[type].name;
2693 /* Writes the spelling of token to FP, without any preceding space.
2694 Separated from cpp_spell_token for efficiency - to avoid stdio
2695 double-buffering. */
2696 void
2697 cpp_output_token (const cpp_token *token, FILE *fp)
2699 switch (TOKEN_SPELL (token))
2701 case SPELL_OPERATOR:
2703 const unsigned char *spelling;
2704 int c;
2706 if (token->flags & DIGRAPH)
2707 spelling = cpp_digraph2name (token->type);
2708 else if (token->flags & NAMED_OP)
2709 goto spell_ident;
2710 else
2711 spelling = TOKEN_NAME (token);
2713 c = *spelling;
2715 putc (c, fp);
2716 while ((c = *++spelling) != '\0');
2718 break;
2720 spell_ident:
2721 case SPELL_IDENT:
2723 size_t i;
2724 const unsigned char * name = NODE_NAME (token->val.node.node);
2726 for (i = 0; i < NODE_LEN (token->val.node.node); i++)
2727 if (name[i] & ~0x7F)
2729 unsigned char buffer[10];
2730 i += utf8_to_ucn (buffer, name + i) - 1;
2731 fwrite (buffer, 1, 10, fp);
2733 else
2734 fputc (NODE_NAME (token->val.node.node)[i], fp);
2736 break;
2738 case SPELL_LITERAL:
2739 fwrite (token->val.str.text, 1, token->val.str.len, fp);
2740 break;
2742 case SPELL_NONE:
2743 /* An error, most probably. */
2744 break;
2748 /* Compare two tokens. */
2750 _cpp_equiv_tokens (const cpp_token *a, const cpp_token *b)
2752 if (a->type == b->type && a->flags == b->flags)
2753 switch (TOKEN_SPELL (a))
2755 default: /* Keep compiler happy. */
2756 case SPELL_OPERATOR:
2757 /* token_no is used to track where multiple consecutive ##
2758 tokens were originally located. */
2759 return (a->type != CPP_PASTE || a->val.token_no == b->val.token_no);
2760 case SPELL_NONE:
2761 return (a->type != CPP_MACRO_ARG
2762 || a->val.macro_arg.arg_no == b->val.macro_arg.arg_no);
2763 case SPELL_IDENT:
2764 return a->val.node.node == b->val.node.node;
2765 case SPELL_LITERAL:
2766 return (a->val.str.len == b->val.str.len
2767 && !memcmp (a->val.str.text, b->val.str.text,
2768 a->val.str.len));
2771 return 0;
2774 /* Returns nonzero if a space should be inserted to avoid an
2775 accidental token paste for output. For simplicity, it is
2776 conservative, and occasionally advises a space where one is not
2777 needed, e.g. "." and ".2". */
2779 cpp_avoid_paste (cpp_reader *pfile, const cpp_token *token1,
2780 const cpp_token *token2)
2782 enum cpp_ttype a = token1->type, b = token2->type;
2783 cppchar_t c;
2785 if (token1->flags & NAMED_OP)
2786 a = CPP_NAME;
2787 if (token2->flags & NAMED_OP)
2788 b = CPP_NAME;
2790 c = EOF;
2791 if (token2->flags & DIGRAPH)
2792 c = digraph_spellings[(int) b - (int) CPP_FIRST_DIGRAPH][0];
2793 else if (token_spellings[b].category == SPELL_OPERATOR)
2794 c = token_spellings[b].name[0];
2796 /* Quickly get everything that can paste with an '='. */
2797 if ((int) a <= (int) CPP_LAST_EQ && c == '=')
2798 return 1;
2800 switch (a)
2802 case CPP_GREATER: return c == '>';
2803 case CPP_LESS: return c == '<' || c == '%' || c == ':';
2804 case CPP_PLUS: return c == '+';
2805 case CPP_MINUS: return c == '-' || c == '>';
2806 case CPP_DIV: return c == '/' || c == '*'; /* Comments. */
2807 case CPP_MOD: return c == ':' || c == '>';
2808 case CPP_AND: return c == '&';
2809 case CPP_OR: return c == '|';
2810 case CPP_COLON: return c == ':' || c == '>';
2811 case CPP_DEREF: return c == '*';
2812 case CPP_DOT: return c == '.' || c == '%' || b == CPP_NUMBER;
2813 case CPP_HASH: return c == '#' || c == '%'; /* Digraph form. */
2814 case CPP_NAME: return ((b == CPP_NUMBER
2815 && name_p (pfile, &token2->val.str))
2816 || b == CPP_NAME
2817 || b == CPP_CHAR || b == CPP_STRING); /* L */
2818 case CPP_NUMBER: return (b == CPP_NUMBER || b == CPP_NAME
2819 || c == '.' || c == '+' || c == '-');
2820 /* UCNs */
2821 case CPP_OTHER: return ((token1->val.str.text[0] == '\\'
2822 && b == CPP_NAME)
2823 || (CPP_OPTION (pfile, objc)
2824 && token1->val.str.text[0] == '@'
2825 && (b == CPP_NAME || b == CPP_STRING)));
2826 case CPP_STRING:
2827 case CPP_WSTRING:
2828 case CPP_UTF8STRING:
2829 case CPP_STRING16:
2830 case CPP_STRING32: return (CPP_OPTION (pfile, user_literals)
2831 && (b == CPP_NAME
2832 || (TOKEN_SPELL (token2) == SPELL_LITERAL
2833 && ISIDST (token2->val.str.text[0]))));
2835 default: break;
2838 return 0;
2841 /* Output all the remaining tokens on the current line, and a newline
2842 character, to FP. Leading whitespace is removed. If there are
2843 macros, special token padding is not performed. */
2844 void
2845 cpp_output_line (cpp_reader *pfile, FILE *fp)
2847 const cpp_token *token;
2849 token = cpp_get_token (pfile);
2850 while (token->type != CPP_EOF)
2852 cpp_output_token (token, fp);
2853 token = cpp_get_token (pfile);
2854 if (token->flags & PREV_WHITE)
2855 putc (' ', fp);
2858 putc ('\n', fp);
2861 /* Return a string representation of all the remaining tokens on the
2862 current line. The result is allocated using xmalloc and must be
2863 freed by the caller. */
2864 unsigned char *
2865 cpp_output_line_to_string (cpp_reader *pfile, const unsigned char *dir_name)
2867 const cpp_token *token;
2868 unsigned int out = dir_name ? ustrlen (dir_name) : 0;
2869 unsigned int alloced = 120 + out;
2870 unsigned char *result = (unsigned char *) xmalloc (alloced);
2872 /* If DIR_NAME is empty, there are no initial contents. */
2873 if (dir_name)
2875 sprintf ((char *) result, "#%s ", dir_name);
2876 out += 2;
2879 token = cpp_get_token (pfile);
2880 while (token->type != CPP_EOF)
2882 unsigned char *last;
2883 /* Include room for a possible space and the terminating nul. */
2884 unsigned int len = cpp_token_len (token) + 2;
2886 if (out + len > alloced)
2888 alloced *= 2;
2889 if (out + len > alloced)
2890 alloced = out + len;
2891 result = (unsigned char *) xrealloc (result, alloced);
2894 last = cpp_spell_token (pfile, token, &result[out], 0);
2895 out = last - result;
2897 token = cpp_get_token (pfile);
2898 if (token->flags & PREV_WHITE)
2899 result[out++] = ' ';
2902 result[out] = '\0';
2903 return result;
/* Memory buffers.  Changing these three constants can have a dramatic
   effect on performance.  The values here are reasonable defaults,
   but might be tuned.  If you adjust them, be sure to test across a
   range of uses of cpplib, including heavy nested function-like macro
   expansion.  Also check the change in peak memory usage (NJAMD is a
   good tool for this).

   MIN_BUFF_SIZE is the smallest size new_buff will allocate.
   BUFF_SIZE_UPPER_BOUND (MIN_SIZE) is the largest free buffer that
   _cpp_get_buff will hand out for a request of MIN_SIZE bytes.
   EXTENDED_BUFF_SIZE (BUFF, MIN_EXTRA) is the size requested when
   BUFF is extended: MIN_EXTRA plus twice the bytes between BUFF's
   cur and limit.  Every macro argument is parenthesized so that any
   expression can be passed.  */
#define MIN_BUFF_SIZE 8000
#define BUFF_SIZE_UPPER_BOUND(MIN_SIZE) (MIN_BUFF_SIZE + (MIN_SIZE) * 3 / 2)
#define EXTENDED_BUFF_SIZE(BUFF, MIN_EXTRA) \
  ((MIN_EXTRA) + ((BUFF)->limit - (BUFF)->cur) * 2)

#if MIN_BUFF_SIZE > BUFF_SIZE_UPPER_BOUND (0)
#error BUFF_SIZE_UPPER_BOUND must be at least as large as MIN_BUFF_SIZE!
#endif
2921 /* Create a new allocation buffer. Place the control block at the end
2922 of the buffer, so that buffer overflows will cause immediate chaos. */
2923 static _cpp_buff *
2924 new_buff (size_t len)
2926 _cpp_buff *result;
2927 unsigned char *base;
2929 if (len < MIN_BUFF_SIZE)
2930 len = MIN_BUFF_SIZE;
2931 len = CPP_ALIGN (len);
2933 #ifdef ENABLE_VALGRIND_CHECKING
2934 /* Valgrind warns about uses of interior pointers, so put _cpp_buff
2935 struct first. */
2936 size_t slen = CPP_ALIGN2 (sizeof (_cpp_buff), 2 * DEFAULT_ALIGNMENT);
2937 base = XNEWVEC (unsigned char, len + slen);
2938 result = (_cpp_buff *) base;
2939 base += slen;
2940 #else
2941 base = XNEWVEC (unsigned char, len + sizeof (_cpp_buff));
2942 result = (_cpp_buff *) (base + len);
2943 #endif
2944 result->base = base;
2945 result->cur = base;
2946 result->limit = base + len;
2947 result->next = NULL;
2948 return result;
2951 /* Place a chain of unwanted allocation buffers on the free list. */
2952 void
2953 _cpp_release_buff (cpp_reader *pfile, _cpp_buff *buff)
2955 _cpp_buff *end = buff;
2957 while (end->next)
2958 end = end->next;
2959 end->next = pfile->free_buffs;
2960 pfile->free_buffs = buff;
2963 /* Return a free buffer of size at least MIN_SIZE. */
2964 _cpp_buff *
2965 _cpp_get_buff (cpp_reader *pfile, size_t min_size)
2967 _cpp_buff *result, **p;
2969 for (p = &pfile->free_buffs;; p = &(*p)->next)
2971 size_t size;
2973 if (*p == NULL)
2974 return new_buff (min_size);
2975 result = *p;
2976 size = result->limit - result->base;
2977 /* Return a buffer that's big enough, but don't waste one that's
2978 way too big. */
2979 if (size >= min_size && size <= BUFF_SIZE_UPPER_BOUND (min_size))
2980 break;
2983 *p = result->next;
2984 result->next = NULL;
2985 result->cur = result->base;
2986 return result;
2989 /* Creates a new buffer with enough space to hold the uncommitted
2990 remaining bytes of BUFF, and at least MIN_EXTRA more bytes. Copies
2991 the excess bytes to the new buffer. Chains the new buffer after
2992 BUFF, and returns the new buffer. */
2993 _cpp_buff *
2994 _cpp_append_extend_buff (cpp_reader *pfile, _cpp_buff *buff, size_t min_extra)
2996 size_t size = EXTENDED_BUFF_SIZE (buff, min_extra);
2997 _cpp_buff *new_buff = _cpp_get_buff (pfile, size);
2999 buff->next = new_buff;
3000 memcpy (new_buff->base, buff->cur, BUFF_ROOM (buff));
3001 return new_buff;
3004 /* Creates a new buffer with enough space to hold the uncommitted
3005 remaining bytes of the buffer pointed to by BUFF, and at least
3006 MIN_EXTRA more bytes. Copies the excess bytes to the new buffer.
3007 Chains the new buffer before the buffer pointed to by BUFF, and
3008 updates the pointer to point to the new buffer. */
3009 void
3010 _cpp_extend_buff (cpp_reader *pfile, _cpp_buff **pbuff, size_t min_extra)
3012 _cpp_buff *new_buff, *old_buff = *pbuff;
3013 size_t size = EXTENDED_BUFF_SIZE (old_buff, min_extra);
3015 new_buff = _cpp_get_buff (pfile, size);
3016 memcpy (new_buff->base, old_buff->cur, BUFF_ROOM (old_buff));
3017 new_buff->next = old_buff;
3018 *pbuff = new_buff;
3021 /* Free a chain of buffers starting at BUFF. */
3022 void
3023 _cpp_free_buff (_cpp_buff *buff)
3025 _cpp_buff *next;
3027 for (; buff; buff = next)
3029 next = buff->next;
3030 #ifdef ENABLE_VALGRIND_CHECKING
3031 free (buff);
3032 #else
3033 free (buff->base);
3034 #endif
3038 /* Allocate permanent, unaligned storage of length LEN. */
3039 unsigned char *
3040 _cpp_unaligned_alloc (cpp_reader *pfile, size_t len)
3042 _cpp_buff *buff = pfile->u_buff;
3043 unsigned char *result = buff->cur;
3045 if (len > (size_t) (buff->limit - result))
3047 buff = _cpp_get_buff (pfile, len);
3048 buff->next = pfile->u_buff;
3049 pfile->u_buff = buff;
3050 result = buff->cur;
3053 buff->cur = result + len;
3054 return result;
3057 /* Allocate permanent, unaligned storage of length LEN from a_buff.
3058 That buffer is used for growing allocations when saving macro
3059 replacement lists in a #define, and when parsing an answer to an
3060 assertion in #assert, #unassert or #if (and therefore possibly
3061 whilst expanding macros). It therefore must not be used by any
3062 code that they might call: specifically the lexer and the guts of
3063 the macro expander.
3065 All existing other uses clearly fit this restriction: storing
3066 registered pragmas during initialization. */
3067 unsigned char *
3068 _cpp_aligned_alloc (cpp_reader *pfile, size_t len)
3070 _cpp_buff *buff = pfile->a_buff;
3071 unsigned char *result = buff->cur;
3073 if (len > (size_t) (buff->limit - result))
3075 buff = _cpp_get_buff (pfile, len);
3076 buff->next = pfile->a_buff;
3077 pfile->a_buff = buff;
3078 result = buff->cur;
3081 buff->cur = result + len;
3082 return result;
3085 /* Say which field of TOK is in use. */
3087 enum cpp_token_fld_kind
3088 cpp_token_val_index (const cpp_token *tok)
3090 switch (TOKEN_SPELL (tok))
3092 case SPELL_IDENT:
3093 return CPP_TOKEN_FLD_NODE;
3094 case SPELL_LITERAL:
3095 return CPP_TOKEN_FLD_STR;
3096 case SPELL_OPERATOR:
3097 if (tok->type == CPP_PASTE)
3098 return CPP_TOKEN_FLD_TOKEN_NO;
3099 else
3100 return CPP_TOKEN_FLD_NONE;
3101 case SPELL_NONE:
3102 if (tok->type == CPP_MACRO_ARG)
3103 return CPP_TOKEN_FLD_ARG_NO;
3104 else if (tok->type == CPP_PADDING)
3105 return CPP_TOKEN_FLD_SOURCE;
3106 else if (tok->type == CPP_PRAGMA)
3107 return CPP_TOKEN_FLD_PRAGMA;
3108 /* else fall through */
3109 default:
3110 return CPP_TOKEN_FLD_NONE;
3114 /* All tokens lexed in R after calling this function will be forced to have
3115 their source_location the same as the location referenced by P, until
3116 cpp_stop_forcing_token_locations is called for R. */
3118 void
3119 cpp_force_token_locations (cpp_reader *r, source_location *p)
3121 r->forced_token_location_p = p;
3124 /* Go back to assigning locations naturally for lexed tokens. */
3126 void
3127 cpp_stop_forcing_token_locations (cpp_reader *r)
3129 r->forced_token_location_p = NULL;