Default to dwarf version 4 on hppa64-hpux
[official-gcc.git] / libcpp / lex.c
blob8e3ef096bbe390ba4d89764810ac1a56c71492e6
1 /* CPP Library - lexical analysis.
2 Copyright (C) 2000-2021 Free Software Foundation, Inc.
3 Contributed by Per Bothner, 1994-95.
4 Based on CCCP program by Paul Rubin, June 1986
5 Adapted to ANSI C, Richard Stallman, Jan 1987
6 Broken out to separate file, Zack Weinberg, Mar 2000
8 This program is free software; you can redistribute it and/or modify it
9 under the terms of the GNU General Public License as published by the
10 Free Software Foundation; either version 3, or (at your option) any
11 later version.
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
18 You should have received a copy of the GNU General Public License
19 along with this program; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "cpplib.h"
25 #include "internal.h"
27 enum spell_type
29 SPELL_OPERATOR = 0,
30 SPELL_IDENT,
31 SPELL_LITERAL,
32 SPELL_NONE
35 struct token_spelling
37 enum spell_type category;
38 const unsigned char *name;
41 static const unsigned char *const digraph_spellings[] =
42 { UC"%:", UC"%:%:", UC"<:", UC":>", UC"<%", UC"%>" };
44 #define OP(e, s) { SPELL_OPERATOR, UC s },
45 #define TK(e, s) { SPELL_ ## s, UC #e },
46 static const struct token_spelling token_spellings[N_TTYPES] = { TTYPE_TABLE };
47 #undef OP
48 #undef TK
50 #define TOKEN_SPELL(token) (token_spellings[(token)->type].category)
51 #define TOKEN_NAME(token) (token_spellings[(token)->type].name)
53 static void add_line_note (cpp_buffer *, const uchar *, unsigned int);
54 static int skip_line_comment (cpp_reader *);
55 static void skip_whitespace (cpp_reader *, cppchar_t);
56 static void lex_string (cpp_reader *, cpp_token *, const uchar *);
57 static void save_comment (cpp_reader *, cpp_token *, const uchar *, cppchar_t);
58 static void store_comment (cpp_reader *, cpp_token *);
59 static void create_literal (cpp_reader *, cpp_token *, const uchar *,
60 unsigned int, enum cpp_ttype);
61 static bool warn_in_comment (cpp_reader *, _cpp_line_note *);
62 static int name_p (cpp_reader *, const cpp_string *);
63 static tokenrun *next_tokenrun (tokenrun *);
65 static _cpp_buff *new_buff (size_t);
68 /* Utility routine:
70 Compares, the token TOKEN to the NUL-terminated string STRING.
71 TOKEN must be a CPP_NAME. Returns 1 for equal, 0 for unequal. */
72 int
73 cpp_ideq (const cpp_token *token, const char *string)
75 if (token->type != CPP_NAME)
76 return 0;
78 return !ustrcmp (NODE_NAME (token->val.node.node), (const uchar *) string);
81 /* Record a note TYPE at byte POS into the current cleaned logical
82 line. */
83 static void
84 add_line_note (cpp_buffer *buffer, const uchar *pos, unsigned int type)
86 if (buffer->notes_used == buffer->notes_cap)
88 buffer->notes_cap = buffer->notes_cap * 2 + 200;
89 buffer->notes = XRESIZEVEC (_cpp_line_note, buffer->notes,
90 buffer->notes_cap);
93 buffer->notes[buffer->notes_used].pos = pos;
94 buffer->notes[buffer->notes_used].type = type;
95 buffer->notes_used++;
99 /* Fast path to find line special characters using optimized character
100 scanning algorithms. Anything complicated falls back to the slow
101 path below. Since this loop is very hot it's worth doing these kinds
102 of optimizations.
104 One of the paths through the ifdefs should provide
106 const uchar *search_line_fast (const uchar *s, const uchar *end);
108 Between S and END, search for \n, \r, \\, ?. Return a pointer to
109 the found character.
111 Note that the last character of the buffer is *always* a newline,
112 as forced by _cpp_convert_input. This fact can be used to avoid
113 explicitly looking for the end of the buffer. */
/* Configure only gives us an ifdef-style test for WORDS_BIGENDIAN.
   Supply a zero value when it is absent so that the macro can be used
   in ordinary C conditions below.  */
#if !defined (WORDS_BIGENDIAN)
# define WORDS_BIGENDIAN 0
#endif

/* We'd like the largest integer that fits into a register.  There's
   nothing in <stdint.h> that gives us that.  For most hosts this is
   unsigned long, but MS decided on an LLP64 model.  Thankfully when
   building with GCC we can ask for the "real" word size directly.  */
#if defined (__GNUC__)
typedef unsigned int word_type __attribute__((__mode__(__word__)));
#else
typedef unsigned long word_type;
#endif

/* The code below only copes with 4- or 8-byte words.  A negative array
   size makes compilation fail if that expectation is violated.  */
typedef char check_word_type_size
  [(sizeof (word_type) == 4 || sizeof (word_type) == 8) ? 1 : -1];
135 /* Return X with the first N bytes forced to values that won't match one
136 of the interesting characters. Note that NUL is not interesting. */
138 static inline word_type
139 acc_char_mask_misalign (word_type val, unsigned int n)
141 word_type mask = -1;
142 if (WORDS_BIGENDIAN)
143 mask >>= n * 8;
144 else
145 mask <<= n * 8;
146 return val & mask;
149 /* Return X replicated to all byte positions within WORD_TYPE. */
151 static inline word_type
152 acc_char_replicate (uchar x)
154 word_type ret;
156 ret = (x << 24) | (x << 16) | (x << 8) | x;
157 if (sizeof(word_type) == 8)
158 ret = (ret << 16 << 16) | ret;
159 return ret;
162 /* Return non-zero if some byte of VAL is (probably) C. */
164 static inline word_type
165 acc_char_cmp (word_type val, word_type c)
167 #if defined(__GNUC__) && defined(__alpha__)
168 /* We can get exact results using a compare-bytes instruction.
169 Get (val == c) via (0 >= (val ^ c)). */
170 return __builtin_alpha_cmpbge (0, val ^ c);
171 #else
172 word_type magic = 0x7efefefeU;
173 if (sizeof(word_type) == 8)
174 magic = (magic << 16 << 16) | 0xfefefefeU;
175 magic |= 1;
177 val ^= c;
178 return ((val + magic) ^ ~val) & ~magic;
179 #endif
182 /* Given the result of acc_char_cmp is non-zero, return the index of
183 the found character. If this was a false positive, return -1. */
185 static inline int
186 acc_char_index (word_type cmp ATTRIBUTE_UNUSED,
187 word_type val ATTRIBUTE_UNUSED)
189 #if defined(__GNUC__) && defined(__alpha__) && !WORDS_BIGENDIAN
190 /* The cmpbge instruction sets *bits* of the result corresponding to
191 matches in the bytes with no false positives. */
192 return __builtin_ctzl (cmp);
193 #else
194 unsigned int i;
196 /* ??? It would be nice to force unrolling here,
197 and have all of these constants folded. */
198 for (i = 0; i < sizeof(word_type); ++i)
200 uchar c;
201 if (WORDS_BIGENDIAN)
202 c = (val >> (sizeof(word_type) - i - 1) * 8) & 0xff;
203 else
204 c = (val >> i * 8) & 0xff;
206 if (c == '\n' || c == '\r' || c == '\\' || c == '?')
207 return i;
210 return -1;
211 #endif
214 /* A version of the fast scanner using bit fiddling techniques.
216 For 32-bit words, one would normally perform 16 comparisons and
217 16 branches. With this algorithm one performs 24 arithmetic
218 operations and one branch. Whether this is faster with a 32-bit
219 word size is going to be somewhat system dependent.
221 For 64-bit words, we eliminate twice the number of comparisons
222 and branches without increasing the number of arithmetic operations.
223 It's almost certainly going to be a win with 64-bit word size. */
225 static const uchar * search_line_acc_char (const uchar *, const uchar *)
226 ATTRIBUTE_UNUSED;
228 static const uchar *
229 search_line_acc_char (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
231 const word_type repl_nl = acc_char_replicate ('\n');
232 const word_type repl_cr = acc_char_replicate ('\r');
233 const word_type repl_bs = acc_char_replicate ('\\');
234 const word_type repl_qm = acc_char_replicate ('?');
236 unsigned int misalign;
237 const word_type *p;
238 word_type val, t;
240 /* Align the buffer. Mask out any bytes from before the beginning. */
241 p = (word_type *)((uintptr_t)s & -sizeof(word_type));
242 val = *p;
243 misalign = (uintptr_t)s & (sizeof(word_type) - 1);
244 if (misalign)
245 val = acc_char_mask_misalign (val, misalign);
247 /* Main loop. */
248 while (1)
250 t = acc_char_cmp (val, repl_nl);
251 t |= acc_char_cmp (val, repl_cr);
252 t |= acc_char_cmp (val, repl_bs);
253 t |= acc_char_cmp (val, repl_qm);
255 if (__builtin_expect (t != 0, 0))
257 int i = acc_char_index (t, val);
258 if (i >= 0)
259 return (const uchar *)p + i;
262 val = *++p;
266 /* Disable on Solaris 2/x86 until the following problem can be properly
267 autoconfed:
269 The Solaris 10+ assembler tags objects with the instruction set
270 extensions used, so SSE4.2 executables cannot run on machines that
271 don't support that extension. */
273 #if (GCC_VERSION >= 4005) && (__GNUC__ >= 5 || !defined(__PIC__)) && (defined(__i386__) || defined(__x86_64__)) && !(defined(__sun__) && defined(__svr4__))
/* Replicated character data to be shared between implementations.
   Recall that outside of a context with vector support we can't
   define compatible vector types, therefore these are all defined
   in terms of raw characters.  Row 0 holds \n, row 1 \r, row 2 \\ and
   row 3 ?, each repeated 16 times.  */
#define REPL16(C) { C, C, C, C, C, C, C, C, C, C, C, C, C, C, C, C }
static const char repl_chars[4][16] __attribute__((aligned(16))) = {
  REPL16 ('\n'),
  REPL16 ('\r'),
  REPL16 ('\\'),
  REPL16 ('?')
};
#undef REPL16
290 /* A version of the fast scanner using MMX vectorized byte compare insns.
292 This uses the PMOVMSKB instruction which was introduced with "MMX2",
293 which was packaged into SSE1; it is also present in the AMD MMX
294 extension. Mark the function as using "sse" so that we emit a real
295 "emms" instruction, rather than the 3dNOW "femms" instruction. */
297 static const uchar *
298 #ifndef __SSE__
299 __attribute__((__target__("sse")))
300 #endif
301 search_line_mmx (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
303 typedef char v8qi __attribute__ ((__vector_size__ (8)));
304 typedef int __m64 __attribute__ ((__vector_size__ (8), __may_alias__));
306 const v8qi repl_nl = *(const v8qi *)repl_chars[0];
307 const v8qi repl_cr = *(const v8qi *)repl_chars[1];
308 const v8qi repl_bs = *(const v8qi *)repl_chars[2];
309 const v8qi repl_qm = *(const v8qi *)repl_chars[3];
311 unsigned int misalign, found, mask;
312 const v8qi *p;
313 v8qi data, t, c;
315 /* Align the source pointer. While MMX doesn't generate unaligned data
316 faults, this allows us to safely scan to the end of the buffer without
317 reading beyond the end of the last page. */
318 misalign = (uintptr_t)s & 7;
319 p = (const v8qi *)((uintptr_t)s & -8);
320 data = *p;
322 /* Create a mask for the bytes that are valid within the first
323 16-byte block. The Idea here is that the AND with the mask
324 within the loop is "free", since we need some AND or TEST
325 insn in order to set the flags for the branch anyway. */
326 mask = -1u << misalign;
328 /* Main loop processing 8 bytes at a time. */
329 goto start;
332 data = *++p;
333 mask = -1;
335 start:
336 t = __builtin_ia32_pcmpeqb(data, repl_nl);
337 c = __builtin_ia32_pcmpeqb(data, repl_cr);
338 t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
339 c = __builtin_ia32_pcmpeqb(data, repl_bs);
340 t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
341 c = __builtin_ia32_pcmpeqb(data, repl_qm);
342 t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
343 found = __builtin_ia32_pmovmskb (t);
344 found &= mask;
346 while (!found);
348 __builtin_ia32_emms ();
350 /* FOUND contains 1 in bits for which we matched a relevant
351 character. Conversion to the byte index is trivial. */
352 found = __builtin_ctz(found);
353 return (const uchar *)p + found;
356 /* A version of the fast scanner using SSE2 vectorized byte compare insns. */
358 static const uchar *
359 #ifndef __SSE2__
360 __attribute__((__target__("sse2")))
361 #endif
362 search_line_sse2 (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
364 typedef char v16qi __attribute__ ((__vector_size__ (16)));
366 const v16qi repl_nl = *(const v16qi *)repl_chars[0];
367 const v16qi repl_cr = *(const v16qi *)repl_chars[1];
368 const v16qi repl_bs = *(const v16qi *)repl_chars[2];
369 const v16qi repl_qm = *(const v16qi *)repl_chars[3];
371 unsigned int misalign, found, mask;
372 const v16qi *p;
373 v16qi data, t;
375 /* Align the source pointer. */
376 misalign = (uintptr_t)s & 15;
377 p = (const v16qi *)((uintptr_t)s & -16);
378 data = *p;
380 /* Create a mask for the bytes that are valid within the first
381 16-byte block. The Idea here is that the AND with the mask
382 within the loop is "free", since we need some AND or TEST
383 insn in order to set the flags for the branch anyway. */
384 mask = -1u << misalign;
386 /* Main loop processing 16 bytes at a time. */
387 goto start;
390 data = *++p;
391 mask = -1;
393 start:
394 t = data == repl_nl;
395 t |= data == repl_cr;
396 t |= data == repl_bs;
397 t |= data == repl_qm;
398 found = __builtin_ia32_pmovmskb128 (t);
399 found &= mask;
401 while (!found);
403 /* FOUND contains 1 in bits for which we matched a relevant
404 character. Conversion to the byte index is trivial. */
405 found = __builtin_ctz(found);
406 return (const uchar *)p + found;
#ifdef HAVE_SSE4
/* A version of the fast scanner using SSE 4.2 vectorized string insns.
   Between S and END, return a pointer to the first \n, \r, \\ or ?.  */

static const uchar *
#ifndef __SSE4_2__
__attribute__((__target__("sse4.2")))
#endif
search_line_sse42 (const uchar *s, const uchar *end)
{
  typedef char v16qi __attribute__ ((__vector_size__ (16)));
  static const v16qi search = { '\n', '\r', '?', '\\' };

  const uintptr_t si = (uintptr_t)s;
  uintptr_t index;

  /* Unaligned input: scan the first, partial block separately.  */
  if (si & 15)
    {
      v16qi sv;

      /* There are less than 16 bytes left in the buffer, and less
	 than 16 bytes left on the page.  Reading 16 bytes at this
	 point might generate a spurious page fault.  Defer to the
	 SSE2 implementation, which already handles alignment.  */
      if (__builtin_expect (end - s < 16, 0)
	  && __builtin_expect ((si & 0xfff) > 0xff0, 0))
	return search_line_sse2 (s, end);

      /* ??? The builtin doesn't understand that the PCMPESTRI read from
	 memory need not be aligned.  */
      sv = __builtin_ia32_loaddqu ((const char *) s);
      index = __builtin_ia32_pcmpestri128 (search, 4, sv, 16, 0);

      /* An index below 16 is a match within this first block.  */
      if (__builtin_expect (index < 16, 0))
	return s + index;

      /* Advance the pointer to an aligned address.  We will re-scan a
	 few bytes, but we no longer need care for reading past the
	 end of a page, since we're guaranteed a match.  */
      s = (const uchar *)((si + 15) & -16);
    }

  /* Main loop, processing 16 bytes at a time.  */
#ifdef __GCC_ASM_FLAG_OUTPUTS__
  for (;;)
    {
      char f;

      /* By using inline assembly instead of the builtin,
	 we can use the result, as well as the flags set.  */
      __asm ("%vpcmpestri\t$0, %2, %3"
	     : "=c"(index), "=@ccc"(f)
	     : "m"(*s), "x"(search), "a"(4), "d"(16));
      if (f)
	break;

      s += 16;
    }
#else
  s -= 16;
  /* By doing the whole loop in inline assembly,
     we can make proper use of the flags set.  */
  __asm (      ".balign 16\n"
	"0:	add $16, %1\n"
	"	%vpcmpestri\t$0, (%1), %2\n"
	"	jnc 0b"
	: "=&c"(index), "+r"(s)
	: "x"(search), "a"(4), "d"(16));
#endif

  return s + index;
}

#else
/* Work around out-dated assemblers without sse4 support.  */
#define search_line_sse42 search_line_sse2
#endif
490 /* Check the CPU capabilities. */
492 #include "../gcc/config/i386/cpuid.h"
494 typedef const uchar * (*search_line_fast_type) (const uchar *, const uchar *);
495 static search_line_fast_type search_line_fast;
497 #define HAVE_init_vectorized_lexer 1
498 static inline void
499 init_vectorized_lexer (void)
501 unsigned dummy, ecx = 0, edx = 0;
502 search_line_fast_type impl = search_line_acc_char;
503 int minimum = 0;
505 #if defined(__SSE4_2__)
506 minimum = 3;
507 #elif defined(__SSE2__)
508 minimum = 2;
509 #elif defined(__SSE__)
510 minimum = 1;
511 #endif
513 if (minimum == 3)
514 impl = search_line_sse42;
515 else if (__get_cpuid (1, &dummy, &dummy, &ecx, &edx) || minimum == 2)
517 if (minimum == 3 || (ecx & bit_SSE4_2))
518 impl = search_line_sse42;
519 else if (minimum == 2 || (edx & bit_SSE2))
520 impl = search_line_sse2;
521 else if (minimum == 1 || (edx & bit_SSE))
522 impl = search_line_mmx;
524 else if (__get_cpuid (0x80000001, &dummy, &dummy, &dummy, &edx))
526 if (minimum == 1
527 || (edx & (bit_MMXEXT | bit_CMOV)) == (bit_MMXEXT | bit_CMOV))
528 impl = search_line_mmx;
531 search_line_fast = impl;
534 #elif (GCC_VERSION >= 4005) && defined(_ARCH_PWR8) && defined(__ALTIVEC__)
536 /* A vection of the fast scanner using AltiVec vectorized byte compares
537 and VSX unaligned loads (when VSX is available). This is otherwise
538 the same as the AltiVec version. */
540 ATTRIBUTE_NO_SANITIZE_UNDEFINED
541 static const uchar *
542 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
544 typedef __attribute__((altivec(vector))) unsigned char vc;
546 const vc repl_nl = {
547 '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
548 '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
550 const vc repl_cr = {
551 '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
552 '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
554 const vc repl_bs = {
555 '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
556 '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
558 const vc repl_qm = {
559 '?', '?', '?', '?', '?', '?', '?', '?',
560 '?', '?', '?', '?', '?', '?', '?', '?',
562 const vc zero = { 0 };
564 vc data, t;
566 /* Main loop processing 16 bytes at a time. */
569 vc m_nl, m_cr, m_bs, m_qm;
571 data = __builtin_vec_vsx_ld (0, s);
572 s += 16;
574 m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
575 m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
576 m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
577 m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
578 t = (m_nl | m_cr) | (m_bs | m_qm);
580 /* T now contains 0xff in bytes for which we matched one of the relevant
581 characters. We want to exit the loop if any byte in T is non-zero.
582 Below is the expansion of vec_any_ne(t, zero). */
584 while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));
586 /* Restore s to to point to the 16 bytes we just processed. */
587 s -= 16;
590 #define N (sizeof(vc) / sizeof(long))
592 union {
593 vc v;
594 /* Statically assert that N is 2 or 4. */
595 unsigned long l[(N == 2 || N == 4) ? N : -1];
596 } u;
597 unsigned long l, i = 0;
599 u.v = t;
601 /* Find the first word of T that is non-zero. */
602 switch (N)
604 case 4:
605 l = u.l[i++];
606 if (l != 0)
607 break;
608 s += sizeof(unsigned long);
609 l = u.l[i++];
610 if (l != 0)
611 break;
612 s += sizeof(unsigned long);
613 /* FALLTHRU */
614 case 2:
615 l = u.l[i++];
616 if (l != 0)
617 break;
618 s += sizeof(unsigned long);
619 l = u.l[i];
622 /* L now contains 0xff in bytes for which we matched one of the
623 relevant characters. We can find the byte index by finding
624 its bit index and dividing by 8. */
625 #ifdef __BIG_ENDIAN__
626 l = __builtin_clzl(l) >> 3;
627 #else
628 l = __builtin_ctzl(l) >> 3;
629 #endif
630 return s + l;
632 #undef N
636 #elif (GCC_VERSION >= 4005) && defined(__ALTIVEC__) && defined (__BIG_ENDIAN__)
638 /* A vection of the fast scanner using AltiVec vectorized byte compares.
639 This cannot be used for little endian because vec_lvsl/lvsr are
640 deprecated for little endian and the code won't work properly. */
641 /* ??? Unfortunately, attribute(target("altivec")) is not yet supported,
642 so we can't compile this function without -maltivec on the command line
643 (or implied by some other switch). */
645 static const uchar *
646 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
648 typedef __attribute__((altivec(vector))) unsigned char vc;
650 const vc repl_nl = {
651 '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
652 '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
654 const vc repl_cr = {
655 '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
656 '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
658 const vc repl_bs = {
659 '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
660 '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
662 const vc repl_qm = {
663 '?', '?', '?', '?', '?', '?', '?', '?',
664 '?', '?', '?', '?', '?', '?', '?', '?',
666 const vc ones = {
667 -1, -1, -1, -1, -1, -1, -1, -1,
668 -1, -1, -1, -1, -1, -1, -1, -1,
670 const vc zero = { 0 };
672 vc data, mask, t;
674 /* Altivec loads automatically mask addresses with -16. This lets us
675 issue the first load as early as possible. */
676 data = __builtin_vec_ld(0, (const vc *)s);
678 /* Discard bytes before the beginning of the buffer. Do this by
679 beginning with all ones and shifting in zeros according to the
680 mis-alignment. The LVSR instruction pulls the exact shift we
681 want from the address. */
682 mask = __builtin_vec_lvsr(0, s);
683 mask = __builtin_vec_perm(zero, ones, mask);
684 data &= mask;
686 /* While altivec loads mask addresses, we still need to align S so
687 that the offset we compute at the end is correct. */
688 s = (const uchar *)((uintptr_t)s & -16);
690 /* Main loop processing 16 bytes at a time. */
691 goto start;
694 vc m_nl, m_cr, m_bs, m_qm;
696 s += 16;
697 data = __builtin_vec_ld(0, (const vc *)s);
699 start:
700 m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
701 m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
702 m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
703 m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
704 t = (m_nl | m_cr) | (m_bs | m_qm);
706 /* T now contains 0xff in bytes for which we matched one of the relevant
707 characters. We want to exit the loop if any byte in T is non-zero.
708 Below is the expansion of vec_any_ne(t, zero). */
710 while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));
713 #define N (sizeof(vc) / sizeof(long))
715 union {
716 vc v;
717 /* Statically assert that N is 2 or 4. */
718 unsigned long l[(N == 2 || N == 4) ? N : -1];
719 } u;
720 unsigned long l, i = 0;
722 u.v = t;
724 /* Find the first word of T that is non-zero. */
725 switch (N)
727 case 4:
728 l = u.l[i++];
729 if (l != 0)
730 break;
731 s += sizeof(unsigned long);
732 l = u.l[i++];
733 if (l != 0)
734 break;
735 s += sizeof(unsigned long);
736 /* FALLTHROUGH */
737 case 2:
738 l = u.l[i++];
739 if (l != 0)
740 break;
741 s += sizeof(unsigned long);
742 l = u.l[i];
745 /* L now contains 0xff in bytes for which we matched one of the
746 relevant characters. We can find the byte index by finding
747 its bit index and dividing by 8. */
748 l = __builtin_clzl(l) >> 3;
749 return s + l;
751 #undef N
755 #elif defined (__ARM_NEON) && defined (__ARM_64BIT_STATE)
756 #include "arm_neon.h"
758 /* This doesn't have to be the exact page size, but no system may use
759 a size smaller than this. ARMv8 requires a minimum page size of
760 4k. The impact of being conservative here is a small number of
761 cases will take the slightly slower entry path into the main
762 loop. */
764 #define AARCH64_MIN_PAGE_SIZE 4096
766 static const uchar *
767 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
769 const uint8x16_t repl_nl = vdupq_n_u8 ('\n');
770 const uint8x16_t repl_cr = vdupq_n_u8 ('\r');
771 const uint8x16_t repl_bs = vdupq_n_u8 ('\\');
772 const uint8x16_t repl_qm = vdupq_n_u8 ('?');
773 const uint8x16_t xmask = (uint8x16_t) vdupq_n_u64 (0x8040201008040201ULL);
775 #ifdef __ARM_BIG_ENDIAN
776 const int16x8_t shift = {8, 8, 8, 8, 0, 0, 0, 0};
777 #else
778 const int16x8_t shift = {0, 0, 0, 0, 8, 8, 8, 8};
779 #endif
781 unsigned int found;
782 const uint8_t *p;
783 uint8x16_t data;
784 uint8x16_t t;
785 uint16x8_t m;
786 uint8x16_t u, v, w;
788 /* Align the source pointer. */
789 p = (const uint8_t *)((uintptr_t)s & -16);
791 /* Assuming random string start positions, with a 4k page size we'll take
792 the slow path about 0.37% of the time. */
793 if (__builtin_expect ((AARCH64_MIN_PAGE_SIZE
794 - (((uintptr_t) s) & (AARCH64_MIN_PAGE_SIZE - 1)))
795 < 16, 0))
797 /* Slow path: the string starts near a possible page boundary. */
798 uint32_t misalign, mask;
800 misalign = (uintptr_t)s & 15;
801 mask = (-1u << misalign) & 0xffff;
802 data = vld1q_u8 (p);
803 t = vceqq_u8 (data, repl_nl);
804 u = vceqq_u8 (data, repl_cr);
805 v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
806 w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
807 t = vorrq_u8 (v, w);
808 t = vandq_u8 (t, xmask);
809 m = vpaddlq_u8 (t);
810 m = vshlq_u16 (m, shift);
811 found = vaddvq_u16 (m);
812 found &= mask;
813 if (found)
814 return (const uchar*)p + __builtin_ctz (found);
816 else
818 data = vld1q_u8 ((const uint8_t *) s);
819 t = vceqq_u8 (data, repl_nl);
820 u = vceqq_u8 (data, repl_cr);
821 v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
822 w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
823 t = vorrq_u8 (v, w);
824 if (__builtin_expect (vpaddd_u64 ((uint64x2_t)t) != 0, 0))
825 goto done;
830 p += 16;
831 data = vld1q_u8 (p);
832 t = vceqq_u8 (data, repl_nl);
833 u = vceqq_u8 (data, repl_cr);
834 v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
835 w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
836 t = vorrq_u8 (v, w);
837 } while (!vpaddd_u64 ((uint64x2_t)t));
839 done:
840 /* Now that we've found the terminating substring, work out precisely where
841 we need to stop. */
842 t = vandq_u8 (t, xmask);
843 m = vpaddlq_u8 (t);
844 m = vshlq_u16 (m, shift);
845 found = vaddvq_u16 (m);
846 return (((((uintptr_t) p) < (uintptr_t) s) ? s : (const uchar *)p)
847 + __builtin_ctz (found));
850 #elif defined (__ARM_NEON)
851 #include "arm_neon.h"
853 static const uchar *
854 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
856 const uint8x16_t repl_nl = vdupq_n_u8 ('\n');
857 const uint8x16_t repl_cr = vdupq_n_u8 ('\r');
858 const uint8x16_t repl_bs = vdupq_n_u8 ('\\');
859 const uint8x16_t repl_qm = vdupq_n_u8 ('?');
860 const uint8x16_t xmask = (uint8x16_t) vdupq_n_u64 (0x8040201008040201ULL);
862 unsigned int misalign, found, mask;
863 const uint8_t *p;
864 uint8x16_t data;
866 /* Align the source pointer. */
867 misalign = (uintptr_t)s & 15;
868 p = (const uint8_t *)((uintptr_t)s & -16);
869 data = vld1q_u8 (p);
871 /* Create a mask for the bytes that are valid within the first
872 16-byte block. The Idea here is that the AND with the mask
873 within the loop is "free", since we need some AND or TEST
874 insn in order to set the flags for the branch anyway. */
875 mask = (-1u << misalign) & 0xffff;
877 /* Main loop, processing 16 bytes at a time. */
878 goto start;
882 uint8x8_t l;
883 uint16x4_t m;
884 uint32x2_t n;
885 uint8x16_t t, u, v, w;
887 p += 16;
888 data = vld1q_u8 (p);
889 mask = 0xffff;
891 start:
892 t = vceqq_u8 (data, repl_nl);
893 u = vceqq_u8 (data, repl_cr);
894 v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
895 w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
896 t = vandq_u8 (vorrq_u8 (v, w), xmask);
897 l = vpadd_u8 (vget_low_u8 (t), vget_high_u8 (t));
898 m = vpaddl_u8 (l);
899 n = vpaddl_u16 (m);
901 found = vget_lane_u32 ((uint32x2_t) vorr_u64 ((uint64x1_t) n,
902 vshr_n_u64 ((uint64x1_t) n, 24)), 0);
903 found &= mask;
905 while (!found);
907 /* FOUND contains 1 in bits for which we matched a relevant
908 character. Conversion to the byte index is trivial. */
909 found = __builtin_ctz (found);
910 return (const uchar *)p + found;
913 #else
915 /* We only have one accelerated alternative. Use a direct call so that
916 we encourage inlining. */
918 #define search_line_fast search_line_acc_char
920 #endif
/* Initialize the lexer if needed.  This only has work to do on hosts
   where the vectorized scanner is chosen at run time.  */

void
_cpp_init_lexer (void)
{
#if defined (HAVE_init_vectorized_lexer)
  init_vectorized_lexer ();
#endif
}
932 /* Returns with a logical line that contains no escaped newlines or
933 trigraphs. This is a time-critical inner loop. */
934 void
935 _cpp_clean_line (cpp_reader *pfile)
937 cpp_buffer *buffer;
938 const uchar *s;
939 uchar c, *d, *p;
941 buffer = pfile->buffer;
942 buffer->cur_note = buffer->notes_used = 0;
943 buffer->cur = buffer->line_base = buffer->next_line;
944 buffer->need_line = false;
945 s = buffer->next_line;
947 if (!buffer->from_stage3)
949 const uchar *pbackslash = NULL;
951 /* Fast path. This is the common case of an un-escaped line with
952 no trigraphs. The primary win here is by not writing any
953 data back to memory until we have to. */
954 while (1)
956 /* Perform an optimized search for \n, \r, \\, ?. */
957 s = search_line_fast (s, buffer->rlimit);
959 c = *s;
960 if (c == '\\')
962 /* Record the location of the backslash and continue. */
963 pbackslash = s++;
965 else if (__builtin_expect (c == '?', 0))
967 if (__builtin_expect (s[1] == '?', false)
968 && _cpp_trigraph_map[s[2]])
970 /* Have a trigraph. We may or may not have to convert
971 it. Add a line note regardless, for -Wtrigraphs. */
972 add_line_note (buffer, s, s[2]);
973 if (CPP_OPTION (pfile, trigraphs))
975 /* We do, and that means we have to switch to the
976 slow path. */
977 d = (uchar *) s;
978 *d = _cpp_trigraph_map[s[2]];
979 s += 2;
980 goto slow_path;
983 /* Not a trigraph. Continue on fast-path. */
984 s++;
986 else
987 break;
990 /* This must be \r or \n. We're either done, or we'll be forced
991 to write back to the buffer and continue on the slow path. */
992 d = (uchar *) s;
994 if (__builtin_expect (s == buffer->rlimit, false))
995 goto done;
997 /* DOS line ending? */
998 if (__builtin_expect (c == '\r', false) && s[1] == '\n')
1000 s++;
1001 if (s == buffer->rlimit)
1002 goto done;
1005 if (__builtin_expect (pbackslash == NULL, true))
1006 goto done;
1008 /* Check for escaped newline. */
1009 p = d;
1010 while (is_nvspace (p[-1]))
1011 p--;
1012 if (p - 1 != pbackslash)
1013 goto done;
1015 /* Have an escaped newline; process it and proceed to
1016 the slow path. */
1017 add_line_note (buffer, p - 1, p != d ? ' ' : '\\');
1018 d = p - 2;
1019 buffer->next_line = p - 1;
1021 slow_path:
1022 while (1)
1024 c = *++s;
1025 *++d = c;
1027 if (c == '\n' || c == '\r')
1029 /* Handle DOS line endings. */
1030 if (c == '\r' && s != buffer->rlimit && s[1] == '\n')
1031 s++;
1032 if (s == buffer->rlimit)
1033 break;
1035 /* Escaped? */
1036 p = d;
1037 while (p != buffer->next_line && is_nvspace (p[-1]))
1038 p--;
1039 if (p == buffer->next_line || p[-1] != '\\')
1040 break;
1042 add_line_note (buffer, p - 1, p != d ? ' ': '\\');
1043 d = p - 2;
1044 buffer->next_line = p - 1;
1046 else if (c == '?' && s[1] == '?' && _cpp_trigraph_map[s[2]])
1048 /* Add a note regardless, for the benefit of -Wtrigraphs. */
1049 add_line_note (buffer, d, s[2]);
1050 if (CPP_OPTION (pfile, trigraphs))
1052 *d = _cpp_trigraph_map[s[2]];
1053 s += 2;
1058 else
1060 while (*s != '\n' && *s != '\r')
1061 s++;
1062 d = (uchar *) s;
1064 /* Handle DOS line endings. */
1065 if (*s == '\r' && s + 1 != buffer->rlimit && s[1] == '\n')
1066 s++;
1069 done:
1070 *d = '\n';
1071 /* A sentinel note that should never be processed. */
1072 add_line_note (buffer, d + 1, '\n');
1073 buffer->next_line = s + 1;
1076 /* Return true if the trigraph indicated by NOTE should be warned
1077 about in a comment. */
1078 static bool
1079 warn_in_comment (cpp_reader *pfile, _cpp_line_note *note)
1081 const uchar *p;
1083 /* Within comments we don't warn about trigraphs, unless the
1084 trigraph forms an escaped newline, as that may change
1085 behavior. */
1086 if (note->type != '/')
1087 return false;
1089 /* If -trigraphs, then this was an escaped newline iff the next note
1090 is coincident. */
1091 if (CPP_OPTION (pfile, trigraphs))
1092 return note[1].pos == note->pos;
1094 /* Otherwise, see if this forms an escaped newline. */
1095 p = note->pos + 3;
1096 while (is_nvspace (*p))
1097 p++;
1099 /* There might have been escaped newlines between the trigraph and the
1100 newline we found. Hence the position test. */
1101 return (*p == '\n' && p < note[1].pos);
1104 /* Process the notes created by add_line_note as far as the current
1105 location. */
1106 void
1107 _cpp_process_line_notes (cpp_reader *pfile, int in_comment)
1109 cpp_buffer *buffer = pfile->buffer;
1111 for (;;)
1113 _cpp_line_note *note = &buffer->notes[buffer->cur_note];
1114 unsigned int col;
1116 if (note->pos > buffer->cur)
1117 break;
1119 buffer->cur_note++;
1120 col = CPP_BUF_COLUMN (buffer, note->pos + 1);
1122 if (note->type == '\\' || note->type == ' ')
1124 if (note->type == ' ' && !in_comment)
1125 cpp_error_with_line (pfile, CPP_DL_WARNING, pfile->line_table->highest_line, col,
1126 "backslash and newline separated by space");
1128 if (buffer->next_line > buffer->rlimit)
1130 cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line, col,
1131 "backslash-newline at end of file");
1132 /* Prevent "no newline at end of file" warning. */
1133 buffer->next_line = buffer->rlimit;
1136 buffer->line_base = note->pos;
1137 CPP_INCREMENT_LINE (pfile, 0);
1139 else if (_cpp_trigraph_map[note->type])
1141 if (CPP_OPTION (pfile, warn_trigraphs)
1142 && (!in_comment || warn_in_comment (pfile, note)))
1144 if (CPP_OPTION (pfile, trigraphs))
1145 cpp_warning_with_line (pfile, CPP_W_TRIGRAPHS,
1146 pfile->line_table->highest_line, col,
1147 "trigraph ??%c converted to %c",
1148 note->type,
1149 (int) _cpp_trigraph_map[note->type]);
1150 else
1152 cpp_warning_with_line
1153 (pfile, CPP_W_TRIGRAPHS,
1154 pfile->line_table->highest_line, col,
1155 "trigraph ??%c ignored, use -trigraphs to enable",
1156 note->type);
1160 else if (note->type == 0)
1161 /* Already processed in lex_raw_string. */;
1162 else
1163 abort ();
1167 /* Skip a C-style block comment. We find the end of the comment by
1168 seeing if an asterisk is before every '/' we encounter. Returns
1169 nonzero if comment terminated by EOF, zero otherwise.
1171 Buffer->cur points to the initial asterisk of the comment. */
1172 bool
1173 _cpp_skip_block_comment (cpp_reader *pfile)
1175 cpp_buffer *buffer = pfile->buffer;
1176 const uchar *cur = buffer->cur;
1177 uchar c;
1179 cur++;
1180 if (*cur == '/')
1181 cur++;
1183 for (;;)
1185 /* People like decorating comments with '*', so check for '/'
1186 instead for efficiency. */
1187 c = *cur++;
1189 if (c == '/')
1191 if (cur[-2] == '*')
1192 break;
1194 /* Warn about potential nested comments, but not if the '/'
1195 comes immediately before the true comment delimiter.
1196 Don't bother to get it right across escaped newlines. */
1197 if (CPP_OPTION (pfile, warn_comments)
1198 && cur[0] == '*' && cur[1] != '/')
1200 buffer->cur = cur;
1201 cpp_warning_with_line (pfile, CPP_W_COMMENTS,
1202 pfile->line_table->highest_line,
1203 CPP_BUF_COL (buffer),
1204 "\"/*\" within comment");
1207 else if (c == '\n')
1209 unsigned int cols;
1210 buffer->cur = cur - 1;
1211 _cpp_process_line_notes (pfile, true);
1212 if (buffer->next_line >= buffer->rlimit)
1213 return true;
1214 _cpp_clean_line (pfile);
1216 cols = buffer->next_line - buffer->line_base;
1217 CPP_INCREMENT_LINE (pfile, cols);
1219 cur = buffer->cur;
1223 buffer->cur = cur;
1224 _cpp_process_line_notes (pfile, true);
1225 return false;
1228 /* Skip a C++ line comment, leaving buffer->cur pointing to the
1229 terminating newline. Handles escaped newlines. Returns nonzero
1230 if a multiline comment. */
1231 static int
1232 skip_line_comment (cpp_reader *pfile)
1234 cpp_buffer *buffer = pfile->buffer;
1235 location_t orig_line = pfile->line_table->highest_line;
1237 while (*buffer->cur != '\n')
1238 buffer->cur++;
1240 _cpp_process_line_notes (pfile, true);
1241 return orig_line != pfile->line_table->highest_line;
1244 /* Skips whitespace, saving the next non-whitespace character. */
1245 static void
1246 skip_whitespace (cpp_reader *pfile, cppchar_t c)
1248 cpp_buffer *buffer = pfile->buffer;
1249 bool saw_NUL = false;
1253 /* Horizontal space always OK. */
1254 if (c == ' ' || c == '\t')
1256 /* Just \f \v or \0 left. */
1257 else if (c == '\0')
1258 saw_NUL = true;
1259 else if (pfile->state.in_directive && CPP_PEDANTIC (pfile))
1260 cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line,
1261 CPP_BUF_COL (buffer),
1262 "%s in preprocessing directive",
1263 c == '\f' ? "form feed" : "vertical tab");
1265 c = *buffer->cur++;
1267 /* We only want non-vertical space, i.e. ' ' \t \f \v \0. */
1268 while (is_nvspace (c));
1270 if (saw_NUL)
1271 cpp_error (pfile, CPP_DL_WARNING, "null character(s) ignored");
1273 buffer->cur--;
1276 /* See if the characters of a number token are valid in a name (no
1277 '.', '+' or '-'). */
1278 static int
1279 name_p (cpp_reader *pfile, const cpp_string *string)
1281 unsigned int i;
1283 for (i = 0; i < string->len; i++)
1284 if (!is_idchar (string->text[i]))
1285 return 0;
1287 return 1;
1290 /* After parsing an identifier or other sequence, produce a warning about
1291 sequences not in NFC/NFKC. */
1292 static void
1293 warn_about_normalization (cpp_reader *pfile,
1294 const cpp_token *token,
1295 const struct normalize_state *s)
1297 if (CPP_OPTION (pfile, warn_normalize) < NORMALIZE_STATE_RESULT (s)
1298 && !pfile->state.skipping)
1300 /* Make sure that the token is printed using UCNs, even
1301 if we'd otherwise happily print UTF-8. */
1302 unsigned char *buf = XNEWVEC (unsigned char, cpp_token_len (token));
1303 size_t sz;
1305 sz = cpp_spell_token (pfile, token, buf, false) - buf;
1306 if (NORMALIZE_STATE_RESULT (s) == normalized_C)
1307 cpp_warning_with_line (pfile, CPP_W_NORMALIZE, token->src_loc, 0,
1308 "`%.*s' is not in NFKC", (int) sz, buf);
1309 else if (CPP_OPTION (pfile, cxx23_identifiers))
1310 cpp_pedwarning_with_line (pfile, CPP_W_NORMALIZE, token->src_loc, 0,
1311 "`%.*s' is not in NFC", (int) sz, buf);
1312 else
1313 cpp_warning_with_line (pfile, CPP_W_NORMALIZE, token->src_loc, 0,
1314 "`%.*s' is not in NFC", (int) sz, buf);
1315 free (buf);
1319 static const cppchar_t utf8_signifier = 0xC0;
1321 /* Returns TRUE if the sequence starting at buffer->cur is valid in
1322 an identifier. FIRST is TRUE if this starts an identifier. */
1323 static bool
1324 forms_identifier_p (cpp_reader *pfile, int first,
1325 struct normalize_state *state)
1327 cpp_buffer *buffer = pfile->buffer;
1329 if (*buffer->cur == '$')
1331 if (!CPP_OPTION (pfile, dollars_in_ident))
1332 return false;
1334 buffer->cur++;
1335 if (CPP_OPTION (pfile, warn_dollars) && !pfile->state.skipping)
1337 CPP_OPTION (pfile, warn_dollars) = 0;
1338 cpp_error (pfile, CPP_DL_PEDWARN, "'$' in identifier or number");
1341 return true;
1344 /* Is this a syntactically valid UCN or a valid UTF-8 char? */
1345 if (CPP_OPTION (pfile, extended_identifiers))
1347 cppchar_t s;
1348 if (*buffer->cur >= utf8_signifier)
1350 if (_cpp_valid_utf8 (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
1351 state, &s))
1352 return true;
1354 else if (*buffer->cur == '\\'
1355 && (buffer->cur[1] == 'u' || buffer->cur[1] == 'U'))
1357 buffer->cur += 2;
1358 if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
1359 state, &s, NULL, NULL))
1360 return true;
1361 buffer->cur -= 2;
1365 return false;
1368 /* Helper function to issue error about improper __VA_OPT__ use. */
1369 static void
1370 maybe_va_opt_error (cpp_reader *pfile)
1372 if (CPP_PEDANTIC (pfile) && !CPP_OPTION (pfile, va_opt))
1374 /* __VA_OPT__ should not be accepted at all, but allow it in
1375 system headers. */
1376 if (!_cpp_in_system_header (pfile))
1377 cpp_error (pfile, CPP_DL_PEDWARN,
1378 "__VA_OPT__ is not available until C++20");
1380 else if (!pfile->state.va_args_ok)
1382 /* __VA_OPT__ should only appear in the replacement list of a
1383 variadic macro. */
1384 cpp_error (pfile, CPP_DL_PEDWARN,
1385 "__VA_OPT__ can only appear in the expansion"
1386 " of a C++20 variadic macro");
1390 /* Helper function to get the cpp_hashnode of the identifier BASE. */
1391 static cpp_hashnode *
1392 lex_identifier_intern (cpp_reader *pfile, const uchar *base)
1394 cpp_hashnode *result;
1395 const uchar *cur;
1396 unsigned int len;
1397 unsigned int hash = HT_HASHSTEP (0, *base);
1399 cur = base + 1;
1400 while (ISIDNUM (*cur))
1402 hash = HT_HASHSTEP (hash, *cur);
1403 cur++;
1405 len = cur - base;
1406 hash = HT_HASHFINISH (hash, len);
1407 result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
1408 base, len, hash, HT_ALLOC));
1410 /* Rarely, identifiers require diagnostics when lexed. */
1411 if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
1412 && !pfile->state.skipping, 0))
1414 /* It is allowed to poison the same identifier twice. */
1415 if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
1416 cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
1417 NODE_NAME (result));
1419 /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
1420 replacement list of a variadic macro. */
1421 if (result == pfile->spec_nodes.n__VA_ARGS__
1422 && !pfile->state.va_args_ok)
1424 if (CPP_OPTION (pfile, cplusplus))
1425 cpp_error (pfile, CPP_DL_PEDWARN,
1426 "__VA_ARGS__ can only appear in the expansion"
1427 " of a C++11 variadic macro");
1428 else
1429 cpp_error (pfile, CPP_DL_PEDWARN,
1430 "__VA_ARGS__ can only appear in the expansion"
1431 " of a C99 variadic macro");
1434 if (result == pfile->spec_nodes.n__VA_OPT__)
1435 maybe_va_opt_error (pfile);
1437 /* For -Wc++-compat, warn about use of C++ named operators. */
1438 if (result->flags & NODE_WARN_OPERATOR)
1439 cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
1440 "identifier \"%s\" is a special operator name in C++",
1441 NODE_NAME (result));
1444 return result;
1447 /* Get the cpp_hashnode of an identifier specified by NAME in
1448 the current cpp_reader object. If none is found, NULL is returned. */
1449 cpp_hashnode *
1450 _cpp_lex_identifier (cpp_reader *pfile, const char *name)
1452 cpp_hashnode *result;
1453 result = lex_identifier_intern (pfile, (uchar *) name);
1454 return result;
1457 /* Lex an identifier starting at BUFFER->CUR - 1. */
1458 static cpp_hashnode *
1459 lex_identifier (cpp_reader *pfile, const uchar *base, bool starts_ucn,
1460 struct normalize_state *nst, cpp_hashnode **spelling)
1462 cpp_hashnode *result;
1463 const uchar *cur;
1464 unsigned int len;
1465 unsigned int hash = HT_HASHSTEP (0, *base);
1467 cur = pfile->buffer->cur;
1468 if (! starts_ucn)
1470 while (ISIDNUM (*cur))
1472 hash = HT_HASHSTEP (hash, *cur);
1473 cur++;
1475 NORMALIZE_STATE_UPDATE_IDNUM (nst, *(cur - 1));
1477 pfile->buffer->cur = cur;
1478 if (starts_ucn || forms_identifier_p (pfile, false, nst))
1480 /* Slower version for identifiers containing UCNs
1481 or extended chars (including $). */
1482 do {
1483 while (ISIDNUM (*pfile->buffer->cur))
1485 NORMALIZE_STATE_UPDATE_IDNUM (nst, *pfile->buffer->cur);
1486 pfile->buffer->cur++;
1488 } while (forms_identifier_p (pfile, false, nst));
1489 result = _cpp_interpret_identifier (pfile, base,
1490 pfile->buffer->cur - base);
1491 *spelling = cpp_lookup (pfile, base, pfile->buffer->cur - base);
1493 else
1495 len = cur - base;
1496 hash = HT_HASHFINISH (hash, len);
1498 result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
1499 base, len, hash, HT_ALLOC));
1500 *spelling = result;
1503 /* Rarely, identifiers require diagnostics when lexed. */
1504 if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
1505 && !pfile->state.skipping, 0))
1507 /* It is allowed to poison the same identifier twice. */
1508 if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
1509 cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
1510 NODE_NAME (result));
1512 /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
1513 replacement list of a variadic macro. */
1514 if (result == pfile->spec_nodes.n__VA_ARGS__
1515 && !pfile->state.va_args_ok)
1517 if (CPP_OPTION (pfile, cplusplus))
1518 cpp_error (pfile, CPP_DL_PEDWARN,
1519 "__VA_ARGS__ can only appear in the expansion"
1520 " of a C++11 variadic macro");
1521 else
1522 cpp_error (pfile, CPP_DL_PEDWARN,
1523 "__VA_ARGS__ can only appear in the expansion"
1524 " of a C99 variadic macro");
1527 /* __VA_OPT__ should only appear in the replacement list of a
1528 variadic macro. */
1529 if (result == pfile->spec_nodes.n__VA_OPT__)
1530 maybe_va_opt_error (pfile);
1532 /* For -Wc++-compat, warn about use of C++ named operators. */
1533 if (result->flags & NODE_WARN_OPERATOR)
1534 cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
1535 "identifier \"%s\" is a special operator name in C++",
1536 NODE_NAME (result));
1539 return result;
1542 /* Lex a number to NUMBER starting at BUFFER->CUR - 1. */
1543 static void
1544 lex_number (cpp_reader *pfile, cpp_string *number,
1545 struct normalize_state *nst)
1547 const uchar *cur;
1548 const uchar *base;
1549 uchar *dest;
1551 base = pfile->buffer->cur - 1;
1554 const uchar *adj_digit_sep = NULL;
1555 cur = pfile->buffer->cur;
1557 /* N.B. ISIDNUM does not include $. */
1558 while (ISIDNUM (*cur)
1559 || (*cur == '.' && !DIGIT_SEP (cur[-1]))
1560 || DIGIT_SEP (*cur)
1561 || (VALID_SIGN (*cur, cur[-1]) && !DIGIT_SEP (cur[-2])))
1563 NORMALIZE_STATE_UPDATE_IDNUM (nst, *cur);
1564 /* Adjacent digit separators do not form part of the pp-number syntax.
1565 However, they can safely be diagnosed here as an error, since '' is
1566 not a valid preprocessing token. */
1567 if (DIGIT_SEP (*cur) && DIGIT_SEP (cur[-1]) && !adj_digit_sep)
1568 adj_digit_sep = cur;
1569 cur++;
1571 /* A number can't end with a digit separator. */
1572 while (cur > pfile->buffer->cur && DIGIT_SEP (cur[-1]))
1573 --cur;
1574 if (adj_digit_sep && adj_digit_sep < cur)
1575 cpp_error (pfile, CPP_DL_ERROR, "adjacent digit separators");
1577 pfile->buffer->cur = cur;
1579 while (forms_identifier_p (pfile, false, nst));
1581 number->len = cur - base;
1582 dest = _cpp_unaligned_alloc (pfile, number->len + 1);
1583 memcpy (dest, base, number->len);
1584 dest[number->len] = '\0';
1585 number->text = dest;
1588 /* Create a token of type TYPE with a literal spelling. */
1589 static void
1590 create_literal (cpp_reader *pfile, cpp_token *token, const uchar *base,
1591 unsigned int len, enum cpp_ttype type)
1593 token->type = type;
1594 token->val.str.len = len;
1595 token->val.str.text = cpp_alloc_token_string (pfile, base, len);
1598 const uchar *
1599 cpp_alloc_token_string (cpp_reader *pfile,
1600 const unsigned char *ptr, unsigned len)
1602 uchar *dest = _cpp_unaligned_alloc (pfile, len + 1);
1604 dest[len] = 0;
1605 memcpy (dest, ptr, len);
1606 return dest;
1609 /* A pair of raw buffer pointers. The currently open one is [1], the
1610 first one is [0]. Used for string literal lexing. */
1611 struct lit_accum {
1612 _cpp_buff *first;
1613 _cpp_buff *last;
1614 const uchar *rpos;
1615 size_t accum;
1617 lit_accum ()
1618 : first (NULL), last (NULL), rpos (0), accum (0)
1622 void append (cpp_reader *, const uchar *, size_t);
1624 void read_begin (cpp_reader *);
1625 bool reading_p () const
1627 return rpos != NULL;
1629 char read_char ()
1631 char c = *rpos++;
1632 if (rpos == BUFF_FRONT (last))
1633 rpos = NULL;
1634 return c;
1638 /* Subroutine of lex_raw_string: Append LEN chars from BASE to the buffer
1639 sequence from *FIRST_BUFF_P to LAST_BUFF_P. */
1641 void
1642 lit_accum::append (cpp_reader *pfile, const uchar *base, size_t len)
1644 if (!last)
1645 /* Starting. */
1646 first = last = _cpp_get_buff (pfile, len);
1647 else if (len > BUFF_ROOM (last))
1649 /* There is insufficient room in the buffer. Copy what we can,
1650 and then either extend or create a new one. */
1651 size_t room = BUFF_ROOM (last);
1652 memcpy (BUFF_FRONT (last), base, room);
1653 BUFF_FRONT (last) += room;
1654 base += room;
1655 len -= room;
1656 accum += room;
1658 gcc_checking_assert (!rpos);
1660 last = _cpp_append_extend_buff (pfile, last, len);
1663 memcpy (BUFF_FRONT (last), base, len);
1664 BUFF_FRONT (last) += len;
1665 accum += len;
1668 void
1669 lit_accum::read_begin (cpp_reader *pfile)
1671 /* We never accumulate more than 4 chars to read. */
1672 if (BUFF_ROOM (last) < 4)
1674 last = _cpp_append_extend_buff (pfile, last, 4);
1675 rpos = BUFF_FRONT (last);
1678 /* Returns true if a macro has been defined.
1679 This might not work if compile with -save-temps,
1680 or preprocess separately from compilation. */
1682 static bool
1683 is_macro(cpp_reader *pfile, const uchar *base)
1685 const uchar *cur = base;
1686 if (! ISIDST (*cur))
1687 return false;
1688 unsigned int hash = HT_HASHSTEP (0, *cur);
1689 ++cur;
1690 while (ISIDNUM (*cur))
1692 hash = HT_HASHSTEP (hash, *cur);
1693 ++cur;
1695 hash = HT_HASHFINISH (hash, cur - base);
1697 cpp_hashnode *result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
1698 base, cur - base, hash, HT_NO_INSERT));
1700 return result && cpp_macro_p (result);
1703 /* Returns true if a literal suffix does not have the expected form
1704 and is defined as a macro. */
1706 static bool
1707 is_macro_not_literal_suffix(cpp_reader *pfile, const uchar *base)
1709 /* User-defined literals outside of namespace std must start with a single
1710 underscore, so assume anything of that form really is a UDL suffix.
1711 We don't need to worry about UDLs defined inside namespace std because
1712 their names are reserved, so cannot be used as macro names in valid
1713 programs. */
1714 if (base[0] == '_' && base[1] != '_')
1715 return false;
1716 return is_macro (pfile, base);
1719 /* Lexes a raw string. The stored string contains the spelling,
1720 including double quotes, delimiter string, '(' and ')', any leading
1721 'L', 'u', 'U' or 'u8' and 'R' modifier. The created token contains
1722 the type of the literal, or CPP_OTHER if it was not properly
1723 terminated.
1725 BASE is the start of the token. Updates pfile->buffer->cur to just
1726 after the lexed string.
1728 The spelling is NUL-terminated, but it is not guaranteed that this
1729 is the first NUL since embedded NULs are preserved. */
1731 static void
1732 lex_raw_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
1734 const uchar *pos = base;
1736 /* 'tis a pity this information isn't passed down from the lexer's
1737 initial categorization of the token. */
1738 enum cpp_ttype type = CPP_STRING;
/* Step over the encoding prefix, if any, picking the token type it
   implies. */
1740 if (*pos == 'L')
1742 type = CPP_WSTRING;
1743 pos++;
1745 else if (*pos == 'U')
1747 type = CPP_STRING32;
1748 pos++;
1750 else if (*pos == 'u')
1752 if (pos[1] == '8')
1754 type = CPP_UTF8STRING;
1755 pos++;
1757 else
1758 type = CPP_STRING16;
1759 pos++;
/* POS must now be at the R and the opening quote; step past both. */
1762 gcc_checking_assert (pos[0] == 'R' && pos[1] == '"');
1763 pos += 2;
/* Start from the buffer's current line note. */
1765 _cpp_line_note *note = &pfile->buffer->notes[pfile->buffer->cur_note];
1767 /* Skip notes before the ". */
1768 while (note->pos < pos)
1769 ++note;
/* Collects the spelling whenever it cannot be taken in one piece from
   the line buffer: when escaped newlines or trigraphs are restored,
   and when the literal continues onto a further line. */
1771 lit_accum accum;
/* Holds the delimiter and, once '(' has been seen, the closing quote
   appended after it: at most 16 delimiter characters plus one '"'. */
1773 uchar prefix[17];
1774 unsigned prefix_len = 0;
/* PHASE_PREFIX while the delimiter is being read and PHASE_NONE within
   the body (or in failure mode).  Any value from PHASE_SUFFIX upwards
   is the index into PREFIX of the character expected next after a
   ')'. */
1775 enum Phase
1777 PHASE_PREFIX = -2,
1778 PHASE_NONE = -1,
1779 PHASE_SUFFIX = 0
1780 } phase = PHASE_PREFIX;
1782 for (;;)
1784 gcc_checking_assert (note->pos >= pos);
1786 /* Undo any escaped newlines and trigraphs. */
1787 if (!accum.reading_p () && note->pos == pos)
1788 switch (note->type)
1790 case '\\':
1791 case ' ':
1792 /* Restore backslash followed by newline. */
1793 accum.append (pfile, base, pos - base);
1794 base = pos;
1795 accum.read_begin (pfile);
1796 accum.append (pfile, UC"\\", 1);
1798 after_backslash:
1799 if (note->type == ' ')
1800 /* GNU backslash whitespace newline extension. FIXME
1801 could be any sequence of non-vertical space. When we
1802 can properly restore any such sequence, we should
1803 mark this note as handled so _cpp_process_line_notes
1804 doesn't warn. */
1805 accum.append (pfile, UC" ", 1);
1807 accum.append (pfile, UC"\n", 1);
1808 note++;
1809 break;
1811 case '\n':
1812 /* This can happen for ??/<NEWLINE> when trigraphs are not
1813 being interpreted. */
1814 gcc_checking_assert (!CPP_OPTION (pfile, trigraphs));
1815 note->type = 0;
1816 note++;
1817 break;
1819 default:
1820 gcc_checking_assert (_cpp_trigraph_map[note->type]);
1822 /* Don't warn about this trigraph in
1823 _cpp_process_line_notes, since trigraphs show up as
1824 trigraphs in raw strings. */
1825 uchar type = note->type;
1826 note->type = 0;
1828 if (CPP_OPTION (pfile, trigraphs))
1830 accum.append (pfile, base, pos - base);
1831 base = pos;
1832 accum.read_begin (pfile);
1833 accum.append (pfile, UC"??", 2);
1834 accum.append (pfile, &type, 1);
1836 /* ??/ followed by newline gets two line notes, one for
1837 the trigraph and one for the backslash/newline. */
1838 if (type == '/' && note[1].pos == pos)
1840 note++;
1841 gcc_assert (note->type == '\\' || note->type == ' ');
1842 goto after_backslash;
1844 /* Skip the replacement character. */
1845 base = ++pos;
1848 note++;
1849 break;
1852 /* Now get a char to process. Either from an expanded note, or
1853 from the line buffer. */
1854 bool read_note = accum.reading_p ();
1855 char c = read_note ? accum.read_char () : *pos++;
1857 if (phase == PHASE_PREFIX)
1859 if (c == '(')
1861 /* Done. */
1862 phase = PHASE_NONE;
1863 prefix[prefix_len++] = '"';
1865 else if (prefix_len < 16
1866 /* Prefix chars are any of the basic character set,
1867 [lex.charset] except for '
1868 ()\\\t\v\f\n'. Optimized for a contiguous
1869 alphabet. */
1870 /* Unlike a switch, this collapses down to one or
1871 two shift and bitmask operations on an ASCII
1872 system, with an outlier or two. */
1873 && (('Z' - 'A' == 25
1874 ? ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'))
1875 : ISIDST (c))
1876 || (c >= '0' && c <= '9')
1877 || c == '_' || c == '{' || c == '}'
1878 || c == '[' || c == ']' || c == '#'
1879 || c == '<' || c == '>' || c == '%'
1880 || c == ':' || c == ';' || c == '.' || c == '?'
1881 || c == '*' || c == '+' || c == '-' || c == '/'
1882 || c == '^' || c == '&' || c == '|' || c == '~'
1883 || c == '!' || c == '=' || c == ','
1884 || c == '"' || c == '\''))
1885 prefix[prefix_len++] = c;
1886 else
1888 /* Something is wrong. */
1889 int col = CPP_BUF_COLUMN (pfile->buffer, pos) + read_note;
1890 if (prefix_len == 16)
1891 cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
1892 col, "raw string delimiter longer "
1893 "than 16 characters");
1894 else if (c == '\n')
1895 cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
1896 col, "invalid new-line in raw "
1897 "string delimiter");
1898 else
1899 cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
1900 col, "invalid character '%c' in "
1901 "raw string delimiter", c);
1902 type = CPP_OTHER;
1903 phase = PHASE_NONE;
1904 /* Continue until we get a close quote, that's probably
1905 the best failure mode. */
1906 prefix_len = 0;
1908 if (c != '\n')
1909 continue;
/* Matching the terminator: C must be the next character of the
   delimiter-plus-quote held in PREFIX, otherwise it is treated as
   ordinary body text below. */
1912 if (phase != PHASE_NONE)
1914 if (prefix[phase] != c)
1915 phase = PHASE_NONE;
1916 else if (unsigned (phase + 1) == prefix_len)
1917 break;
1918 else
1920 phase = Phase (phase + 1);
1921 continue;
1925 if (!prefix_len && c == '"')
1926 /* Failure mode lexing. */
1927 goto out;
1928 else if (prefix_len && c == ')')
1929 phase = PHASE_SUFFIX;
1930 else if (!read_note && c == '\n')
1932 pos--;
1933 pfile->buffer->cur = pos;
1934 if (pfile->state.in_directive
1935 || (pfile->state.parsing_args
1936 && pfile->buffer->next_line >= pfile->buffer->rlimit))
1938 cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, 0,
1939 "unterminated raw string");
1940 type = CPP_OTHER;
1941 goto out;
/* Save everything up to and including this newline before a fresh
   line is fetched. */
1944 accum.append (pfile, base, pos - base + 1);
1945 _cpp_process_line_notes (pfile, false);
1947 if (pfile->buffer->next_line < pfile->buffer->rlimit)
1948 CPP_INCREMENT_LINE (pfile, 0);
1949 pfile->buffer->need_line = true;
1951 if (!_cpp_get_fresh_line (pfile))
1953 /* We ran out of file and failed to get a line. */
1954 location_t src_loc = token->src_loc;
1955 token->type = CPP_EOF;
1956 /* Tell the compiler the line number of the EOF token. */
1957 token->src_loc = pfile->line_table->highest_line;
1958 token->flags = BOL;
1959 if (accum.first)
1960 _cpp_release_buff (pfile, accum.first);
1961 cpp_error_with_line (pfile, CPP_DL_ERROR, src_loc, 0,
1962 "unterminated raw string");
1963 /* Now pop the buffer that _cpp_get_fresh_line did not. */
1964 _cpp_pop_buffer (pfile);
1965 return;
/* Resume scanning at the start of the fresh line, with its notes. */
1968 pos = base = pfile->buffer->cur;
1969 note = &pfile->buffer->notes[pfile->buffer->cur_note];
1973 if (CPP_OPTION (pfile, user_literals))
1975 /* If a string format macro, say from inttypes.h, is placed touching
1976 a string literal it could be parsed as a C++11 user-defined string
1977 literal thus breaking the program. */
1978 if (is_macro_not_literal_suffix (pfile, pos))
1980 /* Raise a warning, but do not consume subsequent tokens. */
1981 if (CPP_OPTION (pfile, warn_literal_suffix) && !pfile->state.skipping)
1982 cpp_warning_with_line (pfile, CPP_W_LITERAL_SUFFIX,
1983 token->src_loc, 0,
1984 "invalid suffix on literal; C++11 requires "
1985 "a space between literal and string macro");
1987 /* Grab user defined literal suffix. */
1988 else if (ISIDST (*pos))
1990 type = cpp_userdef_string_add_type (type);
1991 ++pos;
1993 while (ISIDNUM (*pos))
1994 ++pos;
1998 out:
1999 pfile->buffer->cur = pos;
/* If nothing was accumulated the spelling is the contiguous range from
   BASE to POS.  Otherwise concatenate the accumulated buffers followed
   by the not yet saved tail from BASE to POS, and NUL-terminate. */
2000 if (!accum.accum)
2001 create_literal (pfile, token, base, pos - base, type);
2002 else
2004 size_t extra_len = pos - base;
2005 uchar *dest = _cpp_unaligned_alloc (pfile, accum.accum + extra_len + 1);
2007 token->type = type;
2008 token->val.str.len = accum.accum + extra_len;
2009 token->val.str.text = dest;
2010 for (_cpp_buff *buf = accum.first; buf; buf = buf->next)
2012 size_t len = BUFF_FRONT (buf) - buf->base;
2013 memcpy (dest, buf->base, len);
2014 dest += len;
2016 _cpp_release_buff (pfile, accum.first);
2017 memcpy (dest, base, extra_len);
2018 dest[extra_len] = '\0';
2022 /* Lexes a string, character constant, or angle-bracketed header file
2023 name. The stored string contains the spelling, including opening
2024 quote and any leading 'L', 'u', 'U' or 'u8' and optional
2025 'R' modifier. It returns the type of the literal, or CPP_OTHER
2026 if it was not properly terminated, or CPP_LESS for an unterminated
2027 header name which must be relexed as normal tokens.
2029 The spelling is NUL-terminated, but it is not guaranteed that this
2030 is the first NUL since embedded NULs are preserved. */
2031 static void
2032 lex_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
2034 bool saw_NUL = false;
2035 const uchar *cur;
2036 cppchar_t terminator;
2037 enum cpp_ttype type;
2039 cur = base;
2040 terminator = *cur++;
2041 if (terminator == 'L' || terminator == 'U')
2042 terminator = *cur++;
2043 else if (terminator == 'u')
2045 terminator = *cur++;
2046 if (terminator == '8')
2047 terminator = *cur++;
2049 if (terminator == 'R')
2051 lex_raw_string (pfile, token, base);
2052 return;
2054 if (terminator == '"')
2055 type = (*base == 'L' ? CPP_WSTRING :
2056 *base == 'U' ? CPP_STRING32 :
2057 *base == 'u' ? (base[1] == '8' ? CPP_UTF8STRING : CPP_STRING16)
2058 : CPP_STRING);
2059 else if (terminator == '\'')
2060 type = (*base == 'L' ? CPP_WCHAR :
2061 *base == 'U' ? CPP_CHAR32 :
2062 *base == 'u' ? (base[1] == '8' ? CPP_UTF8CHAR : CPP_CHAR16)
2063 : CPP_CHAR);
2064 else
2065 terminator = '>', type = CPP_HEADER_NAME;
2067 for (;;)
2069 cppchar_t c = *cur++;
2071 /* In #include-style directives, terminators are not escapable. */
2072 if (c == '\\' && !pfile->state.angled_headers && *cur != '\n')
2073 cur++;
2074 else if (c == terminator)
2075 break;
2076 else if (c == '\n')
2078 cur--;
2079 /* Unmatched quotes always yield undefined behavior, but
2080 greedy lexing means that what appears to be an unterminated
2081 header name may actually be a legitimate sequence of tokens. */
2082 if (terminator == '>')
2084 token->type = CPP_LESS;
2085 return;
2087 type = CPP_OTHER;
2088 break;
2090 else if (c == '\0')
2091 saw_NUL = true;
2094 if (saw_NUL && !pfile->state.skipping)
2095 cpp_error (pfile, CPP_DL_WARNING,
2096 "null character(s) preserved in literal");
2098 if (type == CPP_OTHER && CPP_OPTION (pfile, lang) != CLK_ASM)
2099 cpp_error (pfile, CPP_DL_PEDWARN, "missing terminating %c character",
2100 (int) terminator);
2102 if (CPP_OPTION (pfile, user_literals))
2104 /* If a string format macro, say from inttypes.h, is placed touching
2105 a string literal it could be parsed as a C++11 user-defined string
2106 literal thus breaking the program. */
2107 if (is_macro_not_literal_suffix (pfile, cur))
2109 /* Raise a warning, but do not consume subsequent tokens. */
2110 if (CPP_OPTION (pfile, warn_literal_suffix) && !pfile->state.skipping)
2111 cpp_warning_with_line (pfile, CPP_W_LITERAL_SUFFIX,
2112 token->src_loc, 0,
2113 "invalid suffix on literal; C++11 requires "
2114 "a space between literal and string macro");
2116 /* Grab user defined literal suffix. */
2117 else if (ISIDST (*cur))
2119 type = cpp_userdef_char_add_type (type);
2120 type = cpp_userdef_string_add_type (type);
2121 ++cur;
2123 while (ISIDNUM (*cur))
2124 ++cur;
2127 else if (CPP_OPTION (pfile, cpp_warn_cxx11_compat)
2128 && is_macro (pfile, cur)
2129 && !pfile->state.skipping)
2130 cpp_warning_with_line (pfile, CPP_W_CXX11_COMPAT,
2131 token->src_loc, 0, "C++11 requires a space "
2132 "between string literal and macro");
2134 pfile->buffer->cur = cur;
2135 create_literal (pfile, token, base, cur - base, type);
2138 /* Return the comment table. The client may not make any assumption
2139 about the ordering of the table. */
2140 cpp_comment_table *
2141 cpp_get_comments (cpp_reader *pfile)
2143 return &pfile->comments;
2146 /* Append a comment to the end of the comment table. */
2147 static void
2148 store_comment (cpp_reader *pfile, cpp_token *token)
2150 int len;
2152 if (pfile->comments.allocated == 0)
2154 pfile->comments.allocated = 256;
2155 pfile->comments.entries = (cpp_comment *) xmalloc
2156 (pfile->comments.allocated * sizeof (cpp_comment));
2159 if (pfile->comments.count == pfile->comments.allocated)
2161 pfile->comments.allocated *= 2;
2162 pfile->comments.entries = (cpp_comment *) xrealloc
2163 (pfile->comments.entries,
2164 pfile->comments.allocated * sizeof (cpp_comment));
2167 len = token->val.str.len;
2169 /* Copy comment. Note, token may not be NULL terminated. */
2170 pfile->comments.entries[pfile->comments.count].comment =
2171 (char *) xmalloc (sizeof (char) * (len + 1));
2172 memcpy (pfile->comments.entries[pfile->comments.count].comment,
2173 token->val.str.text, len);
2174 pfile->comments.entries[pfile->comments.count].comment[len] = '\0';
2176 /* Set source location. */
2177 pfile->comments.entries[pfile->comments.count].sloc = token->src_loc;
2179 /* Increment the count of entries in the comment table. */
2180 pfile->comments.count++;
2183 /* The stored comment includes the comment start and any terminator. */
2184 static void
2185 save_comment (cpp_reader *pfile, cpp_token *token, const unsigned char *from,
2186 cppchar_t type)
2188 unsigned char *buffer;
2189 unsigned int len, clen, i;
2191 len = pfile->buffer->cur - from + 1; /* + 1 for the initial '/'. */
2193 /* C++ comments probably (not definitely) have moved past a new
2194 line, which we don't want to save in the comment. */
2195 if (is_vspace (pfile->buffer->cur[-1]))
2196 len--;
2198 /* If we are currently in a directive or in argument parsing, then
2199 we need to store all C++ comments as C comments internally, and
2200 so we need to allocate a little extra space in that case.
2202 Note that the only time we encounter a directive here is
2203 when we are saving comments in a "#define". */
2204 clen = ((pfile->state.in_directive || pfile->state.parsing_args)
2205 && type == '/') ? len + 2 : len;
2207 buffer = _cpp_unaligned_alloc (pfile, clen);
2209 token->type = CPP_COMMENT;
2210 token->val.str.len = clen;
2211 token->val.str.text = buffer;
2213 buffer[0] = '/';
2214 memcpy (buffer + 1, from, len - 1);
2216 /* Finish conversion to a C comment, if necessary. */
2217 if ((pfile->state.in_directive || pfile->state.parsing_args) && type == '/')
2219 buffer[1] = '*';
2220 buffer[clen - 2] = '*';
2221 buffer[clen - 1] = '/';
2222 /* As there can be in a C++ comments illegal sequences for C comments
2223 we need to filter them out. */
2224 for (i = 2; i < (clen - 2); i++)
2225 if (buffer[i] == '/' && (buffer[i - 1] == '*' || buffer[i + 1] == '*'))
2226 buffer[i] = '|';
2229 /* Finally store this comment for use by clients of libcpp. */
2230 store_comment (pfile, token);
2233 /* Returns true if comment at COMMENT_START is a recognized FALLTHROUGH
2234 comment.
   COMMENT_START points at the second character of the comment opener
   (a star for a block comment, '/' for a line comment), and
   pfile->buffer->cur is used as the upper bound of the comment text.
   What counts as "recognized" depends on the value of the
   cpp_warn_implicit_fallthrough option:
     0, or any value other than 1-4: nothing is recognized;
     1: every comment is recognized;
     2: the comment only has to contain, case insensitively,
	something matching falls?[ \t-]*thr(u|ough);
     3 and 4: the whole comment has to match one of a fixed set of
	spellings, and must be followed directly by the block comment
	terminator or, for a line comment, by a newline.  */
2236 static bool
2237 fallthrough_comment_p (cpp_reader *pfile, const unsigned char *comment_start)
/* Start scanning just after the second character of the opener.  */
2239 const unsigned char *from = comment_start + 1;
2241 switch (CPP_OPTION (pfile, cpp_warn_implicit_fallthrough))
2243 /* For both -Wimplicit-fallthrough=0 and -Wimplicit-fallthrough=5 we
2244 don't recognize any comments. The latter only checks attributes,
2245 the former doesn't warn. */
2246 case 0:
2247 default:
2248 return false;
2249 /* -Wimplicit-fallthrough=1 considers any comment, no matter what
2250 content it has. */
2251 case 1:
2252 return true;
2253 case 2:
2254 /* -Wimplicit-fallthrough=2 looks for (case insensitive)
2255 .*falls?[ \t-]*thr(u|ough).* regex. */
/* Try every start position that still leaves room for the shortest
   possible match, "fallthru", before pfile->buffer->cur.  */
2256 for (; (size_t) (pfile->buffer->cur - from) >= sizeof "fallthru" - 1;
2257 from++)
2259 /* Is there anything like strpbrk with upper boundary, or
2260 memchr looking for 2 characters rather than just one? */
2261 if (from[0] != 'f' && from[0] != 'F')
2262 continue;
2263 if (from[1] != 'a' && from[1] != 'A')
2264 continue;
2265 if (from[2] != 'l' && from[2] != 'L')
2266 continue;
2267 if (from[3] != 'l' && from[3] != 'L')
2268 continue;
/* "fall" matched; step over it, then over an optional 's' and any run
   of spaces, tabs and dashes.  */
2269 from += sizeof "fall" - 1;
2270 if (from[0] == 's' || from[0] == 'S')
2271 from++;
2272 while (*from == ' ' || *from == '\t' || *from == '-')
2273 from++;
2274 if (from[0] != 't' && from[0] != 'T')
2275 continue;
2276 if (from[1] != 'h' && from[1] != 'H')
2277 continue;
2278 if (from[2] != 'r' && from[2] != 'R')
2279 continue;
/* "thr" matched; either a 'u' or "ough" completes the word.  */
2280 if (from[3] == 'u' || from[3] == 'U')
2281 return true;
2282 if (from[3] != 'o' && from[3] != 'O')
2283 continue;
2284 if (from[4] != 'u' && from[4] != 'U')
2285 continue;
2286 if (from[5] != 'g' && from[5] != 'G')
2287 continue;
2288 if (from[6] != 'h' && from[6] != 'H')
2289 continue;
2290 return true;
2292 return false;
/* Levels 3 and 4 share the whole-comment matching that follows the
   switch.  */
2293 case 3:
2294 case 4:
2295 break;
2298 /* Whole comment contents:
2299 -fallthrough
2300 @fallthrough@
2302 if (*from == '-' || *from == '@')
2304 size_t len = sizeof "fallthrough" - 1;
2305 if ((size_t) (pfile->buffer->cur - from - 1) < len)
2306 return false;
2307 if (memcmp (from + 1, "fallthrough", len))
2308 return false;
2309 if (*from == '@')
2311 if (from[len + 1] != '@')
2312 return false;
2313 len++;
2315 from += 1 + len;
2317 /* Whole comment contents (regex):
2318 lint -fallthrough[ \t]*
2320 else if (*from == 'l')
2322 size_t len = sizeof "int -fallthrough" - 1;
2323 if ((size_t) (pfile->buffer->cur - from - 1) < len)
2324 return false;
2325 if (memcmp (from + 1, "int -fallthrough", len))
2326 return false;
2327 from += 1 + len;
2328 while (*from == ' ' || *from == '\t')
2329 from++;
2331 /* Whole comment contents (regex):
2332 [ \t]*FALLTHR(U|OUGH)[ \t]*
2334 else if (CPP_OPTION (pfile, cpp_warn_implicit_fallthrough) == 4)
2336 while (*from == ' ' || *from == '\t')
2337 from++;
2338 if ((size_t) (pfile->buffer->cur - from) < sizeof "FALLTHRU" - 1)
2339 return false;
2340 if (memcmp (from, "FALLTHR", sizeof "FALLTHR" - 1))
2341 return false;
2342 from += sizeof "FALLTHR" - 1;
2343 if (*from == 'U')
2344 from++;
2345 else if ((size_t) (pfile->buffer->cur - from) < sizeof "OUGH" - 1)
2346 return false;
2347 else if (memcmp (from, "OUGH", sizeof "OUGH" - 1))
2348 return false;
2349 else
2350 from += sizeof "OUGH" - 1;
2351 while (*from == ' ' || *from == '\t')
2352 from++;
2354 /* Whole comment contents (regex):
2355 [ \t.!]*(ELSE,? |INTENTIONAL(LY)? )?FALL(S | |-)?THR(OUGH|U)[ \t.!]*(-[^\n\r]*)?
2356 [ \t.!]*(Else,? |Intentional(ly)? )?Fall((s | |-)[Tt]|t)hr(ough|u)[ \t.!]*(-[^\n\r]*)?
2357 [ \t.!]*([Ee]lse,? |[Ii]ntentional(ly)? )?fall(s | |-)?thr(ough|u)[ \t.!]*(-[^\n\r]*)?
2359 else
2361 while (*from == ' ' || *from == '\t' || *from == '.' || *from == '!')
2362 from++;
2363 unsigned char f = *from;
2364 bool all_upper = false;
2365 if (f == 'E' || f == 'e')
2367 if ((size_t) (pfile->buffer->cur - from)
2368 < sizeof "else fallthru" - 1)
2369 return false;
2370 if (f == 'E' && memcmp (from + 1, "LSE", sizeof "LSE" - 1) == 0)
2371 all_upper = true;
2372 else if (memcmp (from + 1, "lse", sizeof "lse" - 1))
2373 return false;
2374 from += sizeof "else" - 1;
2375 if (*from == ',')
2376 from++;
2377 if (*from != ' ')
2378 return false;
2379 from++;
2380 if (all_upper && *from == 'f')
2381 return false;
2382 if (f == 'e' && *from == 'F')
2383 return false;
2384 f = *from;
2386 else if (f == 'I' || f == 'i')
2388 if ((size_t) (pfile->buffer->cur - from)
2389 < sizeof "intentional fallthru" - 1)
2390 return false;
2391 if (f == 'I' && memcmp (from + 1, "NTENTIONAL",
2392 sizeof "NTENTIONAL" - 1) == 0)
2393 all_upper = true;
2394 else if (memcmp (from + 1, "ntentional",
2395 sizeof "ntentional" - 1))
2396 return false;
2397 from += sizeof "intentional" - 1;
2398 if (*from == ' ')
2400 from++;
2401 if (all_upper && *from == 'f')
2402 return false;
2404 else if (all_upper)
2406 if (memcmp (from, "LY F", sizeof "LY F" - 1))
2407 return false;
2408 from += sizeof "LY " - 1;
2410 else
2412 if (memcmp (from, "ly ", sizeof "ly " - 1))
2413 return false;
2414 from += sizeof "ly " - 1;
2416 if (f == 'i' && *from == 'F')
2417 return false;
2418 f = *from;
2420 if (f != 'F' && f != 'f')
2421 return false;
2422 if ((size_t) (pfile->buffer->cur - from) < sizeof "fallthru" - 1)
2423 return false;
2424 if (f == 'F' && memcmp (from + 1, "ALL", sizeof "ALL" - 1) == 0)
2425 all_upper = true;
2426 else if (all_upper)
2427 return false;
2428 else if (memcmp (from + 1, "all", sizeof "all" - 1))
2429 return false;
2430 from += sizeof "fall" - 1;
2431 if (*from == (all_upper ? 'S' : 's') && from[1] == ' ')
2432 from += 2;
2433 else if (*from == ' ' || *from == '-')
2434 from++;
2435 else if (*from != (all_upper ? 'T' : 't'))
2436 return false;
2437 if ((f == 'f' || *from != 'T') && (all_upper || *from != 't'))
2438 return false;
2439 if ((size_t) (pfile->buffer->cur - from) < sizeof "thru" - 1)
2440 return false;
2441 if (memcmp (from + 1, all_upper ? "HRU" : "hru", sizeof "hru" - 1))
2443 if ((size_t) (pfile->buffer->cur - from) < sizeof "through" - 1)
2444 return false;
2445 if (memcmp (from + 1, all_upper ? "HROUGH" : "hrough",
2446 sizeof "hrough" - 1))
2447 return false;
2448 from += sizeof "through" - 1;
2450 else
2451 from += sizeof "thru" - 1;
2452 while (*from == ' ' || *from == '\t' || *from == '.' || *from == '!')
2453 from++;
2454 if (*from == '-')
2456 from++;
2457 if (*comment_start == '*')
2461 while (*from && *from != '*'
2462 && *from != '\n' && *from != '\r')
2463 from++;
2464 if (*from != '*' || from[1] == '/')
2465 break;
2466 from++;
2468 while (1);
2470 else
2471 while (*from && *from != '\n' && *from != '\r')
2472 from++;
2475 /* C block comment. */
2476 if (*comment_start == '*')
2478 if (*from != '*' || from[1] != '/')
2479 return false;
2481 /* C++ line comment. */
2482 else if (*from != '\n')
2483 return false;
2485 return true;
2488 /* Allocate COUNT tokens for RUN. */
2489 void
2490 _cpp_init_tokenrun (tokenrun *run, unsigned int count)
2492 run->base = XNEWVEC (cpp_token, count);
2493 run->limit = run->base + count;
2494 run->next = NULL;
2497 /* Returns the next tokenrun, or creates one if there is none. */
2498 static tokenrun *
2499 next_tokenrun (tokenrun *run)
2501 if (run->next == NULL)
2503 run->next = XNEW (tokenrun);
2504 run->next->prev = run;
2505 _cpp_init_tokenrun (run->next, 250);
2508 return run->next;
2511 /* Return the number of not yet processed token in a given
2512 context. */
2514 _cpp_remaining_tokens_num_in_context (cpp_context *context)
2516 if (context->tokens_kind == TOKENS_KIND_DIRECT)
2517 return (LAST (context).token - FIRST (context).token);
2518 else if (context->tokens_kind == TOKENS_KIND_INDIRECT
2519 || context->tokens_kind == TOKENS_KIND_EXTENDED)
2520 return (LAST (context).ptoken - FIRST (context).ptoken);
2521 else
2522 abort ();
2525 /* Returns the token present at index INDEX in a given context. If
2526 INDEX is zero, the next token to be processed is returned. */
2527 static const cpp_token*
2528 _cpp_token_from_context_at (cpp_context *context, int index)
2530 if (context->tokens_kind == TOKENS_KIND_DIRECT)
2531 return &(FIRST (context).token[index]);
2532 else if (context->tokens_kind == TOKENS_KIND_INDIRECT
2533 || context->tokens_kind == TOKENS_KIND_EXTENDED)
2534 return FIRST (context).ptoken[index];
2535 else
2536 abort ();
2539 /* Look ahead in the input stream. */
2540 const cpp_token *
2541 cpp_peek_token (cpp_reader *pfile, int index)
2543 cpp_context *context = pfile->context;
2544 const cpp_token *peektok;
2545 int count;
2547 /* First, scan through any pending cpp_context objects. */
2548 while (context->prev)
2550 ptrdiff_t sz = _cpp_remaining_tokens_num_in_context (context);
2552 if (index < (int) sz)
2553 return _cpp_token_from_context_at (context, index);
2554 index -= (int) sz;
2555 context = context->prev;
2558 /* We will have to read some new tokens after all (and do so
2559 without invalidating preceding tokens). */
2560 count = index;
2561 pfile->keep_tokens++;
2563 /* For peeked tokens temporarily disable line_change reporting,
2564 until the tokens are parsed for real. */
2565 void (*line_change) (cpp_reader *, const cpp_token *, int)
2566 = pfile->cb.line_change;
2567 pfile->cb.line_change = NULL;
2571 peektok = _cpp_lex_token (pfile);
2572 if (peektok->type == CPP_EOF)
2574 index--;
2575 break;
2577 else if (peektok->type == CPP_PRAGMA)
2579 /* Don't peek past a pragma. */
2580 if (peektok == &pfile->directive_result)
2581 /* Save the pragma in the buffer. */
2582 *pfile->cur_token++ = *peektok;
2583 index--;
2584 break;
2587 while (index--);
2589 _cpp_backup_tokens_direct (pfile, count - index);
2590 pfile->keep_tokens--;
2591 pfile->cb.line_change = line_change;
2593 return peektok;
2596 /* Allocate a single token that is invalidated at the same time as the
2597 rest of the tokens on the line. Has its line and col set to the
2598 same as the last lexed token, so that diagnostics appear in the
2599 right place. */
2600 cpp_token *
2601 _cpp_temp_token (cpp_reader *pfile)
2603 cpp_token *old, *result;
2604 ptrdiff_t sz = pfile->cur_run->limit - pfile->cur_token;
2605 ptrdiff_t la = (ptrdiff_t) pfile->lookaheads;
2607 old = pfile->cur_token - 1;
2608 /* Any pre-existing lookaheads must not be clobbered. */
2609 if (la)
2611 if (sz <= la)
2613 tokenrun *next = next_tokenrun (pfile->cur_run);
2615 if (sz < la)
2616 memmove (next->base + 1, next->base,
2617 (la - sz) * sizeof (cpp_token));
2619 next->base[0] = pfile->cur_run->limit[-1];
2622 if (sz > 1)
2623 memmove (pfile->cur_token + 1, pfile->cur_token,
2624 MIN (la, sz - 1) * sizeof (cpp_token));
2627 if (!sz && pfile->cur_token == pfile->cur_run->limit)
2629 pfile->cur_run = next_tokenrun (pfile->cur_run);
2630 pfile->cur_token = pfile->cur_run->base;
2633 result = pfile->cur_token++;
2634 result->src_loc = old->src_loc;
2635 return result;
2638 /* We're at the beginning of a logical line (so not in
2639 directives-mode) and RESULT is a CPP_NAME with NODE_MODULE set. See
2640 if we should enter deferred_pragma mode to tokenize the rest of the
2641 line as a module control-line.
   Up to two further tokens are lexed to decide.  If the line is a
   module control-line, in_deferred_pragma stays set and the keyword
   tokens are rewritten to the second entry of their n_modules pair;
   otherwise save_comments, in_deferred_pragma and angled_headers are
   put back to their entry values.  In both cases the peeked tokens
   are backed up.  */
2643 static void
2644 cpp_maybe_module_directive (cpp_reader *pfile, cpp_token *result)
2646 unsigned backup = 0; /* Tokens we peeked. */
2647 cpp_hashnode *node = result->val.node.node;
2648 cpp_token *peek = result;
2649 cpp_token *keyword = peek;
/* For each keyword, entry [0] is the node compared against the source
   token and entry [1] is the node a recognized keyword is mapped to.  */
2650 cpp_hashnode *(&n_modules)[spec_nodes::M_HWM][2] = pfile->spec_nodes.n_modules;
/* Stays zero for "module"; nonzero for "import" and "__import", and
   later stored in directive_file_token.  */
2651 int header_count = 0;
2653 /* Make sure the incoming state is as we expect it. This way we
2654 can restore it using constants. */
2655 gcc_checking_assert (!pfile->state.in_deferred_pragma
2656 && !pfile->state.skipping
2657 && !pfile->state.parsing_args
2658 && !pfile->state.angled_headers
2659 && (pfile->state.save_comments
2660 == !CPP_OPTION (pfile, discard_comments)));
2662 /* Enter directives mode sufficiently for peeking. We don't have
2663 to actually set in_directive. */
2664 pfile->state.in_deferred_pragma = true;
2666 /* These two fields are needed to process tokenization in deferred
2667 pragma mode. They are not used outside deferred pragma mode or
2668 directives mode. */
2669 pfile->state.pragma_allow_expansion = true;
2670 pfile->directive_line = result->src_loc;
2672 /* Saving comments is incompatible with directives mode. */
2673 pfile->state.save_comments = 0;
/* A leading "export" must be followed by another NODE_MODULE name,
   which then becomes the keyword under test.  */
2675 if (node == n_modules[spec_nodes::M_EXPORT][0])
2677 peek = _cpp_lex_direct (pfile);
2678 keyword = peek;
2679 backup++;
2680 if (keyword->type != CPP_NAME)
2681 goto not_module;
2682 node = keyword->val.node.node;
2683 if (!(node->flags & NODE_MODULE))
2684 goto not_module;
2687 if (node == n_modules[spec_nodes::M__IMPORT][0])
2688 /* __import */
2689 header_count = backup + 2 + 16;
2690 else if (node == n_modules[spec_nodes::M_IMPORT][0])
2691 /* import */
2692 header_count = backup + 2 + (CPP_OPTION (pfile, preprocessed) ? 16 : 0);
2693 else if (node == n_modules[spec_nodes::M_MODULE][0])
2694 ; /* module */
2695 else
2696 goto not_module;
2698 /* We've seen [export] {module|import|__import}. Check the next token. */
2699 if (header_count)
2700 /* After '{,__}import' a header name may appear. */
2701 pfile->state.angled_headers = true;
2702 peek = _cpp_lex_direct (pfile);
2703 backup++;
2705 /* ... import followed by identifier, ':', '<' or
2706 header-name preprocessing tokens, or module
2707 followed by cpp-identifier, ':' or ';' preprocessing
2708 tokens. C++ keywords are not yet relevant. */
2709 if (peek->type == CPP_NAME
2710 || peek->type == CPP_COLON
2711 || (header_count
2712 ? (peek->type == CPP_LESS
2713 || (peek->type == CPP_STRING && peek->val.str.text[0] != 'R')
2714 || peek->type == CPP_HEADER_NAME)
2715 : peek->type == CPP_SEMICOLON))
/* Expansion is allowed on the rest of the line unless the input is
   already preprocessed; the matching prevent_expansion decrement is
   done when the end of the line is lexed.  */
2717 pfile->state.pragma_allow_expansion = !CPP_OPTION (pfile, preprocessed);
2718 if (!pfile->state.pragma_allow_expansion)
2719 pfile->state.prevent_expansion++;
2721 if (!header_count && linemap_included_from
2722 (LINEMAPS_LAST_ORDINARY_MAP (pfile->line_table)))
2723 cpp_error_with_line (pfile, CPP_DL_ERROR, keyword->src_loc, 0,
2724 "module control-line cannot be in included file");
2726 /* The first one or two tokens cannot be macro names. */
2727 for (int ix = backup; ix--;)
2729 cpp_token *tok = ix ? keyword : result;
2730 cpp_hashnode *node = tok->val.node.node;
2732 /* Don't attempt to expand the token. */
2733 tok->flags |= NO_EXPAND;
2734 if (_cpp_defined_macro_p (node)
2735 && _cpp_maybe_notify_macro_use (pfile, node, tok->src_loc)
2736 && !cpp_fun_like_macro_p (node))
2737 cpp_error_with_line (pfile, CPP_DL_ERROR, tok->src_loc, 0,
2738 "module control-line \"%s\" cannot be"
2739 " an object-like macro",
2740 NODE_NAME (node));
2743 /* Map to underbar variants. */
2744 keyword->val.node.node = n_modules[header_count
2745 ? spec_nodes::M_IMPORT
2746 : spec_nodes::M_MODULE][1];
/* BACKUP is 1 only when there was no leading "export".  */
2747 if (backup != 1)
2748 result->val.node.node = n_modules[spec_nodes::M_EXPORT][1];
2750 /* Maybe tell the tokenizer we expect a header-name down the
2751 road. */
2752 pfile->state.directive_file_token = header_count;
2754 else
2756 not_module:
2757 /* Drop out of directive mode. */
2758 /* We asserted save_comments had this value upon entry. */
2759 pfile->state.save_comments
2760 = !CPP_OPTION (pfile, discard_comments);
2761 pfile->state.in_deferred_pragma = false;
2762 /* Do not let this remain on. */
2763 pfile->state.angled_headers = false;
2766 /* In either case we want to backup the peeked tokens. */
2767 if (backup)
2769 /* If we saw EOL, we should drop it, because this isn't a module
2770 control-line after all. */
2771 bool eol = peek->type == CPP_PRAGMA_EOL;
2772 if (!eol || backup > 1)
2774 /* Put the peeked tokens back */
2775 _cpp_backup_tokens_direct (pfile, backup);
2776 /* But if the last one was an EOL, forget it. */
2777 if (eol)
2778 pfile->lookaheads--;
2783 /* Lex a token into RESULT (external interface). Takes care of issues
2784 like directive handling, token lookahead, multiple include
2785 optimization and skipping. */
2786 const cpp_token *
2787 _cpp_lex_token (cpp_reader *pfile)
2789 cpp_token *result;
2791 for (;;)
2793 if (pfile->cur_token == pfile->cur_run->limit)
2795 pfile->cur_run = next_tokenrun (pfile->cur_run);
2796 pfile->cur_token = pfile->cur_run->base;
2798 /* We assume that the current token is somewhere in the current
2799 run. */
2800 if (pfile->cur_token < pfile->cur_run->base
2801 || pfile->cur_token >= pfile->cur_run->limit)
2802 abort ();
2804 if (pfile->lookaheads)
2806 pfile->lookaheads--;
2807 result = pfile->cur_token++;
2809 else
2810 result = _cpp_lex_direct (pfile);
2812 if (result->flags & BOL)
2814 /* Is this a directive. If _cpp_handle_directive returns
2815 false, it is an assembler #. */
2816 if (result->type == CPP_HASH
2817 /* 6.10.3 p 11: Directives in a list of macro arguments
2818 gives undefined behavior. This implementation
2819 handles the directive as normal. */
2820 && pfile->state.parsing_args != 1)
2822 if (_cpp_handle_directive (pfile, result->flags & PREV_WHITE))
2824 if (pfile->directive_result.type == CPP_PADDING)
2825 continue;
2826 result = &pfile->directive_result;
2829 else if (pfile->state.in_deferred_pragma)
2830 result = &pfile->directive_result;
2831 else if (result->type == CPP_NAME
2832 && (result->val.node.node->flags & NODE_MODULE)
2833 && !pfile->state.skipping
2834 /* Unlike regular directives, we do not deal with
2835 tokenizing module directives as macro arguments.
2836 That's not permitted. */
2837 && !pfile->state.parsing_args)
2839 /* P1857. Before macro expansion, At start of logical
2840 line ... */
2841 /* We don't have to consider lookaheads at this point. */
2842 gcc_checking_assert (!pfile->lookaheads);
2844 cpp_maybe_module_directive (pfile, result);
2847 if (pfile->cb.line_change && !pfile->state.skipping)
2848 pfile->cb.line_change (pfile, result, pfile->state.parsing_args);
2851 /* We don't skip tokens in directives. */
2852 if (pfile->state.in_directive || pfile->state.in_deferred_pragma)
2853 break;
2855 /* Outside a directive, invalidate controlling macros. At file
2856 EOF, _cpp_lex_direct takes care of popping the buffer, so we never
2857 get here and MI optimization works. */
2858 pfile->mi_valid = false;
2860 if (!pfile->state.skipping || result->type == CPP_EOF)
2861 break;
2864 return result;
2867 /* Returns true if a fresh line has been loaded. */
2868 bool
2869 _cpp_get_fresh_line (cpp_reader *pfile)
2871 /* We can't get a new line until we leave the current directive. */
2872 if (pfile->state.in_directive)
2873 return false;
2875 for (;;)
2877 cpp_buffer *buffer = pfile->buffer;
2879 if (!buffer->need_line)
2880 return true;
2882 if (buffer->next_line < buffer->rlimit)
2884 _cpp_clean_line (pfile);
2885 return true;
2888 /* First, get out of parsing arguments state. */
2889 if (pfile->state.parsing_args)
2890 return false;
2892 /* End of buffer. Non-empty files should end in a newline. */
2893 if (buffer->buf != buffer->rlimit
2894 && buffer->next_line > buffer->rlimit
2895 && !buffer->from_stage3)
2897 /* Clip to buffer size. */
2898 buffer->next_line = buffer->rlimit;
2901 if (buffer->prev && !buffer->return_at_eof)
2902 _cpp_pop_buffer (pfile);
2903 else
2905 /* End of translation. Do not pop the buffer yet. Increment
2906 line number so that the EOF token is on a line of its own
2907 (_cpp_lex_direct doesn't increment in that case, because
2908 it's hard for it to distinguish this special case). */
2909 CPP_INCREMENT_LINE (pfile, 0);
2910 return false;
/* Helper for lexing one- and two-character operators.  Expects
   "buffer" and "result" to be in scope at the point of use.  Sets
   result->type to ELSE_TYPE; if the next unread character in the
   buffer equals CHAR, consumes it and sets result->type to THEN_TYPE
   instead.  The parameters are parenthesized so that expression
   arguments cannot rebind against the surrounding operators.  */
#define IF_NEXT_IS(CHAR, THEN_TYPE, ELSE_TYPE)		\
  do							\
    {							\
      result->type = (ELSE_TYPE);			\
      if (*buffer->cur == (CHAR))			\
	buffer->cur++, result->type = (THEN_TYPE);	\
    }							\
  while (0)
2924 /* Lex a token into pfile->cur_token, which is also incremented, to
2925 get diagnostics pointing to the correct location.
2927 Does not handle issues such as token lookahead, multiple-include
2928 optimization, directives, skipping etc. This function is only
2929 suitable for use by _cpp_lex_token, and in special cases like
2930 lex_expansion_token which doesn't care for any of these issues.
2932 When meeting a newline, returns CPP_EOF if parsing a directive,
2933 otherwise returns to the start of the token buffer if permissible.
2934 Returns the location of the lexed token. */
2935 cpp_token *
2936 _cpp_lex_direct (cpp_reader *pfile)
2938 cppchar_t c;
2939 cpp_buffer *buffer;
2940 const unsigned char *comment_start;
2941 bool fallthrough_comment = false;
2942 cpp_token *result = pfile->cur_token++;
2944 fresh_line:
2945 result->flags = 0;
2946 buffer = pfile->buffer;
2947 if (buffer->need_line)
2949 gcc_assert (!pfile->state.in_deferred_pragma);
2950 if (!_cpp_get_fresh_line (pfile))
2952 result->type = CPP_EOF;
2953 /* Not a real EOF in a directive or arg parsing -- we refuse
2954 to advance to the next file now, and will once we're out
2955 of those modes. */
2956 if (!pfile->state.in_directive && !pfile->state.parsing_args)
2958 /* Tell the compiler the line number of the EOF token. */
2959 result->src_loc = pfile->line_table->highest_line;
2960 result->flags = BOL;
2961 /* Now pop the buffer that _cpp_get_fresh_line did not. */
2962 _cpp_pop_buffer (pfile);
2964 return result;
2966 if (buffer != pfile->buffer)
2967 fallthrough_comment = false;
2968 if (!pfile->keep_tokens)
2970 pfile->cur_run = &pfile->base_run;
2971 result = pfile->base_run.base;
2972 pfile->cur_token = result + 1;
2974 result->flags = BOL;
2975 if (pfile->state.parsing_args == 2)
2976 result->flags |= PREV_WHITE;
2978 buffer = pfile->buffer;
2979 update_tokens_line:
2980 result->src_loc = pfile->line_table->highest_line;
2982 skipped_white:
2983 if (buffer->cur >= buffer->notes[buffer->cur_note].pos
2984 && !pfile->overlaid_buffer)
2986 _cpp_process_line_notes (pfile, false);
2987 result->src_loc = pfile->line_table->highest_line;
2989 c = *buffer->cur++;
2991 if (pfile->forced_token_location)
2992 result->src_loc = pfile->forced_token_location;
2993 else
2994 result->src_loc = linemap_position_for_column (pfile->line_table,
2995 CPP_BUF_COLUMN (buffer, buffer->cur));
2997 switch (c)
2999 case ' ': case '\t': case '\f': case '\v': case '\0':
3000 result->flags |= PREV_WHITE;
3001 skip_whitespace (pfile, c);
3002 goto skipped_white;
3004 case '\n':
3005 /* Increment the line, unless this is the last line ... */
3006 if (buffer->cur < buffer->rlimit
3007 /* ... or this is a #include, (where _cpp_stack_file needs to
3008 unwind by one line) ... */
3009 || (pfile->state.in_directive > 1
3010 /* ... except traditional-cpp increments this elsewhere. */
3011 && !CPP_OPTION (pfile, traditional)))
3012 CPP_INCREMENT_LINE (pfile, 0);
3013 buffer->need_line = true;
3014 if (pfile->state.in_deferred_pragma)
3016 /* Produce the PRAGMA_EOL on this line. File reading
3017 ensures there is always a \n at end of the buffer, thus
3018 in a deferred pragma we always see CPP_PRAGMA_EOL before
3019 any CPP_EOF. */
3020 result->type = CPP_PRAGMA_EOL;
3021 result->flags &= ~PREV_WHITE;
3022 pfile->state.in_deferred_pragma = false;
3023 if (!pfile->state.pragma_allow_expansion)
3024 pfile->state.prevent_expansion--;
3025 return result;
3027 goto fresh_line;
3029 case '0': case '1': case '2': case '3': case '4':
3030 case '5': case '6': case '7': case '8': case '9':
3032 struct normalize_state nst = INITIAL_NORMALIZE_STATE;
3033 result->type = CPP_NUMBER;
3034 lex_number (pfile, &result->val.str, &nst);
3035 warn_about_normalization (pfile, result, &nst);
3036 break;
3039 case 'L':
3040 case 'u':
3041 case 'U':
3042 case 'R':
3043 /* 'L', 'u', 'U', 'u8' or 'R' may introduce wide characters,
3044 wide strings or raw strings. */
3045 if (c == 'L' || CPP_OPTION (pfile, rliterals)
3046 || (c != 'R' && CPP_OPTION (pfile, uliterals)))
3048 if ((*buffer->cur == '\'' && c != 'R')
3049 || *buffer->cur == '"'
3050 || (*buffer->cur == 'R'
3051 && c != 'R'
3052 && buffer->cur[1] == '"'
3053 && CPP_OPTION (pfile, rliterals))
3054 || (*buffer->cur == '8'
3055 && c == 'u'
3056 && ((buffer->cur[1] == '"' || (buffer->cur[1] == '\''
3057 && CPP_OPTION (pfile, utf8_char_literals)))
3058 || (buffer->cur[1] == 'R' && buffer->cur[2] == '"'
3059 && CPP_OPTION (pfile, rliterals)))))
3061 lex_string (pfile, result, buffer->cur - 1);
3062 break;
3065 /* Fall through. */
3067 case '_':
3068 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
3069 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
3070 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
3071 case 's': case 't': case 'v': case 'w': case 'x':
3072 case 'y': case 'z':
3073 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
3074 case 'G': case 'H': case 'I': case 'J': case 'K':
3075 case 'M': case 'N': case 'O': case 'P': case 'Q':
3076 case 'S': case 'T': case 'V': case 'W': case 'X':
3077 case 'Y': case 'Z':
3078 result->type = CPP_NAME;
3080 struct normalize_state nst = INITIAL_NORMALIZE_STATE;
3081 result->val.node.node = lex_identifier (pfile, buffer->cur - 1, false,
3082 &nst,
3083 &result->val.node.spelling);
3084 warn_about_normalization (pfile, result, &nst);
3087 /* Convert named operators to their proper types. */
3088 if (result->val.node.node->flags & NODE_OPERATOR)
3090 result->flags |= NAMED_OP;
3091 result->type = (enum cpp_ttype) result->val.node.node->directive_index;
3094 /* Signal FALLTHROUGH comment followed by another token. */
3095 if (fallthrough_comment)
3096 result->flags |= PREV_FALLTHROUGH;
3097 break;
3099 case '\'':
3100 case '"':
3101 lex_string (pfile, result, buffer->cur - 1);
3102 break;
3104 case '/':
3105 /* A potential block or line comment. */
3106 comment_start = buffer->cur;
3107 c = *buffer->cur;
3109 if (c == '*')
3111 if (_cpp_skip_block_comment (pfile))
3112 cpp_error (pfile, CPP_DL_ERROR, "unterminated comment");
3114 else if (c == '/' && ! CPP_OPTION (pfile, traditional))
3116 /* Don't warn for system headers. */
3117 if (_cpp_in_system_header (pfile))
3119 /* Warn about comments if pedantically GNUC89, and not
3120 in system headers. */
3121 else if (CPP_OPTION (pfile, lang) == CLK_GNUC89
3122 && CPP_PEDANTIC (pfile)
3123 && ! buffer->warned_cplusplus_comments)
3125 if (cpp_error (pfile, CPP_DL_PEDWARN,
3126 "C++ style comments are not allowed in ISO C90"))
3127 cpp_error (pfile, CPP_DL_NOTE,
3128 "(this will be reported only once per input file)");
3129 buffer->warned_cplusplus_comments = 1;
3131 /* Or if specifically desired via -Wc90-c99-compat. */
3132 else if (CPP_OPTION (pfile, cpp_warn_c90_c99_compat) > 0
3133 && ! CPP_OPTION (pfile, cplusplus)
3134 && ! buffer->warned_cplusplus_comments)
3136 if (cpp_error (pfile, CPP_DL_WARNING,
3137 "C++ style comments are incompatible with C90"))
3138 cpp_error (pfile, CPP_DL_NOTE,
3139 "(this will be reported only once per input file)");
3140 buffer->warned_cplusplus_comments = 1;
3142 /* In C89/C94, C++ style comments are forbidden. */
3143 else if ((CPP_OPTION (pfile, lang) == CLK_STDC89
3144 || CPP_OPTION (pfile, lang) == CLK_STDC94))
3146 /* But don't be confused about valid code such as
3147 - // immediately followed by *,
3148 - // in a preprocessing directive,
3149 - // in an #if 0 block. */
3150 if (buffer->cur[1] == '*'
3151 || pfile->state.in_directive
3152 || pfile->state.skipping)
3154 result->type = CPP_DIV;
3155 break;
3157 else if (! buffer->warned_cplusplus_comments)
3159 if (cpp_error (pfile, CPP_DL_ERROR,
3160 "C++ style comments are not allowed in "
3161 "ISO C90"))
3162 cpp_error (pfile, CPP_DL_NOTE,
3163 "(this will be reported only once per input "
3164 "file)");
3165 buffer->warned_cplusplus_comments = 1;
3168 if (skip_line_comment (pfile) && CPP_OPTION (pfile, warn_comments))
3169 cpp_warning (pfile, CPP_W_COMMENTS, "multi-line comment");
3171 else if (c == '=')
3173 buffer->cur++;
3174 result->type = CPP_DIV_EQ;
3175 break;
3177 else
3179 result->type = CPP_DIV;
3180 break;
3183 if (fallthrough_comment_p (pfile, comment_start))
3184 fallthrough_comment = true;
3186 if (pfile->cb.comment)
3188 size_t len = pfile->buffer->cur - comment_start;
3189 pfile->cb.comment (pfile, result->src_loc, comment_start - 1,
3190 len + 1);
3193 if (!pfile->state.save_comments)
3195 result->flags |= PREV_WHITE;
3196 goto update_tokens_line;
3199 if (fallthrough_comment)
3200 result->flags |= PREV_FALLTHROUGH;
3202 /* Save the comment as a token in its own right. */
3203 save_comment (pfile, result, comment_start, c);
3204 break;
3206 case '<':
3207 if (pfile->state.angled_headers)
3209 lex_string (pfile, result, buffer->cur - 1);
3210 if (result->type != CPP_LESS)
3211 break;
3214 result->type = CPP_LESS;
3215 if (*buffer->cur == '=')
3217 buffer->cur++, result->type = CPP_LESS_EQ;
3218 if (*buffer->cur == '>'
3219 && CPP_OPTION (pfile, cplusplus)
3220 && CPP_OPTION (pfile, lang) >= CLK_GNUCXX20)
3221 buffer->cur++, result->type = CPP_SPACESHIP;
3223 else if (*buffer->cur == '<')
3225 buffer->cur++;
3226 IF_NEXT_IS ('=', CPP_LSHIFT_EQ, CPP_LSHIFT);
3228 else if (CPP_OPTION (pfile, digraphs))
3230 if (*buffer->cur == ':')
3232 /* C++11 [2.5/3 lex.pptoken], "Otherwise, if the next
3233 three characters are <:: and the subsequent character
3234 is neither : nor >, the < is treated as a preprocessor
3235 token by itself". */
3236 if (CPP_OPTION (pfile, cplusplus)
3237 && CPP_OPTION (pfile, lang) != CLK_CXX98
3238 && CPP_OPTION (pfile, lang) != CLK_GNUCXX
3239 && buffer->cur[1] == ':'
3240 && buffer->cur[2] != ':' && buffer->cur[2] != '>')
3241 break;
3243 buffer->cur++;
3244 result->flags |= DIGRAPH;
3245 result->type = CPP_OPEN_SQUARE;
3247 else if (*buffer->cur == '%')
3249 buffer->cur++;
3250 result->flags |= DIGRAPH;
3251 result->type = CPP_OPEN_BRACE;
3254 break;
3256 case '>':
3257 result->type = CPP_GREATER;
3258 if (*buffer->cur == '=')
3259 buffer->cur++, result->type = CPP_GREATER_EQ;
3260 else if (*buffer->cur == '>')
3262 buffer->cur++;
3263 IF_NEXT_IS ('=', CPP_RSHIFT_EQ, CPP_RSHIFT);
3265 break;
3267 case '%':
3268 result->type = CPP_MOD;
3269 if (*buffer->cur == '=')
3270 buffer->cur++, result->type = CPP_MOD_EQ;
3271 else if (CPP_OPTION (pfile, digraphs))
3273 if (*buffer->cur == ':')
3275 buffer->cur++;
3276 result->flags |= DIGRAPH;
3277 result->type = CPP_HASH;
3278 if (*buffer->cur == '%' && buffer->cur[1] == ':')
3279 buffer->cur += 2, result->type = CPP_PASTE, result->val.token_no = 0;
3281 else if (*buffer->cur == '>')
3283 buffer->cur++;
3284 result->flags |= DIGRAPH;
3285 result->type = CPP_CLOSE_BRACE;
3288 break;
3290 case '.':
3291 result->type = CPP_DOT;
3292 if (ISDIGIT (*buffer->cur))
3294 struct normalize_state nst = INITIAL_NORMALIZE_STATE;
3295 result->type = CPP_NUMBER;
3296 lex_number (pfile, &result->val.str, &nst);
3297 warn_about_normalization (pfile, result, &nst);
3299 else if (*buffer->cur == '.' && buffer->cur[1] == '.')
3300 buffer->cur += 2, result->type = CPP_ELLIPSIS;
3301 else if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
3302 buffer->cur++, result->type = CPP_DOT_STAR;
3303 break;
3305 case '+':
3306 result->type = CPP_PLUS;
3307 if (*buffer->cur == '+')
3308 buffer->cur++, result->type = CPP_PLUS_PLUS;
3309 else if (*buffer->cur == '=')
3310 buffer->cur++, result->type = CPP_PLUS_EQ;
3311 break;
3313 case '-':
3314 result->type = CPP_MINUS;
3315 if (*buffer->cur == '>')
3317 buffer->cur++;
3318 result->type = CPP_DEREF;
3319 if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
3320 buffer->cur++, result->type = CPP_DEREF_STAR;
3322 else if (*buffer->cur == '-')
3323 buffer->cur++, result->type = CPP_MINUS_MINUS;
3324 else if (*buffer->cur == '=')
3325 buffer->cur++, result->type = CPP_MINUS_EQ;
3326 break;
3328 case '&':
3329 result->type = CPP_AND;
3330 if (*buffer->cur == '&')
3331 buffer->cur++, result->type = CPP_AND_AND;
3332 else if (*buffer->cur == '=')
3333 buffer->cur++, result->type = CPP_AND_EQ;
3334 break;
3336 case '|':
3337 result->type = CPP_OR;
3338 if (*buffer->cur == '|')
3339 buffer->cur++, result->type = CPP_OR_OR;
3340 else if (*buffer->cur == '=')
3341 buffer->cur++, result->type = CPP_OR_EQ;
3342 break;
3344 case ':':
3345 result->type = CPP_COLON;
3346 if (*buffer->cur == ':' && CPP_OPTION (pfile, scope))
3347 buffer->cur++, result->type = CPP_SCOPE;
3348 else if (*buffer->cur == '>' && CPP_OPTION (pfile, digraphs))
3350 buffer->cur++;
3351 result->flags |= DIGRAPH;
3352 result->type = CPP_CLOSE_SQUARE;
3354 break;
3356 case '*': IF_NEXT_IS ('=', CPP_MULT_EQ, CPP_MULT); break;
3357 case '=': IF_NEXT_IS ('=', CPP_EQ_EQ, CPP_EQ); break;
3358 case '!': IF_NEXT_IS ('=', CPP_NOT_EQ, CPP_NOT); break;
3359 case '^': IF_NEXT_IS ('=', CPP_XOR_EQ, CPP_XOR); break;
3360 case '#': IF_NEXT_IS ('#', CPP_PASTE, CPP_HASH); result->val.token_no = 0; break;
3362 case '?': result->type = CPP_QUERY; break;
3363 case '~': result->type = CPP_COMPL; break;
3364 case ',': result->type = CPP_COMMA; break;
3365 case '(': result->type = CPP_OPEN_PAREN; break;
3366 case ')': result->type = CPP_CLOSE_PAREN; break;
3367 case '[': result->type = CPP_OPEN_SQUARE; break;
3368 case ']': result->type = CPP_CLOSE_SQUARE; break;
3369 case '{': result->type = CPP_OPEN_BRACE; break;
3370 case '}': result->type = CPP_CLOSE_BRACE; break;
3371 case ';': result->type = CPP_SEMICOLON; break;
3373 /* @ is a punctuator in Objective-C. */
3374 case '@': result->type = CPP_ATSIGN; break;
3376 default:
3378 const uchar *base = --buffer->cur;
3380 /* Check for an extended identifier ($ or UCN or UTF-8). */
3381 struct normalize_state nst = INITIAL_NORMALIZE_STATE;
3382 if (forms_identifier_p (pfile, true, &nst))
3384 result->type = CPP_NAME;
3385 result->val.node.node = lex_identifier (pfile, base, true, &nst,
3386 &result->val.node.spelling);
3387 warn_about_normalization (pfile, result, &nst);
3388 break;
3391 /* Otherwise this will form a CPP_OTHER token. Parse valid UTF-8 as a
3392 single token. */
3393 buffer->cur++;
3394 if (c >= utf8_signifier)
3396 const uchar *pstr = base;
3397 cppchar_t s;
3398 if (_cpp_valid_utf8 (pfile, &pstr, buffer->rlimit, 0, NULL, &s))
3399 buffer->cur = pstr;
3401 create_literal (pfile, result, base, buffer->cur - base, CPP_OTHER);
3402 break;
3407 /* Potentially convert the location of the token to a range. */
3408 if (result->src_loc >= RESERVED_LOCATION_COUNT
3409 && result->type != CPP_EOF)
3411 /* Ensure that any line notes are processed, so that we have the
3412 correct physical line/column for the end-point of the token even
3413 when a logical line is split via one or more backslashes. */
3414 if (buffer->cur >= buffer->notes[buffer->cur_note].pos
3415 && !pfile->overlaid_buffer)
3416 _cpp_process_line_notes (pfile, false);
3418 source_range tok_range;
3419 tok_range.m_start = result->src_loc;
3420 tok_range.m_finish
3421 = linemap_position_for_column (pfile->line_table,
3422 CPP_BUF_COLUMN (buffer, buffer->cur));
3424 result->src_loc = COMBINE_LOCATION_DATA (pfile->line_table,
3425 result->src_loc,
3426 tok_range, NULL);
3429 return result;
3432 /* An upper bound on the number of bytes needed to spell TOKEN.
3433 Does not include preceding whitespace. */
3434 unsigned int
3435 cpp_token_len (const cpp_token *token)
3437 unsigned int len;
3439 switch (TOKEN_SPELL (token))
3441 default: len = 6; break;
3442 case SPELL_LITERAL: len = token->val.str.len; break;
3443 case SPELL_IDENT: len = NODE_LEN (token->val.node.node) * 10; break;
3446 return len;
/* Decode the UTF-8 sequence that starts at NAME and store it in BUFFER
   as a \U escape followed by eight lower-case hexadecimal digits.
   Exactly 10 bytes are written to BUFFER and no terminating NUL is
   added.  Returns the number of bytes consumed from NAME.  Aborts if a
   continuation byte does not have the form 10xxxxxx.  */

static size_t
utf8_to_ucn (unsigned char *buffer, const unsigned char *name)
{
  static const char hex_digits[] = "0123456789abcdef";
  unsigned lead = *name;
  int seq_len = 0;
  unsigned long code_point;
  int k;
  int shift;

  /* The number of leading one bits in the first byte gives the length
     of the sequence.  */
  while (lead & 0x80)
    {
      seq_len++;
      lead <<= 1;
    }

  /* The payload bits of the first byte are those below the length
     marker.  */
  code_point = *name & (0x7F >> seq_len);

  for (k = 1; k < seq_len; k++)
    {
      name++;
      code_point = (code_point << 6) | (*name & 0x3F);

      /* Ill-formed UTF-8.  */
      if ((*name & ~0x3F) != 0x80)
        abort ();
    }

  buffer[0] = '\\';
  buffer[1] = 'U';
  buffer += 2;

  /* Emit the eight nibbles, most significant first.  */
  for (shift = 28; shift >= 0; shift -= 4)
    *buffer++ = hex_digits[(code_point >> shift) & 0xF];

  return seq_len;
}
3483 /* Given a token TYPE corresponding to a digraph, return a pointer to
3484 the spelling of the digraph. */
3485 static const unsigned char *
3486 cpp_digraph2name (enum cpp_ttype type)
3488 return digraph_spellings[(int) type - (int) CPP_FIRST_DIGRAPH];
3491 /* Write the spelling of an identifier IDENT, using UCNs, to BUFFER.
3492 The buffer must already contain the enough space to hold the
3493 token's spelling. Returns a pointer to the character after the
3494 last character written. */
3495 unsigned char *
3496 _cpp_spell_ident_ucns (unsigned char *buffer, cpp_hashnode *ident)
3498 size_t i;
3499 const unsigned char *name = NODE_NAME (ident);
3501 for (i = 0; i < NODE_LEN (ident); i++)
3502 if (name[i] & ~0x7F)
3504 i += utf8_to_ucn (buffer, name + i) - 1;
3505 buffer += 10;
3507 else
3508 *buffer++ = name[i];
3510 return buffer;
3513 /* Write the spelling of a token TOKEN to BUFFER. The buffer must
3514 already contain the enough space to hold the token's spelling.
3515 Returns a pointer to the character after the last character written.
3516 FORSTRING is true if this is to be the spelling after translation
3517 phase 1 (with the original spelling of extended identifiers), false
3518 if extended identifiers should always be written using UCNs (there is
3519 no option for always writing them in the internal UTF-8 form).
3520 FIXME: Would be nice if we didn't need the PFILE argument. */
3521 unsigned char *
3522 cpp_spell_token (cpp_reader *pfile, const cpp_token *token,
3523 unsigned char *buffer, bool forstring)
3525 switch (TOKEN_SPELL (token))
3527 case SPELL_OPERATOR:
3529 const unsigned char *spelling;
3530 unsigned char c;
3532 if (token->flags & DIGRAPH)
3533 spelling = cpp_digraph2name (token->type);
3534 else if (token->flags & NAMED_OP)
3535 goto spell_ident;
3536 else
3537 spelling = TOKEN_NAME (token);
3539 while ((c = *spelling++) != '\0')
3540 *buffer++ = c;
3542 break;
3544 spell_ident:
3545 case SPELL_IDENT:
3546 if (forstring)
3548 memcpy (buffer, NODE_NAME (token->val.node.spelling),
3549 NODE_LEN (token->val.node.spelling));
3550 buffer += NODE_LEN (token->val.node.spelling);
3552 else
3553 buffer = _cpp_spell_ident_ucns (buffer, token->val.node.node);
3554 break;
3556 case SPELL_LITERAL:
3557 memcpy (buffer, token->val.str.text, token->val.str.len);
3558 buffer += token->val.str.len;
3559 break;
3561 case SPELL_NONE:
3562 cpp_error (pfile, CPP_DL_ICE,
3563 "unspellable token %s", TOKEN_NAME (token));
3564 break;
3567 return buffer;
3570 /* Returns TOKEN spelt as a null-terminated string. The string is
3571 freed when the reader is destroyed. Useful for diagnostics. */
3572 unsigned char *
3573 cpp_token_as_text (cpp_reader *pfile, const cpp_token *token)
3575 unsigned int len = cpp_token_len (token) + 1;
3576 unsigned char *start = _cpp_unaligned_alloc (pfile, len), *end;
3578 end = cpp_spell_token (pfile, token, start, false);
3579 end[0] = '\0';
3581 return start;
3584 /* Returns a pointer to a string which spells the token defined by
3585 TYPE and FLAGS. Used by C front ends, which really should move to
3586 using cpp_token_as_text. */
3587 const char *
3588 cpp_type2name (enum cpp_ttype type, unsigned char flags)
3590 if (flags & DIGRAPH)
3591 return (const char *) cpp_digraph2name (type);
3592 else if (flags & NAMED_OP)
3593 return cpp_named_operator2name (type);
3595 return (const char *) token_spellings[type].name;
3598 /* Writes the spelling of token to FP, without any preceding space.
3599 Separated from cpp_spell_token for efficiency - to avoid stdio
3600 double-buffering. */
3601 void
3602 cpp_output_token (const cpp_token *token, FILE *fp)
3604 switch (TOKEN_SPELL (token))
3606 case SPELL_OPERATOR:
3608 const unsigned char *spelling;
3609 int c;
3611 if (token->flags & DIGRAPH)
3612 spelling = cpp_digraph2name (token->type);
3613 else if (token->flags & NAMED_OP)
3614 goto spell_ident;
3615 else
3616 spelling = TOKEN_NAME (token);
3618 c = *spelling;
3620 putc (c, fp);
3621 while ((c = *++spelling) != '\0');
3623 break;
3625 spell_ident:
3626 case SPELL_IDENT:
3628 size_t i;
3629 const unsigned char * name = NODE_NAME (token->val.node.node);
3631 for (i = 0; i < NODE_LEN (token->val.node.node); i++)
3632 if (name[i] & ~0x7F)
3634 unsigned char buffer[10];
3635 i += utf8_to_ucn (buffer, name + i) - 1;
3636 fwrite (buffer, 1, 10, fp);
3638 else
3639 fputc (NODE_NAME (token->val.node.node)[i], fp);
3641 break;
3643 case SPELL_LITERAL:
3644 if (token->type == CPP_HEADER_NAME)
3645 fputc ('"', fp);
3646 fwrite (token->val.str.text, 1, token->val.str.len, fp);
3647 if (token->type == CPP_HEADER_NAME)
3648 fputc ('"', fp);
3649 break;
3651 case SPELL_NONE:
3652 /* An error, most probably. */
3653 break;
3657 /* Compare two tokens. */
3659 _cpp_equiv_tokens (const cpp_token *a, const cpp_token *b)
3661 if (a->type == b->type && a->flags == b->flags)
3662 switch (TOKEN_SPELL (a))
3664 default: /* Keep compiler happy. */
3665 case SPELL_OPERATOR:
3666 /* token_no is used to track where multiple consecutive ##
3667 tokens were originally located. */
3668 return (a->type != CPP_PASTE || a->val.token_no == b->val.token_no);
3669 case SPELL_NONE:
3670 return (a->type != CPP_MACRO_ARG
3671 || (a->val.macro_arg.arg_no == b->val.macro_arg.arg_no
3672 && a->val.macro_arg.spelling == b->val.macro_arg.spelling));
3673 case SPELL_IDENT:
3674 return (a->val.node.node == b->val.node.node
3675 && a->val.node.spelling == b->val.node.spelling);
3676 case SPELL_LITERAL:
3677 return (a->val.str.len == b->val.str.len
3678 && !memcmp (a->val.str.text, b->val.str.text,
3679 a->val.str.len));
3682 return 0;
3685 /* Returns nonzero if a space should be inserted to avoid an
3686 accidental token paste for output. For simplicity, it is
3687 conservative, and occasionally advises a space where one is not
3688 needed, e.g. "." and ".2". */
3690 cpp_avoid_paste (cpp_reader *pfile, const cpp_token *token1,
3691 const cpp_token *token2)
3693 enum cpp_ttype a = token1->type, b = token2->type;
3694 cppchar_t c;
3696 if (token1->flags & NAMED_OP)
3697 a = CPP_NAME;
3698 if (token2->flags & NAMED_OP)
3699 b = CPP_NAME;
3701 c = EOF;
3702 if (token2->flags & DIGRAPH)
3703 c = digraph_spellings[(int) b - (int) CPP_FIRST_DIGRAPH][0];
3704 else if (token_spellings[b].category == SPELL_OPERATOR)
3705 c = token_spellings[b].name[0];
3707 /* Quickly get everything that can paste with an '='. */
3708 if ((int) a <= (int) CPP_LAST_EQ && c == '=')
3709 return 1;
3711 switch (a)
3713 case CPP_GREATER: return c == '>';
3714 case CPP_LESS: return c == '<' || c == '%' || c == ':';
3715 case CPP_PLUS: return c == '+';
3716 case CPP_MINUS: return c == '-' || c == '>';
3717 case CPP_DIV: return c == '/' || c == '*'; /* Comments. */
3718 case CPP_MOD: return c == ':' || c == '>';
3719 case CPP_AND: return c == '&';
3720 case CPP_OR: return c == '|';
3721 case CPP_COLON: return c == ':' || c == '>';
3722 case CPP_DEREF: return c == '*';
3723 case CPP_DOT: return c == '.' || c == '%' || b == CPP_NUMBER;
3724 case CPP_HASH: return c == '#' || c == '%'; /* Digraph form. */
3725 case CPP_PRAGMA:
3726 case CPP_NAME: return ((b == CPP_NUMBER
3727 && name_p (pfile, &token2->val.str))
3728 || b == CPP_NAME
3729 || b == CPP_CHAR || b == CPP_STRING); /* L */
3730 case CPP_NUMBER: return (b == CPP_NUMBER || b == CPP_NAME
3731 || b == CPP_CHAR
3732 || c == '.' || c == '+' || c == '-');
3733 /* UCNs */
3734 case CPP_OTHER: return ((token1->val.str.text[0] == '\\'
3735 && b == CPP_NAME)
3736 || (CPP_OPTION (pfile, objc)
3737 && token1->val.str.text[0] == '@'
3738 && (b == CPP_NAME || b == CPP_STRING)));
3739 case CPP_LESS_EQ: return c == '>';
3740 case CPP_STRING:
3741 case CPP_WSTRING:
3742 case CPP_UTF8STRING:
3743 case CPP_STRING16:
3744 case CPP_STRING32: return (CPP_OPTION (pfile, user_literals)
3745 && (b == CPP_NAME
3746 || (TOKEN_SPELL (token2) == SPELL_LITERAL
3747 && ISIDST (token2->val.str.text[0]))));
3749 default: break;
3752 return 0;
3755 /* Output all the remaining tokens on the current line, and a newline
3756 character, to FP. Leading whitespace is removed. If there are
3757 macros, special token padding is not performed. */
3758 void
3759 cpp_output_line (cpp_reader *pfile, FILE *fp)
3761 const cpp_token *token;
3763 token = cpp_get_token (pfile);
3764 while (token->type != CPP_EOF)
3766 cpp_output_token (token, fp);
3767 token = cpp_get_token (pfile);
3768 if (token->flags & PREV_WHITE)
3769 putc (' ', fp);
3772 putc ('\n', fp);
3775 /* Return a string representation of all the remaining tokens on the
3776 current line. The result is allocated using xmalloc and must be
3777 freed by the caller. */
3778 unsigned char *
3779 cpp_output_line_to_string (cpp_reader *pfile, const unsigned char *dir_name)
3781 const cpp_token *token;
3782 unsigned int out = dir_name ? ustrlen (dir_name) : 0;
3783 unsigned int alloced = 120 + out;
3784 unsigned char *result = (unsigned char *) xmalloc (alloced);
3786 /* If DIR_NAME is empty, there are no initial contents. */
3787 if (dir_name)
3789 sprintf ((char *) result, "#%s ", dir_name);
3790 out += 2;
3793 token = cpp_get_token (pfile);
3794 while (token->type != CPP_EOF)
3796 unsigned char *last;
3797 /* Include room for a possible space and the terminating nul. */
3798 unsigned int len = cpp_token_len (token) + 2;
3800 if (out + len > alloced)
3802 alloced *= 2;
3803 if (out + len > alloced)
3804 alloced = out + len;
3805 result = (unsigned char *) xrealloc (result, alloced);
3808 last = cpp_spell_token (pfile, token, &result[out], 0);
3809 out = last - result;
3811 token = cpp_get_token (pfile);
3812 if (token->flags & PREV_WHITE)
3813 result[out++] = ' ';
3816 result[out] = '\0';
3817 return result;
/* Memory buffers.  Changing these three constants can have a dramatic
   effect on performance.  The values here are reasonable defaults,
   but might be tuned.  If you adjust them, be sure to test across a
   range of uses of cpplib, including heavy nested function-like macro
   expansion.  Also check the change in peak memory usage (NJAMD is a
   good tool for this).

   MIN_BUFF_SIZE is the smallest buffer new_buff will create.
   BUFF_SIZE_UPPER_BOUND (MIN_SIZE) is the largest free buffer that
   _cpp_get_buff will hand out for a request of MIN_SIZE bytes.
   EXTENDED_BUFF_SIZE (BUFF, MIN_EXTRA) is the size requested when
   extending BUFF: MIN_EXTRA plus twice the distance from BUFF's cur
   to its limit.  Every macro argument is parenthesized so that an
   arbitrary expression can be passed.  */
#define MIN_BUFF_SIZE 8000
#define BUFF_SIZE_UPPER_BOUND(MIN_SIZE) (MIN_BUFF_SIZE + (MIN_SIZE) * 3 / 2)
#define EXTENDED_BUFF_SIZE(BUFF, MIN_EXTRA) \
        ((MIN_EXTRA) + ((BUFF)->limit - (BUFF)->cur) * 2)

#if MIN_BUFF_SIZE > BUFF_SIZE_UPPER_BOUND (0)
#error BUFF_SIZE_UPPER_BOUND must be at least as large as MIN_BUFF_SIZE!
#endif
3835 /* Create a new allocation buffer. Place the control block at the end
3836 of the buffer, so that buffer overflows will cause immediate chaos. */
3837 static _cpp_buff *
3838 new_buff (size_t len)
3840 _cpp_buff *result;
3841 unsigned char *base;
3843 if (len < MIN_BUFF_SIZE)
3844 len = MIN_BUFF_SIZE;
3845 len = CPP_ALIGN (len);
3847 #ifdef ENABLE_VALGRIND_ANNOTATIONS
3848 /* Valgrind warns about uses of interior pointers, so put _cpp_buff
3849 struct first. */
3850 size_t slen = CPP_ALIGN2 (sizeof (_cpp_buff), 2 * DEFAULT_ALIGNMENT);
3851 base = XNEWVEC (unsigned char, len + slen);
3852 result = (_cpp_buff *) base;
3853 base += slen;
3854 #else
3855 base = XNEWVEC (unsigned char, len + sizeof (_cpp_buff));
3856 result = (_cpp_buff *) (base + len);
3857 #endif
3858 result->base = base;
3859 result->cur = base;
3860 result->limit = base + len;
3861 result->next = NULL;
3862 return result;
3865 /* Place a chain of unwanted allocation buffers on the free list. */
3866 void
3867 _cpp_release_buff (cpp_reader *pfile, _cpp_buff *buff)
3869 _cpp_buff *end = buff;
3871 while (end->next)
3872 end = end->next;
3873 end->next = pfile->free_buffs;
3874 pfile->free_buffs = buff;
3877 /* Return a free buffer of size at least MIN_SIZE. */
3878 _cpp_buff *
3879 _cpp_get_buff (cpp_reader *pfile, size_t min_size)
3881 _cpp_buff *result, **p;
3883 for (p = &pfile->free_buffs;; p = &(*p)->next)
3885 size_t size;
3887 if (*p == NULL)
3888 return new_buff (min_size);
3889 result = *p;
3890 size = result->limit - result->base;
3891 /* Return a buffer that's big enough, but don't waste one that's
3892 way too big. */
3893 if (size >= min_size && size <= BUFF_SIZE_UPPER_BOUND (min_size))
3894 break;
3897 *p = result->next;
3898 result->next = NULL;
3899 result->cur = result->base;
3900 return result;
3903 /* Creates a new buffer with enough space to hold the uncommitted
3904 remaining bytes of BUFF, and at least MIN_EXTRA more bytes. Copies
3905 the excess bytes to the new buffer. Chains the new buffer after
3906 BUFF, and returns the new buffer. */
3907 _cpp_buff *
3908 _cpp_append_extend_buff (cpp_reader *pfile, _cpp_buff *buff, size_t min_extra)
3910 size_t size = EXTENDED_BUFF_SIZE (buff, min_extra);
3911 _cpp_buff *new_buff = _cpp_get_buff (pfile, size);
3913 buff->next = new_buff;
3914 memcpy (new_buff->base, buff->cur, BUFF_ROOM (buff));
3915 return new_buff;
3918 /* Creates a new buffer with enough space to hold the uncommitted
3919 remaining bytes of the buffer pointed to by BUFF, and at least
3920 MIN_EXTRA more bytes. Copies the excess bytes to the new buffer.
3921 Chains the new buffer before the buffer pointed to by BUFF, and
3922 updates the pointer to point to the new buffer. */
3923 void
3924 _cpp_extend_buff (cpp_reader *pfile, _cpp_buff **pbuff, size_t min_extra)
3926 _cpp_buff *new_buff, *old_buff = *pbuff;
3927 size_t size = EXTENDED_BUFF_SIZE (old_buff, min_extra);
3929 new_buff = _cpp_get_buff (pfile, size);
3930 memcpy (new_buff->base, old_buff->cur, BUFF_ROOM (old_buff));
3931 new_buff->next = old_buff;
3932 *pbuff = new_buff;
3935 /* Free a chain of buffers starting at BUFF. */
3936 void
3937 _cpp_free_buff (_cpp_buff *buff)
3939 _cpp_buff *next;
3941 for (; buff; buff = next)
3943 next = buff->next;
3944 #ifdef ENABLE_VALGRIND_ANNOTATIONS
3945 free (buff);
3946 #else
3947 free (buff->base);
3948 #endif
3952 /* Allocate permanent, unaligned storage of length LEN. */
3953 unsigned char *
3954 _cpp_unaligned_alloc (cpp_reader *pfile, size_t len)
3956 _cpp_buff *buff = pfile->u_buff;
3957 unsigned char *result = buff->cur;
3959 if (len > (size_t) (buff->limit - result))
3961 buff = _cpp_get_buff (pfile, len);
3962 buff->next = pfile->u_buff;
3963 pfile->u_buff = buff;
3964 result = buff->cur;
3967 buff->cur = result + len;
3968 return result;
3971 /* Allocate permanent, unaligned storage of length LEN from a_buff.
3972 That buffer is used for growing allocations when saving macro
3973 replacement lists in a #define, and when parsing an answer to an
3974 assertion in #assert, #unassert or #if (and therefore possibly
3975 whilst expanding macros). It therefore must not be used by any
3976 code that they might call: specifically the lexer and the guts of
3977 the macro expander.
3979 All existing other uses clearly fit this restriction: storing
3980 registered pragmas during initialization. */
3981 unsigned char *
3982 _cpp_aligned_alloc (cpp_reader *pfile, size_t len)
3984 _cpp_buff *buff = pfile->a_buff;
3985 unsigned char *result = buff->cur;
3987 if (len > (size_t) (buff->limit - result))
3989 buff = _cpp_get_buff (pfile, len);
3990 buff->next = pfile->a_buff;
3991 pfile->a_buff = buff;
3992 result = buff->cur;
3995 buff->cur = result + len;
3996 return result;
3999 /* Commit or allocate storage from a buffer. */
4001 void *
4002 _cpp_commit_buff (cpp_reader *pfile, size_t size)
4004 void *ptr = BUFF_FRONT (pfile->a_buff);
4006 if (pfile->hash_table->alloc_subobject)
4008 void *copy = pfile->hash_table->alloc_subobject (size);
4009 memcpy (copy, ptr, size);
4010 ptr = copy;
4012 else
4013 BUFF_FRONT (pfile->a_buff) += size;
4015 return ptr;
4018 /* Say which field of TOK is in use. */
4020 enum cpp_token_fld_kind
4021 cpp_token_val_index (const cpp_token *tok)
4023 switch (TOKEN_SPELL (tok))
4025 case SPELL_IDENT:
4026 return CPP_TOKEN_FLD_NODE;
4027 case SPELL_LITERAL:
4028 return CPP_TOKEN_FLD_STR;
4029 case SPELL_OPERATOR:
4030 /* Operands which were originally spelled as ident keep around
4031 the node for the exact spelling. */
4032 if (tok->flags & NAMED_OP)
4033 return CPP_TOKEN_FLD_NODE;
4034 else if (tok->type == CPP_PASTE)
4035 return CPP_TOKEN_FLD_TOKEN_NO;
4036 else
4037 return CPP_TOKEN_FLD_NONE;
4038 case SPELL_NONE:
4039 if (tok->type == CPP_MACRO_ARG)
4040 return CPP_TOKEN_FLD_ARG_NO;
4041 else if (tok->type == CPP_PADDING)
4042 return CPP_TOKEN_FLD_SOURCE;
4043 else if (tok->type == CPP_PRAGMA)
4044 return CPP_TOKEN_FLD_PRAGMA;
4045 /* fall through */
4046 default:
4047 return CPP_TOKEN_FLD_NONE;
4051 /* All tokens lexed in R after calling this function will be forced to
4052 have their location_t to be P, until
4053 cpp_stop_forcing_token_locations is called for R. */
4055 void
4056 cpp_force_token_locations (cpp_reader *r, location_t loc)
4058 r->forced_token_location = loc;
4061 /* Go back to assigning locations naturally for lexed tokens. */
4063 void
4064 cpp_stop_forcing_token_locations (cpp_reader *r)
4066 r->forced_token_location = 0;
/* PEEK points at a backslash.  While that backslash escapes an end of
   line ("\n", "\r" or "\r\n"), step past the backslash and the line
   ending; repeat if that lands on another backslash.  Never advance to
   LIMIT or beyond: if the character after the line ending would not be
   below LIMIT, stay on the backslash.  Returns the final position.  */

static const unsigned char *
do_peek_backslash (const unsigned char *peek, const unsigned char *limit)
{
  for (;;)
    {
      const unsigned char *after;

      if (__builtin_expect (peek[1] == '\n', true))
        after = peek + 2;
      else if (__builtin_expect (peek[1] == '\r', false))
        /* A "\r\n" pair counts as a single line ending.  */
        after = peek + (peek[2] == '\n' ? 3 : 2);
      else
        /* Not an escaped line ending.  */
        return peek;

      if (!__builtin_expect (after < limit, true))
        return peek;

      peek = after;

      /* The user might be perverse and escape several line endings in
         a row.  */
      if (*peek != '\\')
        return peek;
    }
}
/* Return the position to examine at PEEK: if PEEK is on a backslash,
   the position do_peek_backslash reports for it; otherwise PEEK
   itself.  */
static const unsigned char *
do_peek_next (const unsigned char *peek, const unsigned char *limit)
{
  return (__builtin_expect (*peek == '\\', false)
          ? do_peek_backslash (peek, limit)
          : peek);
}
/* Step backwards from PEEK to the previous character, never going
   before BOUND.  Returns NULL if PEEK is already at BOUND.  If the
   previous character is a line ending ("\n", "\r", or the "\n" of a
   "\r\n" pair) that is itself preceded by a backslash, the escaped
   line ending is skipped and the search continues from the backslash.
   Otherwise a pointer to the previous character is returned.  */
static const unsigned char *
do_peek_prev (const unsigned char *peek, const unsigned char *bound)
{
  if (peek == bound)
    return NULL;

  unsigned char c = *--peek;
  /* Both kinds of line-ending character must be recognised here; the
     second test is for a carriage return, not the letter 'r'.  */
  if (__builtin_expect (c == '\n', false)
      || __builtin_expect (c == '\r', false))
    {
      if (peek == bound)
        return peek;

      /* Offset from PEEK of the character before the line ending.  */
      int ix = -1;
      if (c == '\n' && peek[ix] == '\r')
        {
          /* "\r\n": look before the carriage return as well.  */
          if (peek + ix == bound)
            return peek;
          ix--;
        }

      if (peek[ix] == '\\')
        /* Escaped line ending: carry on from the backslash.  */
        return do_peek_prev (peek + ix, bound);

      return peek;
    }
  else
    return peek;
}
/* If PEEK[-1] is the start of identifier MATCH, scan past the rest of
   it and any trailing white space.  Otherwise return NULL.  The first
   character of MATCH is not compared here.  Escaped line endings are
   skipped both inside the identifier and in the white space that
   follows.  */

static const unsigned char *
do_peek_ident (const char *match, const unsigned char *peek,
               const unsigned char *limit)
{
  /* Compare the remaining characters of MATCH one by one, retrying
     once past an escaped line ending when a character differs.  */
  for (match++; *match; match++, peek++)
    if (*peek != *match)
      {
        peek = do_peek_next (peek, limit);
        if (*peek != *match)
          return NULL;
      }

  /* Must now not be looking at an identifier char.  */
  peek = do_peek_next (peek, limit);
  if (ISIDNUM (*peek))
    return NULL;

  /* Skip control-line whitespace, resuming after each escaped line
     ending that do_peek_backslash manages to step over.  */
  for (;;)
    {
      while (*peek == ' ' || *peek == '\t')
        peek++;

      if (!__builtin_expect (*peek == '\\', false))
        break;

      peek = do_peek_backslash (peek, limit);
      if (*peek == '\\')
        break;
    }

  return peek;
}
4170 /* Are we looking at a module control line starting as PEEK - 1? */
4172 static bool
4173 do_peek_module (cpp_reader *pfile, unsigned char c,
4174 const unsigned char *peek, const unsigned char *limit)
4176 bool import = false;
4178 if (__builtin_expect (c == 'e', false))
4180 if (!((peek[0] == 'x' || peek[0] == '\\')
4181 && (peek = do_peek_ident ("export", peek, limit))))
4182 return false;
4184 /* export, peek for import or module. No need to peek __import
4185 here. */
4186 if (peek[0] == 'i')
4188 if (!((peek[1] == 'm' || peek[1] == '\\')
4189 && (peek = do_peek_ident ("import", peek + 1, limit))))
4190 return false;
4191 import = true;
4193 else if (peek[0] == 'm')
4195 if (!((peek[1] == 'o' || peek[1] == '\\')
4196 && (peek = do_peek_ident ("module", peek + 1, limit))))
4197 return false;
4199 else
4200 return false;
4202 else if (__builtin_expect (c == 'i', false))
4204 if (!((peek[0] == 'm' || peek[0] == '\\')
4205 && (peek = do_peek_ident ("import", peek, limit))))
4206 return false;
4207 import = true;
4209 else if (__builtin_expect (c == '_', false))
4211 /* Needed for translated includes. */
4212 if (!((peek[0] == '_' || peek[0] == '\\')
4213 && (peek = do_peek_ident ("__import", peek, limit))))
4214 return false;
4215 import = true;
4217 else if (__builtin_expect (c == 'm', false))
4219 if (!((peek[0] == 'o' || peek[0] == '\\')
4220 && (peek = do_peek_ident ("module", peek, limit))))
4221 return false;
4223 else
4224 return false;
4226 /* Peek the next character to see if it's good enough. We'll be at
4227 the first non-whitespace char, including skipping an escaped
4228 newline. */
4229 /* ... import followed by identifier, ':', '<' or header-name
4230 preprocessing tokens, or module followed by identifier, ':' or
4231 ';' preprocessing tokens. */
4232 unsigned char p = *peek++;
4234 /* A character literal is ... single quotes, ... optionally preceded
4235 by u8, u, U, or L */
4236 /* A string-literal is a ... double quotes, optionally prefixed by
4237 R, u8, u8R, u, uR, U, UR, L, or LR */
4238 if (p == 'u')
4240 peek = do_peek_next (peek, limit);
4241 if (*peek == '8')
4243 peek++;
4244 goto peek_u8;
4246 goto peek_u;
4248 else if (p == 'U' || p == 'L')
4250 peek_u8:
4251 peek = do_peek_next (peek, limit);
4252 peek_u:
4253 if (*peek == '\"' || *peek == '\'')
4254 return false;
4256 if (*peek == 'R')
4257 goto peek_R;
4258 /* Identifier. Ok. */
4260 else if (p == 'R')
4262 peek_R:
4263 if (CPP_OPTION (pfile, rliterals))
4265 peek = do_peek_next (peek, limit);
4266 if (*peek == '\"')
4267 return false;
4269 /* Identifier. Ok. */
4271 else if ('Z' - 'A' == 25
4272 ? ((p >= 'A' && p <= 'Z') || (p >= 'a' && p <= 'z') || p == '_')
4273 : ISIDST (p))
4275 /* Identifier. Ok. */
4277 else if (p == '<')
4279 /* Maybe angle header, ok for import. Reject
4280 '<=', '<<' digraph:'<:'. */
4281 if (!import)
4282 return false;
4283 peek = do_peek_next (peek, limit);
4284 if (*peek == '=' || *peek == '<'
4285 || (*peek == ':' && CPP_OPTION (pfile, digraphs)))
4286 return false;
4288 else if (p == ';')
4290 /* SEMICOLON, ok for module. */
4291 if (import)
4292 return false;
4294 else if (p == '"')
4296 /* STRING, ok for import. */
4297 if (!import)
4298 return false;
4300 else if (p == ':')
4302 /* Maybe COLON, ok. Reject '::', digraph:':>'. */
4303 peek = do_peek_next (peek, limit);
4304 if (*peek == ':' || (*peek == '>' && CPP_OPTION (pfile, digraphs)))
4305 return false;
4307 else
4308 /* FIXME: Detect a unicode character, excluding those not
4309 permitted as the initial character. [lex.name]/1. I presume
4310 we need to check the \[uU] spellings, and directly using
4311 Unicode in say UTF8 form? Or perhaps we do the phase-1
4312 conversion of UTF8 to universal-character-names? */
4313 return false;
4315 return true;
4318 /* Directives-only scanning. Somewhat more relaxed than correct
4319 parsing -- some ill-formed programs will not be rejected. */
4321 void
4322 cpp_directive_only_process (cpp_reader *pfile,
4323 void *data,
4324 void (*cb) (cpp_reader *, CPP_DO_task, void *, ...))
4326 bool module_p = CPP_OPTION (pfile, module_directives);
4330 restart:
4331 /* Buffer initialization, but no line cleaning. */
4332 cpp_buffer *buffer = pfile->buffer;
4333 buffer->cur_note = buffer->notes_used = 0;
4334 buffer->cur = buffer->line_base = buffer->next_line;
4335 buffer->need_line = false;
4336 /* Files always end in a newline or carriage return. We rely on this for
4337 character peeking safety. */
4338 gcc_assert (buffer->rlimit[0] == '\n' || buffer->rlimit[0] == '\r');
4340 const unsigned char *base = buffer->cur;
4341 unsigned line_count = 0;
4342 const unsigned char *line_start = base;
4344 bool bol = true;
4345 bool raw = false;
4347 const unsigned char *lwm = base;
4348 for (const unsigned char *pos = base, *limit = buffer->rlimit;
4349 pos < limit;)
4351 unsigned char c = *pos++;
4352 /* This matches the switch in _cpp_lex_direct. */
4353 switch (c)
4355 case ' ': case '\t': case '\f': case '\v':
4356 /* Whitespace, do nothing. */
4357 break;
4359 case '\r': /* MAC line ending, or Windows \r\n */
4360 if (*pos == '\n')
4361 pos++;
4362 /* FALLTHROUGH */
4364 case '\n':
4365 bol = true;
4367 next_line:
4368 CPP_INCREMENT_LINE (pfile, 0);
4369 line_count++;
4370 line_start = pos;
4371 break;
4373 case '\\':
4374 /* <backslash><newline> is removed, and doesn't undo any
4375 preceeding escape or whatnot. */
4376 if (*pos == '\n')
4378 pos++;
4379 goto next_line;
4381 else if (*pos == '\r')
4383 if (pos[1] == '\n')
4384 pos++;
4385 pos++;
4386 goto next_line;
4388 goto dflt;
4390 case '#':
4391 if (bol)
4393 /* Line directive. */
4394 if (pos - 1 > base && !pfile->state.skipping)
4395 cb (pfile, CPP_DO_print, data,
4396 line_count, base, pos - 1 - base);
4398 /* Prep things for directive handling. */
4399 buffer->next_line = pos;
4400 buffer->need_line = true;
4401 bool ok = _cpp_get_fresh_line (pfile);
4402 gcc_checking_assert (ok);
4404 /* Ensure proper column numbering for generated
4405 error messages. */
4406 buffer->line_base -= pos - line_start;
4408 _cpp_handle_directive (pfile, line_start + 1 != pos);
4410 /* Sanitize the line settings. Duplicate #include's can
4411 mess things up. */
4412 // FIXME: Necessary?
4413 pfile->line_table->highest_location
4414 = pfile->line_table->highest_line;
4416 if (!pfile->state.skipping
4417 && pfile->buffer->next_line < pfile->buffer->rlimit)
4418 cb (pfile, CPP_DO_location, data,
4419 pfile->line_table->highest_line);
4421 goto restart;
4423 goto dflt;
4425 case '/':
4427 const unsigned char *peek = do_peek_next (pos, limit);
4428 if (!(*peek == '/' || *peek == '*'))
4429 goto dflt;
4431 /* Line or block comment */
4432 bool is_block = *peek == '*';
4433 bool star = false;
4434 bool esc = false;
4435 location_t sloc
4436 = linemap_position_for_column (pfile->line_table,
4437 pos - line_start);
4439 while (pos < limit)
4441 char c = *pos++;
4442 switch (c)
4444 case '\\':
4445 esc = true;
4446 break;
4448 case '\r':
4449 if (*pos == '\n')
4450 pos++;
4451 /* FALLTHROUGH */
4453 case '\n':
4455 CPP_INCREMENT_LINE (pfile, 0);
4456 line_count++;
4457 line_start = pos;
4458 if (!esc && !is_block)
4460 bol = true;
4461 goto done_comment;
4464 if (!esc)
4465 star = false;
4466 esc = false;
4467 break;
4469 case '*':
4470 if (pos > peek && !esc)
4471 star = is_block;
4472 esc = false;
4473 break;
4475 case '/':
4476 if (star)
4477 goto done_comment;
4478 /* FALLTHROUGH */
4480 default:
4481 star = false;
4482 esc = false;
4483 break;
4486 if (pos < limit || is_block)
4487 cpp_error_with_line (pfile, CPP_DL_ERROR, sloc, 0,
4488 "unterminated comment");
4489 done_comment:
4490 lwm = pos;
4491 break;
4494 case '\'':
4495 if (!CPP_OPTION (pfile, digit_separators))
4496 goto delimited_string;
4498 /* Possibly a number punctuator. */
4499 if (!ISIDNUM (*do_peek_next (pos, limit)))
4500 goto delimited_string;
4502 goto quote_peek;
4504 case '\"':
4505 if (!CPP_OPTION (pfile, rliterals))
4506 goto delimited_string;
4508 quote_peek:
4510 /* For ' see if it's a number punctuator
4511 \.?<digit>(<digit>|<identifier-nondigit>
4512 |'<digit>|'<nondigit>|[eEpP]<sign>|\.)* */
4513 /* For " see if it's a raw string
4514 {U,L,u,u8}R. This includes CPP_NUMBER detection,
4515 because that could be 0e+R. */
4516 const unsigned char *peek = pos - 1;
4517 bool quote_first = c == '"';
4518 bool quote_eight = false;
4519 bool maybe_number_start = false;
4520 bool want_number = false;
4522 while ((peek = do_peek_prev (peek, lwm)))
4524 unsigned char p = *peek;
4525 if (quote_first)
4527 if (!raw)
4529 if (p != 'R')
4530 break;
4531 raw = true;
4532 continue;
4535 quote_first = false;
4536 if (p == 'L' || p == 'U' || p == 'u')
4538 else if (p == '8')
4539 quote_eight = true;
4540 else
4541 goto second_raw;
4543 else if (quote_eight)
4545 if (p != 'u')
4547 raw = false;
4548 break;
4550 quote_eight = false;
4552 else if (c == '"')
4554 second_raw:;
4555 if (!want_number && ISIDNUM (p))
4557 raw = false;
4558 break;
4562 if (ISDIGIT (p))
4563 maybe_number_start = true;
4564 else if (p == '.')
4565 want_number = true;
4566 else if (ISIDNUM (p))
4567 maybe_number_start = false;
4568 else if (p == '+' || p == '-')
4570 if (const unsigned char *peek_prev
4571 = do_peek_prev (peek, lwm))
4573 p = *peek_prev;
4574 if (p == 'e' || p == 'E'
4575 || p == 'p' || p == 'P')
4577 want_number = true;
4578 maybe_number_start = false;
4580 else
4581 break;
4583 else
4584 break;
4586 else if (p == '\'' || p == '\"')
4588 /* If this is lwm, this must be the end of a
4589 previous string. So this is a trailing
4590 literal type, (a) if those are allowed,
4591 and (b) maybe_number_start is false. Otherwise
4592 this must be a CPP_NUMBER because we've
4593 met another ', and we'd have checked that
4594 in its own right. */
4595 if (peek == lwm && CPP_OPTION (pfile, uliterals))
4597 if (!maybe_number_start && !want_number)
4598 /* Must be a literal type. */
4599 raw = false;
4601 else if (p == '\''
4602 && CPP_OPTION (pfile, digit_separators))
4603 maybe_number_start = true;
4604 break;
4606 else if (c == '\'')
4607 break;
4608 else if (!quote_first && !quote_eight)
4609 break;
4612 if (maybe_number_start)
4614 if (c == '\'')
4615 /* A CPP NUMBER. */
4616 goto dflt;
4617 raw = false;
4620 goto delimited_string;
4623 delimited_string:
4625 /* (Possibly raw) string or char literal. */
4626 unsigned char end = c;
4627 int delim_len = -1;
4628 const unsigned char *delim = NULL;
4629 location_t sloc = linemap_position_for_column (pfile->line_table,
4630 pos - line_start);
4631 int esc = 0;
4633 if (raw)
4635 /* There can be no line breaks in the delimiter. */
4636 delim = pos;
4637 for (delim_len = 0; (c = *pos++) != '('; delim_len++)
4639 if (delim_len == 16)
4641 cpp_error_with_line (pfile, CPP_DL_ERROR,
4642 sloc, 0,
4643 "raw string delimiter"
4644 " longer than %d"
4645 " characters",
4646 delim_len);
4647 raw = false;
4648 pos = delim;
4649 break;
4651 if (strchr (") \\\t\v\f\n", c))
4653 cpp_error_with_line (pfile, CPP_DL_ERROR,
4654 sloc, 0,
4655 "invalid character '%c'"
4656 " in raw string"
4657 " delimiter", c);
4658 raw = false;
4659 pos = delim;
4660 break;
4662 if (pos >= limit)
4663 goto bad_string;
4667 while (pos < limit)
4669 char c = *pos++;
4670 switch (c)
4672 case '\\':
4673 if (!raw)
4674 esc++;
4675 break;
4677 case '\r':
4678 if (*pos == '\n')
4679 pos++;
4680 /* FALLTHROUGH */
4682 case '\n':
4684 CPP_INCREMENT_LINE (pfile, 0);
4685 line_count++;
4686 line_start = pos;
4688 if (esc)
4689 esc--;
4690 break;
4692 case ')':
4693 if (raw
4694 && pos + delim_len + 1 < limit
4695 && pos[delim_len] == end
4696 && !memcmp (delim, pos, delim_len))
4698 pos += delim_len + 1;
4699 raw = false;
4700 goto done_string;
4702 break;
4704 default:
4705 if (!raw && !(esc & 1) && c == end)
4706 goto done_string;
4707 esc = 0;
4708 break;
4711 bad_string:
4712 cpp_error_with_line (pfile, CPP_DL_ERROR, sloc, 0,
4713 "unterminated literal");
4715 done_string:
4716 raw = false;
4717 lwm = pos - 1;
4719 goto dflt;
4721 case '_':
4722 case 'e':
4723 case 'i':
4724 case 'm':
4725 if (bol && module_p && !pfile->state.skipping
4726 && do_peek_module (pfile, c, pos, limit))
4728 /* We've seen the start of a module control line.
4729 Start up the tokenizer. */
4730 pos--; /* Backup over the first character. */
4732 /* Backup over whitespace to start of line. */
4733 while (pos > line_start
4734 && (pos[-1] == ' ' || pos[-1] == '\t'))
4735 pos--;
4737 if (pos > base)
4738 cb (pfile, CPP_DO_print, data, line_count, base, pos - base);
4740 /* Prep things for directive handling. */
4741 buffer->next_line = pos;
4742 buffer->need_line = true;
4744 /* Now get tokens until the PRAGMA_EOL. */
4747 location_t spelling;
4748 const cpp_token *tok
4749 = cpp_get_token_with_location (pfile, &spelling);
4751 gcc_assert (pfile->state.in_deferred_pragma
4752 || tok->type == CPP_PRAGMA_EOL);
4753 cb (pfile, CPP_DO_token, data, tok, spelling);
4755 while (pfile->state.in_deferred_pragma);
4757 if (pfile->buffer->next_line < pfile->buffer->rlimit)
4758 cb (pfile, CPP_DO_location, data,
4759 pfile->line_table->highest_line);
4761 pfile->mi_valid = false;
4762 goto restart;
4764 goto dflt;
4766 default:
4767 dflt:
4768 bol = false;
4769 pfile->mi_valid = false;
4770 break;
4774 if (buffer->rlimit > base && !pfile->state.skipping)
4776 const unsigned char *limit = buffer->rlimit;
4777 /* If the file was not newline terminated, add rlimit, which is
4778 guaranteed to point to a newline, to the end of our range. */
4779 if (limit[-1] != '\n')
4781 limit++;
4782 CPP_INCREMENT_LINE (pfile, 0);
4783 line_count++;
4785 cb (pfile, CPP_DO_print, data, line_count, base, limit - base);
4788 _cpp_pop_buffer (pfile);
4790 while (pfile->buffer);