Support TI mode and soft float on PA64
[official-gcc.git] / libcpp / lex.c
blobfa2253d41c38176458b30c15febd8d41fa484bd2
1 /* CPP Library - lexical analysis.
2 Copyright (C) 2000-2021 Free Software Foundation, Inc.
3 Contributed by Per Bothner, 1994-95.
4 Based on CCCP program by Paul Rubin, June 1986
5 Adapted to ANSI C, Richard Stallman, Jan 1987
6 Broken out to separate file, Zack Weinberg, Mar 2000
8 This program is free software; you can redistribute it and/or modify it
9 under the terms of the GNU General Public License as published by the
10 Free Software Foundation; either version 3, or (at your option) any
11 later version.
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
18 You should have received a copy of the GNU General Public License
19 along with this program; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "cpplib.h"
25 #include "internal.h"
27 enum spell_type
29 SPELL_OPERATOR = 0,
30 SPELL_IDENT,
31 SPELL_LITERAL,
32 SPELL_NONE
35 struct token_spelling
37 enum spell_type category;
38 const unsigned char *name;
41 static const unsigned char *const digraph_spellings[] =
42 { UC"%:", UC"%:%:", UC"<:", UC":>", UC"<%", UC"%>" };
44 #define OP(e, s) { SPELL_OPERATOR, UC s },
45 #define TK(e, s) { SPELL_ ## s, UC #e },
46 static const struct token_spelling token_spellings[N_TTYPES] = { TTYPE_TABLE };
47 #undef OP
48 #undef TK
50 #define TOKEN_SPELL(token) (token_spellings[(token)->type].category)
51 #define TOKEN_NAME(token) (token_spellings[(token)->type].name)
53 static void add_line_note (cpp_buffer *, const uchar *, unsigned int);
54 static int skip_line_comment (cpp_reader *);
55 static void skip_whitespace (cpp_reader *, cppchar_t);
56 static void lex_string (cpp_reader *, cpp_token *, const uchar *);
57 static void save_comment (cpp_reader *, cpp_token *, const uchar *, cppchar_t);
58 static void store_comment (cpp_reader *, cpp_token *);
59 static void create_literal (cpp_reader *, cpp_token *, const uchar *,
60 unsigned int, enum cpp_ttype);
61 static bool warn_in_comment (cpp_reader *, _cpp_line_note *);
62 static int name_p (cpp_reader *, const cpp_string *);
63 static tokenrun *next_tokenrun (tokenrun *);
65 static _cpp_buff *new_buff (size_t);
68 /* Utility routine:
70 Compares, the token TOKEN to the NUL-terminated string STRING.
71 TOKEN must be a CPP_NAME. Returns 1 for equal, 0 for unequal. */
72 int
73 cpp_ideq (const cpp_token *token, const char *string)
75 if (token->type != CPP_NAME)
76 return 0;
78 return !ustrcmp (NODE_NAME (token->val.node.node), (const uchar *) string);
81 /* Record a note TYPE at byte POS into the current cleaned logical
82 line. */
83 static void
84 add_line_note (cpp_buffer *buffer, const uchar *pos, unsigned int type)
86 if (buffer->notes_used == buffer->notes_cap)
88 buffer->notes_cap = buffer->notes_cap * 2 + 200;
89 buffer->notes = XRESIZEVEC (_cpp_line_note, buffer->notes,
90 buffer->notes_cap);
93 buffer->notes[buffer->notes_used].pos = pos;
94 buffer->notes[buffer->notes_used].type = type;
95 buffer->notes_used++;
99 /* Fast path to find line special characters using optimized character
100 scanning algorithms. Anything complicated falls back to the slow
101 path below. Since this loop is very hot it's worth doing these kinds
102 of optimizations.
104 One of the paths through the ifdefs should provide
106 const uchar *search_line_fast (const uchar *s, const uchar *end);
108 Between S and END, search for \n, \r, \\, ?. Return a pointer to
109 the found character.
111 Note that the last character of the buffer is *always* a newline,
112 as forced by _cpp_convert_input. This fact can be used to avoid
113 explicitly looking for the end of the buffer. */
/* Configure gives us an ifdef test; give the macro a value of 0 when
   it is absent so it can be used in ordinary C conditions.  */
#if !defined (WORDS_BIGENDIAN)
# define WORDS_BIGENDIAN 0
#endif

/* We'd like the largest integer that fits into a register.  There's
   nothing in <stdint.h> that gives us that.  For most hosts this is
   unsigned long, but MS decided on an LLP64 model.  Thankfully when
   building with GCC we can get the "real" word size.  */
#if defined (__GNUC__)
typedef unsigned int word_type __attribute__((__mode__(__word__)));
#else
typedef unsigned long word_type;
#endif

/* The code below only expects a word of 4 or 8 bytes.  Any other size
   yields a negative array length and thus a compile-time error.  */
typedef char check_word_type_size
  [(sizeof (word_type) == 4 || sizeof (word_type) == 8) ? 1 : -1];
135 /* Return X with the first N bytes forced to values that won't match one
136 of the interesting characters. Note that NUL is not interesting. */
138 static inline word_type
139 acc_char_mask_misalign (word_type val, unsigned int n)
141 word_type mask = -1;
142 if (WORDS_BIGENDIAN)
143 mask >>= n * 8;
144 else
145 mask <<= n * 8;
146 return val & mask;
149 /* Return X replicated to all byte positions within WORD_TYPE. */
151 static inline word_type
152 acc_char_replicate (uchar x)
154 word_type ret;
156 ret = (x << 24) | (x << 16) | (x << 8) | x;
157 if (sizeof(word_type) == 8)
158 ret = (ret << 16 << 16) | ret;
159 return ret;
162 /* Return non-zero if some byte of VAL is (probably) C. */
164 static inline word_type
165 acc_char_cmp (word_type val, word_type c)
167 #if defined(__GNUC__) && defined(__alpha__)
168 /* We can get exact results using a compare-bytes instruction.
169 Get (val == c) via (0 >= (val ^ c)). */
170 return __builtin_alpha_cmpbge (0, val ^ c);
171 #else
172 word_type magic = 0x7efefefeU;
173 if (sizeof(word_type) == 8)
174 magic = (magic << 16 << 16) | 0xfefefefeU;
175 magic |= 1;
177 val ^= c;
178 return ((val + magic) ^ ~val) & ~magic;
179 #endif
182 /* Given the result of acc_char_cmp is non-zero, return the index of
183 the found character. If this was a false positive, return -1. */
185 static inline int
186 acc_char_index (word_type cmp ATTRIBUTE_UNUSED,
187 word_type val ATTRIBUTE_UNUSED)
189 #if defined(__GNUC__) && defined(__alpha__) && !WORDS_BIGENDIAN
190 /* The cmpbge instruction sets *bits* of the result corresponding to
191 matches in the bytes with no false positives. */
192 return __builtin_ctzl (cmp);
193 #else
194 unsigned int i;
196 /* ??? It would be nice to force unrolling here,
197 and have all of these constants folded. */
198 for (i = 0; i < sizeof(word_type); ++i)
200 uchar c;
201 if (WORDS_BIGENDIAN)
202 c = (val >> (sizeof(word_type) - i - 1) * 8) & 0xff;
203 else
204 c = (val >> i * 8) & 0xff;
206 if (c == '\n' || c == '\r' || c == '\\' || c == '?')
207 return i;
210 return -1;
211 #endif
214 /* A version of the fast scanner using bit fiddling techniques.
216 For 32-bit words, one would normally perform 16 comparisons and
217 16 branches. With this algorithm one performs 24 arithmetic
218 operations and one branch. Whether this is faster with a 32-bit
219 word size is going to be somewhat system dependent.
221 For 64-bit words, we eliminate twice the number of comparisons
222 and branches without increasing the number of arithmetic operations.
223 It's almost certainly going to be a win with 64-bit word size. */
225 static const uchar * search_line_acc_char (const uchar *, const uchar *)
226 ATTRIBUTE_UNUSED;
228 static const uchar *
229 search_line_acc_char (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
231 const word_type repl_nl = acc_char_replicate ('\n');
232 const word_type repl_cr = acc_char_replicate ('\r');
233 const word_type repl_bs = acc_char_replicate ('\\');
234 const word_type repl_qm = acc_char_replicate ('?');
236 unsigned int misalign;
237 const word_type *p;
238 word_type val, t;
240 /* Align the buffer. Mask out any bytes from before the beginning. */
241 p = (word_type *)((uintptr_t)s & -sizeof(word_type));
242 val = *p;
243 misalign = (uintptr_t)s & (sizeof(word_type) - 1);
244 if (misalign)
245 val = acc_char_mask_misalign (val, misalign);
247 /* Main loop. */
248 while (1)
250 t = acc_char_cmp (val, repl_nl);
251 t |= acc_char_cmp (val, repl_cr);
252 t |= acc_char_cmp (val, repl_bs);
253 t |= acc_char_cmp (val, repl_qm);
255 if (__builtin_expect (t != 0, 0))
257 int i = acc_char_index (t, val);
258 if (i >= 0)
259 return (const uchar *)p + i;
262 val = *++p;
266 /* Disable on Solaris 2/x86 until the following problem can be properly
267 autoconfed:
269 The Solaris 10+ assembler tags objects with the instruction set
270 extensions used, so SSE4.2 executables cannot run on machines that
271 don't support that extension. */
273 #if (GCC_VERSION >= 4005) && (__GNUC__ >= 5 || !defined(__PIC__)) && (defined(__i386__) || defined(__x86_64__)) && !(defined(__sun__) && defined(__svr4__))
/* Replicated character data to be shared between implementations.
   Recall that outside of a context with vector support we can't
   define compatible vector types, therefore these are all defined
   in terms of raw characters.  Row 0 is newline, row 1 carriage
   return, row 2 backslash and row 3 question mark.  */
#define REPL16(C) { C, C, C, C, C, C, C, C, C, C, C, C, C, C, C, C }
static const char repl_chars[4][16] __attribute__((aligned(16))) = {
  REPL16 ('\n'),
  REPL16 ('\r'),
  REPL16 ('\\'),
  REPL16 ('?')
};
#undef REPL16
290 /* A version of the fast scanner using MMX vectorized byte compare insns.
292 This uses the PMOVMSKB instruction which was introduced with "MMX2",
293 which was packaged into SSE1; it is also present in the AMD MMX
294 extension. Mark the function as using "sse" so that we emit a real
295 "emms" instruction, rather than the 3dNOW "femms" instruction. */
297 static const uchar *
298 #ifndef __SSE__
299 __attribute__((__target__("sse")))
300 #endif
301 search_line_mmx (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
303 typedef char v8qi __attribute__ ((__vector_size__ (8)));
304 typedef int __m64 __attribute__ ((__vector_size__ (8), __may_alias__));
306 const v8qi repl_nl = *(const v8qi *)repl_chars[0];
307 const v8qi repl_cr = *(const v8qi *)repl_chars[1];
308 const v8qi repl_bs = *(const v8qi *)repl_chars[2];
309 const v8qi repl_qm = *(const v8qi *)repl_chars[3];
311 unsigned int misalign, found, mask;
312 const v8qi *p;
313 v8qi data, t, c;
315 /* Align the source pointer. While MMX doesn't generate unaligned data
316 faults, this allows us to safely scan to the end of the buffer without
317 reading beyond the end of the last page. */
318 misalign = (uintptr_t)s & 7;
319 p = (const v8qi *)((uintptr_t)s & -8);
320 data = *p;
322 /* Create a mask for the bytes that are valid within the first
323 16-byte block. The Idea here is that the AND with the mask
324 within the loop is "free", since we need some AND or TEST
325 insn in order to set the flags for the branch anyway. */
326 mask = -1u << misalign;
328 /* Main loop processing 8 bytes at a time. */
329 goto start;
332 data = *++p;
333 mask = -1;
335 start:
336 t = __builtin_ia32_pcmpeqb(data, repl_nl);
337 c = __builtin_ia32_pcmpeqb(data, repl_cr);
338 t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
339 c = __builtin_ia32_pcmpeqb(data, repl_bs);
340 t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
341 c = __builtin_ia32_pcmpeqb(data, repl_qm);
342 t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
343 found = __builtin_ia32_pmovmskb (t);
344 found &= mask;
346 while (!found);
348 __builtin_ia32_emms ();
350 /* FOUND contains 1 in bits for which we matched a relevant
351 character. Conversion to the byte index is trivial. */
352 found = __builtin_ctz(found);
353 return (const uchar *)p + found;
356 /* A version of the fast scanner using SSE2 vectorized byte compare insns. */
358 static const uchar *
359 #ifndef __SSE2__
360 __attribute__((__target__("sse2")))
361 #endif
362 search_line_sse2 (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
364 typedef char v16qi __attribute__ ((__vector_size__ (16)));
366 const v16qi repl_nl = *(const v16qi *)repl_chars[0];
367 const v16qi repl_cr = *(const v16qi *)repl_chars[1];
368 const v16qi repl_bs = *(const v16qi *)repl_chars[2];
369 const v16qi repl_qm = *(const v16qi *)repl_chars[3];
371 unsigned int misalign, found, mask;
372 const v16qi *p;
373 v16qi data, t;
375 /* Align the source pointer. */
376 misalign = (uintptr_t)s & 15;
377 p = (const v16qi *)((uintptr_t)s & -16);
378 data = *p;
380 /* Create a mask for the bytes that are valid within the first
381 16-byte block. The Idea here is that the AND with the mask
382 within the loop is "free", since we need some AND or TEST
383 insn in order to set the flags for the branch anyway. */
384 mask = -1u << misalign;
386 /* Main loop processing 16 bytes at a time. */
387 goto start;
390 data = *++p;
391 mask = -1;
393 start:
394 t = data == repl_nl;
395 t |= data == repl_cr;
396 t |= data == repl_bs;
397 t |= data == repl_qm;
398 found = __builtin_ia32_pmovmskb128 (t);
399 found &= mask;
401 while (!found);
403 /* FOUND contains 1 in bits for which we matched a relevant
404 character. Conversion to the byte index is trivial. */
405 found = __builtin_ctz(found);
406 return (const uchar *)p + found;
#ifdef HAVE_SSE4
/* A version of the fast scanner using SSE 4.2 vectorized string insns.
   Returns a pointer to the first \n, \r, ? or \\ at or after S.  END
   is only used to decide whether an unaligned first load is safe.  */

static const uchar *
#ifndef __SSE4_2__
__attribute__((__target__("sse4.2")))
#endif
search_line_sse42 (const uchar *s, const uchar *end)
{
  typedef char v16qi __attribute__ ((__vector_size__ (16)));
  static const v16qi search = { '\n', '\r', '?', '\\' };

  const uintptr_t addr = (uintptr_t)s;
  uintptr_t index;

  /* Unaligned input gets one unaligned probe before the main loop.  */
  if ((addr & 15) != 0)
    {
      v16qi first;

      /* There are less than 16 bytes left in the buffer, and less
         than 16 bytes left on the page.  Reading 16 bytes at this
         point might generate a spurious page fault.  Defer to the
         SSE2 implementation, which already handles alignment.  */
      if (__builtin_expect (end - s < 16, 0)
          && __builtin_expect ((addr & 0xfff) > 0xff0, 0))
        return search_line_sse2 (s, end);

      /* ??? The builtin doesn't understand that the PCMPESTRI read from
         memory need not be aligned.  */
      first = __builtin_ia32_loaddqu ((const char *) s);
      index = __builtin_ia32_pcmpestri128 (search, 4, first, 16, 0);

      if (__builtin_expect (index < 16, 0))
        return s + index;

      /* Advance the pointer to an aligned address.  We will re-scan a
         few bytes, but we no longer need care for reading past the
         end of a page, since we're guaranteed a match.  */
      s = (const uchar *)((addr + 15) & -16);
    }

  /* Main loop, processing 16 bytes at a time.  */
#ifdef __GCC_ASM_FLAG_OUTPUTS__
  for (;;)
    {
      char f;

      /* By using inline assembly instead of the builtin,
         we can use the result, as well as the flags set.  */
      __asm ("%vpcmpestri\t$0, %2, %3"
             : "=c"(index), "=@ccc"(f)
             : "m"(*s), "x"(search), "a"(4), "d"(16));
      if (f)
        break;

      s += 16;
    }
#else
  /* The assembly loop advances S before each probe.  */
  s -= 16;
  /* By doing the whole loop in inline assembly,
     we can make proper use of the flags set.  */
  __asm ( ".balign 16\n"
          "0: add $16, %1\n"
          " %vpcmpestri\t$0, (%1), %2\n"
          " jnc 0b"
          : "=&c"(index), "+r"(s)
          : "x"(search), "a"(4), "d"(16));
#endif

  return s + index;
}

#else
/* Work around out-dated assemblers without sse4 support.  */
#define search_line_sse42 search_line_sse2
#endif
490 /* Check the CPU capabilities. */
492 #include "../gcc/config/i386/cpuid.h"
494 typedef const uchar * (*search_line_fast_type) (const uchar *, const uchar *);
495 static search_line_fast_type search_line_fast;
497 #define HAVE_init_vectorized_lexer 1
498 static inline void
499 init_vectorized_lexer (void)
501 unsigned dummy, ecx = 0, edx = 0;
502 search_line_fast_type impl = search_line_acc_char;
503 int minimum = 0;
505 #if defined(__SSE4_2__)
506 minimum = 3;
507 #elif defined(__SSE2__)
508 minimum = 2;
509 #elif defined(__SSE__)
510 minimum = 1;
511 #endif
513 if (minimum == 3)
514 impl = search_line_sse42;
515 else if (__get_cpuid (1, &dummy, &dummy, &ecx, &edx) || minimum == 2)
517 if (minimum == 3 || (ecx & bit_SSE4_2))
518 impl = search_line_sse42;
519 else if (minimum == 2 || (edx & bit_SSE2))
520 impl = search_line_sse2;
521 else if (minimum == 1 || (edx & bit_SSE))
522 impl = search_line_mmx;
524 else if (__get_cpuid (0x80000001, &dummy, &dummy, &dummy, &edx))
526 if (minimum == 1
527 || (edx & (bit_MMXEXT | bit_CMOV)) == (bit_MMXEXT | bit_CMOV))
528 impl = search_line_mmx;
531 search_line_fast = impl;
534 #elif (GCC_VERSION >= 4005) && defined(_ARCH_PWR8) && defined(__ALTIVEC__)
536 /* A vection of the fast scanner using AltiVec vectorized byte compares
537 and VSX unaligned loads (when VSX is available). This is otherwise
538 the same as the AltiVec version. */
540 ATTRIBUTE_NO_SANITIZE_UNDEFINED
541 static const uchar *
542 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
544 typedef __attribute__((altivec(vector))) unsigned char vc;
546 const vc repl_nl = {
547 '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
548 '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
550 const vc repl_cr = {
551 '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
552 '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
554 const vc repl_bs = {
555 '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
556 '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
558 const vc repl_qm = {
559 '?', '?', '?', '?', '?', '?', '?', '?',
560 '?', '?', '?', '?', '?', '?', '?', '?',
562 const vc zero = { 0 };
564 vc data, t;
566 /* Main loop processing 16 bytes at a time. */
569 vc m_nl, m_cr, m_bs, m_qm;
571 data = __builtin_vec_vsx_ld (0, s);
572 s += 16;
574 m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
575 m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
576 m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
577 m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
578 t = (m_nl | m_cr) | (m_bs | m_qm);
580 /* T now contains 0xff in bytes for which we matched one of the relevant
581 characters. We want to exit the loop if any byte in T is non-zero.
582 Below is the expansion of vec_any_ne(t, zero). */
584 while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));
586 /* Restore s to to point to the 16 bytes we just processed. */
587 s -= 16;
590 #define N (sizeof(vc) / sizeof(long))
592 union {
593 vc v;
594 /* Statically assert that N is 2 or 4. */
595 unsigned long l[(N == 2 || N == 4) ? N : -1];
596 } u;
597 unsigned long l, i = 0;
599 u.v = t;
601 /* Find the first word of T that is non-zero. */
602 switch (N)
604 case 4:
605 l = u.l[i++];
606 if (l != 0)
607 break;
608 s += sizeof(unsigned long);
609 l = u.l[i++];
610 if (l != 0)
611 break;
612 s += sizeof(unsigned long);
613 /* FALLTHRU */
614 case 2:
615 l = u.l[i++];
616 if (l != 0)
617 break;
618 s += sizeof(unsigned long);
619 l = u.l[i];
622 /* L now contains 0xff in bytes for which we matched one of the
623 relevant characters. We can find the byte index by finding
624 its bit index and dividing by 8. */
625 #ifdef __BIG_ENDIAN__
626 l = __builtin_clzl(l) >> 3;
627 #else
628 l = __builtin_ctzl(l) >> 3;
629 #endif
630 return s + l;
632 #undef N
636 #elif (GCC_VERSION >= 4005) && defined(__ALTIVEC__) && defined (__BIG_ENDIAN__)
638 /* A vection of the fast scanner using AltiVec vectorized byte compares.
639 This cannot be used for little endian because vec_lvsl/lvsr are
640 deprecated for little endian and the code won't work properly. */
641 /* ??? Unfortunately, attribute(target("altivec")) is not yet supported,
642 so we can't compile this function without -maltivec on the command line
643 (or implied by some other switch). */
645 static const uchar *
646 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
648 typedef __attribute__((altivec(vector))) unsigned char vc;
650 const vc repl_nl = {
651 '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
652 '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
654 const vc repl_cr = {
655 '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
656 '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
658 const vc repl_bs = {
659 '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
660 '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
662 const vc repl_qm = {
663 '?', '?', '?', '?', '?', '?', '?', '?',
664 '?', '?', '?', '?', '?', '?', '?', '?',
666 const vc ones = {
667 -1, -1, -1, -1, -1, -1, -1, -1,
668 -1, -1, -1, -1, -1, -1, -1, -1,
670 const vc zero = { 0 };
672 vc data, mask, t;
674 /* Altivec loads automatically mask addresses with -16. This lets us
675 issue the first load as early as possible. */
676 data = __builtin_vec_ld(0, (const vc *)s);
678 /* Discard bytes before the beginning of the buffer. Do this by
679 beginning with all ones and shifting in zeros according to the
680 mis-alignment. The LVSR instruction pulls the exact shift we
681 want from the address. */
682 mask = __builtin_vec_lvsr(0, s);
683 mask = __builtin_vec_perm(zero, ones, mask);
684 data &= mask;
686 /* While altivec loads mask addresses, we still need to align S so
687 that the offset we compute at the end is correct. */
688 s = (const uchar *)((uintptr_t)s & -16);
690 /* Main loop processing 16 bytes at a time. */
691 goto start;
694 vc m_nl, m_cr, m_bs, m_qm;
696 s += 16;
697 data = __builtin_vec_ld(0, (const vc *)s);
699 start:
700 m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
701 m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
702 m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
703 m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
704 t = (m_nl | m_cr) | (m_bs | m_qm);
706 /* T now contains 0xff in bytes for which we matched one of the relevant
707 characters. We want to exit the loop if any byte in T is non-zero.
708 Below is the expansion of vec_any_ne(t, zero). */
710 while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));
713 #define N (sizeof(vc) / sizeof(long))
715 union {
716 vc v;
717 /* Statically assert that N is 2 or 4. */
718 unsigned long l[(N == 2 || N == 4) ? N : -1];
719 } u;
720 unsigned long l, i = 0;
722 u.v = t;
724 /* Find the first word of T that is non-zero. */
725 switch (N)
727 case 4:
728 l = u.l[i++];
729 if (l != 0)
730 break;
731 s += sizeof(unsigned long);
732 l = u.l[i++];
733 if (l != 0)
734 break;
735 s += sizeof(unsigned long);
736 /* FALLTHROUGH */
737 case 2:
738 l = u.l[i++];
739 if (l != 0)
740 break;
741 s += sizeof(unsigned long);
742 l = u.l[i];
745 /* L now contains 0xff in bytes for which we matched one of the
746 relevant characters. We can find the byte index by finding
747 its bit index and dividing by 8. */
748 l = __builtin_clzl(l) >> 3;
749 return s + l;
751 #undef N
755 #elif defined (__ARM_NEON) && defined (__ARM_64BIT_STATE)
756 #include "arm_neon.h"
758 /* This doesn't have to be the exact page size, but no system may use
759 a size smaller than this. ARMv8 requires a minimum page size of
760 4k. The impact of being conservative here is a small number of
761 cases will take the slightly slower entry path into the main
762 loop. */
764 #define AARCH64_MIN_PAGE_SIZE 4096
766 static const uchar *
767 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
769 const uint8x16_t repl_nl = vdupq_n_u8 ('\n');
770 const uint8x16_t repl_cr = vdupq_n_u8 ('\r');
771 const uint8x16_t repl_bs = vdupq_n_u8 ('\\');
772 const uint8x16_t repl_qm = vdupq_n_u8 ('?');
773 const uint8x16_t xmask = (uint8x16_t) vdupq_n_u64 (0x8040201008040201ULL);
775 #ifdef __ARM_BIG_ENDIAN
776 const int16x8_t shift = {8, 8, 8, 8, 0, 0, 0, 0};
777 #else
778 const int16x8_t shift = {0, 0, 0, 0, 8, 8, 8, 8};
779 #endif
781 unsigned int found;
782 const uint8_t *p;
783 uint8x16_t data;
784 uint8x16_t t;
785 uint16x8_t m;
786 uint8x16_t u, v, w;
788 /* Align the source pointer. */
789 p = (const uint8_t *)((uintptr_t)s & -16);
791 /* Assuming random string start positions, with a 4k page size we'll take
792 the slow path about 0.37% of the time. */
793 if (__builtin_expect ((AARCH64_MIN_PAGE_SIZE
794 - (((uintptr_t) s) & (AARCH64_MIN_PAGE_SIZE - 1)))
795 < 16, 0))
797 /* Slow path: the string starts near a possible page boundary. */
798 uint32_t misalign, mask;
800 misalign = (uintptr_t)s & 15;
801 mask = (-1u << misalign) & 0xffff;
802 data = vld1q_u8 (p);
803 t = vceqq_u8 (data, repl_nl);
804 u = vceqq_u8 (data, repl_cr);
805 v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
806 w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
807 t = vorrq_u8 (v, w);
808 t = vandq_u8 (t, xmask);
809 m = vpaddlq_u8 (t);
810 m = vshlq_u16 (m, shift);
811 found = vaddvq_u16 (m);
812 found &= mask;
813 if (found)
814 return (const uchar*)p + __builtin_ctz (found);
816 else
818 data = vld1q_u8 ((const uint8_t *) s);
819 t = vceqq_u8 (data, repl_nl);
820 u = vceqq_u8 (data, repl_cr);
821 v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
822 w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
823 t = vorrq_u8 (v, w);
824 if (__builtin_expect (vpaddd_u64 ((uint64x2_t)t) != 0, 0))
825 goto done;
830 p += 16;
831 data = vld1q_u8 (p);
832 t = vceqq_u8 (data, repl_nl);
833 u = vceqq_u8 (data, repl_cr);
834 v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
835 w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
836 t = vorrq_u8 (v, w);
837 } while (!vpaddd_u64 ((uint64x2_t)t));
839 done:
840 /* Now that we've found the terminating substring, work out precisely where
841 we need to stop. */
842 t = vandq_u8 (t, xmask);
843 m = vpaddlq_u8 (t);
844 m = vshlq_u16 (m, shift);
845 found = vaddvq_u16 (m);
846 return (((((uintptr_t) p) < (uintptr_t) s) ? s : (const uchar *)p)
847 + __builtin_ctz (found));
850 #elif defined (__ARM_NEON)
851 #include "arm_neon.h"
853 static const uchar *
854 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
856 const uint8x16_t repl_nl = vdupq_n_u8 ('\n');
857 const uint8x16_t repl_cr = vdupq_n_u8 ('\r');
858 const uint8x16_t repl_bs = vdupq_n_u8 ('\\');
859 const uint8x16_t repl_qm = vdupq_n_u8 ('?');
860 const uint8x16_t xmask = (uint8x16_t) vdupq_n_u64 (0x8040201008040201ULL);
862 unsigned int misalign, found, mask;
863 const uint8_t *p;
864 uint8x16_t data;
866 /* Align the source pointer. */
867 misalign = (uintptr_t)s & 15;
868 p = (const uint8_t *)((uintptr_t)s & -16);
869 data = vld1q_u8 (p);
871 /* Create a mask for the bytes that are valid within the first
872 16-byte block. The Idea here is that the AND with the mask
873 within the loop is "free", since we need some AND or TEST
874 insn in order to set the flags for the branch anyway. */
875 mask = (-1u << misalign) & 0xffff;
877 /* Main loop, processing 16 bytes at a time. */
878 goto start;
882 uint8x8_t l;
883 uint16x4_t m;
884 uint32x2_t n;
885 uint8x16_t t, u, v, w;
887 p += 16;
888 data = vld1q_u8 (p);
889 mask = 0xffff;
891 start:
892 t = vceqq_u8 (data, repl_nl);
893 u = vceqq_u8 (data, repl_cr);
894 v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
895 w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
896 t = vandq_u8 (vorrq_u8 (v, w), xmask);
897 l = vpadd_u8 (vget_low_u8 (t), vget_high_u8 (t));
898 m = vpaddl_u8 (l);
899 n = vpaddl_u16 (m);
901 found = vget_lane_u32 ((uint32x2_t) vorr_u64 ((uint64x1_t) n,
902 vshr_n_u64 ((uint64x1_t) n, 24)), 0);
903 found &= mask;
905 while (!found);
907 /* FOUND contains 1 in bits for which we matched a relevant
908 character. Conversion to the byte index is trivial. */
909 found = __builtin_ctz (found);
910 return (const uchar *)p + found;
913 #else
915 /* We only have one accelerated alternative. Use a direct call so that
916 we encourage inlining. */
918 #define search_line_fast search_line_acc_char
920 #endif
/* Initialize the lexer if needed.  This runs init_vectorized_lexer
   when the configuration above defined one, and does nothing
   otherwise.  */

void
_cpp_init_lexer (void)
{
#if defined (HAVE_init_vectorized_lexer)
  init_vectorized_lexer ();
#endif
}
932 /* Returns with a logical line that contains no escaped newlines or
933 trigraphs. This is a time-critical inner loop. */
934 void
935 _cpp_clean_line (cpp_reader *pfile)
937 cpp_buffer *buffer;
938 const uchar *s;
939 uchar c, *d, *p;
941 buffer = pfile->buffer;
942 buffer->cur_note = buffer->notes_used = 0;
943 buffer->cur = buffer->line_base = buffer->next_line;
944 buffer->need_line = false;
945 s = buffer->next_line;
947 if (!buffer->from_stage3)
949 const uchar *pbackslash = NULL;
951 /* Fast path. This is the common case of an un-escaped line with
952 no trigraphs. The primary win here is by not writing any
953 data back to memory until we have to. */
954 while (1)
956 /* Perform an optimized search for \n, \r, \\, ?. */
957 s = search_line_fast (s, buffer->rlimit);
959 c = *s;
960 if (c == '\\')
962 /* Record the location of the backslash and continue. */
963 pbackslash = s++;
965 else if (__builtin_expect (c == '?', 0))
967 if (__builtin_expect (s[1] == '?', false)
968 && _cpp_trigraph_map[s[2]])
970 /* Have a trigraph. We may or may not have to convert
971 it. Add a line note regardless, for -Wtrigraphs. */
972 add_line_note (buffer, s, s[2]);
973 if (CPP_OPTION (pfile, trigraphs))
975 /* We do, and that means we have to switch to the
976 slow path. */
977 d = (uchar *) s;
978 *d = _cpp_trigraph_map[s[2]];
979 s += 2;
980 goto slow_path;
983 /* Not a trigraph. Continue on fast-path. */
984 s++;
986 else
987 break;
990 /* This must be \r or \n. We're either done, or we'll be forced
991 to write back to the buffer and continue on the slow path. */
992 d = (uchar *) s;
994 if (__builtin_expect (s == buffer->rlimit, false))
995 goto done;
997 /* DOS line ending? */
998 if (__builtin_expect (c == '\r', false) && s[1] == '\n')
1000 s++;
1001 if (s == buffer->rlimit)
1002 goto done;
1005 if (__builtin_expect (pbackslash == NULL, true))
1006 goto done;
1008 /* Check for escaped newline. */
1009 p = d;
1010 while (is_nvspace (p[-1]))
1011 p--;
1012 if (p - 1 != pbackslash)
1013 goto done;
1015 /* Have an escaped newline; process it and proceed to
1016 the slow path. */
1017 add_line_note (buffer, p - 1, p != d ? ' ' : '\\');
1018 d = p - 2;
1019 buffer->next_line = p - 1;
1021 slow_path:
1022 while (1)
1024 c = *++s;
1025 *++d = c;
1027 if (c == '\n' || c == '\r')
1029 /* Handle DOS line endings. */
1030 if (c == '\r' && s != buffer->rlimit && s[1] == '\n')
1031 s++;
1032 if (s == buffer->rlimit)
1033 break;
1035 /* Escaped? */
1036 p = d;
1037 while (p != buffer->next_line && is_nvspace (p[-1]))
1038 p--;
1039 if (p == buffer->next_line || p[-1] != '\\')
1040 break;
1042 add_line_note (buffer, p - 1, p != d ? ' ': '\\');
1043 d = p - 2;
1044 buffer->next_line = p - 1;
1046 else if (c == '?' && s[1] == '?' && _cpp_trigraph_map[s[2]])
1048 /* Add a note regardless, for the benefit of -Wtrigraphs. */
1049 add_line_note (buffer, d, s[2]);
1050 if (CPP_OPTION (pfile, trigraphs))
1052 *d = _cpp_trigraph_map[s[2]];
1053 s += 2;
1058 else
1060 while (*s != '\n' && *s != '\r')
1061 s++;
1062 d = (uchar *) s;
1064 /* Handle DOS line endings. */
1065 if (*s == '\r' && s + 1 != buffer->rlimit && s[1] == '\n')
1066 s++;
1069 done:
1070 *d = '\n';
1071 /* A sentinel note that should never be processed. */
1072 add_line_note (buffer, d + 1, '\n');
1073 buffer->next_line = s + 1;
1076 /* Return true if the trigraph indicated by NOTE should be warned
1077 about in a comment. */
1078 static bool
1079 warn_in_comment (cpp_reader *pfile, _cpp_line_note *note)
1081 const uchar *p;
1083 /* Within comments we don't warn about trigraphs, unless the
1084 trigraph forms an escaped newline, as that may change
1085 behavior. */
1086 if (note->type != '/')
1087 return false;
1089 /* If -trigraphs, then this was an escaped newline iff the next note
1090 is coincident. */
1091 if (CPP_OPTION (pfile, trigraphs))
1092 return note[1].pos == note->pos;
1094 /* Otherwise, see if this forms an escaped newline. */
1095 p = note->pos + 3;
1096 while (is_nvspace (*p))
1097 p++;
1099 /* There might have been escaped newlines between the trigraph and the
1100 newline we found. Hence the position test. */
1101 return (*p == '\n' && p < note[1].pos);
1104 /* Process the notes created by add_line_note as far as the current
1105 location. */
1106 void
1107 _cpp_process_line_notes (cpp_reader *pfile, int in_comment)
1109 cpp_buffer *buffer = pfile->buffer;
1111 for (;;)
1113 _cpp_line_note *note = &buffer->notes[buffer->cur_note];
1114 unsigned int col;
1116 if (note->pos > buffer->cur)
1117 break;
1119 buffer->cur_note++;
1120 col = CPP_BUF_COLUMN (buffer, note->pos + 1);
1122 if (note->type == '\\' || note->type == ' ')
1124 if (note->type == ' ' && !in_comment)
1125 cpp_error_with_line (pfile, CPP_DL_WARNING, pfile->line_table->highest_line, col,
1126 "backslash and newline separated by space");
1128 if (buffer->next_line > buffer->rlimit)
1130 cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line, col,
1131 "backslash-newline at end of file");
1132 /* Prevent "no newline at end of file" warning. */
1133 buffer->next_line = buffer->rlimit;
1136 buffer->line_base = note->pos;
1137 CPP_INCREMENT_LINE (pfile, 0);
1139 else if (_cpp_trigraph_map[note->type])
1141 if (CPP_OPTION (pfile, warn_trigraphs)
1142 && (!in_comment || warn_in_comment (pfile, note)))
1144 if (CPP_OPTION (pfile, trigraphs))
1145 cpp_warning_with_line (pfile, CPP_W_TRIGRAPHS,
1146 pfile->line_table->highest_line, col,
1147 "trigraph ??%c converted to %c",
1148 note->type,
1149 (int) _cpp_trigraph_map[note->type]);
1150 else
1152 cpp_warning_with_line
1153 (pfile, CPP_W_TRIGRAPHS,
1154 pfile->line_table->highest_line, col,
1155 "trigraph ??%c ignored, use -trigraphs to enable",
1156 note->type);
1160 else if (note->type == 0)
1161 /* Already processed in lex_raw_string. */;
1162 else
1163 abort ();
1167 /* Skip a C-style block comment. We find the end of the comment by
1168 seeing if an asterisk is before every '/' we encounter. Returns
1169 nonzero if comment terminated by EOF, zero otherwise.
1171 Buffer->cur points to the initial asterisk of the comment. */
1172 bool
1173 _cpp_skip_block_comment (cpp_reader *pfile)
1175 cpp_buffer *buffer = pfile->buffer;
1176 const uchar *cur = buffer->cur;
1177 uchar c;
1179 cur++;
1180 if (*cur == '/')
1181 cur++;
1183 for (;;)
1185 /* People like decorating comments with '*', so check for '/'
1186 instead for efficiency. */
1187 c = *cur++;
1189 if (c == '/')
1191 if (cur[-2] == '*')
1192 break;
1194 /* Warn about potential nested comments, but not if the '/'
1195 comes immediately before the true comment delimiter.
1196 Don't bother to get it right across escaped newlines. */
1197 if (CPP_OPTION (pfile, warn_comments)
1198 && cur[0] == '*' && cur[1] != '/')
1200 buffer->cur = cur;
1201 cpp_warning_with_line (pfile, CPP_W_COMMENTS,
1202 pfile->line_table->highest_line,
1203 CPP_BUF_COL (buffer),
1204 "\"/*\" within comment");
1207 else if (c == '\n')
1209 unsigned int cols;
1210 buffer->cur = cur - 1;
1211 _cpp_process_line_notes (pfile, true);
1212 if (buffer->next_line >= buffer->rlimit)
1213 return true;
1214 _cpp_clean_line (pfile);
1216 cols = buffer->next_line - buffer->line_base;
1217 CPP_INCREMENT_LINE (pfile, cols);
1219 cur = buffer->cur;
1223 buffer->cur = cur;
1224 _cpp_process_line_notes (pfile, true);
1225 return false;
1228 /* Skip a C++ line comment, leaving buffer->cur pointing to the
1229 terminating newline. Handles escaped newlines. Returns nonzero
1230 if a multiline comment. */
1231 static int
1232 skip_line_comment (cpp_reader *pfile)
1234 cpp_buffer *buffer = pfile->buffer;
1235 location_t orig_line = pfile->line_table->highest_line;
1237 while (*buffer->cur != '\n')
1238 buffer->cur++;
1240 _cpp_process_line_notes (pfile, true);
1241 return orig_line != pfile->line_table->highest_line;
1244 /* Skips whitespace, saving the next non-whitespace character. */
1245 static void
1246 skip_whitespace (cpp_reader *pfile, cppchar_t c)
1248 cpp_buffer *buffer = pfile->buffer;
1249 bool saw_NUL = false;
1253 /* Horizontal space always OK. */
1254 if (c == ' ' || c == '\t')
1256 /* Just \f \v or \0 left. */
1257 else if (c == '\0')
1258 saw_NUL = true;
1259 else if (pfile->state.in_directive && CPP_PEDANTIC (pfile))
1260 cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line,
1261 CPP_BUF_COL (buffer),
1262 "%s in preprocessing directive",
1263 c == '\f' ? "form feed" : "vertical tab");
1265 c = *buffer->cur++;
1267 /* We only want non-vertical space, i.e. ' ' \t \f \v \0. */
1268 while (is_nvspace (c));
1270 if (saw_NUL)
1272 encoding_rich_location rich_loc (pfile);
1273 cpp_error_at (pfile, CPP_DL_WARNING, &rich_loc,
1274 "null character(s) ignored");
1277 buffer->cur--;
1280 /* See if the characters of a number token are valid in a name (no
1281 '.', '+' or '-'). */
1282 static int
1283 name_p (cpp_reader *pfile, const cpp_string *string)
1285 unsigned int i;
1287 for (i = 0; i < string->len; i++)
1288 if (!is_idchar (string->text[i]))
1289 return 0;
1291 return 1;
1294 /* After parsing an identifier or other sequence, produce a warning about
1295 sequences not in NFC/NFKC. */
1296 static void
1297 warn_about_normalization (cpp_reader *pfile,
1298 const cpp_token *token,
1299 const struct normalize_state *s)
1301 if (CPP_OPTION (pfile, warn_normalize) < NORMALIZE_STATE_RESULT (s)
1302 && !pfile->state.skipping)
1304 location_t loc = token->src_loc;
1306 /* If possible, create a location range for the token. */
1307 if (loc >= RESERVED_LOCATION_COUNT
1308 && token->type != CPP_EOF
1309 /* There must be no line notes to process. */
1310 && (!(pfile->buffer->cur
1311 >= pfile->buffer->notes[pfile->buffer->cur_note].pos
1312 && !pfile->overlaid_buffer)))
1314 source_range tok_range;
1315 tok_range.m_start = loc;
1316 tok_range.m_finish
1317 = linemap_position_for_column (pfile->line_table,
1318 CPP_BUF_COLUMN (pfile->buffer,
1319 pfile->buffer->cur));
1320 loc = COMBINE_LOCATION_DATA (pfile->line_table,
1321 loc, tok_range, NULL);
1324 encoding_rich_location rich_loc (pfile, loc);
1326 /* Make sure that the token is printed using UCNs, even
1327 if we'd otherwise happily print UTF-8. */
1328 unsigned char *buf = XNEWVEC (unsigned char, cpp_token_len (token));
1329 size_t sz;
1331 sz = cpp_spell_token (pfile, token, buf, false) - buf;
1332 if (NORMALIZE_STATE_RESULT (s) == normalized_C)
1333 cpp_warning_at (pfile, CPP_W_NORMALIZE, &rich_loc,
1334 "`%.*s' is not in NFKC", (int) sz, buf);
1335 else if (CPP_OPTION (pfile, cxx23_identifiers))
1336 cpp_pedwarning_at (pfile, CPP_W_NORMALIZE, &rich_loc,
1337 "`%.*s' is not in NFC", (int) sz, buf);
1338 else
1339 cpp_warning_at (pfile, CPP_W_NORMALIZE, &rich_loc,
1340 "`%.*s' is not in NFC", (int) sz, buf);
1341 free (buf);
1345 static const cppchar_t utf8_signifier = 0xC0;
1347 /* Returns TRUE if the sequence starting at buffer->cur is valid in
1348 an identifier. FIRST is TRUE if this starts an identifier. */
1349 static bool
1350 forms_identifier_p (cpp_reader *pfile, int first,
1351 struct normalize_state *state)
1353 cpp_buffer *buffer = pfile->buffer;
1355 if (*buffer->cur == '$')
1357 if (!CPP_OPTION (pfile, dollars_in_ident))
1358 return false;
1360 buffer->cur++;
1361 if (CPP_OPTION (pfile, warn_dollars) && !pfile->state.skipping)
1363 CPP_OPTION (pfile, warn_dollars) = 0;
1364 cpp_error (pfile, CPP_DL_PEDWARN, "'$' in identifier or number");
1367 return true;
1370 /* Is this a syntactically valid UCN or a valid UTF-8 char? */
1371 if (CPP_OPTION (pfile, extended_identifiers))
1373 cppchar_t s;
1374 if (*buffer->cur >= utf8_signifier)
1376 if (_cpp_valid_utf8 (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
1377 state, &s))
1378 return true;
1380 else if (*buffer->cur == '\\'
1381 && (buffer->cur[1] == 'u' || buffer->cur[1] == 'U'))
1383 buffer->cur += 2;
1384 if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
1385 state, &s, NULL, NULL))
1386 return true;
1387 buffer->cur -= 2;
1391 return false;
1394 /* Helper function to issue error about improper __VA_OPT__ use. */
1395 static void
1396 maybe_va_opt_error (cpp_reader *pfile)
1398 if (CPP_PEDANTIC (pfile) && !CPP_OPTION (pfile, va_opt))
1400 /* __VA_OPT__ should not be accepted at all, but allow it in
1401 system headers. */
1402 if (!_cpp_in_system_header (pfile))
1403 cpp_error (pfile, CPP_DL_PEDWARN,
1404 "__VA_OPT__ is not available until C++20");
1406 else if (!pfile->state.va_args_ok)
1408 /* __VA_OPT__ should only appear in the replacement list of a
1409 variadic macro. */
1410 cpp_error (pfile, CPP_DL_PEDWARN,
1411 "__VA_OPT__ can only appear in the expansion"
1412 " of a C++20 variadic macro");
1416 /* Helper function to get the cpp_hashnode of the identifier BASE. */
1417 static cpp_hashnode *
1418 lex_identifier_intern (cpp_reader *pfile, const uchar *base)
1420 cpp_hashnode *result;
1421 const uchar *cur;
1422 unsigned int len;
1423 unsigned int hash = HT_HASHSTEP (0, *base);
1425 cur = base + 1;
1426 while (ISIDNUM (*cur))
1428 hash = HT_HASHSTEP (hash, *cur);
1429 cur++;
1431 len = cur - base;
1432 hash = HT_HASHFINISH (hash, len);
1433 result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
1434 base, len, hash, HT_ALLOC));
1436 /* Rarely, identifiers require diagnostics when lexed. */
1437 if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
1438 && !pfile->state.skipping, 0))
1440 /* It is allowed to poison the same identifier twice. */
1441 if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
1442 cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
1443 NODE_NAME (result));
1445 /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
1446 replacement list of a variadic macro. */
1447 if (result == pfile->spec_nodes.n__VA_ARGS__
1448 && !pfile->state.va_args_ok)
1450 if (CPP_OPTION (pfile, cplusplus))
1451 cpp_error (pfile, CPP_DL_PEDWARN,
1452 "__VA_ARGS__ can only appear in the expansion"
1453 " of a C++11 variadic macro");
1454 else
1455 cpp_error (pfile, CPP_DL_PEDWARN,
1456 "__VA_ARGS__ can only appear in the expansion"
1457 " of a C99 variadic macro");
1460 if (result == pfile->spec_nodes.n__VA_OPT__)
1461 maybe_va_opt_error (pfile);
1463 /* For -Wc++-compat, warn about use of C++ named operators. */
1464 if (result->flags & NODE_WARN_OPERATOR)
1465 cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
1466 "identifier \"%s\" is a special operator name in C++",
1467 NODE_NAME (result));
1470 return result;
1473 /* Get the cpp_hashnode of an identifier specified by NAME in
1474 the current cpp_reader object. If none is found, NULL is returned. */
1475 cpp_hashnode *
1476 _cpp_lex_identifier (cpp_reader *pfile, const char *name)
1478 cpp_hashnode *result;
1479 result = lex_identifier_intern (pfile, (uchar *) name);
1480 return result;
1483 /* Lex an identifier starting at BUFFER->CUR - 1. */
1484 static cpp_hashnode *
1485 lex_identifier (cpp_reader *pfile, const uchar *base, bool starts_ucn,
1486 struct normalize_state *nst, cpp_hashnode **spelling)
1488 cpp_hashnode *result;
1489 const uchar *cur;
1490 unsigned int len;
1491 unsigned int hash = HT_HASHSTEP (0, *base);
1493 cur = pfile->buffer->cur;
1494 if (! starts_ucn)
1496 while (ISIDNUM (*cur))
1498 hash = HT_HASHSTEP (hash, *cur);
1499 cur++;
1501 NORMALIZE_STATE_UPDATE_IDNUM (nst, *(cur - 1));
1503 pfile->buffer->cur = cur;
1504 if (starts_ucn || forms_identifier_p (pfile, false, nst))
1506 /* Slower version for identifiers containing UCNs
1507 or extended chars (including $). */
1508 do {
1509 while (ISIDNUM (*pfile->buffer->cur))
1511 NORMALIZE_STATE_UPDATE_IDNUM (nst, *pfile->buffer->cur);
1512 pfile->buffer->cur++;
1514 } while (forms_identifier_p (pfile, false, nst));
1515 result = _cpp_interpret_identifier (pfile, base,
1516 pfile->buffer->cur - base);
1517 *spelling = cpp_lookup (pfile, base, pfile->buffer->cur - base);
1519 else
1521 len = cur - base;
1522 hash = HT_HASHFINISH (hash, len);
1524 result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
1525 base, len, hash, HT_ALLOC));
1526 *spelling = result;
1529 /* Rarely, identifiers require diagnostics when lexed. */
1530 if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
1531 && !pfile->state.skipping, 0))
1533 /* It is allowed to poison the same identifier twice. */
1534 if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
1535 cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
1536 NODE_NAME (result));
1538 /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
1539 replacement list of a variadic macro. */
1540 if (result == pfile->spec_nodes.n__VA_ARGS__
1541 && !pfile->state.va_args_ok)
1543 if (CPP_OPTION (pfile, cplusplus))
1544 cpp_error (pfile, CPP_DL_PEDWARN,
1545 "__VA_ARGS__ can only appear in the expansion"
1546 " of a C++11 variadic macro");
1547 else
1548 cpp_error (pfile, CPP_DL_PEDWARN,
1549 "__VA_ARGS__ can only appear in the expansion"
1550 " of a C99 variadic macro");
1553 /* __VA_OPT__ should only appear in the replacement list of a
1554 variadic macro. */
1555 if (result == pfile->spec_nodes.n__VA_OPT__)
1556 maybe_va_opt_error (pfile);
1558 /* For -Wc++-compat, warn about use of C++ named operators. */
1559 if (result->flags & NODE_WARN_OPERATOR)
1560 cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
1561 "identifier \"%s\" is a special operator name in C++",
1562 NODE_NAME (result));
1565 return result;
1568 /* Lex a number to NUMBER starting at BUFFER->CUR - 1. */
1569 static void
1570 lex_number (cpp_reader *pfile, cpp_string *number,
1571 struct normalize_state *nst)
1573 const uchar *cur;
1574 const uchar *base;
1575 uchar *dest;
1577 base = pfile->buffer->cur - 1;
1580 const uchar *adj_digit_sep = NULL;
1581 cur = pfile->buffer->cur;
1583 /* N.B. ISIDNUM does not include $. */
1584 while (ISIDNUM (*cur)
1585 || (*cur == '.' && !DIGIT_SEP (cur[-1]))
1586 || DIGIT_SEP (*cur)
1587 || (VALID_SIGN (*cur, cur[-1]) && !DIGIT_SEP (cur[-2])))
1589 NORMALIZE_STATE_UPDATE_IDNUM (nst, *cur);
1590 /* Adjacent digit separators do not form part of the pp-number syntax.
1591 However, they can safely be diagnosed here as an error, since '' is
1592 not a valid preprocessing token. */
1593 if (DIGIT_SEP (*cur) && DIGIT_SEP (cur[-1]) && !adj_digit_sep)
1594 adj_digit_sep = cur;
1595 cur++;
1597 /* A number can't end with a digit separator. */
1598 while (cur > pfile->buffer->cur && DIGIT_SEP (cur[-1]))
1599 --cur;
1600 if (adj_digit_sep && adj_digit_sep < cur)
1601 cpp_error (pfile, CPP_DL_ERROR, "adjacent digit separators");
1603 pfile->buffer->cur = cur;
1605 while (forms_identifier_p (pfile, false, nst));
1607 number->len = cur - base;
1608 dest = _cpp_unaligned_alloc (pfile, number->len + 1);
1609 memcpy (dest, base, number->len);
1610 dest[number->len] = '\0';
1611 number->text = dest;
1614 /* Create a token of type TYPE with a literal spelling. */
1615 static void
1616 create_literal (cpp_reader *pfile, cpp_token *token, const uchar *base,
1617 unsigned int len, enum cpp_ttype type)
1619 token->type = type;
1620 token->val.str.len = len;
1621 token->val.str.text = cpp_alloc_token_string (pfile, base, len);
1624 const uchar *
1625 cpp_alloc_token_string (cpp_reader *pfile,
1626 const unsigned char *ptr, unsigned len)
1628 uchar *dest = _cpp_unaligned_alloc (pfile, len + 1);
1630 dest[len] = 0;
1631 memcpy (dest, ptr, len);
1632 return dest;
1635 /* A pair of raw buffer pointers. The currently open one is [1], the
1636 first one is [0]. Used for string literal lexing. */
1637 struct lit_accum {
1638 _cpp_buff *first;
1639 _cpp_buff *last;
1640 const uchar *rpos;
1641 size_t accum;
1643 lit_accum ()
1644 : first (NULL), last (NULL), rpos (0), accum (0)
1648 void append (cpp_reader *, const uchar *, size_t);
1650 void read_begin (cpp_reader *);
1651 bool reading_p () const
1653 return rpos != NULL;
1655 char read_char ()
1657 char c = *rpos++;
1658 if (rpos == BUFF_FRONT (last))
1659 rpos = NULL;
1660 return c;
1664 /* Subroutine of lex_raw_string: Append LEN chars from BASE to the buffer
1665 sequence from *FIRST_BUFF_P to LAST_BUFF_P. */
1667 void
1668 lit_accum::append (cpp_reader *pfile, const uchar *base, size_t len)
1670 if (!last)
1671 /* Starting. */
1672 first = last = _cpp_get_buff (pfile, len);
1673 else if (len > BUFF_ROOM (last))
1675 /* There is insufficient room in the buffer. Copy what we can,
1676 and then either extend or create a new one. */
1677 size_t room = BUFF_ROOM (last);
1678 memcpy (BUFF_FRONT (last), base, room);
1679 BUFF_FRONT (last) += room;
1680 base += room;
1681 len -= room;
1682 accum += room;
1684 gcc_checking_assert (!rpos);
1686 last = _cpp_append_extend_buff (pfile, last, len);
1689 memcpy (BUFF_FRONT (last), base, len);
1690 BUFF_FRONT (last) += len;
1691 accum += len;
1694 void
1695 lit_accum::read_begin (cpp_reader *pfile)
1697 /* We never accumulate more than 4 chars to read. */
1698 if (BUFF_ROOM (last) < 4)
1700 last = _cpp_append_extend_buff (pfile, last, 4);
1701 rpos = BUFF_FRONT (last);
1704 /* Returns true if a macro has been defined.
1705 This might not work if compile with -save-temps,
1706 or preprocess separately from compilation. */
1708 static bool
1709 is_macro(cpp_reader *pfile, const uchar *base)
1711 const uchar *cur = base;
1712 if (! ISIDST (*cur))
1713 return false;
1714 unsigned int hash = HT_HASHSTEP (0, *cur);
1715 ++cur;
1716 while (ISIDNUM (*cur))
1718 hash = HT_HASHSTEP (hash, *cur);
1719 ++cur;
1721 hash = HT_HASHFINISH (hash, cur - base);
1723 cpp_hashnode *result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
1724 base, cur - base, hash, HT_NO_INSERT));
1726 return result && cpp_macro_p (result);
1729 /* Returns true if a literal suffix does not have the expected form
1730 and is defined as a macro. */
1732 static bool
1733 is_macro_not_literal_suffix(cpp_reader *pfile, const uchar *base)
1735 /* User-defined literals outside of namespace std must start with a single
1736 underscore, so assume anything of that form really is a UDL suffix.
1737 We don't need to worry about UDLs defined inside namespace std because
1738 their names are reserved, so cannot be used as macro names in valid
1739 programs. */
1740 if (base[0] == '_' && base[1] != '_')
1741 return false;
1742 return is_macro (pfile, base);
1745 /* Lexes a raw string. The stored string contains the spelling,
1746 including double quotes, delimiter string, '(' and ')', any leading
1747 'L', 'u', 'U' or 'u8' and 'R' modifier. The created token contains
1748 the type of the literal, or CPP_OTHER if it was not properly
1749 terminated.
1751 BASE is the start of the token. Updates pfile->buffer->cur to just
1752 after the lexed string.
1754 The spelling is NUL-terminated, but it is not guaranteed that this
1755 is the first NUL since embedded NULs are preserved. */
1757 static void
1758 lex_raw_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
1760 const uchar *pos = base;
1762 /* 'tis a pity this information isn't passed down from the lexer's
1763 initial categorization of the token. */
1764 enum cpp_ttype type = CPP_STRING;
1766 if (*pos == 'L')
1768 type = CPP_WSTRING;
1769 pos++;
1771 else if (*pos == 'U')
1773 type = CPP_STRING32;
1774 pos++;
1776 else if (*pos == 'u')
1778 if (pos[1] == '8')
1780 type = CPP_UTF8STRING;
1781 pos++;
1783 else
1784 type = CPP_STRING16;
1785 pos++;
1788 gcc_checking_assert (pos[0] == 'R' && pos[1] == '"');
1789 pos += 2;
1791 _cpp_line_note *note = &pfile->buffer->notes[pfile->buffer->cur_note];
1793 /* Skip notes before the ". */
1794 while (note->pos < pos)
1795 ++note;
1797 lit_accum accum;
1799 uchar prefix[17];
1800 unsigned prefix_len = 0;
1801 enum Phase
1803 PHASE_PREFIX = -2,
1804 PHASE_NONE = -1,
1805 PHASE_SUFFIX = 0
1806 } phase = PHASE_PREFIX;
1808 for (;;)
1810 gcc_checking_assert (note->pos >= pos);
1812 /* Undo any escaped newlines and trigraphs. */
1813 if (!accum.reading_p () && note->pos == pos)
1814 switch (note->type)
1816 case '\\':
1817 case ' ':
1818 /* Restore backslash followed by newline. */
1819 accum.append (pfile, base, pos - base);
1820 base = pos;
1821 accum.read_begin (pfile);
1822 accum.append (pfile, UC"\\", 1);
1824 after_backslash:
1825 if (note->type == ' ')
1826 /* GNU backslash whitespace newline extension. FIXME
1827 could be any sequence of non-vertical space. When we
1828 can properly restore any such sequence, we should
1829 mark this note as handled so _cpp_process_line_notes
1830 doesn't warn. */
1831 accum.append (pfile, UC" ", 1);
1833 accum.append (pfile, UC"\n", 1);
1834 note++;
1835 break;
1837 case '\n':
1838 /* This can happen for ??/<NEWLINE> when trigraphs are not
1839 being interpretted. */
1840 gcc_checking_assert (!CPP_OPTION (pfile, trigraphs));
1841 note->type = 0;
1842 note++;
1843 break;
1845 default:
1846 gcc_checking_assert (_cpp_trigraph_map[note->type]);
1848 /* Don't warn about this trigraph in
1849 _cpp_process_line_notes, since trigraphs show up as
1850 trigraphs in raw strings. */
1851 uchar type = note->type;
1852 note->type = 0;
1854 if (CPP_OPTION (pfile, trigraphs))
1856 accum.append (pfile, base, pos - base);
1857 base = pos;
1858 accum.read_begin (pfile);
1859 accum.append (pfile, UC"??", 2);
1860 accum.append (pfile, &type, 1);
1862 /* ??/ followed by newline gets two line notes, one for
1863 the trigraph and one for the backslash/newline. */
1864 if (type == '/' && note[1].pos == pos)
1866 note++;
1867 gcc_assert (note->type == '\\' || note->type == ' ');
1868 goto after_backslash;
1870 /* Skip the replacement character. */
1871 base = ++pos;
1874 note++;
1875 break;
1878 /* Now get a char to process. Either from an expanded note, or
1879 from the line buffer. */
1880 bool read_note = accum.reading_p ();
1881 char c = read_note ? accum.read_char () : *pos++;
1883 if (phase == PHASE_PREFIX)
1885 if (c == '(')
1887 /* Done. */
1888 phase = PHASE_NONE;
1889 prefix[prefix_len++] = '"';
1891 else if (prefix_len < 16
1892 /* Prefix chars are any of the basic character set,
1893 [lex.charset] except for '
1894 ()\\\t\v\f\n'. Optimized for a contiguous
1895 alphabet. */
1896 /* Unlike a switch, this collapses down to one or
1897 two shift and bitmask operations on an ASCII
1898 system, with an outlier or two. */
1899 && (('Z' - 'A' == 25
1900 ? ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'))
1901 : ISIDST (c))
1902 || (c >= '0' && c <= '9')
1903 || c == '_' || c == '{' || c == '}'
1904 || c == '[' || c == ']' || c == '#'
1905 || c == '<' || c == '>' || c == '%'
1906 || c == ':' || c == ';' || c == '.' || c == '?'
1907 || c == '*' || c == '+' || c == '-' || c == '/'
1908 || c == '^' || c == '&' || c == '|' || c == '~'
1909 || c == '!' || c == '=' || c == ','
1910 || c == '"' || c == '\''))
1911 prefix[prefix_len++] = c;
1912 else
1914 /* Something is wrong. */
1915 int col = CPP_BUF_COLUMN (pfile->buffer, pos) + read_note;
1916 if (prefix_len == 16)
1917 cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
1918 col, "raw string delimiter longer "
1919 "than 16 characters");
1920 else if (c == '\n')
1921 cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
1922 col, "invalid new-line in raw "
1923 "string delimiter");
1924 else
1925 cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
1926 col, "invalid character '%c' in "
1927 "raw string delimiter", c);
1928 type = CPP_OTHER;
1929 phase = PHASE_NONE;
1930 /* Continue until we get a close quote, that's probably
1931 the best failure mode. */
1932 prefix_len = 0;
1934 if (c != '\n')
1935 continue;
1938 if (phase != PHASE_NONE)
1940 if (prefix[phase] != c)
1941 phase = PHASE_NONE;
1942 else if (unsigned (phase + 1) == prefix_len)
1943 break;
1944 else
1946 phase = Phase (phase + 1);
1947 continue;
1951 if (!prefix_len && c == '"')
1952 /* Failure mode lexing. */
1953 goto out;
1954 else if (prefix_len && c == ')')
1955 phase = PHASE_SUFFIX;
1956 else if (!read_note && c == '\n')
1958 pos--;
1959 pfile->buffer->cur = pos;
1960 if (pfile->state.in_directive
1961 || (pfile->state.parsing_args
1962 && pfile->buffer->next_line >= pfile->buffer->rlimit))
1964 cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, 0,
1965 "unterminated raw string");
1966 type = CPP_OTHER;
1967 goto out;
1970 accum.append (pfile, base, pos - base + 1);
1971 _cpp_process_line_notes (pfile, false);
1973 if (pfile->buffer->next_line < pfile->buffer->rlimit)
1974 CPP_INCREMENT_LINE (pfile, 0);
1975 pfile->buffer->need_line = true;
1977 if (!_cpp_get_fresh_line (pfile))
1979 /* We ran out of file and failed to get a line. */
1980 location_t src_loc = token->src_loc;
1981 token->type = CPP_EOF;
1982 /* Tell the compiler the line number of the EOF token. */
1983 token->src_loc = pfile->line_table->highest_line;
1984 token->flags = BOL;
1985 if (accum.first)
1986 _cpp_release_buff (pfile, accum.first);
1987 cpp_error_with_line (pfile, CPP_DL_ERROR, src_loc, 0,
1988 "unterminated raw string");
1989 /* Now pop the buffer that _cpp_get_fresh_line did not. */
1990 _cpp_pop_buffer (pfile);
1991 return;
1994 pos = base = pfile->buffer->cur;
1995 note = &pfile->buffer->notes[pfile->buffer->cur_note];
1999 if (CPP_OPTION (pfile, user_literals))
2001 /* If a string format macro, say from inttypes.h, is placed touching
2002 a string literal it could be parsed as a C++11 user-defined string
2003 literal thus breaking the program. */
2004 if (is_macro_not_literal_suffix (pfile, pos))
2006 /* Raise a warning, but do not consume subsequent tokens. */
2007 if (CPP_OPTION (pfile, warn_literal_suffix) && !pfile->state.skipping)
2008 cpp_warning_with_line (pfile, CPP_W_LITERAL_SUFFIX,
2009 token->src_loc, 0,
2010 "invalid suffix on literal; C++11 requires "
2011 "a space between literal and string macro");
2013 /* Grab user defined literal suffix. */
2014 else if (ISIDST (*pos))
2016 type = cpp_userdef_string_add_type (type);
2017 ++pos;
2019 while (ISIDNUM (*pos))
2020 ++pos;
2024 out:
2025 pfile->buffer->cur = pos;
2026 if (!accum.accum)
2027 create_literal (pfile, token, base, pos - base, type);
2028 else
2030 size_t extra_len = pos - base;
2031 uchar *dest = _cpp_unaligned_alloc (pfile, accum.accum + extra_len + 1);
2033 token->type = type;
2034 token->val.str.len = accum.accum + extra_len;
2035 token->val.str.text = dest;
2036 for (_cpp_buff *buf = accum.first; buf; buf = buf->next)
2038 size_t len = BUFF_FRONT (buf) - buf->base;
2039 memcpy (dest, buf->base, len);
2040 dest += len;
2042 _cpp_release_buff (pfile, accum.first);
2043 memcpy (dest, base, extra_len);
2044 dest[extra_len] = '\0';
2048 /* Lexes a string, character constant, or angle-bracketed header file
2049 name. The stored string contains the spelling, including opening
2050 quote and any leading 'L', 'u', 'U' or 'u8' and optional
2051 'R' modifier. It returns the type of the literal, or CPP_OTHER
2052 if it was not properly terminated, or CPP_LESS for an unterminated
2053 header name which must be relexed as normal tokens.
2055 The spelling is NUL-terminated, but it is not guaranteed that this
2056 is the first NUL since embedded NULs are preserved. */
2057 static void
2058 lex_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
2060 bool saw_NUL = false;
2061 const uchar *cur;
2062 cppchar_t terminator;
2063 enum cpp_ttype type;
2065 cur = base;
2066 terminator = *cur++;
2067 if (terminator == 'L' || terminator == 'U')
2068 terminator = *cur++;
2069 else if (terminator == 'u')
2071 terminator = *cur++;
2072 if (terminator == '8')
2073 terminator = *cur++;
2075 if (terminator == 'R')
2077 lex_raw_string (pfile, token, base);
2078 return;
2080 if (terminator == '"')
2081 type = (*base == 'L' ? CPP_WSTRING :
2082 *base == 'U' ? CPP_STRING32 :
2083 *base == 'u' ? (base[1] == '8' ? CPP_UTF8STRING : CPP_STRING16)
2084 : CPP_STRING);
2085 else if (terminator == '\'')
2086 type = (*base == 'L' ? CPP_WCHAR :
2087 *base == 'U' ? CPP_CHAR32 :
2088 *base == 'u' ? (base[1] == '8' ? CPP_UTF8CHAR : CPP_CHAR16)
2089 : CPP_CHAR);
2090 else
2091 terminator = '>', type = CPP_HEADER_NAME;
2093 for (;;)
2095 cppchar_t c = *cur++;
2097 /* In #include-style directives, terminators are not escapable. */
2098 if (c == '\\' && !pfile->state.angled_headers && *cur != '\n')
2099 cur++;
2100 else if (c == terminator)
2101 break;
2102 else if (c == '\n')
2104 cur--;
2105 /* Unmatched quotes always yield undefined behavior, but
2106 greedy lexing means that what appears to be an unterminated
2107 header name may actually be a legitimate sequence of tokens. */
2108 if (terminator == '>')
2110 token->type = CPP_LESS;
2111 return;
2113 type = CPP_OTHER;
2114 break;
2116 else if (c == '\0')
2117 saw_NUL = true;
2120 if (saw_NUL && !pfile->state.skipping)
2121 cpp_error (pfile, CPP_DL_WARNING,
2122 "null character(s) preserved in literal");
2124 if (type == CPP_OTHER && CPP_OPTION (pfile, lang) != CLK_ASM)
2125 cpp_error (pfile, CPP_DL_PEDWARN, "missing terminating %c character",
2126 (int) terminator);
2128 if (CPP_OPTION (pfile, user_literals))
2130 /* If a string format macro, say from inttypes.h, is placed touching
2131 a string literal it could be parsed as a C++11 user-defined string
2132 literal thus breaking the program. */
2133 if (is_macro_not_literal_suffix (pfile, cur))
2135 /* Raise a warning, but do not consume subsequent tokens. */
2136 if (CPP_OPTION (pfile, warn_literal_suffix) && !pfile->state.skipping)
2137 cpp_warning_with_line (pfile, CPP_W_LITERAL_SUFFIX,
2138 token->src_loc, 0,
2139 "invalid suffix on literal; C++11 requires "
2140 "a space between literal and string macro");
2142 /* Grab user defined literal suffix. */
2143 else if (ISIDST (*cur))
2145 type = cpp_userdef_char_add_type (type);
2146 type = cpp_userdef_string_add_type (type);
2147 ++cur;
2149 while (ISIDNUM (*cur))
2150 ++cur;
2153 else if (CPP_OPTION (pfile, cpp_warn_cxx11_compat)
2154 && is_macro (pfile, cur)
2155 && !pfile->state.skipping)
2156 cpp_warning_with_line (pfile, CPP_W_CXX11_COMPAT,
2157 token->src_loc, 0, "C++11 requires a space "
2158 "between string literal and macro");
2160 pfile->buffer->cur = cur;
2161 create_literal (pfile, token, base, cur - base, type);
2164 /* Return the comment table. The client may not make any assumption
2165 about the ordering of the table. */
2166 cpp_comment_table *
2167 cpp_get_comments (cpp_reader *pfile)
2169 return &pfile->comments;
2172 /* Append a comment to the end of the comment table. */
2173 static void
2174 store_comment (cpp_reader *pfile, cpp_token *token)
2176 int len;
2178 if (pfile->comments.allocated == 0)
2180 pfile->comments.allocated = 256;
2181 pfile->comments.entries = (cpp_comment *) xmalloc
2182 (pfile->comments.allocated * sizeof (cpp_comment));
2185 if (pfile->comments.count == pfile->comments.allocated)
2187 pfile->comments.allocated *= 2;
2188 pfile->comments.entries = (cpp_comment *) xrealloc
2189 (pfile->comments.entries,
2190 pfile->comments.allocated * sizeof (cpp_comment));
2193 len = token->val.str.len;
2195 /* Copy comment. Note, token may not be NULL terminated. */
2196 pfile->comments.entries[pfile->comments.count].comment =
2197 (char *) xmalloc (sizeof (char) * (len + 1));
2198 memcpy (pfile->comments.entries[pfile->comments.count].comment,
2199 token->val.str.text, len);
2200 pfile->comments.entries[pfile->comments.count].comment[len] = '\0';
2202 /* Set source location. */
2203 pfile->comments.entries[pfile->comments.count].sloc = token->src_loc;
2205 /* Increment the count of entries in the comment table. */
2206 pfile->comments.count++;
2209 /* The stored comment includes the comment start and any terminator. */
2210 static void
2211 save_comment (cpp_reader *pfile, cpp_token *token, const unsigned char *from,
2212 cppchar_t type)
2214 unsigned char *buffer;
2215 unsigned int len, clen, i;
2217 len = pfile->buffer->cur - from + 1; /* + 1 for the initial '/'. */
2219 /* C++ comments probably (not definitely) have moved past a new
2220 line, which we don't want to save in the comment. */
2221 if (is_vspace (pfile->buffer->cur[-1]))
2222 len--;
2224 /* If we are currently in a directive or in argument parsing, then
2225 we need to store all C++ comments as C comments internally, and
2226 so we need to allocate a little extra space in that case.
2228 Note that the only time we encounter a directive here is
2229 when we are saving comments in a "#define". */
2230 clen = ((pfile->state.in_directive || pfile->state.parsing_args)
2231 && type == '/') ? len + 2 : len;
2233 buffer = _cpp_unaligned_alloc (pfile, clen);
2235 token->type = CPP_COMMENT;
2236 token->val.str.len = clen;
2237 token->val.str.text = buffer;
2239 buffer[0] = '/';
2240 memcpy (buffer + 1, from, len - 1);
2242 /* Finish conversion to a C comment, if necessary. */
2243 if ((pfile->state.in_directive || pfile->state.parsing_args) && type == '/')
2245 buffer[1] = '*';
2246 buffer[clen - 2] = '*';
2247 buffer[clen - 1] = '/';
2248 /* As there can be in a C++ comments illegal sequences for C comments
2249 we need to filter them out. */
2250 for (i = 2; i < (clen - 2); i++)
2251 if (buffer[i] == '/' && (buffer[i - 1] == '*' || buffer[i + 1] == '*'))
2252 buffer[i] = '|';
2255 /* Finally store this comment for use by clients of libcpp. */
2256 store_comment (pfile, token);
2259 /* Returns true if comment at COMMENT_START is a recognized FALLTHROUGH
2260 comment. */
2262 static bool
2263 fallthrough_comment_p (cpp_reader *pfile, const unsigned char *comment_start)
2265 const unsigned char *from = comment_start + 1;
2267 switch (CPP_OPTION (pfile, cpp_warn_implicit_fallthrough))
2269 /* For both -Wimplicit-fallthrough=0 and -Wimplicit-fallthrough=5 we
2270 don't recognize any comments. The latter only checks attributes,
2271 the former doesn't warn. */
2272 case 0:
2273 default:
2274 return false;
2275 /* -Wimplicit-fallthrough=1 considers any comment, no matter what
2276 content it has. */
2277 case 1:
2278 return true;
2279 case 2:
2280 /* -Wimplicit-fallthrough=2 looks for (case insensitive)
2281 .*falls?[ \t-]*thr(u|ough).* regex. */
2282 for (; (size_t) (pfile->buffer->cur - from) >= sizeof "fallthru" - 1;
2283 from++)
2285 /* Is there anything like strpbrk with upper boundary, or
2286 memchr looking for 2 characters rather than just one? */
2287 if (from[0] != 'f' && from[0] != 'F')
2288 continue;
2289 if (from[1] != 'a' && from[1] != 'A')
2290 continue;
2291 if (from[2] != 'l' && from[2] != 'L')
2292 continue;
2293 if (from[3] != 'l' && from[3] != 'L')
2294 continue;
2295 from += sizeof "fall" - 1;
2296 if (from[0] == 's' || from[0] == 'S')
2297 from++;
2298 while (*from == ' ' || *from == '\t' || *from == '-')
2299 from++;
2300 if (from[0] != 't' && from[0] != 'T')
2301 continue;
2302 if (from[1] != 'h' && from[1] != 'H')
2303 continue;
2304 if (from[2] != 'r' && from[2] != 'R')
2305 continue;
2306 if (from[3] == 'u' || from[3] == 'U')
2307 return true;
2308 if (from[3] != 'o' && from[3] != 'O')
2309 continue;
2310 if (from[4] != 'u' && from[4] != 'U')
2311 continue;
2312 if (from[5] != 'g' && from[5] != 'G')
2313 continue;
2314 if (from[6] != 'h' && from[6] != 'H')
2315 continue;
2316 return true;
2318 return false;
2319 case 3:
2320 case 4:
2321 break;
2324 /* Whole comment contents:
2325 -fallthrough
2326 @fallthrough@
2328 if (*from == '-' || *from == '@')
2330 size_t len = sizeof "fallthrough" - 1;
2331 if ((size_t) (pfile->buffer->cur - from - 1) < len)
2332 return false;
2333 if (memcmp (from + 1, "fallthrough", len))
2334 return false;
2335 if (*from == '@')
2337 if (from[len + 1] != '@')
2338 return false;
2339 len++;
2341 from += 1 + len;
2343 /* Whole comment contents (regex):
2344 lint -fallthrough[ \t]*
2346 else if (*from == 'l')
2348 size_t len = sizeof "int -fallthrough" - 1;
2349 if ((size_t) (pfile->buffer->cur - from - 1) < len)
2350 return false;
2351 if (memcmp (from + 1, "int -fallthrough", len))
2352 return false;
2353 from += 1 + len;
2354 while (*from == ' ' || *from == '\t')
2355 from++;
2357 /* Whole comment contents (regex):
2358 [ \t]*FALLTHR(U|OUGH)[ \t]*
2360 else if (CPP_OPTION (pfile, cpp_warn_implicit_fallthrough) == 4)
2362 while (*from == ' ' || *from == '\t')
2363 from++;
2364 if ((size_t) (pfile->buffer->cur - from) < sizeof "FALLTHRU" - 1)
2365 return false;
2366 if (memcmp (from, "FALLTHR", sizeof "FALLTHR" - 1))
2367 return false;
2368 from += sizeof "FALLTHR" - 1;
2369 if (*from == 'U')
2370 from++;
2371 else if ((size_t) (pfile->buffer->cur - from) < sizeof "OUGH" - 1)
2372 return false;
2373 else if (memcmp (from, "OUGH", sizeof "OUGH" - 1))
2374 return false;
2375 else
2376 from += sizeof "OUGH" - 1;
2377 while (*from == ' ' || *from == '\t')
2378 from++;
2380 /* Whole comment contents (regex):
2381 [ \t.!]*(ELSE,? |INTENTIONAL(LY)? )?FALL(S | |-)?THR(OUGH|U)[ \t.!]*(-[^\n\r]*)?
2382 [ \t.!]*(Else,? |Intentional(ly)? )?Fall((s | |-)[Tt]|t)hr(ough|u)[ \t.!]*(-[^\n\r]*)?
2383 [ \t.!]*([Ee]lse,? |[Ii]ntentional(ly)? )?fall(s | |-)?thr(ough|u)[ \t.!]*(-[^\n\r]*)?
2385 else
2387 while (*from == ' ' || *from == '\t' || *from == '.' || *from == '!')
2388 from++;
2389 unsigned char f = *from;
2390 bool all_upper = false;
2391 if (f == 'E' || f == 'e')
2393 if ((size_t) (pfile->buffer->cur - from)
2394 < sizeof "else fallthru" - 1)
2395 return false;
2396 if (f == 'E' && memcmp (from + 1, "LSE", sizeof "LSE" - 1) == 0)
2397 all_upper = true;
2398 else if (memcmp (from + 1, "lse", sizeof "lse" - 1))
2399 return false;
2400 from += sizeof "else" - 1;
2401 if (*from == ',')
2402 from++;
2403 if (*from != ' ')
2404 return false;
2405 from++;
2406 if (all_upper && *from == 'f')
2407 return false;
2408 if (f == 'e' && *from == 'F')
2409 return false;
2410 f = *from;
2412 else if (f == 'I' || f == 'i')
2414 if ((size_t) (pfile->buffer->cur - from)
2415 < sizeof "intentional fallthru" - 1)
2416 return false;
2417 if (f == 'I' && memcmp (from + 1, "NTENTIONAL",
2418 sizeof "NTENTIONAL" - 1) == 0)
2419 all_upper = true;
2420 else if (memcmp (from + 1, "ntentional",
2421 sizeof "ntentional" - 1))
2422 return false;
2423 from += sizeof "intentional" - 1;
2424 if (*from == ' ')
2426 from++;
2427 if (all_upper && *from == 'f')
2428 return false;
2430 else if (all_upper)
2432 if (memcmp (from, "LY F", sizeof "LY F" - 1))
2433 return false;
2434 from += sizeof "LY " - 1;
2436 else
2438 if (memcmp (from, "ly ", sizeof "ly " - 1))
2439 return false;
2440 from += sizeof "ly " - 1;
2442 if (f == 'i' && *from == 'F')
2443 return false;
2444 f = *from;
2446 if (f != 'F' && f != 'f')
2447 return false;
2448 if ((size_t) (pfile->buffer->cur - from) < sizeof "fallthru" - 1)
2449 return false;
2450 if (f == 'F' && memcmp (from + 1, "ALL", sizeof "ALL" - 1) == 0)
2451 all_upper = true;
2452 else if (all_upper)
2453 return false;
2454 else if (memcmp (from + 1, "all", sizeof "all" - 1))
2455 return false;
2456 from += sizeof "fall" - 1;
2457 if (*from == (all_upper ? 'S' : 's') && from[1] == ' ')
2458 from += 2;
2459 else if (*from == ' ' || *from == '-')
2460 from++;
2461 else if (*from != (all_upper ? 'T' : 't'))
2462 return false;
2463 if ((f == 'f' || *from != 'T') && (all_upper || *from != 't'))
2464 return false;
2465 if ((size_t) (pfile->buffer->cur - from) < sizeof "thru" - 1)
2466 return false;
2467 if (memcmp (from + 1, all_upper ? "HRU" : "hru", sizeof "hru" - 1))
2469 if ((size_t) (pfile->buffer->cur - from) < sizeof "through" - 1)
2470 return false;
2471 if (memcmp (from + 1, all_upper ? "HROUGH" : "hrough",
2472 sizeof "hrough" - 1))
2473 return false;
2474 from += sizeof "through" - 1;
2476 else
2477 from += sizeof "thru" - 1;
2478 while (*from == ' ' || *from == '\t' || *from == '.' || *from == '!')
2479 from++;
2480 if (*from == '-')
2482 from++;
2483 if (*comment_start == '*')
2487 while (*from && *from != '*'
2488 && *from != '\n' && *from != '\r')
2489 from++;
2490 if (*from != '*' || from[1] == '/')
2491 break;
2492 from++;
2494 while (1);
2496 else
2497 while (*from && *from != '\n' && *from != '\r')
2498 from++;
2501 /* C block comment. */
2502 if (*comment_start == '*')
2504 if (*from != '*' || from[1] != '/')
2505 return false;
2507 /* C++ line comment. */
2508 else if (*from != '\n')
2509 return false;
2511 return true;
2514 /* Allocate COUNT tokens for RUN. */
2515 void
2516 _cpp_init_tokenrun (tokenrun *run, unsigned int count)
2518 run->base = XNEWVEC (cpp_token, count);
2519 run->limit = run->base + count;
2520 run->next = NULL;
2523 /* Returns the next tokenrun, or creates one if there is none. */
2524 static tokenrun *
2525 next_tokenrun (tokenrun *run)
2527 if (run->next == NULL)
2529 run->next = XNEW (tokenrun);
2530 run->next->prev = run;
2531 _cpp_init_tokenrun (run->next, 250);
2534 return run->next;
2537 /* Return the number of not yet processed token in a given
2538 context. */
2540 _cpp_remaining_tokens_num_in_context (cpp_context *context)
2542 if (context->tokens_kind == TOKENS_KIND_DIRECT)
2543 return (LAST (context).token - FIRST (context).token);
2544 else if (context->tokens_kind == TOKENS_KIND_INDIRECT
2545 || context->tokens_kind == TOKENS_KIND_EXTENDED)
2546 return (LAST (context).ptoken - FIRST (context).ptoken);
2547 else
2548 abort ();
2551 /* Returns the token present at index INDEX in a given context. If
2552 INDEX is zero, the next token to be processed is returned. */
2553 static const cpp_token*
2554 _cpp_token_from_context_at (cpp_context *context, int index)
2556 if (context->tokens_kind == TOKENS_KIND_DIRECT)
2557 return &(FIRST (context).token[index]);
2558 else if (context->tokens_kind == TOKENS_KIND_INDIRECT
2559 || context->tokens_kind == TOKENS_KIND_EXTENDED)
2560 return FIRST (context).ptoken[index];
2561 else
2562 abort ();
2565 /* Look ahead in the input stream. */
2566 const cpp_token *
2567 cpp_peek_token (cpp_reader *pfile, int index)
2569 cpp_context *context = pfile->context;
2570 const cpp_token *peektok;
2571 int count;
2573 /* First, scan through any pending cpp_context objects. */
2574 while (context->prev)
2576 ptrdiff_t sz = _cpp_remaining_tokens_num_in_context (context);
2578 if (index < (int) sz)
2579 return _cpp_token_from_context_at (context, index);
2580 index -= (int) sz;
2581 context = context->prev;
2584 /* We will have to read some new tokens after all (and do so
2585 without invalidating preceding tokens). */
2586 count = index;
2587 pfile->keep_tokens++;
2589 /* For peeked tokens temporarily disable line_change reporting,
2590 until the tokens are parsed for real. */
2591 void (*line_change) (cpp_reader *, const cpp_token *, int)
2592 = pfile->cb.line_change;
2593 pfile->cb.line_change = NULL;
2597 peektok = _cpp_lex_token (pfile);
2598 if (peektok->type == CPP_EOF)
2600 index--;
2601 break;
2603 else if (peektok->type == CPP_PRAGMA)
2605 /* Don't peek past a pragma. */
2606 if (peektok == &pfile->directive_result)
2607 /* Save the pragma in the buffer. */
2608 *pfile->cur_token++ = *peektok;
2609 index--;
2610 break;
2613 while (index--);
2615 _cpp_backup_tokens_direct (pfile, count - index);
2616 pfile->keep_tokens--;
2617 pfile->cb.line_change = line_change;
2619 return peektok;
2622 /* Allocate a single token that is invalidated at the same time as the
2623 rest of the tokens on the line. Has its line and col set to the
2624 same as the last lexed token, so that diagnostics appear in the
2625 right place. */
2626 cpp_token *
2627 _cpp_temp_token (cpp_reader *pfile)
2629 cpp_token *old, *result;
2630 ptrdiff_t sz = pfile->cur_run->limit - pfile->cur_token;
2631 ptrdiff_t la = (ptrdiff_t) pfile->lookaheads;
2633 old = pfile->cur_token - 1;
2634 /* Any pre-existing lookaheads must not be clobbered. */
2635 if (la)
2637 if (sz <= la)
2639 tokenrun *next = next_tokenrun (pfile->cur_run);
2641 if (sz < la)
2642 memmove (next->base + 1, next->base,
2643 (la - sz) * sizeof (cpp_token));
2645 next->base[0] = pfile->cur_run->limit[-1];
2648 if (sz > 1)
2649 memmove (pfile->cur_token + 1, pfile->cur_token,
2650 MIN (la, sz - 1) * sizeof (cpp_token));
2653 if (!sz && pfile->cur_token == pfile->cur_run->limit)
2655 pfile->cur_run = next_tokenrun (pfile->cur_run);
2656 pfile->cur_token = pfile->cur_run->base;
2659 result = pfile->cur_token++;
2660 result->src_loc = old->src_loc;
2661 return result;
2664 /* We're at the beginning of a logical line (so not in
2665 directives-mode) and RESULT is a CPP_NAME with NODE_MODULE set. See
2666 if we should enter deferred_pragma mode to tokenize the rest of the
2667 line as a module control-line. */
2669 static void
2670 cpp_maybe_module_directive (cpp_reader *pfile, cpp_token *result)
2672 unsigned backup = 0; /* Tokens we peeked. */
2673 cpp_hashnode *node = result->val.node.node;
2674 cpp_token *peek = result;
2675 cpp_token *keyword = peek;
2676 cpp_hashnode *(&n_modules)[spec_nodes::M_HWM][2] = pfile->spec_nodes.n_modules;
2677 int header_count = 0;
2679 /* Make sure the incoming state is as we expect it. This way we
2680 can restore it using constants. */
2681 gcc_checking_assert (!pfile->state.in_deferred_pragma
2682 && !pfile->state.skipping
2683 && !pfile->state.parsing_args
2684 && !pfile->state.angled_headers
2685 && (pfile->state.save_comments
2686 == !CPP_OPTION (pfile, discard_comments)));
2688 /* Enter directives mode sufficiently for peeking. We don't have
2689 to actually set in_directive. */
2690 pfile->state.in_deferred_pragma = true;
2692 /* These two fields are needed to process tokenization in deferred
2693 pragma mode. They are not used outside deferred pragma mode or
2694 directives mode. */
2695 pfile->state.pragma_allow_expansion = true;
2696 pfile->directive_line = result->src_loc;
2698 /* Saving comments is incompatible with directives mode. */
2699 pfile->state.save_comments = 0;
2701 if (node == n_modules[spec_nodes::M_EXPORT][0])
2703 peek = _cpp_lex_direct (pfile);
2704 keyword = peek;
2705 backup++;
2706 if (keyword->type != CPP_NAME)
2707 goto not_module;
2708 node = keyword->val.node.node;
2709 if (!(node->flags & NODE_MODULE))
2710 goto not_module;
2713 if (node == n_modules[spec_nodes::M__IMPORT][0])
2714 /* __import */
2715 header_count = backup + 2 + 16;
2716 else if (node == n_modules[spec_nodes::M_IMPORT][0])
2717 /* import */
2718 header_count = backup + 2 + (CPP_OPTION (pfile, preprocessed) ? 16 : 0);
2719 else if (node == n_modules[spec_nodes::M_MODULE][0])
2720 ; /* module */
2721 else
2722 goto not_module;
2724 /* We've seen [export] {module|import|__import}. Check the next token. */
2725 if (header_count)
2726 /* After '{,__}import' a header name may appear. */
2727 pfile->state.angled_headers = true;
2728 peek = _cpp_lex_direct (pfile);
2729 backup++;
2731 /* ... import followed by identifier, ':', '<' or
2732 header-name preprocessing tokens, or module
2733 followed by cpp-identifier, ':' or ';' preprocessing
2734 tokens. C++ keywords are not yet relevant. */
2735 if (peek->type == CPP_NAME
2736 || peek->type == CPP_COLON
2737 || (header_count
2738 ? (peek->type == CPP_LESS
2739 || (peek->type == CPP_STRING && peek->val.str.text[0] != 'R')
2740 || peek->type == CPP_HEADER_NAME)
2741 : peek->type == CPP_SEMICOLON))
2743 pfile->state.pragma_allow_expansion = !CPP_OPTION (pfile, preprocessed);
2744 if (!pfile->state.pragma_allow_expansion)
2745 pfile->state.prevent_expansion++;
2747 if (!header_count && linemap_included_from
2748 (LINEMAPS_LAST_ORDINARY_MAP (pfile->line_table)))
2749 cpp_error_with_line (pfile, CPP_DL_ERROR, keyword->src_loc, 0,
2750 "module control-line cannot be in included file");
2752 /* The first one or two tokens cannot be macro names. */
2753 for (int ix = backup; ix--;)
2755 cpp_token *tok = ix ? keyword : result;
2756 cpp_hashnode *node = tok->val.node.node;
2758 /* Don't attempt to expand the token. */
2759 tok->flags |= NO_EXPAND;
2760 if (_cpp_defined_macro_p (node)
2761 && _cpp_maybe_notify_macro_use (pfile, node, tok->src_loc)
2762 && !cpp_fun_like_macro_p (node))
2763 cpp_error_with_line (pfile, CPP_DL_ERROR, tok->src_loc, 0,
2764 "module control-line \"%s\" cannot be"
2765 " an object-like macro",
2766 NODE_NAME (node));
2769 /* Map to underbar variants. */
2770 keyword->val.node.node = n_modules[header_count
2771 ? spec_nodes::M_IMPORT
2772 : spec_nodes::M_MODULE][1];
2773 if (backup != 1)
2774 result->val.node.node = n_modules[spec_nodes::M_EXPORT][1];
2776 /* Maybe tell the tokenizer we expect a header-name down the
2777 road. */
2778 pfile->state.directive_file_token = header_count;
2780 else
2782 not_module:
2783 /* Drop out of directive mode. */
2784 /* We aaserted save_comments had this value upon entry. */
2785 pfile->state.save_comments
2786 = !CPP_OPTION (pfile, discard_comments);
2787 pfile->state.in_deferred_pragma = false;
2788 /* Do not let this remain on. */
2789 pfile->state.angled_headers = false;
2792 /* In either case we want to backup the peeked tokens. */
2793 if (backup)
2795 /* If we saw EOL, we should drop it, because this isn't a module
2796 control-line after all. */
2797 bool eol = peek->type == CPP_PRAGMA_EOL;
2798 if (!eol || backup > 1)
2800 /* Put put the peeked tokens back */
2801 _cpp_backup_tokens_direct (pfile, backup);
2802 /* But if the last one was an EOL, forget it. */
2803 if (eol)
2804 pfile->lookaheads--;
2809 /* Lex a token into RESULT (external interface). Takes care of issues
2810 like directive handling, token lookahead, multiple include
2811 optimization and skipping. */
2812 const cpp_token *
2813 _cpp_lex_token (cpp_reader *pfile)
2815 cpp_token *result;
2817 for (;;)
2819 if (pfile->cur_token == pfile->cur_run->limit)
2821 pfile->cur_run = next_tokenrun (pfile->cur_run);
2822 pfile->cur_token = pfile->cur_run->base;
2824 /* We assume that the current token is somewhere in the current
2825 run. */
2826 if (pfile->cur_token < pfile->cur_run->base
2827 || pfile->cur_token >= pfile->cur_run->limit)
2828 abort ();
2830 if (pfile->lookaheads)
2832 pfile->lookaheads--;
2833 result = pfile->cur_token++;
2835 else
2836 result = _cpp_lex_direct (pfile);
2838 if (result->flags & BOL)
2840 /* Is this a directive. If _cpp_handle_directive returns
2841 false, it is an assembler #. */
2842 if (result->type == CPP_HASH
2843 /* 6.10.3 p 11: Directives in a list of macro arguments
2844 gives undefined behavior. This implementation
2845 handles the directive as normal. */
2846 && pfile->state.parsing_args != 1)
2848 if (_cpp_handle_directive (pfile, result->flags & PREV_WHITE))
2850 if (pfile->directive_result.type == CPP_PADDING)
2851 continue;
2852 result = &pfile->directive_result;
2855 else if (pfile->state.in_deferred_pragma)
2856 result = &pfile->directive_result;
2857 else if (result->type == CPP_NAME
2858 && (result->val.node.node->flags & NODE_MODULE)
2859 && !pfile->state.skipping
2860 /* Unlike regular directives, we do not deal with
2861 tokenizing module directives as macro arguments.
2862 That's not permitted. */
2863 && !pfile->state.parsing_args)
2865 /* P1857. Before macro expansion, At start of logical
2866 line ... */
2867 /* We don't have to consider lookaheads at this point. */
2868 gcc_checking_assert (!pfile->lookaheads);
2870 cpp_maybe_module_directive (pfile, result);
2873 if (pfile->cb.line_change && !pfile->state.skipping)
2874 pfile->cb.line_change (pfile, result, pfile->state.parsing_args);
2877 /* We don't skip tokens in directives. */
2878 if (pfile->state.in_directive || pfile->state.in_deferred_pragma)
2879 break;
2881 /* Outside a directive, invalidate controlling macros. At file
2882 EOF, _cpp_lex_direct takes care of popping the buffer, so we never
2883 get here and MI optimization works. */
2884 pfile->mi_valid = false;
2886 if (!pfile->state.skipping || result->type == CPP_EOF)
2887 break;
2890 return result;
2893 /* Returns true if a fresh line has been loaded. */
2894 bool
2895 _cpp_get_fresh_line (cpp_reader *pfile)
2897 /* We can't get a new line until we leave the current directive. */
2898 if (pfile->state.in_directive)
2899 return false;
2901 for (;;)
2903 cpp_buffer *buffer = pfile->buffer;
2905 if (!buffer->need_line)
2906 return true;
2908 if (buffer->next_line < buffer->rlimit)
2910 _cpp_clean_line (pfile);
2911 return true;
2914 /* First, get out of parsing arguments state. */
2915 if (pfile->state.parsing_args)
2916 return false;
2918 /* End of buffer. Non-empty files should end in a newline. */
2919 if (buffer->buf != buffer->rlimit
2920 && buffer->next_line > buffer->rlimit
2921 && !buffer->from_stage3)
2923 /* Clip to buffer size. */
2924 buffer->next_line = buffer->rlimit;
2927 if (buffer->prev && !buffer->return_at_eof)
2928 _cpp_pop_buffer (pfile);
2929 else
2931 /* End of translation. Do not pop the buffer yet. Increment
2932 line number so that the EOF token is on a line of its own
2933 (_cpp_lex_direct doesn't increment in that case, because
2934 it's hard for it to distinguish this special case). */
2935 CPP_INCREMENT_LINE (pfile, 0);
2936 return false;
/* Helper for _cpp_lex_direct; uses that function's `buffer' and
   `result' variables.  If the next input character is CHAR, consume it
   and give the token type THEN_TYPE; otherwise leave the input alone
   and give it type ELSE_TYPE.  */
#define IF_NEXT_IS(CHAR, THEN_TYPE, ELSE_TYPE)          \
  do                                                    \
    {                                                   \
      if (*buffer->cur == CHAR)                         \
        {                                               \
          buffer->cur++;                                \
          result->type = THEN_TYPE;                     \
        }                                               \
      else                                              \
        result->type = ELSE_TYPE;                       \
    }                                                   \
  while (0)
2950 /* Lex a token into pfile->cur_token, which is also incremented, to
2951 get diagnostics pointing to the correct location.
2953 Does not handle issues such as token lookahead, multiple-include
2954 optimization, directives, skipping etc. This function is only
2955 suitable for use by _cpp_lex_token, and in special cases like
2956 lex_expansion_token which doesn't care for any of these issues.
2958 When meeting a newline, returns CPP_EOF if parsing a directive,
2959 otherwise returns to the start of the token buffer if permissible.
2960 Returns the location of the lexed token. */
2961 cpp_token *
2962 _cpp_lex_direct (cpp_reader *pfile)
2964 cppchar_t c;
2965 cpp_buffer *buffer;
2966 const unsigned char *comment_start;
2967 bool fallthrough_comment = false;
2968 cpp_token *result = pfile->cur_token++;
2970 fresh_line:
2971 result->flags = 0;
2972 buffer = pfile->buffer;
2973 if (buffer->need_line)
2975 gcc_assert (!pfile->state.in_deferred_pragma);
2976 if (!_cpp_get_fresh_line (pfile))
2978 result->type = CPP_EOF;
2979 /* Not a real EOF in a directive or arg parsing -- we refuse
2980 to advance to the next file now, and will once we're out
2981 of those modes. */
2982 if (!pfile->state.in_directive && !pfile->state.parsing_args)
2984 /* Tell the compiler the line number of the EOF token. */
2985 result->src_loc = pfile->line_table->highest_line;
2986 result->flags = BOL;
2987 /* Now pop the buffer that _cpp_get_fresh_line did not. */
2988 _cpp_pop_buffer (pfile);
2990 return result;
2992 if (buffer != pfile->buffer)
2993 fallthrough_comment = false;
2994 if (!pfile->keep_tokens)
2996 pfile->cur_run = &pfile->base_run;
2997 result = pfile->base_run.base;
2998 pfile->cur_token = result + 1;
3000 result->flags = BOL;
3001 if (pfile->state.parsing_args == 2)
3002 result->flags |= PREV_WHITE;
3004 buffer = pfile->buffer;
3005 update_tokens_line:
3006 result->src_loc = pfile->line_table->highest_line;
3008 skipped_white:
3009 if (buffer->cur >= buffer->notes[buffer->cur_note].pos
3010 && !pfile->overlaid_buffer)
3012 _cpp_process_line_notes (pfile, false);
3013 result->src_loc = pfile->line_table->highest_line;
3015 c = *buffer->cur++;
3017 if (pfile->forced_token_location)
3018 result->src_loc = pfile->forced_token_location;
3019 else
3020 result->src_loc = linemap_position_for_column (pfile->line_table,
3021 CPP_BUF_COLUMN (buffer, buffer->cur));
3023 switch (c)
3025 case ' ': case '\t': case '\f': case '\v': case '\0':
3026 result->flags |= PREV_WHITE;
3027 skip_whitespace (pfile, c);
3028 goto skipped_white;
3030 case '\n':
3031 /* Increment the line, unless this is the last line ... */
3032 if (buffer->cur < buffer->rlimit
3033 /* ... or this is a #include, (where _cpp_stack_file needs to
3034 unwind by one line) ... */
3035 || (pfile->state.in_directive > 1
3036 /* ... except traditional-cpp increments this elsewhere. */
3037 && !CPP_OPTION (pfile, traditional)))
3038 CPP_INCREMENT_LINE (pfile, 0);
3039 buffer->need_line = true;
3040 if (pfile->state.in_deferred_pragma)
3042 /* Produce the PRAGMA_EOL on this line. File reading
3043 ensures there is always a \n at end of the buffer, thus
3044 in a deferred pragma we always see CPP_PRAGMA_EOL before
3045 any CPP_EOF. */
3046 result->type = CPP_PRAGMA_EOL;
3047 result->flags &= ~PREV_WHITE;
3048 pfile->state.in_deferred_pragma = false;
3049 if (!pfile->state.pragma_allow_expansion)
3050 pfile->state.prevent_expansion--;
3051 return result;
3053 goto fresh_line;
3055 case '0': case '1': case '2': case '3': case '4':
3056 case '5': case '6': case '7': case '8': case '9':
3058 struct normalize_state nst = INITIAL_NORMALIZE_STATE;
3059 result->type = CPP_NUMBER;
3060 lex_number (pfile, &result->val.str, &nst);
3061 warn_about_normalization (pfile, result, &nst);
3062 break;
3065 case 'L':
3066 case 'u':
3067 case 'U':
3068 case 'R':
3069 /* 'L', 'u', 'U', 'u8' or 'R' may introduce wide characters,
3070 wide strings or raw strings. */
3071 if (c == 'L' || CPP_OPTION (pfile, rliterals)
3072 || (c != 'R' && CPP_OPTION (pfile, uliterals)))
3074 if ((*buffer->cur == '\'' && c != 'R')
3075 || *buffer->cur == '"'
3076 || (*buffer->cur == 'R'
3077 && c != 'R'
3078 && buffer->cur[1] == '"'
3079 && CPP_OPTION (pfile, rliterals))
3080 || (*buffer->cur == '8'
3081 && c == 'u'
3082 && ((buffer->cur[1] == '"' || (buffer->cur[1] == '\''
3083 && CPP_OPTION (pfile, utf8_char_literals)))
3084 || (buffer->cur[1] == 'R' && buffer->cur[2] == '"'
3085 && CPP_OPTION (pfile, rliterals)))))
3087 lex_string (pfile, result, buffer->cur - 1);
3088 break;
3091 /* Fall through. */
3093 case '_':
3094 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
3095 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
3096 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
3097 case 's': case 't': case 'v': case 'w': case 'x':
3098 case 'y': case 'z':
3099 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
3100 case 'G': case 'H': case 'I': case 'J': case 'K':
3101 case 'M': case 'N': case 'O': case 'P': case 'Q':
3102 case 'S': case 'T': case 'V': case 'W': case 'X':
3103 case 'Y': case 'Z':
3104 result->type = CPP_NAME;
3106 struct normalize_state nst = INITIAL_NORMALIZE_STATE;
3107 result->val.node.node = lex_identifier (pfile, buffer->cur - 1, false,
3108 &nst,
3109 &result->val.node.spelling);
3110 warn_about_normalization (pfile, result, &nst);
3113 /* Convert named operators to their proper types. */
3114 if (result->val.node.node->flags & NODE_OPERATOR)
3116 result->flags |= NAMED_OP;
3117 result->type = (enum cpp_ttype) result->val.node.node->directive_index;
3120 /* Signal FALLTHROUGH comment followed by another token. */
3121 if (fallthrough_comment)
3122 result->flags |= PREV_FALLTHROUGH;
3123 break;
3125 case '\'':
3126 case '"':
3127 lex_string (pfile, result, buffer->cur - 1);
3128 break;
3130 case '/':
3131 /* A potential block or line comment. */
3132 comment_start = buffer->cur;
3133 c = *buffer->cur;
3135 if (c == '*')
3137 if (_cpp_skip_block_comment (pfile))
3138 cpp_error (pfile, CPP_DL_ERROR, "unterminated comment");
3140 else if (c == '/' && ! CPP_OPTION (pfile, traditional))
3142 /* Don't warn for system headers. */
3143 if (_cpp_in_system_header (pfile))
3145 /* Warn about comments if pedantically GNUC89, and not
3146 in system headers. */
3147 else if (CPP_OPTION (pfile, lang) == CLK_GNUC89
3148 && CPP_PEDANTIC (pfile)
3149 && ! buffer->warned_cplusplus_comments)
3151 if (cpp_error (pfile, CPP_DL_PEDWARN,
3152 "C++ style comments are not allowed in ISO C90"))
3153 cpp_error (pfile, CPP_DL_NOTE,
3154 "(this will be reported only once per input file)");
3155 buffer->warned_cplusplus_comments = 1;
3157 /* Or if specifically desired via -Wc90-c99-compat. */
3158 else if (CPP_OPTION (pfile, cpp_warn_c90_c99_compat) > 0
3159 && ! CPP_OPTION (pfile, cplusplus)
3160 && ! buffer->warned_cplusplus_comments)
3162 if (cpp_error (pfile, CPP_DL_WARNING,
3163 "C++ style comments are incompatible with C90"))
3164 cpp_error (pfile, CPP_DL_NOTE,
3165 "(this will be reported only once per input file)");
3166 buffer->warned_cplusplus_comments = 1;
3168 /* In C89/C94, C++ style comments are forbidden. */
3169 else if ((CPP_OPTION (pfile, lang) == CLK_STDC89
3170 || CPP_OPTION (pfile, lang) == CLK_STDC94))
3172 /* But don't be confused about valid code such as
3173 - // immediately followed by *,
3174 - // in a preprocessing directive,
3175 - // in an #if 0 block. */
3176 if (buffer->cur[1] == '*'
3177 || pfile->state.in_directive
3178 || pfile->state.skipping)
3180 result->type = CPP_DIV;
3181 break;
3183 else if (! buffer->warned_cplusplus_comments)
3185 if (cpp_error (pfile, CPP_DL_ERROR,
3186 "C++ style comments are not allowed in "
3187 "ISO C90"))
3188 cpp_error (pfile, CPP_DL_NOTE,
3189 "(this will be reported only once per input "
3190 "file)");
3191 buffer->warned_cplusplus_comments = 1;
3194 if (skip_line_comment (pfile) && CPP_OPTION (pfile, warn_comments))
3195 cpp_warning (pfile, CPP_W_COMMENTS, "multi-line comment");
3197 else if (c == '=')
3199 buffer->cur++;
3200 result->type = CPP_DIV_EQ;
3201 break;
3203 else
3205 result->type = CPP_DIV;
3206 break;
3209 if (fallthrough_comment_p (pfile, comment_start))
3210 fallthrough_comment = true;
3212 if (pfile->cb.comment)
3214 size_t len = pfile->buffer->cur - comment_start;
3215 pfile->cb.comment (pfile, result->src_loc, comment_start - 1,
3216 len + 1);
3219 if (!pfile->state.save_comments)
3221 result->flags |= PREV_WHITE;
3222 goto update_tokens_line;
3225 if (fallthrough_comment)
3226 result->flags |= PREV_FALLTHROUGH;
3228 /* Save the comment as a token in its own right. */
3229 save_comment (pfile, result, comment_start, c);
3230 break;
3232 case '<':
3233 if (pfile->state.angled_headers)
3235 lex_string (pfile, result, buffer->cur - 1);
3236 if (result->type != CPP_LESS)
3237 break;
3240 result->type = CPP_LESS;
3241 if (*buffer->cur == '=')
3243 buffer->cur++, result->type = CPP_LESS_EQ;
3244 if (*buffer->cur == '>'
3245 && CPP_OPTION (pfile, cplusplus)
3246 && CPP_OPTION (pfile, lang) >= CLK_GNUCXX20)
3247 buffer->cur++, result->type = CPP_SPACESHIP;
3249 else if (*buffer->cur == '<')
3251 buffer->cur++;
3252 IF_NEXT_IS ('=', CPP_LSHIFT_EQ, CPP_LSHIFT);
3254 else if (CPP_OPTION (pfile, digraphs))
3256 if (*buffer->cur == ':')
3258 /* C++11 [2.5/3 lex.pptoken], "Otherwise, if the next
3259 three characters are <:: and the subsequent character
3260 is neither : nor >, the < is treated as a preprocessor
3261 token by itself". */
3262 if (CPP_OPTION (pfile, cplusplus)
3263 && CPP_OPTION (pfile, lang) != CLK_CXX98
3264 && CPP_OPTION (pfile, lang) != CLK_GNUCXX
3265 && buffer->cur[1] == ':'
3266 && buffer->cur[2] != ':' && buffer->cur[2] != '>')
3267 break;
3269 buffer->cur++;
3270 result->flags |= DIGRAPH;
3271 result->type = CPP_OPEN_SQUARE;
3273 else if (*buffer->cur == '%')
3275 buffer->cur++;
3276 result->flags |= DIGRAPH;
3277 result->type = CPP_OPEN_BRACE;
3280 break;
3282 case '>':
3283 result->type = CPP_GREATER;
3284 if (*buffer->cur == '=')
3285 buffer->cur++, result->type = CPP_GREATER_EQ;
3286 else if (*buffer->cur == '>')
3288 buffer->cur++;
3289 IF_NEXT_IS ('=', CPP_RSHIFT_EQ, CPP_RSHIFT);
3291 break;
3293 case '%':
3294 result->type = CPP_MOD;
3295 if (*buffer->cur == '=')
3296 buffer->cur++, result->type = CPP_MOD_EQ;
3297 else if (CPP_OPTION (pfile, digraphs))
3299 if (*buffer->cur == ':')
3301 buffer->cur++;
3302 result->flags |= DIGRAPH;
3303 result->type = CPP_HASH;
3304 if (*buffer->cur == '%' && buffer->cur[1] == ':')
3305 buffer->cur += 2, result->type = CPP_PASTE, result->val.token_no = 0;
3307 else if (*buffer->cur == '>')
3309 buffer->cur++;
3310 result->flags |= DIGRAPH;
3311 result->type = CPP_CLOSE_BRACE;
3314 break;
3316 case '.':
3317 result->type = CPP_DOT;
3318 if (ISDIGIT (*buffer->cur))
3320 struct normalize_state nst = INITIAL_NORMALIZE_STATE;
3321 result->type = CPP_NUMBER;
3322 lex_number (pfile, &result->val.str, &nst);
3323 warn_about_normalization (pfile, result, &nst);
3325 else if (*buffer->cur == '.' && buffer->cur[1] == '.')
3326 buffer->cur += 2, result->type = CPP_ELLIPSIS;
3327 else if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
3328 buffer->cur++, result->type = CPP_DOT_STAR;
3329 break;
3331 case '+':
3332 result->type = CPP_PLUS;
3333 if (*buffer->cur == '+')
3334 buffer->cur++, result->type = CPP_PLUS_PLUS;
3335 else if (*buffer->cur == '=')
3336 buffer->cur++, result->type = CPP_PLUS_EQ;
3337 break;
3339 case '-':
3340 result->type = CPP_MINUS;
3341 if (*buffer->cur == '>')
3343 buffer->cur++;
3344 result->type = CPP_DEREF;
3345 if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
3346 buffer->cur++, result->type = CPP_DEREF_STAR;
3348 else if (*buffer->cur == '-')
3349 buffer->cur++, result->type = CPP_MINUS_MINUS;
3350 else if (*buffer->cur == '=')
3351 buffer->cur++, result->type = CPP_MINUS_EQ;
3352 break;
3354 case '&':
3355 result->type = CPP_AND;
3356 if (*buffer->cur == '&')
3357 buffer->cur++, result->type = CPP_AND_AND;
3358 else if (*buffer->cur == '=')
3359 buffer->cur++, result->type = CPP_AND_EQ;
3360 break;
3362 case '|':
3363 result->type = CPP_OR;
3364 if (*buffer->cur == '|')
3365 buffer->cur++, result->type = CPP_OR_OR;
3366 else if (*buffer->cur == '=')
3367 buffer->cur++, result->type = CPP_OR_EQ;
3368 break;
3370 case ':':
3371 result->type = CPP_COLON;
3372 if (*buffer->cur == ':' && CPP_OPTION (pfile, scope))
3373 buffer->cur++, result->type = CPP_SCOPE;
3374 else if (*buffer->cur == '>' && CPP_OPTION (pfile, digraphs))
3376 buffer->cur++;
3377 result->flags |= DIGRAPH;
3378 result->type = CPP_CLOSE_SQUARE;
3380 break;
3382 case '*': IF_NEXT_IS ('=', CPP_MULT_EQ, CPP_MULT); break;
3383 case '=': IF_NEXT_IS ('=', CPP_EQ_EQ, CPP_EQ); break;
3384 case '!': IF_NEXT_IS ('=', CPP_NOT_EQ, CPP_NOT); break;
3385 case '^': IF_NEXT_IS ('=', CPP_XOR_EQ, CPP_XOR); break;
3386 case '#': IF_NEXT_IS ('#', CPP_PASTE, CPP_HASH); result->val.token_no = 0; break;
3388 case '?': result->type = CPP_QUERY; break;
3389 case '~': result->type = CPP_COMPL; break;
3390 case ',': result->type = CPP_COMMA; break;
3391 case '(': result->type = CPP_OPEN_PAREN; break;
3392 case ')': result->type = CPP_CLOSE_PAREN; break;
3393 case '[': result->type = CPP_OPEN_SQUARE; break;
3394 case ']': result->type = CPP_CLOSE_SQUARE; break;
3395 case '{': result->type = CPP_OPEN_BRACE; break;
3396 case '}': result->type = CPP_CLOSE_BRACE; break;
3397 case ';': result->type = CPP_SEMICOLON; break;
3399 /* @ is a punctuator in Objective-C. */
3400 case '@': result->type = CPP_ATSIGN; break;
3402 default:
3404 const uchar *base = --buffer->cur;
3406 /* Check for an extended identifier ($ or UCN or UTF-8). */
3407 struct normalize_state nst = INITIAL_NORMALIZE_STATE;
3408 if (forms_identifier_p (pfile, true, &nst))
3410 result->type = CPP_NAME;
3411 result->val.node.node = lex_identifier (pfile, base, true, &nst,
3412 &result->val.node.spelling);
3413 warn_about_normalization (pfile, result, &nst);
3414 break;
3417 /* Otherwise this will form a CPP_OTHER token. Parse valid UTF-8 as a
3418 single token. */
3419 buffer->cur++;
3420 if (c >= utf8_signifier)
3422 const uchar *pstr = base;
3423 cppchar_t s;
3424 if (_cpp_valid_utf8 (pfile, &pstr, buffer->rlimit, 0, NULL, &s))
3425 buffer->cur = pstr;
3427 create_literal (pfile, result, base, buffer->cur - base, CPP_OTHER);
3428 break;
3433 /* Potentially convert the location of the token to a range. */
3434 if (result->src_loc >= RESERVED_LOCATION_COUNT
3435 && result->type != CPP_EOF)
3437 /* Ensure that any line notes are processed, so that we have the
3438 correct physical line/column for the end-point of the token even
3439 when a logical line is split via one or more backslashes. */
3440 if (buffer->cur >= buffer->notes[buffer->cur_note].pos
3441 && !pfile->overlaid_buffer)
3442 _cpp_process_line_notes (pfile, false);
3444 source_range tok_range;
3445 tok_range.m_start = result->src_loc;
3446 tok_range.m_finish
3447 = linemap_position_for_column (pfile->line_table,
3448 CPP_BUF_COLUMN (buffer, buffer->cur));
3450 result->src_loc = COMBINE_LOCATION_DATA (pfile->line_table,
3451 result->src_loc,
3452 tok_range, NULL);
3455 return result;
3458 /* An upper bound on the number of bytes needed to spell TOKEN.
3459 Does not include preceding whitespace. */
3460 unsigned int
3461 cpp_token_len (const cpp_token *token)
3463 unsigned int len;
3465 switch (TOKEN_SPELL (token))
3467 default: len = 6; break;
3468 case SPELL_LITERAL: len = token->val.str.len; break;
3469 case SPELL_IDENT: len = NODE_LEN (token->val.node.node) * 10; break;
3472 return len;
/* Decode the UTF-8 sequence starting at NAME and store the matching
   \UXXXXXXXX escape in BUFFER.  Exactly 10 bytes are written to BUFFER
   and no terminating NUL is added.  Returns the number of bytes
   consumed from NAME.  Calls abort () if a continuation byte is
   ill-formed.  */

static size_t
utf8_to_ucn (unsigned char *buffer, const unsigned char *name)
{
  static const char hex_digits[] = "0123456789abcdef";
  unsigned long code_point;
  unsigned lead;
  int seq_len = 0;
  int idx;
  int shift;

  /* The number of leading one bits in the first byte gives the length
     of the whole sequence.  */
  for (lead = *name; (lead & 0x80) != 0; lead <<= 1)
    seq_len++;

  /* Payload bits of the lead byte, then six bits from each
     continuation byte.  */
  code_point = *name & (0x7F >> seq_len);
  for (idx = 1; idx < seq_len; idx++)
    {
      name++;
      code_point = (code_point << 6) | (*name & 0x3F);

      /* Every continuation byte must look like 10xxxxxx.  */
      if ((*name & 0xC0) != 0x80)
        abort ();
    }

  buffer[0] = '\\';
  buffer[1] = 'U';
  buffer += 2;

  /* Eight lowercase hex digits, most significant nibble first.  */
  for (shift = 28; shift >= 0; shift -= 4)
    *buffer++ = hex_digits[(code_point >> shift) & 0xF];

  return seq_len;
}
3509 /* Given a token TYPE corresponding to a digraph, return a pointer to
3510 the spelling of the digraph. */
3511 static const unsigned char *
3512 cpp_digraph2name (enum cpp_ttype type)
3514 return digraph_spellings[(int) type - (int) CPP_FIRST_DIGRAPH];
3517 /* Write the spelling of an identifier IDENT, using UCNs, to BUFFER.
3518 The buffer must already contain the enough space to hold the
3519 token's spelling. Returns a pointer to the character after the
3520 last character written. */
3521 unsigned char *
3522 _cpp_spell_ident_ucns (unsigned char *buffer, cpp_hashnode *ident)
3524 size_t i;
3525 const unsigned char *name = NODE_NAME (ident);
3527 for (i = 0; i < NODE_LEN (ident); i++)
3528 if (name[i] & ~0x7F)
3530 i += utf8_to_ucn (buffer, name + i) - 1;
3531 buffer += 10;
3533 else
3534 *buffer++ = name[i];
3536 return buffer;
3539 /* Write the spelling of a token TOKEN to BUFFER. The buffer must
3540 already contain the enough space to hold the token's spelling.
3541 Returns a pointer to the character after the last character written.
3542 FORSTRING is true if this is to be the spelling after translation
3543 phase 1 (with the original spelling of extended identifiers), false
3544 if extended identifiers should always be written using UCNs (there is
3545 no option for always writing them in the internal UTF-8 form).
3546 FIXME: Would be nice if we didn't need the PFILE argument. */
3547 unsigned char *
3548 cpp_spell_token (cpp_reader *pfile, const cpp_token *token,
3549 unsigned char *buffer, bool forstring)
3551 switch (TOKEN_SPELL (token))
3553 case SPELL_OPERATOR:
3555 const unsigned char *spelling;
3556 unsigned char c;
3558 if (token->flags & DIGRAPH)
3559 spelling = cpp_digraph2name (token->type);
3560 else if (token->flags & NAMED_OP)
3561 goto spell_ident;
3562 else
3563 spelling = TOKEN_NAME (token);
3565 while ((c = *spelling++) != '\0')
3566 *buffer++ = c;
3568 break;
3570 spell_ident:
3571 case SPELL_IDENT:
3572 if (forstring)
3574 memcpy (buffer, NODE_NAME (token->val.node.spelling),
3575 NODE_LEN (token->val.node.spelling));
3576 buffer += NODE_LEN (token->val.node.spelling);
3578 else
3579 buffer = _cpp_spell_ident_ucns (buffer, token->val.node.node);
3580 break;
3582 case SPELL_LITERAL:
3583 memcpy (buffer, token->val.str.text, token->val.str.len);
3584 buffer += token->val.str.len;
3585 break;
3587 case SPELL_NONE:
3588 cpp_error (pfile, CPP_DL_ICE,
3589 "unspellable token %s", TOKEN_NAME (token));
3590 break;
3593 return buffer;
3596 /* Returns TOKEN spelt as a null-terminated string. The string is
3597 freed when the reader is destroyed. Useful for diagnostics. */
3598 unsigned char *
3599 cpp_token_as_text (cpp_reader *pfile, const cpp_token *token)
3601 unsigned int len = cpp_token_len (token) + 1;
3602 unsigned char *start = _cpp_unaligned_alloc (pfile, len), *end;
3604 end = cpp_spell_token (pfile, token, start, false);
3605 end[0] = '\0';
3607 return start;
3610 /* Returns a pointer to a string which spells the token defined by
3611 TYPE and FLAGS. Used by C front ends, which really should move to
3612 using cpp_token_as_text. */
3613 const char *
3614 cpp_type2name (enum cpp_ttype type, unsigned char flags)
3616 if (flags & DIGRAPH)
3617 return (const char *) cpp_digraph2name (type);
3618 else if (flags & NAMED_OP)
3619 return cpp_named_operator2name (type);
3621 return (const char *) token_spellings[type].name;
3624 /* Writes the spelling of token to FP, without any preceding space.
3625 Separated from cpp_spell_token for efficiency - to avoid stdio
3626 double-buffering. */
3627 void
3628 cpp_output_token (const cpp_token *token, FILE *fp)
3630 switch (TOKEN_SPELL (token))
3632 case SPELL_OPERATOR:
3634 const unsigned char *spelling;
3635 int c;
3637 if (token->flags & DIGRAPH)
3638 spelling = cpp_digraph2name (token->type);
3639 else if (token->flags & NAMED_OP)
3640 goto spell_ident;
3641 else
3642 spelling = TOKEN_NAME (token);
3644 c = *spelling;
3646 putc (c, fp);
3647 while ((c = *++spelling) != '\0');
3649 break;
3651 spell_ident:
3652 case SPELL_IDENT:
3654 size_t i;
3655 const unsigned char * name = NODE_NAME (token->val.node.node);
3657 for (i = 0; i < NODE_LEN (token->val.node.node); i++)
3658 if (name[i] & ~0x7F)
3660 unsigned char buffer[10];
3661 i += utf8_to_ucn (buffer, name + i) - 1;
3662 fwrite (buffer, 1, 10, fp);
3664 else
3665 fputc (NODE_NAME (token->val.node.node)[i], fp);
3667 break;
3669 case SPELL_LITERAL:
3670 if (token->type == CPP_HEADER_NAME)
3671 fputc ('"', fp);
3672 fwrite (token->val.str.text, 1, token->val.str.len, fp);
3673 if (token->type == CPP_HEADER_NAME)
3674 fputc ('"', fp);
3675 break;
3677 case SPELL_NONE:
3678 /* An error, most probably. */
3679 break;
3683 /* Compare two tokens. */
3685 _cpp_equiv_tokens (const cpp_token *a, const cpp_token *b)
3687 if (a->type == b->type && a->flags == b->flags)
3688 switch (TOKEN_SPELL (a))
3690 default: /* Keep compiler happy. */
3691 case SPELL_OPERATOR:
3692 /* token_no is used to track where multiple consecutive ##
3693 tokens were originally located. */
3694 return (a->type != CPP_PASTE || a->val.token_no == b->val.token_no);
3695 case SPELL_NONE:
3696 return (a->type != CPP_MACRO_ARG
3697 || (a->val.macro_arg.arg_no == b->val.macro_arg.arg_no
3698 && a->val.macro_arg.spelling == b->val.macro_arg.spelling));
3699 case SPELL_IDENT:
3700 return (a->val.node.node == b->val.node.node
3701 && a->val.node.spelling == b->val.node.spelling);
3702 case SPELL_LITERAL:
3703 return (a->val.str.len == b->val.str.len
3704 && !memcmp (a->val.str.text, b->val.str.text,
3705 a->val.str.len));
3708 return 0;
3711 /* Returns nonzero if a space should be inserted to avoid an
3712 accidental token paste for output. For simplicity, it is
3713 conservative, and occasionally advises a space where one is not
3714 needed, e.g. "." and ".2". */
3716 cpp_avoid_paste (cpp_reader *pfile, const cpp_token *token1,
3717 const cpp_token *token2)
3719 enum cpp_ttype a = token1->type, b = token2->type;
3720 cppchar_t c;
3722 if (token1->flags & NAMED_OP)
3723 a = CPP_NAME;
3724 if (token2->flags & NAMED_OP)
3725 b = CPP_NAME;
3727 c = EOF;
3728 if (token2->flags & DIGRAPH)
3729 c = digraph_spellings[(int) b - (int) CPP_FIRST_DIGRAPH][0];
3730 else if (token_spellings[b].category == SPELL_OPERATOR)
3731 c = token_spellings[b].name[0];
3733 /* Quickly get everything that can paste with an '='. */
3734 if ((int) a <= (int) CPP_LAST_EQ && c == '=')
3735 return 1;
3737 switch (a)
3739 case CPP_GREATER: return c == '>';
3740 case CPP_LESS: return c == '<' || c == '%' || c == ':';
3741 case CPP_PLUS: return c == '+';
3742 case CPP_MINUS: return c == '-' || c == '>';
3743 case CPP_DIV: return c == '/' || c == '*'; /* Comments. */
3744 case CPP_MOD: return c == ':' || c == '>';
3745 case CPP_AND: return c == '&';
3746 case CPP_OR: return c == '|';
3747 case CPP_COLON: return c == ':' || c == '>';
3748 case CPP_DEREF: return c == '*';
3749 case CPP_DOT: return c == '.' || c == '%' || b == CPP_NUMBER;
3750 case CPP_HASH: return c == '#' || c == '%'; /* Digraph form. */
3751 case CPP_PRAGMA:
3752 case CPP_NAME: return ((b == CPP_NUMBER
3753 && name_p (pfile, &token2->val.str))
3754 || b == CPP_NAME
3755 || b == CPP_CHAR || b == CPP_STRING); /* L */
3756 case CPP_NUMBER: return (b == CPP_NUMBER || b == CPP_NAME
3757 || b == CPP_CHAR
3758 || c == '.' || c == '+' || c == '-');
3759 /* UCNs */
3760 case CPP_OTHER: return ((token1->val.str.text[0] == '\\'
3761 && b == CPP_NAME)
3762 || (CPP_OPTION (pfile, objc)
3763 && token1->val.str.text[0] == '@'
3764 && (b == CPP_NAME || b == CPP_STRING)));
3765 case CPP_LESS_EQ: return c == '>';
3766 case CPP_STRING:
3767 case CPP_WSTRING:
3768 case CPP_UTF8STRING:
3769 case CPP_STRING16:
3770 case CPP_STRING32: return (CPP_OPTION (pfile, user_literals)
3771 && (b == CPP_NAME
3772 || (TOKEN_SPELL (token2) == SPELL_LITERAL
3773 && ISIDST (token2->val.str.text[0]))));
3775 default: break;
3778 return 0;
3781 /* Output all the remaining tokens on the current line, and a newline
3782 character, to FP. Leading whitespace is removed. If there are
3783 macros, special token padding is not performed. */
3784 void
3785 cpp_output_line (cpp_reader *pfile, FILE *fp)
3787 const cpp_token *token;
3789 token = cpp_get_token (pfile);
3790 while (token->type != CPP_EOF)
3792 cpp_output_token (token, fp);
3793 token = cpp_get_token (pfile);
3794 if (token->flags & PREV_WHITE)
3795 putc (' ', fp);
3798 putc ('\n', fp);
3801 /* Return a string representation of all the remaining tokens on the
3802 current line. The result is allocated using xmalloc and must be
3803 freed by the caller. */
3804 unsigned char *
3805 cpp_output_line_to_string (cpp_reader *pfile, const unsigned char *dir_name)
3807 const cpp_token *token;
3808 unsigned int out = dir_name ? ustrlen (dir_name) : 0;
3809 unsigned int alloced = 120 + out;
3810 unsigned char *result = (unsigned char *) xmalloc (alloced);
3812 /* If DIR_NAME is empty, there are no initial contents. */
3813 if (dir_name)
3815 sprintf ((char *) result, "#%s ", dir_name);
3816 out += 2;
3819 token = cpp_get_token (pfile);
3820 while (token->type != CPP_EOF)
3822 unsigned char *last;
3823 /* Include room for a possible space and the terminating nul. */
3824 unsigned int len = cpp_token_len (token) + 2;
3826 if (out + len > alloced)
3828 alloced *= 2;
3829 if (out + len > alloced)
3830 alloced = out + len;
3831 result = (unsigned char *) xrealloc (result, alloced);
3834 last = cpp_spell_token (pfile, token, &result[out], 0);
3835 out = last - result;
3837 token = cpp_get_token (pfile);
3838 if (token->flags & PREV_WHITE)
3839 result[out++] = ' ';
3842 result[out] = '\0';
3843 return result;
/* Memory buffers.  Changing these three constants can have a dramatic
   effect on performance.  The values here are reasonable defaults,
   but might be tuned.  If you adjust them, be sure to test across a
   range of uses of cpplib, including heavy nested function-like macro
   expansion.  Also check the change in peak memory usage (NJAMD is a
   good tool for this).

   MIN_BUFF_SIZE is the smallest buffer new_buff will create.
   BUFF_SIZE_UPPER_BOUND (MIN_SIZE) is the largest free buffer that
   _cpp_get_buff will hand out for a request of MIN_SIZE bytes.
   EXTENDED_BUFF_SIZE (BUFF, MIN_EXTRA) is the size requested when
   extending BUFF: MIN_EXTRA plus twice the bytes between BUFF's cur
   and limit.  Every macro argument is parenthesized so that compound
   expressions can be passed safely.  */
#define MIN_BUFF_SIZE 8000
#define BUFF_SIZE_UPPER_BOUND(MIN_SIZE) (MIN_BUFF_SIZE + (MIN_SIZE) * 3 / 2)
#define EXTENDED_BUFF_SIZE(BUFF, MIN_EXTRA) \
  ((MIN_EXTRA) + ((BUFF)->limit - (BUFF)->cur) * 2)

#if MIN_BUFF_SIZE > BUFF_SIZE_UPPER_BOUND (0)
#error BUFF_SIZE_UPPER_BOUND must be at least as large as MIN_BUFF_SIZE!
#endif
3861 /* Create a new allocation buffer. Place the control block at the end
3862 of the buffer, so that buffer overflows will cause immediate chaos. */
3863 static _cpp_buff *
3864 new_buff (size_t len)
3866 _cpp_buff *result;
3867 unsigned char *base;
3869 if (len < MIN_BUFF_SIZE)
3870 len = MIN_BUFF_SIZE;
3871 len = CPP_ALIGN (len);
3873 #ifdef ENABLE_VALGRIND_ANNOTATIONS
3874 /* Valgrind warns about uses of interior pointers, so put _cpp_buff
3875 struct first. */
3876 size_t slen = CPP_ALIGN2 (sizeof (_cpp_buff), 2 * DEFAULT_ALIGNMENT);
3877 base = XNEWVEC (unsigned char, len + slen);
3878 result = (_cpp_buff *) base;
3879 base += slen;
3880 #else
3881 base = XNEWVEC (unsigned char, len + sizeof (_cpp_buff));
3882 result = (_cpp_buff *) (base + len);
3883 #endif
3884 result->base = base;
3885 result->cur = base;
3886 result->limit = base + len;
3887 result->next = NULL;
3888 return result;
3891 /* Place a chain of unwanted allocation buffers on the free list. */
3892 void
3893 _cpp_release_buff (cpp_reader *pfile, _cpp_buff *buff)
3895 _cpp_buff *end = buff;
3897 while (end->next)
3898 end = end->next;
3899 end->next = pfile->free_buffs;
3900 pfile->free_buffs = buff;
3903 /* Return a free buffer of size at least MIN_SIZE. */
3904 _cpp_buff *
3905 _cpp_get_buff (cpp_reader *pfile, size_t min_size)
3907 _cpp_buff *result, **p;
3909 for (p = &pfile->free_buffs;; p = &(*p)->next)
3911 size_t size;
3913 if (*p == NULL)
3914 return new_buff (min_size);
3915 result = *p;
3916 size = result->limit - result->base;
3917 /* Return a buffer that's big enough, but don't waste one that's
3918 way too big. */
3919 if (size >= min_size && size <= BUFF_SIZE_UPPER_BOUND (min_size))
3920 break;
3923 *p = result->next;
3924 result->next = NULL;
3925 result->cur = result->base;
3926 return result;
3929 /* Creates a new buffer with enough space to hold the uncommitted
3930 remaining bytes of BUFF, and at least MIN_EXTRA more bytes. Copies
3931 the excess bytes to the new buffer. Chains the new buffer after
3932 BUFF, and returns the new buffer. */
3933 _cpp_buff *
3934 _cpp_append_extend_buff (cpp_reader *pfile, _cpp_buff *buff, size_t min_extra)
3936 size_t size = EXTENDED_BUFF_SIZE (buff, min_extra);
3937 _cpp_buff *new_buff = _cpp_get_buff (pfile, size);
3939 buff->next = new_buff;
3940 memcpy (new_buff->base, buff->cur, BUFF_ROOM (buff));
3941 return new_buff;
3944 /* Creates a new buffer with enough space to hold the uncommitted
3945 remaining bytes of the buffer pointed to by BUFF, and at least
3946 MIN_EXTRA more bytes. Copies the excess bytes to the new buffer.
3947 Chains the new buffer before the buffer pointed to by BUFF, and
3948 updates the pointer to point to the new buffer. */
3949 void
3950 _cpp_extend_buff (cpp_reader *pfile, _cpp_buff **pbuff, size_t min_extra)
3952 _cpp_buff *new_buff, *old_buff = *pbuff;
3953 size_t size = EXTENDED_BUFF_SIZE (old_buff, min_extra);
3955 new_buff = _cpp_get_buff (pfile, size);
3956 memcpy (new_buff->base, old_buff->cur, BUFF_ROOM (old_buff));
3957 new_buff->next = old_buff;
3958 *pbuff = new_buff;
3961 /* Free a chain of buffers starting at BUFF. */
3962 void
3963 _cpp_free_buff (_cpp_buff *buff)
3965 _cpp_buff *next;
3967 for (; buff; buff = next)
3969 next = buff->next;
3970 #ifdef ENABLE_VALGRIND_ANNOTATIONS
3971 free (buff);
3972 #else
3973 free (buff->base);
3974 #endif
3978 /* Allocate permanent, unaligned storage of length LEN. */
3979 unsigned char *
3980 _cpp_unaligned_alloc (cpp_reader *pfile, size_t len)
3982 _cpp_buff *buff = pfile->u_buff;
3983 unsigned char *result = buff->cur;
3985 if (len > (size_t) (buff->limit - result))
3987 buff = _cpp_get_buff (pfile, len);
3988 buff->next = pfile->u_buff;
3989 pfile->u_buff = buff;
3990 result = buff->cur;
3993 buff->cur = result + len;
3994 return result;
3997 /* Allocate permanent, unaligned storage of length LEN from a_buff.
3998 That buffer is used for growing allocations when saving macro
3999 replacement lists in a #define, and when parsing an answer to an
4000 assertion in #assert, #unassert or #if (and therefore possibly
4001 whilst expanding macros). It therefore must not be used by any
4002 code that they might call: specifically the lexer and the guts of
4003 the macro expander.
4005 All existing other uses clearly fit this restriction: storing
4006 registered pragmas during initialization. */
4007 unsigned char *
4008 _cpp_aligned_alloc (cpp_reader *pfile, size_t len)
4010 _cpp_buff *buff = pfile->a_buff;
4011 unsigned char *result = buff->cur;
4013 if (len > (size_t) (buff->limit - result))
4015 buff = _cpp_get_buff (pfile, len);
4016 buff->next = pfile->a_buff;
4017 pfile->a_buff = buff;
4018 result = buff->cur;
4021 buff->cur = result + len;
4022 return result;
4025 /* Commit or allocate storage from a buffer. */
4027 void *
4028 _cpp_commit_buff (cpp_reader *pfile, size_t size)
4030 void *ptr = BUFF_FRONT (pfile->a_buff);
4032 if (pfile->hash_table->alloc_subobject)
4034 void *copy = pfile->hash_table->alloc_subobject (size);
4035 memcpy (copy, ptr, size);
4036 ptr = copy;
4038 else
4039 BUFF_FRONT (pfile->a_buff) += size;
4041 return ptr;
4044 /* Say which field of TOK is in use. */
4046 enum cpp_token_fld_kind
4047 cpp_token_val_index (const cpp_token *tok)
4049 switch (TOKEN_SPELL (tok))
4051 case SPELL_IDENT:
4052 return CPP_TOKEN_FLD_NODE;
4053 case SPELL_LITERAL:
4054 return CPP_TOKEN_FLD_STR;
4055 case SPELL_OPERATOR:
4056 /* Operands which were originally spelled as ident keep around
4057 the node for the exact spelling. */
4058 if (tok->flags & NAMED_OP)
4059 return CPP_TOKEN_FLD_NODE;
4060 else if (tok->type == CPP_PASTE)
4061 return CPP_TOKEN_FLD_TOKEN_NO;
4062 else
4063 return CPP_TOKEN_FLD_NONE;
4064 case SPELL_NONE:
4065 if (tok->type == CPP_MACRO_ARG)
4066 return CPP_TOKEN_FLD_ARG_NO;
4067 else if (tok->type == CPP_PADDING)
4068 return CPP_TOKEN_FLD_SOURCE;
4069 else if (tok->type == CPP_PRAGMA)
4070 return CPP_TOKEN_FLD_PRAGMA;
4071 /* fall through */
4072 default:
4073 return CPP_TOKEN_FLD_NONE;
4077 /* All tokens lexed in R after calling this function will be forced to
4078 have their location_t to be P, until
4079 cpp_stop_forcing_token_locations is called for R. */
4081 void
4082 cpp_force_token_locations (cpp_reader *r, location_t loc)
4084 r->forced_token_location = loc;
4087 /* Go back to assigning locations naturally for lexed tokens. */
4089 void
4090 cpp_stop_forcing_token_locations (cpp_reader *r)
4092 r->forced_token_location = 0;
/* PEEK points at a backslash.  If it escapes an end of line ("\n",
   "\r" or "\r\n"), look past it, and keep going while the character
   found there is another backslash.  Never advance to or beyond
   LIMIT.  Returns the resulting position, which is PEEK itself when
   nothing was skipped.  */

static const unsigned char *
do_peek_backslash (const unsigned char *peek, const unsigned char *limit)
{
  for (;;)
    {
      size_t skip;

      /* Length of the backslash plus the line ending it escapes.  */
      if (__builtin_expect (peek[1] == '\n', true))
        skip = 2;
      else if (__builtin_expect (peek[1] == '\r', false))
        skip = peek[2] == '\n' ? 3 : 2;
      else
        return peek;

      if (!__builtin_expect (peek + skip < limit, true))
        return peek;

      peek += skip;

      /* The user might be perverse.  */
      if (*peek != '\\')
        return peek;
    }
}
/* Return the position to examine at PEEK: if PEEK points at a
   backslash, the position found by do_peek_backslash, otherwise PEEK
   itself.  LIMIT is passed through to do_peek_backslash.  */

static const unsigned char *
do_peek_next (const unsigned char *peek, const unsigned char *limit)
{
  return (__builtin_expect (*peek == '\\', false)
          ? do_peek_backslash (peek, limit)
          : peek);
}
/* Step backwards from PEEK to the preceding character, never going
   below BOUND.  Returns NULL if PEEK is already at BOUND.  If the
   preceding character is a line ending ("\n", "\r" or "\r\n") that is
   itself preceded by a backslash, the escaped line ending is skipped
   and the search continues backwards from the backslash.  Otherwise
   returns a pointer to the preceding character.  */

static const unsigned char *
do_peek_prev (const unsigned char *peek, const unsigned char *bound)
{
  if (peek == bound)
    return NULL;

  unsigned char c = *--peek;

  /* Both '\n' and '\r' can end a line.  */
  if (__builtin_expect (c == '\n', false)
      || __builtin_expect (c == '\r', false))
    {
      if (peek == bound)
        return peek;

      /* IX is the offset from PEEK of the character before the line
         ending; a "\r\n" pair counts as one line ending.  */
      int ix = -1;
      if (c == '\n' && peek[ix] == '\r')
        {
          if (peek + ix == bound)
            return peek;
          ix--;
        }

      /* An escaped line ending is invisible: continue from the
         backslash.  */
      if (peek[ix] == '\\')
        return do_peek_prev (peek + ix, bound);

      return peek;
    }
  else
    return peek;
}
/* If PEEK[-1] is identifier MATCH, scan past it and trailing white
   space.  Otherwise return NULL.  The first character of MATCH is
   taken as already matched at PEEK[-1]; comparison starts with the
   second character of MATCH at PEEK.  LIMIT is passed to the peeking
   helpers.  */

static const unsigned char *
do_peek_ident (const char *match, const unsigned char *peek,
               const unsigned char *limit)
{
  while (*++match)
    {
      if (*peek != *match)
        {
          /* Retry after looking past a possible escaped newline.  */
          peek = do_peek_next (peek, limit);
          if (*peek != *match)
            return NULL;
        }
      peek++;
    }

  /* Must now not be looking at an identifier char.  */
  peek = do_peek_next (peek, limit);
  if (ISIDNUM (*peek))
    return NULL;

  /* Skip control-line whitespace.  */
  for (;;)
    {
      while (*peek == ' ' || *peek == '\t')
        peek++;

      if (!__builtin_expect (*peek == '\\', false))
        break;

      /* Stop if the backslash could not be looked past.  */
      peek = do_peek_backslash (peek, limit);
      if (*peek == '\\')
        break;
    }

  return peek;
}
4196 /* Are we looking at a module control line starting as PEEK - 1? */
4198 static bool
4199 do_peek_module (cpp_reader *pfile, unsigned char c,
4200 const unsigned char *peek, const unsigned char *limit)
4202 bool import = false;
4204 if (__builtin_expect (c == 'e', false))
4206 if (!((peek[0] == 'x' || peek[0] == '\\')
4207 && (peek = do_peek_ident ("export", peek, limit))))
4208 return false;
4210 /* export, peek for import or module. No need to peek __import
4211 here. */
4212 if (peek[0] == 'i')
4214 if (!((peek[1] == 'm' || peek[1] == '\\')
4215 && (peek = do_peek_ident ("import", peek + 1, limit))))
4216 return false;
4217 import = true;
4219 else if (peek[0] == 'm')
4221 if (!((peek[1] == 'o' || peek[1] == '\\')
4222 && (peek = do_peek_ident ("module", peek + 1, limit))))
4223 return false;
4225 else
4226 return false;
4228 else if (__builtin_expect (c == 'i', false))
4230 if (!((peek[0] == 'm' || peek[0] == '\\')
4231 && (peek = do_peek_ident ("import", peek, limit))))
4232 return false;
4233 import = true;
4235 else if (__builtin_expect (c == '_', false))
4237 /* Needed for translated includes. */
4238 if (!((peek[0] == '_' || peek[0] == '\\')
4239 && (peek = do_peek_ident ("__import", peek, limit))))
4240 return false;
4241 import = true;
4243 else if (__builtin_expect (c == 'm', false))
4245 if (!((peek[0] == 'o' || peek[0] == '\\')
4246 && (peek = do_peek_ident ("module", peek, limit))))
4247 return false;
4249 else
4250 return false;
4252 /* Peek the next character to see if it's good enough. We'll be at
4253 the first non-whitespace char, including skipping an escaped
4254 newline. */
4255 /* ... import followed by identifier, ':', '<' or header-name
4256 preprocessing tokens, or module followed by identifier, ':' or
4257 ';' preprocessing tokens. */
4258 unsigned char p = *peek++;
4260 /* A character literal is ... single quotes, ... optionally preceded
4261 by u8, u, U, or L */
4262 /* A string-literal is a ... double quotes, optionally prefixed by
4263 R, u8, u8R, u, uR, U, UR, L, or LR */
4264 if (p == 'u')
4266 peek = do_peek_next (peek, limit);
4267 if (*peek == '8')
4269 peek++;
4270 goto peek_u8;
4272 goto peek_u;
4274 else if (p == 'U' || p == 'L')
4276 peek_u8:
4277 peek = do_peek_next (peek, limit);
4278 peek_u:
4279 if (*peek == '\"' || *peek == '\'')
4280 return false;
4282 if (*peek == 'R')
4283 goto peek_R;
4284 /* Identifier. Ok. */
4286 else if (p == 'R')
4288 peek_R:
4289 if (CPP_OPTION (pfile, rliterals))
4291 peek = do_peek_next (peek, limit);
4292 if (*peek == '\"')
4293 return false;
4295 /* Identifier. Ok. */
4297 else if ('Z' - 'A' == 25
4298 ? ((p >= 'A' && p <= 'Z') || (p >= 'a' && p <= 'z') || p == '_')
4299 : ISIDST (p))
4301 /* Identifier. Ok. */
4303 else if (p == '<')
4305 /* Maybe angle header, ok for import. Reject
4306 '<=', '<<' digraph:'<:'. */
4307 if (!import)
4308 return false;
4309 peek = do_peek_next (peek, limit);
4310 if (*peek == '=' || *peek == '<'
4311 || (*peek == ':' && CPP_OPTION (pfile, digraphs)))
4312 return false;
4314 else if (p == ';')
4316 /* SEMICOLON, ok for module. */
4317 if (import)
4318 return false;
4320 else if (p == '"')
4322 /* STRING, ok for import. */
4323 if (!import)
4324 return false;
4326 else if (p == ':')
4328 /* Maybe COLON, ok. Reject '::', digraph:':>'. */
4329 peek = do_peek_next (peek, limit);
4330 if (*peek == ':' || (*peek == '>' && CPP_OPTION (pfile, digraphs)))
4331 return false;
4333 else
4334 /* FIXME: Detect a unicode character, excluding those not
4335 permitted as the initial character. [lex.name]/1. I presume
4336 we need to check the \[uU] spellings, and directly using
4337 Unicode in say UTF8 form? Or perhaps we do the phase-1
4338 conversion of UTF8 to universal-character-names? */
4339 return false;
4341 return true;
4344 /* Directives-only scanning. Somewhat more relaxed than correct
4345 parsing -- some ill-formed programs will not be rejected. */
4347 void
4348 cpp_directive_only_process (cpp_reader *pfile,
4349 void *data,
4350 void (*cb) (cpp_reader *, CPP_DO_task, void *, ...))
4352 bool module_p = CPP_OPTION (pfile, module_directives);
4356 restart:
4357 /* Buffer initialization, but no line cleaning. */
4358 cpp_buffer *buffer = pfile->buffer;
4359 buffer->cur_note = buffer->notes_used = 0;
4360 buffer->cur = buffer->line_base = buffer->next_line;
4361 buffer->need_line = false;
4362 /* Files always end in a newline or carriage return. We rely on this for
4363 character peeking safety. */
4364 gcc_assert (buffer->rlimit[0] == '\n' || buffer->rlimit[0] == '\r');
4366 const unsigned char *base = buffer->cur;
4367 unsigned line_count = 0;
4368 const unsigned char *line_start = base;
4370 bool bol = true;
4371 bool raw = false;
4373 const unsigned char *lwm = base;
4374 for (const unsigned char *pos = base, *limit = buffer->rlimit;
4375 pos < limit;)
4377 unsigned char c = *pos++;
4378 /* This matches the switch in _cpp_lex_direct. */
4379 switch (c)
4381 case ' ': case '\t': case '\f': case '\v':
4382 /* Whitespace, do nothing. */
4383 break;
4385 case '\r': /* MAC line ending, or Windows \r\n */
4386 if (*pos == '\n')
4387 pos++;
4388 /* FALLTHROUGH */
4390 case '\n':
4391 bol = true;
4393 next_line:
4394 CPP_INCREMENT_LINE (pfile, 0);
4395 line_count++;
4396 line_start = pos;
4397 break;
4399 case '\\':
4400 /* <backslash><newline> is removed, and doesn't undo any
4401 preceeding escape or whatnot. */
4402 if (*pos == '\n')
4404 pos++;
4405 goto next_line;
4407 else if (*pos == '\r')
4409 if (pos[1] == '\n')
4410 pos++;
4411 pos++;
4412 goto next_line;
4414 goto dflt;
4416 case '#':
4417 if (bol)
4419 /* Line directive. */
4420 if (pos - 1 > base && !pfile->state.skipping)
4421 cb (pfile, CPP_DO_print, data,
4422 line_count, base, pos - 1 - base);
4424 /* Prep things for directive handling. */
4425 buffer->next_line = pos;
4426 buffer->need_line = true;
4427 bool ok = _cpp_get_fresh_line (pfile);
4428 gcc_checking_assert (ok);
4430 /* Ensure proper column numbering for generated
4431 error messages. */
4432 buffer->line_base -= pos - line_start;
4434 _cpp_handle_directive (pfile, line_start + 1 != pos);
4436 /* Sanitize the line settings. Duplicate #include's can
4437 mess things up. */
4438 // FIXME: Necessary?
4439 pfile->line_table->highest_location
4440 = pfile->line_table->highest_line;
4442 if (!pfile->state.skipping
4443 && pfile->buffer->next_line < pfile->buffer->rlimit)
4444 cb (pfile, CPP_DO_location, data,
4445 pfile->line_table->highest_line);
4447 goto restart;
4449 goto dflt;
4451 case '/':
4453 const unsigned char *peek = do_peek_next (pos, limit);
4454 if (!(*peek == '/' || *peek == '*'))
4455 goto dflt;
4457 /* Line or block comment */
4458 bool is_block = *peek == '*';
4459 bool star = false;
4460 bool esc = false;
4461 location_t sloc
4462 = linemap_position_for_column (pfile->line_table,
4463 pos - line_start);
4465 while (pos < limit)
4467 char c = *pos++;
4468 switch (c)
4470 case '\\':
4471 esc = true;
4472 break;
4474 case '\r':
4475 if (*pos == '\n')
4476 pos++;
4477 /* FALLTHROUGH */
4479 case '\n':
4481 CPP_INCREMENT_LINE (pfile, 0);
4482 line_count++;
4483 line_start = pos;
4484 if (!esc && !is_block)
4486 bol = true;
4487 goto done_comment;
4490 if (!esc)
4491 star = false;
4492 esc = false;
4493 break;
4495 case '*':
4496 if (pos > peek && !esc)
4497 star = is_block;
4498 esc = false;
4499 break;
4501 case '/':
4502 if (star)
4503 goto done_comment;
4504 /* FALLTHROUGH */
4506 default:
4507 star = false;
4508 esc = false;
4509 break;
4512 if (pos < limit || is_block)
4513 cpp_error_with_line (pfile, CPP_DL_ERROR, sloc, 0,
4514 "unterminated comment");
4515 done_comment:
4516 lwm = pos;
4517 break;
4520 case '\'':
4521 if (!CPP_OPTION (pfile, digit_separators))
4522 goto delimited_string;
4524 /* Possibly a number punctuator. */
4525 if (!ISIDNUM (*do_peek_next (pos, limit)))
4526 goto delimited_string;
4528 goto quote_peek;
4530 case '\"':
4531 if (!CPP_OPTION (pfile, rliterals))
4532 goto delimited_string;
4534 quote_peek:
4536 /* For ' see if it's a number punctuator
4537 \.?<digit>(<digit>|<identifier-nondigit>
4538 |'<digit>|'<nondigit>|[eEpP]<sign>|\.)* */
4539 /* For " see if it's a raw string
4540 {U,L,u,u8}R. This includes CPP_NUMBER detection,
4541 because that could be 0e+R. */
4542 const unsigned char *peek = pos - 1;
4543 bool quote_first = c == '"';
4544 bool quote_eight = false;
4545 bool maybe_number_start = false;
4546 bool want_number = false;
4548 while ((peek = do_peek_prev (peek, lwm)))
4550 unsigned char p = *peek;
4551 if (quote_first)
4553 if (!raw)
4555 if (p != 'R')
4556 break;
4557 raw = true;
4558 continue;
4561 quote_first = false;
4562 if (p == 'L' || p == 'U' || p == 'u')
4564 else if (p == '8')
4565 quote_eight = true;
4566 else
4567 goto second_raw;
4569 else if (quote_eight)
4571 if (p != 'u')
4573 raw = false;
4574 break;
4576 quote_eight = false;
4578 else if (c == '"')
4580 second_raw:;
4581 if (!want_number && ISIDNUM (p))
4583 raw = false;
4584 break;
4588 if (ISDIGIT (p))
4589 maybe_number_start = true;
4590 else if (p == '.')
4591 want_number = true;
4592 else if (ISIDNUM (p))
4593 maybe_number_start = false;
4594 else if (p == '+' || p == '-')
4596 if (const unsigned char *peek_prev
4597 = do_peek_prev (peek, lwm))
4599 p = *peek_prev;
4600 if (p == 'e' || p == 'E'
4601 || p == 'p' || p == 'P')
4603 want_number = true;
4604 maybe_number_start = false;
4606 else
4607 break;
4609 else
4610 break;
4612 else if (p == '\'' || p == '\"')
4614 /* If this is lwm, this must be the end of a
4615 previous string. So this is a trailing
4616 literal type, (a) if those are allowed,
4617 and (b) maybe_start is false. Otherwise
4618 this must be a CPP_NUMBER because we've
4619 met another ', and we'd have checked that
4620 in its own right. */
4621 if (peek == lwm && CPP_OPTION (pfile, uliterals))
4623 if (!maybe_number_start && !want_number)
4624 /* Must be a literal type. */
4625 raw = false;
4627 else if (p == '\''
4628 && CPP_OPTION (pfile, digit_separators))
4629 maybe_number_start = true;
4630 break;
4632 else if (c == '\'')
4633 break;
4634 else if (!quote_first && !quote_eight)
4635 break;
4638 if (maybe_number_start)
4640 if (c == '\'')
4641 /* A CPP NUMBER. */
4642 goto dflt;
4643 raw = false;
4646 goto delimited_string;
4649 delimited_string:
4651 /* (Possibly raw) string or char literal. */
4652 unsigned char end = c;
4653 int delim_len = -1;
4654 const unsigned char *delim = NULL;
4655 location_t sloc = linemap_position_for_column (pfile->line_table,
4656 pos - line_start);
4657 int esc = 0;
4659 if (raw)
4661 /* There can be no line breaks in the delimiter. */
4662 delim = pos;
4663 for (delim_len = 0; (c = *pos++) != '('; delim_len++)
4665 if (delim_len == 16)
4667 cpp_error_with_line (pfile, CPP_DL_ERROR,
4668 sloc, 0,
4669 "raw string delimiter"
4670 " longer than %d"
4671 " characters",
4672 delim_len);
4673 raw = false;
4674 pos = delim;
4675 break;
4677 if (strchr (") \\\t\v\f\n", c))
4679 cpp_error_with_line (pfile, CPP_DL_ERROR,
4680 sloc, 0,
4681 "invalid character '%c'"
4682 " in raw string"
4683 " delimiter", c);
4684 raw = false;
4685 pos = delim;
4686 break;
4688 if (pos >= limit)
4689 goto bad_string;
4693 while (pos < limit)
4695 char c = *pos++;
4696 switch (c)
4698 case '\\':
4699 if (!raw)
4700 esc++;
4701 break;
4703 case '\r':
4704 if (*pos == '\n')
4705 pos++;
4706 /* FALLTHROUGH */
4708 case '\n':
4710 CPP_INCREMENT_LINE (pfile, 0);
4711 line_count++;
4712 line_start = pos;
4714 if (esc)
4715 esc--;
4716 break;
4718 case ')':
4719 if (raw
4720 && pos + delim_len + 1 < limit
4721 && pos[delim_len] == end
4722 && !memcmp (delim, pos, delim_len))
4724 pos += delim_len + 1;
4725 raw = false;
4726 goto done_string;
4728 break;
4730 default:
4731 if (!raw && !(esc & 1) && c == end)
4732 goto done_string;
4733 esc = 0;
4734 break;
4737 bad_string:
4738 cpp_error_with_line (pfile, CPP_DL_ERROR, sloc, 0,
4739 "unterminated literal");
4741 done_string:
4742 raw = false;
4743 lwm = pos - 1;
4745 goto dflt;
4747 case '_':
4748 case 'e':
4749 case 'i':
4750 case 'm':
4751 if (bol && module_p && !pfile->state.skipping
4752 && do_peek_module (pfile, c, pos, limit))
4754 /* We've seen the start of a module control line.
4755 Start up the tokenizer. */
4756 pos--; /* Backup over the first character. */
4758 /* Backup over whitespace to start of line. */
4759 while (pos > line_start
4760 && (pos[-1] == ' ' || pos[-1] == '\t'))
4761 pos--;
4763 if (pos > base)
4764 cb (pfile, CPP_DO_print, data, line_count, base, pos - base);
4766 /* Prep things for directive handling. */
4767 buffer->next_line = pos;
4768 buffer->need_line = true;
4770 /* Now get tokens until the PRAGMA_EOL. */
4773 location_t spelling;
4774 const cpp_token *tok
4775 = cpp_get_token_with_location (pfile, &spelling);
4777 gcc_assert (pfile->state.in_deferred_pragma
4778 || tok->type == CPP_PRAGMA_EOL);
4779 cb (pfile, CPP_DO_token, data, tok, spelling);
4781 while (pfile->state.in_deferred_pragma);
4783 if (pfile->buffer->next_line < pfile->buffer->rlimit)
4784 cb (pfile, CPP_DO_location, data,
4785 pfile->line_table->highest_line);
4787 pfile->mi_valid = false;
4788 goto restart;
4790 goto dflt;
4792 default:
4793 dflt:
4794 bol = false;
4795 pfile->mi_valid = false;
4796 break;
4800 if (buffer->rlimit > base && !pfile->state.skipping)
4802 const unsigned char *limit = buffer->rlimit;
4803 /* If the file was not newline terminated, add rlimit, which is
4804 guaranteed to point to a newline, to the end of our range. */
4805 if (limit[-1] != '\n')
4807 limit++;
4808 CPP_INCREMENT_LINE (pfile, 0);
4809 line_count++;
4811 cb (pfile, CPP_DO_print, data, line_count, base, limit - base);
4814 _cpp_pop_buffer (pfile);
4816 while (pfile->buffer);