aarch64: Use dup and zip1 for interleaving elements in vector initializer.
[official-gcc.git] / libcpp / lex.cc
blobb1107920c947eb07981c2dba583c859dba54b57a
1 /* CPP Library - lexical analysis.
2 Copyright (C) 2000-2022 Free Software Foundation, Inc.
3 Contributed by Per Bothner, 1994-95.
4 Based on CCCP program by Paul Rubin, June 1986
5 Adapted to ANSI C, Richard Stallman, Jan 1987
6 Broken out to separate file, Zack Weinberg, Mar 2000
8 This program is free software; you can redistribute it and/or modify it
9 under the terms of the GNU General Public License as published by the
10 Free Software Foundation; either version 3, or (at your option) any
11 later version.
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
18 You should have received a copy of the GNU General Public License
19 along with this program; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "cpplib.h"
25 #include "internal.h"
/* Spelling category of a token type; stored in the CATEGORY field of
   struct token_spelling. */
27 enum spell_type
29 SPELL_OPERATOR = 0,
30 SPELL_IDENT,
31 SPELL_LITERAL,
32 SPELL_NONE
/* One entry of the token_spellings table: a spelling category and a
   name string. */
35 struct token_spelling
37 enum spell_type category;
38 const unsigned char *name;
/* Spellings of the digraphs.  NOTE(review): the indexing scheme is not
   visible in this part of the file -- confirm against the users of
   this table. */
41 static const unsigned char *const digraph_spellings[] =
42 { UC"%:", UC"%:%:", UC"<:", UC":>", UC"<%", UC"%>" };
/* Build token_spellings by expanding TTYPE_TABLE.  An OP (e, s) entry is
   categorised SPELL_OPERATOR and named by the string S; a TK (e, s)
   entry is categorised SPELL_<s> and named by the stringified
   enumerator E.  Both helper macros are undefined again right after
   the table. */
44 #define OP(e, s) { SPELL_OPERATOR, UC s },
45 #define TK(e, s) { SPELL_ ## s, UC #e },
46 static const struct token_spelling token_spellings[N_TTYPES] = { TTYPE_TABLE };
47 #undef OP
48 #undef TK
/* Look up the spelling category / name of TOKEN, indexing
   token_spellings by TOKEN's type field. */
50 #define TOKEN_SPELL(token) (token_spellings[(token)->type].category)
51 #define TOKEN_NAME(token) (token_spellings[(token)->type].name)
53 /* ISO 10646 defines the UCS codespace as the range 0-0x10FFFF inclusive. */
54 #define UCS_LIMIT 0x10FFFF
56 static void add_line_note (cpp_buffer *, const uchar *, unsigned int);
57 static int skip_line_comment (cpp_reader *);
58 static void skip_whitespace (cpp_reader *, cppchar_t);
59 static void lex_string (cpp_reader *, cpp_token *, const uchar *);
60 static void save_comment (cpp_reader *, cpp_token *, const uchar *, cppchar_t);
61 static void store_comment (cpp_reader *, cpp_token *);
62 static void create_literal (cpp_reader *, cpp_token *, const uchar *,
63 unsigned int, enum cpp_ttype);
64 static bool warn_in_comment (cpp_reader *, _cpp_line_note *);
65 static int name_p (cpp_reader *, const cpp_string *);
66 static tokenrun *next_tokenrun (tokenrun *);
68 static _cpp_buff *new_buff (size_t);
71 /* Utility routine:
73 Compares, the token TOKEN to the NUL-terminated string STRING.
74 TOKEN must be a CPP_NAME. Returns 1 for equal, 0 for unequal. */
75 int
76 cpp_ideq (const cpp_token *token, const char *string)
78 if (token->type != CPP_NAME)
79 return 0;
81 return !ustrcmp (NODE_NAME (token->val.node.node), (const uchar *) string);
84 /* Record a note TYPE at byte POS into the current cleaned logical
85 line. */
86 static void
87 add_line_note (cpp_buffer *buffer, const uchar *pos, unsigned int type)
89 if (buffer->notes_used == buffer->notes_cap)
91 buffer->notes_cap = buffer->notes_cap * 2 + 200;
92 buffer->notes = XRESIZEVEC (_cpp_line_note, buffer->notes,
93 buffer->notes_cap);
96 buffer->notes[buffer->notes_used].pos = pos;
97 buffer->notes[buffer->notes_used].type = type;
98 buffer->notes_used++;
102 /* Fast path to find line special characters using optimized character
103 scanning algorithms. Anything complicated falls back to the slow
104 path below. Since this loop is very hot it's worth doing these kinds
105 of optimizations.
107 One of the paths through the ifdefs should provide
109 const uchar *search_line_fast (const uchar *s, const uchar *end);
111 Between S and END, search for \n, \r, \\, ?. Return a pointer to
112 the found character.
114 Note that the last character of the buffer is *always* a newline,
115 as forced by _cpp_convert_input. This fact can be used to avoid
116 explicitly looking for the end of the buffer. */
118 /* Configure gives us an ifdef test. */
119 #ifndef WORDS_BIGENDIAN
120 #define WORDS_BIGENDIAN 0
121 #endif
123 /* We'd like the largest integer that fits into a register. There's nothing
124 in <stdint.h> that gives us that. For most hosts this is unsigned long,
125 but MS decided on an LLP64 model. Thankfully when building with GCC we
126 can get the "real" word size. */
127 #ifdef __GNUC__
128 typedef unsigned int word_type __attribute__((__mode__(__word__)));
129 #else
130 typedef unsigned long word_type;
131 #endif
133 /* The code below is only expecting sizes 4 or 8.
134 Die at compile-time if this expectation is violated. */
135 typedef char check_word_type_size
136 [(sizeof(word_type) == 8 || sizeof(word_type) == 4) * 2 - 1];
138 /* Return X with the first N bytes forced to values that won't match one
139 of the interesting characters. Note that NUL is not interesting. */
141 static inline word_type
142 acc_char_mask_misalign (word_type val, unsigned int n)
144 word_type mask = -1;
145 if (WORDS_BIGENDIAN)
146 mask >>= n * 8;
147 else
148 mask <<= n * 8;
149 return val & mask;
152 /* Return X replicated to all byte positions within WORD_TYPE. */
154 static inline word_type
155 acc_char_replicate (uchar x)
157 word_type ret;
159 ret = (x << 24) | (x << 16) | (x << 8) | x;
160 if (sizeof(word_type) == 8)
161 ret = (ret << 16 << 16) | ret;
162 return ret;
165 /* Return non-zero if some byte of VAL is (probably) C. */
167 static inline word_type
168 acc_char_cmp (word_type val, word_type c)
170 #if defined(__GNUC__) && defined(__alpha__)
171 /* We can get exact results using a compare-bytes instruction.
172 Get (val == c) via (0 >= (val ^ c)). */
173 return __builtin_alpha_cmpbge (0, val ^ c);
174 #else
175 word_type magic = 0x7efefefeU;
176 if (sizeof(word_type) == 8)
177 magic = (magic << 16 << 16) | 0xfefefefeU;
178 magic |= 1;
180 val ^= c;
181 return ((val + magic) ^ ~val) & ~magic;
182 #endif
185 /* Given the result of acc_char_cmp is non-zero, return the index of
186 the found character. If this was a false positive, return -1. */
188 static inline int
189 acc_char_index (word_type cmp ATTRIBUTE_UNUSED,
190 word_type val ATTRIBUTE_UNUSED)
192 #if defined(__GNUC__) && defined(__alpha__) && !WORDS_BIGENDIAN
193 /* The cmpbge instruction sets *bits* of the result corresponding to
194 matches in the bytes with no false positives. */
195 return __builtin_ctzl (cmp);
196 #else
197 unsigned int i;
199 /* ??? It would be nice to force unrolling here,
200 and have all of these constants folded. */
201 for (i = 0; i < sizeof(word_type); ++i)
203 uchar c;
204 if (WORDS_BIGENDIAN)
205 c = (val >> (sizeof(word_type) - i - 1) * 8) & 0xff;
206 else
207 c = (val >> i * 8) & 0xff;
209 if (c == '\n' || c == '\r' || c == '\\' || c == '?')
210 return i;
213 return -1;
214 #endif
217 /* A version of the fast scanner using bit fiddling techniques.
219 For 32-bit words, one would normally perform 16 comparisons and
220 16 branches. With this algorithm one performs 24 arithmetic
221 operations and one branch. Whether this is faster with a 32-bit
222 word size is going to be somewhat system dependent.
224 For 64-bit words, we eliminate twice the number of comparisons
225 and branches without increasing the number of arithmetic operations.
226 It's almost certainly going to be a win with 64-bit word size. */
/* The separate declaration carries ATTRIBUTE_UNUSED for the definition
   that follows. */
228 static const uchar * search_line_acc_char (const uchar *, const uchar *)
229 ATTRIBUTE_UNUSED;
/* Return a pointer to the first '\n', '\r', '\\' or '?' at or after S,
   examining one word_type at a time.  END is not consulted: the loop
   only terminates when a match is found. */
231 static const uchar *
232 search_line_acc_char (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
/* Each interesting character replicated into every byte of a word. */
234 const word_type repl_nl = acc_char_replicate ('\n');
235 const word_type repl_cr = acc_char_replicate ('\r');
236 const word_type repl_bs = acc_char_replicate ('\\');
237 const word_type repl_qm = acc_char_replicate ('?');
239 unsigned int misalign;
240 const word_type *p;
241 word_type val, t;
243 /* Align the buffer. Mask out any bytes from before the beginning. */
/* P is S rounded down to a word boundary, so the first load may include
   bytes that precede S; those are masked out just below. */
244 p = (word_type *)((uintptr_t)s & -sizeof(word_type));
245 val = *p;
246 misalign = (uintptr_t)s & (sizeof(word_type) - 1);
247 if (misalign)
248 val = acc_char_mask_misalign (val, misalign);
250 /* Main loop. */
251 while (1)
/* T is non-zero if any byte of VAL (probably) matches one of the four
   characters. */
253 t = acc_char_cmp (val, repl_nl);
254 t |= acc_char_cmp (val, repl_cr);
255 t |= acc_char_cmp (val, repl_bs);
256 t |= acc_char_cmp (val, repl_qm);
258 if (__builtin_expect (t != 0, 0))
/* A negative index means the comparison was a false positive; in that
   case just carry on with the next word. */
260 int i = acc_char_index (t, val);
261 if (i >= 0)
262 return (const uchar *)p + i;
265 val = *++p;
269 /* Disable on Solaris 2/x86 until the following problem can be properly
270 autoconfed:
272 The Solaris 10+ assembler tags objects with the instruction set
273 extensions used, so SSE4.2 executables cannot run on machines that
274 don't support that extension. */
276 #if (GCC_VERSION >= 4005) && (__GNUC__ >= 5 || !defined(__PIC__)) && (defined(__i386__) || defined(__x86_64__)) && !(defined(__sun__) && defined(__svr4__))
278 /* Replicated character data to be shared between implementations.
279 Recall that outside of a context with vector support we can't
280 define compatible vector types, therefore these are all defined
281 in terms of raw characters. */
/* Row 0 holds '\n', row 1 '\r', row 2 '\\' and row 3 '?', each repeated
   16 times.  The MMX scanner loads the first 8 bytes of a row, the SSE2
   scanner all 16. */
282 static const char repl_chars[4][16] __attribute__((aligned(16))) = {
283 { '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
284 '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n' },
285 { '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
286 '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r' },
287 { '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
288 '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\' },
289 { '?', '?', '?', '?', '?', '?', '?', '?',
290 '?', '?', '?', '?', '?', '?', '?', '?' },
293 /* A version of the fast scanner using MMX vectorized byte compare insns.
295 This uses the PMOVMSKB instruction which was introduced with "MMX2",
296 which was packaged into SSE1; it is also present in the AMD MMX
297 extension. Mark the function as using "sse" so that we emit a real
298 "emms" instruction, rather than the 3dNOW "femms" instruction. */
/* Return a pointer to the first '\n', '\r', '\\' or '?' at or after S,
   comparing 8 bytes per iteration.  END is not consulted. */
300 static const uchar *
301 #ifndef __SSE__
302 __attribute__((__target__("sse")))
303 #endif
304 search_line_mmx (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
306 typedef char v8qi __attribute__ ((__vector_size__ (8)));
307 typedef int __m64 __attribute__ ((__vector_size__ (8), __may_alias__));
/* Only the first 8 bytes of each 16-byte repl_chars row are used. */
309 const v8qi repl_nl = *(const v8qi *)repl_chars[0];
310 const v8qi repl_cr = *(const v8qi *)repl_chars[1];
311 const v8qi repl_bs = *(const v8qi *)repl_chars[2];
312 const v8qi repl_qm = *(const v8qi *)repl_chars[3];
314 unsigned int misalign, found, mask;
315 const v8qi *p;
316 v8qi data, t, c;
318 /* Align the source pointer. While MMX doesn't generate unaligned data
319 faults, this allows us to safely scan to the end of the buffer without
320 reading beyond the end of the last page. */
321 misalign = (uintptr_t)s & 7;
322 p = (const v8qi *)((uintptr_t)s & -8);
323 data = *p;
325 /* Create a mask for the bytes that are valid within the first
326 8-byte block. The idea here is that the AND with the mask
327 within the loop is "free", since we need some AND or TEST
328 insn in order to set the flags for the branch anyway. */
329 mask = -1u << misalign;
331 /* Main loop processing 8 bytes at a time. */
/* Jump into the middle of the loop so that the first, already loaded
   and possibly partial, block is compared with the misalignment mask;
   later blocks use an all-ones mask. */
332 goto start;
335 data = *++p;
336 mask = -1;
338 start:
/* OR together the byte-wise equality results for the four characters. */
339 t = __builtin_ia32_pcmpeqb(data, repl_nl);
340 c = __builtin_ia32_pcmpeqb(data, repl_cr);
341 t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
342 c = __builtin_ia32_pcmpeqb(data, repl_bs);
343 t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
344 c = __builtin_ia32_pcmpeqb(data, repl_qm);
345 t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
346 found = __builtin_ia32_pmovmskb (t);
347 found &= mask;
349 while (!found);
351 __builtin_ia32_emms ();
353 /* FOUND contains 1 in bits for which we matched a relevant
354 character. Conversion to the byte index is trivial. */
355 found = __builtin_ctz(found);
356 return (const uchar *)p + found;
359 /* A version of the fast scanner using SSE2 vectorized byte compare insns. */
/* Return a pointer to the first '\n', '\r', '\\' or '?' at or after S,
   comparing 16 bytes per iteration.  END is not consulted. */
361 static const uchar *
362 #ifndef __SSE2__
363 __attribute__((__target__("sse2")))
364 #endif
365 search_line_sse2 (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
367 typedef char v16qi __attribute__ ((__vector_size__ (16)));
369 const v16qi repl_nl = *(const v16qi *)repl_chars[0];
370 const v16qi repl_cr = *(const v16qi *)repl_chars[1];
371 const v16qi repl_bs = *(const v16qi *)repl_chars[2];
372 const v16qi repl_qm = *(const v16qi *)repl_chars[3];
374 unsigned int misalign, found, mask;
375 const v16qi *p;
376 v16qi data, t;
378 /* Align the source pointer. */
/* The first load starts at S rounded down to a 16-byte boundary, so it
   may include bytes that precede S; MASK removes them below. */
379 misalign = (uintptr_t)s & 15;
380 p = (const v16qi *)((uintptr_t)s & -16);
381 data = *p;
383 /* Create a mask for the bytes that are valid within the first
384 16-byte block. The idea here is that the AND with the mask
385 within the loop is "free", since we need some AND or TEST
386 insn in order to set the flags for the branch anyway. */
387 mask = -1u << misalign;
389 /* Main loop processing 16 bytes at a time. */
/* Jump into the middle of the loop so that the first block is compared
   with the misalignment mask; later blocks use an all-ones mask. */
390 goto start;
393 data = *++p;
394 mask = -1;
396 start:
/* OR together the element-wise equality results for the four
   characters, then collect one bit per byte into FOUND. */
397 t = data == repl_nl;
398 t |= data == repl_cr;
399 t |= data == repl_bs;
400 t |= data == repl_qm;
401 found = __builtin_ia32_pmovmskb128 (t);
402 found &= mask;
404 while (!found);
406 /* FOUND contains 1 in bits for which we matched a relevant
407 character. Conversion to the byte index is trivial. */
408 found = __builtin_ctz(found);
409 return (const uchar *)p + found;
412 #ifdef HAVE_SSE4
413 /* A version of the fast scanner using SSE 4.2 vectorized string insns. */
/* Returns a pointer to the first '\n', '\r', '?' or '\\' at or after S.
   END is only used to decide whether an unaligned 16-byte load at S is
   safe; when it is not, the work is handed to search_line_sse2. */
415 static const uchar *
416 #ifndef __SSE4_2__
417 __attribute__((__target__("sse4.2")))
418 #endif
419 search_line_sse42 (const uchar *s, const uchar *end)
421 typedef char v16qi __attribute__ ((__vector_size__ (16)));
/* The four characters searched for; every PCMPESTRI use below passes 4
   as the length of this operand. */
422 static const v16qi search = { '\n', '\r', '?', '\\' };
424 uintptr_t si = (uintptr_t)s;
425 uintptr_t index;
427 /* Check for unaligned input. */
428 if (si & 15)
430 v16qi sv;
432 if (__builtin_expect (end - s < 16, 0)
433 && __builtin_expect ((si & 0xfff) > 0xff0, 0))
435 /* There are less than 16 bytes left in the buffer, and less
436 than 16 bytes left on the page. Reading 16 bytes at this
437 point might generate a spurious page fault. Defer to the
438 SSE2 implementation, which already handles alignment. */
439 return search_line_sse2 (s, end);
442 /* ??? The builtin doesn't understand that the PCMPESTRI read from
443 memory need not be aligned. */
444 sv = __builtin_ia32_loaddqu ((const char *) s);
445 index = __builtin_ia32_pcmpestri128 (search, 4, sv, 16, 0);
/* An index below 16 is a match within the block just examined. */
447 if (__builtin_expect (index < 16, 0))
448 goto found;
450 /* Advance the pointer to an aligned address. We will re-scan a
451 few bytes, but we no longer need care for reading past the
452 end of a page, since we're guaranteed a match. */
453 s = (const uchar *)((si + 15) & -16);
456 /* Main loop, processing 16 bytes at a time. */
457 #ifdef __GCC_ASM_FLAG_OUTPUTS__
458 while (1)
460 char f;
462 /* By using inline assembly instead of the builtin,
463 we can use the result, as well as the flags set. */
464 __asm ("%vpcmpestri\t$0, %2, %3"
465 : "=c"(index), "=@ccc"(f)
466 : "m"(*s), "x"(search), "a"(4), "d"(16));
/* F receives the carry flag; the loop stops once it is set. */
467 if (f)
468 break;
470 s += 16;
472 #else
/* The assembly loop adds 16 to S before each compare, so step back one
   block first. */
473 s -= 16;
474 /* By doing the whole loop in inline assembly,
475 we can make proper use of the flags set. */
476 __asm ( ".balign 16\n"
477 "0: add $16, %1\n"
478 " %vpcmpestri\t$0, (%1), %2\n"
479 " jnc 0b"
480 : "=&c"(index), "+r"(s)
481 : "x"(search), "a"(4), "d"(16));
482 #endif
484 found:
485 return s + index;
488 #else
489 /* Work around out-dated assemblers without sse4 support. */
490 #define search_line_sse42 search_line_sse2
491 #endif
493 /* Check the CPU capabilities. */
495 #include "../gcc/config/i386/cpuid.h"
497 typedef const uchar * (*search_line_fast_type) (const uchar *, const uchar *);
498 static search_line_fast_type search_line_fast;
500 #define HAVE_init_vectorized_lexer 1
/* Choose the line scanner to be used through the search_line_fast
   pointer.  The default is search_line_acc_char; a vectorized scanner
   replaces it when the compile-time target macros and/or CPUID say the
   needed instructions are available. */
501 static inline void
502 init_vectorized_lexer (void)
504 unsigned dummy, ecx = 0, edx = 0;
505 search_line_fast_type impl = search_line_acc_char;
506 int minimum = 0;
/* MINIMUM records the instruction set this file is being compiled for:
   3 for SSE4.2, 2 for SSE2, 1 for SSE, 0 for none of them. */
508 #if defined(__SSE4_2__)
509 minimum = 3;
510 #elif defined(__SSE2__)
511 minimum = 2;
512 #elif defined(__SSE__)
513 minimum = 1;
514 #endif
/* With SSE4.2 guaranteed at compile time no CPUID query is needed. */
516 if (minimum == 3)
517 impl = search_line_sse42;
/* Otherwise consult CPUID leaf 1.  When MINIMUM is 2 this branch is
   taken even if the query fails; ECX and EDX then keep their zero
   initializers and the MINIMUM == 2 test selects the SSE2 scanner. */
518 else if (__get_cpuid (1, &dummy, &dummy, &ecx, &edx) || minimum == 2)
520 if (minimum == 3 || (ecx & bit_SSE4_2))
521 impl = search_line_sse42;
522 else if (minimum == 2 || (edx & bit_SSE2))
523 impl = search_line_sse2;
524 else if (minimum == 1 || (edx & bit_SSE))
525 impl = search_line_mmx;
/* Leaf 1 was unavailable: try extended leaf 0x80000001 and use the MMX
   scanner if both the MMXEXT and CMOV bits are set. */
527 else if (__get_cpuid (0x80000001, &dummy, &dummy, &dummy, &edx))
529 if (minimum == 1
530 || (edx & (bit_MMXEXT | bit_CMOV)) == (bit_MMXEXT | bit_CMOV))
531 impl = search_line_mmx;
534 search_line_fast = impl;
537 #elif (GCC_VERSION >= 4005) && defined(_ARCH_PWR8) && defined(__ALTIVEC__)
539 /* A version of the fast scanner using AltiVec vectorized byte compares
540 and VSX unaligned loads (when VSX is available). This is otherwise
541 the same as the AltiVec version. */
/* Return a pointer to the first '\n', '\r', '\\' or '?' at or after S,
   comparing 16 bytes per iteration.  END is not consulted. */
543 ATTRIBUTE_NO_SANITIZE_UNDEFINED
544 static const uchar *
545 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
547 typedef __attribute__((altivec(vector))) unsigned char vc;
549 const vc repl_nl = {
550 '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
551 '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
553 const vc repl_cr = {
554 '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
555 '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
557 const vc repl_bs = {
558 '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
559 '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
561 const vc repl_qm = {
562 '?', '?', '?', '?', '?', '?', '?', '?',
563 '?', '?', '?', '?', '?', '?', '?', '?',
565 const vc zero = { 0 };
567 vc data, t;
569 /* Main loop processing 16 bytes at a time. */
572 vc m_nl, m_cr, m_bs, m_qm;
/* The load is taken directly at S, so no alignment step or mask for
   leading bytes is needed in this variant. */
574 data = __builtin_vec_vsx_ld (0, s);
575 s += 16;
577 m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
578 m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
579 m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
580 m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
581 t = (m_nl | m_cr) | (m_bs | m_qm);
583 /* T now contains 0xff in bytes for which we matched one of the relevant
584 characters. We want to exit the loop if any byte in T is non-zero.
585 Below is the expansion of vec_any_ne(t, zero). */
587 while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));
589 /* Restore s to point to the 16 bytes we just processed. */
590 s -= 16;
/* N is the number of unsigned longs that make up one vector. */
593 #define N (sizeof(vc) / sizeof(long))
595 union {
596 vc v;
597 /* Statically assert that N is 2 or 4. */
598 unsigned long l[(N == 2 || N == 4) ? N : -1];
599 } u;
600 unsigned long l, i = 0;
602 u.v = t;
604 /* Find the first word of T that is non-zero. */
/* S is advanced past every all-zero word, so on exit it addresses the
   word held in L. */
605 switch (N)
607 case 4:
608 l = u.l[i++];
609 if (l != 0)
610 break;
611 s += sizeof(unsigned long);
612 l = u.l[i++];
613 if (l != 0)
614 break;
615 s += sizeof(unsigned long);
616 /* FALLTHRU */
617 case 2:
618 l = u.l[i++];
619 if (l != 0)
620 break;
621 s += sizeof(unsigned long);
622 l = u.l[i];
625 /* L now contains 0xff in bytes for which we matched one of the
626 relevant characters. We can find the byte index by finding
627 its bit index and dividing by 8. */
/* Leading zeros are counted on big-endian hosts, trailing zeros
   otherwise. */
628 #ifdef __BIG_ENDIAN__
629 l = __builtin_clzl(l) >> 3;
630 #else
631 l = __builtin_ctzl(l) >> 3;
632 #endif
633 return s + l;
635 #undef N
639 #elif (GCC_VERSION >= 4005) && defined(__ALTIVEC__) && defined (__BIG_ENDIAN__)
641 /* A version of the fast scanner using AltiVec vectorized byte compares.
642 This cannot be used for little endian because vec_lvsl/lvsr are
643 deprecated for little endian and the code won't work properly. */
644 /* ??? Unfortunately, attribute(target("altivec")) is not yet supported,
645 so we can't compile this function without -maltivec on the command line
646 (or implied by some other switch). */
/* Return a pointer to the first '\n', '\r', '\\' or '?' at or after S,
   comparing 16 bytes per iteration.  END is not consulted. */
648 static const uchar *
649 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
651 typedef __attribute__((altivec(vector))) unsigned char vc;
653 const vc repl_nl = {
654 '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
655 '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
657 const vc repl_cr = {
658 '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
659 '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
661 const vc repl_bs = {
662 '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
663 '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
665 const vc repl_qm = {
666 '?', '?', '?', '?', '?', '?', '?', '?',
667 '?', '?', '?', '?', '?', '?', '?', '?',
669 const vc ones = {
670 -1, -1, -1, -1, -1, -1, -1, -1,
671 -1, -1, -1, -1, -1, -1, -1, -1,
673 const vc zero = { 0 };
675 vc data, mask, t;
677 /* Altivec loads automatically mask addresses with -16. This lets us
678 issue the first load as early as possible. */
679 data = __builtin_vec_ld(0, (const vc *)s);
681 /* Discard bytes before the beginning of the buffer. Do this by
682 beginning with all ones and shifting in zeros according to the
683 mis-alignment. The LVSR instruction pulls the exact shift we
684 want from the address. */
685 mask = __builtin_vec_lvsr(0, s);
686 mask = __builtin_vec_perm(zero, ones, mask);
/* The discarded leading bytes become zero, which is none of the four
   characters searched for. */
687 data &= mask;
689 /* While altivec loads mask addresses, we still need to align S so
690 that the offset we compute at the end is correct. */
691 s = (const uchar *)((uintptr_t)s & -16);
693 /* Main loop processing 16 bytes at a time. */
/* Jump into the middle of the loop so that the first, already loaded
   and masked, block is compared before S is advanced. */
694 goto start;
697 vc m_nl, m_cr, m_bs, m_qm;
699 s += 16;
700 data = __builtin_vec_ld(0, (const vc *)s);
702 start:
703 m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
704 m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
705 m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
706 m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
707 t = (m_nl | m_cr) | (m_bs | m_qm);
709 /* T now contains 0xff in bytes for which we matched one of the relevant
710 characters. We want to exit the loop if any byte in T is non-zero.
711 Below is the expansion of vec_any_ne(t, zero). */
713 while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));
/* N is the number of unsigned longs that make up one vector. */
716 #define N (sizeof(vc) / sizeof(long))
718 union {
719 vc v;
720 /* Statically assert that N is 2 or 4. */
721 unsigned long l[(N == 2 || N == 4) ? N : -1];
722 } u;
723 unsigned long l, i = 0;
725 u.v = t;
727 /* Find the first word of T that is non-zero. */
/* S is advanced past every all-zero word, so on exit it addresses the
   word held in L. */
728 switch (N)
730 case 4:
731 l = u.l[i++];
732 if (l != 0)
733 break;
734 s += sizeof(unsigned long);
735 l = u.l[i++];
736 if (l != 0)
737 break;
738 s += sizeof(unsigned long);
739 /* FALLTHROUGH */
740 case 2:
741 l = u.l[i++];
742 if (l != 0)
743 break;
744 s += sizeof(unsigned long);
745 l = u.l[i];
748 /* L now contains 0xff in bytes for which we matched one of the
749 relevant characters. We can find the byte index by finding
750 its bit index and dividing by 8. */
/* This variant is only compiled for big-endian hosts, hence the count
   of leading zeros. */
751 l = __builtin_clzl(l) >> 3;
752 return s + l;
754 #undef N
758 #elif defined (__ARM_NEON) && defined (__ARM_64BIT_STATE)
759 #include "arm_neon.h"
761 /* This doesn't have to be the exact page size, but no system may use
762 a size smaller than this. ARMv8 requires a minimum page size of
763 4k. The impact of being conservative here is a small number of
764 cases will take the slightly slower entry path into the main
765 loop. */
767 #define AARCH64_MIN_PAGE_SIZE 4096
/* Return a pointer to the first '\n', '\r', '\\' or '?' at or after S,
   comparing 16 bytes per iteration.  END is not consulted. */
769 static const uchar *
770 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
772 const uint8x16_t repl_nl = vdupq_n_u8 ('\n');
773 const uint8x16_t repl_cr = vdupq_n_u8 ('\r');
774 const uint8x16_t repl_bs = vdupq_n_u8 ('\\');
775 const uint8x16_t repl_qm = vdupq_n_u8 ('?');
/* XMASK gives each of the 8 bytes in a 64-bit half its own bit
   (0x01 ... 0x80).  SHIFT moves the sums from one half up by 8 bits so
   that adding all lanes yields a 16-bit value with one bit per input
   byte. */
776 const uint8x16_t xmask = (uint8x16_t) vdupq_n_u64 (0x8040201008040201ULL);
778 #ifdef __ARM_BIG_ENDIAN
779 const int16x8_t shift = {8, 8, 8, 8, 0, 0, 0, 0};
780 #else
781 const int16x8_t shift = {0, 0, 0, 0, 8, 8, 8, 8};
782 #endif
784 unsigned int found;
785 const uint8_t *p;
786 uint8x16_t data;
787 uint8x16_t t;
788 uint16x8_t m;
789 uint8x16_t u, v, w;
791 /* Align the source pointer. */
792 p = (const uint8_t *)((uintptr_t)s & -16);
794 /* Assuming random string start positions, with a 4k page size we'll take
795 the slow path about 0.37% of the time. */
796 if (__builtin_expect ((AARCH64_MIN_PAGE_SIZE
797 - (((uintptr_t) s) & (AARCH64_MIN_PAGE_SIZE - 1)))
798 < 16, 0))
800 /* Slow path: the string starts near a possible page boundary. */
/* Load the aligned block that contains S and use MASK to ignore matches
   in the bytes that precede S. */
801 uint32_t misalign, mask;
803 misalign = (uintptr_t)s & 15;
804 mask = (-1u << misalign) & 0xffff;
805 data = vld1q_u8 (p);
806 t = vceqq_u8 (data, repl_nl);
807 u = vceqq_u8 (data, repl_cr);
808 v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
809 w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
810 t = vorrq_u8 (v, w);
811 t = vandq_u8 (t, xmask);
812 m = vpaddlq_u8 (t);
813 m = vshlq_u16 (m, shift);
814 found = vaddvq_u16 (m);
815 found &= mask;
816 if (found)
817 return (const uchar*)p + __builtin_ctz (found);
819 else
/* Fast path: 16 bytes can be read starting at S itself without
   crossing a minimum-size page boundary, so use an unaligned load. */
821 data = vld1q_u8 ((const uint8_t *) s);
822 t = vceqq_u8 (data, repl_nl);
823 u = vceqq_u8 (data, repl_cr);
824 v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
825 w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
826 t = vorrq_u8 (v, w);
827 if (__builtin_expect (vpaddd_u64 ((uint64x2_t)t) != 0, 0))
828 goto done;
/* Main loop: from here on P is 16-byte aligned and advances one block
   per iteration until some byte matches. */
833 p += 16;
834 data = vld1q_u8 (p);
835 t = vceqq_u8 (data, repl_nl);
836 u = vceqq_u8 (data, repl_cr);
837 v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
838 w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
839 t = vorrq_u8 (v, w);
840 } while (!vpaddd_u64 ((uint64x2_t)t));
842 done:
843 /* Now that we've found the terminating substring, work out precisely where
844 we need to stop. */
845 t = vandq_u8 (t, xmask);
846 m = vpaddlq_u8 (t);
847 m = vshlq_u16 (m, shift);
848 found = vaddvq_u16 (m);
/* P is still below S only when the match came from the unaligned load
   at S, in which case the bit index is relative to S rather than P. */
849 return (((((uintptr_t) p) < (uintptr_t) s) ? s : (const uchar *)p)
850 + __builtin_ctz (found));
853 #elif defined (__ARM_NEON)
854 #include "arm_neon.h"
/* Return a pointer to the first '\n', '\r', '\\' or '?' at or after S,
   comparing 16 bytes per iteration.  END is not consulted. */
856 static const uchar *
857 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
859 const uint8x16_t repl_nl = vdupq_n_u8 ('\n');
860 const uint8x16_t repl_cr = vdupq_n_u8 ('\r');
861 const uint8x16_t repl_bs = vdupq_n_u8 ('\\');
862 const uint8x16_t repl_qm = vdupq_n_u8 ('?');
/* XMASK gives each of the 8 bytes in a 64-bit half its own bit
   (0x01 ... 0x80). */
863 const uint8x16_t xmask = (uint8x16_t) vdupq_n_u64 (0x8040201008040201ULL);
865 unsigned int misalign, found, mask;
866 const uint8_t *p;
867 uint8x16_t data;
869 /* Align the source pointer. */
870 misalign = (uintptr_t)s & 15;
871 p = (const uint8_t *)((uintptr_t)s & -16);
872 data = vld1q_u8 (p);
874 /* Create a mask for the bytes that are valid within the first
875 16-byte block. The idea here is that the AND with the mask
876 within the loop is "free", since we need some AND or TEST
877 insn in order to set the flags for the branch anyway. */
878 mask = (-1u << misalign) & 0xffff;
880 /* Main loop, processing 16 bytes at a time. */
/* Jump into the middle of the loop so that the first block is compared
   with the misalignment mask; later blocks use 0xffff. */
881 goto start;
885 uint8x8_t l;
886 uint16x4_t m;
887 uint32x2_t n;
888 uint8x16_t t, u, v, w;
890 p += 16;
891 data = vld1q_u8 (p);
892 mask = 0xffff;
894 start:
895 t = vceqq_u8 (data, repl_nl);
896 u = vceqq_u8 (data, repl_cr);
897 v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
898 w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
899 t = vandq_u8 (vorrq_u8 (v, w), xmask);
/* Reduce the per-byte bits with pairwise additions; the final OR with
   the value shifted right by 24 combines the two partial results into
   the low bits of FOUND, which MASK then restricts to 16 bits. */
900 l = vpadd_u8 (vget_low_u8 (t), vget_high_u8 (t));
901 m = vpaddl_u8 (l);
902 n = vpaddl_u16 (m);
904 found = vget_lane_u32 ((uint32x2_t) vorr_u64 ((uint64x1_t) n,
905 vshr_n_u64 ((uint64x1_t) n, 24)), 0);
906 found &= mask;
908 while (!found);
910 /* FOUND contains 1 in bits for which we matched a relevant
911 character. Conversion to the byte index is trivial. */
912 found = __builtin_ctz (found);
913 return (const uchar *)p + found;
916 #else
918 /* We only have one accelerated alternative. Use a direct call so that
919 we encourage inlining. */
921 #define search_line_fast search_line_acc_char
923 #endif
925 /* Initialize the lexer if needed. When HAVE_init_vectorized_lexer is
defined this calls init_vectorized_lexer; otherwise the function does
nothing. */
927 void
928 _cpp_init_lexer (void)
930 #ifdef HAVE_init_vectorized_lexer
931 init_vectorized_lexer ();
932 #endif
935 /* Returns with a logical line that contains no escaped newlines or
936 trigraphs. This is a time-critical inner loop. */
/* Starting at PFILE->buffer->next_line, find the end of the next
   logical line, recording a line note for every trigraph and escaped
   newline met on the way.  On return buffer->cur and buffer->line_base
   address the start of the line, the cleaned line ends in '\n', a
   sentinel note follows it, and buffer->next_line points just past the
   input consumed. */
937 void
938 _cpp_clean_line (cpp_reader *pfile)
940 cpp_buffer *buffer;
/* S is the read position; D is the write position once the line has to
   be rewritten in place. */
941 const uchar *s;
942 uchar c, *d, *p;
944 buffer = pfile->buffer;
945 buffer->cur_note = buffer->notes_used = 0;
946 buffer->cur = buffer->line_base = buffer->next_line;
947 buffer->need_line = false;
948 s = buffer->next_line;
950 if (!buffer->from_stage3)
/* Position of the most recent backslash seen on the fast path, if any. */
952 const uchar *pbackslash = NULL;
954 /* Fast path. This is the common case of an un-escaped line with
955 no trigraphs. The primary win here is by not writing any
956 data back to memory until we have to. */
957 while (1)
959 /* Perform an optimized search for \n, \r, \\, ?. */
960 s = search_line_fast (s, buffer->rlimit);
962 c = *s;
963 if (c == '\\')
965 /* Record the location of the backslash and continue. */
966 pbackslash = s++;
968 else if (__builtin_expect (c == '?', 0))
970 if (__builtin_expect (s[1] == '?', false)
971 && _cpp_trigraph_map[s[2]])
973 /* Have a trigraph. We may or may not have to convert
974 it. Add a line note regardless, for -Wtrigraphs. */
975 add_line_note (buffer, s, s[2]);
976 if (CPP_OPTION (pfile, trigraphs))
978 /* We do, and that means we have to switch to the
979 slow path. */
980 d = (uchar *) s;
981 *d = _cpp_trigraph_map[s[2]];
982 s += 2;
983 goto slow_path;
986 /* Not a trigraph. Continue on fast-path. */
987 s++;
989 else
990 break;
993 /* This must be \r or \n. We're either done, or we'll be forced
994 to write back to the buffer and continue on the slow path. */
995 d = (uchar *) s;
997 if (__builtin_expect (s == buffer->rlimit, false))
998 goto done;
1000 /* DOS line ending? */
1001 if (__builtin_expect (c == '\r', false) && s[1] == '\n')
1003 s++;
1004 if (s == buffer->rlimit)
1005 goto done;
/* No backslash on this line, so the newline cannot be escaped. */
1008 if (__builtin_expect (pbackslash == NULL, true))
1009 goto done;
1011 /* Check for escaped newline. */
/* Step back over any non-vertical whitespace before the newline; the
   newline is escaped only if the last backslash seen sits right there. */
1012 p = d;
1013 while (is_nvspace (p[-1]))
1014 p--;
1015 if (p - 1 != pbackslash)
1016 goto done;
1018 /* Have an escaped newline; process it and proceed to
1019 the slow path. */
/* The note type is ' ' when whitespace separated the backslash from the
   newline and '\\' otherwise. */
1020 add_line_note (buffer, p - 1, p != d ? ' ' : '\\');
1021 d = p - 2;
1022 buffer->next_line = p - 1;
1024 slow_path:
/* Slow path: copy characters from S down to D, dropping escaped
   newlines and replacing trigraphs when they are enabled. */
1025 while (1)
1027 c = *++s;
1028 *++d = c;
1030 if (c == '\n' || c == '\r')
1032 /* Handle DOS line endings. */
1033 if (c == '\r' && s != buffer->rlimit && s[1] == '\n')
1034 s++;
1035 if (s == buffer->rlimit)
1036 break;
1038 /* Escaped? */
1039 p = d;
1040 while (p != buffer->next_line && is_nvspace (p[-1]))
1041 p--;
1042 if (p == buffer->next_line || p[-1] != '\\')
1043 break;
1045 add_line_note (buffer, p - 1, p != d ? ' ': '\\');
1046 d = p - 2;
1047 buffer->next_line = p - 1;
1049 else if (c == '?' && s[1] == '?' && _cpp_trigraph_map[s[2]])
1051 /* Add a note regardless, for the benefit of -Wtrigraphs. */
1052 add_line_note (buffer, d, s[2]);
1053 if (CPP_OPTION (pfile, trigraphs))
1055 *d = _cpp_trigraph_map[s[2]];
1056 s += 2;
1061 else
/* buffer->from_stage3 is set: only locate the end of the line; neither
   trigraphs nor escaped newlines are looked for. */
1063 while (*s != '\n' && *s != '\r')
1064 s++;
1065 d = (uchar *) s;
1067 /* Handle DOS line endings. */
1068 if (*s == '\r' && s + 1 != buffer->rlimit && s[1] == '\n')
1069 s++;
1072 done:
/* Terminate the cleaned line with a plain newline, whatever the
   original line ending was. */
1073 *d = '\n';
1074 /* A sentinel note that should never be processed. */
1075 add_line_note (buffer, d + 1, '\n');
1076 buffer->next_line = s + 1;
1079 template <bool lexing_raw_string>
1080 static bool get_fresh_line_impl (cpp_reader *pfile);
1082 /* Return true if the trigraph indicated by NOTE should be warned
1083 about in a comment. */
1084 static bool
1085 warn_in_comment (cpp_reader *pfile, _cpp_line_note *note)
1087 const uchar *p;
1089 /* Within comments we don't warn about trigraphs, unless the
1090 trigraph forms an escaped newline, as that may change
1091 behavior. */
1092 if (note->type != '/')
1093 return false;
1095 /* If -trigraphs, then this was an escaped newline iff the next note
1096 is coincident. */
1097 if (CPP_OPTION (pfile, trigraphs))
1098 return note[1].pos == note->pos;
1100 /* Otherwise, see if this forms an escaped newline. */
1101 p = note->pos + 3;
1102 while (is_nvspace (*p))
1103 p++;
1105 /* There might have been escaped newlines between the trigraph and the
1106 newline we found. Hence the position test. */
1107 return (*p == '\n' && p < note[1].pos);
1110 /* Process the notes created by add_line_note as far as the current
1111 location. */
1112 void
1113 _cpp_process_line_notes (cpp_reader *pfile, int in_comment)
1115 cpp_buffer *buffer = pfile->buffer;
1117 for (;;)
1119 _cpp_line_note *note = &buffer->notes[buffer->cur_note];
1120 unsigned int col;
1122 if (note->pos > buffer->cur)
1123 break;
1125 buffer->cur_note++;
1126 col = CPP_BUF_COLUMN (buffer, note->pos + 1);
1128 if (note->type == '\\' || note->type == ' ')
1130 if (note->type == ' ' && !in_comment)
1131 cpp_error_with_line (pfile, CPP_DL_WARNING, pfile->line_table->highest_line, col,
1132 "backslash and newline separated by space");
1134 if (buffer->next_line > buffer->rlimit)
1136 cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line, col,
1137 "backslash-newline at end of file");
1138 /* Prevent "no newline at end of file" warning. */
1139 buffer->next_line = buffer->rlimit;
1142 buffer->line_base = note->pos;
1143 CPP_INCREMENT_LINE (pfile, 0);
1145 else if (_cpp_trigraph_map[note->type])
1147 if (CPP_OPTION (pfile, warn_trigraphs)
1148 && (!in_comment || warn_in_comment (pfile, note)))
1150 if (CPP_OPTION (pfile, trigraphs))
1151 cpp_warning_with_line (pfile, CPP_W_TRIGRAPHS,
1152 pfile->line_table->highest_line, col,
1153 "trigraph ??%c converted to %c",
1154 note->type,
1155 (int) _cpp_trigraph_map[note->type]);
1156 else
1158 cpp_warning_with_line
1159 (pfile, CPP_W_TRIGRAPHS,
1160 pfile->line_table->highest_line, col,
1161 "trigraph ??%c ignored, use -trigraphs to enable",
1162 note->type);
1166 else if (note->type == 0)
1167 /* Already processed in lex_raw_string. */;
1168 else
1169 abort ();
1173 namespace bidi {
1174 enum class kind {
1175 NONE, LRE, RLE, LRO, RLO, LRI, RLI, FSI, PDF, PDI, LTR, RTL
1178 /* All the UTF-8 encodings of bidi characters start with E2. */
1179 constexpr uchar utf8_start = 0xe2;
1181 struct context
1183 context () {}
1184 context (location_t loc, kind k, bool pdf, bool ucn)
1185 : m_loc (loc), m_kind (k), m_pdf (pdf), m_ucn (ucn)
1189 kind get_pop_kind () const
1191 return m_pdf ? kind::PDF : kind::PDI;
1193 bool ucn_p () const
1195 return m_ucn;
1198 location_t m_loc;
1199 kind m_kind;
1200 unsigned m_pdf : 1;
1201 unsigned m_ucn : 1;
1204 /* A vector holding currently open bidi contexts. We use a char for
1205 each context, its LSB is 1 if it represents a PDF context, 0 if it
1206 represents a PDI context. The next bit is 1 if this context was open
1207 by a bidi character written as a UCN, and 0 when it was UTF-8. */
1208 semi_embedded_vec <context, 16> vec;
1210 /* Close the whole comment/identifier/string literal/character constant
1211 context. */
1212 void on_close ()
1214 vec.truncate (0);
1217 /* Pop the last element in the vector. */
1218 void pop ()
1220 unsigned int len = vec.count ();
1221 gcc_checking_assert (len > 0);
1222 vec.truncate (len - 1);
1225 /* Return the pop kind of the context of the Ith element. */
1226 kind pop_kind_at (unsigned int i)
1228 return vec[i].get_pop_kind ();
1231 /* Return the pop kind of the context that is currently opened. */
1232 kind current_ctx ()
1234 unsigned int len = vec.count ();
1235 if (len == 0)
1236 return kind::NONE;
1237 return vec[len - 1].get_pop_kind ();
1240 /* Return true if the current context comes from a UCN origin, that is,
1241 the bidi char which started this bidi context was written as a UCN. */
1242 bool current_ctx_ucn_p ()
1244 unsigned int len = vec.count ();
1245 gcc_checking_assert (len > 0);
1246 return vec[len - 1].m_ucn;
1249 location_t current_ctx_loc ()
1251 unsigned int len = vec.count ();
1252 gcc_checking_assert (len > 0);
1253 return vec[len - 1].m_loc;
1256 /* We've read a bidi char, update the current vector as necessary.
1257 LOC is only valid when K is not kind::NONE. */
1258 void on_char (kind k, bool ucn_p, location_t loc)
1260 switch (k)
1262 case kind::LRE:
1263 case kind::RLE:
1264 case kind::LRO:
1265 case kind::RLO:
1266 vec.push (context (loc, k, true, ucn_p));
1267 break;
1268 case kind::LRI:
1269 case kind::RLI:
1270 case kind::FSI:
1271 vec.push (context (loc, k, false, ucn_p));
1272 break;
1273 /* PDF terminates the scope of the last LRE, RLE, LRO, or RLO
1274 whose scope has not yet been terminated. */
1275 case kind::PDF:
1276 if (current_ctx () == kind::PDF)
1277 pop ();
1278 break;
1279 /* PDI terminates the scope of the last LRI, RLI, or FSI whose
1280 scope has not yet been terminated, as well as the scopes of
1281 any subsequent LREs, RLEs, LROs, or RLOs whose scopes have not
1282 yet been terminated. */
1283 case kind::PDI:
1284 for (int i = vec.count () - 1; i >= 0; --i)
1285 if (pop_kind_at (i) == kind::PDI)
1287 vec.truncate (i);
1288 break;
1290 break;
1291 case kind::LTR:
1292 case kind::RTL:
1293 /* These aren't popped by a PDF/PDI. */
1294 break;
1295 ATTR_LIKELY case kind::NONE:
1296 break;
1297 default:
1298 abort ();
1302 /* Return a descriptive string for K. */
1303 const char *to_str (kind k)
1305 switch (k)
1307 case kind::LRE:
1308 return "U+202A (LEFT-TO-RIGHT EMBEDDING)";
1309 case kind::RLE:
1310 return "U+202B (RIGHT-TO-LEFT EMBEDDING)";
1311 case kind::LRO:
1312 return "U+202D (LEFT-TO-RIGHT OVERRIDE)";
1313 case kind::RLO:
1314 return "U+202E (RIGHT-TO-LEFT OVERRIDE)";
1315 case kind::LRI:
1316 return "U+2066 (LEFT-TO-RIGHT ISOLATE)";
1317 case kind::RLI:
1318 return "U+2067 (RIGHT-TO-LEFT ISOLATE)";
1319 case kind::FSI:
1320 return "U+2068 (FIRST STRONG ISOLATE)";
1321 case kind::PDF:
1322 return "U+202C (POP DIRECTIONAL FORMATTING)";
1323 case kind::PDI:
1324 return "U+2069 (POP DIRECTIONAL ISOLATE)";
1325 case kind::LTR:
1326 return "U+200E (LEFT-TO-RIGHT MARK)";
1327 case kind::RTL:
1328 return "U+200F (RIGHT-TO-LEFT MARK)";
1329 default:
1330 abort ();
1335 /* Get location_t for the range of bytes [START, START + NUM_BYTES)
1336 within the current line in FILE, with the caret at START. */
1338 static location_t
1339 get_location_for_byte_range_in_cur_line (cpp_reader *pfile,
1340 const unsigned char *const start,
1341 size_t num_bytes)
1343 gcc_checking_assert (num_bytes > 0);
1345 /* CPP_BUF_COLUMN and linemap_position_for_column both refer
1346 to offsets in bytes, but CPP_BUF_COLUMN is 0-based,
1347 whereas linemap_position_for_column is 1-based. */
1349 /* Get 0-based offsets within the line. */
1350 size_t start_offset = CPP_BUF_COLUMN (pfile->buffer, start);
1351 size_t end_offset = start_offset + num_bytes - 1;
1353 /* Now convert to location_t, where "columns" are 1-based byte offsets. */
1354 location_t start_loc = linemap_position_for_column (pfile->line_table,
1355 start_offset + 1);
1356 location_t end_loc = linemap_position_for_column (pfile->line_table,
1357 end_offset + 1);
1359 if (start_loc == end_loc)
1360 return start_loc;
1362 source_range src_range;
1363 src_range.m_start = start_loc;
1364 src_range.m_finish = end_loc;
1365 location_t combined_loc = COMBINE_LOCATION_DATA (pfile->line_table,
1366 start_loc,
1367 src_range,
1368 NULL,
1370 return combined_loc;
1373 /* Parse a sequence of 3 bytes starting with P and return its bidi code. */
1375 static bidi::kind
1376 get_bidi_utf8_1 (const unsigned char *const p)
1378 gcc_checking_assert (p[0] == bidi::utf8_start);
1380 if (p[1] == 0x80)
1381 switch (p[2])
1383 case 0xaa:
1384 return bidi::kind::LRE;
1385 case 0xab:
1386 return bidi::kind::RLE;
1387 case 0xac:
1388 return bidi::kind::PDF;
1389 case 0xad:
1390 return bidi::kind::LRO;
1391 case 0xae:
1392 return bidi::kind::RLO;
1393 case 0x8e:
1394 return bidi::kind::LTR;
1395 case 0x8f:
1396 return bidi::kind::RTL;
1397 default:
1398 break;
1400 else if (p[1] == 0x81)
1401 switch (p[2])
1403 case 0xa6:
1404 return bidi::kind::LRI;
1405 case 0xa7:
1406 return bidi::kind::RLI;
1407 case 0xa8:
1408 return bidi::kind::FSI;
1409 case 0xa9:
1410 return bidi::kind::PDI;
1411 default:
1412 break;
1415 return bidi::kind::NONE;
1418 /* Parse a sequence of 3 bytes starting with P and return its bidi code.
1419 If the kind is not NONE, write the location to *OUT.*/
1421 static bidi::kind
1422 get_bidi_utf8 (cpp_reader *pfile, const unsigned char *const p, location_t *out)
1424 bidi::kind result = get_bidi_utf8_1 (p);
1425 if (result != bidi::kind::NONE)
1427 /* We have a sequence of 3 bytes starting at P. */
1428 *out = get_location_for_byte_range_in_cur_line (pfile, p, 3);
1430 return result;
1433 /* Parse a UCN where P points just past \u or \U and return its bidi code. */
1435 static bidi::kind
1436 get_bidi_ucn_1 (const unsigned char *p, bool is_U, const unsigned char **end)
1438 /* 6.4.3 Universal Character Names
1439 \u hex-quad
1440 \U hex-quad hex-quad
1441 \u { simple-hexadecimal-digit-sequence }
1442 where \unnnn means \U0000nnnn. */
1444 *end = p + 4;
1445 if (is_U)
1447 if (p[0] != '0' || p[1] != '0' || p[2] != '0' || p[3] != '0')
1448 return bidi::kind::NONE;
1449 /* Skip 4B so we can treat \u and \U the same below. */
1450 p += 4;
1451 *end += 4;
1453 else if (p[0] == '{')
1455 p++;
1456 while (*p == '0')
1457 p++;
1458 if (p[0] != '2'
1459 || p[1] != '0'
1460 || !ISXDIGIT (p[2])
1461 || !ISXDIGIT (p[3])
1462 || p[4] != '}')
1463 return bidi::kind::NONE;
1464 *end = p + 5;
1467 /* All code points we are looking for start with 20xx. */
1468 if (p[0] != '2' || p[1] != '0')
1469 return bidi::kind::NONE;
1470 else if (p[2] == '2')
1471 switch (p[3])
1473 case 'a':
1474 case 'A':
1475 return bidi::kind::LRE;
1476 case 'b':
1477 case 'B':
1478 return bidi::kind::RLE;
1479 case 'c':
1480 case 'C':
1481 return bidi::kind::PDF;
1482 case 'd':
1483 case 'D':
1484 return bidi::kind::LRO;
1485 case 'e':
1486 case 'E':
1487 return bidi::kind::RLO;
1488 default:
1489 break;
1491 else if (p[2] == '6')
1492 switch (p[3])
1494 case '6':
1495 return bidi::kind::LRI;
1496 case '7':
1497 return bidi::kind::RLI;
1498 case '8':
1499 return bidi::kind::FSI;
1500 case '9':
1501 return bidi::kind::PDI;
1502 default:
1503 break;
1505 else if (p[2] == '0')
1506 switch (p[3])
1508 case 'e':
1509 case 'E':
1510 return bidi::kind::LTR;
1511 case 'f':
1512 case 'F':
1513 return bidi::kind::RTL;
1514 default:
1515 break;
1518 return bidi::kind::NONE;
1521 /* Parse a UCN where P points just past \u or \U and return its bidi code.
1522 If the kind is not NONE, write the location to *OUT. */
1524 static bidi::kind
1525 get_bidi_ucn (cpp_reader *pfile, const unsigned char *p, bool is_U,
1526 location_t *out)
1528 const unsigned char *end;
1529 bidi::kind result = get_bidi_ucn_1 (p, is_U, &end);
1530 if (result != bidi::kind::NONE)
1532 const unsigned char *start = p - 2;
1533 size_t num_bytes = end - start;
1534 *out = get_location_for_byte_range_in_cur_line (pfile, start, num_bytes);
1536 return result;
1539 /* Parse a named universal character escape where P points just past \N and
1540 return its bidi code. If the kind is not NONE, write the location to
1541 *OUT. */
1543 static bidi::kind
1544 get_bidi_named (cpp_reader *pfile, const unsigned char *p, location_t *out)
1546 bidi::kind result = bidi::kind::NONE;
1547 if (*p != '{')
1548 return bidi::kind::NONE;
1549 if (strncmp ((const char *) (p + 1), "LEFT-TO-RIGHT ", 14) == 0)
1551 if (strncmp ((const char *) (p + 15), "MARK}", 5) == 0)
1552 result = bidi::kind::LTR;
1553 else if (strncmp ((const char *) (p + 15), "EMBEDDING}", 10) == 0)
1554 result = bidi::kind::LRE;
1555 else if (strncmp ((const char *) (p + 15), "OVERRIDE}", 9) == 0)
1556 result = bidi::kind::LRO;
1557 else if (strncmp ((const char *) (p + 15), "ISOLATE}", 8) == 0)
1558 result = bidi::kind::LRI;
1560 else if (strncmp ((const char *) (p + 1), "RIGHT-TO-LEFT ", 14) == 0)
1562 if (strncmp ((const char *) (p + 15), "MARK}", 5) == 0)
1563 result = bidi::kind::RTL;
1564 else if (strncmp ((const char *) (p + 15), "EMBEDDING}", 10) == 0)
1565 result = bidi::kind::RLE;
1566 else if (strncmp ((const char *) (p + 15), "OVERRIDE}", 9) == 0)
1567 result = bidi::kind::RLO;
1568 else if (strncmp ((const char *) (p + 15), "ISOLATE}", 8) == 0)
1569 result = bidi::kind::RLI;
1571 else if (strncmp ((const char *) (p + 1), "POP DIRECTIONAL ", 16) == 0)
1573 if (strncmp ((const char *) (p + 16), "FORMATTING}", 11) == 0)
1574 result = bidi::kind::PDF;
1575 else if (strncmp ((const char *) (p + 16), "ISOLATE}", 8) == 0)
1576 result = bidi::kind::PDI;
1578 else if (strncmp ((const char *) (p + 1), "FIRST STRONG ISOLATE}", 21) == 0)
1579 result = bidi::kind::FSI;
1580 if (result != bidi::kind::NONE)
1581 *out = get_location_for_byte_range_in_cur_line (pfile, p - 2,
1582 (strchr ((const char *)
1583 (p + 1), '}')
1584 - (const char *) p)
1585 + 3);
1586 return result;
1589 /* Subclass of rich_location for reporting on unpaired UTF-8
1590 bidirectional control character(s).
1591 Escape the source lines on output, and show all unclosed
1592 bidi context, labelling everything. */
1594 class unpaired_bidi_rich_location : public rich_location
1596 public:
1597 class custom_range_label : public range_label
1599 public:
1600 label_text get_text (unsigned range_idx) const final override
1602 /* range 0 is the primary location; each subsequent range i + 1
1603 is for bidi::vec[i]. */
1604 if (range_idx > 0)
1606 const bidi::context &ctxt (bidi::vec[range_idx - 1]);
1607 return label_text::borrow (bidi::to_str (ctxt.m_kind));
1609 else
1610 return label_text::borrow (_("end of bidirectional context"));
1614 unpaired_bidi_rich_location (cpp_reader *pfile, location_t loc)
1615 : rich_location (pfile->line_table, loc, &m_custom_label)
1617 set_escape_on_output (true);
1618 for (unsigned i = 0; i < bidi::vec.count (); i++)
1619 add_range (bidi::vec[i].m_loc,
1620 SHOW_RANGE_WITHOUT_CARET,
1621 &m_custom_label);
1624 private:
1625 custom_range_label m_custom_label;
1628 /* We're closing a bidi context, that is, we've encountered a newline,
1629 are closing a C-style comment, or are at the end of a string literal,
1630 character constant, or identifier. Warn if this context was not
1631 properly terminated by a PDI or PDF. P points to the last character
1632 in this context. */
1634 static void
1635 maybe_warn_bidi_on_close (cpp_reader *pfile, const uchar *p)
1637 const auto warn_bidi = CPP_OPTION (pfile, cpp_warn_bidirectional);
1638 if (bidi::vec.count () > 0
1639 && (warn_bidi & bidirectional_unpaired
1640 && (!bidi::current_ctx_ucn_p ()
1641 || (warn_bidi & bidirectional_ucn))))
1643 const location_t loc
1644 = linemap_position_for_column (pfile->line_table,
1645 CPP_BUF_COLUMN (pfile->buffer, p));
1646 unpaired_bidi_rich_location rich_loc (pfile, loc);
1647 /* cpp_callbacks doesn't yet have a way to handle singular vs plural
1648 forms of a diagnostic, so fake it for now. */
1649 if (bidi::vec.count () > 1)
1650 cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
1651 "unpaired UTF-8 bidirectional control characters "
1652 "detected");
1653 else
1654 cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
1655 "unpaired UTF-8 bidirectional control character "
1656 "detected");
1658 /* We're done with this context. */
1659 bidi::on_close ();
1662 /* We're at the beginning or in the middle of an identifier/comment/string
1663 literal/character constant. Warn if we've encountered a bidi character.
1664 KIND says which bidi control character it was; UCN_P is true iff this bidi
1665 control character was written as a UCN. LOC is the location of the
1666 character, but is only valid if KIND != bidi::kind::NONE. */
1668 static void
1669 maybe_warn_bidi_on_char (cpp_reader *pfile, bidi::kind kind,
1670 bool ucn_p, location_t loc)
1672 if (__builtin_expect (kind == bidi::kind::NONE, 1))
1673 return;
1675 const auto warn_bidi = CPP_OPTION (pfile, cpp_warn_bidirectional);
1677 if (warn_bidi & (bidirectional_unpaired|bidirectional_any))
1679 rich_location rich_loc (pfile->line_table, loc);
1680 rich_loc.set_escape_on_output (true);
1682 /* It seems excessive to warn about a PDI/PDF that is closing
1683 an opened context because we've already warned about the
1684 opening character. Except warn when we have a UCN x UTF-8
1685 mismatch, if UCN checking is enabled. */
1686 if (kind == bidi::current_ctx ())
1688 if (warn_bidi == (bidirectional_unpaired|bidirectional_ucn)
1689 && bidi::current_ctx_ucn_p () != ucn_p)
1691 rich_loc.add_range (bidi::current_ctx_loc ());
1692 cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
1693 "UTF-8 vs UCN mismatch when closing "
1694 "a context by \"%s\"", bidi::to_str (kind));
1697 else if (warn_bidi & bidirectional_any
1698 && (!ucn_p || (warn_bidi & bidirectional_ucn)))
1700 if (kind == bidi::kind::PDF || kind == bidi::kind::PDI)
1701 cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
1702 "\"%s\" is closing an unopened context",
1703 bidi::to_str (kind));
1704 else
1705 cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
1706 "found problematic Unicode character \"%s\"",
1707 bidi::to_str (kind));
1710 /* We're done with this context. */
1711 bidi::on_char (kind, ucn_p, loc);
1714 static const cppchar_t utf8_continuation = 0x80;
1715 static const cppchar_t utf8_signifier = 0xC0;
1717 /* Emit -Winvalid-utf8 warning on invalid UTF-8 character starting
1718 at PFILE->buffer->cur. Return a pointer after the diagnosed
1719 invalid character. */
1721 static const uchar *
1722 _cpp_warn_invalid_utf8 (cpp_reader *pfile)
1724 cpp_buffer *buffer = pfile->buffer;
1725 const uchar *cur = buffer->cur;
1726 bool pedantic = (CPP_PEDANTIC (pfile)
1727 && CPP_OPTION (pfile, cpp_warn_invalid_utf8) == 2);
1729 if (cur[0] < utf8_signifier
1730 || cur[1] < utf8_continuation || cur[1] >= utf8_signifier)
1732 if (pedantic)
1733 cpp_error_with_line (pfile, CPP_DL_PEDWARN,
1734 pfile->line_table->highest_line,
1735 CPP_BUF_COL (buffer),
1736 "invalid UTF-8 character <%x>",
1737 cur[0]);
1738 else
1739 cpp_warning_with_line (pfile, CPP_W_INVALID_UTF8,
1740 pfile->line_table->highest_line,
1741 CPP_BUF_COL (buffer),
1742 "invalid UTF-8 character <%x>",
1743 cur[0]);
1744 return cur + 1;
1746 else if (cur[2] < utf8_continuation || cur[2] >= utf8_signifier)
1748 if (pedantic)
1749 cpp_error_with_line (pfile, CPP_DL_PEDWARN,
1750 pfile->line_table->highest_line,
1751 CPP_BUF_COL (buffer),
1752 "invalid UTF-8 character <%x><%x>",
1753 cur[0], cur[1]);
1754 else
1755 cpp_warning_with_line (pfile, CPP_W_INVALID_UTF8,
1756 pfile->line_table->highest_line,
1757 CPP_BUF_COL (buffer),
1758 "invalid UTF-8 character <%x><%x>",
1759 cur[0], cur[1]);
1760 return cur + 2;
1762 else if (cur[3] < utf8_continuation || cur[3] >= utf8_signifier)
1764 if (pedantic)
1765 cpp_error_with_line (pfile, CPP_DL_PEDWARN,
1766 pfile->line_table->highest_line,
1767 CPP_BUF_COL (buffer),
1768 "invalid UTF-8 character <%x><%x><%x>",
1769 cur[0], cur[1], cur[2]);
1770 else
1771 cpp_warning_with_line (pfile, CPP_W_INVALID_UTF8,
1772 pfile->line_table->highest_line,
1773 CPP_BUF_COL (buffer),
1774 "invalid UTF-8 character <%x><%x><%x>",
1775 cur[0], cur[1], cur[2]);
1776 return cur + 3;
1778 else
1780 if (pedantic)
1781 cpp_error_with_line (pfile, CPP_DL_PEDWARN,
1782 pfile->line_table->highest_line,
1783 CPP_BUF_COL (buffer),
1784 "invalid UTF-8 character <%x><%x><%x><%x>",
1785 cur[0], cur[1], cur[2], cur[3]);
1786 else
1787 cpp_warning_with_line (pfile, CPP_W_INVALID_UTF8,
1788 pfile->line_table->highest_line,
1789 CPP_BUF_COL (buffer),
1790 "invalid UTF-8 character <%x><%x><%x><%x>",
1791 cur[0], cur[1], cur[2], cur[3]);
1792 return cur + 4;
1796 /* Helper function of *skip_*_comment and lex*_string. For C,
1797 character at CUR[-1] with MSB set handle -Wbidi-chars* and
1798 -Winvalid-utf8 diagnostics and return pointer to first character
1799 that should be processed next. */
1801 static inline const uchar *
1802 _cpp_handle_multibyte_utf8 (cpp_reader *pfile, uchar c,
1803 const uchar *cur, bool warn_bidi_p,
1804 bool warn_invalid_utf8_p)
1806 /* If this is a beginning of a UTF-8 encoding, it might be
1807 a bidirectional control character. */
1808 if (c == bidi::utf8_start && warn_bidi_p)
1810 location_t loc;
1811 bidi::kind kind = get_bidi_utf8 (pfile, cur - 1, &loc);
1812 maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
1814 if (!warn_invalid_utf8_p)
1815 return cur;
1816 if (c >= utf8_signifier)
1818 cppchar_t s;
1819 const uchar *pstr = cur - 1;
1820 if (_cpp_valid_utf8 (pfile, &pstr, pfile->buffer->rlimit, 0, NULL, &s)
1821 && s <= UCS_LIMIT)
1822 return pstr;
1824 pfile->buffer->cur = cur - 1;
1825 return _cpp_warn_invalid_utf8 (pfile);
1828 /* Skip a C-style block comment. We find the end of the comment by
1829 seeing if an asterisk is before every '/' we encounter. Returns
1830 nonzero if comment terminated by EOF, zero otherwise.
1832 Buffer->cur points to the initial asterisk of the comment. */
/* On the normal (zero) return, buffer->cur has been advanced past the
   closing '/' and pending line notes have been processed as for a
   comment.  Bidi and invalid-UTF-8 diagnostics are only attempted when
   the corresponding options are enabled.  */
1833 bool
1834 _cpp_skip_block_comment (cpp_reader *pfile)
1836 cpp_buffer *buffer = pfile->buffer;
/* CUR is a local scan pointer; buffer->cur is only written back at the
   points below where other routines need to see it.  */
1837 const uchar *cur = buffer->cur;
1838 uchar c;
1839 const bool warn_bidi_p = pfile->warn_bidi_p ();
1840 const bool warn_invalid_utf8_p = CPP_OPTION (pfile, cpp_warn_invalid_utf8);
1841 const bool warn_bidi_or_invalid_utf8_p = warn_bidi_p | warn_invalid_utf8_p;
/* Step past the opening '*'.  If a '/' follows immediately, step over
   it as well so that the cur[-2] == '*' test in the loop cannot pair it
   with the opening asterisk.  */
1843 cur++;
1844 if (*cur == '/')
1845 cur++;
1847 for (;;)
1849 /* People like decorating comments with '*', so check for '/'
1850 instead for efficiency. */
1851 c = *cur++;
1853 if (c == '/')
/* CUR is now one past the '/', so cur[-2] is the byte before it.  */
1855 if (cur[-2] == '*')
1857 if (warn_bidi_p)
1858 maybe_warn_bidi_on_close (pfile, cur);
1859 break;
1862 /* Warn about potential nested comments, but not if the '/'
1863 comes immediately before the true comment delimiter.
1864 Don't bother to get it right across escaped newlines. */
1865 if (CPP_OPTION (pfile, warn_comments)
1866 && cur[0] == '*' && cur[1] != '/')
/* buffer->cur is synced before warning; presumably so that
   CPP_BUF_COL (buffer) reports the column of this spot.  */
1868 buffer->cur = cur;
1869 cpp_warning_with_line (pfile, CPP_W_COMMENTS,
1870 pfile->line_table->highest_line,
1871 CPP_BUF_COL (buffer),
1872 "\"/*\" within comment");
1875 else if (c == '\n')
1877 unsigned int cols;
/* Point buffer->cur back at the newline itself before closing any bidi
   context and processing the line notes recorded so far.  */
1878 buffer->cur = cur - 1;
1879 if (warn_bidi_p)
1880 maybe_warn_bidi_on_close (pfile, cur);
1881 _cpp_process_line_notes (pfile, true);
/* No further line to read: report the comment as terminated by EOF.  */
1882 if (buffer->next_line >= buffer->rlimit)
1883 return true;
1884 _cpp_clean_line (pfile);
1886 cols = buffer->next_line - buffer->line_base;
1887 CPP_INCREMENT_LINE (pfile, cols);
/* Resume scanning from buffer->cur (presumably refreshed by
   _cpp_clean_line for the new line).  */
1889 cur = buffer->cur;
/* Bytes with the MSB set are only examined when bidi or invalid-UTF-8
   diagnostics are enabled; the helper returns where to continue.  */
1891 else if (__builtin_expect (c >= utf8_continuation, 0)
1892 && warn_bidi_or_invalid_utf8_p)
1893 cur = _cpp_handle_multibyte_utf8 (pfile, c, cur, warn_bidi_p,
1894 warn_invalid_utf8_p);
/* Comment closed normally: publish the scan position and flush the
   remaining line notes.  */
1897 buffer->cur = cur;
1898 _cpp_process_line_notes (pfile, true);
1899 return false;
1902 /* Skip a C++ line comment, leaving buffer->cur pointing to the
1903 terminating newline. Handles escaped newlines. Returns nonzero
1904 if a multiline comment. */
1905 static int
1906 skip_line_comment (cpp_reader *pfile)
1908 cpp_buffer *buffer = pfile->buffer;
1909 location_t orig_line = pfile->line_table->highest_line;
1910 const bool warn_bidi_p = pfile->warn_bidi_p ();
1911 const bool warn_invalid_utf8_p = CPP_OPTION (pfile, cpp_warn_invalid_utf8);
1912 const bool warn_bidi_or_invalid_utf8_p = warn_bidi_p | warn_invalid_utf8_p;
1914 if (!warn_bidi_or_invalid_utf8_p)
1915 while (*buffer->cur != '\n')
1916 buffer->cur++;
1917 else if (!warn_invalid_utf8_p)
1919 while (*buffer->cur != '\n'
1920 && *buffer->cur != bidi::utf8_start)
1921 buffer->cur++;
1922 if (__builtin_expect (*buffer->cur == bidi::utf8_start, 0))
1924 while (*buffer->cur != '\n')
1926 if (__builtin_expect (*buffer->cur == bidi::utf8_start, 0))
1928 location_t loc;
1929 bidi::kind kind = get_bidi_utf8 (pfile, buffer->cur, &loc);
1930 maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
1932 buffer->cur++;
1934 maybe_warn_bidi_on_close (pfile, buffer->cur);
1937 else
1939 while (*buffer->cur != '\n')
1941 if (*buffer->cur < utf8_continuation)
1943 buffer->cur++;
1944 continue;
1946 buffer->cur
1947 = _cpp_handle_multibyte_utf8 (pfile, *buffer->cur, buffer->cur + 1,
1948 warn_bidi_p, warn_invalid_utf8_p);
1950 if (warn_bidi_p)
1951 maybe_warn_bidi_on_close (pfile, buffer->cur);
1954 _cpp_process_line_notes (pfile, true);
1955 return orig_line != pfile->line_table->highest_line;
1958 /* Skips whitespace, saving the next non-whitespace character. */
1959 static void
1960 skip_whitespace (cpp_reader *pfile, cppchar_t c)
1962 cpp_buffer *buffer = pfile->buffer;
1963 bool saw_NUL = false;
1967 /* Horizontal space always OK. */
1968 if (c == ' ' || c == '\t')
1970 /* Just \f \v or \0 left. */
1971 else if (c == '\0')
1972 saw_NUL = true;
1973 else if (pfile->state.in_directive && CPP_PEDANTIC (pfile))
1974 cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line,
1975 CPP_BUF_COL (buffer),
1976 "%s in preprocessing directive",
1977 c == '\f' ? "form feed" : "vertical tab");
1979 c = *buffer->cur++;
1981 /* We only want non-vertical space, i.e. ' ' \t \f \v \0. */
1982 while (is_nvspace (c));
1984 if (saw_NUL)
1986 encoding_rich_location rich_loc (pfile);
1987 cpp_error_at (pfile, CPP_DL_WARNING, &rich_loc,
1988 "null character(s) ignored");
1991 buffer->cur--;
1994 /* See if the characters of a number token are valid in a name (no
1995 '.', '+' or '-'). */
1996 static int
1997 name_p (cpp_reader *pfile, const cpp_string *string)
1999 unsigned int i;
2001 for (i = 0; i < string->len; i++)
2002 if (!is_idchar (string->text[i]))
2003 return 0;
2005 return 1;
2008 /* After parsing an identifier or other sequence, produce a warning about
2009 sequences not in NFC/NFKC. */
2010 static void
2011 warn_about_normalization (cpp_reader *pfile,
2012 const cpp_token *token,
2013 const struct normalize_state *s,
2014 bool identifier)
2016 if (CPP_OPTION (pfile, warn_normalize) < NORMALIZE_STATE_RESULT (s)
2017 && !pfile->state.skipping)
2019 location_t loc = token->src_loc;
2021 /* If possible, create a location range for the token. */
2022 if (loc >= RESERVED_LOCATION_COUNT
2023 && token->type != CPP_EOF
2024 /* There must be no line notes to process. */
2025 && (!(pfile->buffer->cur
2026 >= pfile->buffer->notes[pfile->buffer->cur_note].pos
2027 && !pfile->overlaid_buffer)))
2029 source_range tok_range;
2030 tok_range.m_start = loc;
2031 tok_range.m_finish
2032 = linemap_position_for_column (pfile->line_table,
2033 CPP_BUF_COLUMN (pfile->buffer,
2034 pfile->buffer->cur));
2035 loc = COMBINE_LOCATION_DATA (pfile->line_table,
2036 loc, tok_range, NULL, 0);
2039 encoding_rich_location rich_loc (pfile, loc);
2041 /* Make sure that the token is printed using UCNs, even
2042 if we'd otherwise happily print UTF-8. */
2043 unsigned char *buf = XNEWVEC (unsigned char, cpp_token_len (token));
2044 size_t sz;
2046 sz = cpp_spell_token (pfile, token, buf, false) - buf;
2047 if (NORMALIZE_STATE_RESULT (s) == normalized_C)
2048 cpp_warning_at (pfile, CPP_W_NORMALIZE, &rich_loc,
2049 "`%.*s' is not in NFKC", (int) sz, buf);
2050 else if (identifier && CPP_OPTION (pfile, xid_identifiers))
2051 cpp_pedwarning_at (pfile, CPP_W_NORMALIZE, &rich_loc,
2052 "`%.*s' is not in NFC", (int) sz, buf);
2053 else
2054 cpp_warning_at (pfile, CPP_W_NORMALIZE, &rich_loc,
2055 "`%.*s' is not in NFC", (int) sz, buf);
2056 free (buf);
2060 /* Returns TRUE if the sequence starting at buffer->cur is valid in
2061 an identifier. FIRST is TRUE if this starts an identifier. */
2063 static bool
2064 forms_identifier_p (cpp_reader *pfile, int first,
2065 struct normalize_state *state)
2067 cpp_buffer *buffer = pfile->buffer;
2068 const bool warn_bidi_p = pfile->warn_bidi_p ();
2070 if (*buffer->cur == '$')
2072 if (!CPP_OPTION (pfile, dollars_in_ident))
2073 return false;
2075 buffer->cur++;
2076 if (CPP_OPTION (pfile, warn_dollars) && !pfile->state.skipping)
2078 CPP_OPTION (pfile, warn_dollars) = 0;
2079 cpp_error (pfile, CPP_DL_PEDWARN, "'$' in identifier or number");
2082 return true;
2085 /* Is this a syntactically valid UCN or a valid UTF-8 char? */
2086 if (CPP_OPTION (pfile, extended_identifiers))
2088 cppchar_t s;
2089 if (*buffer->cur >= utf8_signifier)
2091 if (__builtin_expect (*buffer->cur == bidi::utf8_start, 0)
2092 && warn_bidi_p)
2094 location_t loc;
2095 bidi::kind kind = get_bidi_utf8 (pfile, buffer->cur, &loc);
2096 maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
2098 if (_cpp_valid_utf8 (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
2099 state, &s))
2100 return true;
2102 else if (*buffer->cur == '\\'
2103 && (buffer->cur[1] == 'u'
2104 || buffer->cur[1] == 'U'
2105 || buffer->cur[1] == 'N'))
2107 buffer->cur += 2;
2108 if (warn_bidi_p)
2110 location_t loc;
2111 bidi::kind kind;
2112 if (buffer->cur[-1] == 'N')
2113 kind = get_bidi_named (pfile, buffer->cur, &loc);
2114 else
2115 kind = get_bidi_ucn (pfile, buffer->cur,
2116 buffer->cur[-1] == 'U', &loc);
2117 maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/true, loc);
2119 if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
2120 state, &s, NULL, NULL))
2121 return true;
2122 buffer->cur -= 2;
2126 return false;
2129 /* Helper function to issue error about improper __VA_OPT__ use. */
2130 static void
2131 maybe_va_opt_error (cpp_reader *pfile)
2133 if (CPP_PEDANTIC (pfile) && !CPP_OPTION (pfile, va_opt))
2135 /* __VA_OPT__ should not be accepted at all, but allow it in
2136 system headers. */
2137 if (!_cpp_in_system_header (pfile))
2138 cpp_error (pfile, CPP_DL_PEDWARN,
2139 "__VA_OPT__ is not available until C++20");
2141 else if (!pfile->state.va_args_ok)
2143 /* __VA_OPT__ should only appear in the replacement list of a
2144 variadic macro. */
2145 cpp_error (pfile, CPP_DL_PEDWARN,
2146 "__VA_OPT__ can only appear in the expansion"
2147 " of a C++20 variadic macro");
2151 /* Helper function to get the cpp_hashnode of the identifier BASE. */
2152 static cpp_hashnode *
2153 lex_identifier_intern (cpp_reader *pfile, const uchar *base)
2155 cpp_hashnode *result;
2156 const uchar *cur;
2157 unsigned int len;
2158 unsigned int hash = HT_HASHSTEP (0, *base);
2160 cur = base + 1;
2161 while (ISIDNUM (*cur))
2163 hash = HT_HASHSTEP (hash, *cur);
2164 cur++;
2166 len = cur - base;
2167 hash = HT_HASHFINISH (hash, len);
2168 result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
2169 base, len, hash, HT_ALLOC));
2171 /* Rarely, identifiers require diagnostics when lexed. */
2172 if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
2173 && !pfile->state.skipping, 0))
2175 /* It is allowed to poison the same identifier twice. */
2176 if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
2177 cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
2178 NODE_NAME (result));
2180 /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
2181 replacement list of a variadic macro. */
2182 if (result == pfile->spec_nodes.n__VA_ARGS__
2183 && !pfile->state.va_args_ok)
2185 if (CPP_OPTION (pfile, cplusplus))
2186 cpp_error (pfile, CPP_DL_PEDWARN,
2187 "__VA_ARGS__ can only appear in the expansion"
2188 " of a C++11 variadic macro");
2189 else
2190 cpp_error (pfile, CPP_DL_PEDWARN,
2191 "__VA_ARGS__ can only appear in the expansion"
2192 " of a C99 variadic macro");
2195 if (result == pfile->spec_nodes.n__VA_OPT__)
2196 maybe_va_opt_error (pfile);
2198 /* For -Wc++-compat, warn about use of C++ named operators. */
2199 if (result->flags & NODE_WARN_OPERATOR)
2200 cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
2201 "identifier \"%s\" is a special operator name in C++",
2202 NODE_NAME (result));
2205 return result;
2208 /* Get the cpp_hashnode of an identifier specified by NAME in
2209 the current cpp_reader object. If none is found, NULL is returned. */
2210 cpp_hashnode *
2211 _cpp_lex_identifier (cpp_reader *pfile, const char *name)
2213 cpp_hashnode *result;
2214 result = lex_identifier_intern (pfile, (uchar *) name);
2215 return result;
2218 /* Lex an identifier starting at BUFFER->CUR - 1. */
/* BASE is the first byte of the identifier; the hash is seeded from
   *BASE.  Unless STARTS_UCN is set, the run of ISIDNUM bytes at
   PFILE->buffer->cur is consumed and hashed first.  If STARTS_UCN is
   set or forms_identifier_p reports that more follows, the slower
   path is taken: the result comes from _cpp_interpret_identifier and
   *SPELLING from cpp_lookup on the raw bytes.  Otherwise the node is
   looked up (and allocated if needed) in PFILE->hash_table and
   *SPELLING is the same node.  PFILE->buffer->cur is advanced past
   the identifier.  Returns the resulting node after issuing any
   diagnostics it is flagged for.  */
2219 static cpp_hashnode *
2220 lex_identifier (cpp_reader *pfile, const uchar *base, bool starts_ucn,
2221 struct normalize_state *nst, cpp_hashnode **spelling)
2223 cpp_hashnode *result;
2224 const uchar *cur;
2225 unsigned int len;
2226 unsigned int hash = HT_HASHSTEP (0, *base);
2227 const bool warn_bidi_p = pfile->warn_bidi_p ();
2229 cur = pfile->buffer->cur;
2230 if (! starts_ucn)
/* Fast scan: hash plain identifier bytes as they are consumed. */
2232 while (ISIDNUM (*cur))
2234 hash = HT_HASHSTEP (hash, *cur);
2235 cur++;
2237 NORMALIZE_STATE_UPDATE_IDNUM (nst, *(cur - 1));
2239 pfile->buffer->cur = cur;
2240 if (starts_ucn || forms_identifier_p (pfile, false, nst))
2242 /* Slower version for identifiers containing UCNs
2243 or extended chars (including $). */
2244 do {
2245 while (ISIDNUM (*pfile->buffer->cur))
2247 NORMALIZE_STATE_UPDATE_IDNUM (nst, *pfile->buffer->cur);
2248 pfile->buffer->cur++;
2250 } while (forms_identifier_p (pfile, false, nst));
2251 if (warn_bidi_p)
2252 maybe_warn_bidi_on_close (pfile, pfile->buffer->cur);
2253 result = _cpp_interpret_identifier (pfile, base,
2254 pfile->buffer->cur - base);
2255 *spelling = cpp_lookup (pfile, base, pfile->buffer->cur - base);
2257 else
/* Plain identifier: the hash accumulated above is finished with the
   length and used directly for the table lookup. */
2259 len = cur - base;
2260 hash = HT_HASHFINISH (hash, len);
2262 result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
2263 base, len, hash, HT_ALLOC));
2264 *spelling = result;
2267 /* Rarely, identifiers require diagnostics when lexed. */
2268 if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
2269 && !pfile->state.skipping, 0))
2271 /* It is allowed to poison the same identifier twice. */
2272 if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
2273 cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
2274 NODE_NAME (result));
2276 /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
2277 replacement list of a variadic macro. */
2278 if (result == pfile->spec_nodes.n__VA_ARGS__
2279 && !pfile->state.va_args_ok)
2281 if (CPP_OPTION (pfile, cplusplus))
2282 cpp_error (pfile, CPP_DL_PEDWARN,
2283 "__VA_ARGS__ can only appear in the expansion"
2284 " of a C++11 variadic macro");
2285 else
2286 cpp_error (pfile, CPP_DL_PEDWARN,
2287 "__VA_ARGS__ can only appear in the expansion"
2288 " of a C99 variadic macro");
2291 /* __VA_OPT__ should only appear in the replacement list of a
2292 variadic macro. */
2293 if (result == pfile->spec_nodes.n__VA_OPT__)
2294 maybe_va_opt_error (pfile);
2296 /* For -Wc++-compat, warn about use of C++ named operators. */
2297 if (result->flags & NODE_WARN_OPERATOR)
2298 cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
2299 "identifier \"%s\" is a special operator name in C++",
2300 NODE_NAME (result));
2303 return result;
2306 /* Lex a number to NUMBER starting at BUFFER->CUR - 1. */
/* The scan accepts ISIDNUM bytes, '.', digit separators and signs
   accepted by VALID_SIGN, and repeats for as long as
   forms_identifier_p reports that the token continues.  NST is
   updated for every byte consumed by the inner loop.  On return
   PFILE->buffer->cur is past the number, and NUMBER->text is a
   NUL-terminated copy of NUMBER->len bytes obtained from
   _cpp_unaligned_alloc.  */
2307 static void
2308 lex_number (cpp_reader *pfile, cpp_string *number,
2309 struct normalize_state *nst)
2311 const uchar *cur;
2312 const uchar *base;
2313 uchar *dest;
2315 base = pfile->buffer->cur - 1;
/* ADJ_DIGIT_SEP remembers the first place where two digit separators
   were seen next to each other in this pass. */
2318 const uchar *adj_digit_sep = NULL;
2319 cur = pfile->buffer->cur;
2321 /* N.B. ISIDNUM does not include $. */
2322 while (ISIDNUM (*cur)
2323 || (*cur == '.' && !DIGIT_SEP (cur[-1]))
2324 || DIGIT_SEP (*cur)
2325 || (VALID_SIGN (*cur, cur[-1]) && !DIGIT_SEP (cur[-2])))
2327 NORMALIZE_STATE_UPDATE_IDNUM (nst, *cur);
2328 /* Adjacent digit separators do not form part of the pp-number syntax.
2329 However, they can safely be diagnosed here as an error, since '' is
2330 not a valid preprocessing token. */
2331 if (DIGIT_SEP (*cur) && DIGIT_SEP (cur[-1]) && !adj_digit_sep)
2332 adj_digit_sep = cur;
2333 cur++;
2335 /* A number can't end with a digit separator. */
2336 while (cur > pfile->buffer->cur && DIGIT_SEP (cur[-1]))
2337 --cur;
/* Only diagnose adjacent separators that are still inside the number
   after trailing separators were dropped. */
2338 if (adj_digit_sep && adj_digit_sep < cur)
2339 cpp_error (pfile, CPP_DL_ERROR, "adjacent digit separators");
2341 pfile->buffer->cur = cur;
2343 while (forms_identifier_p (pfile, false, nst));
/* Copy the spelling BASE..CUR and NUL-terminate it. */
2345 number->len = cur - base;
2346 dest = _cpp_unaligned_alloc (pfile, number->len + 1);
2347 memcpy (dest, base, number->len);
2348 dest[number->len] = '\0';
2349 number->text = dest;
2352 /* Create a token of type TYPE with a literal spelling. */
2353 static void
2354 create_literal (cpp_reader *pfile, cpp_token *token, const uchar *base,
2355 unsigned int len, enum cpp_ttype type)
2357 token->type = type;
2358 token->val.str.len = len;
2359 token->val.str.text = cpp_alloc_token_string (pfile, base, len);
2362 const uchar *
2363 cpp_alloc_token_string (cpp_reader *pfile,
2364 const unsigned char *ptr, unsigned len)
2366 uchar *dest = _cpp_unaligned_alloc (pfile, len + 1);
2368 dest[len] = 0;
2369 memcpy (dest, ptr, len);
2370 return dest;
2373 /* A pair of raw buffer pointers. The currently open one is [1], the
2374 first one is [0]. Used for string literal lexing. */
2375 struct lit_accum {
2376 _cpp_buff *first;
2377 _cpp_buff *last;
2378 const uchar *rpos;
2379 size_t accum;
2381 lit_accum ()
2382 : first (NULL), last (NULL), rpos (0), accum (0)
2386 void append (cpp_reader *, const uchar *, size_t);
2388 void read_begin (cpp_reader *);
2389 bool reading_p () const
2391 return rpos != NULL;
2393 char read_char ()
2395 char c = *rpos++;
2396 if (rpos == BUFF_FRONT (last))
2397 rpos = NULL;
2398 return c;
2402 /* Subroutine of lex_raw_string: Append LEN chars from BASE to the buffer
2403 sequence from *FIRST_BUFF_P to LAST_BUFF_P. */
2405 void
2406 lit_accum::append (cpp_reader *pfile, const uchar *base, size_t len)
2408 if (!last)
2409 /* Starting. */
2410 first = last = _cpp_get_buff (pfile, len);
2411 else if (len > BUFF_ROOM (last))
2413 /* There is insufficient room in the buffer. Copy what we can,
2414 and then either extend or create a new one. */
2415 size_t room = BUFF_ROOM (last);
2416 memcpy (BUFF_FRONT (last), base, room);
2417 BUFF_FRONT (last) += room;
2418 base += room;
2419 len -= room;
2420 accum += room;
2422 gcc_checking_assert (!rpos);
2424 last = _cpp_append_extend_buff (pfile, last, len);
2427 memcpy (BUFF_FRONT (last), base, len);
2428 BUFF_FRONT (last) += len;
2429 accum += len;
2432 void
2433 lit_accum::read_begin (cpp_reader *pfile)
2435 /* We never accumulate more than 4 chars to read. */
2436 if (BUFF_ROOM (last) < 4)
2438 last = _cpp_append_extend_buff (pfile, last, 4);
2439 rpos = BUFF_FRONT (last);
2442 /* Returns true if a macro has been defined.
2443 This might not work if compile with -save-temps,
2444 or preprocess separately from compilation. */
2446 static bool
2447 is_macro(cpp_reader *pfile, const uchar *base)
2449 const uchar *cur = base;
2450 if (! ISIDST (*cur))
2451 return false;
2452 unsigned int hash = HT_HASHSTEP (0, *cur);
2453 ++cur;
2454 while (ISIDNUM (*cur))
2456 hash = HT_HASHSTEP (hash, *cur);
2457 ++cur;
2459 hash = HT_HASHFINISH (hash, cur - base);
2461 cpp_hashnode *result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
2462 base, cur - base, hash, HT_NO_INSERT));
2464 return result && cpp_macro_p (result);
2467 /* Returns true if a literal suffix does not have the expected form
2468 and is defined as a macro. */
2470 static bool
2471 is_macro_not_literal_suffix(cpp_reader *pfile, const uchar *base)
2473 /* User-defined literals outside of namespace std must start with a single
2474 underscore, so assume anything of that form really is a UDL suffix.
2475 We don't need to worry about UDLs defined inside namespace std because
2476 their names are reserved, so cannot be used as macro names in valid
2477 programs. */
2478 if (base[0] == '_' && base[1] != '_')
2479 return false;
2480 return is_macro (pfile, base);
2483 /* Lexes a raw string. The stored string contains the spelling,
2484 including double quotes, delimiter string, '(' and ')', any leading
2485 'L', 'u', 'U' or 'u8' and 'R' modifier. The created token contains
2486 the type of the literal, or CPP_OTHER if it was not properly
2487 terminated.
2489 BASE is the start of the token. Updates pfile->buffer->cur to just
2490 after the lexed string.
2492 The spelling is NUL-terminated, but it is not guaranteed that this
2493 is the first NUL since embedded NULs are preserved. */
2495 static void
2496 lex_raw_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
2498 const uchar *pos = base;
2499 const bool warn_bidi_p = pfile->warn_bidi_p ();
2500 const bool warn_invalid_utf8_p = CPP_OPTION (pfile, cpp_warn_invalid_utf8);
2501 const bool warn_bidi_or_invalid_utf8_p = warn_bidi_p | warn_invalid_utf8_p;
2503 /* 'tis a pity this information isn't passed down from the lexer's
2504 initial categorization of the token. */
2505 enum cpp_ttype type = CPP_STRING;
2507 if (*pos == 'L')
2509 type = CPP_WSTRING;
2510 pos++;
2512 else if (*pos == 'U')
2514 type = CPP_STRING32;
2515 pos++;
2517 else if (*pos == 'u')
2519 if (pos[1] == '8')
2521 type = CPP_UTF8STRING;
2522 pos++;
2524 else
2525 type = CPP_STRING16;
2526 pos++;
2529 gcc_checking_assert (pos[0] == 'R' && pos[1] == '"');
2530 pos += 2;
2532 _cpp_line_note *note = &pfile->buffer->notes[pfile->buffer->cur_note];
2534 /* Skip notes before the ". */
2535 while (note->pos < pos)
2536 ++note;
2538 lit_accum accum;
/* PREFIX holds the delimiter (at most 16 chars, enforced below) plus
   the '"' that is appended once the opening '(' is seen. */
2540 uchar prefix[17];
2541 unsigned prefix_len = 0;
/* PHASE is PHASE_PREFIX while the delimiter is being read, PHASE_NONE
   inside the string body, and a non-negative index into PREFIX while
   the closing delimiter is being matched after a ')'. */
2542 enum Phase
2544 PHASE_PREFIX = -2,
2545 PHASE_NONE = -1,
2546 PHASE_SUFFIX = 0
2547 } phase = PHASE_PREFIX;
2549 for (;;)
2551 gcc_checking_assert (note->pos >= pos);
2553 /* Undo any escaped newlines and trigraphs. */
2554 if (!accum.reading_p () && note->pos == pos)
2555 switch (note->type)
2557 case '\\':
2558 case ' ':
2559 /* Restore backslash followed by newline. */
2560 accum.append (pfile, base, pos - base);
2561 base = pos;
2562 accum.read_begin (pfile);
2563 accum.append (pfile, UC"\\", 1);
2565 after_backslash:
2566 if (note->type == ' ')
2567 /* GNU backslash whitespace newline extension. FIXME
2568 could be any sequence of non-vertical space. When we
2569 can properly restore any such sequence, we should
2570 mark this note as handled so _cpp_process_line_notes
2571 doesn't warn. */
2572 accum.append (pfile, UC" ", 1);
2574 accum.append (pfile, UC"\n", 1);
2575 note++;
2576 break;
2578 case '\n':
2579 /* This can happen for ??/<NEWLINE> when trigraphs are not
2580 being interpreted. */
2581 gcc_checking_assert (!CPP_OPTION (pfile, trigraphs));
2582 note->type = 0;
2583 note++;
2584 break;
2586 default:
2587 gcc_checking_assert (_cpp_trigraph_map[note->type]);
2589 /* Don't warn about this trigraph in
2590 _cpp_process_line_notes, since trigraphs show up as
2591 trigraphs in raw strings. */
2592 uchar type = note->type;
2593 note->type = 0;
2595 if (CPP_OPTION (pfile, trigraphs))
2597 accum.append (pfile, base, pos - base);
2598 base = pos;
2599 accum.read_begin (pfile);
2600 accum.append (pfile, UC"??", 2);
2601 accum.append (pfile, &type, 1);
2603 /* ??/ followed by newline gets two line notes, one for
2604 the trigraph and one for the backslash/newline. */
2605 if (type == '/' && note[1].pos == pos)
2607 note++;
2608 gcc_assert (note->type == '\\' || note->type == ' ');
2609 goto after_backslash;
2611 /* Skip the replacement character. */
2612 base = ++pos;
2615 note++;
2616 break;
2619 /* Now get a char to process. Either from an expanded note, or
2620 from the line buffer. */
2621 bool read_note = accum.reading_p ();
2622 char c = read_note ? accum.read_char () : *pos++;
2624 if (phase == PHASE_PREFIX)
2626 if (c == '(')
2628 /* Done. */
2629 phase = PHASE_NONE;
2630 prefix[prefix_len++] = '"';
2632 else if (prefix_len < 16
2633 /* Prefix chars are any of the basic character set,
2634 [lex.charset] except for '
2635 ()\\\t\v\f\n'. Optimized for a contiguous
2636 alphabet. */
2637 /* Unlike a switch, this collapses down to one or
2638 two shift and bitmask operations on an ASCII
2639 system, with an outlier or two. */
2640 && (('Z' - 'A' == 25
2641 ? ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'))
2642 : ISIDST (c))
2643 || (c >= '0' && c <= '9')
2644 || c == '_' || c == '{' || c == '}'
2645 || c == '[' || c == ']' || c == '#'
2646 || c == '<' || c == '>' || c == '%'
2647 || c == ':' || c == ';' || c == '.' || c == '?'
2648 || c == '*' || c == '+' || c == '-' || c == '/'
2649 || c == '^' || c == '&' || c == '|' || c == '~'
2650 || c == '!' || c == '=' || c == ','
2651 || c == '"' || c == '\''))
2652 prefix[prefix_len++] = c;
2653 else
2655 /* Something is wrong. */
2656 int col = CPP_BUF_COLUMN (pfile->buffer, pos) + read_note;
2657 if (prefix_len == 16)
2658 cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
2659 col, "raw string delimiter longer "
2660 "than 16 characters");
2661 else if (c == '\n')
2662 cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
2663 col, "invalid new-line in raw "
2664 "string delimiter");
2665 else
2666 cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
2667 col, "invalid character '%c' in "
2668 "raw string delimiter", c);
2669 type = CPP_OTHER;
2670 phase = PHASE_NONE;
2671 /* Continue until we get a close quote, that's probably
2672 the best failure mode. */
2673 prefix_len = 0;
2675 if (c != '\n')
2676 continue;
/* Matching the closing delimiter: compare C with the next expected
   byte of PREFIX, finishing when the whole of PREFIX has matched. */
2679 if (phase != PHASE_NONE)
2681 if (prefix[phase] != c)
2682 phase = PHASE_NONE;
2683 else if (unsigned (phase + 1) == prefix_len)
2684 break;
2685 else
2687 phase = Phase (phase + 1);
2688 continue;
2692 if (!prefix_len && c == '"')
2693 /* Failure mode lexing. */
2694 goto out;
2695 else if (prefix_len && c == ')')
2696 phase = PHASE_SUFFIX;
/* A newline read from the line buffer itself (not one replayed from
   a line note): save what has been lexed and move to the next line. */
2697 else if (!read_note && c == '\n')
2699 pos--;
2700 pfile->buffer->cur = pos;
2701 if ((pfile->state.in_directive || pfile->state.parsing_args)
2702 && pfile->buffer->next_line >= pfile->buffer->rlimit)
2704 cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, 0,
2705 "unterminated raw string");
2706 type = CPP_OTHER;
2707 goto out;
2710 accum.append (pfile, base, pos - base + 1);
2711 _cpp_process_line_notes (pfile, false);
2713 if (pfile->buffer->next_line < pfile->buffer->rlimit)
2714 CPP_INCREMENT_LINE (pfile, 0);
2715 pfile->buffer->need_line = true;
2717 if (!get_fresh_line_impl<true> (pfile))
2719 /* We ran out of file and failed to get a line. */
2720 location_t src_loc = token->src_loc;
2721 token->type = CPP_EOF;
2722 /* Tell the compiler the line number of the EOF token. */
2723 token->src_loc = pfile->line_table->highest_line;
2724 token->flags = BOL;
2725 if (accum.first)
2726 _cpp_release_buff (pfile, accum.first);
2727 cpp_error_with_line (pfile, CPP_DL_ERROR, src_loc, 0,
2728 "unterminated raw string");
2730 /* Now pop the buffer that get_fresh_line_impl() did not. Popping
2731 is not safe if processing a directive, however this cannot
2732 happen as we already checked above that a line would be
2733 available, and get_fresh_line_impl() can't fail in this
2734 case. */
2735 gcc_assert (!pfile->state.in_directive);
2736 _cpp_pop_buffer (pfile);
2738 return;
2741 pos = base = pfile->buffer->cur;
2742 note = &pfile->buffer->notes[pfile->buffer->cur_note];
2744 else if (__builtin_expect ((unsigned char) c >= utf8_continuation, 0)
2745 && warn_bidi_or_invalid_utf8_p)
2746 pos = _cpp_handle_multibyte_utf8 (pfile, c, pos, warn_bidi_p,
2747 warn_invalid_utf8_p);
2750 if (warn_bidi_p)
2751 maybe_warn_bidi_on_close (pfile, pos);
2753 if (CPP_OPTION (pfile, user_literals))
2755 /* If a string format macro, say from inttypes.h, is placed touching
2756 a string literal it could be parsed as a C++11 user-defined string
2757 literal thus breaking the program. */
2758 if (is_macro_not_literal_suffix (pfile, pos))
2760 /* Raise a warning, but do not consume subsequent tokens. */
2761 if (CPP_OPTION (pfile, warn_literal_suffix) && !pfile->state.skipping)
2762 cpp_warning_with_line (pfile, CPP_W_LITERAL_SUFFIX,
2763 token->src_loc, 0,
2764 "invalid suffix on literal; C++11 requires "
2765 "a space between literal and string macro");
2767 /* Grab user defined literal suffix. */
2768 else if (ISIDST (*pos))
2770 type = cpp_userdef_string_add_type (type);
2771 ++pos;
2773 while (ISIDNUM (*pos))
2774 ++pos;
2778 out:
2779 pfile->buffer->cur = pos;
/* If nothing was accumulated the spelling is the contiguous span
   BASE..POS; otherwise it is the accumulated buffers followed by that
   trailing span, copied into one NUL-terminated allocation. */
2780 if (!accum.accum)
2781 create_literal (pfile, token, base, pos - base, type);
2782 else
2784 size_t extra_len = pos - base;
2785 uchar *dest = _cpp_unaligned_alloc (pfile, accum.accum + extra_len + 1);
2787 token->type = type;
2788 token->val.str.len = accum.accum + extra_len;
2789 token->val.str.text = dest;
2790 for (_cpp_buff *buf = accum.first; buf; buf = buf->next)
2792 size_t len = BUFF_FRONT (buf) - buf->base;
2793 memcpy (dest, buf->base, len);
2794 dest += len;
2796 _cpp_release_buff (pfile, accum.first);
2797 memcpy (dest, base, extra_len);
2798 dest[extra_len] = '\0';
2802 /* Lexes a string, character constant, or angle-bracketed header file
2803 name. The stored string contains the spelling, including opening
2804 quote and any leading 'L', 'u', 'U' or 'u8' and optional
2805 'R' modifier. It returns the type of the literal, or CPP_OTHER
2806 if it was not properly terminated, or CPP_LESS for an unterminated
2807 header name which must be relexed as normal tokens.
2809 The spelling is NUL-terminated, but it is not guaranteed that this
2810 is the first NUL since embedded NULs are preserved. */
/* The function itself returns nothing: the type is delivered in
   TOKEN, either through create_literal or, for an unterminated header
   name, by setting TOKEN->type to CPP_LESS and returning early.  BASE
   is the start of the token, including any encoding prefix.  */
2811 static void
2812 lex_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
2814 bool saw_NUL = false;
2815 const uchar *cur;
2816 cppchar_t terminator;
2817 enum cpp_ttype type;
/* Step over an optional L, U, u or u8 prefix so that TERMINATOR ends
   up holding the opening quote character (or 'R' for a raw string). */
2819 cur = base;
2820 terminator = *cur++;
2821 if (terminator == 'L' || terminator == 'U')
2822 terminator = *cur++;
2823 else if (terminator == 'u')
2825 terminator = *cur++;
2826 if (terminator == '8')
2827 terminator = *cur++;
2829 if (terminator == 'R')
2831 lex_raw_string (pfile, token, base);
2832 return;
2834 if (terminator == '"')
2835 type = (*base == 'L' ? CPP_WSTRING :
2836 *base == 'U' ? CPP_STRING32 :
2837 *base == 'u' ? (base[1] == '8' ? CPP_UTF8STRING : CPP_STRING16)
2838 : CPP_STRING);
2839 else if (terminator == '\'')
2840 type = (*base == 'L' ? CPP_WCHAR :
2841 *base == 'U' ? CPP_CHAR32 :
2842 *base == 'u' ? (base[1] == '8' ? CPP_UTF8CHAR : CPP_CHAR16)
2843 : CPP_CHAR);
2844 else
/* Anything else is treated as a header name closed by '>'. */
2845 terminator = '>', type = CPP_HEADER_NAME;
2847 const bool warn_bidi_p = pfile->warn_bidi_p ();
2848 const bool warn_invalid_utf8_p = CPP_OPTION (pfile, cpp_warn_invalid_utf8);
2849 const bool warn_bidi_or_invalid_utf8_p = warn_bidi_p | warn_invalid_utf8_p;
2850 for (;;)
2852 cppchar_t c = *cur++;
2854 /* In #include-style directives, terminators are not escapable. */
2855 if (c == '\\' && !pfile->state.angled_headers && *cur != '\n')
2857 if ((cur[0] == 'u' || cur[0] == 'U' || cur[0] == 'N') && warn_bidi_p)
2859 location_t loc;
2860 bidi::kind kind;
2861 if (cur[0] == 'N')
2862 kind = get_bidi_named (pfile, cur + 1, &loc);
2863 else
2864 kind = get_bidi_ucn (pfile, cur + 1, cur[0] == 'U', &loc);
2865 maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/true, loc);
2867 cur++;
2869 else if (c == terminator)
2871 if (warn_bidi_p)
2872 maybe_warn_bidi_on_close (pfile, cur - 1);
2873 break;
2875 else if (c == '\n')
2877 cur--;
2878 /* Unmatched quotes always yield undefined behavior, but
2879 greedy lexing means that what appears to be an unterminated
2880 header name may actually be a legitimate sequence of tokens. */
2881 if (terminator == '>')
2883 token->type = CPP_LESS;
2884 return;
2886 type = CPP_OTHER;
2887 break;
2889 else if (c == '\0')
2890 saw_NUL = true;
2891 else if (__builtin_expect (c >= utf8_continuation, 0)
2892 && warn_bidi_or_invalid_utf8_p)
2893 cur = _cpp_handle_multibyte_utf8 (pfile, c, cur, warn_bidi_p,
2894 warn_invalid_utf8_p);
2897 if (saw_NUL && !pfile->state.skipping)
2898 cpp_error (pfile, CPP_DL_WARNING,
2899 "null character(s) preserved in literal");
2901 if (type == CPP_OTHER && CPP_OPTION (pfile, lang) != CLK_ASM)
2902 cpp_error (pfile, CPP_DL_PEDWARN, "missing terminating %c character",
2903 (int) terminator);
2905 if (CPP_OPTION (pfile, user_literals))
2907 /* If a string format macro, say from inttypes.h, is placed touching
2908 a string literal it could be parsed as a C++11 user-defined string
2909 literal thus breaking the program. */
2910 if (is_macro_not_literal_suffix (pfile, cur))
2912 /* Raise a warning, but do not consume subsequent tokens. */
2913 if (CPP_OPTION (pfile, warn_literal_suffix) && !pfile->state.skipping)
2914 cpp_warning_with_line (pfile, CPP_W_LITERAL_SUFFIX,
2915 token->src_loc, 0,
2916 "invalid suffix on literal; C++11 requires "
2917 "a space between literal and string macro");
2919 /* Grab user defined literal suffix. */
2920 else if (ISIDST (*cur))
2922 type = cpp_userdef_char_add_type (type);
2923 type = cpp_userdef_string_add_type (type);
2924 ++cur;
2926 while (ISIDNUM (*cur))
2927 ++cur;
2930 else if (CPP_OPTION (pfile, cpp_warn_cxx11_compat)
2931 && is_macro (pfile, cur)
2932 && !pfile->state.skipping)
2933 cpp_warning_with_line (pfile, CPP_W_CXX11_COMPAT,
2934 token->src_loc, 0, "C++11 requires a space "
2935 "between string literal and macro");
/* Commit the new buffer position and store the spelling BASE..CUR. */
2937 pfile->buffer->cur = cur;
2938 create_literal (pfile, token, base, cur - base, type);
2941 /* Return the comment table. The client may not make any assumption
2942 about the ordering of the table. */
2943 cpp_comment_table *
2944 cpp_get_comments (cpp_reader *pfile)
2946 return &pfile->comments;
2949 /* Append a comment to the end of the comment table. */
2950 static void
2951 store_comment (cpp_reader *pfile, cpp_token *token)
2953 int len;
2955 if (pfile->comments.allocated == 0)
2957 pfile->comments.allocated = 256;
2958 pfile->comments.entries = (cpp_comment *) xmalloc
2959 (pfile->comments.allocated * sizeof (cpp_comment));
2962 if (pfile->comments.count == pfile->comments.allocated)
2964 pfile->comments.allocated *= 2;
2965 pfile->comments.entries = (cpp_comment *) xrealloc
2966 (pfile->comments.entries,
2967 pfile->comments.allocated * sizeof (cpp_comment));
2970 len = token->val.str.len;
2972 /* Copy comment. Note, token may not be NULL terminated. */
2973 pfile->comments.entries[pfile->comments.count].comment =
2974 (char *) xmalloc (sizeof (char) * (len + 1));
2975 memcpy (pfile->comments.entries[pfile->comments.count].comment,
2976 token->val.str.text, len);
2977 pfile->comments.entries[pfile->comments.count].comment[len] = '\0';
2979 /* Set source location. */
2980 pfile->comments.entries[pfile->comments.count].sloc = token->src_loc;
2982 /* Increment the count of entries in the comment table. */
2983 pfile->comments.count++;
2986 /* The stored comment includes the comment start and any terminator. */
2987 static void
2988 save_comment (cpp_reader *pfile, cpp_token *token, const unsigned char *from,
2989 cppchar_t type)
2991 unsigned char *buffer;
2992 unsigned int len, clen, i;
2994 len = pfile->buffer->cur - from + 1; /* + 1 for the initial '/'. */
2996 /* C++ comments probably (not definitely) have moved past a new
2997 line, which we don't want to save in the comment. */
2998 if (is_vspace (pfile->buffer->cur[-1]))
2999 len--;
3001 /* If we are currently in a directive or in argument parsing, then
3002 we need to store all C++ comments as C comments internally, and
3003 so we need to allocate a little extra space in that case.
3005 Note that the only time we encounter a directive here is
3006 when we are saving comments in a "#define". */
3007 clen = ((pfile->state.in_directive || pfile->state.parsing_args)
3008 && type == '/') ? len + 2 : len;
3010 buffer = _cpp_unaligned_alloc (pfile, clen);
3012 token->type = CPP_COMMENT;
3013 token->val.str.len = clen;
3014 token->val.str.text = buffer;
3016 buffer[0] = '/';
3017 memcpy (buffer + 1, from, len - 1);
3019 /* Finish conversion to a C comment, if necessary. */
3020 if ((pfile->state.in_directive || pfile->state.parsing_args) && type == '/')
3022 buffer[1] = '*';
3023 buffer[clen - 2] = '*';
3024 buffer[clen - 1] = '/';
3025 /* As there can be in a C++ comments illegal sequences for C comments
3026 we need to filter them out. */
3027 for (i = 2; i < (clen - 2); i++)
3028 if (buffer[i] == '/' && (buffer[i - 1] == '*' || buffer[i + 1] == '*'))
3029 buffer[i] = '|';
3032 /* Finally store this comment for use by clients of libcpp. */
3033 store_comment (pfile, token);
3036 /* Returns true if comment at COMMENT_START is a recognized FALLTHROUGH
3037 comment. */
/* COMMENT_START points at the second character of the comment opener:
   '*' for a C block comment, anything else is handled as a C++ line
   comment.  PFILE->buffer->cur is used as the upper bound of the
   comment text.  What is recognized depends on the
   cpp_warn_implicit_fallthrough option: levels 0 and 5 accept
   nothing, 1 accepts any comment, 2 searches for a case-insensitive
   pattern anywhere in the comment, and 3 and 4 require the whole
   comment to match one of the forms listed below.  */
3039 static bool
3040 fallthrough_comment_p (cpp_reader *pfile, const unsigned char *comment_start)
3042 const unsigned char *from = comment_start + 1;
3044 switch (CPP_OPTION (pfile, cpp_warn_implicit_fallthrough))
3046 /* For both -Wimplicit-fallthrough=0 and -Wimplicit-fallthrough=5 we
3047 don't recognize any comments. The latter only checks attributes,
3048 the former doesn't warn. */
3049 case 0:
3050 default:
3051 return false;
3052 /* -Wimplicit-fallthrough=1 considers any comment, no matter what
3053 content it has. */
3054 case 1:
3055 return true;
3056 case 2:
3057 /* -Wimplicit-fallthrough=2 looks for (case insensitive)
3058 .*falls?[ \t-]*thr(u|ough).* regex. */
3059 for (; (size_t) (pfile->buffer->cur - from) >= sizeof "fallthru" - 1;
3060 from++)
3062 /* Is there anything like strpbrk with upper boundary, or
3063 memchr looking for 2 characters rather than just one? */
3064 if (from[0] != 'f' && from[0] != 'F')
3065 continue;
3066 if (from[1] != 'a' && from[1] != 'A')
3067 continue;
3068 if (from[2] != 'l' && from[2] != 'L')
3069 continue;
3070 if (from[3] != 'l' && from[3] != 'L')
3071 continue;
3072 from += sizeof "fall" - 1;
3073 if (from[0] == 's' || from[0] == 'S')
3074 from++;
3075 while (*from == ' ' || *from == '\t' || *from == '-')
3076 from++;
3077 if (from[0] != 't' && from[0] != 'T')
3078 continue;
3079 if (from[1] != 'h' && from[1] != 'H')
3080 continue;
3081 if (from[2] != 'r' && from[2] != 'R')
3082 continue;
3083 if (from[3] == 'u' || from[3] == 'U')
3084 return true;
3085 if (from[3] != 'o' && from[3] != 'O')
3086 continue;
3087 if (from[4] != 'u' && from[4] != 'U')
3088 continue;
3089 if (from[5] != 'g' && from[5] != 'G')
3090 continue;
3091 if (from[6] != 'h' && from[6] != 'H')
3092 continue;
3093 return true;
3095 return false;
3096 case 3:
3097 case 4:
3098 break;
3101 /* Whole comment contents:
3102 -fallthrough
3103 @fallthrough@
3105 if (*from == '-' || *from == '@')
3107 size_t len = sizeof "fallthrough" - 1;
3108 if ((size_t) (pfile->buffer->cur - from - 1) < len)
3109 return false;
3110 if (memcmp (from + 1, "fallthrough", len))
3111 return false;
3112 if (*from == '@')
3114 if (from[len + 1] != '@')
3115 return false;
3116 len++;
3118 from += 1 + len;
3120 /* Whole comment contents (regex):
3121 lint -fallthrough[ \t]*
3123 else if (*from == 'l')
3125 size_t len = sizeof "int -fallthrough" - 1;
3126 if ((size_t) (pfile->buffer->cur - from - 1) < len)
3127 return false;
3128 if (memcmp (from + 1, "int -fallthrough", len))
3129 return false;
3130 from += 1 + len;
3131 while (*from == ' ' || *from == '\t')
3132 from++;
3134 /* Whole comment contents (regex):
3135 [ \t]*FALLTHR(U|OUGH)[ \t]*
3137 else if (CPP_OPTION (pfile, cpp_warn_implicit_fallthrough) == 4)
3139 while (*from == ' ' || *from == '\t')
3140 from++;
3141 if ((size_t) (pfile->buffer->cur - from) < sizeof "FALLTHRU" - 1)
3142 return false;
3143 if (memcmp (from, "FALLTHR", sizeof "FALLTHR" - 1))
3144 return false;
3145 from += sizeof "FALLTHR" - 1;
3146 if (*from == 'U')
3147 from++;
3148 else if ((size_t) (pfile->buffer->cur - from) < sizeof "OUGH" - 1)
3149 return false;
3150 else if (memcmp (from, "OUGH", sizeof "OUGH" - 1))
3151 return false;
3152 else
3153 from += sizeof "OUGH" - 1;
3154 while (*from == ' ' || *from == '\t')
3155 from++;
3157 /* Whole comment contents (regex):
3158 [ \t.!]*(ELSE,? |INTENTIONAL(LY)? )?FALL(S | |-)?THR(OUGH|U)[ \t.!]*(-[^\n\r]*)?
3159 [ \t.!]*(Else,? |Intentional(ly)? )?Fall((s | |-)[Tt]|t)hr(ough|u)[ \t.!]*(-[^\n\r]*)?
3160 [ \t.!]*([Ee]lse,? |[Ii]ntentional(ly)? )?fall(s | |-)?thr(ough|u)[ \t.!]*(-[^\n\r]*)?
3162 else
3164 while (*from == ' ' || *from == '\t' || *from == '.' || *from == '!')
3165 from++;
/* F is the first letter of the word being matched; ALL_UPPER records
   that an all-capitals spelling has been seen, which the remaining
   words must then follow. */
3166 unsigned char f = *from;
3167 bool all_upper = false;
3168 if (f == 'E' || f == 'e')
3170 if ((size_t) (pfile->buffer->cur - from)
3171 < sizeof "else fallthru" - 1)
3172 return false;
3173 if (f == 'E' && memcmp (from + 1, "LSE", sizeof "LSE" - 1) == 0)
3174 all_upper = true;
3175 else if (memcmp (from + 1, "lse", sizeof "lse" - 1))
3176 return false;
3177 from += sizeof "else" - 1;
3178 if (*from == ',')
3179 from++;
3180 if (*from != ' ')
3181 return false;
3182 from++;
3183 if (all_upper && *from == 'f')
3184 return false;
3185 if (f == 'e' && *from == 'F')
3186 return false;
3187 f = *from;
3189 else if (f == 'I' || f == 'i')
3191 if ((size_t) (pfile->buffer->cur - from)
3192 < sizeof "intentional fallthru" - 1)
3193 return false;
3194 if (f == 'I' && memcmp (from + 1, "NTENTIONAL",
3195 sizeof "NTENTIONAL" - 1) == 0)
3196 all_upper = true;
3197 else if (memcmp (from + 1, "ntentional",
3198 sizeof "ntentional" - 1))
3199 return false;
3200 from += sizeof "intentional" - 1;
3201 if (*from == ' ')
3203 from++;
3204 if (all_upper && *from == 'f')
3205 return false;
3207 else if (all_upper)
3209 if (memcmp (from, "LY F", sizeof "LY F" - 1))
3210 return false;
3211 from += sizeof "LY " - 1;
3213 else
3215 if (memcmp (from, "ly ", sizeof "ly " - 1))
3216 return false;
3217 from += sizeof "ly " - 1;
3219 if (f == 'i' && *from == 'F')
3220 return false;
3221 f = *from;
3223 if (f != 'F' && f != 'f')
3224 return false;
3225 if ((size_t) (pfile->buffer->cur - from) < sizeof "fallthru" - 1)
3226 return false;
3227 if (f == 'F' && memcmp (from + 1, "ALL", sizeof "ALL" - 1) == 0)
3228 all_upper = true;
3229 else if (all_upper)
3230 return false;
3231 else if (memcmp (from + 1, "all", sizeof "all" - 1))
3232 return false;
3233 from += sizeof "fall" - 1;
3234 if (*from == (all_upper ? 'S' : 's') && from[1] == ' ')
3235 from += 2;
3236 else if (*from == ' ' || *from == '-')
3237 from++;
3238 else if (*from != (all_upper ? 'T' : 't'))
3239 return false;
3240 if ((f == 'f' || *from != 'T') && (all_upper || *from != 't'))
3241 return false;
3242 if ((size_t) (pfile->buffer->cur - from) < sizeof "thru" - 1)
3243 return false;
3244 if (memcmp (from + 1, all_upper ? "HRU" : "hru", sizeof "hru" - 1))
3246 if ((size_t) (pfile->buffer->cur - from) < sizeof "through" - 1)
3247 return false;
3248 if (memcmp (from + 1, all_upper ? "HROUGH" : "hrough",
3249 sizeof "hrough" - 1))
3250 return false;
3251 from += sizeof "through" - 1;
3253 else
3254 from += sizeof "thru" - 1;
3255 while (*from == ' ' || *from == '\t' || *from == '.' || *from == '!')
3256 from++;
/* An optional trailing "-..." part: skip it up to the end of the
   comment (block comment) or the end of the line (line comment). */
3257 if (*from == '-')
3259 from++;
3260 if (*comment_start == '*')
3264 while (*from && *from != '*'
3265 && *from != '\n' && *from != '\r')
3266 from++;
3267 if (*from != '*' || from[1] == '/')
3268 break;
3269 from++;
3271 while (1);
3273 else
3274 while (*from && *from != '\n' && *from != '\r')
3275 from++;
/* Whatever form matched, FROM must now sit exactly at the comment
   terminator for the whole comment to count. */
3278 /* C block comment. */
3279 if (*comment_start == '*')
3281 if (*from != '*' || from[1] != '/')
3282 return false;
3284 /* C++ line comment. */
3285 else if (*from != '\n')
3286 return false;
3288 return true;
3291 /* Allocate COUNT tokens for RUN. */
3292 void
3293 _cpp_init_tokenrun (tokenrun *run, unsigned int count)
3295 run->base = XNEWVEC (cpp_token, count);
3296 run->limit = run->base + count;
3297 run->next = NULL;
3300 /* Returns the next tokenrun, or creates one if there is none. */
3301 static tokenrun *
3302 next_tokenrun (tokenrun *run)
3304 if (run->next == NULL)
3306 run->next = XNEW (tokenrun);
3307 run->next->prev = run;
3308 _cpp_init_tokenrun (run->next, 250);
3311 return run->next;
3314 /* Return the number of not yet processed tokens in a given
3315 context. */
/* The count is the distance between the context's LAST and FIRST
   positions, measured in tokens for TOKENS_KIND_DIRECT and in token
   pointers for TOKENS_KIND_INDIRECT and TOKENS_KIND_EXTENDED; any
   other kind aborts.
   NOTE(review): the return-type line of this definition is not
   present in this copy of the file -- confirm it against the
   declaration before editing the signature.  */
3317 _cpp_remaining_tokens_num_in_context (cpp_context *context)
3319 if (context->tokens_kind == TOKENS_KIND_DIRECT)
3320 return (LAST (context).token - FIRST (context).token);
3321 else if (context->tokens_kind == TOKENS_KIND_INDIRECT
3322 || context->tokens_kind == TOKENS_KIND_EXTENDED)
3323 return (LAST (context).ptoken - FIRST (context).ptoken);
3324 else
3325 abort ();
3328 /* Returns the token present at index INDEX in a given context. If
3329 INDEX is zero, the next token to be processed is returned. */
3330 static const cpp_token*
3331 _cpp_token_from_context_at (cpp_context *context, int index)
3333 if (context->tokens_kind == TOKENS_KIND_DIRECT)
3334 return &(FIRST (context).token[index]);
3335 else if (context->tokens_kind == TOKENS_KIND_INDIRECT
3336 || context->tokens_kind == TOKENS_KIND_EXTENDED)
3337 return FIRST (context).ptoken[index];
3338 else
3339 abort ();
3342 /* Look ahead in the input stream. */
3343 const cpp_token *
3344 cpp_peek_token (cpp_reader *pfile, int index)
3346 cpp_context *context = pfile->context;
3347 const cpp_token *peektok;
3348 int count;
3350 /* First, scan through any pending cpp_context objects. */
3351 while (context->prev)
3353 ptrdiff_t sz = _cpp_remaining_tokens_num_in_context (context);
3355 if (index < (int) sz)
3356 return _cpp_token_from_context_at (context, index);
3357 index -= (int) sz;
3358 context = context->prev;
3361 /* We will have to read some new tokens after all (and do so
3362 without invalidating preceding tokens). */
3363 count = index;
3364 pfile->keep_tokens++;
3366 /* For peeked tokens temporarily disable line_change reporting,
3367 until the tokens are parsed for real. */
3368 void (*line_change) (cpp_reader *, const cpp_token *, int)
3369 = pfile->cb.line_change;
3370 pfile->cb.line_change = NULL;
3374 peektok = _cpp_lex_token (pfile);
3375 if (peektok->type == CPP_EOF)
3377 index--;
3378 break;
3380 else if (peektok->type == CPP_PRAGMA)
3382 /* Don't peek past a pragma. */
3383 if (peektok == &pfile->directive_result)
3384 /* Save the pragma in the buffer. */
3385 *pfile->cur_token++ = *peektok;
3386 index--;
3387 break;
3390 while (index--);
3392 _cpp_backup_tokens_direct (pfile, count - index);
3393 pfile->keep_tokens--;
3394 pfile->cb.line_change = line_change;
3396 return peektok;
3399 /* Allocate a single token that is invalidated at the same time as the
3400 rest of the tokens on the line. Has its line and col set to the
3401 same as the last lexed token, so that diagnostics appear in the
3402 right place. */
3403 cpp_token *
3404 _cpp_temp_token (cpp_reader *pfile)
3406 cpp_token *old, *result;
3407 ptrdiff_t sz = pfile->cur_run->limit - pfile->cur_token;
3408 ptrdiff_t la = (ptrdiff_t) pfile->lookaheads;
3410 old = pfile->cur_token - 1;
3411 /* Any pre-existing lookaheads must not be clobbered. */
3412 if (la)
3414 if (sz <= la)
3416 tokenrun *next = next_tokenrun (pfile->cur_run);
3418 if (sz < la)
3419 memmove (next->base + 1, next->base,
3420 (la - sz) * sizeof (cpp_token));
3422 next->base[0] = pfile->cur_run->limit[-1];
3425 if (sz > 1)
3426 memmove (pfile->cur_token + 1, pfile->cur_token,
3427 MIN (la, sz - 1) * sizeof (cpp_token));
3430 if (!sz && pfile->cur_token == pfile->cur_run->limit)
3432 pfile->cur_run = next_tokenrun (pfile->cur_run);
3433 pfile->cur_token = pfile->cur_run->base;
3436 result = pfile->cur_token++;
3437 result->src_loc = old->src_loc;
3438 return result;
3441 /* We're at the beginning of a logical line (so not in
3442 directives-mode) and RESULT is a CPP_NAME with NODE_MODULE set. See
3443 if we should enter deferred_pragma mode to tokenize the rest of the
3444 line as a module control-line. */
3446 static void
3447 cpp_maybe_module_directive (cpp_reader *pfile, cpp_token *result)
3449 unsigned backup = 0; /* Tokens we peeked. */
3450 cpp_hashnode *node = result->val.node.node;
3451 cpp_token *peek = result;
3452 cpp_token *keyword = peek;
3453 cpp_hashnode *(&n_modules)[spec_nodes::M_HWM][2] = pfile->spec_nodes.n_modules;
3454 int header_count = 0;
3456 /* Make sure the incoming state is as we expect it. This way we
3457 can restore it using constants. */
3458 gcc_checking_assert (!pfile->state.in_deferred_pragma
3459 && !pfile->state.skipping
3460 && !pfile->state.parsing_args
3461 && !pfile->state.angled_headers
3462 && (pfile->state.save_comments
3463 == !CPP_OPTION (pfile, discard_comments)));
3465 /* Enter directives mode sufficiently for peeking. We don't have
3466 to actually set in_directive. */
3467 pfile->state.in_deferred_pragma = true;
3469 /* These two fields are needed to process tokenization in deferred
3470 pragma mode. They are not used outside deferred pragma mode or
3471 directives mode. */
3472 pfile->state.pragma_allow_expansion = true;
3473 pfile->directive_line = result->src_loc;
3475 /* Saving comments is incompatible with directives mode. */
3476 pfile->state.save_comments = 0;
3478 if (node == n_modules[spec_nodes::M_EXPORT][0])
3480 peek = _cpp_lex_direct (pfile);
3481 keyword = peek;
3482 backup++;
3483 if (keyword->type != CPP_NAME)
3484 goto not_module;
3485 node = keyword->val.node.node;
3486 if (!(node->flags & NODE_MODULE))
3487 goto not_module;
3490 if (node == n_modules[spec_nodes::M__IMPORT][0])
3491 /* __import */
3492 header_count = backup + 2 + 16;
3493 else if (node == n_modules[spec_nodes::M_IMPORT][0])
3494 /* import */
3495 header_count = backup + 2 + (CPP_OPTION (pfile, preprocessed) ? 16 : 0);
3496 else if (node == n_modules[spec_nodes::M_MODULE][0])
3497 ; /* module */
3498 else
3499 goto not_module;
3501 /* We've seen [export] {module|import|__import}. Check the next token. */
3502 if (header_count)
3503 /* After '{,__}import' a header name may appear. */
3504 pfile->state.angled_headers = true;
3505 peek = _cpp_lex_direct (pfile);
3506 backup++;
3508 /* ... import followed by identifier, ':', '<' or
3509 header-name preprocessing tokens, or module
3510 followed by cpp-identifier, ':' or ';' preprocessing
3511 tokens. C++ keywords are not yet relevant. */
3512 if (peek->type == CPP_NAME
3513 || peek->type == CPP_COLON
3514 || (header_count
3515 ? (peek->type == CPP_LESS
3516 || (peek->type == CPP_STRING && peek->val.str.text[0] != 'R')
3517 || peek->type == CPP_HEADER_NAME)
3518 : peek->type == CPP_SEMICOLON))
3520 pfile->state.pragma_allow_expansion = !CPP_OPTION (pfile, preprocessed);
3521 if (!pfile->state.pragma_allow_expansion)
3522 pfile->state.prevent_expansion++;
3524 if (!header_count && linemap_included_from
3525 (LINEMAPS_LAST_ORDINARY_MAP (pfile->line_table)))
3526 cpp_error_with_line (pfile, CPP_DL_ERROR, keyword->src_loc, 0,
3527 "module control-line cannot be in included file");
3529 /* The first one or two tokens cannot be macro names. */
3530 for (int ix = backup; ix--;)
3532 cpp_token *tok = ix ? keyword : result;
3533 cpp_hashnode *node = tok->val.node.node;
3535 /* Don't attempt to expand the token. */
3536 tok->flags |= NO_EXPAND;
3537 if (_cpp_defined_macro_p (node)
3538 && _cpp_maybe_notify_macro_use (pfile, node, tok->src_loc)
3539 && !cpp_fun_like_macro_p (node))
3540 cpp_error_with_line (pfile, CPP_DL_ERROR, tok->src_loc, 0,
3541 "module control-line \"%s\" cannot be"
3542 " an object-like macro",
3543 NODE_NAME (node));
3546 /* Map to underbar variants. */
3547 keyword->val.node.node = n_modules[header_count
3548 ? spec_nodes::M_IMPORT
3549 : spec_nodes::M_MODULE][1];
3550 if (backup != 1)
3551 result->val.node.node = n_modules[spec_nodes::M_EXPORT][1];
3553 /* Maybe tell the tokenizer we expect a header-name down the
3554 road. */
3555 pfile->state.directive_file_token = header_count;
3557 else
3559 not_module:
3560 /* Drop out of directive mode. */
3561 /* We aaserted save_comments had this value upon entry. */
3562 pfile->state.save_comments
3563 = !CPP_OPTION (pfile, discard_comments);
3564 pfile->state.in_deferred_pragma = false;
3565 /* Do not let this remain on. */
3566 pfile->state.angled_headers = false;
3569 /* In either case we want to backup the peeked tokens. */
3570 if (backup)
3572 /* If we saw EOL, we should drop it, because this isn't a module
3573 control-line after all. */
3574 bool eol = peek->type == CPP_PRAGMA_EOL;
3575 if (!eol || backup > 1)
3577 /* Put put the peeked tokens back */
3578 _cpp_backup_tokens_direct (pfile, backup);
3579 /* But if the last one was an EOL, forget it. */
3580 if (eol)
3581 pfile->lookaheads--;
3586 /* Lex a token into RESULT (external interface). Takes care of issues
3587 like directive handling, token lookahead, multiple include
3588 optimization and skipping. */
3589 const cpp_token *
3590 _cpp_lex_token (cpp_reader *pfile)
3592 cpp_token *result;
3594 for (;;)
3596 if (pfile->cur_token == pfile->cur_run->limit)
3598 pfile->cur_run = next_tokenrun (pfile->cur_run);
3599 pfile->cur_token = pfile->cur_run->base;
3601 /* We assume that the current token is somewhere in the current
3602 run. */
3603 if (pfile->cur_token < pfile->cur_run->base
3604 || pfile->cur_token >= pfile->cur_run->limit)
3605 abort ();
3607 if (pfile->lookaheads)
3609 pfile->lookaheads--;
3610 result = pfile->cur_token++;
3612 else
3613 result = _cpp_lex_direct (pfile);
3615 if (result->flags & BOL)
3617 /* Is this a directive. If _cpp_handle_directive returns
3618 false, it is an assembler #. */
3619 if (result->type == CPP_HASH
3620 /* 6.10.3 p 11: Directives in a list of macro arguments
3621 gives undefined behavior. This implementation
3622 handles the directive as normal. */
3623 && pfile->state.parsing_args != 1)
3625 if (_cpp_handle_directive (pfile, result->flags & PREV_WHITE))
3627 if (pfile->directive_result.type == CPP_PADDING)
3628 continue;
3629 result = &pfile->directive_result;
3632 else if (pfile->state.in_deferred_pragma)
3633 result = &pfile->directive_result;
3634 else if (result->type == CPP_NAME
3635 && (result->val.node.node->flags & NODE_MODULE)
3636 && !pfile->state.skipping
3637 /* Unlike regular directives, we do not deal with
3638 tokenizing module directives as macro arguments.
3639 That's not permitted. */
3640 && !pfile->state.parsing_args)
3642 /* P1857. Before macro expansion, At start of logical
3643 line ... */
3644 /* We don't have to consider lookaheads at this point. */
3645 gcc_checking_assert (!pfile->lookaheads);
3647 cpp_maybe_module_directive (pfile, result);
3650 if (pfile->cb.line_change && !pfile->state.skipping)
3651 pfile->cb.line_change (pfile, result, pfile->state.parsing_args);
3654 /* We don't skip tokens in directives. */
3655 if (pfile->state.in_directive || pfile->state.in_deferred_pragma)
3656 break;
3658 /* Outside a directive, invalidate controlling macros. At file
3659 EOF, _cpp_lex_direct takes care of popping the buffer, so we never
3660 get here and MI optimization works. */
3661 pfile->mi_valid = false;
3663 if (!pfile->state.skipping || result->type == CPP_EOF)
3664 break;
3667 return result;
3670 /* Returns true if a fresh line has been loaded. */
3671 template <bool lexing_raw_string>
3672 static bool
3673 get_fresh_line_impl (cpp_reader *pfile)
3675 /* We can't get a new line until we leave the current directive, unless we
3676 are lexing a raw string, in which case it will be OK as long as we don't
3677 pop the current buffer. */
3678 if (!lexing_raw_string && pfile->state.in_directive)
3679 return false;
3681 for (;;)
3683 cpp_buffer *buffer = pfile->buffer;
3685 if (!buffer->need_line)
3686 return true;
3688 if (buffer->next_line < buffer->rlimit)
3690 _cpp_clean_line (pfile);
3691 return true;
3694 /* We can't change buffers until we leave the current directive. */
3695 if (lexing_raw_string && pfile->state.in_directive)
3696 return false;
3698 /* First, get out of parsing arguments state. */
3699 if (pfile->state.parsing_args)
3700 return false;
3702 /* End of buffer. Non-empty files should end in a newline. */
3703 if (buffer->buf != buffer->rlimit
3704 && buffer->next_line > buffer->rlimit
3705 && !buffer->from_stage3)
3707 /* Clip to buffer size. */
3708 buffer->next_line = buffer->rlimit;
3711 if (buffer->prev && !buffer->return_at_eof)
3712 _cpp_pop_buffer (pfile);
3713 else
3715 /* End of translation. Do not pop the buffer yet. Increment
3716 line number so that the EOF token is on a line of its own
3717 (_cpp_lex_direct doesn't increment in that case, because
3718 it's hard for it to distinguish this special case). */
3719 CPP_INCREMENT_LINE (pfile, 0);
3720 return false;
3725 bool
3726 _cpp_get_fresh_line (cpp_reader *pfile)
3728 return get_fresh_line_impl<false> (pfile);
/* Set result->type to THEN_TYPE and consume one character if the next
   character in the buffer is CHAR; otherwise set it to ELSE_TYPE.
   Expects variables named `result' and `buffer' to be in scope at the
   point of use.  */
#define IF_NEXT_IS(CHAR, THEN_TYPE, ELSE_TYPE)          \
  do                                                    \
    {                                                   \
      result->type = ELSE_TYPE;                         \
      if (*buffer->cur == CHAR)                         \
        buffer->cur++, result->type = THEN_TYPE;        \
    }                                                   \
  while (0)
3741 /* Lex a token into pfile->cur_token, which is also incremented, to
3742 get diagnostics pointing to the correct location.
3744 Does not handle issues such as token lookahead, multiple-include
3745 optimization, directives, skipping etc. This function is only
3746 suitable for use by _cpp_lex_token, and in special cases like
3747 lex_expansion_token which doesn't care for any of these issues.
3749 When meeting a newline, returns CPP_EOF if parsing a directive,
3750 otherwise returns to the start of the token buffer if permissible.
3751 Returns the location of the lexed token. */
3752 cpp_token *
3753 _cpp_lex_direct (cpp_reader *pfile)
3755 cppchar_t c;
3756 cpp_buffer *buffer;
3757 const unsigned char *comment_start;
3758 bool fallthrough_comment = false;
3759 cpp_token *result = pfile->cur_token++;
3761 fresh_line:
3762 result->flags = 0;
3763 buffer = pfile->buffer;
3764 if (buffer->need_line)
3766 if (pfile->state.in_deferred_pragma)
3768 /* This can happen in cases like:
3769 #define loop(x) whatever
3770 #pragma omp loop
3771 where when trying to expand loop we need to peek
3772 next token after loop, but aren't still in_deferred_pragma
3773 mode but are in in_directive mode, so buffer->need_line
3774 is set, a CPP_EOF is peeked. */
3775 result->type = CPP_PRAGMA_EOL;
3776 pfile->state.in_deferred_pragma = false;
3777 if (!pfile->state.pragma_allow_expansion)
3778 pfile->state.prevent_expansion--;
3779 return result;
3781 if (!_cpp_get_fresh_line (pfile))
3783 result->type = CPP_EOF;
3784 /* Not a real EOF in a directive or arg parsing -- we refuse
3785 to advance to the next file now, and will once we're out
3786 of those modes. */
3787 if (!pfile->state.in_directive && !pfile->state.parsing_args)
3789 /* Tell the compiler the line number of the EOF token. */
3790 result->src_loc = pfile->line_table->highest_line;
3791 result->flags = BOL;
3792 /* Now pop the buffer that _cpp_get_fresh_line did not. */
3793 _cpp_pop_buffer (pfile);
3795 return result;
3797 if (buffer != pfile->buffer)
3798 fallthrough_comment = false;
3799 if (!pfile->keep_tokens)
3801 pfile->cur_run = &pfile->base_run;
3802 result = pfile->base_run.base;
3803 pfile->cur_token = result + 1;
3805 result->flags = BOL;
3806 if (pfile->state.parsing_args == 2)
3807 result->flags |= PREV_WHITE;
3809 buffer = pfile->buffer;
3810 update_tokens_line:
3811 result->src_loc = pfile->line_table->highest_line;
3813 skipped_white:
3814 if (buffer->cur >= buffer->notes[buffer->cur_note].pos
3815 && !pfile->overlaid_buffer)
3817 _cpp_process_line_notes (pfile, false);
3818 result->src_loc = pfile->line_table->highest_line;
3820 c = *buffer->cur++;
3822 if (pfile->forced_token_location)
3823 result->src_loc = pfile->forced_token_location;
3824 else
3825 result->src_loc = linemap_position_for_column (pfile->line_table,
3826 CPP_BUF_COLUMN (buffer, buffer->cur));
3828 switch (c)
3830 case ' ': case '\t': case '\f': case '\v': case '\0':
3831 result->flags |= PREV_WHITE;
3832 skip_whitespace (pfile, c);
3833 goto skipped_white;
3835 case '\n':
3836 /* Increment the line, unless this is the last line ... */
3837 if (buffer->cur < buffer->rlimit
3838 /* ... or this is a #include, (where _cpp_stack_file needs to
3839 unwind by one line) ... */
3840 || (pfile->state.in_directive > 1
3841 /* ... except traditional-cpp increments this elsewhere. */
3842 && !CPP_OPTION (pfile, traditional)))
3843 CPP_INCREMENT_LINE (pfile, 0);
3844 buffer->need_line = true;
3845 if (pfile->state.in_deferred_pragma)
3847 /* Produce the PRAGMA_EOL on this line. File reading
3848 ensures there is always a \n at end of the buffer, thus
3849 in a deferred pragma we always see CPP_PRAGMA_EOL before
3850 any CPP_EOF. */
3851 result->type = CPP_PRAGMA_EOL;
3852 result->flags &= ~PREV_WHITE;
3853 pfile->state.in_deferred_pragma = false;
3854 if (!pfile->state.pragma_allow_expansion)
3855 pfile->state.prevent_expansion--;
3856 return result;
3858 goto fresh_line;
3860 case '0': case '1': case '2': case '3': case '4':
3861 case '5': case '6': case '7': case '8': case '9':
3863 struct normalize_state nst = INITIAL_NORMALIZE_STATE;
3864 result->type = CPP_NUMBER;
3865 lex_number (pfile, &result->val.str, &nst);
3866 warn_about_normalization (pfile, result, &nst, false);
3867 break;
3870 case 'L':
3871 case 'u':
3872 case 'U':
3873 case 'R':
3874 /* 'L', 'u', 'U', 'u8' or 'R' may introduce wide characters,
3875 wide strings or raw strings. */
3876 if (c == 'L' || CPP_OPTION (pfile, rliterals)
3877 || (c != 'R' && CPP_OPTION (pfile, uliterals)))
3879 if ((*buffer->cur == '\'' && c != 'R')
3880 || *buffer->cur == '"'
3881 || (*buffer->cur == 'R'
3882 && c != 'R'
3883 && buffer->cur[1] == '"'
3884 && CPP_OPTION (pfile, rliterals))
3885 || (*buffer->cur == '8'
3886 && c == 'u'
3887 && ((buffer->cur[1] == '"' || (buffer->cur[1] == '\''
3888 && CPP_OPTION (pfile, utf8_char_literals)))
3889 || (buffer->cur[1] == 'R' && buffer->cur[2] == '"'
3890 && CPP_OPTION (pfile, rliterals)))))
3892 lex_string (pfile, result, buffer->cur - 1);
3893 break;
3896 /* Fall through. */
3898 case '_':
3899 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
3900 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
3901 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
3902 case 's': case 't': case 'v': case 'w': case 'x':
3903 case 'y': case 'z':
3904 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
3905 case 'G': case 'H': case 'I': case 'J': case 'K':
3906 case 'M': case 'N': case 'O': case 'P': case 'Q':
3907 case 'S': case 'T': case 'V': case 'W': case 'X':
3908 case 'Y': case 'Z':
3909 result->type = CPP_NAME;
3911 struct normalize_state nst = INITIAL_NORMALIZE_STATE;
3912 result->val.node.node = lex_identifier (pfile, buffer->cur - 1, false,
3913 &nst,
3914 &result->val.node.spelling);
3915 warn_about_normalization (pfile, result, &nst, true);
3918 /* Convert named operators to their proper types. */
3919 if (result->val.node.node->flags & NODE_OPERATOR)
3921 result->flags |= NAMED_OP;
3922 result->type = (enum cpp_ttype) result->val.node.node->directive_index;
3925 /* Signal FALLTHROUGH comment followed by another token. */
3926 if (fallthrough_comment)
3927 result->flags |= PREV_FALLTHROUGH;
3928 break;
3930 case '\'':
3931 case '"':
3932 lex_string (pfile, result, buffer->cur - 1);
3933 break;
3935 case '/':
3936 /* A potential block or line comment. */
3937 comment_start = buffer->cur;
3938 c = *buffer->cur;
3940 if (c == '*')
3942 if (_cpp_skip_block_comment (pfile))
3943 cpp_error (pfile, CPP_DL_ERROR, "unterminated comment");
3945 else if (c == '/' && ! CPP_OPTION (pfile, traditional))
3947 /* Don't warn for system headers. */
3948 if (_cpp_in_system_header (pfile))
3950 /* Warn about comments if pedantically GNUC89, and not
3951 in system headers. */
3952 else if (CPP_OPTION (pfile, lang) == CLK_GNUC89
3953 && CPP_PEDANTIC (pfile)
3954 && ! buffer->warned_cplusplus_comments)
3956 if (cpp_error (pfile, CPP_DL_PEDWARN,
3957 "C++ style comments are not allowed in ISO C90"))
3958 cpp_error (pfile, CPP_DL_NOTE,
3959 "(this will be reported only once per input file)");
3960 buffer->warned_cplusplus_comments = 1;
3962 /* Or if specifically desired via -Wc90-c99-compat. */
3963 else if (CPP_OPTION (pfile, cpp_warn_c90_c99_compat) > 0
3964 && ! CPP_OPTION (pfile, cplusplus)
3965 && ! buffer->warned_cplusplus_comments)
3967 if (cpp_error (pfile, CPP_DL_WARNING,
3968 "C++ style comments are incompatible with C90"))
3969 cpp_error (pfile, CPP_DL_NOTE,
3970 "(this will be reported only once per input file)");
3971 buffer->warned_cplusplus_comments = 1;
3973 /* In C89/C94, C++ style comments are forbidden. */
3974 else if ((CPP_OPTION (pfile, lang) == CLK_STDC89
3975 || CPP_OPTION (pfile, lang) == CLK_STDC94))
3977 /* But don't be confused about valid code such as
3978 - // immediately followed by *,
3979 - // in a preprocessing directive,
3980 - // in an #if 0 block. */
3981 if (buffer->cur[1] == '*'
3982 || pfile->state.in_directive
3983 || pfile->state.skipping)
3985 result->type = CPP_DIV;
3986 break;
3988 else if (! buffer->warned_cplusplus_comments)
3990 if (cpp_error (pfile, CPP_DL_ERROR,
3991 "C++ style comments are not allowed in "
3992 "ISO C90"))
3993 cpp_error (pfile, CPP_DL_NOTE,
3994 "(this will be reported only once per input "
3995 "file)");
3996 buffer->warned_cplusplus_comments = 1;
3999 if (skip_line_comment (pfile) && CPP_OPTION (pfile, warn_comments))
4000 cpp_warning (pfile, CPP_W_COMMENTS, "multi-line comment");
4002 else if (c == '=')
4004 buffer->cur++;
4005 result->type = CPP_DIV_EQ;
4006 break;
4008 else
4010 result->type = CPP_DIV;
4011 break;
4014 if (fallthrough_comment_p (pfile, comment_start))
4015 fallthrough_comment = true;
4017 if (pfile->cb.comment)
4019 size_t len = pfile->buffer->cur - comment_start;
4020 pfile->cb.comment (pfile, result->src_loc, comment_start - 1,
4021 len + 1);
4024 if (!pfile->state.save_comments)
4026 result->flags |= PREV_WHITE;
4027 goto update_tokens_line;
4030 if (fallthrough_comment)
4031 result->flags |= PREV_FALLTHROUGH;
4033 /* Save the comment as a token in its own right. */
4034 save_comment (pfile, result, comment_start, c);
4035 break;
4037 case '<':
4038 if (pfile->state.angled_headers)
4040 lex_string (pfile, result, buffer->cur - 1);
4041 if (result->type != CPP_LESS)
4042 break;
4045 result->type = CPP_LESS;
4046 if (*buffer->cur == '=')
4048 buffer->cur++, result->type = CPP_LESS_EQ;
4049 if (*buffer->cur == '>'
4050 && CPP_OPTION (pfile, cplusplus)
4051 && CPP_OPTION (pfile, lang) >= CLK_GNUCXX20)
4052 buffer->cur++, result->type = CPP_SPACESHIP;
4054 else if (*buffer->cur == '<')
4056 buffer->cur++;
4057 IF_NEXT_IS ('=', CPP_LSHIFT_EQ, CPP_LSHIFT);
4059 else if (CPP_OPTION (pfile, digraphs))
4061 if (*buffer->cur == ':')
4063 /* C++11 [2.5/3 lex.pptoken], "Otherwise, if the next
4064 three characters are <:: and the subsequent character
4065 is neither : nor >, the < is treated as a preprocessor
4066 token by itself". */
4067 if (CPP_OPTION (pfile, cplusplus)
4068 && CPP_OPTION (pfile, lang) != CLK_CXX98
4069 && CPP_OPTION (pfile, lang) != CLK_GNUCXX
4070 && buffer->cur[1] == ':'
4071 && buffer->cur[2] != ':' && buffer->cur[2] != '>')
4072 break;
4074 buffer->cur++;
4075 result->flags |= DIGRAPH;
4076 result->type = CPP_OPEN_SQUARE;
4078 else if (*buffer->cur == '%')
4080 buffer->cur++;
4081 result->flags |= DIGRAPH;
4082 result->type = CPP_OPEN_BRACE;
4085 break;
4087 case '>':
4088 result->type = CPP_GREATER;
4089 if (*buffer->cur == '=')
4090 buffer->cur++, result->type = CPP_GREATER_EQ;
4091 else if (*buffer->cur == '>')
4093 buffer->cur++;
4094 IF_NEXT_IS ('=', CPP_RSHIFT_EQ, CPP_RSHIFT);
4096 break;
4098 case '%':
4099 result->type = CPP_MOD;
4100 if (*buffer->cur == '=')
4101 buffer->cur++, result->type = CPP_MOD_EQ;
4102 else if (CPP_OPTION (pfile, digraphs))
4104 if (*buffer->cur == ':')
4106 buffer->cur++;
4107 result->flags |= DIGRAPH;
4108 result->type = CPP_HASH;
4109 if (*buffer->cur == '%' && buffer->cur[1] == ':')
4110 buffer->cur += 2, result->type = CPP_PASTE, result->val.token_no = 0;
4112 else if (*buffer->cur == '>')
4114 buffer->cur++;
4115 result->flags |= DIGRAPH;
4116 result->type = CPP_CLOSE_BRACE;
4119 break;
4121 case '.':
4122 result->type = CPP_DOT;
4123 if (ISDIGIT (*buffer->cur))
4125 struct normalize_state nst = INITIAL_NORMALIZE_STATE;
4126 result->type = CPP_NUMBER;
4127 lex_number (pfile, &result->val.str, &nst);
4128 warn_about_normalization (pfile, result, &nst, false);
4130 else if (*buffer->cur == '.' && buffer->cur[1] == '.')
4131 buffer->cur += 2, result->type = CPP_ELLIPSIS;
4132 else if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
4133 buffer->cur++, result->type = CPP_DOT_STAR;
4134 break;
4136 case '+':
4137 result->type = CPP_PLUS;
4138 if (*buffer->cur == '+')
4139 buffer->cur++, result->type = CPP_PLUS_PLUS;
4140 else if (*buffer->cur == '=')
4141 buffer->cur++, result->type = CPP_PLUS_EQ;
4142 break;
4144 case '-':
4145 result->type = CPP_MINUS;
4146 if (*buffer->cur == '>')
4148 buffer->cur++;
4149 result->type = CPP_DEREF;
4150 if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
4151 buffer->cur++, result->type = CPP_DEREF_STAR;
4153 else if (*buffer->cur == '-')
4154 buffer->cur++, result->type = CPP_MINUS_MINUS;
4155 else if (*buffer->cur == '=')
4156 buffer->cur++, result->type = CPP_MINUS_EQ;
4157 break;
4159 case '&':
4160 result->type = CPP_AND;
4161 if (*buffer->cur == '&')
4162 buffer->cur++, result->type = CPP_AND_AND;
4163 else if (*buffer->cur == '=')
4164 buffer->cur++, result->type = CPP_AND_EQ;
4165 break;
4167 case '|':
4168 result->type = CPP_OR;
4169 if (*buffer->cur == '|')
4170 buffer->cur++, result->type = CPP_OR_OR;
4171 else if (*buffer->cur == '=')
4172 buffer->cur++, result->type = CPP_OR_EQ;
4173 break;
4175 case ':':
4176 result->type = CPP_COLON;
4177 if (*buffer->cur == ':' && CPP_OPTION (pfile, scope))
4178 buffer->cur++, result->type = CPP_SCOPE;
4179 else if (*buffer->cur == '>' && CPP_OPTION (pfile, digraphs))
4181 buffer->cur++;
4182 result->flags |= DIGRAPH;
4183 result->type = CPP_CLOSE_SQUARE;
4185 break;
4187 case '*': IF_NEXT_IS ('=', CPP_MULT_EQ, CPP_MULT); break;
4188 case '=': IF_NEXT_IS ('=', CPP_EQ_EQ, CPP_EQ); break;
4189 case '!': IF_NEXT_IS ('=', CPP_NOT_EQ, CPP_NOT); break;
4190 case '^': IF_NEXT_IS ('=', CPP_XOR_EQ, CPP_XOR); break;
4191 case '#': IF_NEXT_IS ('#', CPP_PASTE, CPP_HASH); result->val.token_no = 0; break;
4193 case '?': result->type = CPP_QUERY; break;
4194 case '~': result->type = CPP_COMPL; break;
4195 case ',': result->type = CPP_COMMA; break;
4196 case '(': result->type = CPP_OPEN_PAREN; break;
4197 case ')': result->type = CPP_CLOSE_PAREN; break;
4198 case '[': result->type = CPP_OPEN_SQUARE; break;
4199 case ']': result->type = CPP_CLOSE_SQUARE; break;
4200 case '{': result->type = CPP_OPEN_BRACE; break;
4201 case '}': result->type = CPP_CLOSE_BRACE; break;
4202 case ';': result->type = CPP_SEMICOLON; break;
4204 /* @ is a punctuator in Objective-C. */
4205 case '@': result->type = CPP_ATSIGN; break;
4207 default:
4209 const uchar *base = --buffer->cur;
4210 static int no_warn_cnt;
4212 /* Check for an extended identifier ($ or UCN or UTF-8). */
4213 struct normalize_state nst = INITIAL_NORMALIZE_STATE;
4214 if (forms_identifier_p (pfile, true, &nst))
4216 result->type = CPP_NAME;
4217 result->val.node.node = lex_identifier (pfile, base, true, &nst,
4218 &result->val.node.spelling);
4219 warn_about_normalization (pfile, result, &nst, true);
4220 break;
4223 /* Otherwise this will form a CPP_OTHER token. Parse valid UTF-8 as a
4224 single token. */
4225 buffer->cur++;
4226 if (c >= utf8_signifier)
4228 const uchar *pstr = base;
4229 cppchar_t s;
4230 if (_cpp_valid_utf8 (pfile, &pstr, buffer->rlimit, 0, NULL, &s))
4232 if (s > UCS_LIMIT && CPP_OPTION (pfile, cpp_warn_invalid_utf8))
4234 buffer->cur = base;
4235 _cpp_warn_invalid_utf8 (pfile);
4237 buffer->cur = pstr;
4239 else if (CPP_OPTION (pfile, cpp_warn_invalid_utf8))
4241 buffer->cur = base;
4242 const uchar *end = _cpp_warn_invalid_utf8 (pfile);
4243 buffer->cur = base + 1;
4244 no_warn_cnt = end - buffer->cur;
4247 else if (c >= utf8_continuation
4248 && CPP_OPTION (pfile, cpp_warn_invalid_utf8))
4250 if (no_warn_cnt)
4251 --no_warn_cnt;
4252 else
4254 buffer->cur = base;
4255 _cpp_warn_invalid_utf8 (pfile);
4256 buffer->cur = base + 1;
4259 create_literal (pfile, result, base, buffer->cur - base, CPP_OTHER);
4260 break;
4265 /* Potentially convert the location of the token to a range. */
4266 if (result->src_loc >= RESERVED_LOCATION_COUNT
4267 && result->type != CPP_EOF)
4269 /* Ensure that any line notes are processed, so that we have the
4270 correct physical line/column for the end-point of the token even
4271 when a logical line is split via one or more backslashes. */
4272 if (buffer->cur >= buffer->notes[buffer->cur_note].pos
4273 && !pfile->overlaid_buffer)
4274 _cpp_process_line_notes (pfile, false);
4276 source_range tok_range;
4277 tok_range.m_start = result->src_loc;
4278 tok_range.m_finish
4279 = linemap_position_for_column (pfile->line_table,
4280 CPP_BUF_COLUMN (buffer, buffer->cur));
4282 result->src_loc = COMBINE_LOCATION_DATA (pfile->line_table,
4283 result->src_loc,
4284 tok_range, NULL, 0);
4287 return result;
4290 /* An upper bound on the number of bytes needed to spell TOKEN.
4291 Does not include preceding whitespace. */
4292 unsigned int
4293 cpp_token_len (const cpp_token *token)
4295 unsigned int len;
4297 switch (TOKEN_SPELL (token))
4299 default: len = 6; break;
4300 case SPELL_LITERAL: len = token->val.str.len; break;
4301 case SPELL_IDENT: len = NODE_LEN (token->val.node.node) * 10; break;
4304 return len;
/* Parse one UTF-8 sequence out of NAME and place a \U escape (with
   eight lower-case hexadecimal digits) in BUFFER.  Return the number
   of bytes read out of NAME.  (There are always 10 bytes written to
   BUFFER; no terminating NUL is added.)  Aborts if a byte after the
   first is not a UTF-8 continuation byte.  */

static size_t
utf8_to_ucn (unsigned char *buffer, const unsigned char *name)
{
  int j;
  int ucn_len = 0;
  int ucn_len_c;
  unsigned t;
  unsigned long utf32;

  /* Compute the length of the UTF-8 sequence: the number of leading
     one bits in the first byte.  */
  for (t = *name; t & 0x80; t <<= 1)
    ucn_len++;

  /* The remaining low bits of the first byte are the top payload
     bits; each continuation byte contributes six more.  */
  utf32 = *name & (0x7F >> ucn_len);
  for (ucn_len_c = 1; ucn_len_c < ucn_len; ucn_len_c++)
    {
      utf32 = (utf32 << 6) | (*++name & 0x3F);

      /* Ill-formed UTF-8.  */
      if ((*name & ~0x3F) != 0x80)
        abort ();
    }

  *buffer++ = '\\';
  *buffer++ = 'U';
  for (j = 7; j >= 0; j--)
    *buffer++ = "0123456789abcdef"[(utf32 >> (4 * j)) & 0xF];
  return ucn_len;
}
4341 /* Given a token TYPE corresponding to a digraph, return a pointer to
4342 the spelling of the digraph. */
4343 static const unsigned char *
4344 cpp_digraph2name (enum cpp_ttype type)
4346 return digraph_spellings[(int) type - (int) CPP_FIRST_DIGRAPH];
4349 /* Write the spelling of an identifier IDENT, using UCNs, to BUFFER.
4350 The buffer must already contain the enough space to hold the
4351 token's spelling. Returns a pointer to the character after the
4352 last character written. */
4353 unsigned char *
4354 _cpp_spell_ident_ucns (unsigned char *buffer, cpp_hashnode *ident)
4356 size_t i;
4357 const unsigned char *name = NODE_NAME (ident);
4359 for (i = 0; i < NODE_LEN (ident); i++)
4360 if (name[i] & ~0x7F)
4362 i += utf8_to_ucn (buffer, name + i) - 1;
4363 buffer += 10;
4365 else
4366 *buffer++ = name[i];
4368 return buffer;
4371 /* Write the spelling of a token TOKEN to BUFFER. The buffer must
4372 already contain the enough space to hold the token's spelling.
4373 Returns a pointer to the character after the last character written.
4374 FORSTRING is true if this is to be the spelling after translation
4375 phase 1 (with the original spelling of extended identifiers), false
4376 if extended identifiers should always be written using UCNs (there is
4377 no option for always writing them in the internal UTF-8 form).
4378 FIXME: Would be nice if we didn't need the PFILE argument. */
4379 unsigned char *
4380 cpp_spell_token (cpp_reader *pfile, const cpp_token *token,
4381 unsigned char *buffer, bool forstring)
4383 switch (TOKEN_SPELL (token))
4385 case SPELL_OPERATOR:
4387 const unsigned char *spelling;
4388 unsigned char c;
4390 if (token->flags & DIGRAPH)
4391 spelling = cpp_digraph2name (token->type);
4392 else if (token->flags & NAMED_OP)
4393 goto spell_ident;
4394 else
4395 spelling = TOKEN_NAME (token);
4397 while ((c = *spelling++) != '\0')
4398 *buffer++ = c;
4400 break;
4402 spell_ident:
4403 case SPELL_IDENT:
4404 if (forstring)
4406 memcpy (buffer, NODE_NAME (token->val.node.spelling),
4407 NODE_LEN (token->val.node.spelling));
4408 buffer += NODE_LEN (token->val.node.spelling);
4410 else
4411 buffer = _cpp_spell_ident_ucns (buffer, token->val.node.node);
4412 break;
4414 case SPELL_LITERAL:
4415 memcpy (buffer, token->val.str.text, token->val.str.len);
4416 buffer += token->val.str.len;
4417 break;
4419 case SPELL_NONE:
4420 cpp_error (pfile, CPP_DL_ICE,
4421 "unspellable token %s", TOKEN_NAME (token));
4422 break;
4425 return buffer;
4428 /* Returns TOKEN spelt as a null-terminated string. The string is
4429 freed when the reader is destroyed. Useful for diagnostics. */
4430 unsigned char *
4431 cpp_token_as_text (cpp_reader *pfile, const cpp_token *token)
4433 unsigned int len = cpp_token_len (token) + 1;
4434 unsigned char *start = _cpp_unaligned_alloc (pfile, len), *end;
4436 end = cpp_spell_token (pfile, token, start, false);
4437 end[0] = '\0';
4439 return start;
4442 /* Returns a pointer to a string which spells the token defined by
4443 TYPE and FLAGS. Used by C front ends, which really should move to
4444 using cpp_token_as_text. */
4445 const char *
4446 cpp_type2name (enum cpp_ttype type, unsigned char flags)
4448 if (flags & DIGRAPH)
4449 return (const char *) cpp_digraph2name (type);
4450 else if (flags & NAMED_OP)
4451 return cpp_named_operator2name (type);
4453 return (const char *) token_spellings[type].name;
4456 /* Writes the spelling of token to FP, without any preceding space.
4457 Separated from cpp_spell_token for efficiency - to avoid stdio
4458 double-buffering. */
4459 void
4460 cpp_output_token (const cpp_token *token, FILE *fp)
4462 switch (TOKEN_SPELL (token))
4464 case SPELL_OPERATOR:
4466 const unsigned char *spelling;
4467 int c;
4469 if (token->flags & DIGRAPH)
4470 spelling = cpp_digraph2name (token->type);
4471 else if (token->flags & NAMED_OP)
4472 goto spell_ident;
4473 else
4474 spelling = TOKEN_NAME (token);
4476 c = *spelling;
4478 putc (c, fp);
4479 while ((c = *++spelling) != '\0');
4481 break;
4483 spell_ident:
4484 case SPELL_IDENT:
4486 size_t i;
4487 const unsigned char * name = NODE_NAME (token->val.node.node);
4489 for (i = 0; i < NODE_LEN (token->val.node.node); i++)
4490 if (name[i] & ~0x7F)
4492 unsigned char buffer[10];
4493 i += utf8_to_ucn (buffer, name + i) - 1;
4494 fwrite (buffer, 1, 10, fp);
4496 else
4497 fputc (NODE_NAME (token->val.node.node)[i], fp);
4499 break;
4501 case SPELL_LITERAL:
4502 if (token->type == CPP_HEADER_NAME)
4503 fputc ('"', fp);
4504 fwrite (token->val.str.text, 1, token->val.str.len, fp);
4505 if (token->type == CPP_HEADER_NAME)
4506 fputc ('"', fp);
4507 break;
4509 case SPELL_NONE:
4510 /* An error, most probably. */
4511 break;
4515 /* Compare two tokens. */
4517 _cpp_equiv_tokens (const cpp_token *a, const cpp_token *b)
4519 if (a->type == b->type && a->flags == b->flags)
4520 switch (TOKEN_SPELL (a))
4522 default: /* Keep compiler happy. */
4523 case SPELL_OPERATOR:
4524 /* token_no is used to track where multiple consecutive ##
4525 tokens were originally located. */
4526 return (a->type != CPP_PASTE || a->val.token_no == b->val.token_no);
4527 case SPELL_NONE:
4528 return (a->type != CPP_MACRO_ARG
4529 || (a->val.macro_arg.arg_no == b->val.macro_arg.arg_no
4530 && a->val.macro_arg.spelling == b->val.macro_arg.spelling));
4531 case SPELL_IDENT:
4532 return (a->val.node.node == b->val.node.node
4533 && a->val.node.spelling == b->val.node.spelling);
4534 case SPELL_LITERAL:
4535 return (a->val.str.len == b->val.str.len
4536 && !memcmp (a->val.str.text, b->val.str.text,
4537 a->val.str.len));
4540 return 0;
4543 /* Returns nonzero if a space should be inserted to avoid an
4544 accidental token paste for output. For simplicity, it is
4545 conservative, and occasionally advises a space where one is not
4546 needed, e.g. "." and ".2". */
4548 cpp_avoid_paste (cpp_reader *pfile, const cpp_token *token1,
4549 const cpp_token *token2)
4551 enum cpp_ttype a = token1->type, b = token2->type;
4552 cppchar_t c;
4554 if (token1->flags & NAMED_OP)
4555 a = CPP_NAME;
4556 if (token2->flags & NAMED_OP)
4557 b = CPP_NAME;
4559 c = EOF;
4560 if (token2->flags & DIGRAPH)
4561 c = digraph_spellings[(int) b - (int) CPP_FIRST_DIGRAPH][0];
4562 else if (token_spellings[b].category == SPELL_OPERATOR)
4563 c = token_spellings[b].name[0];
4565 /* Quickly get everything that can paste with an '='. */
4566 if ((int) a <= (int) CPP_LAST_EQ && c == '=')
4567 return 1;
4569 switch (a)
4571 case CPP_GREATER: return c == '>';
4572 case CPP_LESS: return c == '<' || c == '%' || c == ':';
4573 case CPP_PLUS: return c == '+';
4574 case CPP_MINUS: return c == '-' || c == '>';
4575 case CPP_DIV: return c == '/' || c == '*'; /* Comments. */
4576 case CPP_MOD: return c == ':' || c == '>';
4577 case CPP_AND: return c == '&';
4578 case CPP_OR: return c == '|';
4579 case CPP_COLON: return c == ':' || c == '>';
4580 case CPP_DEREF: return c == '*';
4581 case CPP_DOT: return c == '.' || c == '%' || b == CPP_NUMBER;
4582 case CPP_HASH: return c == '#' || c == '%'; /* Digraph form. */
4583 case CPP_PRAGMA:
4584 case CPP_NAME: return ((b == CPP_NUMBER
4585 && name_p (pfile, &token2->val.str))
4586 || b == CPP_NAME
4587 || b == CPP_CHAR || b == CPP_STRING); /* L */
4588 case CPP_NUMBER: return (b == CPP_NUMBER || b == CPP_NAME
4589 || b == CPP_CHAR
4590 || c == '.' || c == '+' || c == '-');
4591 /* UCNs */
4592 case CPP_OTHER: return ((token1->val.str.text[0] == '\\'
4593 && b == CPP_NAME)
4594 || (CPP_OPTION (pfile, objc)
4595 && token1->val.str.text[0] == '@'
4596 && (b == CPP_NAME || b == CPP_STRING)));
4597 case CPP_LESS_EQ: return c == '>';
4598 case CPP_STRING:
4599 case CPP_WSTRING:
4600 case CPP_UTF8STRING:
4601 case CPP_STRING16:
4602 case CPP_STRING32: return (CPP_OPTION (pfile, user_literals)
4603 && (b == CPP_NAME
4604 || (TOKEN_SPELL (token2) == SPELL_LITERAL
4605 && ISIDST (token2->val.str.text[0]))));
4607 default: break;
4610 return 0;
4613 /* Output all the remaining tokens on the current line, and a newline
4614 character, to FP. Leading whitespace is removed. If there are
4615 macros, special token padding is not performed. */
4616 void
4617 cpp_output_line (cpp_reader *pfile, FILE *fp)
4619 const cpp_token *token;
4621 token = cpp_get_token (pfile);
4622 while (token->type != CPP_EOF)
4624 cpp_output_token (token, fp);
4625 token = cpp_get_token (pfile);
4626 if (token->flags & PREV_WHITE)
4627 putc (' ', fp);
4630 putc ('\n', fp);
4633 /* Return a string representation of all the remaining tokens on the
4634 current line. The result is allocated using xmalloc and must be
4635 freed by the caller. */
4636 unsigned char *
4637 cpp_output_line_to_string (cpp_reader *pfile, const unsigned char *dir_name)
4639 const cpp_token *token;
4640 unsigned int out = dir_name ? ustrlen (dir_name) : 0;
4641 unsigned int alloced = 120 + out;
4642 unsigned char *result = (unsigned char *) xmalloc (alloced);
4644 /* If DIR_NAME is empty, there are no initial contents. */
4645 if (dir_name)
4647 sprintf ((char *) result, "#%s ", dir_name);
4648 out += 2;
4651 token = cpp_get_token (pfile);
4652 while (token->type != CPP_EOF)
4654 unsigned char *last;
4655 /* Include room for a possible space and the terminating nul. */
4656 unsigned int len = cpp_token_len (token) + 2;
4658 if (out + len > alloced)
4660 alloced *= 2;
4661 if (out + len > alloced)
4662 alloced = out + len;
4663 result = (unsigned char *) xrealloc (result, alloced);
4666 last = cpp_spell_token (pfile, token, &result[out], 0);
4667 out = last - result;
4669 token = cpp_get_token (pfile);
4670 if (token->flags & PREV_WHITE)
4671 result[out++] = ' ';
4674 result[out] = '\0';
4675 return result;
/* Memory buffers.  Changing these three constants can have a dramatic
   effect on performance.  The values here are reasonable defaults,
   but might be tuned.  If you adjust them, be sure to test across a
   range of uses of cpplib, including heavy nested function-like macro
   expansion.  Also check the change in peak memory usage (NJAMD is a
   good tool for this).

   MIN_BUFF_SIZE is the smallest buffer new_buff will create.
   BUFF_SIZE_UPPER_BOUND (MIN_SIZE) is the largest free buffer that
   _cpp_get_buff will hand out for a request of MIN_SIZE bytes.
   EXTENDED_BUFF_SIZE (BUFF, MIN_EXTRA) is MIN_EXTRA plus twice the
   distance between BUFF's cur and limit pointers.  Every macro
   argument is parenthesized so that expression arguments expand with
   the intended precedence.  */
#define MIN_BUFF_SIZE 8000
#define BUFF_SIZE_UPPER_BOUND(MIN_SIZE) (MIN_BUFF_SIZE + (MIN_SIZE) * 3 / 2)
#define EXTENDED_BUFF_SIZE(BUFF, MIN_EXTRA) \
  ((MIN_EXTRA) + ((BUFF)->limit - (BUFF)->cur) * 2)

#if MIN_BUFF_SIZE > BUFF_SIZE_UPPER_BOUND (0)
#error BUFF_SIZE_UPPER_BOUND must be at least as large as MIN_BUFF_SIZE!
#endif
4693 /* Create a new allocation buffer. Place the control block at the end
4694 of the buffer, so that buffer overflows will cause immediate chaos. */
4695 static _cpp_buff *
4696 new_buff (size_t len)
4698 _cpp_buff *result;
4699 unsigned char *base;
4701 if (len < MIN_BUFF_SIZE)
4702 len = MIN_BUFF_SIZE;
4703 len = CPP_ALIGN (len);
4705 #ifdef ENABLE_VALGRIND_ANNOTATIONS
4706 /* Valgrind warns about uses of interior pointers, so put _cpp_buff
4707 struct first. */
4708 size_t slen = CPP_ALIGN2 (sizeof (_cpp_buff), 2 * DEFAULT_ALIGNMENT);
4709 base = XNEWVEC (unsigned char, len + slen);
4710 result = (_cpp_buff *) base;
4711 base += slen;
4712 #else
4713 base = XNEWVEC (unsigned char, len + sizeof (_cpp_buff));
4714 result = (_cpp_buff *) (base + len);
4715 #endif
4716 result->base = base;
4717 result->cur = base;
4718 result->limit = base + len;
4719 result->next = NULL;
4720 return result;
4723 /* Place a chain of unwanted allocation buffers on the free list. */
4724 void
4725 _cpp_release_buff (cpp_reader *pfile, _cpp_buff *buff)
4727 _cpp_buff *end = buff;
4729 while (end->next)
4730 end = end->next;
4731 end->next = pfile->free_buffs;
4732 pfile->free_buffs = buff;
4735 /* Return a free buffer of size at least MIN_SIZE. */
4736 _cpp_buff *
4737 _cpp_get_buff (cpp_reader *pfile, size_t min_size)
4739 _cpp_buff *result, **p;
4741 for (p = &pfile->free_buffs;; p = &(*p)->next)
4743 size_t size;
4745 if (*p == NULL)
4746 return new_buff (min_size);
4747 result = *p;
4748 size = result->limit - result->base;
4749 /* Return a buffer that's big enough, but don't waste one that's
4750 way too big. */
4751 if (size >= min_size && size <= BUFF_SIZE_UPPER_BOUND (min_size))
4752 break;
4755 *p = result->next;
4756 result->next = NULL;
4757 result->cur = result->base;
4758 return result;
4761 /* Creates a new buffer with enough space to hold the uncommitted
4762 remaining bytes of BUFF, and at least MIN_EXTRA more bytes. Copies
4763 the excess bytes to the new buffer. Chains the new buffer after
4764 BUFF, and returns the new buffer. */
4765 _cpp_buff *
4766 _cpp_append_extend_buff (cpp_reader *pfile, _cpp_buff *buff, size_t min_extra)
4768 size_t size = EXTENDED_BUFF_SIZE (buff, min_extra);
4769 _cpp_buff *new_buff = _cpp_get_buff (pfile, size);
4771 buff->next = new_buff;
4772 memcpy (new_buff->base, buff->cur, BUFF_ROOM (buff));
4773 return new_buff;
4776 /* Creates a new buffer with enough space to hold the uncommitted
4777 remaining bytes of the buffer pointed to by BUFF, and at least
4778 MIN_EXTRA more bytes. Copies the excess bytes to the new buffer.
4779 Chains the new buffer before the buffer pointed to by BUFF, and
4780 updates the pointer to point to the new buffer. */
4781 void
4782 _cpp_extend_buff (cpp_reader *pfile, _cpp_buff **pbuff, size_t min_extra)
4784 _cpp_buff *new_buff, *old_buff = *pbuff;
4785 size_t size = EXTENDED_BUFF_SIZE (old_buff, min_extra);
4787 new_buff = _cpp_get_buff (pfile, size);
4788 memcpy (new_buff->base, old_buff->cur, BUFF_ROOM (old_buff));
4789 new_buff->next = old_buff;
4790 *pbuff = new_buff;
4793 /* Free a chain of buffers starting at BUFF. */
4794 void
4795 _cpp_free_buff (_cpp_buff *buff)
4797 _cpp_buff *next;
4799 for (; buff; buff = next)
4801 next = buff->next;
4802 #ifdef ENABLE_VALGRIND_ANNOTATIONS
4803 free (buff);
4804 #else
4805 free (buff->base);
4806 #endif
4810 /* Allocate permanent, unaligned storage of length LEN. */
4811 unsigned char *
4812 _cpp_unaligned_alloc (cpp_reader *pfile, size_t len)
4814 _cpp_buff *buff = pfile->u_buff;
4815 unsigned char *result = buff->cur;
4817 if (len > (size_t) (buff->limit - result))
4819 buff = _cpp_get_buff (pfile, len);
4820 buff->next = pfile->u_buff;
4821 pfile->u_buff = buff;
4822 result = buff->cur;
4825 buff->cur = result + len;
4826 return result;
4829 /* Allocate permanent, unaligned storage of length LEN from a_buff.
4830 That buffer is used for growing allocations when saving macro
4831 replacement lists in a #define, and when parsing an answer to an
4832 assertion in #assert, #unassert or #if (and therefore possibly
4833 whilst expanding macros). It therefore must not be used by any
4834 code that they might call: specifically the lexer and the guts of
4835 the macro expander.
4837 All existing other uses clearly fit this restriction: storing
4838 registered pragmas during initialization. */
4839 unsigned char *
4840 _cpp_aligned_alloc (cpp_reader *pfile, size_t len)
4842 _cpp_buff *buff = pfile->a_buff;
4843 unsigned char *result = buff->cur;
4845 if (len > (size_t) (buff->limit - result))
4847 buff = _cpp_get_buff (pfile, len);
4848 buff->next = pfile->a_buff;
4849 pfile->a_buff = buff;
4850 result = buff->cur;
4853 buff->cur = result + len;
4854 return result;
4857 /* Commit or allocate storage from a buffer. */
4859 void *
4860 _cpp_commit_buff (cpp_reader *pfile, size_t size)
4862 void *ptr = BUFF_FRONT (pfile->a_buff);
4864 if (pfile->hash_table->alloc_subobject)
4866 void *copy = pfile->hash_table->alloc_subobject (size);
4867 memcpy (copy, ptr, size);
4868 ptr = copy;
4870 else
4871 BUFF_FRONT (pfile->a_buff) += size;
4873 return ptr;
4876 /* Say which field of TOK is in use. */
4878 enum cpp_token_fld_kind
4879 cpp_token_val_index (const cpp_token *tok)
4881 switch (TOKEN_SPELL (tok))
4883 case SPELL_IDENT:
4884 return CPP_TOKEN_FLD_NODE;
4885 case SPELL_LITERAL:
4886 return CPP_TOKEN_FLD_STR;
4887 case SPELL_OPERATOR:
4888 /* Operands which were originally spelled as ident keep around
4889 the node for the exact spelling. */
4890 if (tok->flags & NAMED_OP)
4891 return CPP_TOKEN_FLD_NODE;
4892 else if (tok->type == CPP_PASTE)
4893 return CPP_TOKEN_FLD_TOKEN_NO;
4894 else
4895 return CPP_TOKEN_FLD_NONE;
4896 case SPELL_NONE:
4897 if (tok->type == CPP_MACRO_ARG)
4898 return CPP_TOKEN_FLD_ARG_NO;
4899 else if (tok->type == CPP_PADDING)
4900 return CPP_TOKEN_FLD_SOURCE;
4901 else if (tok->type == CPP_PRAGMA)
4902 return CPP_TOKEN_FLD_PRAGMA;
4903 /* fall through */
4904 default:
4905 return CPP_TOKEN_FLD_NONE;
4909 /* All tokens lexed in R after calling this function will be forced to
4910 have their location_t to be P, until
4911 cpp_stop_forcing_token_locations is called for R. */
4913 void
4914 cpp_force_token_locations (cpp_reader *r, location_t loc)
4916 r->forced_token_location = loc;
4919 /* Go back to assigning locations naturally for lexed tokens. */
4921 void
4922 cpp_stop_forcing_token_locations (cpp_reader *r)
4924 r->forced_token_location = 0;
/* PEEK points at a backslash.  If it escapes an end of line (LF, CR
   or CR LF), look past it, repeating for any further escaped line
   ends that follow directly.  Returns the first position that is not
   the start of such a splice.  If skipping a splice would reach or
   pass LIMIT, don't advance over it.  */

static const unsigned char *
do_peek_backslash (const unsigned char *peek, const unsigned char *limit)
{
  for (;;)
    {
      const unsigned char *after;

      if (__builtin_expect (peek[1] == '\n', true))
        after = peek + 2;
      else if (__builtin_expect (peek[1] == '\r', false))
        /* CR alone, or a CR LF pair.  */
        after = peek + (peek[2] == '\n' ? 3 : 2);
      else
        /* Not an escaped line end at all.  */
        return peek;

      if (!__builtin_expect (after < limit, true))
        return peek;

      peek = after;

      /* The user might be perverse and splice again right away.  */
      if (*peek != '\\')
        return peek;
    }
}
4957 static const unsigned char *
4958 do_peek_next (const unsigned char *peek, const unsigned char *limit)
4960 if (__builtin_expect (*peek == '\\', false))
4961 peek = do_peek_backslash (peek, limit);
4962 return peek;
/* Step backwards from PEEK to the previous character, looking through
   backslash-escaped line ends (LF, CR or CR LF preceded by a
   backslash).  BOUND is the lowest address that may be examined.

   Returns NULL if PEEK is already at BOUND.  Otherwise returns a
   pointer to the previous character.  When that character is a line
   end that is not escaped, or BOUND is reached before that can be
   decided, the pointer to the line end itself is returned.  */
static const unsigned char *
do_peek_prev (const unsigned char *peek, const unsigned char *bound)
{
  if (peek == bound)
    return NULL;

  unsigned char c = *--peek;

  /* Only line-ending characters can be part of a splice.  The second
     test must be against carriage return, not the letter 'r'.  */
  if (__builtin_expect (c == '\n', false)
      || __builtin_expect (c == '\r', false))
    {
      if (peek == bound)
        return peek;

      int ix = -1;
      if (c == '\n' && peek[ix] == '\r')
        {
          /* CR LF pair: the backslash, if any, is one further back.  */
          if (peek + ix == bound)
            return peek;
          ix--;
        }

      if (peek[ix] == '\\')
        /* Escaped line end: continue from before the backslash.  */
        return do_peek_prev (peek + ix, bound);

      return peek;
    }

  return peek;
}
4994 /* If PEEK[-1] is identifier MATCH, scan past it and trailing white
4995 space. Otherwise return NULL. */
4997 static const unsigned char *
4998 do_peek_ident (const char *match, const unsigned char *peek,
4999 const unsigned char *limit)
5001 for (; *++match; peek++)
5002 if (*peek != *match)
5004 peek = do_peek_next (peek, limit);
5005 if (*peek != *match)
5006 return NULL;
5009 /* Must now not be looking at an identifier char. */
5010 peek = do_peek_next (peek, limit);
5011 if (ISIDNUM (*peek))
5012 return NULL;
5014 /* Skip control-line whitespace. */
5016 while (*peek == ' ' || *peek == '\t')
5017 peek++;
5018 if (__builtin_expect (*peek == '\\', false))
5020 peek = do_peek_backslash (peek, limit);
5021 if (*peek != '\\')
5022 goto ws;
5025 return peek;
5028 /* Are we looking at a module control line starting as PEEK - 1? */
5030 static bool
5031 do_peek_module (cpp_reader *pfile, unsigned char c,
5032 const unsigned char *peek, const unsigned char *limit)
5034 bool import = false;
5036 if (__builtin_expect (c == 'e', false))
5038 if (!((peek[0] == 'x' || peek[0] == '\\')
5039 && (peek = do_peek_ident ("export", peek, limit))))
5040 return false;
5042 /* export, peek for import or module. No need to peek __import
5043 here. */
5044 if (peek[0] == 'i')
5046 if (!((peek[1] == 'm' || peek[1] == '\\')
5047 && (peek = do_peek_ident ("import", peek + 1, limit))))
5048 return false;
5049 import = true;
5051 else if (peek[0] == 'm')
5053 if (!((peek[1] == 'o' || peek[1] == '\\')
5054 && (peek = do_peek_ident ("module", peek + 1, limit))))
5055 return false;
5057 else
5058 return false;
5060 else if (__builtin_expect (c == 'i', false))
5062 if (!((peek[0] == 'm' || peek[0] == '\\')
5063 && (peek = do_peek_ident ("import", peek, limit))))
5064 return false;
5065 import = true;
5067 else if (__builtin_expect (c == '_', false))
5069 /* Needed for translated includes. */
5070 if (!((peek[0] == '_' || peek[0] == '\\')
5071 && (peek = do_peek_ident ("__import", peek, limit))))
5072 return false;
5073 import = true;
5075 else if (__builtin_expect (c == 'm', false))
5077 if (!((peek[0] == 'o' || peek[0] == '\\')
5078 && (peek = do_peek_ident ("module", peek, limit))))
5079 return false;
5081 else
5082 return false;
5084 /* Peek the next character to see if it's good enough. We'll be at
5085 the first non-whitespace char, including skipping an escaped
5086 newline. */
5087 /* ... import followed by identifier, ':', '<' or header-name
5088 preprocessing tokens, or module followed by identifier, ':' or
5089 ';' preprocessing tokens. */
5090 unsigned char p = *peek++;
5092 /* A character literal is ... single quotes, ... optionally preceded
5093 by u8, u, U, or L */
5094 /* A string-literal is a ... double quotes, optionally prefixed by
5095 R, u8, u8R, u, uR, U, UR, L, or LR */
5096 if (p == 'u')
5098 peek = do_peek_next (peek, limit);
5099 if (*peek == '8')
5101 peek++;
5102 goto peek_u8;
5104 goto peek_u;
5106 else if (p == 'U' || p == 'L')
5108 peek_u8:
5109 peek = do_peek_next (peek, limit);
5110 peek_u:
5111 if (*peek == '\"' || *peek == '\'')
5112 return false;
5114 if (*peek == 'R')
5115 goto peek_R;
5116 /* Identifier. Ok. */
5118 else if (p == 'R')
5120 peek_R:
5121 if (CPP_OPTION (pfile, rliterals))
5123 peek = do_peek_next (peek, limit);
5124 if (*peek == '\"')
5125 return false;
5127 /* Identifier. Ok. */
5129 else if ('Z' - 'A' == 25
5130 ? ((p >= 'A' && p <= 'Z') || (p >= 'a' && p <= 'z') || p == '_')
5131 : ISIDST (p))
5133 /* Identifier. Ok. */
5135 else if (p == '<')
5137 /* Maybe angle header, ok for import. Reject
5138 '<=', '<<' digraph:'<:'. */
5139 if (!import)
5140 return false;
5141 peek = do_peek_next (peek, limit);
5142 if (*peek == '=' || *peek == '<'
5143 || (*peek == ':' && CPP_OPTION (pfile, digraphs)))
5144 return false;
5146 else if (p == ';')
5148 /* SEMICOLON, ok for module. */
5149 if (import)
5150 return false;
5152 else if (p == '"')
5154 /* STRING, ok for import. */
5155 if (!import)
5156 return false;
5158 else if (p == ':')
5160 /* Maybe COLON, ok. Reject '::', digraph:':>'. */
5161 peek = do_peek_next (peek, limit);
5162 if (*peek == ':' || (*peek == '>' && CPP_OPTION (pfile, digraphs)))
5163 return false;
5165 else
5166 /* FIXME: Detect a unicode character, excluding those not
5167 permitted as the initial character. [lex.name]/1. I presume
5168 we need to check the \[uU] spellings, and directly using
5169 Unicode in say UTF8 form? Or perhaps we do the phase-1
5170 conversion of UTF8 to universal-character-names? */
5171 return false;
5173 return true;
5176 /* Directives-only scanning. Somewhat more relaxed than correct
5177 parsing -- some ill-formed programs will not be rejected. */
5179 void
5180 cpp_directive_only_process (cpp_reader *pfile,
5181 void *data,
5182 void (*cb) (cpp_reader *, CPP_DO_task, void *, ...))
5184 bool module_p = CPP_OPTION (pfile, module_directives);
5188 restart:
5189 /* Buffer initialization, but no line cleaning. */
5190 cpp_buffer *buffer = pfile->buffer;
5191 buffer->cur_note = buffer->notes_used = 0;
5192 buffer->cur = buffer->line_base = buffer->next_line;
5193 buffer->need_line = false;
5194 /* Files always end in a newline or carriage return. We rely on this for
5195 character peeking safety. */
5196 gcc_assert (buffer->rlimit[0] == '\n' || buffer->rlimit[0] == '\r');
5198 const unsigned char *base = buffer->cur;
5199 unsigned line_count = 0;
5200 const unsigned char *line_start = base;
5202 bool bol = true;
5203 bool raw = false;
5205 const unsigned char *lwm = base;
5206 for (const unsigned char *pos = base, *limit = buffer->rlimit;
5207 pos < limit;)
5209 unsigned char c = *pos++;
5210 /* This matches the switch in _cpp_lex_direct. */
5211 switch (c)
5213 case ' ': case '\t': case '\f': case '\v':
5214 /* Whitespace, do nothing. */
5215 break;
5217 case '\r': /* MAC line ending, or Windows \r\n */
5218 if (*pos == '\n')
5219 pos++;
5220 /* FALLTHROUGH */
5222 case '\n':
5223 bol = true;
5225 next_line:
5226 CPP_INCREMENT_LINE (pfile, 0);
5227 line_count++;
5228 line_start = pos;
5229 break;
5231 case '\\':
5232 /* <backslash><newline> is removed, and doesn't undo any
5233 preceeding escape or whatnot. */
5234 if (*pos == '\n')
5236 pos++;
5237 goto next_line;
5239 else if (*pos == '\r')
5241 if (pos[1] == '\n')
5242 pos++;
5243 pos++;
5244 goto next_line;
5246 goto dflt;
5248 case '#':
5249 if (bol)
5251 /* Line directive. */
5252 if (pos - 1 > base && !pfile->state.skipping)
5253 cb (pfile, CPP_DO_print, data,
5254 line_count, base, pos - 1 - base);
5256 /* Prep things for directive handling. */
5257 buffer->next_line = pos;
5258 buffer->need_line = true;
5259 bool ok = _cpp_get_fresh_line (pfile);
5260 gcc_checking_assert (ok);
5262 /* Ensure proper column numbering for generated
5263 error messages. */
5264 buffer->line_base -= pos - line_start;
5266 _cpp_handle_directive (pfile, line_start + 1 != pos);
5268 /* Sanitize the line settings. Duplicate #include's can
5269 mess things up. */
5270 // FIXME: Necessary?
5271 pfile->line_table->highest_location
5272 = pfile->line_table->highest_line;
5274 if (!pfile->state.skipping
5275 && pfile->buffer->next_line < pfile->buffer->rlimit)
5276 cb (pfile, CPP_DO_location, data,
5277 pfile->line_table->highest_line);
5279 goto restart;
5281 goto dflt;
5283 case '/':
5285 const unsigned char *peek = do_peek_next (pos, limit);
5286 if (!(*peek == '/' || *peek == '*'))
5287 goto dflt;
5289 /* Line or block comment */
5290 bool is_block = *peek == '*';
5291 bool star = false;
5292 bool esc = false;
5293 location_t sloc
5294 = linemap_position_for_column (pfile->line_table,
5295 pos - line_start);
5297 while (pos < limit)
5299 char c = *pos++;
5300 switch (c)
5302 case '\\':
5303 esc = true;
5304 break;
5306 case '\r':
5307 if (*pos == '\n')
5308 pos++;
5309 /* FALLTHROUGH */
5311 case '\n':
5313 CPP_INCREMENT_LINE (pfile, 0);
5314 line_count++;
5315 line_start = pos;
5316 if (!esc && !is_block)
5318 bol = true;
5319 goto done_comment;
5322 if (!esc)
5323 star = false;
5324 esc = false;
5325 break;
5327 case '*':
5328 if (pos > peek)
5329 star = is_block;
5330 esc = false;
5331 break;
5333 case '/':
5334 if (star)
5335 goto done_comment;
5336 /* FALLTHROUGH */
5338 default:
5339 star = false;
5340 esc = false;
5341 break;
5344 if (pos < limit || is_block)
5345 cpp_error_with_line (pfile, CPP_DL_ERROR, sloc, 0,
5346 "unterminated comment");
5347 done_comment:
5348 lwm = pos;
5349 break;
5352 case '\'':
5353 if (!CPP_OPTION (pfile, digit_separators))
5354 goto delimited_string;
5356 /* Possibly a number punctuator. */
5357 if (!ISIDNUM (*do_peek_next (pos, limit)))
5358 goto delimited_string;
5360 goto quote_peek;
5362 case '\"':
5363 if (!CPP_OPTION (pfile, rliterals))
5364 goto delimited_string;
5366 quote_peek:
5368 /* For ' see if it's a number punctuator
5369 \.?<digit>(<digit>|<identifier-nondigit>
5370 |'<digit>|'<nondigit>|[eEpP]<sign>|\.)* */
5371 /* For " see if it's a raw string
5372 {U,L,u,u8}R. This includes CPP_NUMBER detection,
5373 because that could be 0e+R. */
5374 const unsigned char *peek = pos - 1;
5375 bool quote_first = c == '"';
5376 bool quote_eight = false;
5377 bool maybe_number_start = false;
5378 bool want_number = false;
5380 while ((peek = do_peek_prev (peek, lwm)))
5382 unsigned char p = *peek;
5383 if (quote_first)
5385 if (!raw)
5387 if (p != 'R')
5388 break;
5389 raw = true;
5390 continue;
5393 quote_first = false;
5394 if (p == 'L' || p == 'U' || p == 'u')
5396 else if (p == '8')
5397 quote_eight = true;
5398 else
5399 goto second_raw;
5401 else if (quote_eight)
5403 if (p != 'u')
5405 raw = false;
5406 break;
5408 quote_eight = false;
5410 else if (c == '"')
5412 second_raw:;
5413 if (!want_number && ISIDNUM (p))
5415 raw = false;
5416 break;
5420 if (ISDIGIT (p))
5421 maybe_number_start = true;
5422 else if (p == '.')
5423 want_number = true;
5424 else if (ISIDNUM (p))
5425 maybe_number_start = false;
5426 else if (p == '+' || p == '-')
5428 if (const unsigned char *peek_prev
5429 = do_peek_prev (peek, lwm))
5431 p = *peek_prev;
5432 if (p == 'e' || p == 'E'
5433 || p == 'p' || p == 'P')
5435 want_number = true;
5436 maybe_number_start = false;
5438 else
5439 break;
5441 else
5442 break;
5444 else if (p == '\'' || p == '\"')
5446 /* If this is lwm, this must be the end of a
5447 previous string. So this is a trailing
5448 literal type, (a) if those are allowed,
5449 and (b) maybe_start is false. Otherwise
5450 this must be a CPP_NUMBER because we've
5451 met another ', and we'd have checked that
5452 in its own right. */
5453 if (peek == lwm && CPP_OPTION (pfile, uliterals))
5455 if (!maybe_number_start && !want_number)
5456 /* Must be a literal type. */
5457 raw = false;
5459 else if (p == '\''
5460 && CPP_OPTION (pfile, digit_separators))
5461 maybe_number_start = true;
5462 break;
5464 else if (c == '\'')
5465 break;
5466 else if (!quote_first && !quote_eight)
5467 break;
5470 if (maybe_number_start)
5472 if (c == '\'')
5473 /* A CPP NUMBER. */
5474 goto dflt;
5475 raw = false;
5478 goto delimited_string;
5481 delimited_string:
5483 /* (Possibly raw) string or char literal. */
5484 unsigned char end = c;
5485 int delim_len = -1;
5486 const unsigned char *delim = NULL;
5487 location_t sloc = linemap_position_for_column (pfile->line_table,
5488 pos - line_start);
5489 int esc = 0;
5491 if (raw)
5493 /* There can be no line breaks in the delimiter. */
5494 delim = pos;
5495 for (delim_len = 0; (c = *pos++) != '('; delim_len++)
5497 if (delim_len == 16)
5499 cpp_error_with_line (pfile, CPP_DL_ERROR,
5500 sloc, 0,
5501 "raw string delimiter"
5502 " longer than %d"
5503 " characters",
5504 delim_len);
5505 raw = false;
5506 pos = delim;
5507 break;
5509 if (strchr (") \\\t\v\f\n", c))
5511 cpp_error_with_line (pfile, CPP_DL_ERROR,
5512 sloc, 0,
5513 "invalid character '%c'"
5514 " in raw string"
5515 " delimiter", c);
5516 raw = false;
5517 pos = delim;
5518 break;
5520 if (pos >= limit)
5521 goto bad_string;
5525 while (pos < limit)
5527 char c = *pos++;
5528 switch (c)
5530 case '\\':
5531 if (!raw)
5532 esc++;
5533 break;
5535 case '\r':
5536 if (*pos == '\n')
5537 pos++;
5538 /* FALLTHROUGH */
5540 case '\n':
5542 CPP_INCREMENT_LINE (pfile, 0);
5543 line_count++;
5544 line_start = pos;
5546 if (esc)
5547 esc--;
5548 break;
5550 case ')':
5551 if (raw
5552 && pos + delim_len + 1 < limit
5553 && pos[delim_len] == end
5554 && !memcmp (delim, pos, delim_len))
5556 pos += delim_len + 1;
5557 raw = false;
5558 goto done_string;
5560 break;
5562 default:
5563 if (!raw && !(esc & 1) && c == end)
5564 goto done_string;
5565 esc = 0;
5566 break;
5569 bad_string:
5570 cpp_error_with_line (pfile, CPP_DL_ERROR, sloc, 0,
5571 "unterminated literal");
5573 done_string:
5574 raw = false;
5575 lwm = pos - 1;
5577 goto dflt;
5579 case '_':
5580 case 'e':
5581 case 'i':
5582 case 'm':
5583 if (bol && module_p && !pfile->state.skipping
5584 && do_peek_module (pfile, c, pos, limit))
5586 /* We've seen the start of a module control line.
5587 Start up the tokenizer. */
5588 pos--; /* Backup over the first character. */
5590 /* Backup over whitespace to start of line. */
5591 while (pos > line_start
5592 && (pos[-1] == ' ' || pos[-1] == '\t'))
5593 pos--;
5595 if (pos > base)
5596 cb (pfile, CPP_DO_print, data, line_count, base, pos - base);
5598 /* Prep things for directive handling. */
5599 buffer->next_line = pos;
5600 buffer->need_line = true;
5602 /* Now get tokens until the PRAGMA_EOL. */
5605 location_t spelling;
5606 const cpp_token *tok
5607 = cpp_get_token_with_location (pfile, &spelling);
5609 gcc_assert (pfile->state.in_deferred_pragma
5610 || tok->type == CPP_PRAGMA_EOL);
5611 cb (pfile, CPP_DO_token, data, tok, spelling);
5613 while (pfile->state.in_deferred_pragma);
5615 if (pfile->buffer->next_line < pfile->buffer->rlimit)
5616 cb (pfile, CPP_DO_location, data,
5617 pfile->line_table->highest_line);
5619 pfile->mi_valid = false;
5620 goto restart;
5622 goto dflt;
5624 default:
5625 dflt:
5626 bol = false;
5627 pfile->mi_valid = false;
5628 break;
5632 if (buffer->rlimit > base && !pfile->state.skipping)
5634 const unsigned char *limit = buffer->rlimit;
5635 /* If the file was not newline terminated, add rlimit, which is
5636 guaranteed to point to a newline, to the end of our range. */
5637 if (limit[-1] != '\n')
5639 limit++;
5640 CPP_INCREMENT_LINE (pfile, 0);
5641 line_count++;
5643 cb (pfile, CPP_DO_print, data, line_count, base, limit - base);
5646 _cpp_pop_buffer (pfile);
5648 while (pfile->buffer);