c++: ICE with VEC_INIT_EXPR and defarg [PR106925]
[official-gcc.git] / libcpp / lex.cc
bloba429a3d44ceee9e7b2b7e8c43d158916a5733427
1 /* CPP Library - lexical analysis.
2 Copyright (C) 2000-2022 Free Software Foundation, Inc.
3 Contributed by Per Bothner, 1994-95.
4 Based on CCCP program by Paul Rubin, June 1986
5 Adapted to ANSI C, Richard Stallman, Jan 1987
6 Broken out to separate file, Zack Weinberg, Mar 2000
8 This program is free software; you can redistribute it and/or modify it
9 under the terms of the GNU General Public License as published by the
10 Free Software Foundation; either version 3, or (at your option) any
11 later version.
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
18 You should have received a copy of the GNU General Public License
19 along with this program; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "cpplib.h"
25 #include "internal.h"
/* Spelling category recorded for every token type in token_spellings
   and read back through TOKEN_SPELL.  */
27 enum spell_type
29 SPELL_OPERATOR = 0,
30 SPELL_IDENT,
31 SPELL_LITERAL,
32 SPELL_NONE
/* One token_spellings entry: the category of a token type together
   with the text stored as its name.  */
35 struct token_spelling
37 enum spell_type category;
38 const unsigned char *name;
/* Spellings of the digraph forms.  */
41 static const unsigned char *const digraph_spellings[] =
42 { UC"%:", UC"%:%:", UC"<:", UC":>", UC"<%", UC"%>" };
/* TTYPE_TABLE is expanded with OP and TK defined as below.  An OP entry
   stores the string S with category SPELL_OPERATOR; a TK entry stores
   the stringized enumerator E with category SPELL_<S>.  Both helper
   macros are undefined again once the table has been built.  */
44 #define OP(e, s) { SPELL_OPERATOR, UC s },
45 #define TK(e, s) { SPELL_ ## s, UC #e },
46 static const struct token_spelling token_spellings[N_TTYPES] = { TTYPE_TABLE };
47 #undef OP
48 #undef TK
/* Look up the category and the name recorded for TOKEN's type.  */
50 #define TOKEN_SPELL(token) (token_spellings[(token)->type].category)
51 #define TOKEN_NAME(token) (token_spellings[(token)->type].name)
53 /* ISO 10646 defines the UCS codespace as the range 0-0x10FFFF inclusive. */
54 #define UCS_LIMIT 0x10FFFF
/* Prototypes for the static helpers of this file, declared ahead of
   their definitions.  */
56 static void add_line_note (cpp_buffer *, const uchar *, unsigned int);
57 static int skip_line_comment (cpp_reader *);
58 static void skip_whitespace (cpp_reader *, cppchar_t);
59 static void lex_string (cpp_reader *, cpp_token *, const uchar *);
60 static void save_comment (cpp_reader *, cpp_token *, const uchar *, cppchar_t);
61 static void store_comment (cpp_reader *, cpp_token *);
62 static void create_literal (cpp_reader *, cpp_token *, const uchar *,
63 unsigned int, enum cpp_ttype);
64 static bool warn_in_comment (cpp_reader *, _cpp_line_note *);
65 static int name_p (cpp_reader *, const cpp_string *);
66 static tokenrun *next_tokenrun (tokenrun *);
68 static _cpp_buff *new_buff (size_t);
71 /* Utility routine:
73 Compares, the token TOKEN to the NUL-terminated string STRING.
74 TOKEN must be a CPP_NAME. Returns 1 for equal, 0 for unequal. */
75 int
76 cpp_ideq (const cpp_token *token, const char *string)
78 if (token->type != CPP_NAME)
79 return 0;
81 return !ustrcmp (NODE_NAME (token->val.node.node), (const uchar *) string);
84 /* Record a note TYPE at byte POS into the current cleaned logical
85 line. */
86 static void
87 add_line_note (cpp_buffer *buffer, const uchar *pos, unsigned int type)
89 if (buffer->notes_used == buffer->notes_cap)
91 buffer->notes_cap = buffer->notes_cap * 2 + 200;
92 buffer->notes = XRESIZEVEC (_cpp_line_note, buffer->notes,
93 buffer->notes_cap);
96 buffer->notes[buffer->notes_used].pos = pos;
97 buffer->notes[buffer->notes_used].type = type;
98 buffer->notes_used++;
102 /* Fast path to find line special characters using optimized character
103 scanning algorithms. Anything complicated falls back to the slow
104 path below. Since this loop is very hot it's worth doing these kinds
105 of optimizations.
107 One of the paths through the ifdefs should provide
109 const uchar *search_line_fast (const uchar *s, const uchar *end);
111 Between S and END, search for \n, \r, \\, ?. Return a pointer to
112 the found character.
114 Note that the last character of the buffer is *always* a newline,
115 as forced by _cpp_convert_input. This fact can be used to avoid
116 explicitly looking for the end of the buffer. */
118 /* Configure gives us an ifdef test. */
/* Defaulting to 0 lets the helpers below test WORDS_BIGENDIAN with an
   ordinary `if' instead of the preprocessor.  */
119 #ifndef WORDS_BIGENDIAN
120 #define WORDS_BIGENDIAN 0
121 #endif
123 /* We'd like the largest integer that fits into a register. There's nothing
124 in <stdint.h> that gives us that. For most hosts this is unsigned long,
125 but MS decided on an LLP64 model. Thankfully when building with GCC we
126 can get the "real" word size. */
127 #ifdef __GNUC__
128 typedef unsigned int word_type __attribute__((__mode__(__word__)));
129 #else
130 typedef unsigned long word_type;
131 #endif
133 /* The code below is only expecting sizes 4 or 8.
134 Die at compile-time if this expectation is violated. */
/* The array bound evaluates to 1 when the size is 4 or 8 and to -1
   otherwise; the negative bound is what rejects the typedef.  */
135 typedef char check_word_type_size
136 [(sizeof(word_type) == 8 || sizeof(word_type) == 4) * 2 - 1];
138 /* Return X with the first N bytes forced to values that won't match one
139 of the interesting characters. Note that NUL is not interesting. */
141 static inline word_type
142 acc_char_mask_misalign (word_type val, unsigned int n)
144 word_type mask = -1;
145 if (WORDS_BIGENDIAN)
146 mask >>= n * 8;
147 else
148 mask <<= n * 8;
149 return val & mask;
152 /* Return X replicated to all byte positions within WORD_TYPE. */
154 static inline word_type
155 acc_char_replicate (uchar x)
157 word_type ret;
159 ret = (x << 24) | (x << 16) | (x << 8) | x;
160 if (sizeof(word_type) == 8)
161 ret = (ret << 16 << 16) | ret;
162 return ret;
165 /* Return non-zero if some byte of VAL is (probably) C. */
167 static inline word_type
168 acc_char_cmp (word_type val, word_type c)
170 #if defined(__GNUC__) && defined(__alpha__)
171 /* We can get exact results using a compare-bytes instruction.
172 Get (val == c) via (0 >= (val ^ c)). */
173 return __builtin_alpha_cmpbge (0, val ^ c);
174 #else
175 word_type magic = 0x7efefefeU;
176 if (sizeof(word_type) == 8)
177 magic = (magic << 16 << 16) | 0xfefefefeU;
178 magic |= 1;
180 val ^= c;
181 return ((val + magic) ^ ~val) & ~magic;
182 #endif
185 /* Given the result of acc_char_cmp is non-zero, return the index of
186 the found character. If this was a false positive, return -1. */
188 static inline int
189 acc_char_index (word_type cmp ATTRIBUTE_UNUSED,
190 word_type val ATTRIBUTE_UNUSED)
192 #if defined(__GNUC__) && defined(__alpha__) && !WORDS_BIGENDIAN
193 /* The cmpbge instruction sets *bits* of the result corresponding to
194 matches in the bytes with no false positives. */
195 return __builtin_ctzl (cmp);
196 #else
197 unsigned int i;
199 /* ??? It would be nice to force unrolling here,
200 and have all of these constants folded. */
201 for (i = 0; i < sizeof(word_type); ++i)
203 uchar c;
204 if (WORDS_BIGENDIAN)
205 c = (val >> (sizeof(word_type) - i - 1) * 8) & 0xff;
206 else
207 c = (val >> i * 8) & 0xff;
209 if (c == '\n' || c == '\r' || c == '\\' || c == '?')
210 return i;
213 return -1;
214 #endif
217 /* A version of the fast scanner using bit fiddling techniques.
219 For 32-bit words, one would normally perform 16 comparisons and
220 16 branches. With this algorithm one performs 24 arithmetic
221 operations and one branch. Whether this is faster with a 32-bit
222 word size is going to be somewhat system dependent.
224 For 64-bit words, we eliminate twice the number of comparisons
225 and branches without increasing the number of arithmetic operations.
226 It's almost certainly going to be a win with 64-bit word size. */
/* S is where scanning starts.  END is accepted so that the signature
   matches the other scanners but is not consulted.  Returns a pointer
   to the first '\n', '\r', '\\' or '?' found at or after S.  */
228 static const uchar * search_line_acc_char (const uchar *, const uchar *)
229 ATTRIBUTE_UNUSED;
231 static const uchar *
232 search_line_acc_char (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
/* One word per interesting character, with that character in every
   byte position.  */
234 const word_type repl_nl = acc_char_replicate ('\n');
235 const word_type repl_cr = acc_char_replicate ('\r');
236 const word_type repl_bs = acc_char_replicate ('\\');
237 const word_type repl_qm = acc_char_replicate ('?');
239 unsigned int misalign;
240 const word_type *p;
241 word_type val, t;
243 /* Align the buffer. Mask out any bytes from before the beginning. */
244 p = (word_type *)((uintptr_t)s & -sizeof(word_type));
245 val = *p;
246 misalign = (uintptr_t)s & (sizeof(word_type) - 1);
247 if (misalign)
248 val = acc_char_mask_misalign (val, misalign);
250 /* Main loop. */
251 while (1)
253 t = acc_char_cmp (val, repl_nl);
254 t |= acc_char_cmp (val, repl_cr);
255 t |= acc_char_cmp (val, repl_bs);
256 t |= acc_char_cmp (val, repl_qm);
/* A non-zero T only says a match is probable.  acc_char_index checks
   the bytes exactly and yields -1 for a false positive, in which case
   the scan simply moves on to the next word.  */
258 if (__builtin_expect (t != 0, 0))
260 int i = acc_char_index (t, val);
261 if (i >= 0)
262 return (const uchar *)p + i;
265 val = *++p;
269 /* Disable on Solaris 2/x86 until the following problem can be properly
270 autoconfed:
272 The Solaris 10+ assembler tags objects with the instruction set
273 extensions used, so SSE4.2 executables cannot run on machines that
274 don't support that extension. */
276 #if (GCC_VERSION >= 4005) && (__GNUC__ >= 5 || !defined(__PIC__)) && (defined(__i386__) || defined(__x86_64__)) && !(defined(__sun__) && defined(__svr4__))
278 /* Replicated character data to be shared between implementations.
279 Recall that outside of a context with vector support we can't
280 define compatible vector types, therefore these are all defined
281 in terms of raw characters. */
/* Row order is '\n', '\r', '\\', '?'.  The MMX and SSE2 scanners below
   load rows 0 to 3 as their comparison vectors.  */
282 static const char repl_chars[4][16] __attribute__((aligned(16))) = {
283 { '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
284 '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n' },
285 { '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
286 '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r' },
287 { '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
288 '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\' },
289 { '?', '?', '?', '?', '?', '?', '?', '?',
290 '?', '?', '?', '?', '?', '?', '?', '?' },
293 /* A version of the fast scanner using MMX vectorized byte compare insns.
295 This uses the PMOVMSKB instruction which was introduced with "MMX2",
296 which was packaged into SSE1; it is also present in the AMD MMX
297 extension. Mark the function as using "sse" so that we emit a real
298 "emms" instruction, rather than the 3dNOW "femms" instruction. */
/* S is where scanning starts; END is not consulted.  Returns a pointer
   to the first '\n', '\r', '\\' or '?' found at or after S.  */
300 static const uchar *
301 #ifndef __SSE__
302 __attribute__((__target__("sse")))
303 #endif
304 search_line_mmx (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
306 typedef char v8qi __attribute__ ((__vector_size__ (8)));
307 typedef int __m64 __attribute__ ((__vector_size__ (8), __may_alias__));
/* Only the first 8 bytes of each 16-byte repl_chars row are used.  */
309 const v8qi repl_nl = *(const v8qi *)repl_chars[0];
310 const v8qi repl_cr = *(const v8qi *)repl_chars[1];
311 const v8qi repl_bs = *(const v8qi *)repl_chars[2];
312 const v8qi repl_qm = *(const v8qi *)repl_chars[3];
314 unsigned int misalign, found, mask;
315 const v8qi *p;
316 v8qi data, t, c;
318 /* Align the source pointer. While MMX doesn't generate unaligned data
319 faults, this allows us to safely scan to the end of the buffer without
320 reading beyond the end of the last page. */
321 misalign = (uintptr_t)s & 7;
322 p = (const v8qi *)((uintptr_t)s & -8);
323 data = *p;
325 /* Create a mask for the bytes that are valid within the first
326 8-byte block. The Idea here is that the AND with the mask
327 within the loop is "free", since we need some AND or TEST
328 insn in order to set the flags for the branch anyway. */
329 mask = -1u << misalign;
331 /* Main loop processing 8 bytes at a time. */
/* The first pass jumps straight to the compare so that the block loaded
   above is tested with the misalignment mask.  Every later pass loads
   the next block and uses an all-ones mask.  */
332 goto start;
335 data = *++p;
336 mask = -1;
338 start:
339 t = __builtin_ia32_pcmpeqb(data, repl_nl);
340 c = __builtin_ia32_pcmpeqb(data, repl_cr);
341 t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
342 c = __builtin_ia32_pcmpeqb(data, repl_bs);
343 t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
344 c = __builtin_ia32_pcmpeqb(data, repl_qm);
345 t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
346 found = __builtin_ia32_pmovmskb (t);
347 found &= mask;
349 while (!found);
351 __builtin_ia32_emms ();
353 /* FOUND contains 1 in bits for which we matched a relevant
354 character. Conversion to the byte index is trivial. */
355 found = __builtin_ctz(found);
356 return (const uchar *)p + found;
359 /* A version of the fast scanner using SSE2 vectorized byte compare insns. */
/* S is where scanning starts; END is not consulted.  Returns a pointer
   to the first '\n', '\r', '\\' or '?' found at or after S.  */
361 static const uchar *
362 #ifndef __SSE2__
363 __attribute__((__target__("sse2")))
364 #endif
365 search_line_sse2 (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
367 typedef char v16qi __attribute__ ((__vector_size__ (16)));
369 const v16qi repl_nl = *(const v16qi *)repl_chars[0];
370 const v16qi repl_cr = *(const v16qi *)repl_chars[1];
371 const v16qi repl_bs = *(const v16qi *)repl_chars[2];
372 const v16qi repl_qm = *(const v16qi *)repl_chars[3];
374 unsigned int misalign, found, mask;
375 const v16qi *p;
376 v16qi data, t;
378 /* Align the source pointer. */
379 misalign = (uintptr_t)s & 15;
380 p = (const v16qi *)((uintptr_t)s & -16);
381 data = *p;
383 /* Create a mask for the bytes that are valid within the first
384 16-byte block. The Idea here is that the AND with the mask
385 within the loop is "free", since we need some AND or TEST
386 insn in order to set the flags for the branch anyway. */
387 mask = -1u << misalign;
389 /* Main loop processing 16 bytes at a time. */
/* The first pass jumps straight to the compare so that the block loaded
   above is tested with the misalignment mask.  Every later pass loads
   the next block and uses an all-ones mask.  */
390 goto start;
393 data = *++p;
394 mask = -1;
396 start:
397 t = data == repl_nl;
398 t |= data == repl_cr;
399 t |= data == repl_bs;
400 t |= data == repl_qm;
401 found = __builtin_ia32_pmovmskb128 (t);
402 found &= mask;
404 while (!found);
406 /* FOUND contains 1 in bits for which we matched a relevant
407 character. Conversion to the byte index is trivial. */
408 found = __builtin_ctz(found);
409 return (const uchar *)p + found;
412 #ifdef HAVE_SSE4
413 /* A version of the fast scanner using SSE 4.2 vectorized string insns. */
/* S is where scanning starts.  END is read only to decide whether an
   unaligned 16-byte load at S is safe.  Returns a pointer to the first
   '\n', '\r', '\\' or '?' found at or after S.  */
415 static const uchar *
416 #ifndef __SSE4_2__
417 __attribute__((__target__("sse4.2")))
418 #endif
419 search_line_sse42 (const uchar *s, const uchar *end)
421 typedef char v16qi __attribute__ ((__vector_size__ (16)));
/* The four interesting characters occupy the first four bytes; the
   explicit length 4 handed to PCMPESTRI below limits matching to
   them.  */
422 static const v16qi search = { '\n', '\r', '?', '\\' };
424 uintptr_t si = (uintptr_t)s;
425 uintptr_t index;
427 /* Check for unaligned input. */
428 if (si & 15)
430 v16qi sv;
432 if (__builtin_expect (end - s < 16, 0)
433 && __builtin_expect ((si & 0xfff) > 0xff0, 0))
435 /* There are fewer than 16 bytes left in the buffer, and fewer
436 than 16 bytes left on the page. Reading 16 bytes at this
437 point might generate a spurious page fault. Defer to the
438 SSE2 implementation, which already handles alignment. */
439 return search_line_sse2 (s, end);
442 /* ??? The builtin doesn't understand that the PCMPESTRI read from
443 memory need not be aligned. */
444 sv = __builtin_ia32_loaddqu ((const char *) s);
445 index = __builtin_ia32_pcmpestri128 (search, 4, sv, 16, 0);
/* An index below 16 means one of the loaded bytes matched.  */
447 if (__builtin_expect (index < 16, 0))
448 goto found;
450 /* Advance the pointer to an aligned address. We will re-scan a
451 few bytes, but we no longer need care for reading past the
452 end of a page, since we're guaranteed a match. */
453 s = (const uchar *)((si + 15) & -16);
456 /* Main loop, processing 16 bytes at a time. */
457 #ifdef __GCC_ASM_FLAG_OUTPUTS__
458 while (1)
460 char f;
462 /* By using inline assembly instead of the builtin,
463 we can use the result, as well as the flags set. */
464 __asm ("%vpcmpestri\t$0, %2, %3"
465 : "=c"(index), "=@ccc"(f)
466 : "m"(*s), "x"(search), "a"(4), "d"(16));
/* F receives the carry flag, which the loop treats as "match found".  */
467 if (f)
468 break;
470 s += 16;
472 #else
/* Bias S by one block because the assembly loop adds 16 before each
   compare.  */
473 s -= 16;
474 /* By doing the whole loop in inline assembly,
475 we can make proper use of the flags set. */
476 __asm ( ".balign 16\n"
477 "0: add $16, %1\n"
478 " %vpcmpestri\t$0, (%1), %2\n"
479 " jnc 0b"
480 : "=&c"(index), "+r"(s)
481 : "x"(search), "a"(4), "d"(16));
482 #endif
484 found:
485 return s + index;
488 #else
489 /* Work around out-dated assemblers without sse4 support. */
490 #define search_line_sse42 search_line_sse2
491 #endif
493 /* Check the CPU capabilities. */
495 #include "../gcc/config/i386/cpuid.h"
/* search_line_fast is a function pointer on this target; it is set by
   init_vectorized_lexer below.  */
497 typedef const uchar * (*search_line_fast_type) (const uchar *, const uchar *);
498 static search_line_fast_type search_line_fast;
/* Lets _cpp_init_lexer know that there is an initializer to call.  */
500 #define HAVE_init_vectorized_lexer 1
/* Choose the scanner implementation and store it in search_line_fast.
   MINIMUM encodes the instruction set already guaranteed by the
   compile-time target macros (3 = SSE4.2, 2 = SSE2, 1 = SSE, 0 = none);
   CPUID is queried for anything beyond that.  When nothing better is
   available the choice stays at search_line_acc_char.  */
501 static inline void
502 init_vectorized_lexer (void)
504 unsigned dummy, ecx = 0, edx = 0;
505 search_line_fast_type impl = search_line_acc_char;
506 int minimum = 0;
508 #if defined(__SSE4_2__)
509 minimum = 3;
510 #elif defined(__SSE2__)
511 minimum = 2;
512 #elif defined(__SSE__)
513 minimum = 1;
514 #endif
/* With SSE4.2 guaranteed there is no need to run CPUID at all.  */
516 if (minimum == 3)
517 impl = search_line_sse42;
/* Standard feature leaf 1.  ECX and EDX stay zero if the query fails,
   so the MINIMUM tests below still select a scanner.  */
518 else if (__get_cpuid (1, &dummy, &dummy, &ecx, &edx) || minimum == 2)
520 if (minimum == 3 || (ecx & bit_SSE4_2))
521 impl = search_line_sse42;
522 else if (minimum == 2 || (edx & bit_SSE2))
523 impl = search_line_sse2;
524 else if (minimum == 1 || (edx & bit_SSE))
525 impl = search_line_mmx;
/* Extended leaf 0x80000001: the MMX scanner is also usable when both
   MMXEXT and CMOV are reported there.  */
527 else if (__get_cpuid (0x80000001, &dummy, &dummy, &dummy, &edx))
529 if (minimum == 1
530 || (edx & (bit_MMXEXT | bit_CMOV)) == (bit_MMXEXT | bit_CMOV))
531 impl = search_line_mmx;
534 search_line_fast = impl;
537 #elif (GCC_VERSION >= 4005) && defined(_ARCH_PWR8) && defined(__ALTIVEC__)
539 /* A version of the fast scanner using AltiVec vectorized byte compares
540 and VSX unaligned loads (when VSX is available). This is otherwise
541 the same as the AltiVec version. */
/* S is where scanning starts; END is not consulted.  Returns a pointer
   to the first '\n', '\r', '\\' or '?' found at or after S.  S is not
   aligned first: every block is loaded from S itself with
   __builtin_vec_vsx_ld.  */
543 ATTRIBUTE_NO_SANITIZE_UNDEFINED
544 static const uchar *
545 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
547 typedef __attribute__((altivec(vector))) unsigned char vc;
549 const vc repl_nl = {
550 '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
551 '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
553 const vc repl_cr = {
554 '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
555 '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
557 const vc repl_bs = {
558 '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
559 '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
561 const vc repl_qm = {
562 '?', '?', '?', '?', '?', '?', '?', '?',
563 '?', '?', '?', '?', '?', '?', '?', '?',
565 const vc zero = { 0 };
567 vc data, t;
569 /* Main loop processing 16 bytes at a time. */
572 vc m_nl, m_cr, m_bs, m_qm;
574 data = __builtin_vec_vsx_ld (0, s);
575 s += 16;
577 m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
578 m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
579 m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
580 m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
581 t = (m_nl | m_cr) | (m_bs | m_qm);
583 /* T now contains 0xff in bytes for which we matched one of the relevant
584 characters. We want to exit the loop if any byte in T is non-zero.
585 Below is the expansion of vec_any_ne(t, zero). */
587 while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));
589 /* Restore s to point to the 16 bytes we just processed. */
590 s -= 16;
/* N is the number of unsigned long words that make up one vector.  */
593 #define N (sizeof(vc) / sizeof(long))
595 union {
596 vc v;
597 /* Statically assert that N is 2 or 4. */
598 unsigned long l[(N == 2 || N == 4) ? N : -1];
599 } u;
600 unsigned long l, i = 0;
602 u.v = t;
604 /* Find the first word of T that is non-zero. */
/* S is advanced past every all-zero word, so on exit it addresses the
   word that holds the match.  The last word is not tested because the
   loop above only ends once T has a non-zero byte.  */
605 switch (N)
607 case 4:
608 l = u.l[i++];
609 if (l != 0)
610 break;
611 s += sizeof(unsigned long);
612 l = u.l[i++];
613 if (l != 0)
614 break;
615 s += sizeof(unsigned long);
616 /* FALLTHRU */
617 case 2:
618 l = u.l[i++];
619 if (l != 0)
620 break;
621 s += sizeof(unsigned long);
622 l = u.l[i];
625 /* L now contains 0xff in bytes for which we matched one of the
626 relevant characters. We can find the byte index by finding
627 its bit index and dividing by 8. */
/* The first byte in memory is the most significant byte of L on
   big-endian and the least significant on little-endian, hence the
   choice between counting leading and trailing zeros.  */
628 #ifdef __BIG_ENDIAN__
629 l = __builtin_clzl(l) >> 3;
630 #else
631 l = __builtin_ctzl(l) >> 3;
632 #endif
633 return s + l;
635 #undef N
639 #elif (GCC_VERSION >= 4005) && defined(__ALTIVEC__) && defined (__BIG_ENDIAN__)
641 /* A version of the fast scanner using AltiVec vectorized byte compares.
642 This cannot be used for little endian because vec_lvsl/lvsr are
643 deprecated for little endian and the code won't work properly. */
644 /* ??? Unfortunately, attribute(target("altivec")) is not yet supported,
645 so we can't compile this function without -maltivec on the command line
646 (or implied by some other switch). */
/* S is where scanning starts; END is not consulted.  Returns a pointer
   to the first '\n', '\r', '\\' or '?' found at or after S.  */
648 static const uchar *
649 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
651 typedef __attribute__((altivec(vector))) unsigned char vc;
653 const vc repl_nl = {
654 '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
655 '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
657 const vc repl_cr = {
658 '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
659 '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
661 const vc repl_bs = {
662 '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
663 '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
665 const vc repl_qm = {
666 '?', '?', '?', '?', '?', '?', '?', '?',
667 '?', '?', '?', '?', '?', '?', '?', '?',
669 const vc ones = {
670 -1, -1, -1, -1, -1, -1, -1, -1,
671 -1, -1, -1, -1, -1, -1, -1, -1,
673 const vc zero = { 0 };
675 vc data, mask, t;
677 /* Altivec loads automatically mask addresses with -16. This lets us
678 issue the first load as early as possible. */
679 data = __builtin_vec_ld(0, (const vc *)s);
681 /* Discard bytes before the beginning of the buffer. Do this by
682 beginning with all ones and shifting in zeros according to the
683 mis-alignment. The LVSR instruction pulls the exact shift we
684 want from the address. */
685 mask = __builtin_vec_lvsr(0, s);
686 mask = __builtin_vec_perm(zero, ones, mask);
687 data &= mask;
689 /* While altivec loads mask addresses, we still need to align S so
690 that the offset we compute at the end is correct. */
691 s = (const uchar *)((uintptr_t)s & -16);
693 /* Main loop processing 16 bytes at a time. */
/* The first pass jumps straight to the compare so that the masked block
   loaded above is tested; every later pass advances S and reloads.  */
694 goto start;
697 vc m_nl, m_cr, m_bs, m_qm;
699 s += 16;
700 data = __builtin_vec_ld(0, (const vc *)s);
702 start:
703 m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
704 m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
705 m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
706 m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
707 t = (m_nl | m_cr) | (m_bs | m_qm);
709 /* T now contains 0xff in bytes for which we matched one of the relevant
710 characters. We want to exit the loop if any byte in T is non-zero.
711 Below is the expansion of vec_any_ne(t, zero). */
713 while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));
/* N is the number of unsigned long words that make up one vector.  */
716 #define N (sizeof(vc) / sizeof(long))
718 union {
719 vc v;
720 /* Statically assert that N is 2 or 4. */
721 unsigned long l[(N == 2 || N == 4) ? N : -1];
722 } u;
723 unsigned long l, i = 0;
725 u.v = t;
727 /* Find the first word of T that is non-zero. */
/* S is advanced past every all-zero word, so on exit it addresses the
   word that holds the match.  The last word is not tested because the
   loop above only ends once T has a non-zero byte.  */
728 switch (N)
730 case 4:
731 l = u.l[i++];
732 if (l != 0)
733 break;
734 s += sizeof(unsigned long);
735 l = u.l[i++];
736 if (l != 0)
737 break;
738 s += sizeof(unsigned long);
739 /* FALLTHROUGH */
740 case 2:
741 l = u.l[i++];
742 if (l != 0)
743 break;
744 s += sizeof(unsigned long);
745 l = u.l[i];
748 /* L now contains 0xff in bytes for which we matched one of the
749 relevant characters. We can find the byte index by finding
750 its bit index and dividing by 8. */
/* This variant is big-endian only, so the first byte in memory is the
   most significant byte of L and leading zeros are counted.  */
751 l = __builtin_clzl(l) >> 3;
752 return s + l;
754 #undef N
758 #elif defined (__ARM_NEON) && defined (__ARM_64BIT_STATE)
759 #include "arm_neon.h"
761 /* This doesn't have to be the exact page size, but no system may use
762 a size smaller than this. ARMv8 requires a minimum page size of
763 4k. The impact of being conservative here is a small number of
764 cases will take the slightly slower entry path into the main
765 loop. */
767 #define AARCH64_MIN_PAGE_SIZE 4096
/* AArch64 NEON version of the fast scanner.  S is where scanning
   starts; END is not consulted.  Returns a pointer to the first '\n',
   '\r', '\\' or '?' found at or after S.  */
769 static const uchar *
770 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
772 const uint8x16_t repl_nl = vdupq_n_u8 ('\n');
773 const uint8x16_t repl_cr = vdupq_n_u8 ('\r');
774 const uint8x16_t repl_bs = vdupq_n_u8 ('\\');
775 const uint8x16_t repl_qm = vdupq_n_u8 ('?');
/* XMASK gives each byte of an 8-byte half its own bit.  Combined with
   the pairwise add, the per-lane SHIFT and the final horizontal add
   below, it condenses the byte-wise compare result into one bit per
   byte in FOUND.  */
776 const uint8x16_t xmask = (uint8x16_t) vdupq_n_u64 (0x8040201008040201ULL);
778 #ifdef __ARM_BIG_ENDIAN
779 const int16x8_t shift = {8, 8, 8, 8, 0, 0, 0, 0};
780 #else
781 const int16x8_t shift = {0, 0, 0, 0, 8, 8, 8, 8};
782 #endif
784 unsigned int found;
785 const uint8_t *p;
786 uint8x16_t data;
787 uint8x16_t t;
788 uint16x8_t m;
789 uint8x16_t u, v, w;
791 /* Align the source pointer. */
792 p = (const uint8_t *)((uintptr_t)s & -16);
794 /* Assuming random string start positions, with a 4k page size we'll take
795 the slow path about 0.37% of the time. */
796 if (__builtin_expect ((AARCH64_MIN_PAGE_SIZE
797 - (((uintptr_t) s) & (AARCH64_MIN_PAGE_SIZE - 1)))
798 < 16, 0))
800 /* Slow path: the string starts near a possible page boundary. */
801 uint32_t misalign, mask;
/* Load the aligned block that contains S and use MASK to drop matches
   in the bytes that precede S.  */
803 misalign = (uintptr_t)s & 15;
804 mask = (-1u << misalign) & 0xffff;
805 data = vld1q_u8 (p);
806 t = vceqq_u8 (data, repl_nl);
807 u = vceqq_u8 (data, repl_cr);
808 v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
809 w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
810 t = vorrq_u8 (v, w);
811 t = vandq_u8 (t, xmask);
812 m = vpaddlq_u8 (t);
813 m = vshlq_u16 (m, shift);
814 found = vaddvq_u16 (m);
815 found &= mask;
816 if (found)
817 return (const uchar*)p + __builtin_ctz (found);
819 else
/* Fast path: an unaligned 16-byte load directly at S.  */
821 data = vld1q_u8 ((const uint8_t *) s);
822 t = vceqq_u8 (data, repl_nl);
823 u = vceqq_u8 (data, repl_cr);
824 v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
825 w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
826 t = vorrq_u8 (v, w);
827 if (__builtin_expect (vpaddd_u64 ((uint64x2_t)t) != 0, 0))
828 goto done;
/* Main loop: aligned 16-byte blocks, starting with the block after P.  */
833 p += 16;
834 data = vld1q_u8 (p);
835 t = vceqq_u8 (data, repl_nl);
836 u = vceqq_u8 (data, repl_cr);
837 v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
838 w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
839 t = vorrq_u8 (v, w);
840 } while (!vpaddd_u64 ((uint64x2_t)t));
842 done:
843 /* Now that we've found the terminating substring, work out precisely where
844 we need to stop. */
845 t = vandq_u8 (t, xmask);
846 m = vpaddlq_u8 (t);
847 m = vshlq_u16 (m, shift);
848 found = vaddvq_u16 (m);
/* P is still below S only when the match came from the unaligned load
   at S, in which case the bit index is relative to S, not to P.  */
849 return (((((uintptr_t) p) < (uintptr_t) s) ? s : (const uchar *)p)
850 + __builtin_ctz (found));
853 #elif defined (__ARM_NEON)
854 #include "arm_neon.h"
/* NEON version of the fast scanner for targets that did not match the
   __ARM_64BIT_STATE branch.  S is where scanning starts; END is not
   consulted.  Returns a pointer to the first '\n', '\r', '\\' or '?'
   found at or after S.  */
856 static const uchar *
857 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
859 const uint8x16_t repl_nl = vdupq_n_u8 ('\n');
860 const uint8x16_t repl_cr = vdupq_n_u8 ('\r');
861 const uint8x16_t repl_bs = vdupq_n_u8 ('\\');
862 const uint8x16_t repl_qm = vdupq_n_u8 ('?');
/* XMASK gives each byte of an 8-byte half its own bit, so that the
   pairwise additions below can fold the compare result into FOUND.  */
863 const uint8x16_t xmask = (uint8x16_t) vdupq_n_u64 (0x8040201008040201ULL);
865 unsigned int misalign, found, mask;
866 const uint8_t *p;
867 uint8x16_t data;
869 /* Align the source pointer. */
870 misalign = (uintptr_t)s & 15;
871 p = (const uint8_t *)((uintptr_t)s & -16);
872 data = vld1q_u8 (p);
874 /* Create a mask for the bytes that are valid within the first
875 16-byte block. The Idea here is that the AND with the mask
876 within the loop is "free", since we need some AND or TEST
877 insn in order to set the flags for the branch anyway. */
878 mask = (-1u << misalign) & 0xffff;
880 /* Main loop, processing 16 bytes at a time. */
/* The first pass jumps straight to the compare so that the block loaded
   above is tested with the misalignment mask.  Every later pass loads
   the next block and accepts all 16 bytes.  */
881 goto start;
885 uint8x8_t l;
886 uint16x4_t m;
887 uint32x2_t n;
888 uint8x16_t t, u, v, w;
890 p += 16;
891 data = vld1q_u8 (p);
892 mask = 0xffff;
894 start:
895 t = vceqq_u8 (data, repl_nl);
896 u = vceqq_u8 (data, repl_cr);
897 v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
898 w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
899 t = vandq_u8 (vorrq_u8 (v, w), xmask);
900 l = vpadd_u8 (vget_low_u8 (t), vget_high_u8 (t));
901 m = vpaddl_u8 (l);
902 n = vpaddl_u16 (m);
904 found = vget_lane_u32 ((uint32x2_t) vorr_u64 ((uint64x1_t) n,
905 vshr_n_u64 ((uint64x1_t) n, 24)), 0);
906 found &= mask;
908 while (!found);
910 /* FOUND contains 1 in bits for which we matched a relevant
911 character. Conversion to the byte index is trivial. */
912 found = __builtin_ctz (found);
913 return (const uchar *)p + found;
916 #else
918 /* We only have one accelerated alternative. Use a direct call so that
919 we encourage inlining. */
921 #define search_line_fast search_line_acc_char
923 #endif
/* One-time lexer set-up.  On configurations that define
   HAVE_init_vectorized_lexer this runs init_vectorized_lexer;
   elsewhere there is nothing to do.  */

void
_cpp_init_lexer (void)
{
#if defined (HAVE_init_vectorized_lexer)
  init_vectorized_lexer ();
#endif
}
935 /* Returns with a logical line that contains no escaped newlines or
936 trigraphs. This is a time-critical inner loop. */
/* The line starts at the buffer's next_line.  On return cur and
   line_base point at its first byte, the line is terminated with '\n',
   next_line points just past the text that was consumed, and a note
   has been recorded for every escaped newline and trigraph that was
   seen, followed by a sentinel note.  */
937 void
938 _cpp_clean_line (cpp_reader *pfile)
940 cpp_buffer *buffer;
941 const uchar *s;
942 uchar c, *d, *p;
/* S is the read position, D the write position and P a scratch pointer
   used when looking backwards for an escaping backslash.  */
944 buffer = pfile->buffer;
945 buffer->cur_note = buffer->notes_used = 0;
946 buffer->cur = buffer->line_base = buffer->next_line;
947 buffer->need_line = false;
948 s = buffer->next_line;
/* Buffers flagged from_stage3 get no escaped-newline or trigraph
   processing; for them only the end of the line is located.  */
950 if (!buffer->from_stage3)
952 const uchar *pbackslash = NULL;
954 /* Fast path. This is the common case of an un-escaped line with
955 no trigraphs. The primary win here is by not writing any
956 data back to memory until we have to. */
957 while (1)
959 /* Perform an optimized search for \n, \r, \\, ?. */
960 s = search_line_fast (s, buffer->rlimit);
962 c = *s;
963 if (c == '\\')
965 /* Record the location of the backslash and continue. */
966 pbackslash = s++;
968 else if (__builtin_expect (c == '?', 0))
970 if (__builtin_expect (s[1] == '?', false)
971 && _cpp_trigraph_map[s[2]])
973 /* Have a trigraph. We may or may not have to convert
974 it. Add a line note regardless, for -Wtrigraphs. */
975 add_line_note (buffer, s, s[2]);
976 if (CPP_OPTION (pfile, trigraphs))
978 /* We do, and that means we have to switch to the
979 slow path. */
980 d = (uchar *) s;
981 *d = _cpp_trigraph_map[s[2]];
982 s += 2;
983 goto slow_path;
986 /* Not a trigraph. Continue on fast-path. */
987 s++;
989 else
990 break;
993 /* This must be \r or \n. We're either done, or we'll be forced
994 to write back to the buffer and continue on the slow path. */
995 d = (uchar *) s;
997 if (__builtin_expect (s == buffer->rlimit, false))
998 goto done;
1000 /* DOS line ending? */
1001 if (__builtin_expect (c == '\r', false) && s[1] == '\n')
1003 s++;
1004 if (s == buffer->rlimit)
1005 goto done;
/* Without a backslash on the line the newline cannot be escaped.  */
1008 if (__builtin_expect (pbackslash == NULL, true))
1009 goto done;
1011 /* Check for escaped newline. */
/* Step back over horizontal whitespace; the newline is escaped only if
   the last backslash seen sits immediately before it.  */
1012 p = d;
1013 while (is_nvspace (p[-1]))
1014 p--;
1015 if (p - 1 != pbackslash)
1016 goto done;
1018 /* Have an escaped newline; process it and proceed to
1019 the slow path. */
/* The note type is ' ' when whitespace separated the backslash from the
   newline and '\\' otherwise.  D is backed up so that the first store
   of the slow path lands on the backslash.  */
1020 add_line_note (buffer, p - 1, p != d ? ' ' : '\\');
1021 d = p - 2;
1022 buffer->next_line = p - 1;
/* Slow path: copy the rest of the line down to D one byte at a time,
   splicing out escaped newlines and replacing trigraphs on the way.  */
1024 slow_path:
1025 while (1)
1027 c = *++s;
1028 *++d = c;
1030 if (c == '\n' || c == '\r')
1032 /* Handle DOS line endings. */
1033 if (c == '\r' && s != buffer->rlimit && s[1] == '\n')
1034 s++;
1035 if (s == buffer->rlimit)
1036 break;
1038 /* Escaped? */
1039 p = d;
1040 while (p != buffer->next_line && is_nvspace (p[-1]))
1041 p--;
1042 if (p == buffer->next_line || p[-1] != '\\')
1043 break;
1045 add_line_note (buffer, p - 1, p != d ? ' ': '\\');
1046 d = p - 2;
1047 buffer->next_line = p - 1;
1049 else if (c == '?' && s[1] == '?' && _cpp_trigraph_map[s[2]])
1051 /* Add a note regardless, for the benefit of -Wtrigraphs. */
1052 add_line_note (buffer, d, s[2]);
1053 if (CPP_OPTION (pfile, trigraphs))
1055 *d = _cpp_trigraph_map[s[2]];
1056 s += 2;
1061 else
/* from_stage3 buffer: just find the end of the physical line.  */
1063 while (*s != '\n' && *s != '\r')
1064 s++;
1065 d = (uchar *) s;
1067 /* Handle DOS line endings. */
1068 if (*s == '\r' && s + 1 != buffer->rlimit && s[1] == '\n')
1069 s++;
1072 done:
/* Terminate the cleaned line with '\n', whatever line ending was
   found.  */
1073 *d = '\n';
1074 /* A sentinel note that should never be processed. */
1075 add_line_note (buffer, d + 1, '\n');
1076 buffer->next_line = s + 1;
1079 /* Return true if the trigraph indicated by NOTE should be warned
1080 about in a comment. */
1081 static bool
1082 warn_in_comment (cpp_reader *pfile, _cpp_line_note *note)
1084 const uchar *p;
1086 /* Within comments we don't warn about trigraphs, unless the
1087 trigraph forms an escaped newline, as that may change
1088 behavior. */
1089 if (note->type != '/')
1090 return false;
1092 /* If -trigraphs, then this was an escaped newline iff the next note
1093 is coincident. */
1094 if (CPP_OPTION (pfile, trigraphs))
1095 return note[1].pos == note->pos;
1097 /* Otherwise, see if this forms an escaped newline. */
1098 p = note->pos + 3;
1099 while (is_nvspace (*p))
1100 p++;
1102 /* There might have been escaped newlines between the trigraph and the
1103 newline we found. Hence the position test. */
1104 return (*p == '\n' && p < note[1].pos);
1107 /* Process the notes created by add_line_note as far as the current
1108 location. */
1109 void
1110 _cpp_process_line_notes (cpp_reader *pfile, int in_comment)
1112 cpp_buffer *buffer = pfile->buffer;
1114 for (;;)
1116 _cpp_line_note *note = &buffer->notes[buffer->cur_note];
1117 unsigned int col;
1119 if (note->pos > buffer->cur)
1120 break;
1122 buffer->cur_note++;
1123 col = CPP_BUF_COLUMN (buffer, note->pos + 1);
1125 if (note->type == '\\' || note->type == ' ')
1127 if (note->type == ' ' && !in_comment)
1128 cpp_error_with_line (pfile, CPP_DL_WARNING, pfile->line_table->highest_line, col,
1129 "backslash and newline separated by space");
1131 if (buffer->next_line > buffer->rlimit)
1133 cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line, col,
1134 "backslash-newline at end of file");
1135 /* Prevent "no newline at end of file" warning. */
1136 buffer->next_line = buffer->rlimit;
1139 buffer->line_base = note->pos;
1140 CPP_INCREMENT_LINE (pfile, 0);
1142 else if (_cpp_trigraph_map[note->type])
1144 if (CPP_OPTION (pfile, warn_trigraphs)
1145 && (!in_comment || warn_in_comment (pfile, note)))
1147 if (CPP_OPTION (pfile, trigraphs))
1148 cpp_warning_with_line (pfile, CPP_W_TRIGRAPHS,
1149 pfile->line_table->highest_line, col,
1150 "trigraph ??%c converted to %c",
1151 note->type,
1152 (int) _cpp_trigraph_map[note->type]);
1153 else
1155 cpp_warning_with_line
1156 (pfile, CPP_W_TRIGRAPHS,
1157 pfile->line_table->highest_line, col,
1158 "trigraph ??%c ignored, use -trigraphs to enable",
1159 note->type);
1163 else if (note->type == 0)
1164 /* Already processed in lex_raw_string. */;
1165 else
1166 abort ();
1170 namespace bidi {
1171 enum class kind {
1172 NONE, LRE, RLE, LRO, RLO, LRI, RLI, FSI, PDF, PDI, LTR, RTL
1175 /* All the UTF-8 encodings of bidi characters start with E2. */
1176 constexpr uchar utf8_start = 0xe2;
1178 struct context
1180 context () {}
1181 context (location_t loc, kind k, bool pdf, bool ucn)
1182 : m_loc (loc), m_kind (k), m_pdf (pdf), m_ucn (ucn)
1186 kind get_pop_kind () const
1188 return m_pdf ? kind::PDF : kind::PDI;
1190 bool ucn_p () const
1192 return m_ucn;
1195 location_t m_loc;
1196 kind m_kind;
1197 unsigned m_pdf : 1;
1198 unsigned m_ucn : 1;
1201 /* A vector holding currently open bidi contexts. We use a char for
1202 each context, its LSB is 1 if it represents a PDF context, 0 if it
1203 represents a PDI context. The next bit is 1 if this context was open
1204 by a bidi character written as a UCN, and 0 when it was UTF-8. */
1205 semi_embedded_vec <context, 16> vec;
1207 /* Close the whole comment/identifier/string literal/character constant
1208 context. */
1209 void on_close ()
1211 vec.truncate (0);
1214 /* Pop the last element in the vector. */
1215 void pop ()
1217 unsigned int len = vec.count ();
1218 gcc_checking_assert (len > 0);
1219 vec.truncate (len - 1);
1222 /* Return the pop kind of the context of the Ith element. */
1223 kind pop_kind_at (unsigned int i)
1225 return vec[i].get_pop_kind ();
1228 /* Return the pop kind of the context that is currently opened. */
1229 kind current_ctx ()
1231 unsigned int len = vec.count ();
1232 if (len == 0)
1233 return kind::NONE;
1234 return vec[len - 1].get_pop_kind ();
1237 /* Return true if the current context comes from a UCN origin, that is,
1238 the bidi char which started this bidi context was written as a UCN. */
1239 bool current_ctx_ucn_p ()
1241 unsigned int len = vec.count ();
1242 gcc_checking_assert (len > 0);
1243 return vec[len - 1].m_ucn;
1246 location_t current_ctx_loc ()
1248 unsigned int len = vec.count ();
1249 gcc_checking_assert (len > 0);
1250 return vec[len - 1].m_loc;
1253 /* We've read a bidi char, update the current vector as necessary.
1254 LOC is only valid when K is not kind::NONE. */
1255 void on_char (kind k, bool ucn_p, location_t loc)
1257 switch (k)
1259 case kind::LRE:
1260 case kind::RLE:
1261 case kind::LRO:
1262 case kind::RLO:
1263 vec.push (context (loc, k, true, ucn_p));
1264 break;
1265 case kind::LRI:
1266 case kind::RLI:
1267 case kind::FSI:
1268 vec.push (context (loc, k, false, ucn_p));
1269 break;
1270 /* PDF terminates the scope of the last LRE, RLE, LRO, or RLO
1271 whose scope has not yet been terminated. */
1272 case kind::PDF:
1273 if (current_ctx () == kind::PDF)
1274 pop ();
1275 break;
1276 /* PDI terminates the scope of the last LRI, RLI, or FSI whose
1277 scope has not yet been terminated, as well as the scopes of
1278 any subsequent LREs, RLEs, LROs, or RLOs whose scopes have not
1279 yet been terminated. */
1280 case kind::PDI:
1281 for (int i = vec.count () - 1; i >= 0; --i)
1282 if (pop_kind_at (i) == kind::PDI)
1284 vec.truncate (i);
1285 break;
1287 break;
1288 case kind::LTR:
1289 case kind::RTL:
1290 /* These aren't popped by a PDF/PDI. */
1291 break;
1292 ATTR_LIKELY case kind::NONE:
1293 break;
1294 default:
1295 abort ();
1299 /* Return a descriptive string for K. */
1300 const char *to_str (kind k)
1302 switch (k)
1304 case kind::LRE:
1305 return "U+202A (LEFT-TO-RIGHT EMBEDDING)";
1306 case kind::RLE:
1307 return "U+202B (RIGHT-TO-LEFT EMBEDDING)";
1308 case kind::LRO:
1309 return "U+202D (LEFT-TO-RIGHT OVERRIDE)";
1310 case kind::RLO:
1311 return "U+202E (RIGHT-TO-LEFT OVERRIDE)";
1312 case kind::LRI:
1313 return "U+2066 (LEFT-TO-RIGHT ISOLATE)";
1314 case kind::RLI:
1315 return "U+2067 (RIGHT-TO-LEFT ISOLATE)";
1316 case kind::FSI:
1317 return "U+2068 (FIRST STRONG ISOLATE)";
1318 case kind::PDF:
1319 return "U+202C (POP DIRECTIONAL FORMATTING)";
1320 case kind::PDI:
1321 return "U+2069 (POP DIRECTIONAL ISOLATE)";
1322 case kind::LTR:
1323 return "U+200E (LEFT-TO-RIGHT MARK)";
1324 case kind::RTL:
1325 return "U+200F (RIGHT-TO-LEFT MARK)";
1326 default:
1327 abort ();
1332 /* Get location_t for the range of bytes [START, START + NUM_BYTES)
1333 within the current line in FILE, with the caret at START. */
1335 static location_t
1336 get_location_for_byte_range_in_cur_line (cpp_reader *pfile,
1337 const unsigned char *const start,
1338 size_t num_bytes)
1340 gcc_checking_assert (num_bytes > 0);
1342 /* CPP_BUF_COLUMN and linemap_position_for_column both refer
1343 to offsets in bytes, but CPP_BUF_COLUMN is 0-based,
1344 whereas linemap_position_for_column is 1-based. */
1346 /* Get 0-based offsets within the line. */
1347 size_t start_offset = CPP_BUF_COLUMN (pfile->buffer, start);
1348 size_t end_offset = start_offset + num_bytes - 1;
1350 /* Now convert to location_t, where "columns" are 1-based byte offsets. */
1351 location_t start_loc = linemap_position_for_column (pfile->line_table,
1352 start_offset + 1);
1353 location_t end_loc = linemap_position_for_column (pfile->line_table,
1354 end_offset + 1);
1356 if (start_loc == end_loc)
1357 return start_loc;
1359 source_range src_range;
1360 src_range.m_start = start_loc;
1361 src_range.m_finish = end_loc;
1362 location_t combined_loc = COMBINE_LOCATION_DATA (pfile->line_table,
1363 start_loc,
1364 src_range,
1365 NULL,
1367 return combined_loc;
1370 /* Parse a sequence of 3 bytes starting with P and return its bidi code. */
1372 static bidi::kind
1373 get_bidi_utf8_1 (const unsigned char *const p)
1375 gcc_checking_assert (p[0] == bidi::utf8_start);
1377 if (p[1] == 0x80)
1378 switch (p[2])
1380 case 0xaa:
1381 return bidi::kind::LRE;
1382 case 0xab:
1383 return bidi::kind::RLE;
1384 case 0xac:
1385 return bidi::kind::PDF;
1386 case 0xad:
1387 return bidi::kind::LRO;
1388 case 0xae:
1389 return bidi::kind::RLO;
1390 case 0x8e:
1391 return bidi::kind::LTR;
1392 case 0x8f:
1393 return bidi::kind::RTL;
1394 default:
1395 break;
1397 else if (p[1] == 0x81)
1398 switch (p[2])
1400 case 0xa6:
1401 return bidi::kind::LRI;
1402 case 0xa7:
1403 return bidi::kind::RLI;
1404 case 0xa8:
1405 return bidi::kind::FSI;
1406 case 0xa9:
1407 return bidi::kind::PDI;
1408 default:
1409 break;
1412 return bidi::kind::NONE;
1415 /* Parse a sequence of 3 bytes starting with P and return its bidi code.
1416 If the kind is not NONE, write the location to *OUT.*/
1418 static bidi::kind
1419 get_bidi_utf8 (cpp_reader *pfile, const unsigned char *const p, location_t *out)
1421 bidi::kind result = get_bidi_utf8_1 (p);
1422 if (result != bidi::kind::NONE)
1424 /* We have a sequence of 3 bytes starting at P. */
1425 *out = get_location_for_byte_range_in_cur_line (pfile, p, 3);
1427 return result;
1430 /* Parse a UCN where P points just past \u or \U and return its bidi code. */
1432 static bidi::kind
1433 get_bidi_ucn_1 (const unsigned char *p, bool is_U, const unsigned char **end)
1435 /* 6.4.3 Universal Character Names
1436 \u hex-quad
1437 \U hex-quad hex-quad
1438 \u { simple-hexadecimal-digit-sequence }
1439 where \unnnn means \U0000nnnn. */
1441 *end = p + 4;
1442 if (is_U)
1444 if (p[0] != '0' || p[1] != '0' || p[2] != '0' || p[3] != '0')
1445 return bidi::kind::NONE;
1446 /* Skip 4B so we can treat \u and \U the same below. */
1447 p += 4;
1448 *end += 4;
1450 else if (p[0] == '{')
1452 p++;
1453 while (*p == '0')
1454 p++;
1455 if (p[0] != '2'
1456 || p[1] != '0'
1457 || !ISXDIGIT (p[2])
1458 || !ISXDIGIT (p[3])
1459 || p[4] != '}')
1460 return bidi::kind::NONE;
1461 *end = p + 5;
1464 /* All code points we are looking for start with 20xx. */
1465 if (p[0] != '2' || p[1] != '0')
1466 return bidi::kind::NONE;
1467 else if (p[2] == '2')
1468 switch (p[3])
1470 case 'a':
1471 case 'A':
1472 return bidi::kind::LRE;
1473 case 'b':
1474 case 'B':
1475 return bidi::kind::RLE;
1476 case 'c':
1477 case 'C':
1478 return bidi::kind::PDF;
1479 case 'd':
1480 case 'D':
1481 return bidi::kind::LRO;
1482 case 'e':
1483 case 'E':
1484 return bidi::kind::RLO;
1485 default:
1486 break;
1488 else if (p[2] == '6')
1489 switch (p[3])
1491 case '6':
1492 return bidi::kind::LRI;
1493 case '7':
1494 return bidi::kind::RLI;
1495 case '8':
1496 return bidi::kind::FSI;
1497 case '9':
1498 return bidi::kind::PDI;
1499 default:
1500 break;
1502 else if (p[2] == '0')
1503 switch (p[3])
1505 case 'e':
1506 case 'E':
1507 return bidi::kind::LTR;
1508 case 'f':
1509 case 'F':
1510 return bidi::kind::RTL;
1511 default:
1512 break;
1515 return bidi::kind::NONE;
1518 /* Parse a UCN where P points just past \u or \U and return its bidi code.
1519 If the kind is not NONE, write the location to *OUT. */
1521 static bidi::kind
1522 get_bidi_ucn (cpp_reader *pfile, const unsigned char *p, bool is_U,
1523 location_t *out)
1525 const unsigned char *end;
1526 bidi::kind result = get_bidi_ucn_1 (p, is_U, &end);
1527 if (result != bidi::kind::NONE)
1529 const unsigned char *start = p - 2;
1530 size_t num_bytes = end - start;
1531 *out = get_location_for_byte_range_in_cur_line (pfile, start, num_bytes);
1533 return result;
1536 /* Parse a named universal character escape where P points just past \N and
1537 return its bidi code. If the kind is not NONE, write the location to
1538 *OUT. */
1540 static bidi::kind
1541 get_bidi_named (cpp_reader *pfile, const unsigned char *p, location_t *out)
1543 bidi::kind result = bidi::kind::NONE;
1544 if (*p != '{')
1545 return bidi::kind::NONE;
1546 if (strncmp ((const char *) (p + 1), "LEFT-TO-RIGHT ", 14) == 0)
1548 if (strncmp ((const char *) (p + 15), "MARK}", 5) == 0)
1549 result = bidi::kind::LTR;
1550 else if (strncmp ((const char *) (p + 15), "EMBEDDING}", 10) == 0)
1551 result = bidi::kind::LRE;
1552 else if (strncmp ((const char *) (p + 15), "OVERRIDE}", 9) == 0)
1553 result = bidi::kind::LRO;
1554 else if (strncmp ((const char *) (p + 15), "ISOLATE}", 8) == 0)
1555 result = bidi::kind::LRI;
1557 else if (strncmp ((const char *) (p + 1), "RIGHT-TO-LEFT ", 14) == 0)
1559 if (strncmp ((const char *) (p + 15), "MARK}", 5) == 0)
1560 result = bidi::kind::RTL;
1561 else if (strncmp ((const char *) (p + 15), "EMBEDDING}", 10) == 0)
1562 result = bidi::kind::RLE;
1563 else if (strncmp ((const char *) (p + 15), "OVERRIDE}", 9) == 0)
1564 result = bidi::kind::RLO;
1565 else if (strncmp ((const char *) (p + 15), "ISOLATE}", 8) == 0)
1566 result = bidi::kind::RLI;
1568 else if (strncmp ((const char *) (p + 1), "POP DIRECTIONAL ", 16) == 0)
1570 if (strncmp ((const char *) (p + 16), "FORMATTING}", 11) == 0)
1571 result = bidi::kind::PDF;
1572 else if (strncmp ((const char *) (p + 16), "ISOLATE}", 8) == 0)
1573 result = bidi::kind::PDI;
1575 else if (strncmp ((const char *) (p + 1), "FIRST STRONG ISOLATE}", 21) == 0)
1576 result = bidi::kind::FSI;
1577 if (result != bidi::kind::NONE)
1578 *out = get_location_for_byte_range_in_cur_line (pfile, p - 2,
1579 (strchr ((const char *)
1580 (p + 1), '}')
1581 - (const char *) p)
1582 + 3);
1583 return result;
1586 /* Subclass of rich_location for reporting on unpaired UTF-8
1587 bidirectional control character(s).
1588 Escape the source lines on output, and show all unclosed
1589 bidi context, labelling everything. */
1591 class unpaired_bidi_rich_location : public rich_location
1593 public:
1594 class custom_range_label : public range_label
1596 public:
1597 label_text get_text (unsigned range_idx) const final override
1599 /* range 0 is the primary location; each subsequent range i + 1
1600 is for bidi::vec[i]. */
1601 if (range_idx > 0)
1603 const bidi::context &ctxt (bidi::vec[range_idx - 1]);
1604 return label_text::borrow (bidi::to_str (ctxt.m_kind));
1606 else
1607 return label_text::borrow (_("end of bidirectional context"));
1611 unpaired_bidi_rich_location (cpp_reader *pfile, location_t loc)
1612 : rich_location (pfile->line_table, loc, &m_custom_label)
1614 set_escape_on_output (true);
1615 for (unsigned i = 0; i < bidi::vec.count (); i++)
1616 add_range (bidi::vec[i].m_loc,
1617 SHOW_RANGE_WITHOUT_CARET,
1618 &m_custom_label);
1621 private:
1622 custom_range_label m_custom_label;
1625 /* We're closing a bidi context, that is, we've encountered a newline,
1626 are closing a C-style comment, or are at the end of a string literal,
1627 character constant, or identifier. Warn if this context was not
1628 properly terminated by a PDI or PDF. P points to the last character
1629 in this context. */
1631 static void
1632 maybe_warn_bidi_on_close (cpp_reader *pfile, const uchar *p)
1634 const auto warn_bidi = CPP_OPTION (pfile, cpp_warn_bidirectional);
1635 if (bidi::vec.count () > 0
1636 && (warn_bidi & bidirectional_unpaired
1637 && (!bidi::current_ctx_ucn_p ()
1638 || (warn_bidi & bidirectional_ucn))))
1640 const location_t loc
1641 = linemap_position_for_column (pfile->line_table,
1642 CPP_BUF_COLUMN (pfile->buffer, p));
1643 unpaired_bidi_rich_location rich_loc (pfile, loc);
1644 /* cpp_callbacks doesn't yet have a way to handle singular vs plural
1645 forms of a diagnostic, so fake it for now. */
1646 if (bidi::vec.count () > 1)
1647 cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
1648 "unpaired UTF-8 bidirectional control characters "
1649 "detected");
1650 else
1651 cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
1652 "unpaired UTF-8 bidirectional control character "
1653 "detected");
1655 /* We're done with this context. */
1656 bidi::on_close ();
1659 /* We're at the beginning or in the middle of an identifier/comment/string
1660 literal/character constant. Warn if we've encountered a bidi character.
1661 KIND says which bidi control character it was; UCN_P is true iff this bidi
1662 control character was written as a UCN. LOC is the location of the
1663 character, but is only valid if KIND != bidi::kind::NONE. */
1665 static void
1666 maybe_warn_bidi_on_char (cpp_reader *pfile, bidi::kind kind,
1667 bool ucn_p, location_t loc)
1669 if (__builtin_expect (kind == bidi::kind::NONE, 1))
1670 return;
1672 const auto warn_bidi = CPP_OPTION (pfile, cpp_warn_bidirectional);
1674 if (warn_bidi & (bidirectional_unpaired|bidirectional_any))
1676 rich_location rich_loc (pfile->line_table, loc);
1677 rich_loc.set_escape_on_output (true);
1679 /* It seems excessive to warn about a PDI/PDF that is closing
1680 an opened context because we've already warned about the
1681 opening character. Except warn when we have a UCN x UTF-8
1682 mismatch, if UCN checking is enabled. */
1683 if (kind == bidi::current_ctx ())
1685 if (warn_bidi == (bidirectional_unpaired|bidirectional_ucn)
1686 && bidi::current_ctx_ucn_p () != ucn_p)
1688 rich_loc.add_range (bidi::current_ctx_loc ());
1689 cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
1690 "UTF-8 vs UCN mismatch when closing "
1691 "a context by \"%s\"", bidi::to_str (kind));
1694 else if (warn_bidi & bidirectional_any
1695 && (!ucn_p || (warn_bidi & bidirectional_ucn)))
1697 if (kind == bidi::kind::PDF || kind == bidi::kind::PDI)
1698 cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
1699 "\"%s\" is closing an unopened context",
1700 bidi::to_str (kind));
1701 else
1702 cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
1703 "found problematic Unicode character \"%s\"",
1704 bidi::to_str (kind));
1707 /* We're done with this context. */
1708 bidi::on_char (kind, ucn_p, loc);
1711 static const cppchar_t utf8_continuation = 0x80;
1712 static const cppchar_t utf8_signifier = 0xC0;
1714 /* Emit -Winvalid-utf8 warning on invalid UTF-8 character starting
1715 at PFILE->buffer->cur. Return a pointer after the diagnosed
1716 invalid character. */
1718 static const uchar *
1719 _cpp_warn_invalid_utf8 (cpp_reader *pfile)
1721 cpp_buffer *buffer = pfile->buffer;
1722 const uchar *cur = buffer->cur;
1723 bool pedantic = (CPP_PEDANTIC (pfile)
1724 && CPP_OPTION (pfile, cpp_warn_invalid_utf8) == 2);
1726 if (cur[0] < utf8_signifier
1727 || cur[1] < utf8_continuation || cur[1] >= utf8_signifier)
1729 if (pedantic)
1730 cpp_error_with_line (pfile, CPP_DL_PEDWARN,
1731 pfile->line_table->highest_line,
1732 CPP_BUF_COL (buffer),
1733 "invalid UTF-8 character <%x>",
1734 cur[0]);
1735 else
1736 cpp_warning_with_line (pfile, CPP_W_INVALID_UTF8,
1737 pfile->line_table->highest_line,
1738 CPP_BUF_COL (buffer),
1739 "invalid UTF-8 character <%x>",
1740 cur[0]);
1741 return cur + 1;
1743 else if (cur[2] < utf8_continuation || cur[2] >= utf8_signifier)
1745 if (pedantic)
1746 cpp_error_with_line (pfile, CPP_DL_PEDWARN,
1747 pfile->line_table->highest_line,
1748 CPP_BUF_COL (buffer),
1749 "invalid UTF-8 character <%x><%x>",
1750 cur[0], cur[1]);
1751 else
1752 cpp_warning_with_line (pfile, CPP_W_INVALID_UTF8,
1753 pfile->line_table->highest_line,
1754 CPP_BUF_COL (buffer),
1755 "invalid UTF-8 character <%x><%x>",
1756 cur[0], cur[1]);
1757 return cur + 2;
1759 else if (cur[3] < utf8_continuation || cur[3] >= utf8_signifier)
1761 if (pedantic)
1762 cpp_error_with_line (pfile, CPP_DL_PEDWARN,
1763 pfile->line_table->highest_line,
1764 CPP_BUF_COL (buffer),
1765 "invalid UTF-8 character <%x><%x><%x>",
1766 cur[0], cur[1], cur[2]);
1767 else
1768 cpp_warning_with_line (pfile, CPP_W_INVALID_UTF8,
1769 pfile->line_table->highest_line,
1770 CPP_BUF_COL (buffer),
1771 "invalid UTF-8 character <%x><%x><%x>",
1772 cur[0], cur[1], cur[2]);
1773 return cur + 3;
1775 else
1777 if (pedantic)
1778 cpp_error_with_line (pfile, CPP_DL_PEDWARN,
1779 pfile->line_table->highest_line,
1780 CPP_BUF_COL (buffer),
1781 "invalid UTF-8 character <%x><%x><%x><%x>",
1782 cur[0], cur[1], cur[2], cur[3]);
1783 else
1784 cpp_warning_with_line (pfile, CPP_W_INVALID_UTF8,
1785 pfile->line_table->highest_line,
1786 CPP_BUF_COL (buffer),
1787 "invalid UTF-8 character <%x><%x><%x><%x>",
1788 cur[0], cur[1], cur[2], cur[3]);
1789 return cur + 4;
1793 /* Helper function of *skip_*_comment and lex*_string. For C,
1794 character at CUR[-1] with MSB set handle -Wbidi-chars* and
1795 -Winvalid-utf8 diagnostics and return pointer to first character
1796 that should be processed next. */
1798 static inline const uchar *
1799 _cpp_handle_multibyte_utf8 (cpp_reader *pfile, uchar c,
1800 const uchar *cur, bool warn_bidi_p,
1801 bool warn_invalid_utf8_p)
1803 /* If this is a beginning of a UTF-8 encoding, it might be
1804 a bidirectional control character. */
1805 if (c == bidi::utf8_start && warn_bidi_p)
1807 location_t loc;
1808 bidi::kind kind = get_bidi_utf8 (pfile, cur - 1, &loc);
1809 maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
1811 if (!warn_invalid_utf8_p)
1812 return cur;
1813 if (c >= utf8_signifier)
1815 cppchar_t s;
1816 const uchar *pstr = cur - 1;
1817 if (_cpp_valid_utf8 (pfile, &pstr, pfile->buffer->rlimit, 0, NULL, &s)
1818 && s <= UCS_LIMIT)
1819 return pstr;
1821 pfile->buffer->cur = cur - 1;
1822 return _cpp_warn_invalid_utf8 (pfile);
1825 /* Skip a C-style block comment. We find the end of the comment by
1826 seeing if an asterisk is before every '/' we encounter. Returns
1827 nonzero if comment terminated by EOF, zero otherwise.
1829 Buffer->cur points to the initial asterisk of the comment. */
1830 bool
1831 _cpp_skip_block_comment (cpp_reader *pfile)
1833 cpp_buffer *buffer = pfile->buffer;
1834 const uchar *cur = buffer->cur;
1835 uchar c;
1836 const bool warn_bidi_p = pfile->warn_bidi_p ();
1837 const bool warn_invalid_utf8_p = CPP_OPTION (pfile, cpp_warn_invalid_utf8);
1838 const bool warn_bidi_or_invalid_utf8_p = warn_bidi_p | warn_invalid_utf8_p;
1840 cur++;
1841 if (*cur == '/')
1842 cur++;
1844 for (;;)
1846 /* People like decorating comments with '*', so check for '/'
1847 instead for efficiency. */
1848 c = *cur++;
1850 if (c == '/')
1852 if (cur[-2] == '*')
1854 if (warn_bidi_p)
1855 maybe_warn_bidi_on_close (pfile, cur);
1856 break;
1859 /* Warn about potential nested comments, but not if the '/'
1860 comes immediately before the true comment delimiter.
1861 Don't bother to get it right across escaped newlines. */
1862 if (CPP_OPTION (pfile, warn_comments)
1863 && cur[0] == '*' && cur[1] != '/')
1865 buffer->cur = cur;
1866 cpp_warning_with_line (pfile, CPP_W_COMMENTS,
1867 pfile->line_table->highest_line,
1868 CPP_BUF_COL (buffer),
1869 "\"/*\" within comment");
1872 else if (c == '\n')
1874 unsigned int cols;
1875 buffer->cur = cur - 1;
1876 if (warn_bidi_p)
1877 maybe_warn_bidi_on_close (pfile, cur);
1878 _cpp_process_line_notes (pfile, true);
1879 if (buffer->next_line >= buffer->rlimit)
1880 return true;
1881 _cpp_clean_line (pfile);
1883 cols = buffer->next_line - buffer->line_base;
1884 CPP_INCREMENT_LINE (pfile, cols);
1886 cur = buffer->cur;
1888 else if (__builtin_expect (c >= utf8_continuation, 0)
1889 && warn_bidi_or_invalid_utf8_p)
1890 cur = _cpp_handle_multibyte_utf8 (pfile, c, cur, warn_bidi_p,
1891 warn_invalid_utf8_p);
1894 buffer->cur = cur;
1895 _cpp_process_line_notes (pfile, true);
1896 return false;
1899 /* Skip a C++ line comment, leaving buffer->cur pointing to the
1900 terminating newline. Handles escaped newlines. Returns nonzero
1901 if a multiline comment. */
1902 static int
1903 skip_line_comment (cpp_reader *pfile)
1905 cpp_buffer *buffer = pfile->buffer;
1906 location_t orig_line = pfile->line_table->highest_line;
1907 const bool warn_bidi_p = pfile->warn_bidi_p ();
1908 const bool warn_invalid_utf8_p = CPP_OPTION (pfile, cpp_warn_invalid_utf8);
1909 const bool warn_bidi_or_invalid_utf8_p = warn_bidi_p | warn_invalid_utf8_p;
1911 if (!warn_bidi_or_invalid_utf8_p)
1912 while (*buffer->cur != '\n')
1913 buffer->cur++;
1914 else if (!warn_invalid_utf8_p)
1916 while (*buffer->cur != '\n'
1917 && *buffer->cur != bidi::utf8_start)
1918 buffer->cur++;
1919 if (__builtin_expect (*buffer->cur == bidi::utf8_start, 0))
1921 while (*buffer->cur != '\n')
1923 if (__builtin_expect (*buffer->cur == bidi::utf8_start, 0))
1925 location_t loc;
1926 bidi::kind kind = get_bidi_utf8 (pfile, buffer->cur, &loc);
1927 maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
1929 buffer->cur++;
1931 maybe_warn_bidi_on_close (pfile, buffer->cur);
1934 else
1936 while (*buffer->cur != '\n')
1938 if (*buffer->cur < utf8_continuation)
1940 buffer->cur++;
1941 continue;
1943 buffer->cur
1944 = _cpp_handle_multibyte_utf8 (pfile, *buffer->cur, buffer->cur + 1,
1945 warn_bidi_p, warn_invalid_utf8_p);
1947 if (warn_bidi_p)
1948 maybe_warn_bidi_on_close (pfile, buffer->cur);
1951 _cpp_process_line_notes (pfile, true);
1952 return orig_line != pfile->line_table->highest_line;
1955 /* Skips whitespace, saving the next non-whitespace character. */
1956 static void
1957 skip_whitespace (cpp_reader *pfile, cppchar_t c)
1959 cpp_buffer *buffer = pfile->buffer;
1960 bool saw_NUL = false;
1964 /* Horizontal space always OK. */
1965 if (c == ' ' || c == '\t')
1967 /* Just \f \v or \0 left. */
1968 else if (c == '\0')
1969 saw_NUL = true;
1970 else if (pfile->state.in_directive && CPP_PEDANTIC (pfile))
1971 cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line,
1972 CPP_BUF_COL (buffer),
1973 "%s in preprocessing directive",
1974 c == '\f' ? "form feed" : "vertical tab");
1976 c = *buffer->cur++;
1978 /* We only want non-vertical space, i.e. ' ' \t \f \v \0. */
1979 while (is_nvspace (c));
1981 if (saw_NUL)
1983 encoding_rich_location rich_loc (pfile);
1984 cpp_error_at (pfile, CPP_DL_WARNING, &rich_loc,
1985 "null character(s) ignored");
1988 buffer->cur--;
1991 /* See if the characters of a number token are valid in a name (no
1992 '.', '+' or '-'). */
1993 static int
1994 name_p (cpp_reader *pfile, const cpp_string *string)
1996 unsigned int i;
1998 for (i = 0; i < string->len; i++)
1999 if (!is_idchar (string->text[i]))
2000 return 0;
2002 return 1;
2005 /* After parsing an identifier or other sequence, produce a warning about
2006 sequences not in NFC/NFKC. */
2007 static void
2008 warn_about_normalization (cpp_reader *pfile,
2009 const cpp_token *token,
2010 const struct normalize_state *s)
2012 if (CPP_OPTION (pfile, warn_normalize) < NORMALIZE_STATE_RESULT (s)
2013 && !pfile->state.skipping)
2015 location_t loc = token->src_loc;
2017 /* If possible, create a location range for the token. */
2018 if (loc >= RESERVED_LOCATION_COUNT
2019 && token->type != CPP_EOF
2020 /* There must be no line notes to process. */
2021 && (!(pfile->buffer->cur
2022 >= pfile->buffer->notes[pfile->buffer->cur_note].pos
2023 && !pfile->overlaid_buffer)))
2025 source_range tok_range;
2026 tok_range.m_start = loc;
2027 tok_range.m_finish
2028 = linemap_position_for_column (pfile->line_table,
2029 CPP_BUF_COLUMN (pfile->buffer,
2030 pfile->buffer->cur));
2031 loc = COMBINE_LOCATION_DATA (pfile->line_table,
2032 loc, tok_range, NULL, 0);
2035 encoding_rich_location rich_loc (pfile, loc);
2037 /* Make sure that the token is printed using UCNs, even
2038 if we'd otherwise happily print UTF-8. */
2039 unsigned char *buf = XNEWVEC (unsigned char, cpp_token_len (token));
2040 size_t sz;
2042 sz = cpp_spell_token (pfile, token, buf, false) - buf;
2043 if (NORMALIZE_STATE_RESULT (s) == normalized_C)
2044 cpp_warning_at (pfile, CPP_W_NORMALIZE, &rich_loc,
2045 "`%.*s' is not in NFKC", (int) sz, buf);
2046 else if (CPP_OPTION (pfile, cplusplus))
2047 cpp_pedwarning_at (pfile, CPP_W_NORMALIZE, &rich_loc,
2048 "`%.*s' is not in NFC", (int) sz, buf);
2049 else
2050 cpp_warning_at (pfile, CPP_W_NORMALIZE, &rich_loc,
2051 "`%.*s' is not in NFC", (int) sz, buf);
2052 free (buf);
2056 /* Returns TRUE if the sequence starting at buffer->cur is valid in
2057 an identifier. FIRST is TRUE if this starts an identifier. */
2059 static bool
2060 forms_identifier_p (cpp_reader *pfile, int first,
2061 struct normalize_state *state)
2063 cpp_buffer *buffer = pfile->buffer;
2064 const bool warn_bidi_p = pfile->warn_bidi_p ();
2066 if (*buffer->cur == '$')
2068 if (!CPP_OPTION (pfile, dollars_in_ident))
2069 return false;
2071 buffer->cur++;
2072 if (CPP_OPTION (pfile, warn_dollars) && !pfile->state.skipping)
2074 CPP_OPTION (pfile, warn_dollars) = 0;
2075 cpp_error (pfile, CPP_DL_PEDWARN, "'$' in identifier or number");
2078 return true;
2081 /* Is this a syntactically valid UCN or a valid UTF-8 char? */
2082 if (CPP_OPTION (pfile, extended_identifiers))
2084 cppchar_t s;
2085 if (*buffer->cur >= utf8_signifier)
2087 if (__builtin_expect (*buffer->cur == bidi::utf8_start, 0)
2088 && warn_bidi_p)
2090 location_t loc;
2091 bidi::kind kind = get_bidi_utf8 (pfile, buffer->cur, &loc);
2092 maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
2094 if (_cpp_valid_utf8 (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
2095 state, &s))
2096 return true;
2098 else if (*buffer->cur == '\\'
2099 && (buffer->cur[1] == 'u'
2100 || buffer->cur[1] == 'U'
2101 || buffer->cur[1] == 'N'))
2103 buffer->cur += 2;
2104 if (warn_bidi_p)
2106 location_t loc;
2107 bidi::kind kind;
2108 if (buffer->cur[-1] == 'N')
2109 kind = get_bidi_named (pfile, buffer->cur, &loc);
2110 else
2111 kind = get_bidi_ucn (pfile, buffer->cur,
2112 buffer->cur[-1] == 'U', &loc);
2113 maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/true, loc);
2115 if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
2116 state, &s, NULL, NULL))
2117 return true;
2118 buffer->cur -= 2;
2122 return false;
2125 /* Helper function to issue error about improper __VA_OPT__ use. */
2126 static void
2127 maybe_va_opt_error (cpp_reader *pfile)
2129 if (CPP_PEDANTIC (pfile) && !CPP_OPTION (pfile, va_opt))
2131 /* __VA_OPT__ should not be accepted at all, but allow it in
2132 system headers. */
2133 if (!_cpp_in_system_header (pfile))
2134 cpp_error (pfile, CPP_DL_PEDWARN,
2135 "__VA_OPT__ is not available until C++20");
2137 else if (!pfile->state.va_args_ok)
2139 /* __VA_OPT__ should only appear in the replacement list of a
2140 variadic macro. */
2141 cpp_error (pfile, CPP_DL_PEDWARN,
2142 "__VA_OPT__ can only appear in the expansion"
2143 " of a C++20 variadic macro");
2147 /* Helper function to get the cpp_hashnode of the identifier BASE. */
2148 static cpp_hashnode *
2149 lex_identifier_intern (cpp_reader *pfile, const uchar *base)
2151 cpp_hashnode *result;
2152 const uchar *cur;
2153 unsigned int len;
2154 unsigned int hash = HT_HASHSTEP (0, *base);
2156 cur = base + 1;
2157 while (ISIDNUM (*cur))
2159 hash = HT_HASHSTEP (hash, *cur);
2160 cur++;
2162 len = cur - base;
2163 hash = HT_HASHFINISH (hash, len);
2164 result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
2165 base, len, hash, HT_ALLOC));
2167 /* Rarely, identifiers require diagnostics when lexed. */
2168 if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
2169 && !pfile->state.skipping, 0))
2171 /* It is allowed to poison the same identifier twice. */
2172 if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
2173 cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
2174 NODE_NAME (result));
2176 /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
2177 replacement list of a variadic macro. */
2178 if (result == pfile->spec_nodes.n__VA_ARGS__
2179 && !pfile->state.va_args_ok)
2181 if (CPP_OPTION (pfile, cplusplus))
2182 cpp_error (pfile, CPP_DL_PEDWARN,
2183 "__VA_ARGS__ can only appear in the expansion"
2184 " of a C++11 variadic macro");
2185 else
2186 cpp_error (pfile, CPP_DL_PEDWARN,
2187 "__VA_ARGS__ can only appear in the expansion"
2188 " of a C99 variadic macro");
2191 if (result == pfile->spec_nodes.n__VA_OPT__)
2192 maybe_va_opt_error (pfile);
2194 /* For -Wc++-compat, warn about use of C++ named operators. */
2195 if (result->flags & NODE_WARN_OPERATOR)
2196 cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
2197 "identifier \"%s\" is a special operator name in C++",
2198 NODE_NAME (result));
2201 return result;
2204 /* Get the cpp_hashnode of an identifier specified by NAME in
2205 the current cpp_reader object. If none is found, NULL is returned. */
2206 cpp_hashnode *
2207 _cpp_lex_identifier (cpp_reader *pfile, const char *name)
2209 cpp_hashnode *result;
2210 result = lex_identifier_intern (pfile, (uchar *) name);
2211 return result;
2214 /* Lex an identifier starting at BUFFER->CUR - 1. */
2215 static cpp_hashnode *
2216 lex_identifier (cpp_reader *pfile, const uchar *base, bool starts_ucn,
2217 struct normalize_state *nst, cpp_hashnode **spelling)
2219 cpp_hashnode *result;
2220 const uchar *cur;
2221 unsigned int len;
2222 unsigned int hash = HT_HASHSTEP (0, *base);
2223 const bool warn_bidi_p = pfile->warn_bidi_p ();
2225 cur = pfile->buffer->cur;
2226 if (! starts_ucn)
2228 while (ISIDNUM (*cur))
2230 hash = HT_HASHSTEP (hash, *cur);
2231 cur++;
2233 NORMALIZE_STATE_UPDATE_IDNUM (nst, *(cur - 1));
2235 pfile->buffer->cur = cur;
2236 if (starts_ucn || forms_identifier_p (pfile, false, nst))
2238 /* Slower version for identifiers containing UCNs
2239 or extended chars (including $). */
2240 do {
2241 while (ISIDNUM (*pfile->buffer->cur))
2243 NORMALIZE_STATE_UPDATE_IDNUM (nst, *pfile->buffer->cur);
2244 pfile->buffer->cur++;
2246 } while (forms_identifier_p (pfile, false, nst));
2247 if (warn_bidi_p)
2248 maybe_warn_bidi_on_close (pfile, pfile->buffer->cur);
2249 result = _cpp_interpret_identifier (pfile, base,
2250 pfile->buffer->cur - base);
2251 *spelling = cpp_lookup (pfile, base, pfile->buffer->cur - base);
2253 else
2255 len = cur - base;
2256 hash = HT_HASHFINISH (hash, len);
2258 result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
2259 base, len, hash, HT_ALLOC));
2260 *spelling = result;
2263 /* Rarely, identifiers require diagnostics when lexed. */
2264 if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
2265 && !pfile->state.skipping, 0))
2267 /* It is allowed to poison the same identifier twice. */
2268 if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
2269 cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
2270 NODE_NAME (result));
2272 /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
2273 replacement list of a variadic macro. */
2274 if (result == pfile->spec_nodes.n__VA_ARGS__
2275 && !pfile->state.va_args_ok)
2277 if (CPP_OPTION (pfile, cplusplus))
2278 cpp_error (pfile, CPP_DL_PEDWARN,
2279 "__VA_ARGS__ can only appear in the expansion"
2280 " of a C++11 variadic macro");
2281 else
2282 cpp_error (pfile, CPP_DL_PEDWARN,
2283 "__VA_ARGS__ can only appear in the expansion"
2284 " of a C99 variadic macro");
2287 /* __VA_OPT__ should only appear in the replacement list of a
2288 variadic macro. */
2289 if (result == pfile->spec_nodes.n__VA_OPT__)
2290 maybe_va_opt_error (pfile);
2292 /* For -Wc++-compat, warn about use of C++ named operators. */
2293 if (result->flags & NODE_WARN_OPERATOR)
2294 cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
2295 "identifier \"%s\" is a special operator name in C++",
2296 NODE_NAME (result));
2299 return result;
2302 /* Lex a number to NUMBER starting at BUFFER->CUR - 1. */
2303 static void
2304 lex_number (cpp_reader *pfile, cpp_string *number,
2305 struct normalize_state *nst)
2307 const uchar *cur;
2308 const uchar *base;
2309 uchar *dest;
2311 base = pfile->buffer->cur - 1;
2314 const uchar *adj_digit_sep = NULL;
2315 cur = pfile->buffer->cur;
2317 /* N.B. ISIDNUM does not include $. */
2318 while (ISIDNUM (*cur)
2319 || (*cur == '.' && !DIGIT_SEP (cur[-1]))
2320 || DIGIT_SEP (*cur)
2321 || (VALID_SIGN (*cur, cur[-1]) && !DIGIT_SEP (cur[-2])))
2323 NORMALIZE_STATE_UPDATE_IDNUM (nst, *cur);
2324 /* Adjacent digit separators do not form part of the pp-number syntax.
2325 However, they can safely be diagnosed here as an error, since '' is
2326 not a valid preprocessing token. */
2327 if (DIGIT_SEP (*cur) && DIGIT_SEP (cur[-1]) && !adj_digit_sep)
2328 adj_digit_sep = cur;
2329 cur++;
2331 /* A number can't end with a digit separator. */
2332 while (cur > pfile->buffer->cur && DIGIT_SEP (cur[-1]))
2333 --cur;
2334 if (adj_digit_sep && adj_digit_sep < cur)
2335 cpp_error (pfile, CPP_DL_ERROR, "adjacent digit separators");
2337 pfile->buffer->cur = cur;
2339 while (forms_identifier_p (pfile, false, nst));
2341 number->len = cur - base;
2342 dest = _cpp_unaligned_alloc (pfile, number->len + 1);
2343 memcpy (dest, base, number->len);
2344 dest[number->len] = '\0';
2345 number->text = dest;
2348 /* Create a token of type TYPE with a literal spelling. */
2349 static void
2350 create_literal (cpp_reader *pfile, cpp_token *token, const uchar *base,
2351 unsigned int len, enum cpp_ttype type)
2353 token->type = type;
2354 token->val.str.len = len;
2355 token->val.str.text = cpp_alloc_token_string (pfile, base, len);
2358 const uchar *
2359 cpp_alloc_token_string (cpp_reader *pfile,
2360 const unsigned char *ptr, unsigned len)
2362 uchar *dest = _cpp_unaligned_alloc (pfile, len + 1);
2364 dest[len] = 0;
2365 memcpy (dest, ptr, len);
2366 return dest;
2369 /* A pair of raw buffer pointers. The currently open one is [1], the
2370 first one is [0]. Used for string literal lexing. */
2371 struct lit_accum {
2372 _cpp_buff *first;
2373 _cpp_buff *last;
2374 const uchar *rpos;
2375 size_t accum;
2377 lit_accum ()
2378 : first (NULL), last (NULL), rpos (0), accum (0)
2382 void append (cpp_reader *, const uchar *, size_t);
2384 void read_begin (cpp_reader *);
2385 bool reading_p () const
2387 return rpos != NULL;
2389 char read_char ()
2391 char c = *rpos++;
2392 if (rpos == BUFF_FRONT (last))
2393 rpos = NULL;
2394 return c;
2398 /* Subroutine of lex_raw_string: Append LEN chars from BASE to the buffer
2399 sequence from *FIRST_BUFF_P to LAST_BUFF_P. */
2401 void
2402 lit_accum::append (cpp_reader *pfile, const uchar *base, size_t len)
2404 if (!last)
2405 /* Starting. */
2406 first = last = _cpp_get_buff (pfile, len);
2407 else if (len > BUFF_ROOM (last))
2409 /* There is insufficient room in the buffer. Copy what we can,
2410 and then either extend or create a new one. */
2411 size_t room = BUFF_ROOM (last);
2412 memcpy (BUFF_FRONT (last), base, room);
2413 BUFF_FRONT (last) += room;
2414 base += room;
2415 len -= room;
2416 accum += room;
2418 gcc_checking_assert (!rpos);
2420 last = _cpp_append_extend_buff (pfile, last, len);
2423 memcpy (BUFF_FRONT (last), base, len);
2424 BUFF_FRONT (last) += len;
2425 accum += len;
2428 void
2429 lit_accum::read_begin (cpp_reader *pfile)
2431 /* We never accumulate more than 4 chars to read. */
2432 if (BUFF_ROOM (last) < 4)
2434 last = _cpp_append_extend_buff (pfile, last, 4);
2435 rpos = BUFF_FRONT (last);
2438 /* Returns true if a macro has been defined.
2439 This might not work if compile with -save-temps,
2440 or preprocess separately from compilation. */
2442 static bool
2443 is_macro(cpp_reader *pfile, const uchar *base)
2445 const uchar *cur = base;
2446 if (! ISIDST (*cur))
2447 return false;
2448 unsigned int hash = HT_HASHSTEP (0, *cur);
2449 ++cur;
2450 while (ISIDNUM (*cur))
2452 hash = HT_HASHSTEP (hash, *cur);
2453 ++cur;
2455 hash = HT_HASHFINISH (hash, cur - base);
2457 cpp_hashnode *result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
2458 base, cur - base, hash, HT_NO_INSERT));
2460 return result && cpp_macro_p (result);
2463 /* Returns true if a literal suffix does not have the expected form
2464 and is defined as a macro. */
2466 static bool
2467 is_macro_not_literal_suffix(cpp_reader *pfile, const uchar *base)
2469 /* User-defined literals outside of namespace std must start with a single
2470 underscore, so assume anything of that form really is a UDL suffix.
2471 We don't need to worry about UDLs defined inside namespace std because
2472 their names are reserved, so cannot be used as macro names in valid
2473 programs. */
2474 if (base[0] == '_' && base[1] != '_')
2475 return false;
2476 return is_macro (pfile, base);
2479 /* Lexes a raw string. The stored string contains the spelling,
2480 including double quotes, delimiter string, '(' and ')', any leading
2481 'L', 'u', 'U' or 'u8' and 'R' modifier. The created token contains
2482 the type of the literal, or CPP_OTHER if it was not properly
2483 terminated.
2485 BASE is the start of the token. Updates pfile->buffer->cur to just
2486 after the lexed string.
2488 The spelling is NUL-terminated, but it is not guaranteed that this
2489 is the first NUL since embedded NULs are preserved. */
2491 static void
2492 lex_raw_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
2494 const uchar *pos = base;
2495 const bool warn_bidi_p = pfile->warn_bidi_p ();
2496 const bool warn_invalid_utf8_p = CPP_OPTION (pfile, cpp_warn_invalid_utf8);
2497 const bool warn_bidi_or_invalid_utf8_p = warn_bidi_p | warn_invalid_utf8_p;
2499 /* 'tis a pity this information isn't passed down from the lexer's
2500 initial categorization of the token. */
2501 enum cpp_ttype type = CPP_STRING;
2503 if (*pos == 'L')
2505 type = CPP_WSTRING;
2506 pos++;
2508 else if (*pos == 'U')
2510 type = CPP_STRING32;
2511 pos++;
2513 else if (*pos == 'u')
2515 if (pos[1] == '8')
2517 type = CPP_UTF8STRING;
2518 pos++;
2520 else
2521 type = CPP_STRING16;
2522 pos++;
2525 gcc_checking_assert (pos[0] == 'R' && pos[1] == '"');
2526 pos += 2;
2528 _cpp_line_note *note = &pfile->buffer->notes[pfile->buffer->cur_note];
2530 /* Skip notes before the ". */
2531 while (note->pos < pos)
2532 ++note;
2534 lit_accum accum;
2536 uchar prefix[17];
2537 unsigned prefix_len = 0;
2538 enum Phase
2540 PHASE_PREFIX = -2,
2541 PHASE_NONE = -1,
2542 PHASE_SUFFIX = 0
2543 } phase = PHASE_PREFIX;
2545 for (;;)
2547 gcc_checking_assert (note->pos >= pos);
2549 /* Undo any escaped newlines and trigraphs. */
2550 if (!accum.reading_p () && note->pos == pos)
2551 switch (note->type)
2553 case '\\':
2554 case ' ':
2555 /* Restore backslash followed by newline. */
2556 accum.append (pfile, base, pos - base);
2557 base = pos;
2558 accum.read_begin (pfile);
2559 accum.append (pfile, UC"\\", 1);
2561 after_backslash:
2562 if (note->type == ' ')
2563 /* GNU backslash whitespace newline extension. FIXME
2564 could be any sequence of non-vertical space. When we
2565 can properly restore any such sequence, we should
2566 mark this note as handled so _cpp_process_line_notes
2567 doesn't warn. */
2568 accum.append (pfile, UC" ", 1);
2570 accum.append (pfile, UC"\n", 1);
2571 note++;
2572 break;
2574 case '\n':
2575 /* This can happen for ??/<NEWLINE> when trigraphs are not
2576 being interpretted. */
2577 gcc_checking_assert (!CPP_OPTION (pfile, trigraphs));
2578 note->type = 0;
2579 note++;
2580 break;
2582 default:
2583 gcc_checking_assert (_cpp_trigraph_map[note->type]);
2585 /* Don't warn about this trigraph in
2586 _cpp_process_line_notes, since trigraphs show up as
2587 trigraphs in raw strings. */
2588 uchar type = note->type;
2589 note->type = 0;
2591 if (CPP_OPTION (pfile, trigraphs))
2593 accum.append (pfile, base, pos - base);
2594 base = pos;
2595 accum.read_begin (pfile);
2596 accum.append (pfile, UC"??", 2);
2597 accum.append (pfile, &type, 1);
2599 /* ??/ followed by newline gets two line notes, one for
2600 the trigraph and one for the backslash/newline. */
2601 if (type == '/' && note[1].pos == pos)
2603 note++;
2604 gcc_assert (note->type == '\\' || note->type == ' ');
2605 goto after_backslash;
2607 /* Skip the replacement character. */
2608 base = ++pos;
2611 note++;
2612 break;
2615 /* Now get a char to process. Either from an expanded note, or
2616 from the line buffer. */
2617 bool read_note = accum.reading_p ();
2618 char c = read_note ? accum.read_char () : *pos++;
2620 if (phase == PHASE_PREFIX)
2622 if (c == '(')
2624 /* Done. */
2625 phase = PHASE_NONE;
2626 prefix[prefix_len++] = '"';
2628 else if (prefix_len < 16
2629 /* Prefix chars are any of the basic character set,
2630 [lex.charset] except for '
2631 ()\\\t\v\f\n'. Optimized for a contiguous
2632 alphabet. */
2633 /* Unlike a switch, this collapses down to one or
2634 two shift and bitmask operations on an ASCII
2635 system, with an outlier or two. */
2636 && (('Z' - 'A' == 25
2637 ? ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'))
2638 : ISIDST (c))
2639 || (c >= '0' && c <= '9')
2640 || c == '_' || c == '{' || c == '}'
2641 || c == '[' || c == ']' || c == '#'
2642 || c == '<' || c == '>' || c == '%'
2643 || c == ':' || c == ';' || c == '.' || c == '?'
2644 || c == '*' || c == '+' || c == '-' || c == '/'
2645 || c == '^' || c == '&' || c == '|' || c == '~'
2646 || c == '!' || c == '=' || c == ','
2647 || c == '"' || c == '\''))
2648 prefix[prefix_len++] = c;
2649 else
2651 /* Something is wrong. */
2652 int col = CPP_BUF_COLUMN (pfile->buffer, pos) + read_note;
2653 if (prefix_len == 16)
2654 cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
2655 col, "raw string delimiter longer "
2656 "than 16 characters");
2657 else if (c == '\n')
2658 cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
2659 col, "invalid new-line in raw "
2660 "string delimiter");
2661 else
2662 cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
2663 col, "invalid character '%c' in "
2664 "raw string delimiter", c);
2665 type = CPP_OTHER;
2666 phase = PHASE_NONE;
2667 /* Continue until we get a close quote, that's probably
2668 the best failure mode. */
2669 prefix_len = 0;
2671 if (c != '\n')
2672 continue;
2675 if (phase != PHASE_NONE)
2677 if (prefix[phase] != c)
2678 phase = PHASE_NONE;
2679 else if (unsigned (phase + 1) == prefix_len)
2680 break;
2681 else
2683 phase = Phase (phase + 1);
2684 continue;
2688 if (!prefix_len && c == '"')
2689 /* Failure mode lexing. */
2690 goto out;
2691 else if (prefix_len && c == ')')
2692 phase = PHASE_SUFFIX;
2693 else if (!read_note && c == '\n')
2695 pos--;
2696 pfile->buffer->cur = pos;
2697 if (pfile->state.in_directive
2698 || (pfile->state.parsing_args
2699 && pfile->buffer->next_line >= pfile->buffer->rlimit))
2701 cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, 0,
2702 "unterminated raw string");
2703 type = CPP_OTHER;
2704 goto out;
2707 accum.append (pfile, base, pos - base + 1);
2708 _cpp_process_line_notes (pfile, false);
2710 if (pfile->buffer->next_line < pfile->buffer->rlimit)
2711 CPP_INCREMENT_LINE (pfile, 0);
2712 pfile->buffer->need_line = true;
2714 if (!_cpp_get_fresh_line (pfile))
2716 /* We ran out of file and failed to get a line. */
2717 location_t src_loc = token->src_loc;
2718 token->type = CPP_EOF;
2719 /* Tell the compiler the line number of the EOF token. */
2720 token->src_loc = pfile->line_table->highest_line;
2721 token->flags = BOL;
2722 if (accum.first)
2723 _cpp_release_buff (pfile, accum.first);
2724 cpp_error_with_line (pfile, CPP_DL_ERROR, src_loc, 0,
2725 "unterminated raw string");
2726 /* Now pop the buffer that _cpp_get_fresh_line did not. */
2727 _cpp_pop_buffer (pfile);
2728 return;
2731 pos = base = pfile->buffer->cur;
2732 note = &pfile->buffer->notes[pfile->buffer->cur_note];
2734 else if (__builtin_expect ((unsigned char) c >= utf8_continuation, 0)
2735 && warn_bidi_or_invalid_utf8_p)
2736 pos = _cpp_handle_multibyte_utf8 (pfile, c, pos, warn_bidi_p,
2737 warn_invalid_utf8_p);
2740 if (warn_bidi_p)
2741 maybe_warn_bidi_on_close (pfile, pos);
2743 if (CPP_OPTION (pfile, user_literals))
2745 /* If a string format macro, say from inttypes.h, is placed touching
2746 a string literal it could be parsed as a C++11 user-defined string
2747 literal thus breaking the program. */
2748 if (is_macro_not_literal_suffix (pfile, pos))
2750 /* Raise a warning, but do not consume subsequent tokens. */
2751 if (CPP_OPTION (pfile, warn_literal_suffix) && !pfile->state.skipping)
2752 cpp_warning_with_line (pfile, CPP_W_LITERAL_SUFFIX,
2753 token->src_loc, 0,
2754 "invalid suffix on literal; C++11 requires "
2755 "a space between literal and string macro");
2757 /* Grab user defined literal suffix. */
2758 else if (ISIDST (*pos))
2760 type = cpp_userdef_string_add_type (type);
2761 ++pos;
2763 while (ISIDNUM (*pos))
2764 ++pos;
2768 out:
2769 pfile->buffer->cur = pos;
2770 if (!accum.accum)
2771 create_literal (pfile, token, base, pos - base, type);
2772 else
2774 size_t extra_len = pos - base;
2775 uchar *dest = _cpp_unaligned_alloc (pfile, accum.accum + extra_len + 1);
2777 token->type = type;
2778 token->val.str.len = accum.accum + extra_len;
2779 token->val.str.text = dest;
2780 for (_cpp_buff *buf = accum.first; buf; buf = buf->next)
2782 size_t len = BUFF_FRONT (buf) - buf->base;
2783 memcpy (dest, buf->base, len);
2784 dest += len;
2786 _cpp_release_buff (pfile, accum.first);
2787 memcpy (dest, base, extra_len);
2788 dest[extra_len] = '\0';
2792 /* Lexes a string, character constant, or angle-bracketed header file
2793 name. The stored string contains the spelling, including opening
2794 quote and any leading 'L', 'u', 'U' or 'u8' and optional
2795 'R' modifier. It returns the type of the literal, or CPP_OTHER
2796 if it was not properly terminated, or CPP_LESS for an unterminated
2797 header name which must be relexed as normal tokens.
2799 The spelling is NUL-terminated, but it is not guaranteed that this
2800 is the first NUL since embedded NULs are preserved. */
2801 static void
2802 lex_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
2804 bool saw_NUL = false;
2805 const uchar *cur;
2806 cppchar_t terminator;
2807 enum cpp_ttype type;
2809 cur = base;
2810 terminator = *cur++;
2811 if (terminator == 'L' || terminator == 'U')
2812 terminator = *cur++;
2813 else if (terminator == 'u')
2815 terminator = *cur++;
2816 if (terminator == '8')
2817 terminator = *cur++;
2819 if (terminator == 'R')
2821 lex_raw_string (pfile, token, base);
2822 return;
2824 if (terminator == '"')
2825 type = (*base == 'L' ? CPP_WSTRING :
2826 *base == 'U' ? CPP_STRING32 :
2827 *base == 'u' ? (base[1] == '8' ? CPP_UTF8STRING : CPP_STRING16)
2828 : CPP_STRING);
2829 else if (terminator == '\'')
2830 type = (*base == 'L' ? CPP_WCHAR :
2831 *base == 'U' ? CPP_CHAR32 :
2832 *base == 'u' ? (base[1] == '8' ? CPP_UTF8CHAR : CPP_CHAR16)
2833 : CPP_CHAR);
2834 else
2835 terminator = '>', type = CPP_HEADER_NAME;
2837 const bool warn_bidi_p = pfile->warn_bidi_p ();
2838 const bool warn_invalid_utf8_p = CPP_OPTION (pfile, cpp_warn_invalid_utf8);
2839 const bool warn_bidi_or_invalid_utf8_p = warn_bidi_p | warn_invalid_utf8_p;
2840 for (;;)
2842 cppchar_t c = *cur++;
2844 /* In #include-style directives, terminators are not escapable. */
2845 if (c == '\\' && !pfile->state.angled_headers && *cur != '\n')
2847 if ((cur[0] == 'u' || cur[0] == 'U' || cur[0] == 'N') && warn_bidi_p)
2849 location_t loc;
2850 bidi::kind kind;
2851 if (cur[0] == 'N')
2852 kind = get_bidi_named (pfile, cur + 1, &loc);
2853 else
2854 kind = get_bidi_ucn (pfile, cur + 1, cur[0] == 'U', &loc);
2855 maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/true, loc);
2857 cur++;
2859 else if (c == terminator)
2861 if (warn_bidi_p)
2862 maybe_warn_bidi_on_close (pfile, cur - 1);
2863 break;
2865 else if (c == '\n')
2867 cur--;
2868 /* Unmatched quotes always yield undefined behavior, but
2869 greedy lexing means that what appears to be an unterminated
2870 header name may actually be a legitimate sequence of tokens. */
2871 if (terminator == '>')
2873 token->type = CPP_LESS;
2874 return;
2876 type = CPP_OTHER;
2877 break;
2879 else if (c == '\0')
2880 saw_NUL = true;
2881 else if (__builtin_expect (c >= utf8_continuation, 0)
2882 && warn_bidi_or_invalid_utf8_p)
2883 cur = _cpp_handle_multibyte_utf8 (pfile, c, cur, warn_bidi_p,
2884 warn_invalid_utf8_p);
2887 if (saw_NUL && !pfile->state.skipping)
2888 cpp_error (pfile, CPP_DL_WARNING,
2889 "null character(s) preserved in literal");
2891 if (type == CPP_OTHER && CPP_OPTION (pfile, lang) != CLK_ASM)
2892 cpp_error (pfile, CPP_DL_PEDWARN, "missing terminating %c character",
2893 (int) terminator);
2895 if (CPP_OPTION (pfile, user_literals))
2897 /* If a string format macro, say from inttypes.h, is placed touching
2898 a string literal it could be parsed as a C++11 user-defined string
2899 literal thus breaking the program. */
2900 if (is_macro_not_literal_suffix (pfile, cur))
2902 /* Raise a warning, but do not consume subsequent tokens. */
2903 if (CPP_OPTION (pfile, warn_literal_suffix) && !pfile->state.skipping)
2904 cpp_warning_with_line (pfile, CPP_W_LITERAL_SUFFIX,
2905 token->src_loc, 0,
2906 "invalid suffix on literal; C++11 requires "
2907 "a space between literal and string macro");
2909 /* Grab user defined literal suffix. */
2910 else if (ISIDST (*cur))
2912 type = cpp_userdef_char_add_type (type);
2913 type = cpp_userdef_string_add_type (type);
2914 ++cur;
2916 while (ISIDNUM (*cur))
2917 ++cur;
2920 else if (CPP_OPTION (pfile, cpp_warn_cxx11_compat)
2921 && is_macro (pfile, cur)
2922 && !pfile->state.skipping)
2923 cpp_warning_with_line (pfile, CPP_W_CXX11_COMPAT,
2924 token->src_loc, 0, "C++11 requires a space "
2925 "between string literal and macro");
2927 pfile->buffer->cur = cur;
2928 create_literal (pfile, token, base, cur - base, type);
2931 /* Return the comment table. The client may not make any assumption
2932 about the ordering of the table. */
2933 cpp_comment_table *
2934 cpp_get_comments (cpp_reader *pfile)
2936 return &pfile->comments;
2939 /* Append a comment to the end of the comment table. */
2940 static void
2941 store_comment (cpp_reader *pfile, cpp_token *token)
2943 int len;
2945 if (pfile->comments.allocated == 0)
2947 pfile->comments.allocated = 256;
2948 pfile->comments.entries = (cpp_comment *) xmalloc
2949 (pfile->comments.allocated * sizeof (cpp_comment));
2952 if (pfile->comments.count == pfile->comments.allocated)
2954 pfile->comments.allocated *= 2;
2955 pfile->comments.entries = (cpp_comment *) xrealloc
2956 (pfile->comments.entries,
2957 pfile->comments.allocated * sizeof (cpp_comment));
2960 len = token->val.str.len;
2962 /* Copy comment. Note, token may not be NULL terminated. */
2963 pfile->comments.entries[pfile->comments.count].comment =
2964 (char *) xmalloc (sizeof (char) * (len + 1));
2965 memcpy (pfile->comments.entries[pfile->comments.count].comment,
2966 token->val.str.text, len);
2967 pfile->comments.entries[pfile->comments.count].comment[len] = '\0';
2969 /* Set source location. */
2970 pfile->comments.entries[pfile->comments.count].sloc = token->src_loc;
2972 /* Increment the count of entries in the comment table. */
2973 pfile->comments.count++;
2976 /* The stored comment includes the comment start and any terminator. */
2977 static void
2978 save_comment (cpp_reader *pfile, cpp_token *token, const unsigned char *from,
2979 cppchar_t type)
2981 unsigned char *buffer;
2982 unsigned int len, clen, i;
2984 len = pfile->buffer->cur - from + 1; /* + 1 for the initial '/'. */
2986 /* C++ comments probably (not definitely) have moved past a new
2987 line, which we don't want to save in the comment. */
2988 if (is_vspace (pfile->buffer->cur[-1]))
2989 len--;
2991 /* If we are currently in a directive or in argument parsing, then
2992 we need to store all C++ comments as C comments internally, and
2993 so we need to allocate a little extra space in that case.
2995 Note that the only time we encounter a directive here is
2996 when we are saving comments in a "#define". */
2997 clen = ((pfile->state.in_directive || pfile->state.parsing_args)
2998 && type == '/') ? len + 2 : len;
3000 buffer = _cpp_unaligned_alloc (pfile, clen);
3002 token->type = CPP_COMMENT;
3003 token->val.str.len = clen;
3004 token->val.str.text = buffer;
3006 buffer[0] = '/';
3007 memcpy (buffer + 1, from, len - 1);
3009 /* Finish conversion to a C comment, if necessary. */
3010 if ((pfile->state.in_directive || pfile->state.parsing_args) && type == '/')
3012 buffer[1] = '*';
3013 buffer[clen - 2] = '*';
3014 buffer[clen - 1] = '/';
3015 /* As there can be in a C++ comments illegal sequences for C comments
3016 we need to filter them out. */
3017 for (i = 2; i < (clen - 2); i++)
3018 if (buffer[i] == '/' && (buffer[i - 1] == '*' || buffer[i + 1] == '*'))
3019 buffer[i] = '|';
3022 /* Finally store this comment for use by clients of libcpp. */
3023 store_comment (pfile, token);
3026 /* Returns true if comment at COMMENT_START is a recognized FALLTHROUGH
3027 comment. */
3029 static bool
3030 fallthrough_comment_p (cpp_reader *pfile, const unsigned char *comment_start)
3032 const unsigned char *from = comment_start + 1;
3034 switch (CPP_OPTION (pfile, cpp_warn_implicit_fallthrough))
3036 /* For both -Wimplicit-fallthrough=0 and -Wimplicit-fallthrough=5 we
3037 don't recognize any comments. The latter only checks attributes,
3038 the former doesn't warn. */
3039 case 0:
3040 default:
3041 return false;
3042 /* -Wimplicit-fallthrough=1 considers any comment, no matter what
3043 content it has. */
3044 case 1:
3045 return true;
3046 case 2:
3047 /* -Wimplicit-fallthrough=2 looks for (case insensitive)
3048 .*falls?[ \t-]*thr(u|ough).* regex. */
3049 for (; (size_t) (pfile->buffer->cur - from) >= sizeof "fallthru" - 1;
3050 from++)
3052 /* Is there anything like strpbrk with upper boundary, or
3053 memchr looking for 2 characters rather than just one? */
3054 if (from[0] != 'f' && from[0] != 'F')
3055 continue;
3056 if (from[1] != 'a' && from[1] != 'A')
3057 continue;
3058 if (from[2] != 'l' && from[2] != 'L')
3059 continue;
3060 if (from[3] != 'l' && from[3] != 'L')
3061 continue;
3062 from += sizeof "fall" - 1;
3063 if (from[0] == 's' || from[0] == 'S')
3064 from++;
3065 while (*from == ' ' || *from == '\t' || *from == '-')
3066 from++;
3067 if (from[0] != 't' && from[0] != 'T')
3068 continue;
3069 if (from[1] != 'h' && from[1] != 'H')
3070 continue;
3071 if (from[2] != 'r' && from[2] != 'R')
3072 continue;
3073 if (from[3] == 'u' || from[3] == 'U')
3074 return true;
3075 if (from[3] != 'o' && from[3] != 'O')
3076 continue;
3077 if (from[4] != 'u' && from[4] != 'U')
3078 continue;
3079 if (from[5] != 'g' && from[5] != 'G')
3080 continue;
3081 if (from[6] != 'h' && from[6] != 'H')
3082 continue;
3083 return true;
3085 return false;
3086 case 3:
3087 case 4:
3088 break;
3091 /* Whole comment contents:
3092 -fallthrough
3093 @fallthrough@
3095 if (*from == '-' || *from == '@')
3097 size_t len = sizeof "fallthrough" - 1;
3098 if ((size_t) (pfile->buffer->cur - from - 1) < len)
3099 return false;
3100 if (memcmp (from + 1, "fallthrough", len))
3101 return false;
3102 if (*from == '@')
3104 if (from[len + 1] != '@')
3105 return false;
3106 len++;
3108 from += 1 + len;
3110 /* Whole comment contents (regex):
3111 lint -fallthrough[ \t]*
3113 else if (*from == 'l')
3115 size_t len = sizeof "int -fallthrough" - 1;
3116 if ((size_t) (pfile->buffer->cur - from - 1) < len)
3117 return false;
3118 if (memcmp (from + 1, "int -fallthrough", len))
3119 return false;
3120 from += 1 + len;
3121 while (*from == ' ' || *from == '\t')
3122 from++;
3124 /* Whole comment contents (regex):
3125 [ \t]*FALLTHR(U|OUGH)[ \t]*
3127 else if (CPP_OPTION (pfile, cpp_warn_implicit_fallthrough) == 4)
3129 while (*from == ' ' || *from == '\t')
3130 from++;
3131 if ((size_t) (pfile->buffer->cur - from) < sizeof "FALLTHRU" - 1)
3132 return false;
3133 if (memcmp (from, "FALLTHR", sizeof "FALLTHR" - 1))
3134 return false;
3135 from += sizeof "FALLTHR" - 1;
3136 if (*from == 'U')
3137 from++;
3138 else if ((size_t) (pfile->buffer->cur - from) < sizeof "OUGH" - 1)
3139 return false;
3140 else if (memcmp (from, "OUGH", sizeof "OUGH" - 1))
3141 return false;
3142 else
3143 from += sizeof "OUGH" - 1;
3144 while (*from == ' ' || *from == '\t')
3145 from++;
3147 /* Whole comment contents (regex):
3148 [ \t.!]*(ELSE,? |INTENTIONAL(LY)? )?FALL(S | |-)?THR(OUGH|U)[ \t.!]*(-[^\n\r]*)?
3149 [ \t.!]*(Else,? |Intentional(ly)? )?Fall((s | |-)[Tt]|t)hr(ough|u)[ \t.!]*(-[^\n\r]*)?
3150 [ \t.!]*([Ee]lse,? |[Ii]ntentional(ly)? )?fall(s | |-)?thr(ough|u)[ \t.!]*(-[^\n\r]*)?
3152 else
3154 while (*from == ' ' || *from == '\t' || *from == '.' || *from == '!')
3155 from++;
3156 unsigned char f = *from;
3157 bool all_upper = false;
3158 if (f == 'E' || f == 'e')
3160 if ((size_t) (pfile->buffer->cur - from)
3161 < sizeof "else fallthru" - 1)
3162 return false;
3163 if (f == 'E' && memcmp (from + 1, "LSE", sizeof "LSE" - 1) == 0)
3164 all_upper = true;
3165 else if (memcmp (from + 1, "lse", sizeof "lse" - 1))
3166 return false;
3167 from += sizeof "else" - 1;
3168 if (*from == ',')
3169 from++;
3170 if (*from != ' ')
3171 return false;
3172 from++;
3173 if (all_upper && *from == 'f')
3174 return false;
3175 if (f == 'e' && *from == 'F')
3176 return false;
3177 f = *from;
3179 else if (f == 'I' || f == 'i')
3181 if ((size_t) (pfile->buffer->cur - from)
3182 < sizeof "intentional fallthru" - 1)
3183 return false;
3184 if (f == 'I' && memcmp (from + 1, "NTENTIONAL",
3185 sizeof "NTENTIONAL" - 1) == 0)
3186 all_upper = true;
3187 else if (memcmp (from + 1, "ntentional",
3188 sizeof "ntentional" - 1))
3189 return false;
3190 from += sizeof "intentional" - 1;
3191 if (*from == ' ')
3193 from++;
3194 if (all_upper && *from == 'f')
3195 return false;
3197 else if (all_upper)
3199 if (memcmp (from, "LY F", sizeof "LY F" - 1))
3200 return false;
3201 from += sizeof "LY " - 1;
3203 else
3205 if (memcmp (from, "ly ", sizeof "ly " - 1))
3206 return false;
3207 from += sizeof "ly " - 1;
3209 if (f == 'i' && *from == 'F')
3210 return false;
3211 f = *from;
3213 if (f != 'F' && f != 'f')
3214 return false;
3215 if ((size_t) (pfile->buffer->cur - from) < sizeof "fallthru" - 1)
3216 return false;
3217 if (f == 'F' && memcmp (from + 1, "ALL", sizeof "ALL" - 1) == 0)
3218 all_upper = true;
3219 else if (all_upper)
3220 return false;
3221 else if (memcmp (from + 1, "all", sizeof "all" - 1))
3222 return false;
3223 from += sizeof "fall" - 1;
3224 if (*from == (all_upper ? 'S' : 's') && from[1] == ' ')
3225 from += 2;
3226 else if (*from == ' ' || *from == '-')
3227 from++;
3228 else if (*from != (all_upper ? 'T' : 't'))
3229 return false;
3230 if ((f == 'f' || *from != 'T') && (all_upper || *from != 't'))
3231 return false;
3232 if ((size_t) (pfile->buffer->cur - from) < sizeof "thru" - 1)
3233 return false;
3234 if (memcmp (from + 1, all_upper ? "HRU" : "hru", sizeof "hru" - 1))
3236 if ((size_t) (pfile->buffer->cur - from) < sizeof "through" - 1)
3237 return false;
3238 if (memcmp (from + 1, all_upper ? "HROUGH" : "hrough",
3239 sizeof "hrough" - 1))
3240 return false;
3241 from += sizeof "through" - 1;
3243 else
3244 from += sizeof "thru" - 1;
3245 while (*from == ' ' || *from == '\t' || *from == '.' || *from == '!')
3246 from++;
3247 if (*from == '-')
3249 from++;
3250 if (*comment_start == '*')
3254 while (*from && *from != '*'
3255 && *from != '\n' && *from != '\r')
3256 from++;
3257 if (*from != '*' || from[1] == '/')
3258 break;
3259 from++;
3261 while (1);
3263 else
3264 while (*from && *from != '\n' && *from != '\r')
3265 from++;
3268 /* C block comment. */
3269 if (*comment_start == '*')
3271 if (*from != '*' || from[1] != '/')
3272 return false;
3274 /* C++ line comment. */
3275 else if (*from != '\n')
3276 return false;
3278 return true;
3281 /* Allocate COUNT tokens for RUN. */
3282 void
3283 _cpp_init_tokenrun (tokenrun *run, unsigned int count)
3285 run->base = XNEWVEC (cpp_token, count);
3286 run->limit = run->base + count;
3287 run->next = NULL;
3290 /* Returns the next tokenrun, or creates one if there is none. */
3291 static tokenrun *
3292 next_tokenrun (tokenrun *run)
3294 if (run->next == NULL)
3296 run->next = XNEW (tokenrun);
3297 run->next->prev = run;
3298 _cpp_init_tokenrun (run->next, 250);
3301 return run->next;
3304 /* Return the number of not yet processed token in a given
3305 context. */
3307 _cpp_remaining_tokens_num_in_context (cpp_context *context)
3309 if (context->tokens_kind == TOKENS_KIND_DIRECT)
3310 return (LAST (context).token - FIRST (context).token);
3311 else if (context->tokens_kind == TOKENS_KIND_INDIRECT
3312 || context->tokens_kind == TOKENS_KIND_EXTENDED)
3313 return (LAST (context).ptoken - FIRST (context).ptoken);
3314 else
3315 abort ();
3318 /* Returns the token present at index INDEX in a given context. If
3319 INDEX is zero, the next token to be processed is returned. */
3320 static const cpp_token*
3321 _cpp_token_from_context_at (cpp_context *context, int index)
3323 if (context->tokens_kind == TOKENS_KIND_DIRECT)
3324 return &(FIRST (context).token[index]);
3325 else if (context->tokens_kind == TOKENS_KIND_INDIRECT
3326 || context->tokens_kind == TOKENS_KIND_EXTENDED)
3327 return FIRST (context).ptoken[index];
3328 else
3329 abort ();
3332 /* Look ahead in the input stream. */
3333 const cpp_token *
3334 cpp_peek_token (cpp_reader *pfile, int index)
3336 cpp_context *context = pfile->context;
3337 const cpp_token *peektok;
3338 int count;
3340 /* First, scan through any pending cpp_context objects. */
3341 while (context->prev)
3343 ptrdiff_t sz = _cpp_remaining_tokens_num_in_context (context);
3345 if (index < (int) sz)
3346 return _cpp_token_from_context_at (context, index);
3347 index -= (int) sz;
3348 context = context->prev;
3351 /* We will have to read some new tokens after all (and do so
3352 without invalidating preceding tokens). */
3353 count = index;
3354 pfile->keep_tokens++;
3356 /* For peeked tokens temporarily disable line_change reporting,
3357 until the tokens are parsed for real. */
3358 void (*line_change) (cpp_reader *, const cpp_token *, int)
3359 = pfile->cb.line_change;
3360 pfile->cb.line_change = NULL;
3364 peektok = _cpp_lex_token (pfile);
3365 if (peektok->type == CPP_EOF)
3367 index--;
3368 break;
3370 else if (peektok->type == CPP_PRAGMA)
3372 /* Don't peek past a pragma. */
3373 if (peektok == &pfile->directive_result)
3374 /* Save the pragma in the buffer. */
3375 *pfile->cur_token++ = *peektok;
3376 index--;
3377 break;
3380 while (index--);
3382 _cpp_backup_tokens_direct (pfile, count - index);
3383 pfile->keep_tokens--;
3384 pfile->cb.line_change = line_change;
3386 return peektok;
3389 /* Allocate a single token that is invalidated at the same time as the
3390 rest of the tokens on the line. Has its line and col set to the
3391 same as the last lexed token, so that diagnostics appear in the
3392 right place. */
3393 cpp_token *
3394 _cpp_temp_token (cpp_reader *pfile)
3396 cpp_token *old, *result;
3397 ptrdiff_t sz = pfile->cur_run->limit - pfile->cur_token;
3398 ptrdiff_t la = (ptrdiff_t) pfile->lookaheads;
3400 old = pfile->cur_token - 1;
3401 /* Any pre-existing lookaheads must not be clobbered. */
3402 if (la)
3404 if (sz <= la)
3406 tokenrun *next = next_tokenrun (pfile->cur_run);
3408 if (sz < la)
3409 memmove (next->base + 1, next->base,
3410 (la - sz) * sizeof (cpp_token));
3412 next->base[0] = pfile->cur_run->limit[-1];
3415 if (sz > 1)
3416 memmove (pfile->cur_token + 1, pfile->cur_token,
3417 MIN (la, sz - 1) * sizeof (cpp_token));
3420 if (!sz && pfile->cur_token == pfile->cur_run->limit)
3422 pfile->cur_run = next_tokenrun (pfile->cur_run);
3423 pfile->cur_token = pfile->cur_run->base;
3426 result = pfile->cur_token++;
3427 result->src_loc = old->src_loc;
3428 return result;
3431 /* We're at the beginning of a logical line (so not in
3432 directives-mode) and RESULT is a CPP_NAME with NODE_MODULE set. See
3433 if we should enter deferred_pragma mode to tokenize the rest of the
3434 line as a module control-line. */
3436 static void
3437 cpp_maybe_module_directive (cpp_reader *pfile, cpp_token *result)
3439 unsigned backup = 0; /* Tokens we peeked. */
3440 cpp_hashnode *node = result->val.node.node;
3441 cpp_token *peek = result;
3442 cpp_token *keyword = peek;
3443 cpp_hashnode *(&n_modules)[spec_nodes::M_HWM][2] = pfile->spec_nodes.n_modules;
3444 int header_count = 0;
3446 /* Make sure the incoming state is as we expect it. This way we
3447 can restore it using constants. */
3448 gcc_checking_assert (!pfile->state.in_deferred_pragma
3449 && !pfile->state.skipping
3450 && !pfile->state.parsing_args
3451 && !pfile->state.angled_headers
3452 && (pfile->state.save_comments
3453 == !CPP_OPTION (pfile, discard_comments)));
3455 /* Enter directives mode sufficiently for peeking. We don't have
3456 to actually set in_directive. */
3457 pfile->state.in_deferred_pragma = true;
3459 /* These two fields are needed to process tokenization in deferred
3460 pragma mode. They are not used outside deferred pragma mode or
3461 directives mode. */
3462 pfile->state.pragma_allow_expansion = true;
3463 pfile->directive_line = result->src_loc;
3465 /* Saving comments is incompatible with directives mode. */
3466 pfile->state.save_comments = 0;
3468 if (node == n_modules[spec_nodes::M_EXPORT][0])
3470 peek = _cpp_lex_direct (pfile);
3471 keyword = peek;
3472 backup++;
3473 if (keyword->type != CPP_NAME)
3474 goto not_module;
3475 node = keyword->val.node.node;
3476 if (!(node->flags & NODE_MODULE))
3477 goto not_module;
3480 if (node == n_modules[spec_nodes::M__IMPORT][0])
3481 /* __import */
3482 header_count = backup + 2 + 16;
3483 else if (node == n_modules[spec_nodes::M_IMPORT][0])
3484 /* import */
3485 header_count = backup + 2 + (CPP_OPTION (pfile, preprocessed) ? 16 : 0);
3486 else if (node == n_modules[spec_nodes::M_MODULE][0])
3487 ; /* module */
3488 else
3489 goto not_module;
3491 /* We've seen [export] {module|import|__import}. Check the next token. */
3492 if (header_count)
3493 /* After '{,__}import' a header name may appear. */
3494 pfile->state.angled_headers = true;
3495 peek = _cpp_lex_direct (pfile);
3496 backup++;
3498 /* ... import followed by identifier, ':', '<' or
3499 header-name preprocessing tokens, or module
3500 followed by cpp-identifier, ':' or ';' preprocessing
3501 tokens. C++ keywords are not yet relevant. */
3502 if (peek->type == CPP_NAME
3503 || peek->type == CPP_COLON
3504 || (header_count
3505 ? (peek->type == CPP_LESS
3506 || (peek->type == CPP_STRING && peek->val.str.text[0] != 'R')
3507 || peek->type == CPP_HEADER_NAME)
3508 : peek->type == CPP_SEMICOLON))
3510 pfile->state.pragma_allow_expansion = !CPP_OPTION (pfile, preprocessed);
3511 if (!pfile->state.pragma_allow_expansion)
3512 pfile->state.prevent_expansion++;
3514 if (!header_count && linemap_included_from
3515 (LINEMAPS_LAST_ORDINARY_MAP (pfile->line_table)))
3516 cpp_error_with_line (pfile, CPP_DL_ERROR, keyword->src_loc, 0,
3517 "module control-line cannot be in included file");
3519 /* The first one or two tokens cannot be macro names. */
3520 for (int ix = backup; ix--;)
3522 cpp_token *tok = ix ? keyword : result;
3523 cpp_hashnode *node = tok->val.node.node;
3525 /* Don't attempt to expand the token. */
3526 tok->flags |= NO_EXPAND;
3527 if (_cpp_defined_macro_p (node)
3528 && _cpp_maybe_notify_macro_use (pfile, node, tok->src_loc)
3529 && !cpp_fun_like_macro_p (node))
3530 cpp_error_with_line (pfile, CPP_DL_ERROR, tok->src_loc, 0,
3531 "module control-line \"%s\" cannot be"
3532 " an object-like macro",
3533 NODE_NAME (node));
3536 /* Map to underbar variants. */
3537 keyword->val.node.node = n_modules[header_count
3538 ? spec_nodes::M_IMPORT
3539 : spec_nodes::M_MODULE][1];
3540 if (backup != 1)
3541 result->val.node.node = n_modules[spec_nodes::M_EXPORT][1];
3543 /* Maybe tell the tokenizer we expect a header-name down the
3544 road. */
3545 pfile->state.directive_file_token = header_count;
3547 else
3549 not_module:
3550 /* Drop out of directive mode. */
3551 /* We aaserted save_comments had this value upon entry. */
3552 pfile->state.save_comments
3553 = !CPP_OPTION (pfile, discard_comments);
3554 pfile->state.in_deferred_pragma = false;
3555 /* Do not let this remain on. */
3556 pfile->state.angled_headers = false;
3559 /* In either case we want to backup the peeked tokens. */
3560 if (backup)
3562 /* If we saw EOL, we should drop it, because this isn't a module
3563 control-line after all. */
3564 bool eol = peek->type == CPP_PRAGMA_EOL;
3565 if (!eol || backup > 1)
3567 /* Put put the peeked tokens back */
3568 _cpp_backup_tokens_direct (pfile, backup);
3569 /* But if the last one was an EOL, forget it. */
3570 if (eol)
3571 pfile->lookaheads--;
3576 /* Lex a token into RESULT (external interface). Takes care of issues
3577 like directive handling, token lookahead, multiple include
3578 optimization and skipping. */
3579 const cpp_token *
3580 _cpp_lex_token (cpp_reader *pfile)
3582 cpp_token *result;
3584 for (;;)
3586 if (pfile->cur_token == pfile->cur_run->limit)
3588 pfile->cur_run = next_tokenrun (pfile->cur_run);
3589 pfile->cur_token = pfile->cur_run->base;
3591 /* We assume that the current token is somewhere in the current
3592 run. */
3593 if (pfile->cur_token < pfile->cur_run->base
3594 || pfile->cur_token >= pfile->cur_run->limit)
3595 abort ();
3597 if (pfile->lookaheads)
3599 pfile->lookaheads--;
3600 result = pfile->cur_token++;
3602 else
3603 result = _cpp_lex_direct (pfile);
3605 if (result->flags & BOL)
3607 /* Is this a directive. If _cpp_handle_directive returns
3608 false, it is an assembler #. */
3609 if (result->type == CPP_HASH
3610 /* 6.10.3 p 11: Directives in a list of macro arguments
3611 gives undefined behavior. This implementation
3612 handles the directive as normal. */
3613 && pfile->state.parsing_args != 1)
3615 if (_cpp_handle_directive (pfile, result->flags & PREV_WHITE))
3617 if (pfile->directive_result.type == CPP_PADDING)
3618 continue;
3619 result = &pfile->directive_result;
3622 else if (pfile->state.in_deferred_pragma)
3623 result = &pfile->directive_result;
3624 else if (result->type == CPP_NAME
3625 && (result->val.node.node->flags & NODE_MODULE)
3626 && !pfile->state.skipping
3627 /* Unlike regular directives, we do not deal with
3628 tokenizing module directives as macro arguments.
3629 That's not permitted. */
3630 && !pfile->state.parsing_args)
3632 /* P1857. Before macro expansion, At start of logical
3633 line ... */
3634 /* We don't have to consider lookaheads at this point. */
3635 gcc_checking_assert (!pfile->lookaheads);
3637 cpp_maybe_module_directive (pfile, result);
3640 if (pfile->cb.line_change && !pfile->state.skipping)
3641 pfile->cb.line_change (pfile, result, pfile->state.parsing_args);
3644 /* We don't skip tokens in directives. */
3645 if (pfile->state.in_directive || pfile->state.in_deferred_pragma)
3646 break;
3648 /* Outside a directive, invalidate controlling macros. At file
3649 EOF, _cpp_lex_direct takes care of popping the buffer, so we never
3650 get here and MI optimization works. */
3651 pfile->mi_valid = false;
3653 if (!pfile->state.skipping || result->type == CPP_EOF)
3654 break;
3657 return result;
3660 /* Returns true if a fresh line has been loaded. */
3661 bool
3662 _cpp_get_fresh_line (cpp_reader *pfile)
3664 /* We can't get a new line until we leave the current directive. */
3665 if (pfile->state.in_directive)
3666 return false;
3668 for (;;)
3670 cpp_buffer *buffer = pfile->buffer;
3672 if (!buffer->need_line)
3673 return true;
3675 if (buffer->next_line < buffer->rlimit)
3677 _cpp_clean_line (pfile);
3678 return true;
3681 /* First, get out of parsing arguments state. */
3682 if (pfile->state.parsing_args)
3683 return false;
3685 /* End of buffer. Non-empty files should end in a newline. */
3686 if (buffer->buf != buffer->rlimit
3687 && buffer->next_line > buffer->rlimit
3688 && !buffer->from_stage3)
3690 /* Clip to buffer size. */
3691 buffer->next_line = buffer->rlimit;
3694 if (buffer->prev && !buffer->return_at_eof)
3695 _cpp_pop_buffer (pfile);
3696 else
3698 /* End of translation. Do not pop the buffer yet. Increment
3699 line number so that the EOF token is on a line of its own
3700 (_cpp_lex_direct doesn't increment in that case, because
3701 it's hard for it to distinguish this special case). */
3702 CPP_INCREMENT_LINE (pfile, 0);
3703 return false;
/* Helper for two-character operators.  Sets result->type to ELSE_TYPE
   unless the next unread character (*buffer->cur) is CHAR, in which
   case that character is consumed and result->type becomes THEN_TYPE.
   Relies on variables named `result' and `buffer' being in scope at
   the point of use.  */
#define IF_NEXT_IS(CHAR, THEN_TYPE, ELSE_TYPE)          \
  do                                                    \
    {                                                   \
      result->type = (ELSE_TYPE);                       \
      if (*buffer->cur == (CHAR))                       \
        buffer->cur++, result->type = (THEN_TYPE);      \
    }                                                   \
  while (0)
3717 /* Lex a token into pfile->cur_token, which is also incremented, to
3718 get diagnostics pointing to the correct location.
3720 Does not handle issues such as token lookahead, multiple-include
3721 optimization, directives, skipping etc. This function is only
3722 suitable for use by _cpp_lex_token, and in special cases like
3723 lex_expansion_token which doesn't care for any of these issues.
3725 When meeting a newline, returns CPP_EOF if parsing a directive,
3726 otherwise returns to the start of the token buffer if permissible.
3727 Returns the location of the lexed token. */
3728 cpp_token *
3729 _cpp_lex_direct (cpp_reader *pfile)
3731 cppchar_t c;
3732 cpp_buffer *buffer;
3733 const unsigned char *comment_start;
3734 bool fallthrough_comment = false;
3735 cpp_token *result = pfile->cur_token++;
3737 fresh_line:
3738 result->flags = 0;
3739 buffer = pfile->buffer;
3740 if (buffer->need_line)
3742 if (pfile->state.in_deferred_pragma)
3744 /* This can happen in cases like:
3745 #define loop(x) whatever
3746 #pragma omp loop
3747 where when trying to expand loop we need to peek
3748 next token after loop, but aren't still in_deferred_pragma
3749 mode but are in in_directive mode, so buffer->need_line
3750 is set, a CPP_EOF is peeked. */
3751 result->type = CPP_PRAGMA_EOL;
3752 pfile->state.in_deferred_pragma = false;
3753 if (!pfile->state.pragma_allow_expansion)
3754 pfile->state.prevent_expansion--;
3755 return result;
3757 if (!_cpp_get_fresh_line (pfile))
3759 result->type = CPP_EOF;
3760 /* Not a real EOF in a directive or arg parsing -- we refuse
3761 to advance to the next file now, and will once we're out
3762 of those modes. */
3763 if (!pfile->state.in_directive && !pfile->state.parsing_args)
3765 /* Tell the compiler the line number of the EOF token. */
3766 result->src_loc = pfile->line_table->highest_line;
3767 result->flags = BOL;
3768 /* Now pop the buffer that _cpp_get_fresh_line did not. */
3769 _cpp_pop_buffer (pfile);
3771 return result;
3773 if (buffer != pfile->buffer)
3774 fallthrough_comment = false;
3775 if (!pfile->keep_tokens)
3777 pfile->cur_run = &pfile->base_run;
3778 result = pfile->base_run.base;
3779 pfile->cur_token = result + 1;
3781 result->flags = BOL;
3782 if (pfile->state.parsing_args == 2)
3783 result->flags |= PREV_WHITE;
3785 buffer = pfile->buffer;
3786 update_tokens_line:
3787 result->src_loc = pfile->line_table->highest_line;
3789 skipped_white:
3790 if (buffer->cur >= buffer->notes[buffer->cur_note].pos
3791 && !pfile->overlaid_buffer)
3793 _cpp_process_line_notes (pfile, false);
3794 result->src_loc = pfile->line_table->highest_line;
3796 c = *buffer->cur++;
3798 if (pfile->forced_token_location)
3799 result->src_loc = pfile->forced_token_location;
3800 else
3801 result->src_loc = linemap_position_for_column (pfile->line_table,
3802 CPP_BUF_COLUMN (buffer, buffer->cur));
3804 switch (c)
3806 case ' ': case '\t': case '\f': case '\v': case '\0':
3807 result->flags |= PREV_WHITE;
3808 skip_whitespace (pfile, c);
3809 goto skipped_white;
3811 case '\n':
3812 /* Increment the line, unless this is the last line ... */
3813 if (buffer->cur < buffer->rlimit
3814 /* ... or this is a #include, (where _cpp_stack_file needs to
3815 unwind by one line) ... */
3816 || (pfile->state.in_directive > 1
3817 /* ... except traditional-cpp increments this elsewhere. */
3818 && !CPP_OPTION (pfile, traditional)))
3819 CPP_INCREMENT_LINE (pfile, 0);
3820 buffer->need_line = true;
3821 if (pfile->state.in_deferred_pragma)
3823 /* Produce the PRAGMA_EOL on this line. File reading
3824 ensures there is always a \n at end of the buffer, thus
3825 in a deferred pragma we always see CPP_PRAGMA_EOL before
3826 any CPP_EOF. */
3827 result->type = CPP_PRAGMA_EOL;
3828 result->flags &= ~PREV_WHITE;
3829 pfile->state.in_deferred_pragma = false;
3830 if (!pfile->state.pragma_allow_expansion)
3831 pfile->state.prevent_expansion--;
3832 return result;
3834 goto fresh_line;
3836 case '0': case '1': case '2': case '3': case '4':
3837 case '5': case '6': case '7': case '8': case '9':
3839 struct normalize_state nst = INITIAL_NORMALIZE_STATE;
3840 result->type = CPP_NUMBER;
3841 lex_number (pfile, &result->val.str, &nst);
3842 warn_about_normalization (pfile, result, &nst);
3843 break;
3846 case 'L':
3847 case 'u':
3848 case 'U':
3849 case 'R':
3850 /* 'L', 'u', 'U', 'u8' or 'R' may introduce wide characters,
3851 wide strings or raw strings. */
3852 if (c == 'L' || CPP_OPTION (pfile, rliterals)
3853 || (c != 'R' && CPP_OPTION (pfile, uliterals)))
3855 if ((*buffer->cur == '\'' && c != 'R')
3856 || *buffer->cur == '"'
3857 || (*buffer->cur == 'R'
3858 && c != 'R'
3859 && buffer->cur[1] == '"'
3860 && CPP_OPTION (pfile, rliterals))
3861 || (*buffer->cur == '8'
3862 && c == 'u'
3863 && ((buffer->cur[1] == '"' || (buffer->cur[1] == '\''
3864 && CPP_OPTION (pfile, utf8_char_literals)))
3865 || (buffer->cur[1] == 'R' && buffer->cur[2] == '"'
3866 && CPP_OPTION (pfile, rliterals)))))
3868 lex_string (pfile, result, buffer->cur - 1);
3869 break;
3872 /* Fall through. */
3874 case '_':
3875 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
3876 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
3877 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
3878 case 's': case 't': case 'v': case 'w': case 'x':
3879 case 'y': case 'z':
3880 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
3881 case 'G': case 'H': case 'I': case 'J': case 'K':
3882 case 'M': case 'N': case 'O': case 'P': case 'Q':
3883 case 'S': case 'T': case 'V': case 'W': case 'X':
3884 case 'Y': case 'Z':
3885 result->type = CPP_NAME;
3887 struct normalize_state nst = INITIAL_NORMALIZE_STATE;
3888 result->val.node.node = lex_identifier (pfile, buffer->cur - 1, false,
3889 &nst,
3890 &result->val.node.spelling);
3891 warn_about_normalization (pfile, result, &nst);
3894 /* Convert named operators to their proper types. */
3895 if (result->val.node.node->flags & NODE_OPERATOR)
3897 result->flags |= NAMED_OP;
3898 result->type = (enum cpp_ttype) result->val.node.node->directive_index;
3901 /* Signal FALLTHROUGH comment followed by another token. */
3902 if (fallthrough_comment)
3903 result->flags |= PREV_FALLTHROUGH;
3904 break;
3906 case '\'':
3907 case '"':
3908 lex_string (pfile, result, buffer->cur - 1);
3909 break;
3911 case '/':
3912 /* A potential block or line comment. */
3913 comment_start = buffer->cur;
3914 c = *buffer->cur;
3916 if (c == '*')
3918 if (_cpp_skip_block_comment (pfile))
3919 cpp_error (pfile, CPP_DL_ERROR, "unterminated comment");
3921 else if (c == '/' && ! CPP_OPTION (pfile, traditional))
3923 /* Don't warn for system headers. */
3924 if (_cpp_in_system_header (pfile))
3926 /* Warn about comments if pedantically GNUC89, and not
3927 in system headers. */
3928 else if (CPP_OPTION (pfile, lang) == CLK_GNUC89
3929 && CPP_PEDANTIC (pfile)
3930 && ! buffer->warned_cplusplus_comments)
3932 if (cpp_error (pfile, CPP_DL_PEDWARN,
3933 "C++ style comments are not allowed in ISO C90"))
3934 cpp_error (pfile, CPP_DL_NOTE,
3935 "(this will be reported only once per input file)");
3936 buffer->warned_cplusplus_comments = 1;
3938 /* Or if specifically desired via -Wc90-c99-compat. */
3939 else if (CPP_OPTION (pfile, cpp_warn_c90_c99_compat) > 0
3940 && ! CPP_OPTION (pfile, cplusplus)
3941 && ! buffer->warned_cplusplus_comments)
3943 if (cpp_error (pfile, CPP_DL_WARNING,
3944 "C++ style comments are incompatible with C90"))
3945 cpp_error (pfile, CPP_DL_NOTE,
3946 "(this will be reported only once per input file)");
3947 buffer->warned_cplusplus_comments = 1;
3949 /* In C89/C94, C++ style comments are forbidden. */
3950 else if ((CPP_OPTION (pfile, lang) == CLK_STDC89
3951 || CPP_OPTION (pfile, lang) == CLK_STDC94))
3953 /* But don't be confused about valid code such as
3954 - // immediately followed by *,
3955 - // in a preprocessing directive,
3956 - // in an #if 0 block. */
3957 if (buffer->cur[1] == '*'
3958 || pfile->state.in_directive
3959 || pfile->state.skipping)
3961 result->type = CPP_DIV;
3962 break;
3964 else if (! buffer->warned_cplusplus_comments)
3966 if (cpp_error (pfile, CPP_DL_ERROR,
3967 "C++ style comments are not allowed in "
3968 "ISO C90"))
3969 cpp_error (pfile, CPP_DL_NOTE,
3970 "(this will be reported only once per input "
3971 "file)");
3972 buffer->warned_cplusplus_comments = 1;
3975 if (skip_line_comment (pfile) && CPP_OPTION (pfile, warn_comments))
3976 cpp_warning (pfile, CPP_W_COMMENTS, "multi-line comment");
3978 else if (c == '=')
3980 buffer->cur++;
3981 result->type = CPP_DIV_EQ;
3982 break;
3984 else
3986 result->type = CPP_DIV;
3987 break;
3990 if (fallthrough_comment_p (pfile, comment_start))
3991 fallthrough_comment = true;
3993 if (pfile->cb.comment)
3995 size_t len = pfile->buffer->cur - comment_start;
3996 pfile->cb.comment (pfile, result->src_loc, comment_start - 1,
3997 len + 1);
4000 if (!pfile->state.save_comments)
4002 result->flags |= PREV_WHITE;
4003 goto update_tokens_line;
4006 if (fallthrough_comment)
4007 result->flags |= PREV_FALLTHROUGH;
4009 /* Save the comment as a token in its own right. */
4010 save_comment (pfile, result, comment_start, c);
4011 break;
4013 case '<':
4014 if (pfile->state.angled_headers)
4016 lex_string (pfile, result, buffer->cur - 1);
4017 if (result->type != CPP_LESS)
4018 break;
4021 result->type = CPP_LESS;
4022 if (*buffer->cur == '=')
4024 buffer->cur++, result->type = CPP_LESS_EQ;
4025 if (*buffer->cur == '>'
4026 && CPP_OPTION (pfile, cplusplus)
4027 && CPP_OPTION (pfile, lang) >= CLK_GNUCXX20)
4028 buffer->cur++, result->type = CPP_SPACESHIP;
4030 else if (*buffer->cur == '<')
4032 buffer->cur++;
4033 IF_NEXT_IS ('=', CPP_LSHIFT_EQ, CPP_LSHIFT);
4035 else if (CPP_OPTION (pfile, digraphs))
4037 if (*buffer->cur == ':')
4039 /* C++11 [2.5/3 lex.pptoken], "Otherwise, if the next
4040 three characters are <:: and the subsequent character
4041 is neither : nor >, the < is treated as a preprocessor
4042 token by itself". */
4043 if (CPP_OPTION (pfile, cplusplus)
4044 && CPP_OPTION (pfile, lang) != CLK_CXX98
4045 && CPP_OPTION (pfile, lang) != CLK_GNUCXX
4046 && buffer->cur[1] == ':'
4047 && buffer->cur[2] != ':' && buffer->cur[2] != '>')
4048 break;
4050 buffer->cur++;
4051 result->flags |= DIGRAPH;
4052 result->type = CPP_OPEN_SQUARE;
4054 else if (*buffer->cur == '%')
4056 buffer->cur++;
4057 result->flags |= DIGRAPH;
4058 result->type = CPP_OPEN_BRACE;
4061 break;
4063 case '>':
4064 result->type = CPP_GREATER;
4065 if (*buffer->cur == '=')
4066 buffer->cur++, result->type = CPP_GREATER_EQ;
4067 else if (*buffer->cur == '>')
4069 buffer->cur++;
4070 IF_NEXT_IS ('=', CPP_RSHIFT_EQ, CPP_RSHIFT);
4072 break;
4074 case '%':
4075 result->type = CPP_MOD;
4076 if (*buffer->cur == '=')
4077 buffer->cur++, result->type = CPP_MOD_EQ;
4078 else if (CPP_OPTION (pfile, digraphs))
4080 if (*buffer->cur == ':')
4082 buffer->cur++;
4083 result->flags |= DIGRAPH;
4084 result->type = CPP_HASH;
4085 if (*buffer->cur == '%' && buffer->cur[1] == ':')
4086 buffer->cur += 2, result->type = CPP_PASTE, result->val.token_no = 0;
4088 else if (*buffer->cur == '>')
4090 buffer->cur++;
4091 result->flags |= DIGRAPH;
4092 result->type = CPP_CLOSE_BRACE;
4095 break;
4097 case '.':
4098 result->type = CPP_DOT;
4099 if (ISDIGIT (*buffer->cur))
4101 struct normalize_state nst = INITIAL_NORMALIZE_STATE;
4102 result->type = CPP_NUMBER;
4103 lex_number (pfile, &result->val.str, &nst);
4104 warn_about_normalization (pfile, result, &nst);
4106 else if (*buffer->cur == '.' && buffer->cur[1] == '.')
4107 buffer->cur += 2, result->type = CPP_ELLIPSIS;
4108 else if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
4109 buffer->cur++, result->type = CPP_DOT_STAR;
4110 break;
4112 case '+':
4113 result->type = CPP_PLUS;
4114 if (*buffer->cur == '+')
4115 buffer->cur++, result->type = CPP_PLUS_PLUS;
4116 else if (*buffer->cur == '=')
4117 buffer->cur++, result->type = CPP_PLUS_EQ;
4118 break;
4120 case '-':
4121 result->type = CPP_MINUS;
4122 if (*buffer->cur == '>')
4124 buffer->cur++;
4125 result->type = CPP_DEREF;
4126 if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
4127 buffer->cur++, result->type = CPP_DEREF_STAR;
4129 else if (*buffer->cur == '-')
4130 buffer->cur++, result->type = CPP_MINUS_MINUS;
4131 else if (*buffer->cur == '=')
4132 buffer->cur++, result->type = CPP_MINUS_EQ;
4133 break;
4135 case '&':
4136 result->type = CPP_AND;
4137 if (*buffer->cur == '&')
4138 buffer->cur++, result->type = CPP_AND_AND;
4139 else if (*buffer->cur == '=')
4140 buffer->cur++, result->type = CPP_AND_EQ;
4141 break;
4143 case '|':
4144 result->type = CPP_OR;
4145 if (*buffer->cur == '|')
4146 buffer->cur++, result->type = CPP_OR_OR;
4147 else if (*buffer->cur == '=')
4148 buffer->cur++, result->type = CPP_OR_EQ;
4149 break;
4151 case ':':
4152 result->type = CPP_COLON;
4153 if (*buffer->cur == ':' && CPP_OPTION (pfile, scope))
4154 buffer->cur++, result->type = CPP_SCOPE;
4155 else if (*buffer->cur == '>' && CPP_OPTION (pfile, digraphs))
4157 buffer->cur++;
4158 result->flags |= DIGRAPH;
4159 result->type = CPP_CLOSE_SQUARE;
4161 break;
4163 case '*': IF_NEXT_IS ('=', CPP_MULT_EQ, CPP_MULT); break;
4164 case '=': IF_NEXT_IS ('=', CPP_EQ_EQ, CPP_EQ); break;
4165 case '!': IF_NEXT_IS ('=', CPP_NOT_EQ, CPP_NOT); break;
4166 case '^': IF_NEXT_IS ('=', CPP_XOR_EQ, CPP_XOR); break;
4167 case '#': IF_NEXT_IS ('#', CPP_PASTE, CPP_HASH); result->val.token_no = 0; break;
4169 case '?': result->type = CPP_QUERY; break;
4170 case '~': result->type = CPP_COMPL; break;
4171 case ',': result->type = CPP_COMMA; break;
4172 case '(': result->type = CPP_OPEN_PAREN; break;
4173 case ')': result->type = CPP_CLOSE_PAREN; break;
4174 case '[': result->type = CPP_OPEN_SQUARE; break;
4175 case ']': result->type = CPP_CLOSE_SQUARE; break;
4176 case '{': result->type = CPP_OPEN_BRACE; break;
4177 case '}': result->type = CPP_CLOSE_BRACE; break;
4178 case ';': result->type = CPP_SEMICOLON; break;
4180 /* @ is a punctuator in Objective-C. */
4181 case '@': result->type = CPP_ATSIGN; break;
4183 default:
4185 const uchar *base = --buffer->cur;
4186 static int no_warn_cnt;
4188 /* Check for an extended identifier ($ or UCN or UTF-8). */
4189 struct normalize_state nst = INITIAL_NORMALIZE_STATE;
4190 if (forms_identifier_p (pfile, true, &nst))
4192 result->type = CPP_NAME;
4193 result->val.node.node = lex_identifier (pfile, base, true, &nst,
4194 &result->val.node.spelling);
4195 warn_about_normalization (pfile, result, &nst);
4196 break;
4199 /* Otherwise this will form a CPP_OTHER token. Parse valid UTF-8 as a
4200 single token. */
4201 buffer->cur++;
4202 if (c >= utf8_signifier)
4204 const uchar *pstr = base;
4205 cppchar_t s;
4206 if (_cpp_valid_utf8 (pfile, &pstr, buffer->rlimit, 0, NULL, &s))
4208 if (s > UCS_LIMIT && CPP_OPTION (pfile, cpp_warn_invalid_utf8))
4210 buffer->cur = base;
4211 _cpp_warn_invalid_utf8 (pfile);
4213 buffer->cur = pstr;
4215 else if (CPP_OPTION (pfile, cpp_warn_invalid_utf8))
4217 buffer->cur = base;
4218 const uchar *end = _cpp_warn_invalid_utf8 (pfile);
4219 buffer->cur = base + 1;
4220 no_warn_cnt = end - buffer->cur;
4223 else if (c >= utf8_continuation
4224 && CPP_OPTION (pfile, cpp_warn_invalid_utf8))
4226 if (no_warn_cnt)
4227 --no_warn_cnt;
4228 else
4230 buffer->cur = base;
4231 _cpp_warn_invalid_utf8 (pfile);
4232 buffer->cur = base + 1;
4235 create_literal (pfile, result, base, buffer->cur - base, CPP_OTHER);
4236 break;
4241 /* Potentially convert the location of the token to a range. */
4242 if (result->src_loc >= RESERVED_LOCATION_COUNT
4243 && result->type != CPP_EOF)
4245 /* Ensure that any line notes are processed, so that we have the
4246 correct physical line/column for the end-point of the token even
4247 when a logical line is split via one or more backslashes. */
4248 if (buffer->cur >= buffer->notes[buffer->cur_note].pos
4249 && !pfile->overlaid_buffer)
4250 _cpp_process_line_notes (pfile, false);
4252 source_range tok_range;
4253 tok_range.m_start = result->src_loc;
4254 tok_range.m_finish
4255 = linemap_position_for_column (pfile->line_table,
4256 CPP_BUF_COLUMN (buffer, buffer->cur));
4258 result->src_loc = COMBINE_LOCATION_DATA (pfile->line_table,
4259 result->src_loc,
4260 tok_range, NULL, 0);
4263 return result;
4266 /* An upper bound on the number of bytes needed to spell TOKEN.
4267 Does not include preceding whitespace. */
4268 unsigned int
4269 cpp_token_len (const cpp_token *token)
4271 unsigned int len;
4273 switch (TOKEN_SPELL (token))
4275 default: len = 6; break;
4276 case SPELL_LITERAL: len = token->val.str.len; break;
4277 case SPELL_IDENT: len = NODE_LEN (token->val.node.node) * 10; break;
4280 return len;
/* Parse UTF-8 out of NAMEP and place a \U escape in BUFFER.
   Return the number of bytes read out of NAME.  (There are always
   10 bytes written to BUFFER: a backslash, 'U' and eight lower-case
   hex digits; no terminating NUL is written.)  Aborts if a byte
   after the first is not a UTF-8 continuation byte.  */

static size_t
utf8_to_ucn (unsigned char *buffer, const unsigned char *name)
{
  int j;
  int ucn_len = 0;
  int ucn_len_c;
  unsigned t;
  unsigned long utf32;

  /* Compute the length of the UTF-8 sequence: the number of leading
     one bits in the first byte.  */
  for (t = *name; t & 0x80; t <<= 1)
    ucn_len++;

  /* The remaining low bits of the first byte are payload.  */
  utf32 = *name & (0x7F >> ucn_len);
  for (ucn_len_c = 1; ucn_len_c < ucn_len; ucn_len_c++)
    {
      utf32 = (utf32 << 6) | (*++name & 0x3F);

      /* Ill-formed UTF-8.  */
      if ((*name & ~0x3F) != 0x80)
        abort ();
    }

  *buffer++ = '\\';
  *buffer++ = 'U';
  for (j = 7; j >= 0; j--)
    *buffer++ = "0123456789abcdef"[(utf32 >> (4 * j)) & 0xF];
  return ucn_len;
}
4317 /* Given a token TYPE corresponding to a digraph, return a pointer to
4318 the spelling of the digraph. */
4319 static const unsigned char *
4320 cpp_digraph2name (enum cpp_ttype type)
4322 return digraph_spellings[(int) type - (int) CPP_FIRST_DIGRAPH];
4325 /* Write the spelling of an identifier IDENT, using UCNs, to BUFFER.
4326 The buffer must already contain the enough space to hold the
4327 token's spelling. Returns a pointer to the character after the
4328 last character written. */
4329 unsigned char *
4330 _cpp_spell_ident_ucns (unsigned char *buffer, cpp_hashnode *ident)
4332 size_t i;
4333 const unsigned char *name = NODE_NAME (ident);
4335 for (i = 0; i < NODE_LEN (ident); i++)
4336 if (name[i] & ~0x7F)
4338 i += utf8_to_ucn (buffer, name + i) - 1;
4339 buffer += 10;
4341 else
4342 *buffer++ = name[i];
4344 return buffer;
4347 /* Write the spelling of a token TOKEN to BUFFER. The buffer must
4348 already contain the enough space to hold the token's spelling.
4349 Returns a pointer to the character after the last character written.
4350 FORSTRING is true if this is to be the spelling after translation
4351 phase 1 (with the original spelling of extended identifiers), false
4352 if extended identifiers should always be written using UCNs (there is
4353 no option for always writing them in the internal UTF-8 form).
4354 FIXME: Would be nice if we didn't need the PFILE argument. */
4355 unsigned char *
4356 cpp_spell_token (cpp_reader *pfile, const cpp_token *token,
4357 unsigned char *buffer, bool forstring)
4359 switch (TOKEN_SPELL (token))
4361 case SPELL_OPERATOR:
4363 const unsigned char *spelling;
4364 unsigned char c;
4366 if (token->flags & DIGRAPH)
4367 spelling = cpp_digraph2name (token->type);
4368 else if (token->flags & NAMED_OP)
4369 goto spell_ident;
4370 else
4371 spelling = TOKEN_NAME (token);
4373 while ((c = *spelling++) != '\0')
4374 *buffer++ = c;
4376 break;
4378 spell_ident:
4379 case SPELL_IDENT:
4380 if (forstring)
4382 memcpy (buffer, NODE_NAME (token->val.node.spelling),
4383 NODE_LEN (token->val.node.spelling));
4384 buffer += NODE_LEN (token->val.node.spelling);
4386 else
4387 buffer = _cpp_spell_ident_ucns (buffer, token->val.node.node);
4388 break;
4390 case SPELL_LITERAL:
4391 memcpy (buffer, token->val.str.text, token->val.str.len);
4392 buffer += token->val.str.len;
4393 break;
4395 case SPELL_NONE:
4396 cpp_error (pfile, CPP_DL_ICE,
4397 "unspellable token %s", TOKEN_NAME (token));
4398 break;
4401 return buffer;
4404 /* Returns TOKEN spelt as a null-terminated string. The string is
4405 freed when the reader is destroyed. Useful for diagnostics. */
4406 unsigned char *
4407 cpp_token_as_text (cpp_reader *pfile, const cpp_token *token)
4409 unsigned int len = cpp_token_len (token) + 1;
4410 unsigned char *start = _cpp_unaligned_alloc (pfile, len), *end;
4412 end = cpp_spell_token (pfile, token, start, false);
4413 end[0] = '\0';
4415 return start;
4418 /* Returns a pointer to a string which spells the token defined by
4419 TYPE and FLAGS. Used by C front ends, which really should move to
4420 using cpp_token_as_text. */
4421 const char *
4422 cpp_type2name (enum cpp_ttype type, unsigned char flags)
4424 if (flags & DIGRAPH)
4425 return (const char *) cpp_digraph2name (type);
4426 else if (flags & NAMED_OP)
4427 return cpp_named_operator2name (type);
4429 return (const char *) token_spellings[type].name;
4432 /* Writes the spelling of token to FP, without any preceding space.
4433 Separated from cpp_spell_token for efficiency - to avoid stdio
4434 double-buffering. */
4435 void
4436 cpp_output_token (const cpp_token *token, FILE *fp)
4438 switch (TOKEN_SPELL (token))
4440 case SPELL_OPERATOR:
4442 const unsigned char *spelling;
4443 int c;
4445 if (token->flags & DIGRAPH)
4446 spelling = cpp_digraph2name (token->type);
4447 else if (token->flags & NAMED_OP)
4448 goto spell_ident;
4449 else
4450 spelling = TOKEN_NAME (token);
4452 c = *spelling;
4454 putc (c, fp);
4455 while ((c = *++spelling) != '\0');
4457 break;
4459 spell_ident:
4460 case SPELL_IDENT:
4462 size_t i;
4463 const unsigned char * name = NODE_NAME (token->val.node.node);
4465 for (i = 0; i < NODE_LEN (token->val.node.node); i++)
4466 if (name[i] & ~0x7F)
4468 unsigned char buffer[10];
4469 i += utf8_to_ucn (buffer, name + i) - 1;
4470 fwrite (buffer, 1, 10, fp);
4472 else
4473 fputc (NODE_NAME (token->val.node.node)[i], fp);
4475 break;
4477 case SPELL_LITERAL:
4478 if (token->type == CPP_HEADER_NAME)
4479 fputc ('"', fp);
4480 fwrite (token->val.str.text, 1, token->val.str.len, fp);
4481 if (token->type == CPP_HEADER_NAME)
4482 fputc ('"', fp);
4483 break;
4485 case SPELL_NONE:
4486 /* An error, most probably. */
4487 break;
4491 /* Compare two tokens. */
4493 _cpp_equiv_tokens (const cpp_token *a, const cpp_token *b)
4495 if (a->type == b->type && a->flags == b->flags)
4496 switch (TOKEN_SPELL (a))
4498 default: /* Keep compiler happy. */
4499 case SPELL_OPERATOR:
4500 /* token_no is used to track where multiple consecutive ##
4501 tokens were originally located. */
4502 return (a->type != CPP_PASTE || a->val.token_no == b->val.token_no);
4503 case SPELL_NONE:
4504 return (a->type != CPP_MACRO_ARG
4505 || (a->val.macro_arg.arg_no == b->val.macro_arg.arg_no
4506 && a->val.macro_arg.spelling == b->val.macro_arg.spelling));
4507 case SPELL_IDENT:
4508 return (a->val.node.node == b->val.node.node
4509 && a->val.node.spelling == b->val.node.spelling);
4510 case SPELL_LITERAL:
4511 return (a->val.str.len == b->val.str.len
4512 && !memcmp (a->val.str.text, b->val.str.text,
4513 a->val.str.len));
4516 return 0;
4519 /* Returns nonzero if a space should be inserted to avoid an
4520 accidental token paste for output. For simplicity, it is
4521 conservative, and occasionally advises a space where one is not
4522 needed, e.g. "." and ".2". */
4524 cpp_avoid_paste (cpp_reader *pfile, const cpp_token *token1,
4525 const cpp_token *token2)
4527 enum cpp_ttype a = token1->type, b = token2->type;
4528 cppchar_t c;
4530 if (token1->flags & NAMED_OP)
4531 a = CPP_NAME;
4532 if (token2->flags & NAMED_OP)
4533 b = CPP_NAME;
4535 c = EOF;
4536 if (token2->flags & DIGRAPH)
4537 c = digraph_spellings[(int) b - (int) CPP_FIRST_DIGRAPH][0];
4538 else if (token_spellings[b].category == SPELL_OPERATOR)
4539 c = token_spellings[b].name[0];
4541 /* Quickly get everything that can paste with an '='. */
4542 if ((int) a <= (int) CPP_LAST_EQ && c == '=')
4543 return 1;
4545 switch (a)
4547 case CPP_GREATER: return c == '>';
4548 case CPP_LESS: return c == '<' || c == '%' || c == ':';
4549 case CPP_PLUS: return c == '+';
4550 case CPP_MINUS: return c == '-' || c == '>';
4551 case CPP_DIV: return c == '/' || c == '*'; /* Comments. */
4552 case CPP_MOD: return c == ':' || c == '>';
4553 case CPP_AND: return c == '&';
4554 case CPP_OR: return c == '|';
4555 case CPP_COLON: return c == ':' || c == '>';
4556 case CPP_DEREF: return c == '*';
4557 case CPP_DOT: return c == '.' || c == '%' || b == CPP_NUMBER;
4558 case CPP_HASH: return c == '#' || c == '%'; /* Digraph form. */
4559 case CPP_PRAGMA:
4560 case CPP_NAME: return ((b == CPP_NUMBER
4561 && name_p (pfile, &token2->val.str))
4562 || b == CPP_NAME
4563 || b == CPP_CHAR || b == CPP_STRING); /* L */
4564 case CPP_NUMBER: return (b == CPP_NUMBER || b == CPP_NAME
4565 || b == CPP_CHAR
4566 || c == '.' || c == '+' || c == '-');
4567 /* UCNs */
4568 case CPP_OTHER: return ((token1->val.str.text[0] == '\\'
4569 && b == CPP_NAME)
4570 || (CPP_OPTION (pfile, objc)
4571 && token1->val.str.text[0] == '@'
4572 && (b == CPP_NAME || b == CPP_STRING)));
4573 case CPP_LESS_EQ: return c == '>';
4574 case CPP_STRING:
4575 case CPP_WSTRING:
4576 case CPP_UTF8STRING:
4577 case CPP_STRING16:
4578 case CPP_STRING32: return (CPP_OPTION (pfile, user_literals)
4579 && (b == CPP_NAME
4580 || (TOKEN_SPELL (token2) == SPELL_LITERAL
4581 && ISIDST (token2->val.str.text[0]))));
4583 default: break;
4586 return 0;
4589 /* Output all the remaining tokens on the current line, and a newline
4590 character, to FP. Leading whitespace is removed. If there are
4591 macros, special token padding is not performed. */
4592 void
4593 cpp_output_line (cpp_reader *pfile, FILE *fp)
4595 const cpp_token *token;
4597 token = cpp_get_token (pfile);
4598 while (token->type != CPP_EOF)
4600 cpp_output_token (token, fp);
4601 token = cpp_get_token (pfile);
4602 if (token->flags & PREV_WHITE)
4603 putc (' ', fp);
4606 putc ('\n', fp);
4609 /* Return a string representation of all the remaining tokens on the
4610 current line. The result is allocated using xmalloc and must be
4611 freed by the caller. */
4612 unsigned char *
4613 cpp_output_line_to_string (cpp_reader *pfile, const unsigned char *dir_name)
4615 const cpp_token *token;
4616 unsigned int out = dir_name ? ustrlen (dir_name) : 0;
4617 unsigned int alloced = 120 + out;
4618 unsigned char *result = (unsigned char *) xmalloc (alloced);
4620 /* If DIR_NAME is empty, there are no initial contents. */
4621 if (dir_name)
4623 sprintf ((char *) result, "#%s ", dir_name);
4624 out += 2;
4627 token = cpp_get_token (pfile);
4628 while (token->type != CPP_EOF)
4630 unsigned char *last;
4631 /* Include room for a possible space and the terminating nul. */
4632 unsigned int len = cpp_token_len (token) + 2;
4634 if (out + len > alloced)
4636 alloced *= 2;
4637 if (out + len > alloced)
4638 alloced = out + len;
4639 result = (unsigned char *) xrealloc (result, alloced);
4642 last = cpp_spell_token (pfile, token, &result[out], 0);
4643 out = last - result;
4645 token = cpp_get_token (pfile);
4646 if (token->flags & PREV_WHITE)
4647 result[out++] = ' ';
4650 result[out] = '\0';
4651 return result;
/* Memory buffers.  Changing these three constants can have a dramatic
   effect on performance.  The values here are reasonable defaults,
   but might be tuned.  If you adjust them, be sure to test across a
   range of uses of cpplib, including heavy nested function-like macro
   expansion.  Also check the change in peak memory usage (NJAMD is a
   good tool for this).

   MIN_BUFF_SIZE is the smallest buffer that is ever allocated.
   BUFF_SIZE_UPPER_BOUND (MIN_SIZE) is the largest free buffer that is
   reused for a request of MIN_SIZE bytes.
   EXTENDED_BUFF_SIZE (BUFF, MIN_EXTRA) is the size requested when
   extending BUFF: MIN_EXTRA plus twice the room left in BUFF.  Every
   macro argument is parenthesized so that an argument expression
   cannot be regrouped by operator precedence.  */
#define MIN_BUFF_SIZE 8000
#define BUFF_SIZE_UPPER_BOUND(MIN_SIZE) (MIN_BUFF_SIZE + (MIN_SIZE) * 3 / 2)
#define EXTENDED_BUFF_SIZE(BUFF, MIN_EXTRA) \
        ((MIN_EXTRA) + ((BUFF)->limit - (BUFF)->cur) * 2)

#if MIN_BUFF_SIZE > BUFF_SIZE_UPPER_BOUND (0)
#error BUFF_SIZE_UPPER_BOUND must be at least as large as MIN_BUFF_SIZE!
#endif
4669 /* Create a new allocation buffer. Place the control block at the end
4670 of the buffer, so that buffer overflows will cause immediate chaos. */
4671 static _cpp_buff *
4672 new_buff (size_t len)
4674 _cpp_buff *result;
4675 unsigned char *base;
4677 if (len < MIN_BUFF_SIZE)
4678 len = MIN_BUFF_SIZE;
4679 len = CPP_ALIGN (len);
4681 #ifdef ENABLE_VALGRIND_ANNOTATIONS
4682 /* Valgrind warns about uses of interior pointers, so put _cpp_buff
4683 struct first. */
4684 size_t slen = CPP_ALIGN2 (sizeof (_cpp_buff), 2 * DEFAULT_ALIGNMENT);
4685 base = XNEWVEC (unsigned char, len + slen);
4686 result = (_cpp_buff *) base;
4687 base += slen;
4688 #else
4689 base = XNEWVEC (unsigned char, len + sizeof (_cpp_buff));
4690 result = (_cpp_buff *) (base + len);
4691 #endif
4692 result->base = base;
4693 result->cur = base;
4694 result->limit = base + len;
4695 result->next = NULL;
4696 return result;
4699 /* Place a chain of unwanted allocation buffers on the free list. */
4700 void
4701 _cpp_release_buff (cpp_reader *pfile, _cpp_buff *buff)
4703 _cpp_buff *end = buff;
4705 while (end->next)
4706 end = end->next;
4707 end->next = pfile->free_buffs;
4708 pfile->free_buffs = buff;
4711 /* Return a free buffer of size at least MIN_SIZE. */
4712 _cpp_buff *
4713 _cpp_get_buff (cpp_reader *pfile, size_t min_size)
4715 _cpp_buff *result, **p;
4717 for (p = &pfile->free_buffs;; p = &(*p)->next)
4719 size_t size;
4721 if (*p == NULL)
4722 return new_buff (min_size);
4723 result = *p;
4724 size = result->limit - result->base;
4725 /* Return a buffer that's big enough, but don't waste one that's
4726 way too big. */
4727 if (size >= min_size && size <= BUFF_SIZE_UPPER_BOUND (min_size))
4728 break;
4731 *p = result->next;
4732 result->next = NULL;
4733 result->cur = result->base;
4734 return result;
4737 /* Creates a new buffer with enough space to hold the uncommitted
4738 remaining bytes of BUFF, and at least MIN_EXTRA more bytes. Copies
4739 the excess bytes to the new buffer. Chains the new buffer after
4740 BUFF, and returns the new buffer. */
4741 _cpp_buff *
4742 _cpp_append_extend_buff (cpp_reader *pfile, _cpp_buff *buff, size_t min_extra)
4744 size_t size = EXTENDED_BUFF_SIZE (buff, min_extra);
4745 _cpp_buff *new_buff = _cpp_get_buff (pfile, size);
4747 buff->next = new_buff;
4748 memcpy (new_buff->base, buff->cur, BUFF_ROOM (buff));
4749 return new_buff;
4752 /* Creates a new buffer with enough space to hold the uncommitted
4753 remaining bytes of the buffer pointed to by BUFF, and at least
4754 MIN_EXTRA more bytes. Copies the excess bytes to the new buffer.
4755 Chains the new buffer before the buffer pointed to by BUFF, and
4756 updates the pointer to point to the new buffer. */
4757 void
4758 _cpp_extend_buff (cpp_reader *pfile, _cpp_buff **pbuff, size_t min_extra)
4760 _cpp_buff *new_buff, *old_buff = *pbuff;
4761 size_t size = EXTENDED_BUFF_SIZE (old_buff, min_extra);
4763 new_buff = _cpp_get_buff (pfile, size);
4764 memcpy (new_buff->base, old_buff->cur, BUFF_ROOM (old_buff));
4765 new_buff->next = old_buff;
4766 *pbuff = new_buff;
4769 /* Free a chain of buffers starting at BUFF. */
4770 void
4771 _cpp_free_buff (_cpp_buff *buff)
4773 _cpp_buff *next;
4775 for (; buff; buff = next)
4777 next = buff->next;
4778 #ifdef ENABLE_VALGRIND_ANNOTATIONS
4779 free (buff);
4780 #else
4781 free (buff->base);
4782 #endif
4786 /* Allocate permanent, unaligned storage of length LEN. */
4787 unsigned char *
4788 _cpp_unaligned_alloc (cpp_reader *pfile, size_t len)
4790 _cpp_buff *buff = pfile->u_buff;
4791 unsigned char *result = buff->cur;
4793 if (len > (size_t) (buff->limit - result))
4795 buff = _cpp_get_buff (pfile, len);
4796 buff->next = pfile->u_buff;
4797 pfile->u_buff = buff;
4798 result = buff->cur;
4801 buff->cur = result + len;
4802 return result;
4805 /* Allocate permanent, unaligned storage of length LEN from a_buff.
4806 That buffer is used for growing allocations when saving macro
4807 replacement lists in a #define, and when parsing an answer to an
4808 assertion in #assert, #unassert or #if (and therefore possibly
4809 whilst expanding macros). It therefore must not be used by any
4810 code that they might call: specifically the lexer and the guts of
4811 the macro expander.
4813 All existing other uses clearly fit this restriction: storing
4814 registered pragmas during initialization. */
4815 unsigned char *
4816 _cpp_aligned_alloc (cpp_reader *pfile, size_t len)
4818 _cpp_buff *buff = pfile->a_buff;
4819 unsigned char *result = buff->cur;
4821 if (len > (size_t) (buff->limit - result))
4823 buff = _cpp_get_buff (pfile, len);
4824 buff->next = pfile->a_buff;
4825 pfile->a_buff = buff;
4826 result = buff->cur;
4829 buff->cur = result + len;
4830 return result;
4833 /* Commit or allocate storage from a buffer. */
4835 void *
4836 _cpp_commit_buff (cpp_reader *pfile, size_t size)
4838 void *ptr = BUFF_FRONT (pfile->a_buff);
4840 if (pfile->hash_table->alloc_subobject)
4842 void *copy = pfile->hash_table->alloc_subobject (size);
4843 memcpy (copy, ptr, size);
4844 ptr = copy;
4846 else
4847 BUFF_FRONT (pfile->a_buff) += size;
4849 return ptr;
4852 /* Say which field of TOK is in use. */
4854 enum cpp_token_fld_kind
4855 cpp_token_val_index (const cpp_token *tok)
4857 switch (TOKEN_SPELL (tok))
4859 case SPELL_IDENT:
4860 return CPP_TOKEN_FLD_NODE;
4861 case SPELL_LITERAL:
4862 return CPP_TOKEN_FLD_STR;
4863 case SPELL_OPERATOR:
4864 /* Operands which were originally spelled as ident keep around
4865 the node for the exact spelling. */
4866 if (tok->flags & NAMED_OP)
4867 return CPP_TOKEN_FLD_NODE;
4868 else if (tok->type == CPP_PASTE)
4869 return CPP_TOKEN_FLD_TOKEN_NO;
4870 else
4871 return CPP_TOKEN_FLD_NONE;
4872 case SPELL_NONE:
4873 if (tok->type == CPP_MACRO_ARG)
4874 return CPP_TOKEN_FLD_ARG_NO;
4875 else if (tok->type == CPP_PADDING)
4876 return CPP_TOKEN_FLD_SOURCE;
4877 else if (tok->type == CPP_PRAGMA)
4878 return CPP_TOKEN_FLD_PRAGMA;
4879 /* fall through */
4880 default:
4881 return CPP_TOKEN_FLD_NONE;
4885 /* All tokens lexed in R after calling this function will be forced to
4886 have their location_t to be P, until
4887 cpp_stop_forcing_token_locations is called for R. */
4889 void
4890 cpp_force_token_locations (cpp_reader *r, location_t loc)
4892 r->forced_token_location = loc;
4895 /* Go back to assigning locations naturally for lexed tokens. */
4897 void
4898 cpp_stop_forcing_token_locations (cpp_reader *r)
4900 r->forced_token_location = 0;
/* PEEK points at a backslash.  If it escapes an end of line ("\n",
   "\r" or "\r\n"), look past it, and keep going while the character
   after the line ending is itself a backslash.  Never advance to
   LIMIT or beyond.  Return the resulting position, which is PEEK
   itself when the backslash does not escape an end of line.  */
static const unsigned char *
do_peek_backslash (const unsigned char *peek, const unsigned char *limit)
{
  for (;;)
    {
      const unsigned char *after;

      if (__builtin_expect (peek[1] == '\n', true))
        after = peek + 2;
      else if (__builtin_expect (peek[1] == '\r', false))
        /* Step over the '\n' of a "\r\n" pair as well.  */
        after = peek + (peek[2] == '\n' ? 3 : 2);
      else
        return peek;

      if (!__builtin_expect (after < limit, true))
        return peek;

      peek = after;
      /* The user might be perverse and splice several lines.  */
      if (*peek != '\\')
        return peek;
    }
}
4933 static const unsigned char *
4934 do_peek_next (const unsigned char *peek, const unsigned char *limit)
4936 if (__builtin_expect (*peek == '\\', false))
4937 peek = do_peek_backslash (peek, limit);
4938 return peek;
/* Step backwards from PEEK to the previous character, skipping over
   any escaped end of line (a backslash followed by "\n", "\r" or
   "\r\n") that immediately precedes it.  BOUND is the lowest
   position that may be examined.  Return a pointer to the previous
   character, or NULL if PEEK is already at BOUND.  */
static const unsigned char *
do_peek_prev (const unsigned char *peek, const unsigned char *bound)
{
  if (peek == bound)
    return NULL;

  unsigned char c = *--peek;
  /* A line ending is either a newline or a carriage return.  The
     comparison must be against the control character '\r', not the
     letter 'r'.  */
  if (__builtin_expect (c == '\n', false)
      || __builtin_expect (c == '\r', false))
    {
      if (peek == bound)
        return peek;

      /* Offset of the character expected to be the backslash.  */
      int ix = -1;
      if (c == '\n' && peek[ix] == '\r')
        {
          /* "\r\n" pair: the backslash, if any, is one further back.  */
          if (peek + ix == bound)
            return peek;
          ix--;
        }

      if (peek[ix] == '\\')
        /* Escaped end of line: continue from before the backslash.  */
        return do_peek_prev (peek + ix, bound);

      return peek;
    }
  else
    return peek;
}
4970 /* If PEEK[-1] is identifier MATCH, scan past it and trailing white
4971 space. Otherwise return NULL. */
4973 static const unsigned char *
4974 do_peek_ident (const char *match, const unsigned char *peek,
4975 const unsigned char *limit)
4977 for (; *++match; peek++)
4978 if (*peek != *match)
4980 peek = do_peek_next (peek, limit);
4981 if (*peek != *match)
4982 return NULL;
4985 /* Must now not be looking at an identifier char. */
4986 peek = do_peek_next (peek, limit);
4987 if (ISIDNUM (*peek))
4988 return NULL;
4990 /* Skip control-line whitespace. */
4992 while (*peek == ' ' || *peek == '\t')
4993 peek++;
4994 if (__builtin_expect (*peek == '\\', false))
4996 peek = do_peek_backslash (peek, limit);
4997 if (*peek != '\\')
4998 goto ws;
5001 return peek;
5004 /* Are we looking at a module control line starting as PEEK - 1? */
5006 static bool
5007 do_peek_module (cpp_reader *pfile, unsigned char c,
5008 const unsigned char *peek, const unsigned char *limit)
5010 bool import = false;
5012 if (__builtin_expect (c == 'e', false))
5014 if (!((peek[0] == 'x' || peek[0] == '\\')
5015 && (peek = do_peek_ident ("export", peek, limit))))
5016 return false;
5018 /* export, peek for import or module. No need to peek __import
5019 here. */
5020 if (peek[0] == 'i')
5022 if (!((peek[1] == 'm' || peek[1] == '\\')
5023 && (peek = do_peek_ident ("import", peek + 1, limit))))
5024 return false;
5025 import = true;
5027 else if (peek[0] == 'm')
5029 if (!((peek[1] == 'o' || peek[1] == '\\')
5030 && (peek = do_peek_ident ("module", peek + 1, limit))))
5031 return false;
5033 else
5034 return false;
5036 else if (__builtin_expect (c == 'i', false))
5038 if (!((peek[0] == 'm' || peek[0] == '\\')
5039 && (peek = do_peek_ident ("import", peek, limit))))
5040 return false;
5041 import = true;
5043 else if (__builtin_expect (c == '_', false))
5045 /* Needed for translated includes. */
5046 if (!((peek[0] == '_' || peek[0] == '\\')
5047 && (peek = do_peek_ident ("__import", peek, limit))))
5048 return false;
5049 import = true;
5051 else if (__builtin_expect (c == 'm', false))
5053 if (!((peek[0] == 'o' || peek[0] == '\\')
5054 && (peek = do_peek_ident ("module", peek, limit))))
5055 return false;
5057 else
5058 return false;
5060 /* Peek the next character to see if it's good enough. We'll be at
5061 the first non-whitespace char, including skipping an escaped
5062 newline. */
5063 /* ... import followed by identifier, ':', '<' or header-name
5064 preprocessing tokens, or module followed by identifier, ':' or
5065 ';' preprocessing tokens. */
5066 unsigned char p = *peek++;
5068 /* A character literal is ... single quotes, ... optionally preceded
5069 by u8, u, U, or L */
5070 /* A string-literal is a ... double quotes, optionally prefixed by
5071 R, u8, u8R, u, uR, U, UR, L, or LR */
5072 if (p == 'u')
5074 peek = do_peek_next (peek, limit);
5075 if (*peek == '8')
5077 peek++;
5078 goto peek_u8;
5080 goto peek_u;
5082 else if (p == 'U' || p == 'L')
5084 peek_u8:
5085 peek = do_peek_next (peek, limit);
5086 peek_u:
5087 if (*peek == '\"' || *peek == '\'')
5088 return false;
5090 if (*peek == 'R')
5091 goto peek_R;
5092 /* Identifier. Ok. */
5094 else if (p == 'R')
5096 peek_R:
5097 if (CPP_OPTION (pfile, rliterals))
5099 peek = do_peek_next (peek, limit);
5100 if (*peek == '\"')
5101 return false;
5103 /* Identifier. Ok. */
5105 else if ('Z' - 'A' == 25
5106 ? ((p >= 'A' && p <= 'Z') || (p >= 'a' && p <= 'z') || p == '_')
5107 : ISIDST (p))
5109 /* Identifier. Ok. */
5111 else if (p == '<')
5113 /* Maybe angle header, ok for import. Reject
5114 '<=', '<<' digraph:'<:'. */
5115 if (!import)
5116 return false;
5117 peek = do_peek_next (peek, limit);
5118 if (*peek == '=' || *peek == '<'
5119 || (*peek == ':' && CPP_OPTION (pfile, digraphs)))
5120 return false;
5122 else if (p == ';')
5124 /* SEMICOLON, ok for module. */
5125 if (import)
5126 return false;
5128 else if (p == '"')
5130 /* STRING, ok for import. */
5131 if (!import)
5132 return false;
5134 else if (p == ':')
5136 /* Maybe COLON, ok. Reject '::', digraph:':>'. */
5137 peek = do_peek_next (peek, limit);
5138 if (*peek == ':' || (*peek == '>' && CPP_OPTION (pfile, digraphs)))
5139 return false;
5141 else
5142 /* FIXME: Detect a unicode character, excluding those not
5143 permitted as the initial character. [lex.name]/1. I presume
5144 we need to check the \[uU] spellings, and directly using
5145 Unicode in say UTF8 form? Or perhaps we do the phase-1
5146 conversion of UTF8 to universal-character-names? */
5147 return false;
5149 return true;
5152 /* Directives-only scanning. Somewhat more relaxed than correct
5153 parsing -- some ill-formed programs will not be rejected. */
5155 void
5156 cpp_directive_only_process (cpp_reader *pfile,
5157 void *data,
5158 void (*cb) (cpp_reader *, CPP_DO_task, void *, ...))
5160 bool module_p = CPP_OPTION (pfile, module_directives);
5164 restart:
5165 /* Buffer initialization, but no line cleaning. */
5166 cpp_buffer *buffer = pfile->buffer;
5167 buffer->cur_note = buffer->notes_used = 0;
5168 buffer->cur = buffer->line_base = buffer->next_line;
5169 buffer->need_line = false;
5170 /* Files always end in a newline or carriage return. We rely on this for
5171 character peeking safety. */
5172 gcc_assert (buffer->rlimit[0] == '\n' || buffer->rlimit[0] == '\r');
5174 const unsigned char *base = buffer->cur;
5175 unsigned line_count = 0;
5176 const unsigned char *line_start = base;
5178 bool bol = true;
5179 bool raw = false;
5181 const unsigned char *lwm = base;
5182 for (const unsigned char *pos = base, *limit = buffer->rlimit;
5183 pos < limit;)
5185 unsigned char c = *pos++;
5186 /* This matches the switch in _cpp_lex_direct. */
5187 switch (c)
5189 case ' ': case '\t': case '\f': case '\v':
5190 /* Whitespace, do nothing. */
5191 break;
5193 case '\r': /* MAC line ending, or Windows \r\n */
5194 if (*pos == '\n')
5195 pos++;
5196 /* FALLTHROUGH */
5198 case '\n':
5199 bol = true;
5201 next_line:
5202 CPP_INCREMENT_LINE (pfile, 0);
5203 line_count++;
5204 line_start = pos;
5205 break;
5207 case '\\':
5208 /* <backslash><newline> is removed, and doesn't undo any
5209 preceding escape or whatnot. */
5210 if (*pos == '\n')
5212 pos++;
5213 goto next_line;
5215 else if (*pos == '\r')
5217 if (pos[1] == '\n')
5218 pos++;
5219 pos++;
5220 goto next_line;
5222 goto dflt;
5224 case '#':
5225 if (bol)
5227 /* Line directive. */
5228 if (pos - 1 > base && !pfile->state.skipping)
5229 cb (pfile, CPP_DO_print, data,
5230 line_count, base, pos - 1 - base);
5232 /* Prep things for directive handling. */
5233 buffer->next_line = pos;
5234 buffer->need_line = true;
5235 bool ok = _cpp_get_fresh_line (pfile);
5236 gcc_checking_assert (ok);
5238 /* Ensure proper column numbering for generated
5239 error messages. */
5240 buffer->line_base -= pos - line_start;
5242 _cpp_handle_directive (pfile, line_start + 1 != pos);
5244 /* Sanitize the line settings. Duplicate #include's can
5245 mess things up. */
5246 // FIXME: Necessary?
5247 pfile->line_table->highest_location
5248 = pfile->line_table->highest_line;
5250 if (!pfile->state.skipping
5251 && pfile->buffer->next_line < pfile->buffer->rlimit)
5252 cb (pfile, CPP_DO_location, data,
5253 pfile->line_table->highest_line);
5255 goto restart;
5257 goto dflt;
5259 case '/':
5261 const unsigned char *peek = do_peek_next (pos, limit);
5262 if (!(*peek == '/' || *peek == '*'))
5263 goto dflt;
5265 /* Line or block comment */
5266 bool is_block = *peek == '*';
5267 bool star = false;
5268 bool esc = false;
5269 location_t sloc
5270 = linemap_position_for_column (pfile->line_table,
5271 pos - line_start);
5273 while (pos < limit)
5275 char c = *pos++;
5276 switch (c)
5278 case '\\':
5279 esc = true;
5280 break;
5282 case '\r':
5283 if (*pos == '\n')
5284 pos++;
5285 /* FALLTHROUGH */
5287 case '\n':
5289 CPP_INCREMENT_LINE (pfile, 0);
5290 line_count++;
5291 line_start = pos;
5292 if (!esc && !is_block)
5294 bol = true;
5295 goto done_comment;
5298 if (!esc)
5299 star = false;
5300 esc = false;
5301 break;
5303 case '*':
5304 if (pos > peek)
5305 star = is_block;
5306 esc = false;
5307 break;
5309 case '/':
5310 if (star)
5311 goto done_comment;
5312 /* FALLTHROUGH */
5314 default:
5315 star = false;
5316 esc = false;
5317 break;
5320 if (pos < limit || is_block)
5321 cpp_error_with_line (pfile, CPP_DL_ERROR, sloc, 0,
5322 "unterminated comment");
5323 done_comment:
5324 lwm = pos;
5325 break;
5328 case '\'':
5329 if (!CPP_OPTION (pfile, digit_separators))
5330 goto delimited_string;
5332 /* Possibly a number punctuator. */
5333 if (!ISIDNUM (*do_peek_next (pos, limit)))
5334 goto delimited_string;
5336 goto quote_peek;
5338 case '\"':
5339 if (!CPP_OPTION (pfile, rliterals))
5340 goto delimited_string;
5342 quote_peek:
5344 /* For ' see if it's a number punctuator
5345 \.?<digit>(<digit>|<identifier-nondigit>
5346 |'<digit>|'<nondigit>|[eEpP]<sign>|\.)* */
5347 /* For " see if it's a raw string
5348 {U,L,u,u8}R. This includes CPP_NUMBER detection,
5349 because that could be 0e+R. */
5350 const unsigned char *peek = pos - 1;
5351 bool quote_first = c == '"';
5352 bool quote_eight = false;
5353 bool maybe_number_start = false;
5354 bool want_number = false;
5356 while ((peek = do_peek_prev (peek, lwm)))
5358 unsigned char p = *peek;
5359 if (quote_first)
5361 if (!raw)
5363 if (p != 'R')
5364 break;
5365 raw = true;
5366 continue;
5369 quote_first = false;
5370 if (p == 'L' || p == 'U' || p == 'u')
5372 else if (p == '8')
5373 quote_eight = true;
5374 else
5375 goto second_raw;
5377 else if (quote_eight)
5379 if (p != 'u')
5381 raw = false;
5382 break;
5384 quote_eight = false;
5386 else if (c == '"')
5388 second_raw:;
5389 if (!want_number && ISIDNUM (p))
5391 raw = false;
5392 break;
5396 if (ISDIGIT (p))
5397 maybe_number_start = true;
5398 else if (p == '.')
5399 want_number = true;
5400 else if (ISIDNUM (p))
5401 maybe_number_start = false;
5402 else if (p == '+' || p == '-')
5404 if (const unsigned char *peek_prev
5405 = do_peek_prev (peek, lwm))
5407 p = *peek_prev;
5408 if (p == 'e' || p == 'E'
5409 || p == 'p' || p == 'P')
5411 want_number = true;
5412 maybe_number_start = false;
5414 else
5415 break;
5417 else
5418 break;
5420 else if (p == '\'' || p == '\"')
5422 /* If this is lwm, this must be the end of a
5423 previous string. So this is a trailing
5424 literal type, (a) if those are allowed,
5425 and (b) maybe_start is false. Otherwise
5426 this must be a CPP_NUMBER because we've
5427 met another ', and we'd have checked that
5428 in its own right. */
5429 if (peek == lwm && CPP_OPTION (pfile, uliterals))
5431 if (!maybe_number_start && !want_number)
5432 /* Must be a literal type. */
5433 raw = false;
5435 else if (p == '\''
5436 && CPP_OPTION (pfile, digit_separators))
5437 maybe_number_start = true;
5438 break;
5440 else if (c == '\'')
5441 break;
5442 else if (!quote_first && !quote_eight)
5443 break;
5446 if (maybe_number_start)
5448 if (c == '\'')
5449 /* A CPP NUMBER. */
5450 goto dflt;
5451 raw = false;
5454 goto delimited_string;
5457 delimited_string:
5459 /* (Possibly raw) string or char literal. */
5460 unsigned char end = c;
5461 int delim_len = -1;
5462 const unsigned char *delim = NULL;
5463 location_t sloc = linemap_position_for_column (pfile->line_table,
5464 pos - line_start);
5465 int esc = 0;
5467 if (raw)
5469 /* There can be no line breaks in the delimiter. */
5470 delim = pos;
5471 for (delim_len = 0; (c = *pos++) != '('; delim_len++)
5473 if (delim_len == 16)
5475 cpp_error_with_line (pfile, CPP_DL_ERROR,
5476 sloc, 0,
5477 "raw string delimiter"
5478 " longer than %d"
5479 " characters",
5480 delim_len);
5481 raw = false;
5482 pos = delim;
5483 break;
5485 if (strchr (") \\\t\v\f\n", c))
5487 cpp_error_with_line (pfile, CPP_DL_ERROR,
5488 sloc, 0,
5489 "invalid character '%c'"
5490 " in raw string"
5491 " delimiter", c);
5492 raw = false;
5493 pos = delim;
5494 break;
5496 if (pos >= limit)
5497 goto bad_string;
5501 while (pos < limit)
5503 char c = *pos++;
5504 switch (c)
5506 case '\\':
5507 if (!raw)
5508 esc++;
5509 break;
5511 case '\r':
5512 if (*pos == '\n')
5513 pos++;
5514 /* FALLTHROUGH */
5516 case '\n':
5518 CPP_INCREMENT_LINE (pfile, 0);
5519 line_count++;
5520 line_start = pos;
5522 if (esc)
5523 esc--;
5524 break;
5526 case ')':
5527 if (raw
5528 && pos + delim_len + 1 < limit
5529 && pos[delim_len] == end
5530 && !memcmp (delim, pos, delim_len))
5532 pos += delim_len + 1;
5533 raw = false;
5534 goto done_string;
5536 break;
5538 default:
5539 if (!raw && !(esc & 1) && c == end)
5540 goto done_string;
5541 esc = 0;
5542 break;
5545 bad_string:
5546 cpp_error_with_line (pfile, CPP_DL_ERROR, sloc, 0,
5547 "unterminated literal");
5549 done_string:
5550 raw = false;
5551 lwm = pos - 1;
5553 goto dflt;
5555 case '_':
5556 case 'e':
5557 case 'i':
5558 case 'm':
5559 if (bol && module_p && !pfile->state.skipping
5560 && do_peek_module (pfile, c, pos, limit))
5562 /* We've seen the start of a module control line.
5563 Start up the tokenizer. */
5564 pos--; /* Backup over the first character. */
5566 /* Backup over whitespace to start of line. */
5567 while (pos > line_start
5568 && (pos[-1] == ' ' || pos[-1] == '\t'))
5569 pos--;
5571 if (pos > base)
5572 cb (pfile, CPP_DO_print, data, line_count, base, pos - base);
5574 /* Prep things for directive handling. */
5575 buffer->next_line = pos;
5576 buffer->need_line = true;
5578 /* Now get tokens until the PRAGMA_EOL. */
5581 location_t spelling;
5582 const cpp_token *tok
5583 = cpp_get_token_with_location (pfile, &spelling);
5585 gcc_assert (pfile->state.in_deferred_pragma
5586 || tok->type == CPP_PRAGMA_EOL);
5587 cb (pfile, CPP_DO_token, data, tok, spelling);
5589 while (pfile->state.in_deferred_pragma);
5591 if (pfile->buffer->next_line < pfile->buffer->rlimit)
5592 cb (pfile, CPP_DO_location, data,
5593 pfile->line_table->highest_line);
5595 pfile->mi_valid = false;
5596 goto restart;
5598 goto dflt;
5600 default:
5601 dflt:
5602 bol = false;
5603 pfile->mi_valid = false;
5604 break;
5608 if (buffer->rlimit > base && !pfile->state.skipping)
5610 const unsigned char *limit = buffer->rlimit;
5611 /* If the file was not newline terminated, add rlimit, which is
5612 guaranteed to point to a newline, to the end of our range. */
5613 if (limit[-1] != '\n')
5615 limit++;
5616 CPP_INCREMENT_LINE (pfile, 0);
5617 line_count++;
5619 cb (pfile, CPP_DO_print, data, line_count, base, limit - base);
5622 _cpp_pop_buffer (pfile);
5624 while (pfile->buffer);