Add 'libgomp.oacc-fortran/declare-allocatable-1.f90'
[official-gcc.git] / libcpp / lex.cc
blobcc12a52d28295f847faf93f4d5a6c67ef7213af7
1 /* CPP Library - lexical analysis.
2 Copyright (C) 2000-2022 Free Software Foundation, Inc.
3 Contributed by Per Bothner, 1994-95.
4 Based on CCCP program by Paul Rubin, June 1986
5 Adapted to ANSI C, Richard Stallman, Jan 1987
6 Broken out to separate file, Zack Weinberg, Mar 2000
8 This program is free software; you can redistribute it and/or modify it
9 under the terms of the GNU General Public License as published by the
10 Free Software Foundation; either version 3, or (at your option) any
11 later version.
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
18 You should have received a copy of the GNU General Public License
19 along with this program; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "cpplib.h"
25 #include "internal.h"
/* Spelling categories; each entry of token_spellings carries one. */
27 enum spell_type
29 SPELL_OPERATOR = 0,
30 SPELL_IDENT,
31 SPELL_LITERAL,
32 SPELL_NONE
/* Pairs a spelling category with a name string. */
35 struct token_spelling
37 enum spell_type category;
38 const unsigned char *name;
/* Spellings of the digraphs. */
41 static const unsigned char *const digraph_spellings[] =
42 { UC"%:", UC"%:%:", UC"<:", UC":>", UC"<%", UC"%>" };
/* Expand TTYPE_TABLE into the token_spellings table: an OP entry is a
   SPELL_OPERATOR named by its string S; a TK entry has category SPELL_<S>
   and is named by the stringified enumerator E. */
44 #define OP(e, s) { SPELL_OPERATOR, UC s },
45 #define TK(e, s) { SPELL_ ## s, UC #e },
46 static const struct token_spelling token_spellings[N_TTYPES] = { TTYPE_TABLE };
47 #undef OP
48 #undef TK
/* Look up the spelling category and the name recorded for TOKEN's type. */
50 #define TOKEN_SPELL(token) (token_spellings[(token)->type].category)
51 #define TOKEN_NAME(token) (token_spellings[(token)->type].name)
53 /* ISO 10646 defines the UCS codespace as the range 0-0x10FFFF inclusive. */
54 #define UCS_LIMIT 0x10FFFF
/* Forward declarations of static (file-local) helper functions. */
56 static void add_line_note (cpp_buffer *, const uchar *, unsigned int);
57 static int skip_line_comment (cpp_reader *);
58 static void skip_whitespace (cpp_reader *, cppchar_t);
59 static void lex_string (cpp_reader *, cpp_token *, const uchar *);
60 static void save_comment (cpp_reader *, cpp_token *, const uchar *, cppchar_t);
61 static void store_comment (cpp_reader *, cpp_token *);
62 static void create_literal (cpp_reader *, cpp_token *, const uchar *,
63 unsigned int, enum cpp_ttype);
64 static bool warn_in_comment (cpp_reader *, _cpp_line_note *);
65 static int name_p (cpp_reader *, const cpp_string *);
66 static tokenrun *next_tokenrun (tokenrun *);
68 static _cpp_buff *new_buff (size_t);
71 /* Utility routine:
73 Compares, the token TOKEN to the NUL-terminated string STRING.
74 TOKEN must be a CPP_NAME. Returns 1 for equal, 0 for unequal. */
75 int
76 cpp_ideq (const cpp_token *token, const char *string)
78 if (token->type != CPP_NAME)
79 return 0;
81 return !ustrcmp (NODE_NAME (token->val.node.node), (const uchar *) string);
84 /* Record a note TYPE at byte POS into the current cleaned logical
85 line. */
86 static void
87 add_line_note (cpp_buffer *buffer, const uchar *pos, unsigned int type)
89 if (buffer->notes_used == buffer->notes_cap)
91 buffer->notes_cap = buffer->notes_cap * 2 + 200;
92 buffer->notes = XRESIZEVEC (_cpp_line_note, buffer->notes,
93 buffer->notes_cap);
96 buffer->notes[buffer->notes_used].pos = pos;
97 buffer->notes[buffer->notes_used].type = type;
98 buffer->notes_used++;
102 /* Fast path to find line special characters using optimized character
103 scanning algorithms. Anything complicated falls back to the slow
104 path below. Since this loop is very hot it's worth doing these kinds
105 of optimizations.
107 One of the paths through the ifdefs should provide
109 const uchar *search_line_fast (const uchar *s, const uchar *end);
111 Between S and END, search for \n, \r, \\, ?. Return a pointer to
112 the found character.
114 Note that the last character of the buffer is *always* a newline,
115 as forced by _cpp_convert_input. This fact can be used to avoid
116 explicitly looking for the end of the buffer. */
118 /* Configure gives us an ifdef test. */
119 #ifndef WORDS_BIGENDIAN
120 #define WORDS_BIGENDIAN 0
121 #endif
123 /* We'd like the largest integer that fits into a register. There's nothing
124 in <stdint.h> that gives us that. For most hosts this is unsigned long,
125 but MS decided on an LLP64 model. Thankfully when building with GCC we
126 can get the "real" word size. */
127 #ifdef __GNUC__
128 typedef unsigned int word_type __attribute__((__mode__(__word__)));
129 #else
130 typedef unsigned long word_type;
131 #endif
133 /* The code below is only expecting sizes 4 or 8.
134 Die at compile-time if this expectation is violated. */
135 typedef char check_word_type_size
136 [(sizeof(word_type) == 8 || sizeof(word_type) == 4) * 2 - 1];
138 /* Return X with the first N bytes forced to values that won't match one
139 of the interesting characters. Note that NUL is not interesting. */
141 static inline word_type
142 acc_char_mask_misalign (word_type val, unsigned int n)
144 word_type mask = -1;
145 if (WORDS_BIGENDIAN)
146 mask >>= n * 8;
147 else
148 mask <<= n * 8;
149 return val & mask;
152 /* Return X replicated to all byte positions within WORD_TYPE. */
154 static inline word_type
155 acc_char_replicate (uchar x)
157 word_type ret;
159 ret = (x << 24) | (x << 16) | (x << 8) | x;
160 if (sizeof(word_type) == 8)
161 ret = (ret << 16 << 16) | ret;
162 return ret;
165 /* Return non-zero if some byte of VAL is (probably) C. */
167 static inline word_type
168 acc_char_cmp (word_type val, word_type c)
170 #if defined(__GNUC__) && defined(__alpha__)
171 /* We can get exact results using a compare-bytes instruction.
172 Get (val == c) via (0 >= (val ^ c)). */
173 return __builtin_alpha_cmpbge (0, val ^ c);
174 #else
175 word_type magic = 0x7efefefeU;
176 if (sizeof(word_type) == 8)
177 magic = (magic << 16 << 16) | 0xfefefefeU;
178 magic |= 1;
180 val ^= c;
181 return ((val + magic) ^ ~val) & ~magic;
182 #endif
185 /* Given the result of acc_char_cmp is non-zero, return the index of
186 the found character. If this was a false positive, return -1. */
188 static inline int
189 acc_char_index (word_type cmp ATTRIBUTE_UNUSED,
190 word_type val ATTRIBUTE_UNUSED)
192 #if defined(__GNUC__) && defined(__alpha__) && !WORDS_BIGENDIAN
193 /* The cmpbge instruction sets *bits* of the result corresponding to
194 matches in the bytes with no false positives. */
195 return __builtin_ctzl (cmp);
196 #else
197 unsigned int i;
199 /* ??? It would be nice to force unrolling here,
200 and have all of these constants folded. */
201 for (i = 0; i < sizeof(word_type); ++i)
203 uchar c;
204 if (WORDS_BIGENDIAN)
205 c = (val >> (sizeof(word_type) - i - 1) * 8) & 0xff;
206 else
207 c = (val >> i * 8) & 0xff;
209 if (c == '\n' || c == '\r' || c == '\\' || c == '?')
210 return i;
213 return -1;
214 #endif
217 /* A version of the fast scanner using bit fiddling techniques.
219 For 32-bit words, one would normally perform 16 comparisons and
220 16 branches. With this algorithm one performs 24 arithmetic
221 operations and one branch. Whether this is faster with a 32-bit
222 word size is going to be somewhat system dependent.
224 For 64-bit words, we eliminate twice the number of comparisons
225 and branches without increasing the number of arithmetic operations.
226 It's almost certainly going to be a win with 64-bit word size. */
228 static const uchar * search_line_acc_char (const uchar *, const uchar *)
229 ATTRIBUTE_UNUSED;
231 static const uchar *
232 search_line_acc_char (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
234 const word_type repl_nl = acc_char_replicate ('\n');
235 const word_type repl_cr = acc_char_replicate ('\r');
236 const word_type repl_bs = acc_char_replicate ('\\');
237 const word_type repl_qm = acc_char_replicate ('?');
239 unsigned int misalign;
240 const word_type *p;
241 word_type val, t;
243 /* Align the buffer. Mask out any bytes from before the beginning. */
244 p = (word_type *)((uintptr_t)s & -sizeof(word_type));
245 val = *p;
246 misalign = (uintptr_t)s & (sizeof(word_type) - 1);
247 if (misalign)
248 val = acc_char_mask_misalign (val, misalign);
250 /* Main loop. */
251 while (1)
253 t = acc_char_cmp (val, repl_nl);
254 t |= acc_char_cmp (val, repl_cr);
255 t |= acc_char_cmp (val, repl_bs);
256 t |= acc_char_cmp (val, repl_qm);
258 if (__builtin_expect (t != 0, 0))
260 int i = acc_char_index (t, val);
261 if (i >= 0)
262 return (const uchar *)p + i;
265 val = *++p;
269 /* Disable on Solaris 2/x86 until the following problem can be properly
270 autoconfed:
272 The Solaris 10+ assembler tags objects with the instruction set
273 extensions used, so SSE4.2 executables cannot run on machines that
274 don't support that extension. */
276 #if (GCC_VERSION >= 4005) && (__GNUC__ >= 5 || !defined(__PIC__)) && (defined(__i386__) || defined(__x86_64__)) && !(defined(__sun__) && defined(__svr4__))
278 /* Replicated character data to be shared between implementations.
   Recall that outside of a context with vector support we can't
   define compatible vector types, therefore these are all defined
   in terms of raw characters.

   Row 0 holds '\n', row 1 '\r', row 2 '\\' and row 3 '?', each
   repeated 16 times. */
282 static const char repl_chars[4][16] __attribute__((aligned(16))) = {
283 { '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
284 '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n' },
285 { '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
286 '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r' },
287 { '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
288 '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\' },
289 { '?', '?', '?', '?', '?', '?', '?', '?',
290 '?', '?', '?', '?', '?', '?', '?', '?' },
293 /* A version of the fast scanner using MMX vectorized byte compare insns.

   This uses the PMOVMSKB instruction which was introduced with "MMX2",
   which was packaged into SSE1; it is also present in the AMD MMX
   extension. Mark the function as using "sse" so that we emit a real
   "emms" instruction, rather than the 3dNOW "femms" instruction.

   Returns a pointer to the first '\n', '\r', '\\' or '?' at or after S.
   END is unused. */
300 static const uchar *
301 #ifndef __SSE__
302 __attribute__((__target__("sse")))
303 #endif
304 search_line_mmx (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
306 typedef char v8qi __attribute__ ((__vector_size__ (8)));
307 typedef int __m64 __attribute__ ((__vector_size__ (8), __may_alias__));
/* Take the first 8 bytes of each 16-byte row of repl_chars. */
309 const v8qi repl_nl = *(const v8qi *)repl_chars[0];
310 const v8qi repl_cr = *(const v8qi *)repl_chars[1];
311 const v8qi repl_bs = *(const v8qi *)repl_chars[2];
312 const v8qi repl_qm = *(const v8qi *)repl_chars[3];
314 unsigned int misalign, found, mask;
315 const v8qi *p;
316 v8qi data, t, c;
318 /* Align the source pointer. While MMX doesn't generate unaligned data
   faults, this allows us to safely scan to the end of the buffer without
   reading beyond the end of the last page. */
321 misalign = (uintptr_t)s & 7;
322 p = (const v8qi *)((uintptr_t)s & -8);
323 data = *p;
325 /* Create a mask for the bytes that are valid within the first
   8-byte block (bits below MISALIGN, i.e. bytes before S, are
   cleared). The Idea here is that the AND with the mask
   within the loop is "free", since we need some AND or TEST
   insn in order to set the flags for the branch anyway. */
329 mask = -1u << misalign;
331 /* Main loop processing 8 bytes at a time. The first iteration is
   entered at START with the already-loaded, masked first block. */
332 goto start;
335 data = *++p;
336 mask = -1;
338 start:
339 t = __builtin_ia32_pcmpeqb(data, repl_nl);
340 c = __builtin_ia32_pcmpeqb(data, repl_cr);
341 t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
342 c = __builtin_ia32_pcmpeqb(data, repl_bs);
343 t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
344 c = __builtin_ia32_pcmpeqb(data, repl_qm);
345 t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
346 found = __builtin_ia32_pmovmskb (t);
347 found &= mask;
349 while (!found);
351 __builtin_ia32_emms ();
353 /* FOUND contains 1 in bits for which we matched a relevant
   character. Conversion to the byte index is trivial. */
355 found = __builtin_ctz(found);
356 return (const uchar *)p + found;
359 /* A version of the fast scanner using SSE2 vectorized byte compare insns.

   Returns a pointer to the first '\n', '\r', '\\' or '?' at or after S.
   END is unused. */
361 static const uchar *
362 #ifndef __SSE2__
363 __attribute__((__target__("sse2")))
364 #endif
365 search_line_sse2 (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
367 typedef char v16qi __attribute__ ((__vector_size__ (16)));
369 const v16qi repl_nl = *(const v16qi *)repl_chars[0];
370 const v16qi repl_cr = *(const v16qi *)repl_chars[1];
371 const v16qi repl_bs = *(const v16qi *)repl_chars[2];
372 const v16qi repl_qm = *(const v16qi *)repl_chars[3];
374 unsigned int misalign, found, mask;
375 const v16qi *p;
376 v16qi data, t;
378 /* Align the source pointer. */
379 misalign = (uintptr_t)s & 15;
380 p = (const v16qi *)((uintptr_t)s & -16);
381 data = *p;
383 /* Create a mask for the bytes that are valid within the first
   16-byte block (bits below MISALIGN, i.e. bytes before S, are
   cleared). The Idea here is that the AND with the mask
   within the loop is "free", since we need some AND or TEST
   insn in order to set the flags for the branch anyway. */
387 mask = -1u << misalign;
389 /* Main loop processing 16 bytes at a time. The first iteration is
   entered at START with the already-loaded, masked first block. */
390 goto start;
393 data = *++p;
394 mask = -1;
396 start:
397 t = data == repl_nl;
398 t |= data == repl_cr;
399 t |= data == repl_bs;
400 t |= data == repl_qm;
401 found = __builtin_ia32_pmovmskb128 (t);
402 found &= mask;
404 while (!found);
406 /* FOUND contains 1 in bits for which we matched a relevant
   character. Conversion to the byte index is trivial. */
408 found = __builtin_ctz(found);
409 return (const uchar *)p + found;
412 #ifdef HAVE_SSE4
413 /* A version of the fast scanner using SSE 4.2 vectorized string insns.

   Returns a pointer to the first '\n', '\r', '?' or '\\' at or after S.
   END is only used to decide whether an unaligned 16-byte read at S is
   safe. */
415 static const uchar *
416 #ifndef __SSE4_2__
417 __attribute__((__target__("sse4.2")))
418 #endif
419 search_line_sse42 (const uchar *s, const uchar *end)
421 typedef char v16qi __attribute__ ((__vector_size__ (16)));
/* The four characters to search for; PCMPESTRI is given a length of 4. */
422 static const v16qi search = { '\n', '\r', '?', '\\' };
424 uintptr_t si = (uintptr_t)s;
425 uintptr_t index;
427 /* Check for unaligned input. */
428 if (si & 15)
430 v16qi sv;
432 if (__builtin_expect (end - s < 16, 0)
433 && __builtin_expect ((si & 0xfff) > 0xff0, 0))
435 /* There are less than 16 bytes left in the buffer, and less
   than 16 bytes left on the page. Reading 16 bytes at this
   point might generate a spurious page fault. Defer to the
   SSE2 implementation, which already handles alignment. */
439 return search_line_sse2 (s, end);
442 /* ??? The builtin doesn't understand that the PCMPESTRI read from
   memory need not be aligned. */
444 sv = __builtin_ia32_loaddqu ((const char *) s);
445 index = __builtin_ia32_pcmpestri128 (search, 4, sv, 16, 0);
447 if (__builtin_expect (index < 16, 0))
448 goto found;
450 /* Advance the pointer to an aligned address. We will re-scan a
   few bytes, but we no longer need care for reading past the
   end of a page, since we're guaranteed a match. */
453 s = (const uchar *)((si + 15) & -16);
456 /* Main loop, processing 16 bytes at a time. */
457 #ifdef __GCC_ASM_FLAG_OUTPUTS__
458 while (1)
460 char f;
462 /* By using inline assembly instead of the builtin,
   we can use the result, as well as the flags set. */
464 __asm ("%vpcmpestri\t$0, %2, %3"
465 : "=c"(index), "=@ccc"(f)
466 : "m"(*s), "x"(search), "a"(4), "d"(16));
/* F is the carry flag output of the instruction; leave the loop when
   it is set. */
467 if (f)
468 break;
470 s += 16;
472 #else
/* The asm loop adds 16 to S before each compare, so pre-decrement. */
473 s -= 16;
474 /* By doing the whole loop in inline assembly,
   we can make proper use of the flags set. */
476 __asm ( ".balign 16\n"
477 "0: add $16, %1\n"
478 " %vpcmpestri\t$0, (%1), %2\n"
479 " jnc 0b"
480 : "=&c"(index), "+r"(s)
481 : "x"(search), "a"(4), "d"(16));
482 #endif
484 found:
485 return s + index;
488 #else
489 /* Work around out-dated assemblers without sse4 support. */
490 #define search_line_sse42 search_line_sse2
491 #endif
493 /* Check the CPU capabilities. */
495 #include "../gcc/config/i386/cpuid.h"
/* Signature shared by all line scanners, and the pointer through which
   the selected one is called. It is set by init_vectorized_lexer. */
497 typedef const uchar * (*search_line_fast_type) (const uchar *, const uchar *);
498 static search_line_fast_type search_line_fast;
/* Choose the implementation behind search_line_fast. The default is
   search_line_acc_char; a vectorized scanner replaces it when the
   compile-time target or the CPUID feature bits allow it. */
500 #define HAVE_init_vectorized_lexer 1
501 static inline void
502 init_vectorized_lexer (void)
504 unsigned dummy, ecx = 0, edx = 0;
505 search_line_fast_type impl = search_line_acc_char;
/* MINIMUM is the instruction-set level guaranteed at compile time:
   3 for SSE4.2, 2 for SSE2, 1 for SSE, 0 for none of these. */
506 int minimum = 0;
508 #if defined(__SSE4_2__)
509 minimum = 3;
510 #elif defined(__SSE2__)
511 minimum = 2;
512 #elif defined(__SSE__)
513 minimum = 1;
514 #endif
516 if (minimum == 3)
517 impl = search_line_sse42;
/* Otherwise consult CPUID leaf 1. This branch is also taken when
   MINIMUM is 2 even if the CPUID query fails. */
518 else if (__get_cpuid (1, &dummy, &dummy, &ecx, &edx) || minimum == 2)
520 if (minimum == 3 || (ecx & bit_SSE4_2))
521 impl = search_line_sse42;
522 else if (minimum == 2 || (edx & bit_SSE2))
523 impl = search_line_sse2;
524 else if (minimum == 1 || (edx & bit_SSE))
525 impl = search_line_mmx;
/* Leaf 1 was unavailable: try leaf 0x80000001, where both the MMXEXT
   and CMOV bits must be set to select the MMX scanner. */
527 else if (__get_cpuid (0x80000001, &dummy, &dummy, &dummy, &edx))
529 if (minimum == 1
530 || (edx & (bit_MMXEXT | bit_CMOV)) == (bit_MMXEXT | bit_CMOV))
531 impl = search_line_mmx;
534 search_line_fast = impl;
537 #elif (GCC_VERSION >= 4005) && defined(_ARCH_PWR8) && defined(__ALTIVEC__)
539 /* A version of the fast scanner using AltiVec vectorized byte compares
   and VSX unaligned loads (when VSX is available). This is otherwise
   the same as the AltiVec version.

   Returns a pointer to the first '\n', '\r', '\\' or '?' at or after S.
   END is unused. */
543 ATTRIBUTE_NO_SANITIZE_UNDEFINED
544 static const uchar *
545 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
547 typedef __attribute__((altivec(vector))) unsigned char vc;
549 const vc repl_nl = {
550 '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
551 '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
553 const vc repl_cr = {
554 '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
555 '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
557 const vc repl_bs = {
558 '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
559 '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
561 const vc repl_qm = {
562 '?', '?', '?', '?', '?', '?', '?', '?',
563 '?', '?', '?', '?', '?', '?', '?', '?',
565 const vc zero = { 0 };
567 vc data, t;
569 /* Main loop processing 16 bytes at a time. The load is taken
   directly from S, with no alignment step. */
572 vc m_nl, m_cr, m_bs, m_qm;
574 data = __builtin_vec_vsx_ld (0, s);
575 s += 16;
577 m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
578 m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
579 m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
580 m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
581 t = (m_nl | m_cr) | (m_bs | m_qm);
583 /* T now contains 0xff in bytes for which we matched one of the relevant
   characters. We want to exit the loop if any byte in T is non-zero.
   Below is the expansion of vec_any_ne(t, zero). */
587 while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));
589 /* Restore s to point to the 16 bytes we just processed. */
590 s -= 16;
593 #define N (sizeof(vc) / sizeof(long))
595 union {
596 vc v;
597 /* Statically assert that N is 2 or 4. */
598 unsigned long l[(N == 2 || N == 4) ? N : -1];
599 } u;
600 unsigned long l, i = 0;
602 u.v = t;
604 /* Find the first word of T that is non-zero, advancing S past each
   all-zero word. */
605 switch (N)
607 case 4:
608 l = u.l[i++];
609 if (l != 0)
610 break;
611 s += sizeof(unsigned long);
612 l = u.l[i++];
613 if (l != 0)
614 break;
615 s += sizeof(unsigned long);
616 /* FALLTHRU */
617 case 2:
618 l = u.l[i++];
619 if (l != 0)
620 break;
621 s += sizeof(unsigned long);
622 l = u.l[i];
625 /* L now contains 0xff in bytes for which we matched one of the
   relevant characters. We can find the byte index by finding
   its bit index and dividing by 8. The first byte in memory is the
   most significant on big-endian and the least significant otherwise,
   hence clz versus ctz. */
628 #ifdef __BIG_ENDIAN__
629 l = __builtin_clzl(l) >> 3;
630 #else
631 l = __builtin_ctzl(l) >> 3;
632 #endif
633 return s + l;
635 #undef N
639 #elif (GCC_VERSION >= 4005) && defined(__ALTIVEC__) && defined (__BIG_ENDIAN__)
641 /* A version of the fast scanner using AltiVec vectorized byte compares.
   This cannot be used for little endian because vec_lvsl/lvsr are
   deprecated for little endian and the code won't work properly.

   Returns a pointer to the first '\n', '\r', '\\' or '?' at or after S.
   END is unused. */
644 /* ??? Unfortunately, attribute(target("altivec")) is not yet supported,
   so we can't compile this function without -maltivec on the command line
   (or implied by some other switch). */
648 static const uchar *
649 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
651 typedef __attribute__((altivec(vector))) unsigned char vc;
653 const vc repl_nl = {
654 '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
655 '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
657 const vc repl_cr = {
658 '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
659 '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
661 const vc repl_bs = {
662 '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
663 '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
665 const vc repl_qm = {
666 '?', '?', '?', '?', '?', '?', '?', '?',
667 '?', '?', '?', '?', '?', '?', '?', '?',
669 const vc ones = {
670 -1, -1, -1, -1, -1, -1, -1, -1,
671 -1, -1, -1, -1, -1, -1, -1, -1,
673 const vc zero = { 0 };
675 vc data, mask, t;
677 /* Altivec loads automatically mask addresses with -16. This lets us
   issue the first load as early as possible. */
679 data = __builtin_vec_ld(0, (const vc *)s);
681 /* Discard bytes before the beginning of the buffer. Do this by
   beginning with all ones and shifting in zeros according to the
   mis-alignment. The LVSR instruction pulls the exact shift we
   want from the address. */
685 mask = __builtin_vec_lvsr(0, s);
686 mask = __builtin_vec_perm(zero, ones, mask);
687 data &= mask;
689 /* While altivec loads mask addresses, we still need to align S so
   that the offset we compute at the end is correct. */
691 s = (const uchar *)((uintptr_t)s & -16);
693 /* Main loop processing 16 bytes at a time. The first iteration is
   entered at START with the already-loaded, masked first block. */
694 goto start;
697 vc m_nl, m_cr, m_bs, m_qm;
699 s += 16;
700 data = __builtin_vec_ld(0, (const vc *)s);
702 start:
703 m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
704 m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
705 m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
706 m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
707 t = (m_nl | m_cr) | (m_bs | m_qm);
709 /* T now contains 0xff in bytes for which we matched one of the relevant
   characters. We want to exit the loop if any byte in T is non-zero.
   Below is the expansion of vec_any_ne(t, zero). */
713 while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));
716 #define N (sizeof(vc) / sizeof(long))
718 union {
719 vc v;
720 /* Statically assert that N is 2 or 4. */
721 unsigned long l[(N == 2 || N == 4) ? N : -1];
722 } u;
723 unsigned long l, i = 0;
725 u.v = t;
727 /* Find the first word of T that is non-zero, advancing S past each
   all-zero word. */
728 switch (N)
730 case 4:
731 l = u.l[i++];
732 if (l != 0)
733 break;
734 s += sizeof(unsigned long);
735 l = u.l[i++];
736 if (l != 0)
737 break;
738 s += sizeof(unsigned long);
739 /* FALLTHROUGH */
740 case 2:
741 l = u.l[i++];
742 if (l != 0)
743 break;
744 s += sizeof(unsigned long);
745 l = u.l[i];
748 /* L now contains 0xff in bytes for which we matched one of the
   relevant characters. We can find the byte index by finding
   its bit index and dividing by 8. */
751 l = __builtin_clzl(l) >> 3;
752 return s + l;
754 #undef N
758 #elif defined (__ARM_NEON) && defined (__ARM_64BIT_STATE)
759 #include "arm_neon.h"
761 /* This doesn't have to be the exact page size, but no system may use
   a size smaller than this. ARMv8 requires a minimum page size of
   4k. The impact of being conservative here is a small number of
   cases will take the slightly slower entry path into the main
   loop. */
767 #define AARCH64_MIN_PAGE_SIZE 4096
/* AArch64 NEON version of the fast scanner. Returns a pointer to the
   first '\n', '\r', '\\' or '?' at or after S. END is unused. */
769 static const uchar *
770 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
772 const uint8x16_t repl_nl = vdupq_n_u8 ('\n');
773 const uint8x16_t repl_cr = vdupq_n_u8 ('\r');
774 const uint8x16_t repl_bs = vdupq_n_u8 ('\\');
775 const uint8x16_t repl_qm = vdupq_n_u8 ('?');
/* One distinct bit per byte within each 8-byte half; ANDed with the
   compare result so the matches can be summed into a bitmask. */
776 const uint8x16_t xmask = (uint8x16_t) vdupq_n_u64 (0x8040201008040201ULL);
778 #ifdef __ARM_BIG_ENDIAN
779 const int16x8_t shift = {8, 8, 8, 8, 0, 0, 0, 0};
780 #else
781 const int16x8_t shift = {0, 0, 0, 0, 8, 8, 8, 8};
782 #endif
784 unsigned int found;
785 const uint8_t *p;
786 uint8x16_t data;
787 uint8x16_t t;
788 uint16x8_t m;
789 uint8x16_t u, v, w;
791 /* Align the source pointer. */
792 p = (const uint8_t *)((uintptr_t)s & -16);
794 /* Assuming random string start positions, with a 4k page size we'll take
   the slow path about 0.37% of the time. */
796 if (__builtin_expect ((AARCH64_MIN_PAGE_SIZE
797 - (((uintptr_t) s) & (AARCH64_MIN_PAGE_SIZE - 1)))
798 < 16, 0))
800 /* Slow path: the string starts near a possible page boundary.
   Load from the aligned address P and mask off the bits for bytes
   that precede S. */
801 uint32_t misalign, mask;
803 misalign = (uintptr_t)s & 15;
804 mask = (-1u << misalign) & 0xffff;
805 data = vld1q_u8 (p);
806 t = vceqq_u8 (data, repl_nl);
807 u = vceqq_u8 (data, repl_cr);
808 v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
809 w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
810 t = vorrq_u8 (v, w);
811 t = vandq_u8 (t, xmask);
812 m = vpaddlq_u8 (t);
813 m = vshlq_u16 (m, shift);
814 found = vaddvq_u16 (m);
815 found &= mask;
816 if (found)
817 return (const uchar*)p + __builtin_ctz (found);
819 else
/* Fast path: load 16 bytes directly from S, which need not be aligned. */
821 data = vld1q_u8 ((const uint8_t *) s);
822 t = vceqq_u8 (data, repl_nl);
823 u = vceqq_u8 (data, repl_cr);
824 v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
825 w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
826 t = vorrq_u8 (v, w);
827 if (__builtin_expect (vpaddd_u64 ((uint64x2_t)t) != 0, 0))
828 goto done;
/* Main loop: step P through aligned 16-byte blocks until one contains
   a match. */
833 p += 16;
834 data = vld1q_u8 (p);
835 t = vceqq_u8 (data, repl_nl);
836 u = vceqq_u8 (data, repl_cr);
837 v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
838 w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
839 t = vorrq_u8 (v, w);
840 } while (!vpaddd_u64 ((uint64x2_t)t));
842 done:
843 /* Now that we've found the terminating substring, work out precisely where
   we need to stop. If P is still below S, the match came from the
   load taken at S itself, so the index is relative to S. */
845 t = vandq_u8 (t, xmask);
846 m = vpaddlq_u8 (t);
847 m = vshlq_u16 (m, shift);
848 found = vaddvq_u16 (m);
849 return (((((uintptr_t) p) < (uintptr_t) s) ? s : (const uchar *)p)
850 + __builtin_ctz (found));
853 #elif defined (__ARM_NEON)
854 #include "arm_neon.h"
/* ARM NEON version of the fast scanner. Returns a pointer to the first
   '\n', '\r', '\\' or '?' at or after S. END is unused. */
856 static const uchar *
857 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
859 const uint8x16_t repl_nl = vdupq_n_u8 ('\n');
860 const uint8x16_t repl_cr = vdupq_n_u8 ('\r');
861 const uint8x16_t repl_bs = vdupq_n_u8 ('\\');
862 const uint8x16_t repl_qm = vdupq_n_u8 ('?');
/* One distinct bit per byte within each 8-byte half; ANDed with the
   compare result so the matches can be summed into a bitmask. */
863 const uint8x16_t xmask = (uint8x16_t) vdupq_n_u64 (0x8040201008040201ULL);
865 unsigned int misalign, found, mask;
866 const uint8_t *p;
867 uint8x16_t data;
869 /* Align the source pointer. */
870 misalign = (uintptr_t)s & 15;
871 p = (const uint8_t *)((uintptr_t)s & -16);
872 data = vld1q_u8 (p);
874 /* Create a mask for the bytes that are valid within the first
   16-byte block (bits below MISALIGN, i.e. bytes before S, are
   cleared). The Idea here is that the AND with the mask
   within the loop is "free", since we need some AND or TEST
   insn in order to set the flags for the branch anyway. */
878 mask = (-1u << misalign) & 0xffff;
880 /* Main loop, processing 16 bytes at a time. The first iteration is
   entered at START with the already-loaded first block. */
881 goto start;
885 uint8x8_t l;
886 uint16x4_t m;
887 uint32x2_t n;
888 uint8x16_t t, u, v, w;
890 p += 16;
891 data = vld1q_u8 (p);
892 mask = 0xffff;
894 start:
895 t = vceqq_u8 (data, repl_nl);
896 u = vceqq_u8 (data, repl_cr);
897 v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
898 w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
899 t = vandq_u8 (vorrq_u8 (v, w), xmask);
900 l = vpadd_u8 (vget_low_u8 (t), vget_high_u8 (t));
901 m = vpaddl_u8 (l);
902 n = vpaddl_u16 (m);
904 found = vget_lane_u32 ((uint32x2_t) vorr_u64 ((uint64x1_t) n,
905 vshr_n_u64 ((uint64x1_t) n, 24)), 0);
906 found &= mask;
908 while (!found);
910 /* FOUND contains 1 in bits for which we matched a relevant
   character. Conversion to the byte index is trivial. */
912 found = __builtin_ctz (found);
913 return (const uchar *)p + found;
916 #else
918 /* We only have one accelerated alternative. Use a direct call so that
919 we encourage inlining. */
921 #define search_line_fast search_line_acc_char
923 #endif
/* Perform lexer initialization.  When this build provides
   init_vectorized_lexer (signalled by HAVE_init_vectorized_lexer),
   call it; otherwise there is nothing to do.  */

void
_cpp_init_lexer (void)
{
#ifdef HAVE_init_vectorized_lexer
  init_vectorized_lexer ();
#endif
}
935 /* Returns with a logical line that contains no escaped newlines or
   trigraphs. This is a time-critical inner loop.

   The line-note list of PFILE's buffer is reset and scanning starts at
   buffer->next_line. S is the read position and D the write position
   within the buffer. On return the cleaned line is terminated with
   '\n', a sentinel note has been added, and buffer->next_line points
   just past the position where scanning stopped. */
937 void
938 _cpp_clean_line (cpp_reader *pfile)
940 cpp_buffer *buffer;
941 const uchar *s;
942 uchar c, *d, *p;
944 buffer = pfile->buffer;
945 buffer->cur_note = buffer->notes_used = 0;
946 buffer->cur = buffer->line_base = buffer->next_line;
947 buffer->need_line = false;
948 s = buffer->next_line;
950 if (!buffer->from_stage3)
/* Position of the most recent backslash seen on the fast path, if any. */
952 const uchar *pbackslash = NULL;
954 /* Fast path. This is the common case of an un-escaped line with
   no trigraphs. The primary win here is by not writing any
   data back to memory until we have to. */
957 while (1)
959 /* Perform an optimized search for \n, \r, \\, ?. */
960 s = search_line_fast (s, buffer->rlimit);
962 c = *s;
963 if (c == '\\')
965 /* Record the location of the backslash and continue. */
966 pbackslash = s++;
968 else if (__builtin_expect (c == '?', 0))
970 if (__builtin_expect (s[1] == '?', false)
971 && _cpp_trigraph_map[s[2]])
973 /* Have a trigraph. We may or may not have to convert
   it. Add a line note regardless, for -Wtrigraphs. */
975 add_line_note (buffer, s, s[2]);
976 if (CPP_OPTION (pfile, trigraphs))
978 /* We do, and that means we have to switch to the
   slow path. */
980 d = (uchar *) s;
981 *d = _cpp_trigraph_map[s[2]];
982 s += 2;
983 goto slow_path;
986 /* Not a trigraph. Continue on fast-path. */
987 s++;
989 else
990 break;
993 /* This must be \r or \n. We're either done, or we'll be forced
   to write back to the buffer and continue on the slow path. */
995 d = (uchar *) s;
997 if (__builtin_expect (s == buffer->rlimit, false))
998 goto done;
1000 /* DOS line ending? */
1001 if (__builtin_expect (c == '\r', false) && s[1] == '\n')
1003 s++;
1004 if (s == buffer->rlimit)
1005 goto done;
/* No backslash was seen on this line, so the newline cannot be escaped. */
1008 if (__builtin_expect (pbackslash == NULL, true))
1009 goto done;
1011 /* Check for escaped newline: skip back over whitespace before the
   line end and see whether the last backslash sits right there. */
1012 p = d;
1013 while (is_nvspace (p[-1]))
1014 p--;
1015 if (p - 1 != pbackslash)
1016 goto done;
1018 /* Have an escaped newline; process it and proceed to
   the slow path. The note type is ' ' when whitespace separated
   the backslash from the newline, '\\' otherwise. */
1020 add_line_note (buffer, p - 1, p != d ? ' ' : '\\');
1021 d = p - 2;
1022 buffer->next_line = p - 1;
1024 slow_path:
/* Slow path: copy each byte from S to D, dropping escaped newlines and
   converting trigraphs as they are met. */
1025 while (1)
1027 c = *++s;
1028 *++d = c;
1030 if (c == '\n' || c == '\r')
1032 /* Handle DOS line endings. */
1033 if (c == '\r' && s != buffer->rlimit && s[1] == '\n')
1034 s++;
1035 if (s == buffer->rlimit)
1036 break;
1038 /* Escaped? */
1039 p = d;
1040 while (p != buffer->next_line && is_nvspace (p[-1]))
1041 p--;
1042 if (p == buffer->next_line || p[-1] != '\\')
1043 break;
1045 add_line_note (buffer, p - 1, p != d ? ' ': '\\');
1046 d = p - 2;
1047 buffer->next_line = p - 1;
1049 else if (c == '?' && s[1] == '?' && _cpp_trigraph_map[s[2]])
1051 /* Add a note regardless, for the benefit of -Wtrigraphs. */
1052 add_line_note (buffer, d, s[2]);
1053 if (CPP_OPTION (pfile, trigraphs))
1055 *d = _cpp_trigraph_map[s[2]];
1056 s += 2;
1061 else
/* from_stage3 buffers get no trigraph or escaped-newline handling here;
   only locate the end of the line. */
1063 while (*s != '\n' && *s != '\r')
1064 s++;
1065 d = (uchar *) s;
1067 /* Handle DOS line endings. */
1068 if (*s == '\r' && s + 1 != buffer->rlimit && s[1] == '\n')
1069 s++;
1072 done:
/* Terminate the cleaned line at the write position. */
1073 *d = '\n';
1074 /* A sentinel note that should never be processed. */
1075 add_line_note (buffer, d + 1, '\n');
1076 buffer->next_line = s + 1;
1079 /* Return true if the trigraph indicated by NOTE should be warned
1080 about in a comment. */
1081 static bool
1082 warn_in_comment (cpp_reader *pfile, _cpp_line_note *note)
1084 const uchar *p;
1086 /* Within comments we don't warn about trigraphs, unless the
1087 trigraph forms an escaped newline, as that may change
1088 behavior. */
1089 if (note->type != '/')
1090 return false;
1092 /* If -trigraphs, then this was an escaped newline iff the next note
1093 is coincident. */
1094 if (CPP_OPTION (pfile, trigraphs))
1095 return note[1].pos == note->pos;
1097 /* Otherwise, see if this forms an escaped newline. */
1098 p = note->pos + 3;
1099 while (is_nvspace (*p))
1100 p++;
1102 /* There might have been escaped newlines between the trigraph and the
1103 newline we found. Hence the position test. */
1104 return (*p == '\n' && p < note[1].pos);
1107 /* Process the notes created by add_line_note as far as the current
1108 location. */
1109 void
1110 _cpp_process_line_notes (cpp_reader *pfile, int in_comment)
1112 cpp_buffer *buffer = pfile->buffer;
1114 for (;;)
1116 _cpp_line_note *note = &buffer->notes[buffer->cur_note];
1117 unsigned int col;
1119 if (note->pos > buffer->cur)
1120 break;
1122 buffer->cur_note++;
1123 col = CPP_BUF_COLUMN (buffer, note->pos + 1);
1125 if (note->type == '\\' || note->type == ' ')
1127 if (note->type == ' ' && !in_comment)
1128 cpp_error_with_line (pfile, CPP_DL_WARNING, pfile->line_table->highest_line, col,
1129 "backslash and newline separated by space");
1131 if (buffer->next_line > buffer->rlimit)
1133 cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line, col,
1134 "backslash-newline at end of file");
1135 /* Prevent "no newline at end of file" warning. */
1136 buffer->next_line = buffer->rlimit;
1139 buffer->line_base = note->pos;
1140 CPP_INCREMENT_LINE (pfile, 0);
1142 else if (_cpp_trigraph_map[note->type])
1144 if (CPP_OPTION (pfile, warn_trigraphs)
1145 && (!in_comment || warn_in_comment (pfile, note)))
1147 if (CPP_OPTION (pfile, trigraphs))
1148 cpp_warning_with_line (pfile, CPP_W_TRIGRAPHS,
1149 pfile->line_table->highest_line, col,
1150 "trigraph ??%c converted to %c",
1151 note->type,
1152 (int) _cpp_trigraph_map[note->type]);
1153 else
1155 cpp_warning_with_line
1156 (pfile, CPP_W_TRIGRAPHS,
1157 pfile->line_table->highest_line, col,
1158 "trigraph ??%c ignored, use -trigraphs to enable",
1159 note->type);
1163 else if (note->type == 0)
1164 /* Already processed in lex_raw_string. */;
1165 else
1166 abort ();
1170 namespace bidi {
1171 enum class kind {
1172 NONE, LRE, RLE, LRO, RLO, LRI, RLI, FSI, PDF, PDI, LTR, RTL
1175 /* All the UTF-8 encodings of bidi characters start with E2. */
1176 constexpr uchar utf8_start = 0xe2;
1178 struct context
1180 context () {}
1181 context (location_t loc, kind k, bool pdf, bool ucn)
1182 : m_loc (loc), m_kind (k), m_pdf (pdf), m_ucn (ucn)
1186 kind get_pop_kind () const
1188 return m_pdf ? kind::PDF : kind::PDI;
1190 bool ucn_p () const
1192 return m_ucn;
1195 location_t m_loc;
1196 kind m_kind;
1197 unsigned m_pdf : 1;
1198 unsigned m_ucn : 1;
1201 /* A vector holding currently open bidi contexts. We use a char for
1202 each context, its LSB is 1 if it represents a PDF context, 0 if it
1203 represents a PDI context. The next bit is 1 if this context was open
1204 by a bidi character written as a UCN, and 0 when it was UTF-8. */
1205 semi_embedded_vec <context, 16> vec;
1207 /* Close the whole comment/identifier/string literal/character constant
1208 context. */
1209 void on_close ()
1211 vec.truncate (0);
1214 /* Pop the last element in the vector. */
1215 void pop ()
1217 unsigned int len = vec.count ();
1218 gcc_checking_assert (len > 0);
1219 vec.truncate (len - 1);
1222 /* Return the pop kind of the context of the Ith element. */
1223 kind pop_kind_at (unsigned int i)
1225 return vec[i].get_pop_kind ();
1228 /* Return the pop kind of the context that is currently opened. */
1229 kind current_ctx ()
1231 unsigned int len = vec.count ();
1232 if (len == 0)
1233 return kind::NONE;
1234 return vec[len - 1].get_pop_kind ();
1237 /* Return true if the current context comes from a UCN origin, that is,
1238 the bidi char which started this bidi context was written as a UCN. */
1239 bool current_ctx_ucn_p ()
1241 unsigned int len = vec.count ();
1242 gcc_checking_assert (len > 0);
1243 return vec[len - 1].m_ucn;
1246 location_t current_ctx_loc ()
1248 unsigned int len = vec.count ();
1249 gcc_checking_assert (len > 0);
1250 return vec[len - 1].m_loc;
1253 /* We've read a bidi char, update the current vector as necessary.
1254 LOC is only valid when K is not kind::NONE. */
1255 void on_char (kind k, bool ucn_p, location_t loc)
1257 switch (k)
1259 case kind::LRE:
1260 case kind::RLE:
1261 case kind::LRO:
1262 case kind::RLO:
1263 vec.push (context (loc, k, true, ucn_p));
1264 break;
1265 case kind::LRI:
1266 case kind::RLI:
1267 case kind::FSI:
1268 vec.push (context (loc, k, false, ucn_p));
1269 break;
1270 /* PDF terminates the scope of the last LRE, RLE, LRO, or RLO
1271 whose scope has not yet been terminated. */
1272 case kind::PDF:
1273 if (current_ctx () == kind::PDF)
1274 pop ();
1275 break;
1276 /* PDI terminates the scope of the last LRI, RLI, or FSI whose
1277 scope has not yet been terminated, as well as the scopes of
1278 any subsequent LREs, RLEs, LROs, or RLOs whose scopes have not
1279 yet been terminated. */
1280 case kind::PDI:
1281 for (int i = vec.count () - 1; i >= 0; --i)
1282 if (pop_kind_at (i) == kind::PDI)
1284 vec.truncate (i);
1285 break;
1287 break;
1288 case kind::LTR:
1289 case kind::RTL:
1290 /* These aren't popped by a PDF/PDI. */
1291 break;
1292 ATTR_LIKELY case kind::NONE:
1293 break;
1294 default:
1295 abort ();
1299 /* Return a descriptive string for K. */
1300 const char *to_str (kind k)
1302 switch (k)
1304 case kind::LRE:
1305 return "U+202A (LEFT-TO-RIGHT EMBEDDING)";
1306 case kind::RLE:
1307 return "U+202B (RIGHT-TO-LEFT EMBEDDING)";
1308 case kind::LRO:
1309 return "U+202D (LEFT-TO-RIGHT OVERRIDE)";
1310 case kind::RLO:
1311 return "U+202E (RIGHT-TO-LEFT OVERRIDE)";
1312 case kind::LRI:
1313 return "U+2066 (LEFT-TO-RIGHT ISOLATE)";
1314 case kind::RLI:
1315 return "U+2067 (RIGHT-TO-LEFT ISOLATE)";
1316 case kind::FSI:
1317 return "U+2068 (FIRST STRONG ISOLATE)";
1318 case kind::PDF:
1319 return "U+202C (POP DIRECTIONAL FORMATTING)";
1320 case kind::PDI:
1321 return "U+2069 (POP DIRECTIONAL ISOLATE)";
1322 case kind::LTR:
1323 return "U+200E (LEFT-TO-RIGHT MARK)";
1324 case kind::RTL:
1325 return "U+200F (RIGHT-TO-LEFT MARK)";
1326 default:
1327 abort ();
1332 /* Get location_t for the range of bytes [START, START + NUM_BYTES)
1333 within the current line in FILE, with the caret at START. */
1335 static location_t
1336 get_location_for_byte_range_in_cur_line (cpp_reader *pfile,
1337 const unsigned char *const start,
1338 size_t num_bytes)
1340 gcc_checking_assert (num_bytes > 0);
1342 /* CPP_BUF_COLUMN and linemap_position_for_column both refer
1343 to offsets in bytes, but CPP_BUF_COLUMN is 0-based,
1344 whereas linemap_position_for_column is 1-based. */
1346 /* Get 0-based offsets within the line. */
1347 size_t start_offset = CPP_BUF_COLUMN (pfile->buffer, start);
1348 size_t end_offset = start_offset + num_bytes - 1;
1350 /* Now convert to location_t, where "columns" are 1-based byte offsets. */
1351 location_t start_loc = linemap_position_for_column (pfile->line_table,
1352 start_offset + 1);
1353 location_t end_loc = linemap_position_for_column (pfile->line_table,
1354 end_offset + 1);
1356 if (start_loc == end_loc)
1357 return start_loc;
1359 source_range src_range;
1360 src_range.m_start = start_loc;
1361 src_range.m_finish = end_loc;
1362 location_t combined_loc = COMBINE_LOCATION_DATA (pfile->line_table,
1363 start_loc,
1364 src_range,
1365 NULL,
1367 return combined_loc;
1370 /* Parse a sequence of 3 bytes starting with P and return its bidi code. */
1372 static bidi::kind
1373 get_bidi_utf8_1 (const unsigned char *const p)
1375 gcc_checking_assert (p[0] == bidi::utf8_start);
1377 if (p[1] == 0x80)
1378 switch (p[2])
1380 case 0xaa:
1381 return bidi::kind::LRE;
1382 case 0xab:
1383 return bidi::kind::RLE;
1384 case 0xac:
1385 return bidi::kind::PDF;
1386 case 0xad:
1387 return bidi::kind::LRO;
1388 case 0xae:
1389 return bidi::kind::RLO;
1390 case 0x8e:
1391 return bidi::kind::LTR;
1392 case 0x8f:
1393 return bidi::kind::RTL;
1394 default:
1395 break;
1397 else if (p[1] == 0x81)
1398 switch (p[2])
1400 case 0xa6:
1401 return bidi::kind::LRI;
1402 case 0xa7:
1403 return bidi::kind::RLI;
1404 case 0xa8:
1405 return bidi::kind::FSI;
1406 case 0xa9:
1407 return bidi::kind::PDI;
1408 default:
1409 break;
1412 return bidi::kind::NONE;
1415 /* Parse a sequence of 3 bytes starting with P and return its bidi code.
1416 If the kind is not NONE, write the location to *OUT.*/
1418 static bidi::kind
1419 get_bidi_utf8 (cpp_reader *pfile, const unsigned char *const p, location_t *out)
1421 bidi::kind result = get_bidi_utf8_1 (p);
1422 if (result != bidi::kind::NONE)
1424 /* We have a sequence of 3 bytes starting at P. */
1425 *out = get_location_for_byte_range_in_cur_line (pfile, p, 3);
1427 return result;
1430 /* Parse a UCN where P points just past \u or \U and return its bidi code. */
1432 static bidi::kind
1433 get_bidi_ucn_1 (const unsigned char *p, bool is_U, const unsigned char **end)
1435 /* 6.4.3 Universal Character Names
1436 \u hex-quad
1437 \U hex-quad hex-quad
1438 \u { simple-hexadecimal-digit-sequence }
1439 where \unnnn means \U0000nnnn. */
1441 *end = p + 4;
1442 if (is_U)
1444 if (p[0] != '0' || p[1] != '0' || p[2] != '0' || p[3] != '0')
1445 return bidi::kind::NONE;
1446 /* Skip 4B so we can treat \u and \U the same below. */
1447 p += 4;
1448 *end += 4;
1450 else if (p[0] == '{')
1452 p++;
1453 while (*p == '0')
1454 p++;
1455 if (p[0] != '2'
1456 || p[1] != '0'
1457 || !ISXDIGIT (p[2])
1458 || !ISXDIGIT (p[3])
1459 || p[4] != '}')
1460 return bidi::kind::NONE;
1461 *end = p + 5;
1464 /* All code points we are looking for start with 20xx. */
1465 if (p[0] != '2' || p[1] != '0')
1466 return bidi::kind::NONE;
1467 else if (p[2] == '2')
1468 switch (p[3])
1470 case 'a':
1471 case 'A':
1472 return bidi::kind::LRE;
1473 case 'b':
1474 case 'B':
1475 return bidi::kind::RLE;
1476 case 'c':
1477 case 'C':
1478 return bidi::kind::PDF;
1479 case 'd':
1480 case 'D':
1481 return bidi::kind::LRO;
1482 case 'e':
1483 case 'E':
1484 return bidi::kind::RLO;
1485 default:
1486 break;
1488 else if (p[2] == '6')
1489 switch (p[3])
1491 case '6':
1492 return bidi::kind::LRI;
1493 case '7':
1494 return bidi::kind::RLI;
1495 case '8':
1496 return bidi::kind::FSI;
1497 case '9':
1498 return bidi::kind::PDI;
1499 default:
1500 break;
1502 else if (p[2] == '0')
1503 switch (p[3])
1505 case 'e':
1506 case 'E':
1507 return bidi::kind::LTR;
1508 case 'f':
1509 case 'F':
1510 return bidi::kind::RTL;
1511 default:
1512 break;
1515 return bidi::kind::NONE;
1518 /* Parse a UCN where P points just past \u or \U and return its bidi code.
1519 If the kind is not NONE, write the location to *OUT. */
1521 static bidi::kind
1522 get_bidi_ucn (cpp_reader *pfile, const unsigned char *p, bool is_U,
1523 location_t *out)
1525 const unsigned char *end;
1526 bidi::kind result = get_bidi_ucn_1 (p, is_U, &end);
1527 if (result != bidi::kind::NONE)
1529 const unsigned char *start = p - 2;
1530 size_t num_bytes = end - start;
1531 *out = get_location_for_byte_range_in_cur_line (pfile, start, num_bytes);
1533 return result;
1536 /* Parse a named universal character escape where P points just past \N and
1537 return its bidi code. If the kind is not NONE, write the location to
1538 *OUT. */
1540 static bidi::kind
1541 get_bidi_named (cpp_reader *pfile, const unsigned char *p, location_t *out)
1543 bidi::kind result = bidi::kind::NONE;
1544 if (*p != '{')
1545 return bidi::kind::NONE;
1546 if (strncmp ((const char *) (p + 1), "LEFT-TO-RIGHT ", 14) == 0)
1548 if (strncmp ((const char *) (p + 15), "MARK}", 5) == 0)
1549 result = bidi::kind::LTR;
1550 else if (strncmp ((const char *) (p + 15), "EMBEDDING}", 10) == 0)
1551 result = bidi::kind::LRE;
1552 else if (strncmp ((const char *) (p + 15), "OVERRIDE}", 9) == 0)
1553 result = bidi::kind::LRO;
1554 else if (strncmp ((const char *) (p + 15), "ISOLATE}", 8) == 0)
1555 result = bidi::kind::LRI;
1557 else if (strncmp ((const char *) (p + 1), "RIGHT-TO-LEFT ", 14) == 0)
1559 if (strncmp ((const char *) (p + 15), "MARK}", 5) == 0)
1560 result = bidi::kind::RTL;
1561 else if (strncmp ((const char *) (p + 15), "EMBEDDING}", 10) == 0)
1562 result = bidi::kind::RLE;
1563 else if (strncmp ((const char *) (p + 15), "OVERRIDE}", 9) == 0)
1564 result = bidi::kind::RLO;
1565 else if (strncmp ((const char *) (p + 15), "ISOLATE}", 8) == 0)
1566 result = bidi::kind::RLI;
1568 else if (strncmp ((const char *) (p + 1), "POP DIRECTIONAL ", 16) == 0)
1570 if (strncmp ((const char *) (p + 16), "FORMATTING}", 11) == 0)
1571 result = bidi::kind::PDF;
1572 else if (strncmp ((const char *) (p + 16), "ISOLATE}", 8) == 0)
1573 result = bidi::kind::PDI;
1575 else if (strncmp ((const char *) (p + 1), "FIRST STRONG ISOLATE}", 21) == 0)
1576 result = bidi::kind::FSI;
1577 if (result != bidi::kind::NONE)
1578 *out = get_location_for_byte_range_in_cur_line (pfile, p - 2,
1579 (strchr ((const char *)
1580 (p + 1), '}')
1581 - (const char *) p)
1582 + 3);
1583 return result;
1586 /* Subclass of rich_location for reporting on unpaired UTF-8
1587 bidirectional control character(s).
1588 Escape the source lines on output, and show all unclosed
1589 bidi context, labelling everything. */
1591 class unpaired_bidi_rich_location : public rich_location
1593 public:
1594 class custom_range_label : public range_label
1596 public:
1597 label_text get_text (unsigned range_idx) const final override
1599 /* range 0 is the primary location; each subsequent range i + 1
1600 is for bidi::vec[i]. */
1601 if (range_idx > 0)
1603 const bidi::context &ctxt (bidi::vec[range_idx - 1]);
1604 return label_text::borrow (bidi::to_str (ctxt.m_kind));
1606 else
1607 return label_text::borrow (_("end of bidirectional context"));
1611 unpaired_bidi_rich_location (cpp_reader *pfile, location_t loc)
1612 : rich_location (pfile->line_table, loc, &m_custom_label)
1614 set_escape_on_output (true);
1615 for (unsigned i = 0; i < bidi::vec.count (); i++)
1616 add_range (bidi::vec[i].m_loc,
1617 SHOW_RANGE_WITHOUT_CARET,
1618 &m_custom_label);
1621 private:
1622 custom_range_label m_custom_label;
1625 /* We're closing a bidi context, that is, we've encountered a newline,
1626 are closing a C-style comment, or are at the end of a string literal,
1627 character constant, or identifier. Warn if this context was not
1628 properly terminated by a PDI or PDF. P points to the last character
1629 in this context. */
1631 static void
1632 maybe_warn_bidi_on_close (cpp_reader *pfile, const uchar *p)
1634 const auto warn_bidi = CPP_OPTION (pfile, cpp_warn_bidirectional);
1635 if (bidi::vec.count () > 0
1636 && (warn_bidi & bidirectional_unpaired
1637 && (!bidi::current_ctx_ucn_p ()
1638 || (warn_bidi & bidirectional_ucn))))
1640 const location_t loc
1641 = linemap_position_for_column (pfile->line_table,
1642 CPP_BUF_COLUMN (pfile->buffer, p));
1643 unpaired_bidi_rich_location rich_loc (pfile, loc);
1644 /* cpp_callbacks doesn't yet have a way to handle singular vs plural
1645 forms of a diagnostic, so fake it for now. */
1646 if (bidi::vec.count () > 1)
1647 cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
1648 "unpaired UTF-8 bidirectional control characters "
1649 "detected");
1650 else
1651 cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
1652 "unpaired UTF-8 bidirectional control character "
1653 "detected");
1655 /* We're done with this context. */
1656 bidi::on_close ();
1659 /* We're at the beginning or in the middle of an identifier/comment/string
1660 literal/character constant. Warn if we've encountered a bidi character.
1661 KIND says which bidi control character it was; UCN_P is true iff this bidi
1662 control character was written as a UCN. LOC is the location of the
1663 character, but is only valid if KIND != bidi::kind::NONE. */
1665 static void
1666 maybe_warn_bidi_on_char (cpp_reader *pfile, bidi::kind kind,
1667 bool ucn_p, location_t loc)
1669 if (__builtin_expect (kind == bidi::kind::NONE, 1))
1670 return;
1672 const auto warn_bidi = CPP_OPTION (pfile, cpp_warn_bidirectional);
1674 if (warn_bidi & (bidirectional_unpaired|bidirectional_any))
1676 rich_location rich_loc (pfile->line_table, loc);
1677 rich_loc.set_escape_on_output (true);
1679 /* It seems excessive to warn about a PDI/PDF that is closing
1680 an opened context because we've already warned about the
1681 opening character. Except warn when we have a UCN x UTF-8
1682 mismatch, if UCN checking is enabled. */
1683 if (kind == bidi::current_ctx ())
1685 if (warn_bidi == (bidirectional_unpaired|bidirectional_ucn)
1686 && bidi::current_ctx_ucn_p () != ucn_p)
1688 rich_loc.add_range (bidi::current_ctx_loc ());
1689 cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
1690 "UTF-8 vs UCN mismatch when closing "
1691 "a context by \"%s\"", bidi::to_str (kind));
1694 else if (warn_bidi & bidirectional_any
1695 && (!ucn_p || (warn_bidi & bidirectional_ucn)))
1697 if (kind == bidi::kind::PDF || kind == bidi::kind::PDI)
1698 cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
1699 "\"%s\" is closing an unopened context",
1700 bidi::to_str (kind));
1701 else
1702 cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
1703 "found problematic Unicode character \"%s\"",
1704 bidi::to_str (kind));
1707 /* We're done with this context. */
1708 bidi::on_char (kind, ucn_p, loc);
/* Byte-value thresholds used by the UTF-8 checks in this file.  A byte
   below utf8_continuation has its top bit clear; a byte in the range
   [utf8_continuation, utf8_signifier) is what the checks below accept
   as a continuation byte.  */
static const cppchar_t utf8_continuation = 0x80;
static const cppchar_t utf8_signifier = 0xC0;
1714 /* Emit -Winvalid-utf8 warning on invalid UTF-8 character starting
1715 at PFILE->buffer->cur. Return a pointer after the diagnosed
1716 invalid character. */
1718 static const uchar *
1719 _cpp_warn_invalid_utf8 (cpp_reader *pfile)
1721 cpp_buffer *buffer = pfile->buffer;
1722 const uchar *cur = buffer->cur;
1723 bool pedantic = (CPP_PEDANTIC (pfile)
1724 && CPP_OPTION (pfile, cpp_warn_invalid_utf8) == 2);
1726 if (cur[0] < utf8_signifier
1727 || cur[1] < utf8_continuation || cur[1] >= utf8_signifier)
1729 if (pedantic)
1730 cpp_error_with_line (pfile, CPP_DL_PEDWARN,
1731 pfile->line_table->highest_line,
1732 CPP_BUF_COL (buffer),
1733 "invalid UTF-8 character <%x>",
1734 cur[0]);
1735 else
1736 cpp_warning_with_line (pfile, CPP_W_INVALID_UTF8,
1737 pfile->line_table->highest_line,
1738 CPP_BUF_COL (buffer),
1739 "invalid UTF-8 character <%x>",
1740 cur[0]);
1741 return cur + 1;
1743 else if (cur[2] < utf8_continuation || cur[2] >= utf8_signifier)
1745 if (pedantic)
1746 cpp_error_with_line (pfile, CPP_DL_PEDWARN,
1747 pfile->line_table->highest_line,
1748 CPP_BUF_COL (buffer),
1749 "invalid UTF-8 character <%x><%x>",
1750 cur[0], cur[1]);
1751 else
1752 cpp_warning_with_line (pfile, CPP_W_INVALID_UTF8,
1753 pfile->line_table->highest_line,
1754 CPP_BUF_COL (buffer),
1755 "invalid UTF-8 character <%x><%x>",
1756 cur[0], cur[1]);
1757 return cur + 2;
1759 else if (cur[3] < utf8_continuation || cur[3] >= utf8_signifier)
1761 if (pedantic)
1762 cpp_error_with_line (pfile, CPP_DL_PEDWARN,
1763 pfile->line_table->highest_line,
1764 CPP_BUF_COL (buffer),
1765 "invalid UTF-8 character <%x><%x><%x>",
1766 cur[0], cur[1], cur[2]);
1767 else
1768 cpp_warning_with_line (pfile, CPP_W_INVALID_UTF8,
1769 pfile->line_table->highest_line,
1770 CPP_BUF_COL (buffer),
1771 "invalid UTF-8 character <%x><%x><%x>",
1772 cur[0], cur[1], cur[2]);
1773 return cur + 3;
1775 else
1777 if (pedantic)
1778 cpp_error_with_line (pfile, CPP_DL_PEDWARN,
1779 pfile->line_table->highest_line,
1780 CPP_BUF_COL (buffer),
1781 "invalid UTF-8 character <%x><%x><%x><%x>",
1782 cur[0], cur[1], cur[2], cur[3]);
1783 else
1784 cpp_warning_with_line (pfile, CPP_W_INVALID_UTF8,
1785 pfile->line_table->highest_line,
1786 CPP_BUF_COL (buffer),
1787 "invalid UTF-8 character <%x><%x><%x><%x>",
1788 cur[0], cur[1], cur[2], cur[3]);
1789 return cur + 4;
1793 /* Helper function of *skip_*_comment and lex*_string. For C,
1794 character at CUR[-1] with MSB set handle -Wbidi-chars* and
1795 -Winvalid-utf8 diagnostics and return pointer to first character
1796 that should be processed next. */
1798 static inline const uchar *
1799 _cpp_handle_multibyte_utf8 (cpp_reader *pfile, uchar c,
1800 const uchar *cur, bool warn_bidi_p,
1801 bool warn_invalid_utf8_p)
1803 /* If this is a beginning of a UTF-8 encoding, it might be
1804 a bidirectional control character. */
1805 if (c == bidi::utf8_start && warn_bidi_p)
1807 location_t loc;
1808 bidi::kind kind = get_bidi_utf8 (pfile, cur - 1, &loc);
1809 maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
1811 if (!warn_invalid_utf8_p)
1812 return cur;
1813 if (c >= utf8_signifier)
1815 cppchar_t s;
1816 const uchar *pstr = cur - 1;
1817 if (_cpp_valid_utf8 (pfile, &pstr, pfile->buffer->rlimit, 0, NULL, &s)
1818 && s <= UCS_LIMIT)
1819 return pstr;
1821 pfile->buffer->cur = cur - 1;
1822 return _cpp_warn_invalid_utf8 (pfile);
1825 /* Skip a C-style block comment. We find the end of the comment by
1826 seeing if an asterisk is before every '/' we encounter. Returns
1827 nonzero if comment terminated by EOF, zero otherwise.
1829 Buffer->cur points to the initial asterisk of the comment. */
1830 bool
1831 _cpp_skip_block_comment (cpp_reader *pfile)
1833 cpp_buffer *buffer = pfile->buffer;
1834 const uchar *cur = buffer->cur;
1835 uchar c;
1836 const bool warn_bidi_p = pfile->warn_bidi_p ();
1837 const bool warn_invalid_utf8_p = CPP_OPTION (pfile, cpp_warn_invalid_utf8);
1838 const bool warn_bidi_or_invalid_utf8_p = warn_bidi_p | warn_invalid_utf8_p;
1840 cur++;
1841 if (*cur == '/')
1842 cur++;
1844 for (;;)
1846 /* People like decorating comments with '*', so check for '/'
1847 instead for efficiency. */
1848 c = *cur++;
1850 if (c == '/')
1852 if (cur[-2] == '*')
1854 if (warn_bidi_p)
1855 maybe_warn_bidi_on_close (pfile, cur);
1856 break;
1859 /* Warn about potential nested comments, but not if the '/'
1860 comes immediately before the true comment delimiter.
1861 Don't bother to get it right across escaped newlines. */
1862 if (CPP_OPTION (pfile, warn_comments)
1863 && cur[0] == '*' && cur[1] != '/')
1865 buffer->cur = cur;
1866 cpp_warning_with_line (pfile, CPP_W_COMMENTS,
1867 pfile->line_table->highest_line,
1868 CPP_BUF_COL (buffer),
1869 "\"/*\" within comment");
1872 else if (c == '\n')
1874 unsigned int cols;
1875 buffer->cur = cur - 1;
1876 if (warn_bidi_p)
1877 maybe_warn_bidi_on_close (pfile, cur);
1878 _cpp_process_line_notes (pfile, true);
1879 if (buffer->next_line >= buffer->rlimit)
1880 return true;
1881 _cpp_clean_line (pfile);
1883 cols = buffer->next_line - buffer->line_base;
1884 CPP_INCREMENT_LINE (pfile, cols);
1886 cur = buffer->cur;
1888 else if (__builtin_expect (c >= utf8_continuation, 0)
1889 && warn_bidi_or_invalid_utf8_p)
1890 cur = _cpp_handle_multibyte_utf8 (pfile, c, cur, warn_bidi_p,
1891 warn_invalid_utf8_p);
1894 buffer->cur = cur;
1895 _cpp_process_line_notes (pfile, true);
1896 return false;
1899 /* Skip a C++ line comment, leaving buffer->cur pointing to the
1900 terminating newline. Handles escaped newlines. Returns nonzero
1901 if a multiline comment. */
1902 static int
1903 skip_line_comment (cpp_reader *pfile)
1905 cpp_buffer *buffer = pfile->buffer;
1906 location_t orig_line = pfile->line_table->highest_line;
1907 const bool warn_bidi_p = pfile->warn_bidi_p ();
1908 const bool warn_invalid_utf8_p = CPP_OPTION (pfile, cpp_warn_invalid_utf8);
1909 const bool warn_bidi_or_invalid_utf8_p = warn_bidi_p | warn_invalid_utf8_p;
1911 if (!warn_bidi_or_invalid_utf8_p)
1912 while (*buffer->cur != '\n')
1913 buffer->cur++;
1914 else if (!warn_invalid_utf8_p)
1916 while (*buffer->cur != '\n'
1917 && *buffer->cur != bidi::utf8_start)
1918 buffer->cur++;
1919 if (__builtin_expect (*buffer->cur == bidi::utf8_start, 0))
1921 while (*buffer->cur != '\n')
1923 if (__builtin_expect (*buffer->cur == bidi::utf8_start, 0))
1925 location_t loc;
1926 bidi::kind kind = get_bidi_utf8 (pfile, buffer->cur, &loc);
1927 maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
1929 buffer->cur++;
1931 maybe_warn_bidi_on_close (pfile, buffer->cur);
1934 else
1936 while (*buffer->cur != '\n')
1938 if (*buffer->cur < utf8_continuation)
1940 buffer->cur++;
1941 continue;
1943 buffer->cur
1944 = _cpp_handle_multibyte_utf8 (pfile, *buffer->cur, buffer->cur + 1,
1945 warn_bidi_p, warn_invalid_utf8_p);
1947 if (warn_bidi_p)
1948 maybe_warn_bidi_on_close (pfile, buffer->cur);
1951 _cpp_process_line_notes (pfile, true);
1952 return orig_line != pfile->line_table->highest_line;
1955 /* Skips whitespace, saving the next non-whitespace character. */
1956 static void
1957 skip_whitespace (cpp_reader *pfile, cppchar_t c)
1959 cpp_buffer *buffer = pfile->buffer;
1960 bool saw_NUL = false;
1964 /* Horizontal space always OK. */
1965 if (c == ' ' || c == '\t')
1967 /* Just \f \v or \0 left. */
1968 else if (c == '\0')
1969 saw_NUL = true;
1970 else if (pfile->state.in_directive && CPP_PEDANTIC (pfile))
1971 cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line,
1972 CPP_BUF_COL (buffer),
1973 "%s in preprocessing directive",
1974 c == '\f' ? "form feed" : "vertical tab");
1976 c = *buffer->cur++;
1978 /* We only want non-vertical space, i.e. ' ' \t \f \v \0. */
1979 while (is_nvspace (c));
1981 if (saw_NUL)
1983 encoding_rich_location rich_loc (pfile);
1984 cpp_error_at (pfile, CPP_DL_WARNING, &rich_loc,
1985 "null character(s) ignored");
1988 buffer->cur--;
1991 /* See if the characters of a number token are valid in a name (no
1992 '.', '+' or '-'). */
1993 static int
1994 name_p (cpp_reader *pfile, const cpp_string *string)
1996 unsigned int i;
1998 for (i = 0; i < string->len; i++)
1999 if (!is_idchar (string->text[i]))
2000 return 0;
2002 return 1;
2005 /* After parsing an identifier or other sequence, produce a warning about
2006 sequences not in NFC/NFKC. */
2007 static void
2008 warn_about_normalization (cpp_reader *pfile,
2009 const cpp_token *token,
2010 const struct normalize_state *s,
2011 bool identifier)
2013 if (CPP_OPTION (pfile, warn_normalize) < NORMALIZE_STATE_RESULT (s)
2014 && !pfile->state.skipping)
2016 location_t loc = token->src_loc;
2018 /* If possible, create a location range for the token. */
2019 if (loc >= RESERVED_LOCATION_COUNT
2020 && token->type != CPP_EOF
2021 /* There must be no line notes to process. */
2022 && (!(pfile->buffer->cur
2023 >= pfile->buffer->notes[pfile->buffer->cur_note].pos
2024 && !pfile->overlaid_buffer)))
2026 source_range tok_range;
2027 tok_range.m_start = loc;
2028 tok_range.m_finish
2029 = linemap_position_for_column (pfile->line_table,
2030 CPP_BUF_COLUMN (pfile->buffer,
2031 pfile->buffer->cur));
2032 loc = COMBINE_LOCATION_DATA (pfile->line_table,
2033 loc, tok_range, NULL, 0);
2036 encoding_rich_location rich_loc (pfile, loc);
2038 /* Make sure that the token is printed using UCNs, even
2039 if we'd otherwise happily print UTF-8. */
2040 unsigned char *buf = XNEWVEC (unsigned char, cpp_token_len (token));
2041 size_t sz;
2043 sz = cpp_spell_token (pfile, token, buf, false) - buf;
2044 if (NORMALIZE_STATE_RESULT (s) == normalized_C)
2045 cpp_warning_at (pfile, CPP_W_NORMALIZE, &rich_loc,
2046 "`%.*s' is not in NFKC", (int) sz, buf);
2047 else if (identifier && CPP_OPTION (pfile, xid_identifiers))
2048 cpp_pedwarning_at (pfile, CPP_W_NORMALIZE, &rich_loc,
2049 "`%.*s' is not in NFC", (int) sz, buf);
2050 else
2051 cpp_warning_at (pfile, CPP_W_NORMALIZE, &rich_loc,
2052 "`%.*s' is not in NFC", (int) sz, buf);
2053 free (buf);
2057 /* Returns TRUE if the sequence starting at buffer->cur is valid in
2058 an identifier. FIRST is TRUE if this starts an identifier. */
2060 static bool
2061 forms_identifier_p (cpp_reader *pfile, int first,
2062 struct normalize_state *state)
2064 cpp_buffer *buffer = pfile->buffer;
2065 const bool warn_bidi_p = pfile->warn_bidi_p ();
2067 if (*buffer->cur == '$')
2069 if (!CPP_OPTION (pfile, dollars_in_ident))
2070 return false;
2072 buffer->cur++;
2073 if (CPP_OPTION (pfile, warn_dollars) && !pfile->state.skipping)
2075 CPP_OPTION (pfile, warn_dollars) = 0;
2076 cpp_error (pfile, CPP_DL_PEDWARN, "'$' in identifier or number");
2079 return true;
2082 /* Is this a syntactically valid UCN or a valid UTF-8 char? */
2083 if (CPP_OPTION (pfile, extended_identifiers))
2085 cppchar_t s;
2086 if (*buffer->cur >= utf8_signifier)
2088 if (__builtin_expect (*buffer->cur == bidi::utf8_start, 0)
2089 && warn_bidi_p)
2091 location_t loc;
2092 bidi::kind kind = get_bidi_utf8 (pfile, buffer->cur, &loc);
2093 maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
2095 if (_cpp_valid_utf8 (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
2096 state, &s))
2097 return true;
2099 else if (*buffer->cur == '\\'
2100 && (buffer->cur[1] == 'u'
2101 || buffer->cur[1] == 'U'
2102 || buffer->cur[1] == 'N'))
2104 buffer->cur += 2;
2105 if (warn_bidi_p)
2107 location_t loc;
2108 bidi::kind kind;
2109 if (buffer->cur[-1] == 'N')
2110 kind = get_bidi_named (pfile, buffer->cur, &loc);
2111 else
2112 kind = get_bidi_ucn (pfile, buffer->cur,
2113 buffer->cur[-1] == 'U', &loc);
2114 maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/true, loc);
2116 if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
2117 state, &s, NULL, NULL))
2118 return true;
2119 buffer->cur -= 2;
2123 return false;
2126 /* Helper function to issue error about improper __VA_OPT__ use. */
2127 static void
2128 maybe_va_opt_error (cpp_reader *pfile)
2130 if (CPP_PEDANTIC (pfile) && !CPP_OPTION (pfile, va_opt))
2132 /* __VA_OPT__ should not be accepted at all, but allow it in
2133 system headers. */
2134 if (!_cpp_in_system_header (pfile))
2135 cpp_error (pfile, CPP_DL_PEDWARN,
2136 "__VA_OPT__ is not available until C++20");
2138 else if (!pfile->state.va_args_ok)
2140 /* __VA_OPT__ should only appear in the replacement list of a
2141 variadic macro. */
2142 cpp_error (pfile, CPP_DL_PEDWARN,
2143 "__VA_OPT__ can only appear in the expansion"
2144 " of a C++20 variadic macro");
2148 /* Helper function to get the cpp_hashnode of the identifier BASE. */
2149 static cpp_hashnode *
2150 lex_identifier_intern (cpp_reader *pfile, const uchar *base)
2152 cpp_hashnode *result;
2153 const uchar *cur;
2154 unsigned int len;
2155 unsigned int hash = HT_HASHSTEP (0, *base);
2157 cur = base + 1;
2158 while (ISIDNUM (*cur))
2160 hash = HT_HASHSTEP (hash, *cur);
2161 cur++;
2163 len = cur - base;
2164 hash = HT_HASHFINISH (hash, len);
2165 result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
2166 base, len, hash, HT_ALLOC));
2168 /* Rarely, identifiers require diagnostics when lexed. */
2169 if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
2170 && !pfile->state.skipping, 0))
2172 /* It is allowed to poison the same identifier twice. */
2173 if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
2174 cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
2175 NODE_NAME (result));
2177 /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
2178 replacement list of a variadic macro. */
2179 if (result == pfile->spec_nodes.n__VA_ARGS__
2180 && !pfile->state.va_args_ok)
2182 if (CPP_OPTION (pfile, cplusplus))
2183 cpp_error (pfile, CPP_DL_PEDWARN,
2184 "__VA_ARGS__ can only appear in the expansion"
2185 " of a C++11 variadic macro");
2186 else
2187 cpp_error (pfile, CPP_DL_PEDWARN,
2188 "__VA_ARGS__ can only appear in the expansion"
2189 " of a C99 variadic macro");
2192 if (result == pfile->spec_nodes.n__VA_OPT__)
2193 maybe_va_opt_error (pfile);
2195 /* For -Wc++-compat, warn about use of C++ named operators. */
2196 if (result->flags & NODE_WARN_OPERATOR)
2197 cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
2198 "identifier \"%s\" is a special operator name in C++",
2199 NODE_NAME (result));
2202 return result;
2205 /* Get the cpp_hashnode of an identifier specified by NAME in
2206 the current cpp_reader object. If none is found, NULL is returned. */
2207 cpp_hashnode *
2208 _cpp_lex_identifier (cpp_reader *pfile, const char *name)
2210 cpp_hashnode *result;
2211 result = lex_identifier_intern (pfile, (uchar *) name);
2212 return result;
2215 /* Lex an identifier starting at BUFFER->CUR - 1. */
2216 static cpp_hashnode *
2217 lex_identifier (cpp_reader *pfile, const uchar *base, bool starts_ucn,
2218 struct normalize_state *nst, cpp_hashnode **spelling)
2220 cpp_hashnode *result;
2221 const uchar *cur;
2222 unsigned int len;
2223 unsigned int hash = HT_HASHSTEP (0, *base);
2224 const bool warn_bidi_p = pfile->warn_bidi_p ();
2226 cur = pfile->buffer->cur;
2227 if (! starts_ucn)
2229 while (ISIDNUM (*cur))
2231 hash = HT_HASHSTEP (hash, *cur);
2232 cur++;
2234 NORMALIZE_STATE_UPDATE_IDNUM (nst, *(cur - 1));
2236 pfile->buffer->cur = cur;
2237 if (starts_ucn || forms_identifier_p (pfile, false, nst))
2239 /* Slower version for identifiers containing UCNs
2240 or extended chars (including $). */
2241 do {
2242 while (ISIDNUM (*pfile->buffer->cur))
2244 NORMALIZE_STATE_UPDATE_IDNUM (nst, *pfile->buffer->cur);
2245 pfile->buffer->cur++;
2247 } while (forms_identifier_p (pfile, false, nst));
2248 if (warn_bidi_p)
2249 maybe_warn_bidi_on_close (pfile, pfile->buffer->cur);
2250 result = _cpp_interpret_identifier (pfile, base,
2251 pfile->buffer->cur - base);
2252 *spelling = cpp_lookup (pfile, base, pfile->buffer->cur - base);
2254 else
2256 len = cur - base;
2257 hash = HT_HASHFINISH (hash, len);
2259 result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
2260 base, len, hash, HT_ALLOC));
2261 *spelling = result;
2264 /* Rarely, identifiers require diagnostics when lexed. */
2265 if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
2266 && !pfile->state.skipping, 0))
2268 /* It is allowed to poison the same identifier twice. */
2269 if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
2270 cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
2271 NODE_NAME (result));
2273 /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
2274 replacement list of a variadic macro. */
2275 if (result == pfile->spec_nodes.n__VA_ARGS__
2276 && !pfile->state.va_args_ok)
2278 if (CPP_OPTION (pfile, cplusplus))
2279 cpp_error (pfile, CPP_DL_PEDWARN,
2280 "__VA_ARGS__ can only appear in the expansion"
2281 " of a C++11 variadic macro");
2282 else
2283 cpp_error (pfile, CPP_DL_PEDWARN,
2284 "__VA_ARGS__ can only appear in the expansion"
2285 " of a C99 variadic macro");
2288 /* __VA_OPT__ should only appear in the replacement list of a
2289 variadic macro. */
2290 if (result == pfile->spec_nodes.n__VA_OPT__)
2291 maybe_va_opt_error (pfile);
2293 /* For -Wc++-compat, warn about use of C++ named operators. */
2294 if (result->flags & NODE_WARN_OPERATOR)
2295 cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
2296 "identifier \"%s\" is a special operator name in C++",
2297 NODE_NAME (result));
2300 return result;
2303 /* Lex a number to NUMBER starting at BUFFER->CUR - 1. */
2304 static void
2305 lex_number (cpp_reader *pfile, cpp_string *number,
2306 struct normalize_state *nst)
2308 const uchar *cur;
2309 const uchar *base;
2310 uchar *dest;
2312 base = pfile->buffer->cur - 1;
2315 const uchar *adj_digit_sep = NULL;
2316 cur = pfile->buffer->cur;
2318 /* N.B. ISIDNUM does not include $. */
2319 while (ISIDNUM (*cur)
2320 || (*cur == '.' && !DIGIT_SEP (cur[-1]))
2321 || DIGIT_SEP (*cur)
2322 || (VALID_SIGN (*cur, cur[-1]) && !DIGIT_SEP (cur[-2])))
2324 NORMALIZE_STATE_UPDATE_IDNUM (nst, *cur);
2325 /* Adjacent digit separators do not form part of the pp-number syntax.
2326 However, they can safely be diagnosed here as an error, since '' is
2327 not a valid preprocessing token. */
2328 if (DIGIT_SEP (*cur) && DIGIT_SEP (cur[-1]) && !adj_digit_sep)
2329 adj_digit_sep = cur;
2330 cur++;
2332 /* A number can't end with a digit separator. */
2333 while (cur > pfile->buffer->cur && DIGIT_SEP (cur[-1]))
2334 --cur;
2335 if (adj_digit_sep && adj_digit_sep < cur)
2336 cpp_error (pfile, CPP_DL_ERROR, "adjacent digit separators");
2338 pfile->buffer->cur = cur;
2340 while (forms_identifier_p (pfile, false, nst));
2342 number->len = cur - base;
2343 dest = _cpp_unaligned_alloc (pfile, number->len + 1);
2344 memcpy (dest, base, number->len);
2345 dest[number->len] = '\0';
2346 number->text = dest;
2349 /* Create a token of type TYPE with a literal spelling. */
2350 static void
2351 create_literal (cpp_reader *pfile, cpp_token *token, const uchar *base,
2352 unsigned int len, enum cpp_ttype type)
2354 token->type = type;
2355 token->val.str.len = len;
2356 token->val.str.text = cpp_alloc_token_string (pfile, base, len);
2359 const uchar *
2360 cpp_alloc_token_string (cpp_reader *pfile,
2361 const unsigned char *ptr, unsigned len)
2363 uchar *dest = _cpp_unaligned_alloc (pfile, len + 1);
2365 dest[len] = 0;
2366 memcpy (dest, ptr, len);
2367 return dest;
2370 /* A pair of raw buffer pointers. The currently open one is [1], the
2371 first one is [0]. Used for string literal lexing. */
2372 struct lit_accum {
2373 _cpp_buff *first;
2374 _cpp_buff *last;
2375 const uchar *rpos;
2376 size_t accum;
2378 lit_accum ()
2379 : first (NULL), last (NULL), rpos (0), accum (0)
2383 void append (cpp_reader *, const uchar *, size_t);
2385 void read_begin (cpp_reader *);
2386 bool reading_p () const
2388 return rpos != NULL;
2390 char read_char ()
2392 char c = *rpos++;
2393 if (rpos == BUFF_FRONT (last))
2394 rpos = NULL;
2395 return c;
2399 /* Subroutine of lex_raw_string: Append LEN chars from BASE to the buffer
2400 sequence from *FIRST_BUFF_P to LAST_BUFF_P. */
2402 void
2403 lit_accum::append (cpp_reader *pfile, const uchar *base, size_t len)
2405 if (!last)
2406 /* Starting. */
2407 first = last = _cpp_get_buff (pfile, len);
2408 else if (len > BUFF_ROOM (last))
2410 /* There is insufficient room in the buffer. Copy what we can,
2411 and then either extend or create a new one. */
2412 size_t room = BUFF_ROOM (last);
2413 memcpy (BUFF_FRONT (last), base, room);
2414 BUFF_FRONT (last) += room;
2415 base += room;
2416 len -= room;
2417 accum += room;
2419 gcc_checking_assert (!rpos);
2421 last = _cpp_append_extend_buff (pfile, last, len);
2424 memcpy (BUFF_FRONT (last), base, len);
2425 BUFF_FRONT (last) += len;
2426 accum += len;
2429 void
2430 lit_accum::read_begin (cpp_reader *pfile)
2432 /* We never accumulate more than 4 chars to read. */
2433 if (BUFF_ROOM (last) < 4)
2435 last = _cpp_append_extend_buff (pfile, last, 4);
2436 rpos = BUFF_FRONT (last);
2439 /* Returns true if a macro has been defined.
2440 This might not work if compile with -save-temps,
2441 or preprocess separately from compilation. */
2443 static bool
2444 is_macro(cpp_reader *pfile, const uchar *base)
2446 const uchar *cur = base;
2447 if (! ISIDST (*cur))
2448 return false;
2449 unsigned int hash = HT_HASHSTEP (0, *cur);
2450 ++cur;
2451 while (ISIDNUM (*cur))
2453 hash = HT_HASHSTEP (hash, *cur);
2454 ++cur;
2456 hash = HT_HASHFINISH (hash, cur - base);
2458 cpp_hashnode *result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
2459 base, cur - base, hash, HT_NO_INSERT));
2461 return result && cpp_macro_p (result);
2464 /* Returns true if a literal suffix does not have the expected form
2465 and is defined as a macro. */
2467 static bool
2468 is_macro_not_literal_suffix(cpp_reader *pfile, const uchar *base)
2470 /* User-defined literals outside of namespace std must start with a single
2471 underscore, so assume anything of that form really is a UDL suffix.
2472 We don't need to worry about UDLs defined inside namespace std because
2473 their names are reserved, so cannot be used as macro names in valid
2474 programs. */
2475 if (base[0] == '_' && base[1] != '_')
2476 return false;
2477 return is_macro (pfile, base);
2480 /* Lexes a raw string. The stored string contains the spelling,
2481 including double quotes, delimiter string, '(' and ')', any leading
2482 'L', 'u', 'U' or 'u8' and 'R' modifier. The created token contains
2483 the type of the literal, or CPP_OTHER if it was not properly
2484 terminated.
2486 BASE is the start of the token. Updates pfile->buffer->cur to just
2487 after the lexed string.
2489 The spelling is NUL-terminated, but it is not guaranteed that this
2490 is the first NUL since embedded NULs are preserved. */
/* PFILE is the reader whose current buffer is lexed and TOKEN receives
   the type and spelling.  Line notes recorded for escaped newlines and
   trigraphs are undone so that the spelling matches the source text.
   A raw string may span several lines; further lines are fetched here
   with _cpp_get_fresh_line.  If the file ends first, TOKEN becomes
   CPP_EOF and "unterminated raw string" is reported.  */
2492 static void
2493 lex_raw_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
2495 const uchar *pos = base;
2496 const bool warn_bidi_p = pfile->warn_bidi_p ();
2497 const bool warn_invalid_utf8_p = CPP_OPTION (pfile, cpp_warn_invalid_utf8);
2498 const bool warn_bidi_or_invalid_utf8_p = warn_bidi_p | warn_invalid_utf8_p;
2500 /* 'tis a pity this information isn't passed down from the lexer's
2501 initial categorization of the token. */
2502 enum cpp_ttype type = CPP_STRING;
/* Classify the literal by its encoding prefix (L, U, u or u8) and step
   over that prefix.  */
2504 if (*pos == 'L')
2506 type = CPP_WSTRING;
2507 pos++;
2509 else if (*pos == 'U')
2511 type = CPP_STRING32;
2512 pos++;
2514 else if (*pos == 'u')
2516 if (pos[1] == '8')
2518 type = CPP_UTF8STRING;
2519 pos++;
2521 else
2522 type = CPP_STRING16;
2523 pos++;
2526 gcc_checking_assert (pos[0] == 'R' && pos[1] == '"');
2527 pos += 2;
2529 _cpp_line_note *note = &pfile->buffer->notes[pfile->buffer->cur_note];
2531 /* Skip notes before the ". */
2532 while (note->pos < pos)
2533 ++note;
2535 lit_accum accum;
/* PREFIX collects the delimiter (at most 16 characters) and then the
   closing double quote, so it holds exactly the text that must follow
   a ')' for the literal to end.  */
2537 uchar prefix[17];
2538 unsigned prefix_len = 0;
/* PHASE_PREFIX: still collecting the delimiter.  PHASE_NONE: inside the
   string body.  Values from PHASE_SUFFIX upwards are the index into
   PREFIX of the next character expected after a ')'.  */
2539 enum Phase
2541 PHASE_PREFIX = -2,
2542 PHASE_NONE = -1,
2543 PHASE_SUFFIX = 0
2544 } phase = PHASE_PREFIX;
2546 for (;;)
2548 gcc_checking_assert (note->pos >= pos);
2550 /* Undo any escaped newlines and trigraphs. */
2551 if (!accum.reading_p () && note->pos == pos)
2552 switch (note->type)
2554 case '\\':
2555 case ' ':
2556 /* Restore backslash followed by newline. */
2557 accum.append (pfile, base, pos - base);
2558 base = pos;
2559 accum.read_begin (pfile);
2560 accum.append (pfile, UC"\\", 1);
2562 after_backslash:
2563 if (note->type == ' ')
2564 /* GNU backslash whitespace newline extension. FIXME
2565 could be any sequence of non-vertical space. When we
2566 can properly restore any such sequence, we should
2567 mark this note as handled so _cpp_process_line_notes
2568 doesn't warn. */
2569 accum.append (pfile, UC" ", 1);
2571 accum.append (pfile, UC"\n", 1);
2572 note++;
2573 break;
2575 case '\n':
2576 /* This can happen for ??/<NEWLINE> when trigraphs are not
2577 being interpreted. */
2578 gcc_checking_assert (!CPP_OPTION (pfile, trigraphs));
2579 note->type = 0;
2580 note++;
2581 break;
2583 default:
2584 gcc_checking_assert (_cpp_trigraph_map[note->type]);
2586 /* Don't warn about this trigraph in
2587 _cpp_process_line_notes, since trigraphs show up as
2588 trigraphs in raw strings. */
2589 uchar type = note->type;
2590 note->type = 0;
2592 if (CPP_OPTION (pfile, trigraphs))
2594 accum.append (pfile, base, pos - base);
2595 base = pos;
2596 accum.read_begin (pfile);
2597 accum.append (pfile, UC"??", 2);
2598 accum.append (pfile, &type, 1);
2600 /* ??/ followed by newline gets two line notes, one for
2601 the trigraph and one for the backslash/newline. */
2602 if (type == '/' && note[1].pos == pos)
2604 note++;
2605 gcc_assert (note->type == '\\' || note->type == ' ');
2606 goto after_backslash;
2608 /* Skip the replacement character. */
2609 base = ++pos;
2612 note++;
2613 break;
2616 /* Now get a char to process. Either from an expanded note, or
2617 from the line buffer. */
2618 bool read_note = accum.reading_p ();
2619 char c = read_note ? accum.read_char () : *pos++;
2621 if (phase == PHASE_PREFIX)
2623 if (c == '(')
2625 /* Done. */
2626 phase = PHASE_NONE;
2627 prefix[prefix_len++] = '"';
2629 else if (prefix_len < 16
2630 /* Prefix chars are any of the basic character set,
2631 [lex.charset] except for '
2632 ()\\\t\v\f\n'. Optimized for a contiguous
2633 alphabet. */
2634 /* Unlike a switch, this collapses down to one or
2635 two shift and bitmask operations on an ASCII
2636 system, with an outlier or two. */
2637 && (('Z' - 'A' == 25
2638 ? ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'))
2639 : ISIDST (c))
2640 || (c >= '0' && c <= '9')
2641 || c == '_' || c == '{' || c == '}'
2642 || c == '[' || c == ']' || c == '#'
2643 || c == '<' || c == '>' || c == '%'
2644 || c == ':' || c == ';' || c == '.' || c == '?'
2645 || c == '*' || c == '+' || c == '-' || c == '/'
2646 || c == '^' || c == '&' || c == '|' || c == '~'
2647 || c == '!' || c == '=' || c == ','
2648 || c == '"' || c == '\''))
2649 prefix[prefix_len++] = c;
2650 else
2652 /* Something is wrong. */
2653 int col = CPP_BUF_COLUMN (pfile->buffer, pos) + read_note;
2654 if (prefix_len == 16)
2655 cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
2656 col, "raw string delimiter longer "
2657 "than 16 characters");
2658 else if (c == '\n')
2659 cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
2660 col, "invalid new-line in raw "
2661 "string delimiter");
2662 else
2663 cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
2664 col, "invalid character '%c' in "
2665 "raw string delimiter", c);
2666 type = CPP_OTHER;
2667 phase = PHASE_NONE;
2668 /* Continue until we get a close quote, that's probably
2669 the best failure mode. */
2670 prefix_len = 0;
2672 if (c != '\n')
2673 continue;
/* A ')' has been seen: keep matching C against the stored delimiter.
   The literal ends once every character of PREFIX, including the
   closing double quote, has matched; any mismatch returns to the
   body.  */
2676 if (phase != PHASE_NONE)
2678 if (prefix[phase] != c)
2679 phase = PHASE_NONE;
2680 else if (unsigned (phase + 1) == prefix_len)
2681 break;
2682 else
2684 phase = Phase (phase + 1);
2685 continue;
2689 if (!prefix_len && c == '"')
2690 /* Failure mode lexing. */
2691 goto out;
2692 else if (prefix_len && c == ')')
2693 phase = PHASE_SUFFIX;
2694 else if (!read_note && c == '\n')
2696 pos--;
2697 pfile->buffer->cur = pos;
2698 if (pfile->state.in_directive
2699 || (pfile->state.parsing_args
2700 && pfile->buffer->next_line >= pfile->buffer->rlimit))
2702 cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, 0,
2703 "unterminated raw string");
2704 type = CPP_OTHER;
2705 goto out;
/* The string continues on the next line: bank this line, including its
   newline, in ACCUM before the line buffer is replaced.  */
2708 accum.append (pfile, base, pos - base + 1);
2709 _cpp_process_line_notes (pfile, false);
2711 if (pfile->buffer->next_line < pfile->buffer->rlimit)
2712 CPP_INCREMENT_LINE (pfile, 0);
2713 pfile->buffer->need_line = true;
2715 if (!_cpp_get_fresh_line (pfile))
2717 /* We ran out of file and failed to get a line. */
2718 location_t src_loc = token->src_loc;
2719 token->type = CPP_EOF;
2720 /* Tell the compiler the line number of the EOF token. */
2721 token->src_loc = pfile->line_table->highest_line;
2722 token->flags = BOL;
2723 if (accum.first)
2724 _cpp_release_buff (pfile, accum.first);
2725 cpp_error_with_line (pfile, CPP_DL_ERROR, src_loc, 0,
2726 "unterminated raw string");
2727 /* Now pop the buffer that _cpp_get_fresh_line did not. */
2728 _cpp_pop_buffer (pfile);
2729 return;
2732 pos = base = pfile->buffer->cur;
2733 note = &pfile->buffer->notes[pfile->buffer->cur_note];
2735 else if (__builtin_expect ((unsigned char) c >= utf8_continuation, 0)
2736 && warn_bidi_or_invalid_utf8_p)
2737 pos = _cpp_handle_multibyte_utf8 (pfile, c, pos, warn_bidi_p,
2738 warn_invalid_utf8_p);
2741 if (warn_bidi_p)
2742 maybe_warn_bidi_on_close (pfile, pos);
2744 if (CPP_OPTION (pfile, user_literals))
2746 /* If a string format macro, say from inttypes.h, is placed touching
2747 a string literal it could be parsed as a C++11 user-defined string
2748 literal thus breaking the program. */
2749 if (is_macro_not_literal_suffix (pfile, pos))
2751 /* Raise a warning, but do not consume subsequent tokens. */
2752 if (CPP_OPTION (pfile, warn_literal_suffix) && !pfile->state.skipping)
2753 cpp_warning_with_line (pfile, CPP_W_LITERAL_SUFFIX,
2754 token->src_loc, 0,
2755 "invalid suffix on literal; C++11 requires "
2756 "a space between literal and string macro");
2758 /* Grab user defined literal suffix. */
2759 else if (ISIDST (*pos))
2761 type = cpp_userdef_string_add_type (type);
2762 ++pos;
2764 while (ISIDNUM (*pos))
2765 ++pos;
/* Build the token.  If nothing was accumulated the spelling is taken
   straight from the line buffer; otherwise the accumulated buffers are
   concatenated, followed by the not yet accumulated tail from BASE up
   to POS, and NUL-terminated.  */
2769 out:
2770 pfile->buffer->cur = pos;
2771 if (!accum.accum)
2772 create_literal (pfile, token, base, pos - base, type);
2773 else
2775 size_t extra_len = pos - base;
2776 uchar *dest = _cpp_unaligned_alloc (pfile, accum.accum + extra_len + 1);
2778 token->type = type;
2779 token->val.str.len = accum.accum + extra_len;
2780 token->val.str.text = dest;
2781 for (_cpp_buff *buf = accum.first; buf; buf = buf->next)
2783 size_t len = BUFF_FRONT (buf) - buf->base;
2784 memcpy (dest, buf->base, len);
2785 dest += len;
2787 _cpp_release_buff (pfile, accum.first);
2788 memcpy (dest, base, extra_len);
2789 dest[extra_len] = '\0';
2793 /* Lexes a string, character constant, or angle-bracketed header file
2794 name. The stored string contains the spelling, including opening
2795 quote and any leading 'L', 'u', 'U' or 'u8' and optional
2796 'R' modifier. It returns the type of the literal, or CPP_OTHER
2797 if it was not properly terminated, or CPP_LESS for an unterminated
2798 header name which must be relexed as normal tokens.
2800 The spelling is NUL-terminated, but it is not guaranteed that this
2801 is the first NUL since embedded NULs are preserved. */
/* PFILE is the reader, TOKEN receives the result and BASE points at the
   first character of the literal, including any encoding prefix.  The
   type is reported through TOKEN->type; the function itself returns
   nothing.  In the CPP_LESS case only TOKEN->type is set and
   pfile->buffer->cur is left untouched.  */
2802 static void
2803 lex_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
2805 bool saw_NUL = false;
2806 const uchar *cur;
2807 cppchar_t terminator;
2808 enum cpp_ttype type;
/* Step over an optional encoding prefix (L, U, u or u8); TERMINATOR
   ends up holding the character that follows it.  */
2810 cur = base;
2811 terminator = *cur++;
2812 if (terminator == 'L' || terminator == 'U')
2813 terminator = *cur++;
2814 else if (terminator == 'u')
2816 terminator = *cur++;
2817 if (terminator == '8')
2818 terminator = *cur++;
/* An R at this point introduces a raw string, which is lexed from BASE
   by lex_raw_string instead.  */
2820 if (terminator == 'R')
2822 lex_raw_string (pfile, token, base);
2823 return;
/* Choose the token type from the quote character and the prefix.
   Anything other than a quote is treated as the start of a header name
   terminated by '>'.  */
2825 if (terminator == '"')
2826 type = (*base == 'L' ? CPP_WSTRING :
2827 *base == 'U' ? CPP_STRING32 :
2828 *base == 'u' ? (base[1] == '8' ? CPP_UTF8STRING : CPP_STRING16)
2829 : CPP_STRING);
2830 else if (terminator == '\'')
2831 type = (*base == 'L' ? CPP_WCHAR :
2832 *base == 'U' ? CPP_CHAR32 :
2833 *base == 'u' ? (base[1] == '8' ? CPP_UTF8CHAR : CPP_CHAR16)
2834 : CPP_CHAR);
2835 else
2836 terminator = '>', type = CPP_HEADER_NAME;
2838 const bool warn_bidi_p = pfile->warn_bidi_p ();
2839 const bool warn_invalid_utf8_p = CPP_OPTION (pfile, cpp_warn_invalid_utf8);
2840 const bool warn_bidi_or_invalid_utf8_p = warn_bidi_p | warn_invalid_utf8_p;
/* Scan up to the terminator or the end of the line.  */
2841 for (;;)
2843 cppchar_t c = *cur++;
2845 /* In #include-style directives, terminators are not escapable. */
2846 if (c == '\\' && !pfile->state.angled_headers && *cur != '\n')
2848 if ((cur[0] == 'u' || cur[0] == 'U' || cur[0] == 'N') && warn_bidi_p)
2850 location_t loc;
2851 bidi::kind kind;
2852 if (cur[0] == 'N')
2853 kind = get_bidi_named (pfile, cur + 1, &loc);
2854 else
2855 kind = get_bidi_ucn (pfile, cur + 1, cur[0] == 'U', &loc);
2856 maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/true, loc);
2858 cur++;
2860 else if (c == terminator)
2862 if (warn_bidi_p)
2863 maybe_warn_bidi_on_close (pfile, cur - 1);
2864 break;
2866 else if (c == '\n')
2868 cur--;
2869 /* Unmatched quotes always yield undefined behavior, but
2870 greedy lexing means that what appears to be an unterminated
2871 header name may actually be a legitimate sequence of tokens. */
2872 if (terminator == '>')
2874 token->type = CPP_LESS;
2875 return;
2877 type = CPP_OTHER;
2878 break;
2880 else if (c == '\0')
2881 saw_NUL = true;
2882 else if (__builtin_expect (c >= utf8_continuation, 0)
2883 && warn_bidi_or_invalid_utf8_p)
2884 cur = _cpp_handle_multibyte_utf8 (pfile, c, cur, warn_bidi_p,
2885 warn_invalid_utf8_p);
2888 if (saw_NUL && !pfile->state.skipping)
2889 cpp_error (pfile, CPP_DL_WARNING,
2890 "null character(s) preserved in literal");
2892 if (type == CPP_OTHER && CPP_OPTION (pfile, lang) != CLK_ASM)
2893 cpp_error (pfile, CPP_DL_PEDWARN, "missing terminating %c character",
2894 (int) terminator);
/* With user-defined literals enabled, an identifier touching the
   literal is either a macro, which only gets a warning and is not
   consumed, or a literal suffix that becomes part of this token.  */
2896 if (CPP_OPTION (pfile, user_literals))
2898 /* If a string format macro, say from inttypes.h, is placed touching
2899 a string literal it could be parsed as a C++11 user-defined string
2900 literal thus breaking the program. */
2901 if (is_macro_not_literal_suffix (pfile, cur))
2903 /* Raise a warning, but do not consume subsequent tokens. */
2904 if (CPP_OPTION (pfile, warn_literal_suffix) && !pfile->state.skipping)
2905 cpp_warning_with_line (pfile, CPP_W_LITERAL_SUFFIX,
2906 token->src_loc, 0,
2907 "invalid suffix on literal; C++11 requires "
2908 "a space between literal and string macro");
2910 /* Grab user defined literal suffix. */
2911 else if (ISIDST (*cur))
2913 type = cpp_userdef_char_add_type (type);
2914 type = cpp_userdef_string_add_type (type);
2915 ++cur;
2917 while (ISIDNUM (*cur))
2918 ++cur;
2921 else if (CPP_OPTION (pfile, cpp_warn_cxx11_compat)
2922 && is_macro (pfile, cur)
2923 && !pfile->state.skipping)
2924 cpp_warning_with_line (pfile, CPP_W_CXX11_COMPAT,
2925 token->src_loc, 0, "C++11 requires a space "
2926 "between string literal and macro");
/* Commit the consumed text and record the spelling from BASE up to
   CUR.  */
2928 pfile->buffer->cur = cur;
2929 create_literal (pfile, token, base, cur - base, type);
2932 /* Return the comment table. The client may not make any assumption
2933 about the ordering of the table. */
2934 cpp_comment_table *
2935 cpp_get_comments (cpp_reader *pfile)
2937 return &pfile->comments;
2940 /* Append a comment to the end of the comment table. */
2941 static void
2942 store_comment (cpp_reader *pfile, cpp_token *token)
2944 int len;
2946 if (pfile->comments.allocated == 0)
2948 pfile->comments.allocated = 256;
2949 pfile->comments.entries = (cpp_comment *) xmalloc
2950 (pfile->comments.allocated * sizeof (cpp_comment));
2953 if (pfile->comments.count == pfile->comments.allocated)
2955 pfile->comments.allocated *= 2;
2956 pfile->comments.entries = (cpp_comment *) xrealloc
2957 (pfile->comments.entries,
2958 pfile->comments.allocated * sizeof (cpp_comment));
2961 len = token->val.str.len;
2963 /* Copy comment. Note, token may not be NULL terminated. */
2964 pfile->comments.entries[pfile->comments.count].comment =
2965 (char *) xmalloc (sizeof (char) * (len + 1));
2966 memcpy (pfile->comments.entries[pfile->comments.count].comment,
2967 token->val.str.text, len);
2968 pfile->comments.entries[pfile->comments.count].comment[len] = '\0';
2970 /* Set source location. */
2971 pfile->comments.entries[pfile->comments.count].sloc = token->src_loc;
2973 /* Increment the count of entries in the comment table. */
2974 pfile->comments.count++;
2977 /* The stored comment includes the comment start and any terminator. */
2978 static void
2979 save_comment (cpp_reader *pfile, cpp_token *token, const unsigned char *from,
2980 cppchar_t type)
2982 unsigned char *buffer;
2983 unsigned int len, clen, i;
2985 len = pfile->buffer->cur - from + 1; /* + 1 for the initial '/'. */
2987 /* C++ comments probably (not definitely) have moved past a new
2988 line, which we don't want to save in the comment. */
2989 if (is_vspace (pfile->buffer->cur[-1]))
2990 len--;
2992 /* If we are currently in a directive or in argument parsing, then
2993 we need to store all C++ comments as C comments internally, and
2994 so we need to allocate a little extra space in that case.
2996 Note that the only time we encounter a directive here is
2997 when we are saving comments in a "#define". */
2998 clen = ((pfile->state.in_directive || pfile->state.parsing_args)
2999 && type == '/') ? len + 2 : len;
3001 buffer = _cpp_unaligned_alloc (pfile, clen);
3003 token->type = CPP_COMMENT;
3004 token->val.str.len = clen;
3005 token->val.str.text = buffer;
3007 buffer[0] = '/';
3008 memcpy (buffer + 1, from, len - 1);
3010 /* Finish conversion to a C comment, if necessary. */
3011 if ((pfile->state.in_directive || pfile->state.parsing_args) && type == '/')
3013 buffer[1] = '*';
3014 buffer[clen - 2] = '*';
3015 buffer[clen - 1] = '/';
3016 /* As there can be in a C++ comments illegal sequences for C comments
3017 we need to filter them out. */
3018 for (i = 2; i < (clen - 2); i++)
3019 if (buffer[i] == '/' && (buffer[i - 1] == '*' || buffer[i + 1] == '*'))
3020 buffer[i] = '|';
3023 /* Finally store this comment for use by clients of libcpp. */
3024 store_comment (pfile, token);
3027 /* Returns true if comment at COMMENT_START is a recognized FALLTHROUGH
3028 comment. */
/* PFILE supplies the recognition level, read from the
   cpp_warn_implicit_fallthrough option, and pfile->buffer->cur is used
   as the upper bound in the length checks on the comment text.
   COMMENT_START points at the second character of the comment opener:
   '*' selects the block comment rules, anything else is handled as a
   line comment.  Matching starts at the character after it.  */
3030 static bool
3031 fallthrough_comment_p (cpp_reader *pfile, const unsigned char *comment_start)
3033 const unsigned char *from = comment_start + 1;
3035 switch (CPP_OPTION (pfile, cpp_warn_implicit_fallthrough))
3037 /* For both -Wimplicit-fallthrough=0 and -Wimplicit-fallthrough=5 we
3038 don't recognize any comments. The latter only checks attributes,
3039 the former doesn't warn. */
3040 case 0:
3041 default:
3042 return false;
3043 /* -Wimplicit-fallthrough=1 considers any comment, no matter what
3044 content it has. */
3045 case 1:
3046 return true;
3047 case 2:
3048 /* -Wimplicit-fallthrough=2 looks for (case insensitive)
3049 .*falls?[ \t-]*thr(u|ough).* regex. */
3050 for (; (size_t) (pfile->buffer->cur - from) >= sizeof "fallthru" - 1;
3051 from++)
3053 /* Is there anything like strpbrk with upper boundary, or
3054 memchr looking for 2 characters rather than just one? */
3055 if (from[0] != 'f' && from[0] != 'F')
3056 continue;
3057 if (from[1] != 'a' && from[1] != 'A')
3058 continue;
3059 if (from[2] != 'l' && from[2] != 'L')
3060 continue;
3061 if (from[3] != 'l' && from[3] != 'L')
3062 continue;
/* FROM itself is advanced past the partial match, so if the rest fails
   to match the scan resumes after the text already consumed rather
   than at the character following the 'f'.  */
3063 from += sizeof "fall" - 1;
3064 if (from[0] == 's' || from[0] == 'S')
3065 from++;
3066 while (*from == ' ' || *from == '\t' || *from == '-')
3067 from++;
3068 if (from[0] != 't' && from[0] != 'T')
3069 continue;
3070 if (from[1] != 'h' && from[1] != 'H')
3071 continue;
3072 if (from[2] != 'r' && from[2] != 'R')
3073 continue;
3074 if (from[3] == 'u' || from[3] == 'U')
3075 return true;
3076 if (from[3] != 'o' && from[3] != 'O')
3077 continue;
3078 if (from[4] != 'u' && from[4] != 'U')
3079 continue;
3080 if (from[5] != 'g' && from[5] != 'G')
3081 continue;
3082 if (from[6] != 'h' && from[6] != 'H')
3083 continue;
3084 return true;
3086 return false;
/* Levels 3 and 4 leave the switch and use the whole-comment patterns
   that follow; the final check requires the pattern to run right up
   to the end of the comment.  */
3087 case 3:
3088 case 4:
3089 break;
3092 /* Whole comment contents:
3093 -fallthrough
3094 @fallthrough@
3096 if (*from == '-' || *from == '@')
3098 size_t len = sizeof "fallthrough" - 1;
3099 if ((size_t) (pfile->buffer->cur - from - 1) < len)
3100 return false;
3101 if (memcmp (from + 1, "fallthrough", len))
3102 return false;
3103 if (*from == '@')
3105 if (from[len + 1] != '@')
3106 return false;
3107 len++;
3109 from += 1 + len;
3111 /* Whole comment contents (regex):
3112 lint -fallthrough[ \t]*
3114 else if (*from == 'l')
3116 size_t len = sizeof "int -fallthrough" - 1;
3117 if ((size_t) (pfile->buffer->cur - from - 1) < len)
3118 return false;
3119 if (memcmp (from + 1, "int -fallthrough", len))
3120 return false;
3121 from += 1 + len;
3122 while (*from == ' ' || *from == '\t')
3123 from++;
3125 /* Whole comment contents (regex):
3126 [ \t]*FALLTHR(U|OUGH)[ \t]*
3128 else if (CPP_OPTION (pfile, cpp_warn_implicit_fallthrough) == 4)
3130 while (*from == ' ' || *from == '\t')
3131 from++;
3132 if ((size_t) (pfile->buffer->cur - from) < sizeof "FALLTHRU" - 1)
3133 return false;
3134 if (memcmp (from, "FALLTHR", sizeof "FALLTHR" - 1))
3135 return false;
3136 from += sizeof "FALLTHR" - 1;
3137 if (*from == 'U')
3138 from++;
3139 else if ((size_t) (pfile->buffer->cur - from) < sizeof "OUGH" - 1)
3140 return false;
3141 else if (memcmp (from, "OUGH", sizeof "OUGH" - 1))
3142 return false;
3143 else
3144 from += sizeof "OUGH" - 1;
3145 while (*from == ' ' || *from == '\t')
3146 from++;
3148 /* Whole comment contents (regex):
3149 [ \t.!]*(ELSE,? |INTENTIONAL(LY)? )?FALL(S | |-)?THR(OUGH|U)[ \t.!]*(-[^\n\r]*)?
3150 [ \t.!]*(Else,? |Intentional(ly)? )?Fall((s | |-)[Tt]|t)hr(ough|u)[ \t.!]*(-[^\n\r]*)?
3151 [ \t.!]*([Ee]lse,? |[Ii]ntentional(ly)? )?fall(s | |-)?thr(ough|u)[ \t.!]*(-[^\n\r]*)?
3153 else
3155 while (*from == ' ' || *from == '\t' || *from == '.' || *from == '!')
3156 from++;
3157 unsigned char f = *from;
3158 bool all_upper = false;
3159 if (f == 'E' || f == 'e')
3161 if ((size_t) (pfile->buffer->cur - from)
3162 < sizeof "else fallthru" - 1)
3163 return false;
3164 if (f == 'E' && memcmp (from + 1, "LSE", sizeof "LSE" - 1) == 0)
3165 all_upper = true;
3166 else if (memcmp (from + 1, "lse", sizeof "lse" - 1))
3167 return false;
3168 from += sizeof "else" - 1;
3169 if (*from == ',')
3170 from++;
3171 if (*from != ' ')
3172 return false;
3173 from++;
3174 if (all_upper && *from == 'f')
3175 return false;
3176 if (f == 'e' && *from == 'F')
3177 return false;
3178 f = *from;
3180 else if (f == 'I' || f == 'i')
3182 if ((size_t) (pfile->buffer->cur - from)
3183 < sizeof "intentional fallthru" - 1)
3184 return false;
3185 if (f == 'I' && memcmp (from + 1, "NTENTIONAL",
3186 sizeof "NTENTIONAL" - 1) == 0)
3187 all_upper = true;
3188 else if (memcmp (from + 1, "ntentional",
3189 sizeof "ntentional" - 1))
3190 return false;
3191 from += sizeof "intentional" - 1;
3192 if (*from == ' ')
3194 from++;
3195 if (all_upper && *from == 'f')
3196 return false;
3198 else if (all_upper)
3200 if (memcmp (from, "LY F", sizeof "LY F" - 1))
3201 return false;
3202 from += sizeof "LY " - 1;
3204 else
3206 if (memcmp (from, "ly ", sizeof "ly " - 1))
3207 return false;
3208 from += sizeof "ly " - 1;
3210 if (f == 'i' && *from == 'F')
3211 return false;
3212 f = *from;
3214 if (f != 'F' && f != 'f')
3215 return false;
3216 if ((size_t) (pfile->buffer->cur - from) < sizeof "fallthru" - 1)
3217 return false;
3218 if (f == 'F' && memcmp (from + 1, "ALL", sizeof "ALL" - 1) == 0)
3219 all_upper = true;
3220 else if (all_upper)
3221 return false;
3222 else if (memcmp (from + 1, "all", sizeof "all" - 1))
3223 return false;
3224 from += sizeof "fall" - 1;
3225 if (*from == (all_upper ? 'S' : 's') && from[1] == ' ')
3226 from += 2;
3227 else if (*from == ' ' || *from == '-')
3228 from++;
3229 else if (*from != (all_upper ? 'T' : 't'))
3230 return false;
3231 if ((f == 'f' || *from != 'T') && (all_upper || *from != 't'))
3232 return false;
3233 if ((size_t) (pfile->buffer->cur - from) < sizeof "thru" - 1)
3234 return false;
3235 if (memcmp (from + 1, all_upper ? "HRU" : "hru", sizeof "hru" - 1))
3237 if ((size_t) (pfile->buffer->cur - from) < sizeof "through" - 1)
3238 return false;
3239 if (memcmp (from + 1, all_upper ? "HROUGH" : "hrough",
3240 sizeof "hrough" - 1))
3241 return false;
3242 from += sizeof "through" - 1;
3244 else
3245 from += sizeof "thru" - 1;
3246 while (*from == ' ' || *from == '\t' || *from == '.' || *from == '!')
3247 from++;
3248 if (*from == '-')
3250 from++;
3251 if (*comment_start == '*')
3255 while (*from && *from != '*'
3256 && *from != '\n' && *from != '\r')
3257 from++;
3258 if (*from != '*' || from[1] == '/')
3259 break;
3260 from++;
3262 while (1);
3264 else
3265 while (*from && *from != '\n' && *from != '\r')
3266 from++;
3269 /* C block comment. */
3270 if (*comment_start == '*')
3272 if (*from != '*' || from[1] != '/')
3273 return false;
3275 /* C++ line comment. */
3276 else if (*from != '\n')
3277 return false;
3279 return true;
3282 /* Allocate COUNT tokens for RUN. */
3283 void
3284 _cpp_init_tokenrun (tokenrun *run, unsigned int count)
3286 run->base = XNEWVEC (cpp_token, count);
3287 run->limit = run->base + count;
3288 run->next = NULL;
3291 /* Returns the next tokenrun, or creates one if there is none. */
3292 static tokenrun *
3293 next_tokenrun (tokenrun *run)
3295 if (run->next == NULL)
3297 run->next = XNEW (tokenrun);
3298 run->next->prev = run;
3299 _cpp_init_tokenrun (run->next, 250);
3302 return run->next;
3305 /* Return the number of not yet processed token in a given
3306 context. */
3308 _cpp_remaining_tokens_num_in_context (cpp_context *context)
3310 if (context->tokens_kind == TOKENS_KIND_DIRECT)
3311 return (LAST (context).token - FIRST (context).token);
3312 else if (context->tokens_kind == TOKENS_KIND_INDIRECT
3313 || context->tokens_kind == TOKENS_KIND_EXTENDED)
3314 return (LAST (context).ptoken - FIRST (context).ptoken);
3315 else
3316 abort ();
3319 /* Returns the token present at index INDEX in a given context. If
3320 INDEX is zero, the next token to be processed is returned. */
3321 static const cpp_token*
3322 _cpp_token_from_context_at (cpp_context *context, int index)
3324 if (context->tokens_kind == TOKENS_KIND_DIRECT)
3325 return &(FIRST (context).token[index]);
3326 else if (context->tokens_kind == TOKENS_KIND_INDIRECT
3327 || context->tokens_kind == TOKENS_KIND_EXTENDED)
3328 return FIRST (context).ptoken[index];
3329 else
3330 abort ();
3333 /* Look ahead in the input stream. */
3334 const cpp_token *
3335 cpp_peek_token (cpp_reader *pfile, int index)
3337 cpp_context *context = pfile->context;
3338 const cpp_token *peektok;
3339 int count;
3341 /* First, scan through any pending cpp_context objects. */
3342 while (context->prev)
3344 ptrdiff_t sz = _cpp_remaining_tokens_num_in_context (context);
3346 if (index < (int) sz)
3347 return _cpp_token_from_context_at (context, index);
3348 index -= (int) sz;
3349 context = context->prev;
3352 /* We will have to read some new tokens after all (and do so
3353 without invalidating preceding tokens). */
3354 count = index;
3355 pfile->keep_tokens++;
3357 /* For peeked tokens temporarily disable line_change reporting,
3358 until the tokens are parsed for real. */
3359 void (*line_change) (cpp_reader *, const cpp_token *, int)
3360 = pfile->cb.line_change;
3361 pfile->cb.line_change = NULL;
3365 peektok = _cpp_lex_token (pfile);
3366 if (peektok->type == CPP_EOF)
3368 index--;
3369 break;
3371 else if (peektok->type == CPP_PRAGMA)
3373 /* Don't peek past a pragma. */
3374 if (peektok == &pfile->directive_result)
3375 /* Save the pragma in the buffer. */
3376 *pfile->cur_token++ = *peektok;
3377 index--;
3378 break;
3381 while (index--);
3383 _cpp_backup_tokens_direct (pfile, count - index);
3384 pfile->keep_tokens--;
3385 pfile->cb.line_change = line_change;
3387 return peektok;
3390 /* Allocate a single token that is invalidated at the same time as the
3391 rest of the tokens on the line. Has its line and col set to the
3392 same as the last lexed token, so that diagnostics appear in the
3393 right place. */
3394 cpp_token *
3395 _cpp_temp_token (cpp_reader *pfile)
3397 cpp_token *old, *result;
3398 ptrdiff_t sz = pfile->cur_run->limit - pfile->cur_token;
3399 ptrdiff_t la = (ptrdiff_t) pfile->lookaheads;
3401 old = pfile->cur_token - 1;
3402 /* Any pre-existing lookaheads must not be clobbered. */
3403 if (la)
3405 if (sz <= la)
3407 tokenrun *next = next_tokenrun (pfile->cur_run);
3409 if (sz < la)
3410 memmove (next->base + 1, next->base,
3411 (la - sz) * sizeof (cpp_token));
3413 next->base[0] = pfile->cur_run->limit[-1];
3416 if (sz > 1)
3417 memmove (pfile->cur_token + 1, pfile->cur_token,
3418 MIN (la, sz - 1) * sizeof (cpp_token));
3421 if (!sz && pfile->cur_token == pfile->cur_run->limit)
3423 pfile->cur_run = next_tokenrun (pfile->cur_run);
3424 pfile->cur_token = pfile->cur_run->base;
3427 result = pfile->cur_token++;
3428 result->src_loc = old->src_loc;
3429 return result;
3432 /* We're at the beginning of a logical line (so not in
3433 directives-mode) and RESULT is a CPP_NAME with NODE_MODULE set. See
3434 if we should enter deferred_pragma mode to tokenize the rest of the
3435 line as a module control-line. */
3437 static void
3438 cpp_maybe_module_directive (cpp_reader *pfile, cpp_token *result)
3440 unsigned backup = 0; /* Tokens we peeked. */
3441 cpp_hashnode *node = result->val.node.node;
3442 cpp_token *peek = result;
3443 cpp_token *keyword = peek;
3444 cpp_hashnode *(&n_modules)[spec_nodes::M_HWM][2] = pfile->spec_nodes.n_modules;
3445 int header_count = 0;
3447 /* Make sure the incoming state is as we expect it. This way we
3448 can restore it using constants. */
3449 gcc_checking_assert (!pfile->state.in_deferred_pragma
3450 && !pfile->state.skipping
3451 && !pfile->state.parsing_args
3452 && !pfile->state.angled_headers
3453 && (pfile->state.save_comments
3454 == !CPP_OPTION (pfile, discard_comments)));
3456 /* Enter directives mode sufficiently for peeking. We don't have
3457 to actually set in_directive. */
3458 pfile->state.in_deferred_pragma = true;
3460 /* These two fields are needed to process tokenization in deferred
3461 pragma mode. They are not used outside deferred pragma mode or
3462 directives mode. */
3463 pfile->state.pragma_allow_expansion = true;
3464 pfile->directive_line = result->src_loc;
3466 /* Saving comments is incompatible with directives mode. */
3467 pfile->state.save_comments = 0;
3469 if (node == n_modules[spec_nodes::M_EXPORT][0])
3471 peek = _cpp_lex_direct (pfile);
3472 keyword = peek;
3473 backup++;
3474 if (keyword->type != CPP_NAME)
3475 goto not_module;
3476 node = keyword->val.node.node;
3477 if (!(node->flags & NODE_MODULE))
3478 goto not_module;
3481 if (node == n_modules[spec_nodes::M__IMPORT][0])
3482 /* __import */
3483 header_count = backup + 2 + 16;
3484 else if (node == n_modules[spec_nodes::M_IMPORT][0])
3485 /* import */
3486 header_count = backup + 2 + (CPP_OPTION (pfile, preprocessed) ? 16 : 0);
3487 else if (node == n_modules[spec_nodes::M_MODULE][0])
3488 ; /* module */
3489 else
3490 goto not_module;
3492 /* We've seen [export] {module|import|__import}. Check the next token. */
3493 if (header_count)
3494 /* After '{,__}import' a header name may appear. */
3495 pfile->state.angled_headers = true;
3496 peek = _cpp_lex_direct (pfile);
3497 backup++;
3499 /* ... import followed by identifier, ':', '<' or
3500 header-name preprocessing tokens, or module
3501 followed by cpp-identifier, ':' or ';' preprocessing
3502 tokens. C++ keywords are not yet relevant. */
3503 if (peek->type == CPP_NAME
3504 || peek->type == CPP_COLON
3505 || (header_count
3506 ? (peek->type == CPP_LESS
3507 || (peek->type == CPP_STRING && peek->val.str.text[0] != 'R')
3508 || peek->type == CPP_HEADER_NAME)
3509 : peek->type == CPP_SEMICOLON))
3511 pfile->state.pragma_allow_expansion = !CPP_OPTION (pfile, preprocessed);
3512 if (!pfile->state.pragma_allow_expansion)
3513 pfile->state.prevent_expansion++;
3515 if (!header_count && linemap_included_from
3516 (LINEMAPS_LAST_ORDINARY_MAP (pfile->line_table)))
3517 cpp_error_with_line (pfile, CPP_DL_ERROR, keyword->src_loc, 0,
3518 "module control-line cannot be in included file");
3520 /* The first one or two tokens cannot be macro names. */
3521 for (int ix = backup; ix--;)
3523 cpp_token *tok = ix ? keyword : result;
3524 cpp_hashnode *node = tok->val.node.node;
3526 /* Don't attempt to expand the token. */
3527 tok->flags |= NO_EXPAND;
3528 if (_cpp_defined_macro_p (node)
3529 && _cpp_maybe_notify_macro_use (pfile, node, tok->src_loc)
3530 && !cpp_fun_like_macro_p (node))
3531 cpp_error_with_line (pfile, CPP_DL_ERROR, tok->src_loc, 0,
3532 "module control-line \"%s\" cannot be"
3533 " an object-like macro",
3534 NODE_NAME (node));
3537 /* Map to underbar variants. */
3538 keyword->val.node.node = n_modules[header_count
3539 ? spec_nodes::M_IMPORT
3540 : spec_nodes::M_MODULE][1];
3541 if (backup != 1)
3542 result->val.node.node = n_modules[spec_nodes::M_EXPORT][1];
3544 /* Maybe tell the tokenizer we expect a header-name down the
3545 road. */
3546 pfile->state.directive_file_token = header_count;
3548 else
3550 not_module:
3551 /* Drop out of directive mode. */
3552 /* We aaserted save_comments had this value upon entry. */
3553 pfile->state.save_comments
3554 = !CPP_OPTION (pfile, discard_comments);
3555 pfile->state.in_deferred_pragma = false;
3556 /* Do not let this remain on. */
3557 pfile->state.angled_headers = false;
3560 /* In either case we want to backup the peeked tokens. */
3561 if (backup)
3563 /* If we saw EOL, we should drop it, because this isn't a module
3564 control-line after all. */
3565 bool eol = peek->type == CPP_PRAGMA_EOL;
3566 if (!eol || backup > 1)
3568 /* Put put the peeked tokens back */
3569 _cpp_backup_tokens_direct (pfile, backup);
3570 /* But if the last one was an EOL, forget it. */
3571 if (eol)
3572 pfile->lookaheads--;
3577 /* Lex a token into RESULT (external interface). Takes care of issues
3578 like directive handling, token lookahead, multiple include
3579 optimization and skipping. */
3580 const cpp_token *
3581 _cpp_lex_token (cpp_reader *pfile)
3583 cpp_token *result;
3585 for (;;)
3587 if (pfile->cur_token == pfile->cur_run->limit)
3589 pfile->cur_run = next_tokenrun (pfile->cur_run);
3590 pfile->cur_token = pfile->cur_run->base;
3592 /* We assume that the current token is somewhere in the current
3593 run. */
3594 if (pfile->cur_token < pfile->cur_run->base
3595 || pfile->cur_token >= pfile->cur_run->limit)
3596 abort ();
3598 if (pfile->lookaheads)
3600 pfile->lookaheads--;
3601 result = pfile->cur_token++;
3603 else
3604 result = _cpp_lex_direct (pfile);
3606 if (result->flags & BOL)
3608 /* Is this a directive. If _cpp_handle_directive returns
3609 false, it is an assembler #. */
3610 if (result->type == CPP_HASH
3611 /* 6.10.3 p 11: Directives in a list of macro arguments
3612 gives undefined behavior. This implementation
3613 handles the directive as normal. */
3614 && pfile->state.parsing_args != 1)
3616 if (_cpp_handle_directive (pfile, result->flags & PREV_WHITE))
3618 if (pfile->directive_result.type == CPP_PADDING)
3619 continue;
3620 result = &pfile->directive_result;
3623 else if (pfile->state.in_deferred_pragma)
3624 result = &pfile->directive_result;
3625 else if (result->type == CPP_NAME
3626 && (result->val.node.node->flags & NODE_MODULE)
3627 && !pfile->state.skipping
3628 /* Unlike regular directives, we do not deal with
3629 tokenizing module directives as macro arguments.
3630 That's not permitted. */
3631 && !pfile->state.parsing_args)
3633 /* P1857. Before macro expansion, At start of logical
3634 line ... */
3635 /* We don't have to consider lookaheads at this point. */
3636 gcc_checking_assert (!pfile->lookaheads);
3638 cpp_maybe_module_directive (pfile, result);
3641 if (pfile->cb.line_change && !pfile->state.skipping)
3642 pfile->cb.line_change (pfile, result, pfile->state.parsing_args);
3645 /* We don't skip tokens in directives. */
3646 if (pfile->state.in_directive || pfile->state.in_deferred_pragma)
3647 break;
3649 /* Outside a directive, invalidate controlling macros. At file
3650 EOF, _cpp_lex_direct takes care of popping the buffer, so we never
3651 get here and MI optimization works. */
3652 pfile->mi_valid = false;
3654 if (!pfile->state.skipping || result->type == CPP_EOF)
3655 break;
3658 return result;
3661 /* Returns true if a fresh line has been loaded. */
3662 bool
3663 _cpp_get_fresh_line (cpp_reader *pfile)
3665 /* We can't get a new line until we leave the current directive. */
3666 if (pfile->state.in_directive)
3667 return false;
3669 for (;;)
3671 cpp_buffer *buffer = pfile->buffer;
3673 if (!buffer->need_line)
3674 return true;
3676 if (buffer->next_line < buffer->rlimit)
3678 _cpp_clean_line (pfile);
3679 return true;
3682 /* First, get out of parsing arguments state. */
3683 if (pfile->state.parsing_args)
3684 return false;
3686 /* End of buffer. Non-empty files should end in a newline. */
3687 if (buffer->buf != buffer->rlimit
3688 && buffer->next_line > buffer->rlimit
3689 && !buffer->from_stage3)
3691 /* Clip to buffer size. */
3692 buffer->next_line = buffer->rlimit;
3695 if (buffer->prev && !buffer->return_at_eof)
3696 _cpp_pop_buffer (pfile);
3697 else
3699 /* End of translation. Do not pop the buffer yet. Increment
3700 line number so that the EOF token is on a line of its own
3701 (_cpp_lex_direct doesn't increment in that case, because
3702 it's hard for it to distinguish this special case). */
3703 CPP_INCREMENT_LINE (pfile, 0);
3704 return false;
/* Helper for the lexer's operator cases.  Expects `buffer' and `result'
   to be in scope at the point of use.  Sets result->type to ELSE_TYPE;
   if the next unread character (*buffer->cur) equals CHAR, consumes it
   and sets result->type to THEN_TYPE instead.  Every parameter is
   parenthesized so that an expression argument (for instance a
   conditional expression) cannot be re-associated with the surrounding
   `==' or `='.  */
#define IF_NEXT_IS(CHAR, THEN_TYPE, ELSE_TYPE)          \
  do                                                    \
    {                                                   \
      result->type = (ELSE_TYPE);                       \
      if (*buffer->cur == (CHAR))                       \
        buffer->cur++, result->type = (THEN_TYPE);      \
    }                                                   \
  while (0)
3718 /* Lex a token into pfile->cur_token, which is also incremented, to
3719 get diagnostics pointing to the correct location.
3721 Does not handle issues such as token lookahead, multiple-include
3722 optimization, directives, skipping etc. This function is only
3723 suitable for use by _cpp_lex_token, and in special cases like
3724 lex_expansion_token which doesn't care for any of these issues.
3726 When meeting a newline, returns CPP_EOF if parsing a directive,
3727 otherwise returns to the start of the token buffer if permissible.
3728 Returns a pointer to the lexed token. */
3729 cpp_token *
3730 _cpp_lex_direct (cpp_reader *pfile)
3732 cppchar_t c;
3733 cpp_buffer *buffer;
3734 const unsigned char *comment_start;
/* Set once a comment recognized by fallthrough_comment_p has been
   skipped; a following identifier or saved comment token is then
   flagged PREV_FALLTHROUGH. */
3735 bool fallthrough_comment = false;
3736 cpp_token *result = pfile->cur_token++;
3738 fresh_line:
3739 result->flags = 0;
3740 buffer = pfile->buffer;
3741 if (buffer->need_line)
3743 if (pfile->state.in_deferred_pragma)
3745 /* This can happen in cases like:
3746 #define loop(x) whatever
3747 #pragma omp loop
3748 where when trying to expand loop we need to peek
3749 next token after loop, but aren't still in_deferred_pragma
3750 mode but are in in_directive mode, so buffer->need_line
3751 is set, a CPP_EOF is peeked. */
3752 result->type = CPP_PRAGMA_EOL;
3753 pfile->state.in_deferred_pragma = false;
3754 if (!pfile->state.pragma_allow_expansion)
3755 pfile->state.prevent_expansion--;
3756 return result;
3758 if (!_cpp_get_fresh_line (pfile))
3760 result->type = CPP_EOF;
3761 /* Not a real EOF in a directive or arg parsing -- we refuse
3762 to advance to the next file now, and will once we're out
3763 of those modes. */
3764 if (!pfile->state.in_directive && !pfile->state.parsing_args)
3766 /* Tell the compiler the line number of the EOF token. */
3767 result->src_loc = pfile->line_table->highest_line;
3768 result->flags = BOL;
3769 /* Now pop the buffer that _cpp_get_fresh_line did not. */
3770 _cpp_pop_buffer (pfile);
3772 return result;
/* A different buffer became current while fetching the line, so a
   FALLTHROUGH comment seen earlier no longer applies. */
3774 if (buffer != pfile->buffer)
3775 fallthrough_comment = false;
/* Unless tokens are being kept, restart at the beginning of the base
   token run for the new line. */
3776 if (!pfile->keep_tokens)
3778 pfile->cur_run = &pfile->base_run;
3779 result = pfile->base_run.base;
3780 pfile->cur_token = result + 1;
3782 result->flags = BOL;
3783 if (pfile->state.parsing_args == 2)
3784 result->flags |= PREV_WHITE;
3786 buffer = pfile->buffer;
3787 update_tokens_line:
3788 result->src_loc = pfile->line_table->highest_line;
3790 skipped_white:
3791 if (buffer->cur >= buffer->notes[buffer->cur_note].pos
3792 && !pfile->overlaid_buffer)
3794 _cpp_process_line_notes (pfile, false);
3795 result->src_loc = pfile->line_table->highest_line;
3797 c = *buffer->cur++;
3799 if (pfile->forced_token_location)
3800 result->src_loc = pfile->forced_token_location;
3801 else
3802 result->src_loc = linemap_position_for_column (pfile->line_table,
3803 CPP_BUF_COLUMN (buffer, buffer->cur));
3805 switch (c)
3807 case ' ': case '\t': case '\f': case '\v': case '\0':
3808 result->flags |= PREV_WHITE;
3809 skip_whitespace (pfile, c);
3810 goto skipped_white;
3812 case '\n':
3813 /* Increment the line, unless this is the last line ... */
3814 if (buffer->cur < buffer->rlimit
3815 /* ... or this is a #include, (where _cpp_stack_file needs to
3816 unwind by one line) ... */
3817 || (pfile->state.in_directive > 1
3818 /* ... except traditional-cpp increments this elsewhere. */
3819 && !CPP_OPTION (pfile, traditional)))
3820 CPP_INCREMENT_LINE (pfile, 0);
3821 buffer->need_line = true;
3822 if (pfile->state.in_deferred_pragma)
3824 /* Produce the PRAGMA_EOL on this line. File reading
3825 ensures there is always a \n at end of the buffer, thus
3826 in a deferred pragma we always see CPP_PRAGMA_EOL before
3827 any CPP_EOF. */
3828 result->type = CPP_PRAGMA_EOL;
3829 result->flags &= ~PREV_WHITE;
3830 pfile->state.in_deferred_pragma = false;
3831 if (!pfile->state.pragma_allow_expansion)
3832 pfile->state.prevent_expansion--;
3833 return result;
3835 goto fresh_line;
3837 case '0': case '1': case '2': case '3': case '4':
3838 case '5': case '6': case '7': case '8': case '9':
3840 struct normalize_state nst = INITIAL_NORMALIZE_STATE;
3841 result->type = CPP_NUMBER;
3842 lex_number (pfile, &result->val.str, &nst);
3843 warn_about_normalization (pfile, result, &nst, false);
3844 break;
/* These four letters are missing from the identifier case labels
   below because they are handled here first; when no literal
   follows, control falls through and they start an identifier. */
3847 case 'L':
3848 case 'u':
3849 case 'U':
3850 case 'R':
3851 /* 'L', 'u', 'U', 'u8' or 'R' may introduce wide characters,
3852 wide strings or raw strings. */
3853 if (c == 'L' || CPP_OPTION (pfile, rliterals)
3854 || (c != 'R' && CPP_OPTION (pfile, uliterals)))
3856 if ((*buffer->cur == '\'' && c != 'R')
3857 || *buffer->cur == '"'
3858 || (*buffer->cur == 'R'
3859 && c != 'R'
3860 && buffer->cur[1] == '"'
3861 && CPP_OPTION (pfile, rliterals))
3862 || (*buffer->cur == '8'
3863 && c == 'u'
3864 && ((buffer->cur[1] == '"' || (buffer->cur[1] == '\''
3865 && CPP_OPTION (pfile, utf8_char_literals)))
3866 || (buffer->cur[1] == 'R' && buffer->cur[2] == '"'
3867 && CPP_OPTION (pfile, rliterals)))))
3869 lex_string (pfile, result, buffer->cur - 1);
3870 break;
3873 /* Fall through. */
3875 case '_':
3876 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
3877 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
3878 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
3879 case 's': case 't': case 'v': case 'w': case 'x':
3880 case 'y': case 'z':
3881 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
3882 case 'G': case 'H': case 'I': case 'J': case 'K':
3883 case 'M': case 'N': case 'O': case 'P': case 'Q':
3884 case 'S': case 'T': case 'V': case 'W': case 'X':
3885 case 'Y': case 'Z':
3886 result->type = CPP_NAME;
3888 struct normalize_state nst = INITIAL_NORMALIZE_STATE;
3889 result->val.node.node = lex_identifier (pfile, buffer->cur - 1, false,
3890 &nst,
3891 &result->val.node.spelling);
3892 warn_about_normalization (pfile, result, &nst, true);
3895 /* Convert named operators to their proper types. */
3896 if (result->val.node.node->flags & NODE_OPERATOR)
3898 result->flags |= NAMED_OP;
3899 result->type = (enum cpp_ttype) result->val.node.node->directive_index;
3902 /* Signal FALLTHROUGH comment followed by another token. */
3903 if (fallthrough_comment)
3904 result->flags |= PREV_FALLTHROUGH;
3905 break;
3907 case '\'':
3908 case '"':
3909 lex_string (pfile, result, buffer->cur - 1);
3910 break;
3912 case '/':
3913 /* A potential block or line comment. */
3914 comment_start = buffer->cur;
3915 c = *buffer->cur;
3917 if (c == '*')
3919 if (_cpp_skip_block_comment (pfile))
3920 cpp_error (pfile, CPP_DL_ERROR, "unterminated comment");
3922 else if (c == '/' && ! CPP_OPTION (pfile, traditional))
3924 /* Don't warn for system headers. */
3925 if (_cpp_in_system_header (pfile))
3927 /* Warn about comments if pedantically GNUC89, and not
3928 in system headers. */
3929 else if (CPP_OPTION (pfile, lang) == CLK_GNUC89
3930 && CPP_PEDANTIC (pfile)
3931 && ! buffer->warned_cplusplus_comments)
3933 if (cpp_error (pfile, CPP_DL_PEDWARN,
3934 "C++ style comments are not allowed in ISO C90"))
3935 cpp_error (pfile, CPP_DL_NOTE,
3936 "(this will be reported only once per input file)");
3937 buffer->warned_cplusplus_comments = 1;
3939 /* Or if specifically desired via -Wc90-c99-compat. */
3940 else if (CPP_OPTION (pfile, cpp_warn_c90_c99_compat) > 0
3941 && ! CPP_OPTION (pfile, cplusplus)
3942 && ! buffer->warned_cplusplus_comments)
3944 if (cpp_error (pfile, CPP_DL_WARNING,
3945 "C++ style comments are incompatible with C90"))
3946 cpp_error (pfile, CPP_DL_NOTE,
3947 "(this will be reported only once per input file)");
3948 buffer->warned_cplusplus_comments = 1;
3950 /* In C89/C94, C++ style comments are forbidden. */
3951 else if ((CPP_OPTION (pfile, lang) == CLK_STDC89
3952 || CPP_OPTION (pfile, lang) == CLK_STDC94))
3954 /* But don't be confused about valid code such as
3955 - // immediately followed by *,
3956 - // in a preprocessing directive,
3957 - // in an #if 0 block. */
3958 if (buffer->cur[1] == '*'
3959 || pfile->state.in_directive
3960 || pfile->state.skipping)
3962 result->type = CPP_DIV;
3963 break;
3965 else if (! buffer->warned_cplusplus_comments)
3967 if (cpp_error (pfile, CPP_DL_ERROR,
3968 "C++ style comments are not allowed in "
3969 "ISO C90"))
3970 cpp_error (pfile, CPP_DL_NOTE,
3971 "(this will be reported only once per input "
3972 "file)");
3973 buffer->warned_cplusplus_comments = 1;
3976 if (skip_line_comment (pfile) && CPP_OPTION (pfile, warn_comments))
3977 cpp_warning (pfile, CPP_W_COMMENTS, "multi-line comment");
3979 else if (c == '=')
3981 buffer->cur++;
3982 result->type = CPP_DIV_EQ;
3983 break;
3985 else
3987 result->type = CPP_DIV;
3988 break;
/* Only the two comment branches above reach this point; the
   operator branches have already left the switch. */
3991 if (fallthrough_comment_p (pfile, comment_start))
3992 fallthrough_comment = true;
3994 if (pfile->cb.comment)
3996 size_t len = pfile->buffer->cur - comment_start;
3997 pfile->cb.comment (pfile, result->src_loc, comment_start - 1,
3998 len + 1);
/* When comments are not saved, the comment counts as white space
   and lexing continues with the token that follows it. */
4001 if (!pfile->state.save_comments)
4003 result->flags |= PREV_WHITE;
4004 goto update_tokens_line;
4007 if (fallthrough_comment)
4008 result->flags |= PREV_FALLTHROUGH;
4010 /* Save the comment as a token in its own right. */
4011 save_comment (pfile, result, comment_start, c);
4012 break;
4014 case '<':
4015 if (pfile->state.angled_headers)
4017 lex_string (pfile, result, buffer->cur - 1);
4018 if (result->type != CPP_LESS)
4019 break;
4022 result->type = CPP_LESS;
4023 if (*buffer->cur == '=')
4025 buffer->cur++, result->type = CPP_LESS_EQ;
4026 if (*buffer->cur == '>'
4027 && CPP_OPTION (pfile, cplusplus)
4028 && CPP_OPTION (pfile, lang) >= CLK_GNUCXX20)
4029 buffer->cur++, result->type = CPP_SPACESHIP;
4031 else if (*buffer->cur == '<')
4033 buffer->cur++;
4034 IF_NEXT_IS ('=', CPP_LSHIFT_EQ, CPP_LSHIFT);
4036 else if (CPP_OPTION (pfile, digraphs))
4038 if (*buffer->cur == ':')
4040 /* C++11 [2.5/3 lex.pptoken], "Otherwise, if the next
4041 three characters are <:: and the subsequent character
4042 is neither : nor >, the < is treated as a preprocessor
4043 token by itself". */
4044 if (CPP_OPTION (pfile, cplusplus)
4045 && CPP_OPTION (pfile, lang) != CLK_CXX98
4046 && CPP_OPTION (pfile, lang) != CLK_GNUCXX
4047 && buffer->cur[1] == ':'
4048 && buffer->cur[2] != ':' && buffer->cur[2] != '>')
4049 break;
4051 buffer->cur++;
4052 result->flags |= DIGRAPH;
4053 result->type = CPP_OPEN_SQUARE;
4055 else if (*buffer->cur == '%')
4057 buffer->cur++;
4058 result->flags |= DIGRAPH;
4059 result->type = CPP_OPEN_BRACE;
4062 break;
4064 case '>':
4065 result->type = CPP_GREATER;
4066 if (*buffer->cur == '=')
4067 buffer->cur++, result->type = CPP_GREATER_EQ;
4068 else if (*buffer->cur == '>')
4070 buffer->cur++;
4071 IF_NEXT_IS ('=', CPP_RSHIFT_EQ, CPP_RSHIFT);
4073 break;
4075 case '%':
4076 result->type = CPP_MOD;
4077 if (*buffer->cur == '=')
4078 buffer->cur++, result->type = CPP_MOD_EQ;
4079 else if (CPP_OPTION (pfile, digraphs))
4081 if (*buffer->cur == ':')
4083 buffer->cur++;
4084 result->flags |= DIGRAPH;
4085 result->type = CPP_HASH;
4086 if (*buffer->cur == '%' && buffer->cur[1] == ':')
4087 buffer->cur += 2, result->type = CPP_PASTE, result->val.token_no = 0;
4089 else if (*buffer->cur == '>')
4091 buffer->cur++;
4092 result->flags |= DIGRAPH;
4093 result->type = CPP_CLOSE_BRACE;
4096 break;
4098 case '.':
4099 result->type = CPP_DOT;
4100 if (ISDIGIT (*buffer->cur))
4102 struct normalize_state nst = INITIAL_NORMALIZE_STATE;
4103 result->type = CPP_NUMBER;
4104 lex_number (pfile, &result->val.str, &nst);
4105 warn_about_normalization (pfile, result, &nst, false);
4107 else if (*buffer->cur == '.' && buffer->cur[1] == '.')
4108 buffer->cur += 2, result->type = CPP_ELLIPSIS;
4109 else if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
4110 buffer->cur++, result->type = CPP_DOT_STAR;
4111 break;
4113 case '+':
4114 result->type = CPP_PLUS;
4115 if (*buffer->cur == '+')
4116 buffer->cur++, result->type = CPP_PLUS_PLUS;
4117 else if (*buffer->cur == '=')
4118 buffer->cur++, result->type = CPP_PLUS_EQ;
4119 break;
4121 case '-':
4122 result->type = CPP_MINUS;
4123 if (*buffer->cur == '>')
4125 buffer->cur++;
4126 result->type = CPP_DEREF;
4127 if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
4128 buffer->cur++, result->type = CPP_DEREF_STAR;
4130 else if (*buffer->cur == '-')
4131 buffer->cur++, result->type = CPP_MINUS_MINUS;
4132 else if (*buffer->cur == '=')
4133 buffer->cur++, result->type = CPP_MINUS_EQ;
4134 break;
4136 case '&':
4137 result->type = CPP_AND;
4138 if (*buffer->cur == '&')
4139 buffer->cur++, result->type = CPP_AND_AND;
4140 else if (*buffer->cur == '=')
4141 buffer->cur++, result->type = CPP_AND_EQ;
4142 break;
4144 case '|':
4145 result->type = CPP_OR;
4146 if (*buffer->cur == '|')
4147 buffer->cur++, result->type = CPP_OR_OR;
4148 else if (*buffer->cur == '=')
4149 buffer->cur++, result->type = CPP_OR_EQ;
4150 break;
4152 case ':':
4153 result->type = CPP_COLON;
4154 if (*buffer->cur == ':' && CPP_OPTION (pfile, scope))
4155 buffer->cur++, result->type = CPP_SCOPE;
4156 else if (*buffer->cur == '>' && CPP_OPTION (pfile, digraphs))
4158 buffer->cur++;
4159 result->flags |= DIGRAPH;
4160 result->type = CPP_CLOSE_SQUARE;
4162 break;
4164 case '*': IF_NEXT_IS ('=', CPP_MULT_EQ, CPP_MULT); break;
4165 case '=': IF_NEXT_IS ('=', CPP_EQ_EQ, CPP_EQ); break;
4166 case '!': IF_NEXT_IS ('=', CPP_NOT_EQ, CPP_NOT); break;
4167 case '^': IF_NEXT_IS ('=', CPP_XOR_EQ, CPP_XOR); break;
4168 case '#': IF_NEXT_IS ('#', CPP_PASTE, CPP_HASH); result->val.token_no = 0; break;
4170 case '?': result->type = CPP_QUERY; break;
4171 case '~': result->type = CPP_COMPL; break;
4172 case ',': result->type = CPP_COMMA; break;
4173 case '(': result->type = CPP_OPEN_PAREN; break;
4174 case ')': result->type = CPP_CLOSE_PAREN; break;
4175 case '[': result->type = CPP_OPEN_SQUARE; break;
4176 case ']': result->type = CPP_CLOSE_SQUARE; break;
4177 case '{': result->type = CPP_OPEN_BRACE; break;
4178 case '}': result->type = CPP_CLOSE_BRACE; break;
4179 case ';': result->type = CPP_SEMICOLON; break;
4181 /* @ is a punctuator in Objective-C. */
4182 case '@': result->type = CPP_ATSIGN; break;
4184 default:
4186 const uchar *base = --buffer->cur;
/* NOTE(review): function-static, so it persists across calls.  It is
   set below after an invalid-UTF-8 warning and counted down on
   subsequent continuation bytes instead of warning again; presumably
   it covers the remaining bytes of the sequence already diagnosed --
   confirm against _cpp_warn_invalid_utf8. */
4187 static int no_warn_cnt;
4189 /* Check for an extended identifier ($ or UCN or UTF-8). */
4190 struct normalize_state nst = INITIAL_NORMALIZE_STATE;
4191 if (forms_identifier_p (pfile, true, &nst))
4193 result->type = CPP_NAME;
4194 result->val.node.node = lex_identifier (pfile, base, true, &nst,
4195 &result->val.node.spelling);
4196 warn_about_normalization (pfile, result, &nst, true);
4197 break;
4200 /* Otherwise this will form a CPP_OTHER token. Parse valid UTF-8 as a
4201 single token. */
4202 buffer->cur++;
4203 if (c >= utf8_signifier)
4205 const uchar *pstr = base;
4206 cppchar_t s;
4207 if (_cpp_valid_utf8 (pfile, &pstr, buffer->rlimit, 0, NULL, &s))
4209 if (s > UCS_LIMIT && CPP_OPTION (pfile, cpp_warn_invalid_utf8))
4211 buffer->cur = base;
4212 _cpp_warn_invalid_utf8 (pfile);
4214 buffer->cur = pstr;
4216 else if (CPP_OPTION (pfile, cpp_warn_invalid_utf8))
4218 buffer->cur = base;
4219 const uchar *end = _cpp_warn_invalid_utf8 (pfile);
4220 buffer->cur = base + 1;
4221 no_warn_cnt = end - buffer->cur;
4224 else if (c >= utf8_continuation
4225 && CPP_OPTION (pfile, cpp_warn_invalid_utf8))
4227 if (no_warn_cnt)
4228 --no_warn_cnt;
4229 else
4231 buffer->cur = base;
4232 _cpp_warn_invalid_utf8 (pfile);
4233 buffer->cur = base + 1;
4236 create_literal (pfile, result, base, buffer->cur - base, CPP_OTHER);
4237 break;
4242 /* Potentially convert the location of the token to a range. */
4243 if (result->src_loc >= RESERVED_LOCATION_COUNT
4244 && result->type != CPP_EOF)
4246 /* Ensure that any line notes are processed, so that we have the
4247 correct physical line/column for the end-point of the token even
4248 when a logical line is split via one or more backslashes. */
4249 if (buffer->cur >= buffer->notes[buffer->cur_note].pos
4250 && !pfile->overlaid_buffer)
4251 _cpp_process_line_notes (pfile, false);
4253 source_range tok_range;
4254 tok_range.m_start = result->src_loc;
4255 tok_range.m_finish
4256 = linemap_position_for_column (pfile->line_table,
4257 CPP_BUF_COLUMN (buffer, buffer->cur));
4259 result->src_loc = COMBINE_LOCATION_DATA (pfile->line_table,
4260 result->src_loc,
4261 tok_range, NULL, 0);
4264 return result;
4267 /* An upper bound on the number of bytes needed to spell TOKEN.
4268 Does not include preceding whitespace. */
4269 unsigned int
4270 cpp_token_len (const cpp_token *token)
4272 unsigned int len;
4274 switch (TOKEN_SPELL (token))
4276 default: len = 6; break;
4277 case SPELL_LITERAL: len = token->val.str.len; break;
4278 case SPELL_IDENT: len = NODE_LEN (token->val.node.node) * 10; break;
4281 return len;
/* Decode the UTF-8 sequence starting at NAME and store the matching
   escape, a backslash followed by 'U' and eight lowercase hex digits,
   in BUFFER.  Exactly 10 bytes are written and no terminating NUL is
   added.  Returns the number of bytes of NAME that were consumed.
   Aborts if a continuation byte does not have the form 10xxxxxx.  */

static size_t
utf8_to_ucn (unsigned char *buffer, const unsigned char *name)
{
  static const char hex_digits[] = "0123456789abcdef";
  int seq_len = 0;
  unsigned lead = *name;
  unsigned long code_point;

  /* The number of leading one bits in the first byte is the length
     of the whole sequence.  */
  while (lead & 0x80)
    {
      seq_len++;
      lead <<= 1;
    }

  /* The payload bits of the first byte sit below those leading ones.  */
  code_point = *name & (0x7F >> seq_len);

  /* Each continuation byte contributes six more bits.  */
  for (int k = 1; k < seq_len; k++)
    {
      const unsigned char cont = name[k];

      code_point = (code_point << 6) | (cont & 0x3F);

      /* Ill-formed UTF-8.  */
      if ((cont & ~0x3F) != 0x80)
        abort ();
    }

  buffer[0] = '\\';
  buffer[1] = 'U';
  /* Most significant nibble first.  */
  for (int digit = 0; digit < 8; digit++)
    buffer[2 + digit] = hex_digits[(code_point >> (4 * (7 - digit))) & 0xF];

  return seq_len;
}
4318 /* Given a token TYPE corresponding to a digraph, return a pointer to
4319 the spelling of the digraph. */
4320 static const unsigned char *
4321 cpp_digraph2name (enum cpp_ttype type)
4323 return digraph_spellings[(int) type - (int) CPP_FIRST_DIGRAPH];
4326 /* Write the spelling of an identifier IDENT, using UCNs, to BUFFER.
4327 The buffer must already contain the enough space to hold the
4328 token's spelling. Returns a pointer to the character after the
4329 last character written. */
4330 unsigned char *
4331 _cpp_spell_ident_ucns (unsigned char *buffer, cpp_hashnode *ident)
4333 size_t i;
4334 const unsigned char *name = NODE_NAME (ident);
4336 for (i = 0; i < NODE_LEN (ident); i++)
4337 if (name[i] & ~0x7F)
4339 i += utf8_to_ucn (buffer, name + i) - 1;
4340 buffer += 10;
4342 else
4343 *buffer++ = name[i];
4345 return buffer;
4348 /* Write the spelling of a token TOKEN to BUFFER. The buffer must
4349 already contain the enough space to hold the token's spelling.
4350 Returns a pointer to the character after the last character written.
4351 FORSTRING is true if this is to be the spelling after translation
4352 phase 1 (with the original spelling of extended identifiers), false
4353 if extended identifiers should always be written using UCNs (there is
4354 no option for always writing them in the internal UTF-8 form).
4355 FIXME: Would be nice if we didn't need the PFILE argument. */
4356 unsigned char *
4357 cpp_spell_token (cpp_reader *pfile, const cpp_token *token,
4358 unsigned char *buffer, bool forstring)
4360 switch (TOKEN_SPELL (token))
4362 case SPELL_OPERATOR:
4364 const unsigned char *spelling;
4365 unsigned char c;
4367 if (token->flags & DIGRAPH)
4368 spelling = cpp_digraph2name (token->type);
4369 else if (token->flags & NAMED_OP)
4370 goto spell_ident;
4371 else
4372 spelling = TOKEN_NAME (token);
4374 while ((c = *spelling++) != '\0')
4375 *buffer++ = c;
4377 break;
4379 spell_ident:
4380 case SPELL_IDENT:
4381 if (forstring)
4383 memcpy (buffer, NODE_NAME (token->val.node.spelling),
4384 NODE_LEN (token->val.node.spelling));
4385 buffer += NODE_LEN (token->val.node.spelling);
4387 else
4388 buffer = _cpp_spell_ident_ucns (buffer, token->val.node.node);
4389 break;
4391 case SPELL_LITERAL:
4392 memcpy (buffer, token->val.str.text, token->val.str.len);
4393 buffer += token->val.str.len;
4394 break;
4396 case SPELL_NONE:
4397 cpp_error (pfile, CPP_DL_ICE,
4398 "unspellable token %s", TOKEN_NAME (token));
4399 break;
4402 return buffer;
4405 /* Returns TOKEN spelt as a null-terminated string. The string is
4406 freed when the reader is destroyed. Useful for diagnostics. */
4407 unsigned char *
4408 cpp_token_as_text (cpp_reader *pfile, const cpp_token *token)
4410 unsigned int len = cpp_token_len (token) + 1;
4411 unsigned char *start = _cpp_unaligned_alloc (pfile, len), *end;
4413 end = cpp_spell_token (pfile, token, start, false);
4414 end[0] = '\0';
4416 return start;
4419 /* Returns a pointer to a string which spells the token defined by
4420 TYPE and FLAGS. Used by C front ends, which really should move to
4421 using cpp_token_as_text. */
4422 const char *
4423 cpp_type2name (enum cpp_ttype type, unsigned char flags)
4425 if (flags & DIGRAPH)
4426 return (const char *) cpp_digraph2name (type);
4427 else if (flags & NAMED_OP)
4428 return cpp_named_operator2name (type);
4430 return (const char *) token_spellings[type].name;
4433 /* Writes the spelling of token to FP, without any preceding space.
4434 Separated from cpp_spell_token for efficiency - to avoid stdio
4435 double-buffering. */
4436 void
4437 cpp_output_token (const cpp_token *token, FILE *fp)
4439 switch (TOKEN_SPELL (token))
4441 case SPELL_OPERATOR:
4443 const unsigned char *spelling;
4444 int c;
4446 if (token->flags & DIGRAPH)
4447 spelling = cpp_digraph2name (token->type);
4448 else if (token->flags & NAMED_OP)
4449 goto spell_ident;
4450 else
4451 spelling = TOKEN_NAME (token);
4453 c = *spelling;
4455 putc (c, fp);
4456 while ((c = *++spelling) != '\0');
4458 break;
4460 spell_ident:
4461 case SPELL_IDENT:
4463 size_t i;
4464 const unsigned char * name = NODE_NAME (token->val.node.node);
4466 for (i = 0; i < NODE_LEN (token->val.node.node); i++)
4467 if (name[i] & ~0x7F)
4469 unsigned char buffer[10];
4470 i += utf8_to_ucn (buffer, name + i) - 1;
4471 fwrite (buffer, 1, 10, fp);
4473 else
4474 fputc (NODE_NAME (token->val.node.node)[i], fp);
4476 break;
4478 case SPELL_LITERAL:
4479 if (token->type == CPP_HEADER_NAME)
4480 fputc ('"', fp);
4481 fwrite (token->val.str.text, 1, token->val.str.len, fp);
4482 if (token->type == CPP_HEADER_NAME)
4483 fputc ('"', fp);
4484 break;
4486 case SPELL_NONE:
4487 /* An error, most probably. */
4488 break;
4492 /* Compare two tokens. */
4494 _cpp_equiv_tokens (const cpp_token *a, const cpp_token *b)
4496 if (a->type == b->type && a->flags == b->flags)
4497 switch (TOKEN_SPELL (a))
4499 default: /* Keep compiler happy. */
4500 case SPELL_OPERATOR:
4501 /* token_no is used to track where multiple consecutive ##
4502 tokens were originally located. */
4503 return (a->type != CPP_PASTE || a->val.token_no == b->val.token_no);
4504 case SPELL_NONE:
4505 return (a->type != CPP_MACRO_ARG
4506 || (a->val.macro_arg.arg_no == b->val.macro_arg.arg_no
4507 && a->val.macro_arg.spelling == b->val.macro_arg.spelling));
4508 case SPELL_IDENT:
4509 return (a->val.node.node == b->val.node.node
4510 && a->val.node.spelling == b->val.node.spelling);
4511 case SPELL_LITERAL:
4512 return (a->val.str.len == b->val.str.len
4513 && !memcmp (a->val.str.text, b->val.str.text,
4514 a->val.str.len));
4517 return 0;
4520 /* Returns nonzero if a space should be inserted to avoid an
4521 accidental token paste for output. For simplicity, it is
4522 conservative, and occasionally advises a space where one is not
4523 needed, e.g. "." and ".2". */
4525 cpp_avoid_paste (cpp_reader *pfile, const cpp_token *token1,
4526 const cpp_token *token2)
4528 enum cpp_ttype a = token1->type, b = token2->type;
4529 cppchar_t c;
4531 if (token1->flags & NAMED_OP)
4532 a = CPP_NAME;
4533 if (token2->flags & NAMED_OP)
4534 b = CPP_NAME;
4536 c = EOF;
4537 if (token2->flags & DIGRAPH)
4538 c = digraph_spellings[(int) b - (int) CPP_FIRST_DIGRAPH][0];
4539 else if (token_spellings[b].category == SPELL_OPERATOR)
4540 c = token_spellings[b].name[0];
4542 /* Quickly get everything that can paste with an '='. */
4543 if ((int) a <= (int) CPP_LAST_EQ && c == '=')
4544 return 1;
4546 switch (a)
4548 case CPP_GREATER: return c == '>';
4549 case CPP_LESS: return c == '<' || c == '%' || c == ':';
4550 case CPP_PLUS: return c == '+';
4551 case CPP_MINUS: return c == '-' || c == '>';
4552 case CPP_DIV: return c == '/' || c == '*'; /* Comments. */
4553 case CPP_MOD: return c == ':' || c == '>';
4554 case CPP_AND: return c == '&';
4555 case CPP_OR: return c == '|';
4556 case CPP_COLON: return c == ':' || c == '>';
4557 case CPP_DEREF: return c == '*';
4558 case CPP_DOT: return c == '.' || c == '%' || b == CPP_NUMBER;
4559 case CPP_HASH: return c == '#' || c == '%'; /* Digraph form. */
4560 case CPP_PRAGMA:
4561 case CPP_NAME: return ((b == CPP_NUMBER
4562 && name_p (pfile, &token2->val.str))
4563 || b == CPP_NAME
4564 || b == CPP_CHAR || b == CPP_STRING); /* L */
4565 case CPP_NUMBER: return (b == CPP_NUMBER || b == CPP_NAME
4566 || b == CPP_CHAR
4567 || c == '.' || c == '+' || c == '-');
4568 /* UCNs */
4569 case CPP_OTHER: return ((token1->val.str.text[0] == '\\'
4570 && b == CPP_NAME)
4571 || (CPP_OPTION (pfile, objc)
4572 && token1->val.str.text[0] == '@'
4573 && (b == CPP_NAME || b == CPP_STRING)));
4574 case CPP_LESS_EQ: return c == '>';
4575 case CPP_STRING:
4576 case CPP_WSTRING:
4577 case CPP_UTF8STRING:
4578 case CPP_STRING16:
4579 case CPP_STRING32: return (CPP_OPTION (pfile, user_literals)
4580 && (b == CPP_NAME
4581 || (TOKEN_SPELL (token2) == SPELL_LITERAL
4582 && ISIDST (token2->val.str.text[0]))));
4584 default: break;
4587 return 0;
4590 /* Output all the remaining tokens on the current line, and a newline
4591 character, to FP. Leading whitespace is removed. If there are
4592 macros, special token padding is not performed. */
4593 void
4594 cpp_output_line (cpp_reader *pfile, FILE *fp)
4596 const cpp_token *token;
4598 token = cpp_get_token (pfile);
4599 while (token->type != CPP_EOF)
4601 cpp_output_token (token, fp);
4602 token = cpp_get_token (pfile);
4603 if (token->flags & PREV_WHITE)
4604 putc (' ', fp);
4607 putc ('\n', fp);
4610 /* Return a string representation of all the remaining tokens on the
4611 current line. The result is allocated using xmalloc and must be
4612 freed by the caller. */
4613 unsigned char *
4614 cpp_output_line_to_string (cpp_reader *pfile, const unsigned char *dir_name)
4616 const cpp_token *token;
4617 unsigned int out = dir_name ? ustrlen (dir_name) : 0;
4618 unsigned int alloced = 120 + out;
4619 unsigned char *result = (unsigned char *) xmalloc (alloced);
4621 /* If DIR_NAME is empty, there are no initial contents. */
4622 if (dir_name)
4624 sprintf ((char *) result, "#%s ", dir_name);
4625 out += 2;
4628 token = cpp_get_token (pfile);
4629 while (token->type != CPP_EOF)
4631 unsigned char *last;
4632 /* Include room for a possible space and the terminating nul. */
4633 unsigned int len = cpp_token_len (token) + 2;
4635 if (out + len > alloced)
4637 alloced *= 2;
4638 if (out + len > alloced)
4639 alloced = out + len;
4640 result = (unsigned char *) xrealloc (result, alloced);
4643 last = cpp_spell_token (pfile, token, &result[out], 0);
4644 out = last - result;
4646 token = cpp_get_token (pfile);
4647 if (token->flags & PREV_WHITE)
4648 result[out++] = ' ';
4651 result[out] = '\0';
4652 return result;
/* Memory buffers.  Changing these three constants can have a dramatic
   effect on performance.  The values here are reasonable defaults,
   but might be tuned.  If you adjust them, be sure to test across a
   range of uses of cpplib, including heavy nested function-like macro
   expansion.  Also check the change in peak memory usage (NJAMD is a
   good tool for this).

   MIN_BUFF_SIZE is the smallest buffer new_buff will create.
   BUFF_SIZE_UPPER_BOUND (MIN_SIZE) is the largest free buffer that
   _cpp_get_buff will hand out for a request of MIN_SIZE bytes.
   EXTENDED_BUFF_SIZE (BUFF, MIN_EXTRA) is the size requested when BUFF
   is extended: MIN_EXTRA plus twice the bytes between BUFF's cur and
   limit.  Every macro argument is parenthesized so that any expression
   can be passed.  */
#define MIN_BUFF_SIZE 8000
#define BUFF_SIZE_UPPER_BOUND(MIN_SIZE) (MIN_BUFF_SIZE + (MIN_SIZE) * 3 / 2)
#define EXTENDED_BUFF_SIZE(BUFF, MIN_EXTRA) \
        ((MIN_EXTRA) + ((BUFF)->limit - (BUFF)->cur) * 2)

#if MIN_BUFF_SIZE > BUFF_SIZE_UPPER_BOUND (0)
  #error BUFF_SIZE_UPPER_BOUND must be at least as large as MIN_BUFF_SIZE!
#endif
4670 /* Create a new allocation buffer. Place the control block at the end
4671 of the buffer, so that buffer overflows will cause immediate chaos. */
4672 static _cpp_buff *
4673 new_buff (size_t len)
4675 _cpp_buff *result;
4676 unsigned char *base;
4678 if (len < MIN_BUFF_SIZE)
4679 len = MIN_BUFF_SIZE;
4680 len = CPP_ALIGN (len);
4682 #ifdef ENABLE_VALGRIND_ANNOTATIONS
4683 /* Valgrind warns about uses of interior pointers, so put _cpp_buff
4684 struct first. */
4685 size_t slen = CPP_ALIGN2 (sizeof (_cpp_buff), 2 * DEFAULT_ALIGNMENT);
4686 base = XNEWVEC (unsigned char, len + slen);
4687 result = (_cpp_buff *) base;
4688 base += slen;
4689 #else
4690 base = XNEWVEC (unsigned char, len + sizeof (_cpp_buff));
4691 result = (_cpp_buff *) (base + len);
4692 #endif
4693 result->base = base;
4694 result->cur = base;
4695 result->limit = base + len;
4696 result->next = NULL;
4697 return result;
4700 /* Place a chain of unwanted allocation buffers on the free list. */
4701 void
4702 _cpp_release_buff (cpp_reader *pfile, _cpp_buff *buff)
4704 _cpp_buff *end = buff;
4706 while (end->next)
4707 end = end->next;
4708 end->next = pfile->free_buffs;
4709 pfile->free_buffs = buff;
4712 /* Return a free buffer of size at least MIN_SIZE. */
4713 _cpp_buff *
4714 _cpp_get_buff (cpp_reader *pfile, size_t min_size)
4716 _cpp_buff *result, **p;
4718 for (p = &pfile->free_buffs;; p = &(*p)->next)
4720 size_t size;
4722 if (*p == NULL)
4723 return new_buff (min_size);
4724 result = *p;
4725 size = result->limit - result->base;
4726 /* Return a buffer that's big enough, but don't waste one that's
4727 way too big. */
4728 if (size >= min_size && size <= BUFF_SIZE_UPPER_BOUND (min_size))
4729 break;
4732 *p = result->next;
4733 result->next = NULL;
4734 result->cur = result->base;
4735 return result;
4738 /* Creates a new buffer with enough space to hold the uncommitted
4739 remaining bytes of BUFF, and at least MIN_EXTRA more bytes. Copies
4740 the excess bytes to the new buffer. Chains the new buffer after
4741 BUFF, and returns the new buffer. */
4742 _cpp_buff *
4743 _cpp_append_extend_buff (cpp_reader *pfile, _cpp_buff *buff, size_t min_extra)
4745 size_t size = EXTENDED_BUFF_SIZE (buff, min_extra);
4746 _cpp_buff *new_buff = _cpp_get_buff (pfile, size);
4748 buff->next = new_buff;
4749 memcpy (new_buff->base, buff->cur, BUFF_ROOM (buff));
4750 return new_buff;
4753 /* Creates a new buffer with enough space to hold the uncommitted
4754 remaining bytes of the buffer pointed to by BUFF, and at least
4755 MIN_EXTRA more bytes. Copies the excess bytes to the new buffer.
4756 Chains the new buffer before the buffer pointed to by BUFF, and
4757 updates the pointer to point to the new buffer. */
4758 void
4759 _cpp_extend_buff (cpp_reader *pfile, _cpp_buff **pbuff, size_t min_extra)
4761 _cpp_buff *new_buff, *old_buff = *pbuff;
4762 size_t size = EXTENDED_BUFF_SIZE (old_buff, min_extra);
4764 new_buff = _cpp_get_buff (pfile, size);
4765 memcpy (new_buff->base, old_buff->cur, BUFF_ROOM (old_buff));
4766 new_buff->next = old_buff;
4767 *pbuff = new_buff;
4770 /* Free a chain of buffers starting at BUFF. */
4771 void
4772 _cpp_free_buff (_cpp_buff *buff)
4774 _cpp_buff *next;
4776 for (; buff; buff = next)
4778 next = buff->next;
4779 #ifdef ENABLE_VALGRIND_ANNOTATIONS
4780 free (buff);
4781 #else
4782 free (buff->base);
4783 #endif
4787 /* Allocate permanent, unaligned storage of length LEN. */
4788 unsigned char *
4789 _cpp_unaligned_alloc (cpp_reader *pfile, size_t len)
4791 _cpp_buff *buff = pfile->u_buff;
4792 unsigned char *result = buff->cur;
4794 if (len > (size_t) (buff->limit - result))
4796 buff = _cpp_get_buff (pfile, len);
4797 buff->next = pfile->u_buff;
4798 pfile->u_buff = buff;
4799 result = buff->cur;
4802 buff->cur = result + len;
4803 return result;
4806 /* Allocate permanent, unaligned storage of length LEN from a_buff.
4807 That buffer is used for growing allocations when saving macro
4808 replacement lists in a #define, and when parsing an answer to an
4809 assertion in #assert, #unassert or #if (and therefore possibly
4810 whilst expanding macros). It therefore must not be used by any
4811 code that they might call: specifically the lexer and the guts of
4812 the macro expander.
4814 All existing other uses clearly fit this restriction: storing
4815 registered pragmas during initialization. */
4816 unsigned char *
4817 _cpp_aligned_alloc (cpp_reader *pfile, size_t len)
4819 _cpp_buff *buff = pfile->a_buff;
4820 unsigned char *result = buff->cur;
4822 if (len > (size_t) (buff->limit - result))
4824 buff = _cpp_get_buff (pfile, len);
4825 buff->next = pfile->a_buff;
4826 pfile->a_buff = buff;
4827 result = buff->cur;
4830 buff->cur = result + len;
4831 return result;
4834 /* Commit or allocate storage from a buffer. */
4836 void *
4837 _cpp_commit_buff (cpp_reader *pfile, size_t size)
4839 void *ptr = BUFF_FRONT (pfile->a_buff);
4841 if (pfile->hash_table->alloc_subobject)
4843 void *copy = pfile->hash_table->alloc_subobject (size);
4844 memcpy (copy, ptr, size);
4845 ptr = copy;
4847 else
4848 BUFF_FRONT (pfile->a_buff) += size;
4850 return ptr;
4853 /* Say which field of TOK is in use. */
4855 enum cpp_token_fld_kind
4856 cpp_token_val_index (const cpp_token *tok)
4858 switch (TOKEN_SPELL (tok))
4860 case SPELL_IDENT:
4861 return CPP_TOKEN_FLD_NODE;
4862 case SPELL_LITERAL:
4863 return CPP_TOKEN_FLD_STR;
4864 case SPELL_OPERATOR:
4865 /* Operands which were originally spelled as ident keep around
4866 the node for the exact spelling. */
4867 if (tok->flags & NAMED_OP)
4868 return CPP_TOKEN_FLD_NODE;
4869 else if (tok->type == CPP_PASTE)
4870 return CPP_TOKEN_FLD_TOKEN_NO;
4871 else
4872 return CPP_TOKEN_FLD_NONE;
4873 case SPELL_NONE:
4874 if (tok->type == CPP_MACRO_ARG)
4875 return CPP_TOKEN_FLD_ARG_NO;
4876 else if (tok->type == CPP_PADDING)
4877 return CPP_TOKEN_FLD_SOURCE;
4878 else if (tok->type == CPP_PRAGMA)
4879 return CPP_TOKEN_FLD_PRAGMA;
4880 /* fall through */
4881 default:
4882 return CPP_TOKEN_FLD_NONE;
4886 /* All tokens lexed in R after calling this function will be forced to
4887 have their location_t to be P, until
4888 cpp_stop_forcing_token_locations is called for R. */
4890 void
4891 cpp_force_token_locations (cpp_reader *r, location_t loc)
4893 r->forced_token_location = loc;
4896 /* Go back to assigning locations naturally for lexed tokens. */
4898 void
4899 cpp_stop_forcing_token_locations (cpp_reader *r)
4901 r->forced_token_location = 0;
/* We're looking at \, if it's escaping EOL, look past it.  If at
   LIMIT, don't advance.  Returns PEEK unchanged when the backslash
   does not escape a line ending, or when skipping it would reach
   LIMIT; otherwise returns the first character after the escaped line
   ending(s).  */

static const unsigned char *
do_peek_backslash (const unsigned char *peek, const unsigned char *limit)
{
  size_t skip;

  /* Work out how long the backslash + line-ending sequence is.  */
  if (__builtin_expect (peek[1] == '\n', true))
    skip = 2;
  else if (__builtin_expect (peek[1] == '\r', false))
    skip = peek[2] == '\n' ? 3 : 2;
  else
    return peek;

  const unsigned char *after = peek + skip;
  if (!__builtin_expect (after < limit, true))
    return peek;

  /* The user might be perverse, and have escaped several line endings
     in a row.  */
  if (*after == '\\')
    return do_peek_backslash (after, limit);

  return after;
}
4934 static const unsigned char *
4935 do_peek_next (const unsigned char *peek, const unsigned char *limit)
4937 if (__builtin_expect (*peek == '\\', false))
4938 peek = do_peek_backslash (peek, limit);
4939 return peek;
/* Step back one character from PEEK, without going below BOUND.
   Returns NULL if PEEK is already at BOUND.  If the character stepped
   onto is a line ending ('\n', '\r', or the '\n' of "\r\n") that is
   preceded by a backslash, the escaped line ending is skipped and the
   step is repeated from the backslash.  Otherwise returns a pointer to
   the character stepped onto.  */

static const unsigned char *
do_peek_prev (const unsigned char *peek, const unsigned char *bound)
{
  if (peek == bound)
    return NULL;

  unsigned char c = *--peek;
  /* Line endings are '\n' and carriage return; the letter 'r' is an
     ordinary character and must not be mistaken for one.  */
  if (__builtin_expect (c == '\n', false)
      || __builtin_expect (c == '\r', false))
    {
      if (peek == bound)
        return peek;

      /* IX is the offset of the character before the line ending; a
         "\r\n" pair counts as one line ending.  */
      int ix = -1;
      if (c == '\n' && peek[ix] == '\r')
        {
          if (peek + ix == bound)
            return peek;
          ix--;
        }

      /* Escaped line ending: carry on from the backslash.  */
      if (peek[ix] == '\\')
        return do_peek_prev (peek + ix, bound);

      return peek;
    }
  else
    return peek;
}
4971 /* If PEEK[-1] is identifier MATCH, scan past it and trailing white
4972 space. Otherwise return NULL. */
4974 static const unsigned char *
4975 do_peek_ident (const char *match, const unsigned char *peek,
4976 const unsigned char *limit)
4978 for (; *++match; peek++)
4979 if (*peek != *match)
4981 peek = do_peek_next (peek, limit);
4982 if (*peek != *match)
4983 return NULL;
4986 /* Must now not be looking at an identifier char. */
4987 peek = do_peek_next (peek, limit);
4988 if (ISIDNUM (*peek))
4989 return NULL;
4991 /* Skip control-line whitespace. */
4993 while (*peek == ' ' || *peek == '\t')
4994 peek++;
4995 if (__builtin_expect (*peek == '\\', false))
4997 peek = do_peek_backslash (peek, limit);
4998 if (*peek != '\\')
4999 goto ws;
5002 return peek;
5005 /* Are we looking at a module control line starting as PEEK - 1? */
5007 static bool
5008 do_peek_module (cpp_reader *pfile, unsigned char c,
5009 const unsigned char *peek, const unsigned char *limit)
5011 bool import = false;
5013 if (__builtin_expect (c == 'e', false))
5015 if (!((peek[0] == 'x' || peek[0] == '\\')
5016 && (peek = do_peek_ident ("export", peek, limit))))
5017 return false;
5019 /* export, peek for import or module. No need to peek __import
5020 here. */
5021 if (peek[0] == 'i')
5023 if (!((peek[1] == 'm' || peek[1] == '\\')
5024 && (peek = do_peek_ident ("import", peek + 1, limit))))
5025 return false;
5026 import = true;
5028 else if (peek[0] == 'm')
5030 if (!((peek[1] == 'o' || peek[1] == '\\')
5031 && (peek = do_peek_ident ("module", peek + 1, limit))))
5032 return false;
5034 else
5035 return false;
5037 else if (__builtin_expect (c == 'i', false))
5039 if (!((peek[0] == 'm' || peek[0] == '\\')
5040 && (peek = do_peek_ident ("import", peek, limit))))
5041 return false;
5042 import = true;
5044 else if (__builtin_expect (c == '_', false))
5046 /* Needed for translated includes. */
5047 if (!((peek[0] == '_' || peek[0] == '\\')
5048 && (peek = do_peek_ident ("__import", peek, limit))))
5049 return false;
5050 import = true;
5052 else if (__builtin_expect (c == 'm', false))
5054 if (!((peek[0] == 'o' || peek[0] == '\\')
5055 && (peek = do_peek_ident ("module", peek, limit))))
5056 return false;
5058 else
5059 return false;
5061 /* Peek the next character to see if it's good enough. We'll be at
5062 the first non-whitespace char, including skipping an escaped
5063 newline. */
5064 /* ... import followed by identifier, ':', '<' or header-name
5065 preprocessing tokens, or module followed by identifier, ':' or
5066 ';' preprocessing tokens. */
5067 unsigned char p = *peek++;
5069 /* A character literal is ... single quotes, ... optionally preceded
5070 by u8, u, U, or L */
5071 /* A string-literal is a ... double quotes, optionally prefixed by
5072 R, u8, u8R, u, uR, U, UR, L, or LR */
5073 if (p == 'u')
5075 peek = do_peek_next (peek, limit);
5076 if (*peek == '8')
5078 peek++;
5079 goto peek_u8;
5081 goto peek_u;
5083 else if (p == 'U' || p == 'L')
5085 peek_u8:
5086 peek = do_peek_next (peek, limit);
5087 peek_u:
5088 if (*peek == '\"' || *peek == '\'')
5089 return false;
5091 if (*peek == 'R')
5092 goto peek_R;
5093 /* Identifier. Ok. */
5095 else if (p == 'R')
5097 peek_R:
5098 if (CPP_OPTION (pfile, rliterals))
5100 peek = do_peek_next (peek, limit);
5101 if (*peek == '\"')
5102 return false;
5104 /* Identifier. Ok. */
5106 else if ('Z' - 'A' == 25
5107 ? ((p >= 'A' && p <= 'Z') || (p >= 'a' && p <= 'z') || p == '_')
5108 : ISIDST (p))
5110 /* Identifier. Ok. */
5112 else if (p == '<')
5114 /* Maybe angle header, ok for import. Reject
5115 '<=', '<<' digraph:'<:'. */
5116 if (!import)
5117 return false;
5118 peek = do_peek_next (peek, limit);
5119 if (*peek == '=' || *peek == '<'
5120 || (*peek == ':' && CPP_OPTION (pfile, digraphs)))
5121 return false;
5123 else if (p == ';')
5125 /* SEMICOLON, ok for module. */
5126 if (import)
5127 return false;
5129 else if (p == '"')
5131 /* STRING, ok for import. */
5132 if (!import)
5133 return false;
5135 else if (p == ':')
5137 /* Maybe COLON, ok. Reject '::', digraph:':>'. */
5138 peek = do_peek_next (peek, limit);
5139 if (*peek == ':' || (*peek == '>' && CPP_OPTION (pfile, digraphs)))
5140 return false;
5142 else
5143 /* FIXME: Detect a unicode character, excluding those not
5144 permitted as the initial character. [lex.name]/1. I presume
5145 we need to check the \[uU] spellings, and directly using
5146 Unicode in say UTF8 form? Or perhaps we do the phase-1
5147 conversion of UTF8 to universal-character-names? */
5148 return false;
5150 return true;
5153 /* Directives-only scanning. Somewhat more relaxed than correct
5154 parsing -- some ill-formed programs will not be rejected. */
5156 void
5157 cpp_directive_only_process (cpp_reader *pfile,
5158 void *data,
5159 void (*cb) (cpp_reader *, CPP_DO_task, void *, ...))
5161 bool module_p = CPP_OPTION (pfile, module_directives);
5165 restart:
5166 /* Buffer initialization, but no line cleaning. */
5167 cpp_buffer *buffer = pfile->buffer;
5168 buffer->cur_note = buffer->notes_used = 0;
5169 buffer->cur = buffer->line_base = buffer->next_line;
5170 buffer->need_line = false;
5171 /* Files always end in a newline or carriage return. We rely on this for
5172 character peeking safety. */
5173 gcc_assert (buffer->rlimit[0] == '\n' || buffer->rlimit[0] == '\r');
5175 const unsigned char *base = buffer->cur;
5176 unsigned line_count = 0;
5177 const unsigned char *line_start = base;
5179 bool bol = true;
5180 bool raw = false;
5182 const unsigned char *lwm = base;
5183 for (const unsigned char *pos = base, *limit = buffer->rlimit;
5184 pos < limit;)
5186 unsigned char c = *pos++;
5187 /* This matches the switch in _cpp_lex_direct. */
5188 switch (c)
5190 case ' ': case '\t': case '\f': case '\v':
5191 /* Whitespace, do nothing. */
5192 break;
5194 case '\r': /* MAC line ending, or Windows \r\n */
5195 if (*pos == '\n')
5196 pos++;
5197 /* FALLTHROUGH */
5199 case '\n':
5200 bol = true;
5202 next_line:
5203 CPP_INCREMENT_LINE (pfile, 0);
5204 line_count++;
5205 line_start = pos;
5206 break;
5208 case '\\':
5209 /* <backslash><newline> is removed, and doesn't undo any
5210 preceding escape or whatnot. */
5211 if (*pos == '\n')
5213 pos++;
5214 goto next_line;
5216 else if (*pos == '\r')
5218 if (pos[1] == '\n')
5219 pos++;
5220 pos++;
5221 goto next_line;
5223 goto dflt;
5225 case '#':
5226 if (bol)
5228 /* Line directive. */
5229 if (pos - 1 > base && !pfile->state.skipping)
5230 cb (pfile, CPP_DO_print, data,
5231 line_count, base, pos - 1 - base);
5233 /* Prep things for directive handling. */
5234 buffer->next_line = pos;
5235 buffer->need_line = true;
5236 bool ok = _cpp_get_fresh_line (pfile);
5237 gcc_checking_assert (ok);
5239 /* Ensure proper column numbering for generated
5240 error messages. */
5241 buffer->line_base -= pos - line_start;
5243 _cpp_handle_directive (pfile, line_start + 1 != pos);
5245 /* Sanitize the line settings. Duplicate #include's can
5246 mess things up. */
5247 // FIXME: Necessary?
5248 pfile->line_table->highest_location
5249 = pfile->line_table->highest_line;
5251 if (!pfile->state.skipping
5252 && pfile->buffer->next_line < pfile->buffer->rlimit)
5253 cb (pfile, CPP_DO_location, data,
5254 pfile->line_table->highest_line);
5256 goto restart;
5258 goto dflt;
5260 case '/':
5262 const unsigned char *peek = do_peek_next (pos, limit);
5263 if (!(*peek == '/' || *peek == '*'))
5264 goto dflt;
5266 /* Line or block comment */
5267 bool is_block = *peek == '*';
5268 bool star = false;
5269 bool esc = false;
5270 location_t sloc
5271 = linemap_position_for_column (pfile->line_table,
5272 pos - line_start);
5274 while (pos < limit)
5276 char c = *pos++;
5277 switch (c)
5279 case '\\':
5280 esc = true;
5281 break;
5283 case '\r':
5284 if (*pos == '\n')
5285 pos++;
5286 /* FALLTHROUGH */
5288 case '\n':
5290 CPP_INCREMENT_LINE (pfile, 0);
5291 line_count++;
5292 line_start = pos;
5293 if (!esc && !is_block)
5295 bol = true;
5296 goto done_comment;
5299 if (!esc)
5300 star = false;
5301 esc = false;
5302 break;
5304 case '*':
5305 if (pos > peek)
5306 star = is_block;
5307 esc = false;
5308 break;
5310 case '/':
5311 if (star)
5312 goto done_comment;
5313 /* FALLTHROUGH */
5315 default:
5316 star = false;
5317 esc = false;
5318 break;
5321 if (pos < limit || is_block)
5322 cpp_error_with_line (pfile, CPP_DL_ERROR, sloc, 0,
5323 "unterminated comment");
5324 done_comment:
5325 lwm = pos;
5326 break;
5329 case '\'':
5330 if (!CPP_OPTION (pfile, digit_separators))
5331 goto delimited_string;
5333 /* Possibly a number punctuator. */
5334 if (!ISIDNUM (*do_peek_next (pos, limit)))
5335 goto delimited_string;
5337 goto quote_peek;
5339 case '\"':
5340 if (!CPP_OPTION (pfile, rliterals))
5341 goto delimited_string;
5343 quote_peek:
5345 /* For ' see if it's a number punctuator
5346 \.?<digit>(<digit>|<identifier-nondigit>
5347 |'<digit>|'<nondigit>|[eEpP]<sign>|\.)* */
5348 /* For " see if it's a raw string
5349 {U,L,u,u8}R. This includes CPP_NUMBER detection,
5350 because that could be 0e+R. */
5351 const unsigned char *peek = pos - 1;
5352 bool quote_first = c == '"';
5353 bool quote_eight = false;
5354 bool maybe_number_start = false;
5355 bool want_number = false;
5357 while ((peek = do_peek_prev (peek, lwm)))
5359 unsigned char p = *peek;
5360 if (quote_first)
5362 if (!raw)
5364 if (p != 'R')
5365 break;
5366 raw = true;
5367 continue;
5370 quote_first = false;
5371 if (p == 'L' || p == 'U' || p == 'u')
5373 else if (p == '8')
5374 quote_eight = true;
5375 else
5376 goto second_raw;
5378 else if (quote_eight)
5380 if (p != 'u')
5382 raw = false;
5383 break;
5385 quote_eight = false;
5387 else if (c == '"')
5389 second_raw:;
5390 if (!want_number && ISIDNUM (p))
5392 raw = false;
5393 break;
5397 if (ISDIGIT (p))
5398 maybe_number_start = true;
5399 else if (p == '.')
5400 want_number = true;
5401 else if (ISIDNUM (p))
5402 maybe_number_start = false;
5403 else if (p == '+' || p == '-')
5405 if (const unsigned char *peek_prev
5406 = do_peek_prev (peek, lwm))
5408 p = *peek_prev;
5409 if (p == 'e' || p == 'E'
5410 || p == 'p' || p == 'P')
5412 want_number = true;
5413 maybe_number_start = false;
5415 else
5416 break;
5418 else
5419 break;
5421 else if (p == '\'' || p == '\"')
5423 /* If this is lwm, this must be the end of a
5424 previous string. So this is a trailing
5425 literal type, (a) if those are allowed,
5426 and (b) maybe_start is false. Otherwise
5427 this must be a CPP_NUMBER because we've
5428 met another ', and we'd have checked that
5429 in its own right. */
5430 if (peek == lwm && CPP_OPTION (pfile, uliterals))
5432 if (!maybe_number_start && !want_number)
5433 /* Must be a literal type. */
5434 raw = false;
5436 else if (p == '\''
5437 && CPP_OPTION (pfile, digit_separators))
5438 maybe_number_start = true;
5439 break;
5441 else if (c == '\'')
5442 break;
5443 else if (!quote_first && !quote_eight)
5444 break;
5447 if (maybe_number_start)
5449 if (c == '\'')
5450 /* A CPP NUMBER. */
5451 goto dflt;
5452 raw = false;
5455 goto delimited_string;
5458 delimited_string:
5460 /* (Possibly raw) string or char literal. */
5461 unsigned char end = c;
5462 int delim_len = -1;
5463 const unsigned char *delim = NULL;
5464 location_t sloc = linemap_position_for_column (pfile->line_table,
5465 pos - line_start);
5466 int esc = 0;
5468 if (raw)
5470 /* There can be no line breaks in the delimiter. */
5471 delim = pos;
5472 for (delim_len = 0; (c = *pos++) != '('; delim_len++)
5474 if (delim_len == 16)
5476 cpp_error_with_line (pfile, CPP_DL_ERROR,
5477 sloc, 0,
5478 "raw string delimiter"
5479 " longer than %d"
5480 " characters",
5481 delim_len);
5482 raw = false;
5483 pos = delim;
5484 break;
5486 if (strchr (") \\\t\v\f\n", c))
5488 cpp_error_with_line (pfile, CPP_DL_ERROR,
5489 sloc, 0,
5490 "invalid character '%c'"
5491 " in raw string"
5492 " delimiter", c);
5493 raw = false;
5494 pos = delim;
5495 break;
5497 if (pos >= limit)
5498 goto bad_string;
5502 while (pos < limit)
5504 char c = *pos++;
5505 switch (c)
5507 case '\\':
5508 if (!raw)
5509 esc++;
5510 break;
5512 case '\r':
5513 if (*pos == '\n')
5514 pos++;
5515 /* FALLTHROUGH */
5517 case '\n':
5519 CPP_INCREMENT_LINE (pfile, 0);
5520 line_count++;
5521 line_start = pos;
5523 if (esc)
5524 esc--;
5525 break;
5527 case ')':
5528 if (raw
5529 && pos + delim_len + 1 < limit
5530 && pos[delim_len] == end
5531 && !memcmp (delim, pos, delim_len))
5533 pos += delim_len + 1;
5534 raw = false;
5535 goto done_string;
5537 break;
5539 default:
5540 if (!raw && !(esc & 1) && c == end)
5541 goto done_string;
5542 esc = 0;
5543 break;
5546 bad_string:
5547 cpp_error_with_line (pfile, CPP_DL_ERROR, sloc, 0,
5548 "unterminated literal");
5550 done_string:
5551 raw = false;
5552 lwm = pos - 1;
5554 goto dflt;
5556 case '_':
5557 case 'e':
5558 case 'i':
5559 case 'm':
5560 if (bol && module_p && !pfile->state.skipping
5561 && do_peek_module (pfile, c, pos, limit))
5563 /* We've seen the start of a module control line.
5564 Start up the tokenizer. */
5565 pos--; /* Backup over the first character. */
5567 /* Backup over whitespace to start of line. */
5568 while (pos > line_start
5569 && (pos[-1] == ' ' || pos[-1] == '\t'))
5570 pos--;
5572 if (pos > base)
5573 cb (pfile, CPP_DO_print, data, line_count, base, pos - base);
5575 /* Prep things for directive handling. */
5576 buffer->next_line = pos;
5577 buffer->need_line = true;
5579 /* Now get tokens until the PRAGMA_EOL. */
5582 location_t spelling;
5583 const cpp_token *tok
5584 = cpp_get_token_with_location (pfile, &spelling);
5586 gcc_assert (pfile->state.in_deferred_pragma
5587 || tok->type == CPP_PRAGMA_EOL);
5588 cb (pfile, CPP_DO_token, data, tok, spelling);
5590 while (pfile->state.in_deferred_pragma);
5592 if (pfile->buffer->next_line < pfile->buffer->rlimit)
5593 cb (pfile, CPP_DO_location, data,
5594 pfile->line_table->highest_line);
5596 pfile->mi_valid = false;
5597 goto restart;
5599 goto dflt;
5601 default:
5602 dflt:
5603 bol = false;
5604 pfile->mi_valid = false;
5605 break;
5609 if (buffer->rlimit > base && !pfile->state.skipping)
5611 const unsigned char *limit = buffer->rlimit;
5612 /* If the file was not newline terminated, add rlimit, which is
5613 guaranteed to point to a newline, to the end of our range. */
5614 if (limit[-1] != '\n')
5616 limit++;
5617 CPP_INCREMENT_LINE (pfile, 0);
5618 line_count++;
5620 cb (pfile, CPP_DO_print, data, line_count, base, limit - base);
5623 _cpp_pop_buffer (pfile);
5625 while (pfile->buffer);