d: Merge upstream dmd d579c467c1, phobos 88aa69b14.
[official-gcc.git] / libcpp / lex.cc
blob41f905dea165da71db8e682a76627b2318154a42
1 /* CPP Library - lexical analysis.
2 Copyright (C) 2000-2022 Free Software Foundation, Inc.
3 Contributed by Per Bothner, 1994-95.
4 Based on CCCP program by Paul Rubin, June 1986
5 Adapted to ANSI C, Richard Stallman, Jan 1987
6 Broken out to separate file, Zack Weinberg, Mar 2000
8 This program is free software; you can redistribute it and/or modify it
9 under the terms of the GNU General Public License as published by the
10 Free Software Foundation; either version 3, or (at your option) any
11 later version.
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
18 You should have received a copy of the GNU General Public License
19 along with this program; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "cpplib.h"
25 #include "internal.h"
/* How a token type is spelled.  One of these values is stored in the
   CATEGORY field of every token_spellings entry.  */
enum spell_type
{
  SPELL_OPERATOR = 0,   /* Category given to every OP () table entry.  */
  SPELL_IDENT,
  SPELL_LITERAL,
  SPELL_NONE
};
35 struct token_spelling
37 enum spell_type category;
38 const unsigned char *name;
41 static const unsigned char *const digraph_spellings[] =
42 { UC"%:", UC"%:%:", UC"<:", UC":>", UC"<%", UC"%>" };
44 #define OP(e, s) { SPELL_OPERATOR, UC s },
45 #define TK(e, s) { SPELL_ ## s, UC #e },
46 static const struct token_spelling token_spellings[N_TTYPES] = { TTYPE_TABLE };
47 #undef OP
48 #undef TK
50 #define TOKEN_SPELL(token) (token_spellings[(token)->type].category)
51 #define TOKEN_NAME(token) (token_spellings[(token)->type].name)
53 /* ISO 10646 defines the UCS codespace as the range 0-0x10FFFF inclusive. */
54 #define UCS_LIMIT 0x10FFFF
56 static void add_line_note (cpp_buffer *, const uchar *, unsigned int);
57 static int skip_line_comment (cpp_reader *);
58 static void skip_whitespace (cpp_reader *, cppchar_t);
59 static void lex_string (cpp_reader *, cpp_token *, const uchar *);
60 static void save_comment (cpp_reader *, cpp_token *, const uchar *, cppchar_t);
61 static void store_comment (cpp_reader *, cpp_token *);
62 static void create_literal (cpp_reader *, cpp_token *, const uchar *,
63 unsigned int, enum cpp_ttype);
64 static bool warn_in_comment (cpp_reader *, _cpp_line_note *);
65 static int name_p (cpp_reader *, const cpp_string *);
66 static tokenrun *next_tokenrun (tokenrun *);
68 static _cpp_buff *new_buff (size_t);
71 /* Utility routine:
73 Compares, the token TOKEN to the NUL-terminated string STRING.
74 TOKEN must be a CPP_NAME. Returns 1 for equal, 0 for unequal. */
75 int
76 cpp_ideq (const cpp_token *token, const char *string)
78 if (token->type != CPP_NAME)
79 return 0;
81 return !ustrcmp (NODE_NAME (token->val.node.node), (const uchar *) string);
84 /* Record a note TYPE at byte POS into the current cleaned logical
85 line. */
86 static void
87 add_line_note (cpp_buffer *buffer, const uchar *pos, unsigned int type)
89 if (buffer->notes_used == buffer->notes_cap)
91 buffer->notes_cap = buffer->notes_cap * 2 + 200;
92 buffer->notes = XRESIZEVEC (_cpp_line_note, buffer->notes,
93 buffer->notes_cap);
96 buffer->notes[buffer->notes_used].pos = pos;
97 buffer->notes[buffer->notes_used].type = type;
98 buffer->notes_used++;
102 /* Fast path to find line special characters using optimized character
103 scanning algorithms. Anything complicated falls back to the slow
104 path below. Since this loop is very hot it's worth doing these kinds
105 of optimizations.
107 One of the paths through the ifdefs should provide
109 const uchar *search_line_fast (const uchar *s, const uchar *end);
111 Between S and END, search for \n, \r, \\, ?. Return a pointer to
112 the found character.
114 Note that the last character of the buffer is *always* a newline,
115 as forced by _cpp_convert_input. This fact can be used to avoid
116 explicitly looking for the end of the buffer. */
/* Configure gives us an ifdef test; make sure the macro always has a
   value so that it can be used in ordinary `if' conditions.  */
#ifndef WORDS_BIGENDIAN
#define WORDS_BIGENDIAN 0
#endif

/* We'd like the largest integer that fits into a register.  There's
   nothing in <stdint.h> that gives us that.  For most hosts this is
   unsigned long, but MS decided on an LLP64 model.  Thankfully when
   building with GCC we can get the "real" word size.  */
#ifdef __GNUC__
typedef unsigned int word_type __attribute__((__mode__(__word__)));
#else
typedef unsigned long word_type;
#endif

/* The code below is only expecting sizes 4 or 8.  Die at compile-time
   if this expectation is violated: the array bound evaluates to -1
   for any other size.  */
typedef char check_word_type_size
  [(sizeof(word_type) == 8 || sizeof(word_type) == 4) * 2 - 1];
138 /* Return X with the first N bytes forced to values that won't match one
139 of the interesting characters. Note that NUL is not interesting. */
141 static inline word_type
142 acc_char_mask_misalign (word_type val, unsigned int n)
144 word_type mask = -1;
145 if (WORDS_BIGENDIAN)
146 mask >>= n * 8;
147 else
148 mask <<= n * 8;
149 return val & mask;
152 /* Return X replicated to all byte positions within WORD_TYPE. */
154 static inline word_type
155 acc_char_replicate (uchar x)
157 word_type ret;
159 ret = (x << 24) | (x << 16) | (x << 8) | x;
160 if (sizeof(word_type) == 8)
161 ret = (ret << 16 << 16) | ret;
162 return ret;
165 /* Return non-zero if some byte of VAL is (probably) C. */
167 static inline word_type
168 acc_char_cmp (word_type val, word_type c)
170 #if defined(__GNUC__) && defined(__alpha__)
171 /* We can get exact results using a compare-bytes instruction.
172 Get (val == c) via (0 >= (val ^ c)). */
173 return __builtin_alpha_cmpbge (0, val ^ c);
174 #else
175 word_type magic = 0x7efefefeU;
176 if (sizeof(word_type) == 8)
177 magic = (magic << 16 << 16) | 0xfefefefeU;
178 magic |= 1;
180 val ^= c;
181 return ((val + magic) ^ ~val) & ~magic;
182 #endif
185 /* Given the result of acc_char_cmp is non-zero, return the index of
186 the found character. If this was a false positive, return -1. */
188 static inline int
189 acc_char_index (word_type cmp ATTRIBUTE_UNUSED,
190 word_type val ATTRIBUTE_UNUSED)
192 #if defined(__GNUC__) && defined(__alpha__) && !WORDS_BIGENDIAN
193 /* The cmpbge instruction sets *bits* of the result corresponding to
194 matches in the bytes with no false positives. */
195 return __builtin_ctzl (cmp);
196 #else
197 unsigned int i;
199 /* ??? It would be nice to force unrolling here,
200 and have all of these constants folded. */
201 for (i = 0; i < sizeof(word_type); ++i)
203 uchar c;
204 if (WORDS_BIGENDIAN)
205 c = (val >> (sizeof(word_type) - i - 1) * 8) & 0xff;
206 else
207 c = (val >> i * 8) & 0xff;
209 if (c == '\n' || c == '\r' || c == '\\' || c == '?')
210 return i;
213 return -1;
214 #endif
217 /* A version of the fast scanner using bit fiddling techniques.
219 For 32-bit words, one would normally perform 16 comparisons and
220 16 branches. With this algorithm one performs 24 arithmetic
221 operations and one branch. Whether this is faster with a 32-bit
222 word size is going to be somewhat system dependent.
224 For 64-bit words, we eliminate twice the number of comparisons
225 and branches without increasing the number of arithmetic operations.
226 It's almost certainly going to be a win with 64-bit word size. */
228 static const uchar * search_line_acc_char (const uchar *, const uchar *)
229 ATTRIBUTE_UNUSED;
231 static const uchar *
232 search_line_acc_char (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
234 const word_type repl_nl = acc_char_replicate ('\n');
235 const word_type repl_cr = acc_char_replicate ('\r');
236 const word_type repl_bs = acc_char_replicate ('\\');
237 const word_type repl_qm = acc_char_replicate ('?');
239 unsigned int misalign;
240 const word_type *p;
241 word_type val, t;
243 /* Align the buffer. Mask out any bytes from before the beginning. */
244 p = (word_type *)((uintptr_t)s & -sizeof(word_type));
245 val = *p;
246 misalign = (uintptr_t)s & (sizeof(word_type) - 1);
247 if (misalign)
248 val = acc_char_mask_misalign (val, misalign);
250 /* Main loop. */
251 while (1)
253 t = acc_char_cmp (val, repl_nl);
254 t |= acc_char_cmp (val, repl_cr);
255 t |= acc_char_cmp (val, repl_bs);
256 t |= acc_char_cmp (val, repl_qm);
258 if (__builtin_expect (t != 0, 0))
260 int i = acc_char_index (t, val);
261 if (i >= 0)
262 return (const uchar *)p + i;
265 val = *++p;
269 /* Disable on Solaris 2/x86 until the following problem can be properly
270 autoconfed:
272 The Solaris 10+ assembler tags objects with the instruction set
273 extensions used, so SSE4.2 executables cannot run on machines that
274 don't support that extension. */
276 #if (GCC_VERSION >= 4005) && (__GNUC__ >= 5 || !defined(__PIC__)) && (defined(__i386__) || defined(__x86_64__)) && !(defined(__sun__) && defined(__svr4__))
/* Replicated character data to be shared between implementations.
   Recall that outside of a context with vector support we can't
   define compatible vector types, therefore these are all defined
   in terms of raw characters.  Row 0 is newline, row 1 carriage
   return, row 2 backslash and row 3 question mark.  */
static const char repl_chars[4][16] __attribute__((aligned(16))) = {
  { '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
    '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n' },
  { '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
    '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r' },
  { '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
    '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\' },
  { '?', '?', '?', '?', '?', '?', '?', '?',
    '?', '?', '?', '?', '?', '?', '?', '?' },
};
293 /* A version of the fast scanner using MMX vectorized byte compare insns.
295 This uses the PMOVMSKB instruction which was introduced with "MMX2",
296 which was packaged into SSE1; it is also present in the AMD MMX
297 extension. Mark the function as using "sse" so that we emit a real
298 "emms" instruction, rather than the 3dNOW "femms" instruction. */
300 static const uchar *
301 #ifndef __SSE__
302 __attribute__((__target__("sse")))
303 #endif
304 search_line_mmx (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
306 typedef char v8qi __attribute__ ((__vector_size__ (8)));
307 typedef int __m64 __attribute__ ((__vector_size__ (8), __may_alias__));
309 const v8qi repl_nl = *(const v8qi *)repl_chars[0];
310 const v8qi repl_cr = *(const v8qi *)repl_chars[1];
311 const v8qi repl_bs = *(const v8qi *)repl_chars[2];
312 const v8qi repl_qm = *(const v8qi *)repl_chars[3];
314 unsigned int misalign, found, mask;
315 const v8qi *p;
316 v8qi data, t, c;
318 /* Align the source pointer. While MMX doesn't generate unaligned data
319 faults, this allows us to safely scan to the end of the buffer without
320 reading beyond the end of the last page. */
321 misalign = (uintptr_t)s & 7;
322 p = (const v8qi *)((uintptr_t)s & -8);
323 data = *p;
325 /* Create a mask for the bytes that are valid within the first
326 16-byte block. The Idea here is that the AND with the mask
327 within the loop is "free", since we need some AND or TEST
328 insn in order to set the flags for the branch anyway. */
329 mask = -1u << misalign;
331 /* Main loop processing 8 bytes at a time. */
332 goto start;
335 data = *++p;
336 mask = -1;
338 start:
339 t = __builtin_ia32_pcmpeqb(data, repl_nl);
340 c = __builtin_ia32_pcmpeqb(data, repl_cr);
341 t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
342 c = __builtin_ia32_pcmpeqb(data, repl_bs);
343 t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
344 c = __builtin_ia32_pcmpeqb(data, repl_qm);
345 t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
346 found = __builtin_ia32_pmovmskb (t);
347 found &= mask;
349 while (!found);
351 __builtin_ia32_emms ();
353 /* FOUND contains 1 in bits for which we matched a relevant
354 character. Conversion to the byte index is trivial. */
355 found = __builtin_ctz(found);
356 return (const uchar *)p + found;
359 /* A version of the fast scanner using SSE2 vectorized byte compare insns. */
361 static const uchar *
362 #ifndef __SSE2__
363 __attribute__((__target__("sse2")))
364 #endif
365 search_line_sse2 (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
367 typedef char v16qi __attribute__ ((__vector_size__ (16)));
369 const v16qi repl_nl = *(const v16qi *)repl_chars[0];
370 const v16qi repl_cr = *(const v16qi *)repl_chars[1];
371 const v16qi repl_bs = *(const v16qi *)repl_chars[2];
372 const v16qi repl_qm = *(const v16qi *)repl_chars[3];
374 unsigned int misalign, found, mask;
375 const v16qi *p;
376 v16qi data, t;
378 /* Align the source pointer. */
379 misalign = (uintptr_t)s & 15;
380 p = (const v16qi *)((uintptr_t)s & -16);
381 data = *p;
383 /* Create a mask for the bytes that are valid within the first
384 16-byte block. The Idea here is that the AND with the mask
385 within the loop is "free", since we need some AND or TEST
386 insn in order to set the flags for the branch anyway. */
387 mask = -1u << misalign;
389 /* Main loop processing 16 bytes at a time. */
390 goto start;
393 data = *++p;
394 mask = -1;
396 start:
397 t = data == repl_nl;
398 t |= data == repl_cr;
399 t |= data == repl_bs;
400 t |= data == repl_qm;
401 found = __builtin_ia32_pmovmskb128 (t);
402 found &= mask;
404 while (!found);
406 /* FOUND contains 1 in bits for which we matched a relevant
407 character. Conversion to the byte index is trivial. */
408 found = __builtin_ctz(found);
409 return (const uchar *)p + found;
#ifdef HAVE_SSE4
/* A version of the fast scanner using SSE 4.2 vectorized string insns.
   Returns a pointer to the first \n, \r, \\ or ? at or after S.  END
   is only used to decide whether an unaligned 16-byte read at S could
   run off the end of the buffer.  */

static const uchar *
#ifndef __SSE4_2__
__attribute__((__target__("sse4.2")))
#endif
search_line_sse42 (const uchar *s, const uchar *end)
{
  typedef char v16qi __attribute__ ((__vector_size__ (16)));
  /* The set of characters searched for; only the first four lanes
     are used (see the explicit length 4 passed to PCMPESTRI).  */
  static const v16qi search = { '\n', '\r', '?', '\\' };

  uintptr_t si = (uintptr_t)s;
  uintptr_t index;

  /* Check for unaligned input.  */
  if (si & 15)
    {
      v16qi sv;

      if (__builtin_expect (end - s < 16, 0)
          && __builtin_expect ((si & 0xfff) > 0xff0, 0))
        {
          /* There are less than 16 bytes left in the buffer, and less
             than 16 bytes left on the page.  Reading 16 bytes at this
             point might generate a spurious page fault.  Defer to the
             SSE2 implementation, which already handles alignment.  */
          return search_line_sse2 (s, end);
        }

      /* ??? The builtin doesn't understand that the PCMPESTRI read from
         memory need not be aligned.  */
      sv = __builtin_ia32_loaddqu ((const char *) s);
      index = __builtin_ia32_pcmpestri128 (search, 4, sv, 16, 0);

      if (__builtin_expect (index < 16, 0))
        goto found;

      /* Advance the pointer to an aligned address.  We will re-scan a
         few bytes, but we no longer need care for reading past the
         end of a page, since we're guaranteed a match.  */
      s = (const uchar *)((si + 15) & -16);
    }

  /* Main loop, processing 16 bytes at a time.  */
#ifdef __GCC_ASM_FLAG_OUTPUTS__
  for (;;)
    {
      char f;

      /* By using inline assembly instead of the builtin,
         we can use the result, as well as the flags set.  */
      __asm ("%vpcmpestri\t$0, %2, %3"
             : "=c"(index), "=@ccc"(f)
             : "m"(*s), "x"(search), "a"(4), "d"(16));
      if (f)
        break;

      s += 16;
    }
#else
  /* Compensate for the ADD at the top of the assembly loop.  */
  s -= 16;
  /* By doing the whole loop in inline assembly,
     we can make proper use of the flags set.  */
  __asm (      ".balign 16\n"
        "0: add $16, %1\n"
        " %vpcmpestri\t$0, (%1), %2\n"
        " jnc 0b"
        : "=&c"(index), "+r"(s)
        : "x"(search), "a"(4), "d"(16));
#endif

 found:
  return s + index;
}

#else
/* Work around out-dated assemblers without sse4 support.  */
#define search_line_sse42 search_line_sse2
#endif
493 /* Check the CPU capabilities. */
495 #include "../gcc/config/i386/cpuid.h"
497 typedef const uchar * (*search_line_fast_type) (const uchar *, const uchar *);
498 static search_line_fast_type search_line_fast;
500 #define HAVE_init_vectorized_lexer 1
501 static inline void
502 init_vectorized_lexer (void)
504 unsigned dummy, ecx = 0, edx = 0;
505 search_line_fast_type impl = search_line_acc_char;
506 int minimum = 0;
508 #if defined(__SSE4_2__)
509 minimum = 3;
510 #elif defined(__SSE2__)
511 minimum = 2;
512 #elif defined(__SSE__)
513 minimum = 1;
514 #endif
516 if (minimum == 3)
517 impl = search_line_sse42;
518 else if (__get_cpuid (1, &dummy, &dummy, &ecx, &edx) || minimum == 2)
520 if (minimum == 3 || (ecx & bit_SSE4_2))
521 impl = search_line_sse42;
522 else if (minimum == 2 || (edx & bit_SSE2))
523 impl = search_line_sse2;
524 else if (minimum == 1 || (edx & bit_SSE))
525 impl = search_line_mmx;
527 else if (__get_cpuid (0x80000001, &dummy, &dummy, &dummy, &edx))
529 if (minimum == 1
530 || (edx & (bit_MMXEXT | bit_CMOV)) == (bit_MMXEXT | bit_CMOV))
531 impl = search_line_mmx;
534 search_line_fast = impl;
537 #elif (GCC_VERSION >= 4005) && defined(_ARCH_PWR8) && defined(__ALTIVEC__)
539 /* A vection of the fast scanner using AltiVec vectorized byte compares
540 and VSX unaligned loads (when VSX is available). This is otherwise
541 the same as the AltiVec version. */
543 ATTRIBUTE_NO_SANITIZE_UNDEFINED
544 static const uchar *
545 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
547 typedef __attribute__((altivec(vector))) unsigned char vc;
549 const vc repl_nl = {
550 '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
551 '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
553 const vc repl_cr = {
554 '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
555 '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
557 const vc repl_bs = {
558 '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
559 '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
561 const vc repl_qm = {
562 '?', '?', '?', '?', '?', '?', '?', '?',
563 '?', '?', '?', '?', '?', '?', '?', '?',
565 const vc zero = { 0 };
567 vc data, t;
569 /* Main loop processing 16 bytes at a time. */
572 vc m_nl, m_cr, m_bs, m_qm;
574 data = __builtin_vec_vsx_ld (0, s);
575 s += 16;
577 m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
578 m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
579 m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
580 m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
581 t = (m_nl | m_cr) | (m_bs | m_qm);
583 /* T now contains 0xff in bytes for which we matched one of the relevant
584 characters. We want to exit the loop if any byte in T is non-zero.
585 Below is the expansion of vec_any_ne(t, zero). */
587 while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));
589 /* Restore s to to point to the 16 bytes we just processed. */
590 s -= 16;
593 #define N (sizeof(vc) / sizeof(long))
595 union {
596 vc v;
597 /* Statically assert that N is 2 or 4. */
598 unsigned long l[(N == 2 || N == 4) ? N : -1];
599 } u;
600 unsigned long l, i = 0;
602 u.v = t;
604 /* Find the first word of T that is non-zero. */
605 switch (N)
607 case 4:
608 l = u.l[i++];
609 if (l != 0)
610 break;
611 s += sizeof(unsigned long);
612 l = u.l[i++];
613 if (l != 0)
614 break;
615 s += sizeof(unsigned long);
616 /* FALLTHRU */
617 case 2:
618 l = u.l[i++];
619 if (l != 0)
620 break;
621 s += sizeof(unsigned long);
622 l = u.l[i];
625 /* L now contains 0xff in bytes for which we matched one of the
626 relevant characters. We can find the byte index by finding
627 its bit index and dividing by 8. */
628 #ifdef __BIG_ENDIAN__
629 l = __builtin_clzl(l) >> 3;
630 #else
631 l = __builtin_ctzl(l) >> 3;
632 #endif
633 return s + l;
635 #undef N
639 #elif (GCC_VERSION >= 4005) && defined(__ALTIVEC__) && defined (__BIG_ENDIAN__)
641 /* A vection of the fast scanner using AltiVec vectorized byte compares.
642 This cannot be used for little endian because vec_lvsl/lvsr are
643 deprecated for little endian and the code won't work properly. */
644 /* ??? Unfortunately, attribute(target("altivec")) is not yet supported,
645 so we can't compile this function without -maltivec on the command line
646 (or implied by some other switch). */
648 static const uchar *
649 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
651 typedef __attribute__((altivec(vector))) unsigned char vc;
653 const vc repl_nl = {
654 '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
655 '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
657 const vc repl_cr = {
658 '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
659 '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
661 const vc repl_bs = {
662 '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
663 '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
665 const vc repl_qm = {
666 '?', '?', '?', '?', '?', '?', '?', '?',
667 '?', '?', '?', '?', '?', '?', '?', '?',
669 const vc ones = {
670 -1, -1, -1, -1, -1, -1, -1, -1,
671 -1, -1, -1, -1, -1, -1, -1, -1,
673 const vc zero = { 0 };
675 vc data, mask, t;
677 /* Altivec loads automatically mask addresses with -16. This lets us
678 issue the first load as early as possible. */
679 data = __builtin_vec_ld(0, (const vc *)s);
681 /* Discard bytes before the beginning of the buffer. Do this by
682 beginning with all ones and shifting in zeros according to the
683 mis-alignment. The LVSR instruction pulls the exact shift we
684 want from the address. */
685 mask = __builtin_vec_lvsr(0, s);
686 mask = __builtin_vec_perm(zero, ones, mask);
687 data &= mask;
689 /* While altivec loads mask addresses, we still need to align S so
690 that the offset we compute at the end is correct. */
691 s = (const uchar *)((uintptr_t)s & -16);
693 /* Main loop processing 16 bytes at a time. */
694 goto start;
697 vc m_nl, m_cr, m_bs, m_qm;
699 s += 16;
700 data = __builtin_vec_ld(0, (const vc *)s);
702 start:
703 m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
704 m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
705 m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
706 m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
707 t = (m_nl | m_cr) | (m_bs | m_qm);
709 /* T now contains 0xff in bytes for which we matched one of the relevant
710 characters. We want to exit the loop if any byte in T is non-zero.
711 Below is the expansion of vec_any_ne(t, zero). */
713 while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));
716 #define N (sizeof(vc) / sizeof(long))
718 union {
719 vc v;
720 /* Statically assert that N is 2 or 4. */
721 unsigned long l[(N == 2 || N == 4) ? N : -1];
722 } u;
723 unsigned long l, i = 0;
725 u.v = t;
727 /* Find the first word of T that is non-zero. */
728 switch (N)
730 case 4:
731 l = u.l[i++];
732 if (l != 0)
733 break;
734 s += sizeof(unsigned long);
735 l = u.l[i++];
736 if (l != 0)
737 break;
738 s += sizeof(unsigned long);
739 /* FALLTHROUGH */
740 case 2:
741 l = u.l[i++];
742 if (l != 0)
743 break;
744 s += sizeof(unsigned long);
745 l = u.l[i];
748 /* L now contains 0xff in bytes for which we matched one of the
749 relevant characters. We can find the byte index by finding
750 its bit index and dividing by 8. */
751 l = __builtin_clzl(l) >> 3;
752 return s + l;
754 #undef N
758 #elif defined (__ARM_NEON) && defined (__ARM_64BIT_STATE)
759 #include "arm_neon.h"
761 /* This doesn't have to be the exact page size, but no system may use
762 a size smaller than this. ARMv8 requires a minimum page size of
763 4k. The impact of being conservative here is a small number of
764 cases will take the slightly slower entry path into the main
765 loop. */
767 #define AARCH64_MIN_PAGE_SIZE 4096
769 static const uchar *
770 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
772 const uint8x16_t repl_nl = vdupq_n_u8 ('\n');
773 const uint8x16_t repl_cr = vdupq_n_u8 ('\r');
774 const uint8x16_t repl_bs = vdupq_n_u8 ('\\');
775 const uint8x16_t repl_qm = vdupq_n_u8 ('?');
776 const uint8x16_t xmask = (uint8x16_t) vdupq_n_u64 (0x8040201008040201ULL);
778 #ifdef __ARM_BIG_ENDIAN
779 const int16x8_t shift = {8, 8, 8, 8, 0, 0, 0, 0};
780 #else
781 const int16x8_t shift = {0, 0, 0, 0, 8, 8, 8, 8};
782 #endif
784 unsigned int found;
785 const uint8_t *p;
786 uint8x16_t data;
787 uint8x16_t t;
788 uint16x8_t m;
789 uint8x16_t u, v, w;
791 /* Align the source pointer. */
792 p = (const uint8_t *)((uintptr_t)s & -16);
794 /* Assuming random string start positions, with a 4k page size we'll take
795 the slow path about 0.37% of the time. */
796 if (__builtin_expect ((AARCH64_MIN_PAGE_SIZE
797 - (((uintptr_t) s) & (AARCH64_MIN_PAGE_SIZE - 1)))
798 < 16, 0))
800 /* Slow path: the string starts near a possible page boundary. */
801 uint32_t misalign, mask;
803 misalign = (uintptr_t)s & 15;
804 mask = (-1u << misalign) & 0xffff;
805 data = vld1q_u8 (p);
806 t = vceqq_u8 (data, repl_nl);
807 u = vceqq_u8 (data, repl_cr);
808 v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
809 w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
810 t = vorrq_u8 (v, w);
811 t = vandq_u8 (t, xmask);
812 m = vpaddlq_u8 (t);
813 m = vshlq_u16 (m, shift);
814 found = vaddvq_u16 (m);
815 found &= mask;
816 if (found)
817 return (const uchar*)p + __builtin_ctz (found);
819 else
821 data = vld1q_u8 ((const uint8_t *) s);
822 t = vceqq_u8 (data, repl_nl);
823 u = vceqq_u8 (data, repl_cr);
824 v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
825 w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
826 t = vorrq_u8 (v, w);
827 if (__builtin_expect (vpaddd_u64 ((uint64x2_t)t) != 0, 0))
828 goto done;
833 p += 16;
834 data = vld1q_u8 (p);
835 t = vceqq_u8 (data, repl_nl);
836 u = vceqq_u8 (data, repl_cr);
837 v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
838 w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
839 t = vorrq_u8 (v, w);
840 } while (!vpaddd_u64 ((uint64x2_t)t));
842 done:
843 /* Now that we've found the terminating substring, work out precisely where
844 we need to stop. */
845 t = vandq_u8 (t, xmask);
846 m = vpaddlq_u8 (t);
847 m = vshlq_u16 (m, shift);
848 found = vaddvq_u16 (m);
849 return (((((uintptr_t) p) < (uintptr_t) s) ? s : (const uchar *)p)
850 + __builtin_ctz (found));
853 #elif defined (__ARM_NEON)
854 #include "arm_neon.h"
856 static const uchar *
857 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
859 const uint8x16_t repl_nl = vdupq_n_u8 ('\n');
860 const uint8x16_t repl_cr = vdupq_n_u8 ('\r');
861 const uint8x16_t repl_bs = vdupq_n_u8 ('\\');
862 const uint8x16_t repl_qm = vdupq_n_u8 ('?');
863 const uint8x16_t xmask = (uint8x16_t) vdupq_n_u64 (0x8040201008040201ULL);
865 unsigned int misalign, found, mask;
866 const uint8_t *p;
867 uint8x16_t data;
869 /* Align the source pointer. */
870 misalign = (uintptr_t)s & 15;
871 p = (const uint8_t *)((uintptr_t)s & -16);
872 data = vld1q_u8 (p);
874 /* Create a mask for the bytes that are valid within the first
875 16-byte block. The Idea here is that the AND with the mask
876 within the loop is "free", since we need some AND or TEST
877 insn in order to set the flags for the branch anyway. */
878 mask = (-1u << misalign) & 0xffff;
880 /* Main loop, processing 16 bytes at a time. */
881 goto start;
885 uint8x8_t l;
886 uint16x4_t m;
887 uint32x2_t n;
888 uint8x16_t t, u, v, w;
890 p += 16;
891 data = vld1q_u8 (p);
892 mask = 0xffff;
894 start:
895 t = vceqq_u8 (data, repl_nl);
896 u = vceqq_u8 (data, repl_cr);
897 v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
898 w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
899 t = vandq_u8 (vorrq_u8 (v, w), xmask);
900 l = vpadd_u8 (vget_low_u8 (t), vget_high_u8 (t));
901 m = vpaddl_u8 (l);
902 n = vpaddl_u16 (m);
904 found = vget_lane_u32 ((uint32x2_t) vorr_u64 ((uint64x1_t) n,
905 vshr_n_u64 ((uint64x1_t) n, 24)), 0);
906 found &= mask;
908 while (!found);
910 /* FOUND contains 1 in bits for which we matched a relevant
911 character. Conversion to the byte index is trivial. */
912 found = __builtin_ctz (found);
913 return (const uchar *)p + found;
916 #else
/* No vectorized scanner is available for this host, so the
   word-at-a-time scanner is the only accelerated alternative.  Alias
   it directly, rather than going through a pointer, so that we
   encourage inlining.  */

#define search_line_fast search_line_acc_char
923 #endif
/* Initialize the lexer if needed.  On hosts that define
   HAVE_init_vectorized_lexer this runs init_vectorized_lexer;
   elsewhere it does nothing.  */

void
_cpp_init_lexer (void)
{
#ifdef HAVE_init_vectorized_lexer
  init_vectorized_lexer ();
#endif
}
935 /* Returns with a logical line that contains no escaped newlines or
936 trigraphs. This is a time-critical inner loop. */
937 void
938 _cpp_clean_line (cpp_reader *pfile)
940 cpp_buffer *buffer;
941 const uchar *s;
942 uchar c, *d, *p;
944 buffer = pfile->buffer;
945 buffer->cur_note = buffer->notes_used = 0;
946 buffer->cur = buffer->line_base = buffer->next_line;
947 buffer->need_line = false;
948 s = buffer->next_line;
950 if (!buffer->from_stage3)
952 const uchar *pbackslash = NULL;
954 /* Fast path. This is the common case of an un-escaped line with
955 no trigraphs. The primary win here is by not writing any
956 data back to memory until we have to. */
957 while (1)
959 /* Perform an optimized search for \n, \r, \\, ?. */
960 s = search_line_fast (s, buffer->rlimit);
962 c = *s;
963 if (c == '\\')
965 /* Record the location of the backslash and continue. */
966 pbackslash = s++;
968 else if (__builtin_expect (c == '?', 0))
970 if (__builtin_expect (s[1] == '?', false)
971 && _cpp_trigraph_map[s[2]])
973 /* Have a trigraph. We may or may not have to convert
974 it. Add a line note regardless, for -Wtrigraphs. */
975 add_line_note (buffer, s, s[2]);
976 if (CPP_OPTION (pfile, trigraphs))
978 /* We do, and that means we have to switch to the
979 slow path. */
980 d = (uchar *) s;
981 *d = _cpp_trigraph_map[s[2]];
982 s += 2;
983 goto slow_path;
986 /* Not a trigraph. Continue on fast-path. */
987 s++;
989 else
990 break;
993 /* This must be \r or \n. We're either done, or we'll be forced
994 to write back to the buffer and continue on the slow path. */
995 d = (uchar *) s;
997 if (__builtin_expect (s == buffer->rlimit, false))
998 goto done;
1000 /* DOS line ending? */
1001 if (__builtin_expect (c == '\r', false) && s[1] == '\n')
1003 s++;
1004 if (s == buffer->rlimit)
1005 goto done;
1008 if (__builtin_expect (pbackslash == NULL, true))
1009 goto done;
1011 /* Check for escaped newline. */
1012 p = d;
1013 while (is_nvspace (p[-1]))
1014 p--;
1015 if (p - 1 != pbackslash)
1016 goto done;
1018 /* Have an escaped newline; process it and proceed to
1019 the slow path. */
1020 add_line_note (buffer, p - 1, p != d ? ' ' : '\\');
1021 d = p - 2;
1022 buffer->next_line = p - 1;
1024 slow_path:
1025 while (1)
1027 c = *++s;
1028 *++d = c;
1030 if (c == '\n' || c == '\r')
1032 /* Handle DOS line endings. */
1033 if (c == '\r' && s != buffer->rlimit && s[1] == '\n')
1034 s++;
1035 if (s == buffer->rlimit)
1036 break;
1038 /* Escaped? */
1039 p = d;
1040 while (p != buffer->next_line && is_nvspace (p[-1]))
1041 p--;
1042 if (p == buffer->next_line || p[-1] != '\\')
1043 break;
1045 add_line_note (buffer, p - 1, p != d ? ' ': '\\');
1046 d = p - 2;
1047 buffer->next_line = p - 1;
1049 else if (c == '?' && s[1] == '?' && _cpp_trigraph_map[s[2]])
1051 /* Add a note regardless, for the benefit of -Wtrigraphs. */
1052 add_line_note (buffer, d, s[2]);
1053 if (CPP_OPTION (pfile, trigraphs))
1055 *d = _cpp_trigraph_map[s[2]];
1056 s += 2;
1061 else
1063 while (*s != '\n' && *s != '\r')
1064 s++;
1065 d = (uchar *) s;
1067 /* Handle DOS line endings. */
1068 if (*s == '\r' && s + 1 != buffer->rlimit && s[1] == '\n')
1069 s++;
1072 done:
1073 *d = '\n';
1074 /* A sentinel note that should never be processed. */
1075 add_line_note (buffer, d + 1, '\n');
1076 buffer->next_line = s + 1;
1079 /* Return true if the trigraph indicated by NOTE should be warned
1080 about in a comment. */
1081 static bool
1082 warn_in_comment (cpp_reader *pfile, _cpp_line_note *note)
1084 const uchar *p;
1086 /* Within comments we don't warn about trigraphs, unless the
1087 trigraph forms an escaped newline, as that may change
1088 behavior. */
1089 if (note->type != '/')
1090 return false;
1092 /* If -trigraphs, then this was an escaped newline iff the next note
1093 is coincident. */
1094 if (CPP_OPTION (pfile, trigraphs))
1095 return note[1].pos == note->pos;
1097 /* Otherwise, see if this forms an escaped newline. */
1098 p = note->pos + 3;
1099 while (is_nvspace (*p))
1100 p++;
1102 /* There might have been escaped newlines between the trigraph and the
1103 newline we found. Hence the position test. */
1104 return (*p == '\n' && p < note[1].pos);
1107 /* Process the notes created by add_line_note as far as the current
1108 location. */
1109 void
1110 _cpp_process_line_notes (cpp_reader *pfile, int in_comment)
1112 cpp_buffer *buffer = pfile->buffer;
1114 for (;;)
1116 _cpp_line_note *note = &buffer->notes[buffer->cur_note];
1117 unsigned int col;
1119 if (note->pos > buffer->cur)
1120 break;
1122 buffer->cur_note++;
1123 col = CPP_BUF_COLUMN (buffer, note->pos + 1);
1125 if (note->type == '\\' || note->type == ' ')
1127 if (note->type == ' ' && !in_comment)
1128 cpp_error_with_line (pfile, CPP_DL_WARNING, pfile->line_table->highest_line, col,
1129 "backslash and newline separated by space");
1131 if (buffer->next_line > buffer->rlimit)
1133 cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line, col,
1134 "backslash-newline at end of file");
1135 /* Prevent "no newline at end of file" warning. */
1136 buffer->next_line = buffer->rlimit;
1139 buffer->line_base = note->pos;
1140 CPP_INCREMENT_LINE (pfile, 0);
1142 else if (_cpp_trigraph_map[note->type])
1144 if (CPP_OPTION (pfile, warn_trigraphs)
1145 && (!in_comment || warn_in_comment (pfile, note)))
1147 if (CPP_OPTION (pfile, trigraphs))
1148 cpp_warning_with_line (pfile, CPP_W_TRIGRAPHS,
1149 pfile->line_table->highest_line, col,
1150 "trigraph ??%c converted to %c",
1151 note->type,
1152 (int) _cpp_trigraph_map[note->type]);
1153 else
1155 cpp_warning_with_line
1156 (pfile, CPP_W_TRIGRAPHS,
1157 pfile->line_table->highest_line, col,
1158 "trigraph ??%c ignored, use -trigraphs to enable",
1159 note->type);
1163 else if (note->type == 0)
1164 /* Already processed in lex_raw_string. */;
1165 else
1166 abort ();
1170 namespace bidi {
1171 enum class kind {
1172 NONE, LRE, RLE, LRO, RLO, LRI, RLI, FSI, PDF, PDI, LTR, RTL
1175 /* All the UTF-8 encodings of bidi characters start with E2. */
1176 constexpr uchar utf8_start = 0xe2;
1178 struct context
1180 context () {}
1181 context (location_t loc, kind k, bool pdf, bool ucn)
1182 : m_loc (loc), m_kind (k), m_pdf (pdf), m_ucn (ucn)
1186 kind get_pop_kind () const
1188 return m_pdf ? kind::PDF : kind::PDI;
1190 bool ucn_p () const
1192 return m_ucn;
1195 location_t m_loc;
1196 kind m_kind;
1197 unsigned m_pdf : 1;
1198 unsigned m_ucn : 1;
1201 /* A vector holding currently open bidi contexts. We use a char for
1202 each context, its LSB is 1 if it represents a PDF context, 0 if it
1203 represents a PDI context. The next bit is 1 if this context was open
1204 by a bidi character written as a UCN, and 0 when it was UTF-8. */
1205 semi_embedded_vec <context, 16> vec;
1207 /* Close the whole comment/identifier/string literal/character constant
1208 context. */
1209 void on_close ()
1211 vec.truncate (0);
1214 /* Pop the last element in the vector. */
1215 void pop ()
1217 unsigned int len = vec.count ();
1218 gcc_checking_assert (len > 0);
1219 vec.truncate (len - 1);
1222 /* Return the pop kind of the context of the Ith element. */
1223 kind pop_kind_at (unsigned int i)
1225 return vec[i].get_pop_kind ();
1228 /* Return the pop kind of the context that is currently opened. */
1229 kind current_ctx ()
1231 unsigned int len = vec.count ();
1232 if (len == 0)
1233 return kind::NONE;
1234 return vec[len - 1].get_pop_kind ();
1237 /* Return true if the current context comes from a UCN origin, that is,
1238 the bidi char which started this bidi context was written as a UCN. */
1239 bool current_ctx_ucn_p ()
1241 unsigned int len = vec.count ();
1242 gcc_checking_assert (len > 0);
1243 return vec[len - 1].m_ucn;
1246 location_t current_ctx_loc ()
1248 unsigned int len = vec.count ();
1249 gcc_checking_assert (len > 0);
1250 return vec[len - 1].m_loc;
1253 /* We've read a bidi char, update the current vector as necessary.
1254 LOC is only valid when K is not kind::NONE. */
1255 void on_char (kind k, bool ucn_p, location_t loc)
1257 switch (k)
1259 case kind::LRE:
1260 case kind::RLE:
1261 case kind::LRO:
1262 case kind::RLO:
1263 vec.push (context (loc, k, true, ucn_p));
1264 break;
1265 case kind::LRI:
1266 case kind::RLI:
1267 case kind::FSI:
1268 vec.push (context (loc, k, false, ucn_p));
1269 break;
1270 /* PDF terminates the scope of the last LRE, RLE, LRO, or RLO
1271 whose scope has not yet been terminated. */
1272 case kind::PDF:
1273 if (current_ctx () == kind::PDF)
1274 pop ();
1275 break;
1276 /* PDI terminates the scope of the last LRI, RLI, or FSI whose
1277 scope has not yet been terminated, as well as the scopes of
1278 any subsequent LREs, RLEs, LROs, or RLOs whose scopes have not
1279 yet been terminated. */
1280 case kind::PDI:
1281 for (int i = vec.count () - 1; i >= 0; --i)
1282 if (pop_kind_at (i) == kind::PDI)
1284 vec.truncate (i);
1285 break;
1287 break;
1288 case kind::LTR:
1289 case kind::RTL:
1290 /* These aren't popped by a PDF/PDI. */
1291 break;
1292 ATTR_LIKELY case kind::NONE:
1293 break;
1294 default:
1295 abort ();
1299 /* Return a descriptive string for K. */
1300 const char *to_str (kind k)
1302 switch (k)
1304 case kind::LRE:
1305 return "U+202A (LEFT-TO-RIGHT EMBEDDING)";
1306 case kind::RLE:
1307 return "U+202B (RIGHT-TO-LEFT EMBEDDING)";
1308 case kind::LRO:
1309 return "U+202D (LEFT-TO-RIGHT OVERRIDE)";
1310 case kind::RLO:
1311 return "U+202E (RIGHT-TO-LEFT OVERRIDE)";
1312 case kind::LRI:
1313 return "U+2066 (LEFT-TO-RIGHT ISOLATE)";
1314 case kind::RLI:
1315 return "U+2067 (RIGHT-TO-LEFT ISOLATE)";
1316 case kind::FSI:
1317 return "U+2068 (FIRST STRONG ISOLATE)";
1318 case kind::PDF:
1319 return "U+202C (POP DIRECTIONAL FORMATTING)";
1320 case kind::PDI:
1321 return "U+2069 (POP DIRECTIONAL ISOLATE)";
1322 case kind::LTR:
1323 return "U+200E (LEFT-TO-RIGHT MARK)";
1324 case kind::RTL:
1325 return "U+200F (RIGHT-TO-LEFT MARK)";
1326 default:
1327 abort ();
1332 /* Get location_t for the range of bytes [START, START + NUM_BYTES)
1333 within the current line in FILE, with the caret at START. */
1335 static location_t
1336 get_location_for_byte_range_in_cur_line (cpp_reader *pfile,
1337 const unsigned char *const start,
1338 size_t num_bytes)
1340 gcc_checking_assert (num_bytes > 0);
1342 /* CPP_BUF_COLUMN and linemap_position_for_column both refer
1343 to offsets in bytes, but CPP_BUF_COLUMN is 0-based,
1344 whereas linemap_position_for_column is 1-based. */
1346 /* Get 0-based offsets within the line. */
1347 size_t start_offset = CPP_BUF_COLUMN (pfile->buffer, start);
1348 size_t end_offset = start_offset + num_bytes - 1;
1350 /* Now convert to location_t, where "columns" are 1-based byte offsets. */
1351 location_t start_loc = linemap_position_for_column (pfile->line_table,
1352 start_offset + 1);
1353 location_t end_loc = linemap_position_for_column (pfile->line_table,
1354 end_offset + 1);
1356 if (start_loc == end_loc)
1357 return start_loc;
1359 source_range src_range;
1360 src_range.m_start = start_loc;
1361 src_range.m_finish = end_loc;
1362 location_t combined_loc = COMBINE_LOCATION_DATA (pfile->line_table,
1363 start_loc,
1364 src_range,
1365 NULL);
1366 return combined_loc;
1369 /* Parse a sequence of 3 bytes starting with P and return its bidi code. */
1371 static bidi::kind
1372 get_bidi_utf8_1 (const unsigned char *const p)
1374 gcc_checking_assert (p[0] == bidi::utf8_start);
1376 if (p[1] == 0x80)
1377 switch (p[2])
1379 case 0xaa:
1380 return bidi::kind::LRE;
1381 case 0xab:
1382 return bidi::kind::RLE;
1383 case 0xac:
1384 return bidi::kind::PDF;
1385 case 0xad:
1386 return bidi::kind::LRO;
1387 case 0xae:
1388 return bidi::kind::RLO;
1389 case 0x8e:
1390 return bidi::kind::LTR;
1391 case 0x8f:
1392 return bidi::kind::RTL;
1393 default:
1394 break;
1396 else if (p[1] == 0x81)
1397 switch (p[2])
1399 case 0xa6:
1400 return bidi::kind::LRI;
1401 case 0xa7:
1402 return bidi::kind::RLI;
1403 case 0xa8:
1404 return bidi::kind::FSI;
1405 case 0xa9:
1406 return bidi::kind::PDI;
1407 default:
1408 break;
1411 return bidi::kind::NONE;
1414 /* Parse a sequence of 3 bytes starting with P and return its bidi code.
1415 If the kind is not NONE, write the location to *OUT.*/
1417 static bidi::kind
1418 get_bidi_utf8 (cpp_reader *pfile, const unsigned char *const p, location_t *out)
1420 bidi::kind result = get_bidi_utf8_1 (p);
1421 if (result != bidi::kind::NONE)
1423 /* We have a sequence of 3 bytes starting at P. */
1424 *out = get_location_for_byte_range_in_cur_line (pfile, p, 3);
1426 return result;
1429 /* Parse a UCN where P points just past \u or \U and return its bidi code. */
1431 static bidi::kind
1432 get_bidi_ucn_1 (const unsigned char *p, bool is_U, const unsigned char **end)
1434 /* 6.4.3 Universal Character Names
1435 \u hex-quad
1436 \U hex-quad hex-quad
1437 \u { simple-hexadecimal-digit-sequence }
1438 where \unnnn means \U0000nnnn. */
1440 *end = p + 4;
1441 if (is_U)
1443 if (p[0] != '0' || p[1] != '0' || p[2] != '0' || p[3] != '0')
1444 return bidi::kind::NONE;
1445 /* Skip 4B so we can treat \u and \U the same below. */
1446 p += 4;
1447 *end += 4;
1449 else if (p[0] == '{')
1451 p++;
1452 while (*p == '0')
1453 p++;
1454 if (p[0] != '2'
1455 || p[1] != '0'
1456 || !ISXDIGIT (p[2])
1457 || !ISXDIGIT (p[3])
1458 || p[4] != '}')
1459 return bidi::kind::NONE;
1460 *end = p + 5;
1463 /* All code points we are looking for start with 20xx. */
1464 if (p[0] != '2' || p[1] != '0')
1465 return bidi::kind::NONE;
1466 else if (p[2] == '2')
1467 switch (p[3])
1469 case 'a':
1470 case 'A':
1471 return bidi::kind::LRE;
1472 case 'b':
1473 case 'B':
1474 return bidi::kind::RLE;
1475 case 'c':
1476 case 'C':
1477 return bidi::kind::PDF;
1478 case 'd':
1479 case 'D':
1480 return bidi::kind::LRO;
1481 case 'e':
1482 case 'E':
1483 return bidi::kind::RLO;
1484 default:
1485 break;
1487 else if (p[2] == '6')
1488 switch (p[3])
1490 case '6':
1491 return bidi::kind::LRI;
1492 case '7':
1493 return bidi::kind::RLI;
1494 case '8':
1495 return bidi::kind::FSI;
1496 case '9':
1497 return bidi::kind::PDI;
1498 default:
1499 break;
1501 else if (p[2] == '0')
1502 switch (p[3])
1504 case 'e':
1505 case 'E':
1506 return bidi::kind::LTR;
1507 case 'f':
1508 case 'F':
1509 return bidi::kind::RTL;
1510 default:
1511 break;
1514 return bidi::kind::NONE;
1517 /* Parse a UCN where P points just past \u or \U and return its bidi code.
1518 If the kind is not NONE, write the location to *OUT. */
1520 static bidi::kind
1521 get_bidi_ucn (cpp_reader *pfile, const unsigned char *p, bool is_U,
1522 location_t *out)
1524 const unsigned char *end;
1525 bidi::kind result = get_bidi_ucn_1 (p, is_U, &end);
1526 if (result != bidi::kind::NONE)
1528 const unsigned char *start = p - 2;
1529 size_t num_bytes = end - start;
1530 *out = get_location_for_byte_range_in_cur_line (pfile, start, num_bytes);
1532 return result;
1535 /* Parse a named universal character escape where P points just past \N and
1536 return its bidi code. If the kind is not NONE, write the location to
1537 *OUT. */
1539 static bidi::kind
1540 get_bidi_named (cpp_reader *pfile, const unsigned char *p, location_t *out)
1542 bidi::kind result = bidi::kind::NONE;
1543 if (*p != '{')
1544 return bidi::kind::NONE;
1545 if (strncmp ((const char *) (p + 1), "LEFT-TO-RIGHT ", 14) == 0)
1547 if (strncmp ((const char *) (p + 15), "MARK}", 5) == 0)
1548 result = bidi::kind::LTR;
1549 else if (strncmp ((const char *) (p + 15), "EMBEDDING}", 10) == 0)
1550 result = bidi::kind::LRE;
1551 else if (strncmp ((const char *) (p + 15), "OVERRIDE}", 9) == 0)
1552 result = bidi::kind::LRO;
1553 else if (strncmp ((const char *) (p + 15), "ISOLATE}", 8) == 0)
1554 result = bidi::kind::LRI;
1556 else if (strncmp ((const char *) (p + 1), "RIGHT-TO-LEFT ", 14) == 0)
1558 if (strncmp ((const char *) (p + 15), "MARK}", 5) == 0)
1559 result = bidi::kind::RTL;
1560 else if (strncmp ((const char *) (p + 15), "EMBEDDING}", 10) == 0)
1561 result = bidi::kind::RLE;
1562 else if (strncmp ((const char *) (p + 15), "OVERRIDE}", 9) == 0)
1563 result = bidi::kind::RLO;
1564 else if (strncmp ((const char *) (p + 15), "ISOLATE}", 8) == 0)
1565 result = bidi::kind::RLI;
1567 else if (strncmp ((const char *) (p + 1), "POP DIRECTIONAL ", 16) == 0)
1569 if (strncmp ((const char *) (p + 16), "FORMATTING}", 11) == 0)
1570 result = bidi::kind::PDF;
1571 else if (strncmp ((const char *) (p + 16), "ISOLATE}", 8) == 0)
1572 result = bidi::kind::PDI;
1574 else if (strncmp ((const char *) (p + 1), "FIRST STRONG ISOLATE}", 21) == 0)
1575 result = bidi::kind::FSI;
1576 if (result != bidi::kind::NONE)
1577 *out = get_location_for_byte_range_in_cur_line (pfile, p - 2,
1578 (strchr ((const char *)
1579 (p + 1), '}')
1580 - (const char *) p)
1581 + 3);
1582 return result;
1585 /* Subclass of rich_location for reporting on unpaired UTF-8
1586 bidirectional control character(s).
1587 Escape the source lines on output, and show all unclosed
1588 bidi context, labelling everything. */
1590 class unpaired_bidi_rich_location : public rich_location
1592 public:
1593 class custom_range_label : public range_label
1595 public:
1596 label_text get_text (unsigned range_idx) const final override
1598 /* range 0 is the primary location; each subsequent range i + 1
1599 is for bidi::vec[i]. */
1600 if (range_idx > 0)
1602 const bidi::context &ctxt (bidi::vec[range_idx - 1]);
1603 return label_text::borrow (bidi::to_str (ctxt.m_kind));
1605 else
1606 return label_text::borrow (_("end of bidirectional context"));
1610 unpaired_bidi_rich_location (cpp_reader *pfile, location_t loc)
1611 : rich_location (pfile->line_table, loc, &m_custom_label)
1613 set_escape_on_output (true);
1614 for (unsigned i = 0; i < bidi::vec.count (); i++)
1615 add_range (bidi::vec[i].m_loc,
1616 SHOW_RANGE_WITHOUT_CARET,
1617 &m_custom_label);
1620 private:
1621 custom_range_label m_custom_label;
1624 /* We're closing a bidi context, that is, we've encountered a newline,
1625 are closing a C-style comment, or are at the end of a string literal,
1626 character constant, or identifier. Warn if this context was not
1627 properly terminated by a PDI or PDF. P points to the last character
1628 in this context. */
1630 static void
1631 maybe_warn_bidi_on_close (cpp_reader *pfile, const uchar *p)
1633 const auto warn_bidi = CPP_OPTION (pfile, cpp_warn_bidirectional);
1634 if (bidi::vec.count () > 0
1635 && (warn_bidi & bidirectional_unpaired
1636 && (!bidi::current_ctx_ucn_p ()
1637 || (warn_bidi & bidirectional_ucn))))
1639 const location_t loc
1640 = linemap_position_for_column (pfile->line_table,
1641 CPP_BUF_COLUMN (pfile->buffer, p));
1642 unpaired_bidi_rich_location rich_loc (pfile, loc);
1643 /* cpp_callbacks doesn't yet have a way to handle singular vs plural
1644 forms of a diagnostic, so fake it for now. */
1645 if (bidi::vec.count () > 1)
1646 cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
1647 "unpaired UTF-8 bidirectional control characters "
1648 "detected");
1649 else
1650 cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
1651 "unpaired UTF-8 bidirectional control character "
1652 "detected");
1654 /* We're done with this context. */
1655 bidi::on_close ();
1658 /* We're at the beginning or in the middle of an identifier/comment/string
1659 literal/character constant. Warn if we've encountered a bidi character.
1660 KIND says which bidi control character it was; UCN_P is true iff this bidi
1661 control character was written as a UCN. LOC is the location of the
1662 character, but is only valid if KIND != bidi::kind::NONE. */
1664 static void
1665 maybe_warn_bidi_on_char (cpp_reader *pfile, bidi::kind kind,
1666 bool ucn_p, location_t loc)
1668 if (__builtin_expect (kind == bidi::kind::NONE, 1))
1669 return;
1671 const auto warn_bidi = CPP_OPTION (pfile, cpp_warn_bidirectional);
1673 if (warn_bidi & (bidirectional_unpaired|bidirectional_any))
1675 rich_location rich_loc (pfile->line_table, loc);
1676 rich_loc.set_escape_on_output (true);
1678 /* It seems excessive to warn about a PDI/PDF that is closing
1679 an opened context because we've already warned about the
1680 opening character. Except warn when we have a UCN x UTF-8
1681 mismatch, if UCN checking is enabled. */
1682 if (kind == bidi::current_ctx ())
1684 if (warn_bidi == (bidirectional_unpaired|bidirectional_ucn)
1685 && bidi::current_ctx_ucn_p () != ucn_p)
1687 rich_loc.add_range (bidi::current_ctx_loc ());
1688 cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
1689 "UTF-8 vs UCN mismatch when closing "
1690 "a context by \"%s\"", bidi::to_str (kind));
1693 else if (warn_bidi & bidirectional_any
1694 && (!ucn_p || (warn_bidi & bidirectional_ucn)))
1696 if (kind == bidi::kind::PDF || kind == bidi::kind::PDI)
1697 cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
1698 "\"%s\" is closing an unopened context",
1699 bidi::to_str (kind));
1700 else
1701 cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
1702 "found problematic Unicode character \"%s\"",
1703 bidi::to_str (kind));
1706 /* We're done with this context. */
1707 bidi::on_char (kind, ucn_p, loc);
1710 static const cppchar_t utf8_continuation = 0x80;
1711 static const cppchar_t utf8_signifier = 0xC0;
1713 /* Emit -Winvalid-utf8 warning on invalid UTF-8 character starting
1714 at PFILE->buffer->cur. Return a pointer after the diagnosed
1715 invalid character. */
1717 static const uchar *
1718 _cpp_warn_invalid_utf8 (cpp_reader *pfile)
1720 cpp_buffer *buffer = pfile->buffer;
1721 const uchar *cur = buffer->cur;
1722 bool pedantic = (CPP_PEDANTIC (pfile)
1723 && CPP_OPTION (pfile, cpp_warn_invalid_utf8) == 2);
1725 if (cur[0] < utf8_signifier
1726 || cur[1] < utf8_continuation || cur[1] >= utf8_signifier)
1728 if (pedantic)
1729 cpp_error_with_line (pfile, CPP_DL_PEDWARN,
1730 pfile->line_table->highest_line,
1731 CPP_BUF_COL (buffer),
1732 "invalid UTF-8 character <%x>",
1733 cur[0]);
1734 else
1735 cpp_warning_with_line (pfile, CPP_W_INVALID_UTF8,
1736 pfile->line_table->highest_line,
1737 CPP_BUF_COL (buffer),
1738 "invalid UTF-8 character <%x>",
1739 cur[0]);
1740 return cur + 1;
1742 else if (cur[2] < utf8_continuation || cur[2] >= utf8_signifier)
1744 if (pedantic)
1745 cpp_error_with_line (pfile, CPP_DL_PEDWARN,
1746 pfile->line_table->highest_line,
1747 CPP_BUF_COL (buffer),
1748 "invalid UTF-8 character <%x><%x>",
1749 cur[0], cur[1]);
1750 else
1751 cpp_warning_with_line (pfile, CPP_W_INVALID_UTF8,
1752 pfile->line_table->highest_line,
1753 CPP_BUF_COL (buffer),
1754 "invalid UTF-8 character <%x><%x>",
1755 cur[0], cur[1]);
1756 return cur + 2;
1758 else if (cur[3] < utf8_continuation || cur[3] >= utf8_signifier)
1760 if (pedantic)
1761 cpp_error_with_line (pfile, CPP_DL_PEDWARN,
1762 pfile->line_table->highest_line,
1763 CPP_BUF_COL (buffer),
1764 "invalid UTF-8 character <%x><%x><%x>",
1765 cur[0], cur[1], cur[2]);
1766 else
1767 cpp_warning_with_line (pfile, CPP_W_INVALID_UTF8,
1768 pfile->line_table->highest_line,
1769 CPP_BUF_COL (buffer),
1770 "invalid UTF-8 character <%x><%x><%x>",
1771 cur[0], cur[1], cur[2]);
1772 return cur + 3;
1774 else
1776 if (pedantic)
1777 cpp_error_with_line (pfile, CPP_DL_PEDWARN,
1778 pfile->line_table->highest_line,
1779 CPP_BUF_COL (buffer),
1780 "invalid UTF-8 character <%x><%x><%x><%x>",
1781 cur[0], cur[1], cur[2], cur[3]);
1782 else
1783 cpp_warning_with_line (pfile, CPP_W_INVALID_UTF8,
1784 pfile->line_table->highest_line,
1785 CPP_BUF_COL (buffer),
1786 "invalid UTF-8 character <%x><%x><%x><%x>",
1787 cur[0], cur[1], cur[2], cur[3]);
1788 return cur + 4;
1792 /* Helper function of *skip_*_comment and lex*_string. For C,
1793 character at CUR[-1] with MSB set handle -Wbidi-chars* and
1794 -Winvalid-utf8 diagnostics and return pointer to first character
1795 that should be processed next. */
1797 static inline const uchar *
1798 _cpp_handle_multibyte_utf8 (cpp_reader *pfile, uchar c,
1799 const uchar *cur, bool warn_bidi_p,
1800 bool warn_invalid_utf8_p)
1802 /* If this is a beginning of a UTF-8 encoding, it might be
1803 a bidirectional control character. */
1804 if (c == bidi::utf8_start && warn_bidi_p)
1806 location_t loc;
1807 bidi::kind kind = get_bidi_utf8 (pfile, cur - 1, &loc);
1808 maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
1810 if (!warn_invalid_utf8_p)
1811 return cur;
1812 if (c >= utf8_signifier)
1814 cppchar_t s;
1815 const uchar *pstr = cur - 1;
1816 if (_cpp_valid_utf8 (pfile, &pstr, pfile->buffer->rlimit, 0, NULL, &s)
1817 && s <= UCS_LIMIT)
1818 return pstr;
1820 pfile->buffer->cur = cur - 1;
1821 return _cpp_warn_invalid_utf8 (pfile);
1824 /* Skip a C-style block comment. We find the end of the comment by
1825 seeing if an asterisk is before every '/' we encounter. Returns
1826 nonzero if comment terminated by EOF, zero otherwise.
1828 Buffer->cur points to the initial asterisk of the comment. */
1829 bool
1830 _cpp_skip_block_comment (cpp_reader *pfile)
1832 cpp_buffer *buffer = pfile->buffer;
1833 const uchar *cur = buffer->cur;
1834 uchar c;
1835 const bool warn_bidi_p = pfile->warn_bidi_p ();
1836 const bool warn_invalid_utf8_p = CPP_OPTION (pfile, cpp_warn_invalid_utf8);
1837 const bool warn_bidi_or_invalid_utf8_p = warn_bidi_p | warn_invalid_utf8_p;
1839 cur++;
1840 if (*cur == '/')
1841 cur++;
1843 for (;;)
1845 /* People like decorating comments with '*', so check for '/'
1846 instead for efficiency. */
1847 c = *cur++;
1849 if (c == '/')
1851 if (cur[-2] == '*')
1853 if (warn_bidi_p)
1854 maybe_warn_bidi_on_close (pfile, cur);
1855 break;
1858 /* Warn about potential nested comments, but not if the '/'
1859 comes immediately before the true comment delimiter.
1860 Don't bother to get it right across escaped newlines. */
1861 if (CPP_OPTION (pfile, warn_comments)
1862 && cur[0] == '*' && cur[1] != '/')
1864 buffer->cur = cur;
1865 cpp_warning_with_line (pfile, CPP_W_COMMENTS,
1866 pfile->line_table->highest_line,
1867 CPP_BUF_COL (buffer),
1868 "\"/*\" within comment");
1871 else if (c == '\n')
1873 unsigned int cols;
1874 buffer->cur = cur - 1;
1875 if (warn_bidi_p)
1876 maybe_warn_bidi_on_close (pfile, cur);
1877 _cpp_process_line_notes (pfile, true);
1878 if (buffer->next_line >= buffer->rlimit)
1879 return true;
1880 _cpp_clean_line (pfile);
1882 cols = buffer->next_line - buffer->line_base;
1883 CPP_INCREMENT_LINE (pfile, cols);
1885 cur = buffer->cur;
1887 else if (__builtin_expect (c >= utf8_continuation, 0)
1888 && warn_bidi_or_invalid_utf8_p)
1889 cur = _cpp_handle_multibyte_utf8 (pfile, c, cur, warn_bidi_p,
1890 warn_invalid_utf8_p);
1893 buffer->cur = cur;
1894 _cpp_process_line_notes (pfile, true);
1895 return false;
1898 /* Skip a C++ line comment, leaving buffer->cur pointing to the
1899 terminating newline. Handles escaped newlines. Returns nonzero
1900 if a multiline comment. */
1901 static int
1902 skip_line_comment (cpp_reader *pfile)
1904 cpp_buffer *buffer = pfile->buffer;
1905 location_t orig_line = pfile->line_table->highest_line;
1906 const bool warn_bidi_p = pfile->warn_bidi_p ();
1907 const bool warn_invalid_utf8_p = CPP_OPTION (pfile, cpp_warn_invalid_utf8);
1908 const bool warn_bidi_or_invalid_utf8_p = warn_bidi_p | warn_invalid_utf8_p;
1910 if (!warn_bidi_or_invalid_utf8_p)
1911 while (*buffer->cur != '\n')
1912 buffer->cur++;
1913 else if (!warn_invalid_utf8_p)
1915 while (*buffer->cur != '\n'
1916 && *buffer->cur != bidi::utf8_start)
1917 buffer->cur++;
1918 if (__builtin_expect (*buffer->cur == bidi::utf8_start, 0))
1920 while (*buffer->cur != '\n')
1922 if (__builtin_expect (*buffer->cur == bidi::utf8_start, 0))
1924 location_t loc;
1925 bidi::kind kind = get_bidi_utf8 (pfile, buffer->cur, &loc);
1926 maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
1928 buffer->cur++;
1930 maybe_warn_bidi_on_close (pfile, buffer->cur);
1933 else
1935 while (*buffer->cur != '\n')
1937 if (*buffer->cur < utf8_continuation)
1939 buffer->cur++;
1940 continue;
1942 buffer->cur
1943 = _cpp_handle_multibyte_utf8 (pfile, *buffer->cur, buffer->cur + 1,
1944 warn_bidi_p, warn_invalid_utf8_p);
1946 if (warn_bidi_p)
1947 maybe_warn_bidi_on_close (pfile, buffer->cur);
1950 _cpp_process_line_notes (pfile, true);
1951 return orig_line != pfile->line_table->highest_line;
1954 /* Skips whitespace, saving the next non-whitespace character. */
1955 static void
1956 skip_whitespace (cpp_reader *pfile, cppchar_t c)
1958 cpp_buffer *buffer = pfile->buffer;
1959 bool saw_NUL = false;
1963 /* Horizontal space always OK. */
1964 if (c == ' ' || c == '\t')
1966 /* Just \f \v or \0 left. */
1967 else if (c == '\0')
1968 saw_NUL = true;
1969 else if (pfile->state.in_directive && CPP_PEDANTIC (pfile))
1970 cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line,
1971 CPP_BUF_COL (buffer),
1972 "%s in preprocessing directive",
1973 c == '\f' ? "form feed" : "vertical tab");
1975 c = *buffer->cur++;
1977 /* We only want non-vertical space, i.e. ' ' \t \f \v \0. */
1978 while (is_nvspace (c));
1980 if (saw_NUL)
1982 encoding_rich_location rich_loc (pfile);
1983 cpp_error_at (pfile, CPP_DL_WARNING, &rich_loc,
1984 "null character(s) ignored");
1987 buffer->cur--;
1990 /* See if the characters of a number token are valid in a name (no
1991 '.', '+' or '-'). */
1992 static int
1993 name_p (cpp_reader *pfile, const cpp_string *string)
1995 unsigned int i;
1997 for (i = 0; i < string->len; i++)
1998 if (!is_idchar (string->text[i]))
1999 return 0;
2001 return 1;
2004 /* After parsing an identifier or other sequence, produce a warning about
2005 sequences not in NFC/NFKC. */
2006 static void
2007 warn_about_normalization (cpp_reader *pfile,
2008 const cpp_token *token,
2009 const struct normalize_state *s)
2011 if (CPP_OPTION (pfile, warn_normalize) < NORMALIZE_STATE_RESULT (s)
2012 && !pfile->state.skipping)
2014 location_t loc = token->src_loc;
2016 /* If possible, create a location range for the token. */
2017 if (loc >= RESERVED_LOCATION_COUNT
2018 && token->type != CPP_EOF
2019 /* There must be no line notes to process. */
2020 && (!(pfile->buffer->cur
2021 >= pfile->buffer->notes[pfile->buffer->cur_note].pos
2022 && !pfile->overlaid_buffer)))
2024 source_range tok_range;
2025 tok_range.m_start = loc;
2026 tok_range.m_finish
2027 = linemap_position_for_column (pfile->line_table,
2028 CPP_BUF_COLUMN (pfile->buffer,
2029 pfile->buffer->cur));
2030 loc = COMBINE_LOCATION_DATA (pfile->line_table,
2031 loc, tok_range, NULL);
2034 encoding_rich_location rich_loc (pfile, loc);
2036 /* Make sure that the token is printed using UCNs, even
2037 if we'd otherwise happily print UTF-8. */
2038 unsigned char *buf = XNEWVEC (unsigned char, cpp_token_len (token));
2039 size_t sz;
2041 sz = cpp_spell_token (pfile, token, buf, false) - buf;
2042 if (NORMALIZE_STATE_RESULT (s) == normalized_C)
2043 cpp_warning_at (pfile, CPP_W_NORMALIZE, &rich_loc,
2044 "`%.*s' is not in NFKC", (int) sz, buf);
2045 else if (CPP_OPTION (pfile, cplusplus))
2046 cpp_pedwarning_at (pfile, CPP_W_NORMALIZE, &rich_loc,
2047 "`%.*s' is not in NFC", (int) sz, buf);
2048 else
2049 cpp_warning_at (pfile, CPP_W_NORMALIZE, &rich_loc,
2050 "`%.*s' is not in NFC", (int) sz, buf);
2051 free (buf);
2055 /* Returns TRUE if the sequence starting at buffer->cur is valid in
2056 an identifier. FIRST is TRUE if this starts an identifier. */
2058 static bool
2059 forms_identifier_p (cpp_reader *pfile, int first,
2060 struct normalize_state *state)
2062 cpp_buffer *buffer = pfile->buffer;
2063 const bool warn_bidi_p = pfile->warn_bidi_p ();
2065 if (*buffer->cur == '$')
2067 if (!CPP_OPTION (pfile, dollars_in_ident))
2068 return false;
2070 buffer->cur++;
2071 if (CPP_OPTION (pfile, warn_dollars) && !pfile->state.skipping)
2073 CPP_OPTION (pfile, warn_dollars) = 0;
2074 cpp_error (pfile, CPP_DL_PEDWARN, "'$' in identifier or number");
2077 return true;
2080 /* Is this a syntactically valid UCN or a valid UTF-8 char? */
2081 if (CPP_OPTION (pfile, extended_identifiers))
2083 cppchar_t s;
2084 if (*buffer->cur >= utf8_signifier)
2086 if (__builtin_expect (*buffer->cur == bidi::utf8_start, 0)
2087 && warn_bidi_p)
2089 location_t loc;
2090 bidi::kind kind = get_bidi_utf8 (pfile, buffer->cur, &loc);
2091 maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
2093 if (_cpp_valid_utf8 (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
2094 state, &s))
2095 return true;
2097 else if (*buffer->cur == '\\'
2098 && (buffer->cur[1] == 'u'
2099 || buffer->cur[1] == 'U'
2100 || buffer->cur[1] == 'N'))
2102 buffer->cur += 2;
2103 if (warn_bidi_p)
2105 location_t loc;
2106 bidi::kind kind;
2107 if (buffer->cur[-1] == 'N')
2108 kind = get_bidi_named (pfile, buffer->cur, &loc);
2109 else
2110 kind = get_bidi_ucn (pfile, buffer->cur,
2111 buffer->cur[-1] == 'U', &loc);
2112 maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/true, loc);
2114 if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
2115 state, &s, NULL, NULL))
2116 return true;
2117 buffer->cur -= 2;
2121 return false;
2124 /* Helper function to issue error about improper __VA_OPT__ use. */
2125 static void
2126 maybe_va_opt_error (cpp_reader *pfile)
2128 if (CPP_PEDANTIC (pfile) && !CPP_OPTION (pfile, va_opt))
2130 /* __VA_OPT__ should not be accepted at all, but allow it in
2131 system headers. */
2132 if (!_cpp_in_system_header (pfile))
2133 cpp_error (pfile, CPP_DL_PEDWARN,
2134 "__VA_OPT__ is not available until C++20");
2136 else if (!pfile->state.va_args_ok)
2138 /* __VA_OPT__ should only appear in the replacement list of a
2139 variadic macro. */
2140 cpp_error (pfile, CPP_DL_PEDWARN,
2141 "__VA_OPT__ can only appear in the expansion"
2142 " of a C++20 variadic macro");
2146 /* Helper function to get the cpp_hashnode of the identifier BASE. */
2147 static cpp_hashnode *
2148 lex_identifier_intern (cpp_reader *pfile, const uchar *base)
2150 cpp_hashnode *result;
2151 const uchar *cur;
2152 unsigned int len;
2153 unsigned int hash = HT_HASHSTEP (0, *base);
2155 cur = base + 1;
2156 while (ISIDNUM (*cur))
2158 hash = HT_HASHSTEP (hash, *cur);
2159 cur++;
2161 len = cur - base;
2162 hash = HT_HASHFINISH (hash, len);
2163 result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
2164 base, len, hash, HT_ALLOC));
2166 /* Rarely, identifiers require diagnostics when lexed. */
2167 if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
2168 && !pfile->state.skipping, 0))
2170 /* It is allowed to poison the same identifier twice. */
2171 if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
2172 cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
2173 NODE_NAME (result));
2175 /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
2176 replacement list of a variadic macro. */
2177 if (result == pfile->spec_nodes.n__VA_ARGS__
2178 && !pfile->state.va_args_ok)
2180 if (CPP_OPTION (pfile, cplusplus))
2181 cpp_error (pfile, CPP_DL_PEDWARN,
2182 "__VA_ARGS__ can only appear in the expansion"
2183 " of a C++11 variadic macro");
2184 else
2185 cpp_error (pfile, CPP_DL_PEDWARN,
2186 "__VA_ARGS__ can only appear in the expansion"
2187 " of a C99 variadic macro");
2190 if (result == pfile->spec_nodes.n__VA_OPT__)
2191 maybe_va_opt_error (pfile);
2193 /* For -Wc++-compat, warn about use of C++ named operators. */
2194 if (result->flags & NODE_WARN_OPERATOR)
2195 cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
2196 "identifier \"%s\" is a special operator name in C++",
2197 NODE_NAME (result));
2200 return result;
2203 /* Get the cpp_hashnode of an identifier specified by NAME in
2204 the current cpp_reader object. If none is found, NULL is returned. */
2205 cpp_hashnode *
2206 _cpp_lex_identifier (cpp_reader *pfile, const char *name)
2208 cpp_hashnode *result;
2209 result = lex_identifier_intern (pfile, (uchar *) name);
2210 return result;
2213 /* Lex an identifier starting at BUFFER->CUR - 1. */
2214 static cpp_hashnode *
2215 lex_identifier (cpp_reader *pfile, const uchar *base, bool starts_ucn,
2216 struct normalize_state *nst, cpp_hashnode **spelling)
2218 cpp_hashnode *result;
2219 const uchar *cur;
2220 unsigned int len;
2221 unsigned int hash = HT_HASHSTEP (0, *base);
2222 const bool warn_bidi_p = pfile->warn_bidi_p ();
2224 cur = pfile->buffer->cur;
2225 if (! starts_ucn)
2227 while (ISIDNUM (*cur))
2229 hash = HT_HASHSTEP (hash, *cur);
2230 cur++;
2232 NORMALIZE_STATE_UPDATE_IDNUM (nst, *(cur - 1));
2234 pfile->buffer->cur = cur;
2235 if (starts_ucn || forms_identifier_p (pfile, false, nst))
2237 /* Slower version for identifiers containing UCNs
2238 or extended chars (including $). */
2239 do {
2240 while (ISIDNUM (*pfile->buffer->cur))
2242 NORMALIZE_STATE_UPDATE_IDNUM (nst, *pfile->buffer->cur);
2243 pfile->buffer->cur++;
2245 } while (forms_identifier_p (pfile, false, nst));
2246 if (warn_bidi_p)
2247 maybe_warn_bidi_on_close (pfile, pfile->buffer->cur);
2248 result = _cpp_interpret_identifier (pfile, base,
2249 pfile->buffer->cur - base);
2250 *spelling = cpp_lookup (pfile, base, pfile->buffer->cur - base);
2252 else
2254 len = cur - base;
2255 hash = HT_HASHFINISH (hash, len);
2257 result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
2258 base, len, hash, HT_ALLOC));
2259 *spelling = result;
2262 /* Rarely, identifiers require diagnostics when lexed. */
2263 if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
2264 && !pfile->state.skipping, 0))
2266 /* It is allowed to poison the same identifier twice. */
2267 if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
2268 cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
2269 NODE_NAME (result));
2271 /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
2272 replacement list of a variadic macro. */
2273 if (result == pfile->spec_nodes.n__VA_ARGS__
2274 && !pfile->state.va_args_ok)
2276 if (CPP_OPTION (pfile, cplusplus))
2277 cpp_error (pfile, CPP_DL_PEDWARN,
2278 "__VA_ARGS__ can only appear in the expansion"
2279 " of a C++11 variadic macro");
2280 else
2281 cpp_error (pfile, CPP_DL_PEDWARN,
2282 "__VA_ARGS__ can only appear in the expansion"
2283 " of a C99 variadic macro");
2286 /* __VA_OPT__ should only appear in the replacement list of a
2287 variadic macro. */
2288 if (result == pfile->spec_nodes.n__VA_OPT__)
2289 maybe_va_opt_error (pfile);
2291 /* For -Wc++-compat, warn about use of C++ named operators. */
2292 if (result->flags & NODE_WARN_OPERATOR)
2293 cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
2294 "identifier \"%s\" is a special operator name in C++",
2295 NODE_NAME (result));
2298 return result;
2301 /* Lex a number to NUMBER starting at BUFFER->CUR - 1. */
2302 static void
2303 lex_number (cpp_reader *pfile, cpp_string *number,
2304 struct normalize_state *nst)
2306 const uchar *cur;
2307 const uchar *base;
2308 uchar *dest;
2310 base = pfile->buffer->cur - 1;
2313 const uchar *adj_digit_sep = NULL;
2314 cur = pfile->buffer->cur;
2316 /* N.B. ISIDNUM does not include $. */
2317 while (ISIDNUM (*cur)
2318 || (*cur == '.' && !DIGIT_SEP (cur[-1]))
2319 || DIGIT_SEP (*cur)
2320 || (VALID_SIGN (*cur, cur[-1]) && !DIGIT_SEP (cur[-2])))
2322 NORMALIZE_STATE_UPDATE_IDNUM (nst, *cur);
2323 /* Adjacent digit separators do not form part of the pp-number syntax.
2324 However, they can safely be diagnosed here as an error, since '' is
2325 not a valid preprocessing token. */
2326 if (DIGIT_SEP (*cur) && DIGIT_SEP (cur[-1]) && !adj_digit_sep)
2327 adj_digit_sep = cur;
2328 cur++;
2330 /* A number can't end with a digit separator. */
2331 while (cur > pfile->buffer->cur && DIGIT_SEP (cur[-1]))
2332 --cur;
2333 if (adj_digit_sep && adj_digit_sep < cur)
2334 cpp_error (pfile, CPP_DL_ERROR, "adjacent digit separators");
2336 pfile->buffer->cur = cur;
2338 while (forms_identifier_p (pfile, false, nst));
2340 number->len = cur - base;
2341 dest = _cpp_unaligned_alloc (pfile, number->len + 1);
2342 memcpy (dest, base, number->len);
2343 dest[number->len] = '\0';
2344 number->text = dest;
2347 /* Create a token of type TYPE with a literal spelling. */
2348 static void
2349 create_literal (cpp_reader *pfile, cpp_token *token, const uchar *base,
2350 unsigned int len, enum cpp_ttype type)
2352 token->type = type;
2353 token->val.str.len = len;
2354 token->val.str.text = cpp_alloc_token_string (pfile, base, len);
2357 const uchar *
2358 cpp_alloc_token_string (cpp_reader *pfile,
2359 const unsigned char *ptr, unsigned len)
2361 uchar *dest = _cpp_unaligned_alloc (pfile, len + 1);
2363 dest[len] = 0;
2364 memcpy (dest, ptr, len);
2365 return dest;
2368 /* A pair of raw buffer pointers. The currently open one is [1], the
2369 first one is [0]. Used for string literal lexing. */
2370 struct lit_accum {
2371 _cpp_buff *first;
2372 _cpp_buff *last;
2373 const uchar *rpos;
2374 size_t accum;
2376 lit_accum ()
2377 : first (NULL), last (NULL), rpos (0), accum (0)
2381 void append (cpp_reader *, const uchar *, size_t);
2383 void read_begin (cpp_reader *);
2384 bool reading_p () const
2386 return rpos != NULL;
2388 char read_char ()
2390 char c = *rpos++;
2391 if (rpos == BUFF_FRONT (last))
2392 rpos = NULL;
2393 return c;
2397 /* Subroutine of lex_raw_string: Append LEN chars from BASE to the buffer
2398 sequence from *FIRST_BUFF_P to LAST_BUFF_P. */
2400 void
2401 lit_accum::append (cpp_reader *pfile, const uchar *base, size_t len)
2403 if (!last)
2404 /* Starting. */
2405 first = last = _cpp_get_buff (pfile, len);
2406 else if (len > BUFF_ROOM (last))
2408 /* There is insufficient room in the buffer. Copy what we can,
2409 and then either extend or create a new one. */
2410 size_t room = BUFF_ROOM (last);
2411 memcpy (BUFF_FRONT (last), base, room);
2412 BUFF_FRONT (last) += room;
2413 base += room;
2414 len -= room;
2415 accum += room;
2417 gcc_checking_assert (!rpos);
2419 last = _cpp_append_extend_buff (pfile, last, len);
2422 memcpy (BUFF_FRONT (last), base, len);
2423 BUFF_FRONT (last) += len;
2424 accum += len;
2427 void
2428 lit_accum::read_begin (cpp_reader *pfile)
2430 /* We never accumulate more than 4 chars to read. */
2431 if (BUFF_ROOM (last) < 4)
2433 last = _cpp_append_extend_buff (pfile, last, 4);
2434 rpos = BUFF_FRONT (last);
2437 /* Returns true if a macro has been defined.
2438 This might not work if compile with -save-temps,
2439 or preprocess separately from compilation. */
2441 static bool
2442 is_macro(cpp_reader *pfile, const uchar *base)
2444 const uchar *cur = base;
2445 if (! ISIDST (*cur))
2446 return false;
2447 unsigned int hash = HT_HASHSTEP (0, *cur);
2448 ++cur;
2449 while (ISIDNUM (*cur))
2451 hash = HT_HASHSTEP (hash, *cur);
2452 ++cur;
2454 hash = HT_HASHFINISH (hash, cur - base);
2456 cpp_hashnode *result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
2457 base, cur - base, hash, HT_NO_INSERT));
2459 return result && cpp_macro_p (result);
2462 /* Returns true if a literal suffix does not have the expected form
2463 and is defined as a macro. */
2465 static bool
2466 is_macro_not_literal_suffix(cpp_reader *pfile, const uchar *base)
2468 /* User-defined literals outside of namespace std must start with a single
2469 underscore, so assume anything of that form really is a UDL suffix.
2470 We don't need to worry about UDLs defined inside namespace std because
2471 their names are reserved, so cannot be used as macro names in valid
2472 programs. */
2473 if (base[0] == '_' && base[1] != '_')
2474 return false;
2475 return is_macro (pfile, base);
2478 /* Lexes a raw string. The stored string contains the spelling,
2479 including double quotes, delimiter string, '(' and ')', any leading
2480 'L', 'u', 'U' or 'u8' and 'R' modifier. The created token contains
2481 the type of the literal, or CPP_OTHER if it was not properly
2482 terminated.
2484 BASE is the start of the token. Updates pfile->buffer->cur to just
2485 after the lexed string.
2487 The spelling is NUL-terminated, but it is not guaranteed that this
2488 is the first NUL since embedded NULs are preserved. */
2490 static void
2491 lex_raw_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
2493 const uchar *pos = base;
2494 const bool warn_bidi_p = pfile->warn_bidi_p ();
2495 const bool warn_invalid_utf8_p = CPP_OPTION (pfile, cpp_warn_invalid_utf8);
2496 const bool warn_bidi_or_invalid_utf8_p = warn_bidi_p | warn_invalid_utf8_p;
2498 /* 'tis a pity this information isn't passed down from the lexer's
2499 initial categorization of the token. */
2500 enum cpp_ttype type = CPP_STRING;
2502 if (*pos == 'L')
2504 type = CPP_WSTRING;
2505 pos++;
2507 else if (*pos == 'U')
2509 type = CPP_STRING32;
2510 pos++;
2512 else if (*pos == 'u')
2514 if (pos[1] == '8')
2516 type = CPP_UTF8STRING;
2517 pos++;
2519 else
2520 type = CPP_STRING16;
2521 pos++;
2524 gcc_checking_assert (pos[0] == 'R' && pos[1] == '"');
2525 pos += 2;
2527 _cpp_line_note *note = &pfile->buffer->notes[pfile->buffer->cur_note];
2529 /* Skip notes before the ". */
2530 while (note->pos < pos)
2531 ++note;
2533 lit_accum accum;
2535 uchar prefix[17];
2536 unsigned prefix_len = 0;
2537 enum Phase
2539 PHASE_PREFIX = -2,
2540 PHASE_NONE = -1,
2541 PHASE_SUFFIX = 0
2542 } phase = PHASE_PREFIX;
2544 for (;;)
2546 gcc_checking_assert (note->pos >= pos);
2548 /* Undo any escaped newlines and trigraphs. */
2549 if (!accum.reading_p () && note->pos == pos)
2550 switch (note->type)
2552 case '\\':
2553 case ' ':
2554 /* Restore backslash followed by newline. */
2555 accum.append (pfile, base, pos - base);
2556 base = pos;
2557 accum.read_begin (pfile);
2558 accum.append (pfile, UC"\\", 1);
2560 after_backslash:
2561 if (note->type == ' ')
2562 /* GNU backslash whitespace newline extension. FIXME
2563 could be any sequence of non-vertical space. When we
2564 can properly restore any such sequence, we should
2565 mark this note as handled so _cpp_process_line_notes
2566 doesn't warn. */
2567 accum.append (pfile, UC" ", 1);
2569 accum.append (pfile, UC"\n", 1);
2570 note++;
2571 break;
2573 case '\n':
2574 /* This can happen for ??/<NEWLINE> when trigraphs are not
2575 being interpretted. */
2576 gcc_checking_assert (!CPP_OPTION (pfile, trigraphs));
2577 note->type = 0;
2578 note++;
2579 break;
2581 default:
2582 gcc_checking_assert (_cpp_trigraph_map[note->type]);
2584 /* Don't warn about this trigraph in
2585 _cpp_process_line_notes, since trigraphs show up as
2586 trigraphs in raw strings. */
2587 uchar type = note->type;
2588 note->type = 0;
2590 if (CPP_OPTION (pfile, trigraphs))
2592 accum.append (pfile, base, pos - base);
2593 base = pos;
2594 accum.read_begin (pfile);
2595 accum.append (pfile, UC"??", 2);
2596 accum.append (pfile, &type, 1);
2598 /* ??/ followed by newline gets two line notes, one for
2599 the trigraph and one for the backslash/newline. */
2600 if (type == '/' && note[1].pos == pos)
2602 note++;
2603 gcc_assert (note->type == '\\' || note->type == ' ');
2604 goto after_backslash;
2606 /* Skip the replacement character. */
2607 base = ++pos;
2610 note++;
2611 break;
2614 /* Now get a char to process. Either from an expanded note, or
2615 from the line buffer. */
2616 bool read_note = accum.reading_p ();
2617 char c = read_note ? accum.read_char () : *pos++;
2619 if (phase == PHASE_PREFIX)
2621 if (c == '(')
2623 /* Done. */
2624 phase = PHASE_NONE;
2625 prefix[prefix_len++] = '"';
2627 else if (prefix_len < 16
2628 /* Prefix chars are any of the basic character set,
2629 [lex.charset] except for '
2630 ()\\\t\v\f\n'. Optimized for a contiguous
2631 alphabet. */
2632 /* Unlike a switch, this collapses down to one or
2633 two shift and bitmask operations on an ASCII
2634 system, with an outlier or two. */
2635 && (('Z' - 'A' == 25
2636 ? ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'))
2637 : ISIDST (c))
2638 || (c >= '0' && c <= '9')
2639 || c == '_' || c == '{' || c == '}'
2640 || c == '[' || c == ']' || c == '#'
2641 || c == '<' || c == '>' || c == '%'
2642 || c == ':' || c == ';' || c == '.' || c == '?'
2643 || c == '*' || c == '+' || c == '-' || c == '/'
2644 || c == '^' || c == '&' || c == '|' || c == '~'
2645 || c == '!' || c == '=' || c == ','
2646 || c == '"' || c == '\''))
2647 prefix[prefix_len++] = c;
2648 else
2650 /* Something is wrong. */
2651 int col = CPP_BUF_COLUMN (pfile->buffer, pos) + read_note;
2652 if (prefix_len == 16)
2653 cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
2654 col, "raw string delimiter longer "
2655 "than 16 characters");
2656 else if (c == '\n')
2657 cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
2658 col, "invalid new-line in raw "
2659 "string delimiter");
2660 else
2661 cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
2662 col, "invalid character '%c' in "
2663 "raw string delimiter", c);
2664 type = CPP_OTHER;
2665 phase = PHASE_NONE;
2666 /* Continue until we get a close quote, that's probably
2667 the best failure mode. */
2668 prefix_len = 0;
2670 if (c != '\n')
2671 continue;
2674 if (phase != PHASE_NONE)
2676 if (prefix[phase] != c)
2677 phase = PHASE_NONE;
2678 else if (unsigned (phase + 1) == prefix_len)
2679 break;
2680 else
2682 phase = Phase (phase + 1);
2683 continue;
2687 if (!prefix_len && c == '"')
2688 /* Failure mode lexing. */
2689 goto out;
2690 else if (prefix_len && c == ')')
2691 phase = PHASE_SUFFIX;
2692 else if (!read_note && c == '\n')
2694 pos--;
2695 pfile->buffer->cur = pos;
2696 if (pfile->state.in_directive
2697 || (pfile->state.parsing_args
2698 && pfile->buffer->next_line >= pfile->buffer->rlimit))
2700 cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, 0,
2701 "unterminated raw string");
2702 type = CPP_OTHER;
2703 goto out;
2706 accum.append (pfile, base, pos - base + 1);
2707 _cpp_process_line_notes (pfile, false);
2709 if (pfile->buffer->next_line < pfile->buffer->rlimit)
2710 CPP_INCREMENT_LINE (pfile, 0);
2711 pfile->buffer->need_line = true;
2713 if (!_cpp_get_fresh_line (pfile))
2715 /* We ran out of file and failed to get a line. */
2716 location_t src_loc = token->src_loc;
2717 token->type = CPP_EOF;
2718 /* Tell the compiler the line number of the EOF token. */
2719 token->src_loc = pfile->line_table->highest_line;
2720 token->flags = BOL;
2721 if (accum.first)
2722 _cpp_release_buff (pfile, accum.first);
2723 cpp_error_with_line (pfile, CPP_DL_ERROR, src_loc, 0,
2724 "unterminated raw string");
2725 /* Now pop the buffer that _cpp_get_fresh_line did not. */
2726 _cpp_pop_buffer (pfile);
2727 return;
2730 pos = base = pfile->buffer->cur;
2731 note = &pfile->buffer->notes[pfile->buffer->cur_note];
2733 else if (__builtin_expect ((unsigned char) c >= utf8_continuation, 0)
2734 && warn_bidi_or_invalid_utf8_p)
2735 pos = _cpp_handle_multibyte_utf8 (pfile, c, pos, warn_bidi_p,
2736 warn_invalid_utf8_p);
2739 if (warn_bidi_p)
2740 maybe_warn_bidi_on_close (pfile, pos);
2742 if (CPP_OPTION (pfile, user_literals))
2744 /* If a string format macro, say from inttypes.h, is placed touching
2745 a string literal it could be parsed as a C++11 user-defined string
2746 literal thus breaking the program. */
2747 if (is_macro_not_literal_suffix (pfile, pos))
2749 /* Raise a warning, but do not consume subsequent tokens. */
2750 if (CPP_OPTION (pfile, warn_literal_suffix) && !pfile->state.skipping)
2751 cpp_warning_with_line (pfile, CPP_W_LITERAL_SUFFIX,
2752 token->src_loc, 0,
2753 "invalid suffix on literal; C++11 requires "
2754 "a space between literal and string macro");
2756 /* Grab user defined literal suffix. */
2757 else if (ISIDST (*pos))
2759 type = cpp_userdef_string_add_type (type);
2760 ++pos;
2762 while (ISIDNUM (*pos))
2763 ++pos;
2767 out:
2768 pfile->buffer->cur = pos;
2769 if (!accum.accum)
2770 create_literal (pfile, token, base, pos - base, type);
2771 else
2773 size_t extra_len = pos - base;
2774 uchar *dest = _cpp_unaligned_alloc (pfile, accum.accum + extra_len + 1);
2776 token->type = type;
2777 token->val.str.len = accum.accum + extra_len;
2778 token->val.str.text = dest;
2779 for (_cpp_buff *buf = accum.first; buf; buf = buf->next)
2781 size_t len = BUFF_FRONT (buf) - buf->base;
2782 memcpy (dest, buf->base, len);
2783 dest += len;
2785 _cpp_release_buff (pfile, accum.first);
2786 memcpy (dest, base, extra_len);
2787 dest[extra_len] = '\0';
2791 /* Lexes a string, character constant, or angle-bracketed header file
2792 name. The stored string contains the spelling, including opening
2793 quote and any leading 'L', 'u', 'U' or 'u8' and optional
2794 'R' modifier. It returns the type of the literal, or CPP_OTHER
2795 if it was not properly terminated, or CPP_LESS for an unterminated
2796 header name which must be relexed as normal tokens.
2798 The spelling is NUL-terminated, but it is not guaranteed that this
2799 is the first NUL since embedded NULs are preserved. */
2800 static void
2801 lex_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
2803 bool saw_NUL = false;
2804 const uchar *cur;
2805 cppchar_t terminator;
2806 enum cpp_ttype type;
2808 cur = base;
2809 terminator = *cur++;
2810 if (terminator == 'L' || terminator == 'U')
2811 terminator = *cur++;
2812 else if (terminator == 'u')
2814 terminator = *cur++;
2815 if (terminator == '8')
2816 terminator = *cur++;
2818 if (terminator == 'R')
2820 lex_raw_string (pfile, token, base);
2821 return;
2823 if (terminator == '"')
2824 type = (*base == 'L' ? CPP_WSTRING :
2825 *base == 'U' ? CPP_STRING32 :
2826 *base == 'u' ? (base[1] == '8' ? CPP_UTF8STRING : CPP_STRING16)
2827 : CPP_STRING);
2828 else if (terminator == '\'')
2829 type = (*base == 'L' ? CPP_WCHAR :
2830 *base == 'U' ? CPP_CHAR32 :
2831 *base == 'u' ? (base[1] == '8' ? CPP_UTF8CHAR : CPP_CHAR16)
2832 : CPP_CHAR);
2833 else
2834 terminator = '>', type = CPP_HEADER_NAME;
2836 const bool warn_bidi_p = pfile->warn_bidi_p ();
2837 const bool warn_invalid_utf8_p = CPP_OPTION (pfile, cpp_warn_invalid_utf8);
2838 const bool warn_bidi_or_invalid_utf8_p = warn_bidi_p | warn_invalid_utf8_p;
2839 for (;;)
2841 cppchar_t c = *cur++;
2843 /* In #include-style directives, terminators are not escapable. */
2844 if (c == '\\' && !pfile->state.angled_headers && *cur != '\n')
2846 if ((cur[0] == 'u' || cur[0] == 'U' || cur[0] == 'N') && warn_bidi_p)
2848 location_t loc;
2849 bidi::kind kind;
2850 if (cur[0] == 'N')
2851 kind = get_bidi_named (pfile, cur + 1, &loc);
2852 else
2853 kind = get_bidi_ucn (pfile, cur + 1, cur[0] == 'U', &loc);
2854 maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/true, loc);
2856 cur++;
2858 else if (c == terminator)
2860 if (warn_bidi_p)
2861 maybe_warn_bidi_on_close (pfile, cur - 1);
2862 break;
2864 else if (c == '\n')
2866 cur--;
2867 /* Unmatched quotes always yield undefined behavior, but
2868 greedy lexing means that what appears to be an unterminated
2869 header name may actually be a legitimate sequence of tokens. */
2870 if (terminator == '>')
2872 token->type = CPP_LESS;
2873 return;
2875 type = CPP_OTHER;
2876 break;
2878 else if (c == '\0')
2879 saw_NUL = true;
2880 else if (__builtin_expect (c >= utf8_continuation, 0)
2881 && warn_bidi_or_invalid_utf8_p)
2882 cur = _cpp_handle_multibyte_utf8 (pfile, c, cur, warn_bidi_p,
2883 warn_invalid_utf8_p);
2886 if (saw_NUL && !pfile->state.skipping)
2887 cpp_error (pfile, CPP_DL_WARNING,
2888 "null character(s) preserved in literal");
2890 if (type == CPP_OTHER && CPP_OPTION (pfile, lang) != CLK_ASM)
2891 cpp_error (pfile, CPP_DL_PEDWARN, "missing terminating %c character",
2892 (int) terminator);
2894 if (CPP_OPTION (pfile, user_literals))
2896 /* If a string format macro, say from inttypes.h, is placed touching
2897 a string literal it could be parsed as a C++11 user-defined string
2898 literal thus breaking the program. */
2899 if (is_macro_not_literal_suffix (pfile, cur))
2901 /* Raise a warning, but do not consume subsequent tokens. */
2902 if (CPP_OPTION (pfile, warn_literal_suffix) && !pfile->state.skipping)
2903 cpp_warning_with_line (pfile, CPP_W_LITERAL_SUFFIX,
2904 token->src_loc, 0,
2905 "invalid suffix on literal; C++11 requires "
2906 "a space between literal and string macro");
2908 /* Grab user defined literal suffix. */
2909 else if (ISIDST (*cur))
2911 type = cpp_userdef_char_add_type (type);
2912 type = cpp_userdef_string_add_type (type);
2913 ++cur;
2915 while (ISIDNUM (*cur))
2916 ++cur;
2919 else if (CPP_OPTION (pfile, cpp_warn_cxx11_compat)
2920 && is_macro (pfile, cur)
2921 && !pfile->state.skipping)
2922 cpp_warning_with_line (pfile, CPP_W_CXX11_COMPAT,
2923 token->src_loc, 0, "C++11 requires a space "
2924 "between string literal and macro");
2926 pfile->buffer->cur = cur;
2927 create_literal (pfile, token, base, cur - base, type);
2930 /* Return the comment table. The client may not make any assumption
2931 about the ordering of the table. */
2932 cpp_comment_table *
2933 cpp_get_comments (cpp_reader *pfile)
2935 return &pfile->comments;
2938 /* Append a comment to the end of the comment table. */
2939 static void
2940 store_comment (cpp_reader *pfile, cpp_token *token)
2942 int len;
2944 if (pfile->comments.allocated == 0)
2946 pfile->comments.allocated = 256;
2947 pfile->comments.entries = (cpp_comment *) xmalloc
2948 (pfile->comments.allocated * sizeof (cpp_comment));
2951 if (pfile->comments.count == pfile->comments.allocated)
2953 pfile->comments.allocated *= 2;
2954 pfile->comments.entries = (cpp_comment *) xrealloc
2955 (pfile->comments.entries,
2956 pfile->comments.allocated * sizeof (cpp_comment));
2959 len = token->val.str.len;
2961 /* Copy comment. Note, token may not be NULL terminated. */
2962 pfile->comments.entries[pfile->comments.count].comment =
2963 (char *) xmalloc (sizeof (char) * (len + 1));
2964 memcpy (pfile->comments.entries[pfile->comments.count].comment,
2965 token->val.str.text, len);
2966 pfile->comments.entries[pfile->comments.count].comment[len] = '\0';
2968 /* Set source location. */
2969 pfile->comments.entries[pfile->comments.count].sloc = token->src_loc;
2971 /* Increment the count of entries in the comment table. */
2972 pfile->comments.count++;
2975 /* The stored comment includes the comment start and any terminator. */
2976 static void
2977 save_comment (cpp_reader *pfile, cpp_token *token, const unsigned char *from,
2978 cppchar_t type)
2980 unsigned char *buffer;
2981 unsigned int len, clen, i;
2983 len = pfile->buffer->cur - from + 1; /* + 1 for the initial '/'. */
2985 /* C++ comments probably (not definitely) have moved past a new
2986 line, which we don't want to save in the comment. */
2987 if (is_vspace (pfile->buffer->cur[-1]))
2988 len--;
2990 /* If we are currently in a directive or in argument parsing, then
2991 we need to store all C++ comments as C comments internally, and
2992 so we need to allocate a little extra space in that case.
2994 Note that the only time we encounter a directive here is
2995 when we are saving comments in a "#define". */
2996 clen = ((pfile->state.in_directive || pfile->state.parsing_args)
2997 && type == '/') ? len + 2 : len;
2999 buffer = _cpp_unaligned_alloc (pfile, clen);
3001 token->type = CPP_COMMENT;
3002 token->val.str.len = clen;
3003 token->val.str.text = buffer;
3005 buffer[0] = '/';
3006 memcpy (buffer + 1, from, len - 1);
3008 /* Finish conversion to a C comment, if necessary. */
3009 if ((pfile->state.in_directive || pfile->state.parsing_args) && type == '/')
3011 buffer[1] = '*';
3012 buffer[clen - 2] = '*';
3013 buffer[clen - 1] = '/';
3014 /* As there can be in a C++ comments illegal sequences for C comments
3015 we need to filter them out. */
3016 for (i = 2; i < (clen - 2); i++)
3017 if (buffer[i] == '/' && (buffer[i - 1] == '*' || buffer[i + 1] == '*'))
3018 buffer[i] = '|';
3021 /* Finally store this comment for use by clients of libcpp. */
3022 store_comment (pfile, token);
3025 /* Returns true if comment at COMMENT_START is a recognized FALLTHROUGH
3026 comment. */
3028 static bool
3029 fallthrough_comment_p (cpp_reader *pfile, const unsigned char *comment_start)
3031 const unsigned char *from = comment_start + 1;
3033 switch (CPP_OPTION (pfile, cpp_warn_implicit_fallthrough))
3035 /* For both -Wimplicit-fallthrough=0 and -Wimplicit-fallthrough=5 we
3036 don't recognize any comments. The latter only checks attributes,
3037 the former doesn't warn. */
3038 case 0:
3039 default:
3040 return false;
3041 /* -Wimplicit-fallthrough=1 considers any comment, no matter what
3042 content it has. */
3043 case 1:
3044 return true;
3045 case 2:
3046 /* -Wimplicit-fallthrough=2 looks for (case insensitive)
3047 .*falls?[ \t-]*thr(u|ough).* regex. */
3048 for (; (size_t) (pfile->buffer->cur - from) >= sizeof "fallthru" - 1;
3049 from++)
3051 /* Is there anything like strpbrk with upper boundary, or
3052 memchr looking for 2 characters rather than just one? */
3053 if (from[0] != 'f' && from[0] != 'F')
3054 continue;
3055 if (from[1] != 'a' && from[1] != 'A')
3056 continue;
3057 if (from[2] != 'l' && from[2] != 'L')
3058 continue;
3059 if (from[3] != 'l' && from[3] != 'L')
3060 continue;
3061 from += sizeof "fall" - 1;
3062 if (from[0] == 's' || from[0] == 'S')
3063 from++;
3064 while (*from == ' ' || *from == '\t' || *from == '-')
3065 from++;
3066 if (from[0] != 't' && from[0] != 'T')
3067 continue;
3068 if (from[1] != 'h' && from[1] != 'H')
3069 continue;
3070 if (from[2] != 'r' && from[2] != 'R')
3071 continue;
3072 if (from[3] == 'u' || from[3] == 'U')
3073 return true;
3074 if (from[3] != 'o' && from[3] != 'O')
3075 continue;
3076 if (from[4] != 'u' && from[4] != 'U')
3077 continue;
3078 if (from[5] != 'g' && from[5] != 'G')
3079 continue;
3080 if (from[6] != 'h' && from[6] != 'H')
3081 continue;
3082 return true;
3084 return false;
3085 case 3:
3086 case 4:
3087 break;
3090 /* Whole comment contents:
3091 -fallthrough
3092 @fallthrough@
3094 if (*from == '-' || *from == '@')
3096 size_t len = sizeof "fallthrough" - 1;
3097 if ((size_t) (pfile->buffer->cur - from - 1) < len)
3098 return false;
3099 if (memcmp (from + 1, "fallthrough", len))
3100 return false;
3101 if (*from == '@')
3103 if (from[len + 1] != '@')
3104 return false;
3105 len++;
3107 from += 1 + len;
3109 /* Whole comment contents (regex):
3110 lint -fallthrough[ \t]*
3112 else if (*from == 'l')
3114 size_t len = sizeof "int -fallthrough" - 1;
3115 if ((size_t) (pfile->buffer->cur - from - 1) < len)
3116 return false;
3117 if (memcmp (from + 1, "int -fallthrough", len))
3118 return false;
3119 from += 1 + len;
3120 while (*from == ' ' || *from == '\t')
3121 from++;
3123 /* Whole comment contents (regex):
3124 [ \t]*FALLTHR(U|OUGH)[ \t]*
3126 else if (CPP_OPTION (pfile, cpp_warn_implicit_fallthrough) == 4)
3128 while (*from == ' ' || *from == '\t')
3129 from++;
3130 if ((size_t) (pfile->buffer->cur - from) < sizeof "FALLTHRU" - 1)
3131 return false;
3132 if (memcmp (from, "FALLTHR", sizeof "FALLTHR" - 1))
3133 return false;
3134 from += sizeof "FALLTHR" - 1;
3135 if (*from == 'U')
3136 from++;
3137 else if ((size_t) (pfile->buffer->cur - from) < sizeof "OUGH" - 1)
3138 return false;
3139 else if (memcmp (from, "OUGH", sizeof "OUGH" - 1))
3140 return false;
3141 else
3142 from += sizeof "OUGH" - 1;
3143 while (*from == ' ' || *from == '\t')
3144 from++;
3146 /* Whole comment contents (regex):
3147 [ \t.!]*(ELSE,? |INTENTIONAL(LY)? )?FALL(S | |-)?THR(OUGH|U)[ \t.!]*(-[^\n\r]*)?
3148 [ \t.!]*(Else,? |Intentional(ly)? )?Fall((s | |-)[Tt]|t)hr(ough|u)[ \t.!]*(-[^\n\r]*)?
3149 [ \t.!]*([Ee]lse,? |[Ii]ntentional(ly)? )?fall(s | |-)?thr(ough|u)[ \t.!]*(-[^\n\r]*)?
3151 else
3153 while (*from == ' ' || *from == '\t' || *from == '.' || *from == '!')
3154 from++;
3155 unsigned char f = *from;
3156 bool all_upper = false;
3157 if (f == 'E' || f == 'e')
3159 if ((size_t) (pfile->buffer->cur - from)
3160 < sizeof "else fallthru" - 1)
3161 return false;
3162 if (f == 'E' && memcmp (from + 1, "LSE", sizeof "LSE" - 1) == 0)
3163 all_upper = true;
3164 else if (memcmp (from + 1, "lse", sizeof "lse" - 1))
3165 return false;
3166 from += sizeof "else" - 1;
3167 if (*from == ',')
3168 from++;
3169 if (*from != ' ')
3170 return false;
3171 from++;
3172 if (all_upper && *from == 'f')
3173 return false;
3174 if (f == 'e' && *from == 'F')
3175 return false;
3176 f = *from;
3178 else if (f == 'I' || f == 'i')
3180 if ((size_t) (pfile->buffer->cur - from)
3181 < sizeof "intentional fallthru" - 1)
3182 return false;
3183 if (f == 'I' && memcmp (from + 1, "NTENTIONAL",
3184 sizeof "NTENTIONAL" - 1) == 0)
3185 all_upper = true;
3186 else if (memcmp (from + 1, "ntentional",
3187 sizeof "ntentional" - 1))
3188 return false;
3189 from += sizeof "intentional" - 1;
3190 if (*from == ' ')
3192 from++;
3193 if (all_upper && *from == 'f')
3194 return false;
3196 else if (all_upper)
3198 if (memcmp (from, "LY F", sizeof "LY F" - 1))
3199 return false;
3200 from += sizeof "LY " - 1;
3202 else
3204 if (memcmp (from, "ly ", sizeof "ly " - 1))
3205 return false;
3206 from += sizeof "ly " - 1;
3208 if (f == 'i' && *from == 'F')
3209 return false;
3210 f = *from;
3212 if (f != 'F' && f != 'f')
3213 return false;
3214 if ((size_t) (pfile->buffer->cur - from) < sizeof "fallthru" - 1)
3215 return false;
3216 if (f == 'F' && memcmp (from + 1, "ALL", sizeof "ALL" - 1) == 0)
3217 all_upper = true;
3218 else if (all_upper)
3219 return false;
3220 else if (memcmp (from + 1, "all", sizeof "all" - 1))
3221 return false;
3222 from += sizeof "fall" - 1;
3223 if (*from == (all_upper ? 'S' : 's') && from[1] == ' ')
3224 from += 2;
3225 else if (*from == ' ' || *from == '-')
3226 from++;
3227 else if (*from != (all_upper ? 'T' : 't'))
3228 return false;
3229 if ((f == 'f' || *from != 'T') && (all_upper || *from != 't'))
3230 return false;
3231 if ((size_t) (pfile->buffer->cur - from) < sizeof "thru" - 1)
3232 return false;
3233 if (memcmp (from + 1, all_upper ? "HRU" : "hru", sizeof "hru" - 1))
3235 if ((size_t) (pfile->buffer->cur - from) < sizeof "through" - 1)
3236 return false;
3237 if (memcmp (from + 1, all_upper ? "HROUGH" : "hrough",
3238 sizeof "hrough" - 1))
3239 return false;
3240 from += sizeof "through" - 1;
3242 else
3243 from += sizeof "thru" - 1;
3244 while (*from == ' ' || *from == '\t' || *from == '.' || *from == '!')
3245 from++;
3246 if (*from == '-')
3248 from++;
3249 if (*comment_start == '*')
3253 while (*from && *from != '*'
3254 && *from != '\n' && *from != '\r')
3255 from++;
3256 if (*from != '*' || from[1] == '/')
3257 break;
3258 from++;
3260 while (1);
3262 else
3263 while (*from && *from != '\n' && *from != '\r')
3264 from++;
3267 /* C block comment. */
3268 if (*comment_start == '*')
3270 if (*from != '*' || from[1] != '/')
3271 return false;
3273 /* C++ line comment. */
3274 else if (*from != '\n')
3275 return false;
3277 return true;
3280 /* Allocate COUNT tokens for RUN. */
3281 void
3282 _cpp_init_tokenrun (tokenrun *run, unsigned int count)
3284 run->base = XNEWVEC (cpp_token, count);
3285 run->limit = run->base + count;
3286 run->next = NULL;
3289 /* Returns the next tokenrun, or creates one if there is none. */
3290 static tokenrun *
3291 next_tokenrun (tokenrun *run)
3293 if (run->next == NULL)
3295 run->next = XNEW (tokenrun);
3296 run->next->prev = run;
3297 _cpp_init_tokenrun (run->next, 250);
3300 return run->next;
3303 /* Return the number of not yet processed token in a given
3304 context. */
3306 _cpp_remaining_tokens_num_in_context (cpp_context *context)
3308 if (context->tokens_kind == TOKENS_KIND_DIRECT)
3309 return (LAST (context).token - FIRST (context).token);
3310 else if (context->tokens_kind == TOKENS_KIND_INDIRECT
3311 || context->tokens_kind == TOKENS_KIND_EXTENDED)
3312 return (LAST (context).ptoken - FIRST (context).ptoken);
3313 else
3314 abort ();
3317 /* Returns the token present at index INDEX in a given context. If
3318 INDEX is zero, the next token to be processed is returned. */
3319 static const cpp_token*
3320 _cpp_token_from_context_at (cpp_context *context, int index)
3322 if (context->tokens_kind == TOKENS_KIND_DIRECT)
3323 return &(FIRST (context).token[index]);
3324 else if (context->tokens_kind == TOKENS_KIND_INDIRECT
3325 || context->tokens_kind == TOKENS_KIND_EXTENDED)
3326 return FIRST (context).ptoken[index];
3327 else
3328 abort ();
3331 /* Look ahead in the input stream.  Returns the token INDEX places
   ahead (INDEX zero is the next token to be processed) without
   consuming anything: pending cpp_context objects are searched first,
   after which new tokens are lexed and then backed up again.  If a
   CPP_EOF or CPP_PRAGMA token is lexed before INDEX is reached, that
   token is returned instead.  */
3332 const cpp_token *
3333 cpp_peek_token (cpp_reader *pfile, int index)
3335 cpp_context *context = pfile->context;
3336 const cpp_token *peektok;
3337 int count;
3339 /* First, scan through any pending cpp_context objects. */
3340 while (context->prev)
3342 ptrdiff_t sz = _cpp_remaining_tokens_num_in_context (context);
3344 if (index < (int) sz)
3345 return _cpp_token_from_context_at (context, index);
/* Not in this context: skip its SZ tokens and try the previous one.  */
3346 index -= (int) sz;
3347 context = context->prev;
3350 /* We will have to read some new tokens after all (and do so
3351 without invalidating preceding tokens). */
3352 count = index;
3353 pfile->keep_tokens++;
3355 /* For peeked tokens temporarily disable line_change reporting,
3356 until the tokens are parsed for real. */
3357 void (*line_change) (cpp_reader *, const cpp_token *, int)
3358 = pfile->cb.line_change;
3359 pfile->cb.line_change = NULL;
3363 peektok = _cpp_lex_token (pfile);
3364 if (peektok->type == CPP_EOF)
/* Leaving through break skips the decrement in the loop condition, so
   do it by hand; COUNT - INDEX below then still equals the number of
   tokens lexed.  */
3366 index--;
3367 break;
3369 else if (peektok->type == CPP_PRAGMA)
3371 /* Don't peek past a pragma. */
3372 if (peektok == &pfile->directive_result)
3373 /* Save the pragma in the buffer. */
3374 *pfile->cur_token++ = *peektok;
3375 index--;
3376 break;
3379 while (index--);
/* Push back every token lexed above and restore the saved state.  */
3381 _cpp_backup_tokens_direct (pfile, count - index);
3382 pfile->keep_tokens--;
3383 pfile->cb.line_change = line_change;
3385 return peektok;
3388 /* Allocate a single token that is invalidated at the same time as the
3389 rest of the tokens on the line. Has its line and col set to the
3390 same as the last lexed token, so that diagnostics appear in the
3391 right place. */
3392 cpp_token *
3393 _cpp_temp_token (cpp_reader *pfile)
3395 cpp_token *old, *result;
/* SZ is the number of token slots left in the current run starting at
   cur_token; LA is the number of pending lookahead tokens.  */
3396 ptrdiff_t sz = pfile->cur_run->limit - pfile->cur_token;
3397 ptrdiff_t la = (ptrdiff_t) pfile->lookaheads;
/* OLD is the token just before cur_token; its location is copied into
   the new token at the end.  */
3399 old = pfile->cur_token - 1;
3400 /* Any pre-existing lookaheads must not be clobbered. */
3401 if (la)
/* The lookaheads extend to the end of the current run or beyond, so
   shifting them up by one slot involves the next run.  */
3403 if (sz <= la)
3405 tokenrun *next = next_tokenrun (pfile->cur_run);
3407 if (sz < la)
3408 memmove (next->base + 1, next->base,
3409 (la - sz) * sizeof (cpp_token));
3411 next->base[0] = pfile->cur_run->limit[-1];
/* Shift the lookaheads that stay within the current run.  */
3414 if (sz > 1)
3415 memmove (pfile->cur_token + 1, pfile->cur_token,
3416 MIN (la, sz - 1) * sizeof (cpp_token));
/* No room left in the current run: continue in the next one.  */
3419 if (!sz && pfile->cur_token == pfile->cur_run->limit)
3421 pfile->cur_run = next_tokenrun (pfile->cur_run);
3422 pfile->cur_token = pfile->cur_run->base;
3425 result = pfile->cur_token++;
3426 result->src_loc = old->src_loc;
3427 return result;
3430 /* We're at the beginning of a logical line (so not in
3431 directives-mode) and RESULT is a CPP_NAME with NODE_MODULE set. See
3432 if we should enter deferred_pragma mode to tokenize the rest of the
3433 line as a module control-line. */
3435 static void
3436 cpp_maybe_module_directive (cpp_reader *pfile, cpp_token *result)
3438 unsigned backup = 0; /* Tokens we peeked. */
3439 cpp_hashnode *node = result->val.node.node;
3440 cpp_token *peek = result;
3441 cpp_token *keyword = peek;
3442 cpp_hashnode *(&n_modules)[spec_nodes::M_HWM][2] = pfile->spec_nodes.n_modules;
/* Stays zero for 'module'; set nonzero for 'import' and '__import'.
   It is later stored into pfile->state.directive_file_token.  */
3443 int header_count = 0;
3445 /* Make sure the incoming state is as we expect it. This way we
3446 can restore it using constants. */
3447 gcc_checking_assert (!pfile->state.in_deferred_pragma
3448 && !pfile->state.skipping
3449 && !pfile->state.parsing_args
3450 && !pfile->state.angled_headers
3451 && (pfile->state.save_comments
3452 == !CPP_OPTION (pfile, discard_comments)));
3454 /* Enter directives mode sufficiently for peeking. We don't have
3455 to actually set in_directive. */
3456 pfile->state.in_deferred_pragma = true;
3458 /* These two fields are needed to process tokenization in deferred
3459 pragma mode. They are not used outside deferred pragma mode or
3460 directives mode. */
3461 pfile->state.pragma_allow_expansion = true;
3462 pfile->directive_line = result->src_loc;
3464 /* Saving comments is incompatible with directives mode. */
3465 pfile->state.save_comments = 0;
/* A leading 'export' must be followed by another NODE_MODULE name,
   which then becomes the keyword examined below.  */
3467 if (node == n_modules[spec_nodes::M_EXPORT][0])
3469 peek = _cpp_lex_direct (pfile);
3470 keyword = peek;
3471 backup++;
3472 if (keyword->type != CPP_NAME)
3473 goto not_module;
3474 node = keyword->val.node.node;
3475 if (!(node->flags & NODE_MODULE))
3476 goto not_module;
/* NOTE(review): the meaning of the extra 16 added below is not evident
   from this function; presumably it is a flag interpreted where
   directive_file_token is consumed -- confirm.  */
3479 if (node == n_modules[spec_nodes::M__IMPORT][0])
3480 /* __import */
3481 header_count = backup + 2 + 16;
3482 else if (node == n_modules[spec_nodes::M_IMPORT][0])
3483 /* import */
3484 header_count = backup + 2 + (CPP_OPTION (pfile, preprocessed) ? 16 : 0);
3485 else if (node == n_modules[spec_nodes::M_MODULE][0])
3486 ; /* module */
3487 else
3488 goto not_module;
3490 /* We've seen [export] {module|import|__import}. Check the next token. */
3491 if (header_count)
3492 /* After '{,__}import' a header name may appear. */
3493 pfile->state.angled_headers = true;
3494 peek = _cpp_lex_direct (pfile);
3495 backup++;
3497 /* ... import followed by identifier, ':', '<' or
3498 header-name preprocessing tokens, or module
3499 followed by cpp-identifier, ':' or ';' preprocessing
3500 tokens. C++ keywords are not yet relevant. */
3501 if (peek->type == CPP_NAME
3502 || peek->type == CPP_COLON
3503 || (header_count
3504 ? (peek->type == CPP_LESS
3505 || (peek->type == CPP_STRING && peek->val.str.text[0] != 'R')
3506 || peek->type == CPP_HEADER_NAME)
3507 : peek->type == CPP_SEMICOLON))
/* This is a module control-line.  When macro expansion is not allowed
   for it, bump prevent_expansion; _cpp_lex_direct undoes this when it
   produces the CPP_PRAGMA_EOL.  */
3509 pfile->state.pragma_allow_expansion = !CPP_OPTION (pfile, preprocessed);
3510 if (!pfile->state.pragma_allow_expansion)
3511 pfile->state.prevent_expansion++;
3513 if (!header_count && linemap_included_from
3514 (LINEMAPS_LAST_ORDINARY_MAP (pfile->line_table)))
3515 cpp_error_with_line (pfile, CPP_DL_ERROR, keyword->src_loc, 0,
3516 "module control-line cannot be in included file");
3518 /* The first one or two tokens cannot be macro names. */
3519 for (int ix = backup; ix--;)
3521 cpp_token *tok = ix ? keyword : result;
3522 cpp_hashnode *node = tok->val.node.node;
3524 /* Don't attempt to expand the token. */
3525 tok->flags |= NO_EXPAND;
3526 if (_cpp_defined_macro_p (node)
3527 && _cpp_maybe_notify_macro_use (pfile, node, tok->src_loc)
3528 && !cpp_fun_like_macro_p (node))
3529 cpp_error_with_line (pfile, CPP_DL_ERROR, tok->src_loc, 0,
3530 "module control-line \"%s\" cannot be"
3531 " an object-like macro",
3532 NODE_NAME (node));
3535 /* Map to underbar variants. */
3536 keyword->val.node.node = n_modules[header_count
3537 ? spec_nodes::M_IMPORT
3538 : spec_nodes::M_MODULE][1];
/* BACKUP is 1 only when there was no leading 'export'.  */
3539 if (backup != 1)
3540 result->val.node.node = n_modules[spec_nodes::M_EXPORT][1];
3542 /* Maybe tell the tokenizer we expect a header-name down the
3543 road. */
3544 pfile->state.directive_file_token = header_count;
3546 else
3548 not_module:
3549 /* Drop out of directive mode. */
3550 /* We asserted save_comments had this value upon entry. */
3551 pfile->state.save_comments
3552 = !CPP_OPTION (pfile, discard_comments);
3553 pfile->state.in_deferred_pragma = false;
3554 /* Do not let this remain on. */
3555 pfile->state.angled_headers = false;
3558 /* In either case we want to backup the peeked tokens. */
3559 if (backup)
3561 /* If we saw EOL, we should drop it, because this isn't a module
3562 control-line after all. */
3563 bool eol = peek->type == CPP_PRAGMA_EOL;
3564 if (!eol || backup > 1)
3566 /* Put the peeked tokens back. */
3567 _cpp_backup_tokens_direct (pfile, backup);
3568 /* But if the last one was an EOL, forget it. */
3569 if (eol)
3570 pfile->lookaheads--;
3575 /* Lex a token into RESULT (external interface). Takes care of issues
3576 like directive handling, token lookahead, multiple include
3577 optimization and skipping.  Returns the token.  Outside directives
   and deferred pragmas, tokens other than CPP_EOF are discarded
   while pfile->state.skipping is set. */
3578 const cpp_token *
3579 _cpp_lex_token (cpp_reader *pfile)
3581 cpp_token *result;
3583 for (;;)
/* Move on to the next token run once the current one is used up.  */
3585 if (pfile->cur_token == pfile->cur_run->limit)
3587 pfile->cur_run = next_tokenrun (pfile->cur_run);
3588 pfile->cur_token = pfile->cur_run->base;
3590 /* We assume that the current token is somewhere in the current
3591 run. */
3592 if (pfile->cur_token < pfile->cur_run->base
3593 || pfile->cur_token >= pfile->cur_run->limit)
3594 abort ();
/* Re-deliver an already lexed lookahead token instead of lexing a new
   one.  */
3596 if (pfile->lookaheads)
3598 pfile->lookaheads--;
3599 result = pfile->cur_token++;
3601 else
3602 result = _cpp_lex_direct (pfile);
/* Tokens at the beginning of a line get special treatment.  */
3604 if (result->flags & BOL)
3606 /* Is this a directive. If _cpp_handle_directive returns
3607 false, it is an assembler #. */
3608 if (result->type == CPP_HASH
3609 /* 6.10.3 p 11: Directives in a list of macro arguments
3610 gives undefined behavior. This implementation
3611 handles the directive as normal. */
3612 && pfile->state.parsing_args != 1)
3614 if (_cpp_handle_directive (pfile, result->flags & PREV_WHITE))
/* A directive that left only padding behind yields no token; lex the
   next one.  */
3616 if (pfile->directive_result.type == CPP_PADDING)
3617 continue;
3618 result = &pfile->directive_result;
3621 else if (pfile->state.in_deferred_pragma)
3622 result = &pfile->directive_result;
3623 else if (result->type == CPP_NAME
3624 && (result->val.node.node->flags & NODE_MODULE)
3625 && !pfile->state.skipping
3626 /* Unlike regular directives, we do not deal with
3627 tokenizing module directives as macro arguments.
3628 That's not permitted. */
3629 && !pfile->state.parsing_args)
3631 /* P1857. Before macro expansion, At start of logical
3632 line ... */
3633 /* We don't have to consider lookaheads at this point. */
3634 gcc_checking_assert (!pfile->lookaheads);
3636 cpp_maybe_module_directive (pfile, result);
3639 if (pfile->cb.line_change && !pfile->state.skipping)
3640 pfile->cb.line_change (pfile, result, pfile->state.parsing_args);
3643 /* We don't skip tokens in directives. */
3644 if (pfile->state.in_directive || pfile->state.in_deferred_pragma)
3645 break;
3647 /* Outside a directive, invalidate controlling macros. At file
3648 EOF, _cpp_lex_direct takes care of popping the buffer, so we never
3649 get here and MI optimization works. */
3650 pfile->mi_valid = false;
3652 if (!pfile->state.skipping || result->type == CPP_EOF)
3653 break;
3656 return result;
3659 /* Returns true if a fresh line has been loaded, or if the current
   buffer does not need one.  Returns false while in a directive, when
   the end of a buffer is reached while parsing arguments, and at the
   end of a buffer that has no previous buffer or is marked
   return_at_eof.  Exhausted buffers that can be popped are popped and
   the search continues in the buffer underneath. */
3660 bool
3661 _cpp_get_fresh_line (cpp_reader *pfile)
3663 /* We can't get a new line until we leave the current directive. */
3664 if (pfile->state.in_directive)
3665 return false;
3667 for (;;)
/* Re-read pfile->buffer each time round, as popping changes it.  */
3669 cpp_buffer *buffer = pfile->buffer;
3671 if (!buffer->need_line)
3672 return true;
/* There is still unread text in this buffer: prepare its next line.  */
3674 if (buffer->next_line < buffer->rlimit)
3676 _cpp_clean_line (pfile);
3677 return true;
3680 /* First, get out of parsing arguments state. */
3681 if (pfile->state.parsing_args)
3682 return false;
3684 /* End of buffer. Non-empty files should end in a newline. */
3685 if (buffer->buf != buffer->rlimit
3686 && buffer->next_line > buffer->rlimit
3687 && !buffer->from_stage3)
3689 /* Clip to buffer size. */
3690 buffer->next_line = buffer->rlimit;
3693 if (buffer->prev && !buffer->return_at_eof)
3694 _cpp_pop_buffer (pfile);
3695 else
3697 /* End of translation. Do not pop the buffer yet. Increment
3698 line number so that the EOF token is on a line of its own
3699 (_cpp_lex_direct doesn't increment in that case, because
3700 it's hard for it to distinguish this special case). */
3701 CPP_INCREMENT_LINE (pfile, 0);
3702 return false;
/* Helper for the lexer: set result->type to THEN_TYPE and consume one
   character if the next unread character of BUFFER equals CHAR,
   otherwise set result->type to ELSE_TYPE and consume nothing.  Relies
   on variables named `buffer' and `result' being in scope at the point
   of use.  Every parameter is parenthesized in the expansion so that
   arguments which are themselves expressions (for instance a
   conditional expression) keep their intended grouping.  */
#define IF_NEXT_IS(CHAR, THEN_TYPE, ELSE_TYPE)                  \
  do                                                            \
    {                                                           \
      result->type = (ELSE_TYPE);                               \
      if (*buffer->cur == (CHAR))                               \
        buffer->cur++, result->type = (THEN_TYPE);              \
    }                                                           \
  while (0)
3716 /* Lex a token into pfile->cur_token, which is also incremented, to
3717 get diagnostics pointing to the correct location.
3719 Does not handle issues such as token lookahead, multiple-include
3720 optimization, directives, skipping etc. This function is only
3721 suitable for use by _cpp_lex_token, and in special cases like
3722 lex_expansion_token which doesn't care for any of these issues.
3724 When meeting a newline, returns CPP_EOF if parsing a directive,
3725 otherwise returns to the start of the token buffer if permissible.
3726 Returns a pointer to the lexed token. */
3727 cpp_token *
3728 _cpp_lex_direct (cpp_reader *pfile)
3730 cppchar_t c;
3731 cpp_buffer *buffer;
3732 const unsigned char *comment_start;
3733 bool fallthrough_comment = false;
3734 cpp_token *result = pfile->cur_token++;
3736 fresh_line:
3737 result->flags = 0;
3738 buffer = pfile->buffer;
3739 if (buffer->need_line)
3741 if (pfile->state.in_deferred_pragma)
3743 /* This can happen in cases like:
3744 #define loop(x) whatever
3745 #pragma omp loop
3746 where when trying to expand loop we need to peek
3747 next token after loop, but aren't still in_deferred_pragma
3748 mode but are in in_directive mode, so buffer->need_line
3749 is set, a CPP_EOF is peeked. */
3750 result->type = CPP_PRAGMA_EOL;
3751 pfile->state.in_deferred_pragma = false;
3752 if (!pfile->state.pragma_allow_expansion)
3753 pfile->state.prevent_expansion--;
3754 return result;
3756 if (!_cpp_get_fresh_line (pfile))
3758 result->type = CPP_EOF;
3759 /* Not a real EOF in a directive or arg parsing -- we refuse
3760 to advance to the next file now, and will once we're out
3761 of those modes. */
3762 if (!pfile->state.in_directive && !pfile->state.parsing_args)
3764 /* Tell the compiler the line number of the EOF token. */
3765 result->src_loc = pfile->line_table->highest_line;
3766 result->flags = BOL;
3767 /* Now pop the buffer that _cpp_get_fresh_line did not. */
3768 _cpp_pop_buffer (pfile);
3770 return result;
/* A pending FALLTHROUGH comment does not carry over into a different
   buffer.  */
3772 if (buffer != pfile->buffer)
3773 fallthrough_comment = false;
/* Unless tokens have to be kept, start the new line at the beginning
   of the base token run again.  */
3774 if (!pfile->keep_tokens)
3776 pfile->cur_run = &pfile->base_run;
3777 result = pfile->base_run.base;
3778 pfile->cur_token = result + 1;
3780 result->flags = BOL;
3781 if (pfile->state.parsing_args == 2)
3782 result->flags |= PREV_WHITE;
3784 buffer = pfile->buffer;
3785 update_tokens_line:
3786 result->src_loc = pfile->line_table->highest_line;
3788 skipped_white:
3789 if (buffer->cur >= buffer->notes[buffer->cur_note].pos
3790 && !pfile->overlaid_buffer)
3792 _cpp_process_line_notes (pfile, false);
3793 result->src_loc = pfile->line_table->highest_line;
/* Fetch the character that determines the kind of token.  */
3795 c = *buffer->cur++;
3797 if (pfile->forced_token_location)
3798 result->src_loc = pfile->forced_token_location;
3799 else
3800 result->src_loc = linemap_position_for_column (pfile->line_table,
3801 CPP_BUF_COLUMN (buffer, buffer->cur));
3803 switch (c)
3805 case ' ': case '\t': case '\f': case '\v': case '\0':
3806 result->flags |= PREV_WHITE;
3807 skip_whitespace (pfile, c);
3808 goto skipped_white;
3810 case '\n':
3811 /* Increment the line, unless this is the last line ... */
3812 if (buffer->cur < buffer->rlimit
3813 /* ... or this is a #include, (where _cpp_stack_file needs to
3814 unwind by one line) ... */
3815 || (pfile->state.in_directive > 1
3816 /* ... except traditional-cpp increments this elsewhere. */
3817 && !CPP_OPTION (pfile, traditional)))
3818 CPP_INCREMENT_LINE (pfile, 0);
3819 buffer->need_line = true;
3820 if (pfile->state.in_deferred_pragma)
3822 /* Produce the PRAGMA_EOL on this line. File reading
3823 ensures there is always a \n at end of the buffer, thus
3824 in a deferred pragma we always see CPP_PRAGMA_EOL before
3825 any CPP_EOF. */
3826 result->type = CPP_PRAGMA_EOL;
3827 result->flags &= ~PREV_WHITE;
3828 pfile->state.in_deferred_pragma = false;
3829 if (!pfile->state.pragma_allow_expansion)
3830 pfile->state.prevent_expansion--;
3831 return result;
3833 goto fresh_line;
3835 case '0': case '1': case '2': case '3': case '4':
3836 case '5': case '6': case '7': case '8': case '9':
3838 struct normalize_state nst = INITIAL_NORMALIZE_STATE;
3839 result->type = CPP_NUMBER;
3840 lex_number (pfile, &result->val.str, &nst);
3841 warn_about_normalization (pfile, result, &nst);
3842 break;
3845 case 'L':
3846 case 'u':
3847 case 'U':
3848 case 'R':
3849 /* 'L', 'u', 'U', 'u8' or 'R' may introduce wide characters,
3850 wide strings or raw strings. */
3851 if (c == 'L' || CPP_OPTION (pfile, rliterals)
3852 || (c != 'R' && CPP_OPTION (pfile, uliterals)))
3854 if ((*buffer->cur == '\'' && c != 'R')
3855 || *buffer->cur == '"'
3856 || (*buffer->cur == 'R'
3857 && c != 'R'
3858 && buffer->cur[1] == '"'
3859 && CPP_OPTION (pfile, rliterals))
3860 || (*buffer->cur == '8'
3861 && c == 'u'
3862 && ((buffer->cur[1] == '"' || (buffer->cur[1] == '\''
3863 && CPP_OPTION (pfile, utf8_char_literals)))
3864 || (buffer->cur[1] == 'R' && buffer->cur[2] == '"'
3865 && CPP_OPTION (pfile, rliterals)))))
3867 lex_string (pfile, result, buffer->cur - 1);
3868 break;
3871 /* Fall through. */
3873 case '_':
3874 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
3875 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
3876 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
3877 case 's': case 't': case 'v': case 'w': case 'x':
3878 case 'y': case 'z':
3879 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
3880 case 'G': case 'H': case 'I': case 'J': case 'K':
3881 case 'M': case 'N': case 'O': case 'P': case 'Q':
3882 case 'S': case 'T': case 'V': case 'W': case 'X':
3883 case 'Y': case 'Z':
3884 result->type = CPP_NAME;
3886 struct normalize_state nst = INITIAL_NORMALIZE_STATE;
3887 result->val.node.node = lex_identifier (pfile, buffer->cur - 1, false,
3888 &nst,
3889 &result->val.node.spelling);
3890 warn_about_normalization (pfile, result, &nst);
3893 /* Convert named operators to their proper types. */
3894 if (result->val.node.node->flags & NODE_OPERATOR)
3896 result->flags |= NAMED_OP;
3897 result->type = (enum cpp_ttype) result->val.node.node->directive_index;
3900 /* Signal FALLTHROUGH comment followed by another token. */
3901 if (fallthrough_comment)
3902 result->flags |= PREV_FALLTHROUGH;
3903 break;
3905 case '\'':
3906 case '"':
3907 lex_string (pfile, result, buffer->cur - 1);
3908 break;
3910 case '/':
3911 /* A potential block or line comment. */
3912 comment_start = buffer->cur;
3913 c = *buffer->cur;
3915 if (c == '*')
3917 if (_cpp_skip_block_comment (pfile))
3918 cpp_error (pfile, CPP_DL_ERROR, "unterminated comment");
3920 else if (c == '/' && ! CPP_OPTION (pfile, traditional))
3922 /* Don't warn for system headers. */
3923 if (_cpp_in_system_header (pfile))
3925 /* Warn about comments if pedantically GNUC89, and not
3926 in system headers. */
3927 else if (CPP_OPTION (pfile, lang) == CLK_GNUC89
3928 && CPP_PEDANTIC (pfile)
3929 && ! buffer->warned_cplusplus_comments)
3931 if (cpp_error (pfile, CPP_DL_PEDWARN,
3932 "C++ style comments are not allowed in ISO C90"))
3933 cpp_error (pfile, CPP_DL_NOTE,
3934 "(this will be reported only once per input file)");
3935 buffer->warned_cplusplus_comments = 1;
3937 /* Or if specifically desired via -Wc90-c99-compat. */
3938 else if (CPP_OPTION (pfile, cpp_warn_c90_c99_compat) > 0
3939 && ! CPP_OPTION (pfile, cplusplus)
3940 && ! buffer->warned_cplusplus_comments)
3942 if (cpp_error (pfile, CPP_DL_WARNING,
3943 "C++ style comments are incompatible with C90"))
3944 cpp_error (pfile, CPP_DL_NOTE,
3945 "(this will be reported only once per input file)");
3946 buffer->warned_cplusplus_comments = 1;
3948 /* In C89/C94, C++ style comments are forbidden. */
3949 else if ((CPP_OPTION (pfile, lang) == CLK_STDC89
3950 || CPP_OPTION (pfile, lang) == CLK_STDC94))
3952 /* But don't be confused about valid code such as
3953 - // immediately followed by *,
3954 - // in a preprocessing directive,
3955 - // in an #if 0 block. */
3956 if (buffer->cur[1] == '*'
3957 || pfile->state.in_directive
3958 || pfile->state.skipping)
3960 result->type = CPP_DIV;
3961 break;
3963 else if (! buffer->warned_cplusplus_comments)
3965 if (cpp_error (pfile, CPP_DL_ERROR,
3966 "C++ style comments are not allowed in "
3967 "ISO C90"))
3968 cpp_error (pfile, CPP_DL_NOTE,
3969 "(this will be reported only once per input "
3970 "file)");
3971 buffer->warned_cplusplus_comments = 1;
3974 if (skip_line_comment (pfile) && CPP_OPTION (pfile, warn_comments))
3975 cpp_warning (pfile, CPP_W_COMMENTS, "multi-line comment");
3977 else if (c == '=')
3979 buffer->cur++;
3980 result->type = CPP_DIV_EQ;
3981 break;
3983 else
3985 result->type = CPP_DIV;
3986 break;
/* From here on a comment has been skipped; the operator cases above
   left the switch already.  */
3989 if (fallthrough_comment_p (pfile, comment_start))
3990 fallthrough_comment = true;
3992 if (pfile->cb.comment)
3994 size_t len = pfile->buffer->cur - comment_start;
3995 pfile->cb.comment (pfile, result->src_loc, comment_start - 1,
3996 len + 1);
/* When comments are not saved, the comment only counts as white space
   and lexing continues with whatever follows it.  */
3999 if (!pfile->state.save_comments)
4001 result->flags |= PREV_WHITE;
4002 goto update_tokens_line;
4005 if (fallthrough_comment)
4006 result->flags |= PREV_FALLTHROUGH;
4008 /* Save the comment as a token in its own right. */
4009 save_comment (pfile, result, comment_start, c);
4010 break;
4012 case '<':
4013 if (pfile->state.angled_headers)
4015 lex_string (pfile, result, buffer->cur - 1);
4016 if (result->type != CPP_LESS)
4017 break;
4020 result->type = CPP_LESS;
4021 if (*buffer->cur == '=')
4023 buffer->cur++, result->type = CPP_LESS_EQ;
4024 if (*buffer->cur == '>'
4025 && CPP_OPTION (pfile, cplusplus)
4026 && CPP_OPTION (pfile, lang) >= CLK_GNUCXX20)
4027 buffer->cur++, result->type = CPP_SPACESHIP;
4029 else if (*buffer->cur == '<')
4031 buffer->cur++;
4032 IF_NEXT_IS ('=', CPP_LSHIFT_EQ, CPP_LSHIFT);
4034 else if (CPP_OPTION (pfile, digraphs))
4036 if (*buffer->cur == ':')
4038 /* C++11 [2.5/3 lex.pptoken], "Otherwise, if the next
4039 three characters are <:: and the subsequent character
4040 is neither : nor >, the < is treated as a preprocessor
4041 token by itself". */
4042 if (CPP_OPTION (pfile, cplusplus)
4043 && CPP_OPTION (pfile, lang) != CLK_CXX98
4044 && CPP_OPTION (pfile, lang) != CLK_GNUCXX
4045 && buffer->cur[1] == ':'
4046 && buffer->cur[2] != ':' && buffer->cur[2] != '>')
4047 break;
4049 buffer->cur++;
4050 result->flags |= DIGRAPH;
4051 result->type = CPP_OPEN_SQUARE;
4053 else if (*buffer->cur == '%')
4055 buffer->cur++;
4056 result->flags |= DIGRAPH;
4057 result->type = CPP_OPEN_BRACE;
4060 break;
4062 case '>':
4063 result->type = CPP_GREATER;
4064 if (*buffer->cur == '=')
4065 buffer->cur++, result->type = CPP_GREATER_EQ;
4066 else if (*buffer->cur == '>')
4068 buffer->cur++;
4069 IF_NEXT_IS ('=', CPP_RSHIFT_EQ, CPP_RSHIFT);
4071 break;
4073 case '%':
4074 result->type = CPP_MOD;
4075 if (*buffer->cur == '=')
4076 buffer->cur++, result->type = CPP_MOD_EQ;
4077 else if (CPP_OPTION (pfile, digraphs))
4079 if (*buffer->cur == ':')
4081 buffer->cur++;
4082 result->flags |= DIGRAPH;
4083 result->type = CPP_HASH;
4084 if (*buffer->cur == '%' && buffer->cur[1] == ':')
4085 buffer->cur += 2, result->type = CPP_PASTE, result->val.token_no = 0;
4087 else if (*buffer->cur == '>')
4089 buffer->cur++;
4090 result->flags |= DIGRAPH;
4091 result->type = CPP_CLOSE_BRACE;
4094 break;
4096 case '.':
4097 result->type = CPP_DOT;
4098 if (ISDIGIT (*buffer->cur))
4100 struct normalize_state nst = INITIAL_NORMALIZE_STATE;
4101 result->type = CPP_NUMBER;
4102 lex_number (pfile, &result->val.str, &nst);
4103 warn_about_normalization (pfile, result, &nst);
4105 else if (*buffer->cur == '.' && buffer->cur[1] == '.')
4106 buffer->cur += 2, result->type = CPP_ELLIPSIS;
4107 else if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
4108 buffer->cur++, result->type = CPP_DOT_STAR;
4109 break;
4111 case '+':
4112 result->type = CPP_PLUS;
4113 if (*buffer->cur == '+')
4114 buffer->cur++, result->type = CPP_PLUS_PLUS;
4115 else if (*buffer->cur == '=')
4116 buffer->cur++, result->type = CPP_PLUS_EQ;
4117 break;
4119 case '-':
4120 result->type = CPP_MINUS;
4121 if (*buffer->cur == '>')
4123 buffer->cur++;
4124 result->type = CPP_DEREF;
4125 if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
4126 buffer->cur++, result->type = CPP_DEREF_STAR;
4128 else if (*buffer->cur == '-')
4129 buffer->cur++, result->type = CPP_MINUS_MINUS;
4130 else if (*buffer->cur == '=')
4131 buffer->cur++, result->type = CPP_MINUS_EQ;
4132 break;
4134 case '&':
4135 result->type = CPP_AND;
4136 if (*buffer->cur == '&')
4137 buffer->cur++, result->type = CPP_AND_AND;
4138 else if (*buffer->cur == '=')
4139 buffer->cur++, result->type = CPP_AND_EQ;
4140 break;
4142 case '|':
4143 result->type = CPP_OR;
4144 if (*buffer->cur == '|')
4145 buffer->cur++, result->type = CPP_OR_OR;
4146 else if (*buffer->cur == '=')
4147 buffer->cur++, result->type = CPP_OR_EQ;
4148 break;
4150 case ':':
4151 result->type = CPP_COLON;
4152 if (*buffer->cur == ':' && CPP_OPTION (pfile, scope))
4153 buffer->cur++, result->type = CPP_SCOPE;
4154 else if (*buffer->cur == '>' && CPP_OPTION (pfile, digraphs))
4156 buffer->cur++;
4157 result->flags |= DIGRAPH;
4158 result->type = CPP_CLOSE_SQUARE;
4160 break;
4162 case '*': IF_NEXT_IS ('=', CPP_MULT_EQ, CPP_MULT); break;
4163 case '=': IF_NEXT_IS ('=', CPP_EQ_EQ, CPP_EQ); break;
4164 case '!': IF_NEXT_IS ('=', CPP_NOT_EQ, CPP_NOT); break;
4165 case '^': IF_NEXT_IS ('=', CPP_XOR_EQ, CPP_XOR); break;
4166 case '#': IF_NEXT_IS ('#', CPP_PASTE, CPP_HASH); result->val.token_no = 0; break;
4168 case '?': result->type = CPP_QUERY; break;
4169 case '~': result->type = CPP_COMPL; break;
4170 case ',': result->type = CPP_COMMA; break;
4171 case '(': result->type = CPP_OPEN_PAREN; break;
4172 case ')': result->type = CPP_CLOSE_PAREN; break;
4173 case '[': result->type = CPP_OPEN_SQUARE; break;
4174 case ']': result->type = CPP_CLOSE_SQUARE; break;
4175 case '{': result->type = CPP_OPEN_BRACE; break;
4176 case '}': result->type = CPP_CLOSE_BRACE; break;
4177 case ';': result->type = CPP_SEMICOLON; break;
4179 /* @ is a punctuator in Objective-C. */
4180 case '@': result->type = CPP_ATSIGN; break;
4182 default:
4184 const uchar *base = --buffer->cur;
/* Number of upcoming continuation bytes for which the invalid UTF-8
   warning is skipped; it is set after a warning was issued for the
   sequence they belong to, presumably to report it only once.  */
4185 static int no_warn_cnt;
4187 /* Check for an extended identifier ($ or UCN or UTF-8). */
4188 struct normalize_state nst = INITIAL_NORMALIZE_STATE;
4189 if (forms_identifier_p (pfile, true, &nst))
4191 result->type = CPP_NAME;
4192 result->val.node.node = lex_identifier (pfile, base, true, &nst,
4193 &result->val.node.spelling);
4194 warn_about_normalization (pfile, result, &nst);
4195 break;
4198 /* Otherwise this will form a CPP_OTHER token. Parse valid UTF-8 as a
4199 single token. */
4200 buffer->cur++;
4201 if (c >= utf8_signifier)
4203 const uchar *pstr = base;
4204 cppchar_t s;
4205 if (_cpp_valid_utf8 (pfile, &pstr, buffer->rlimit, 0, NULL, &s))
4207 if (s > UCS_LIMIT && CPP_OPTION (pfile, cpp_warn_invalid_utf8))
4209 buffer->cur = base;
4210 _cpp_warn_invalid_utf8 (pfile);
4212 buffer->cur = pstr;
4214 else if (CPP_OPTION (pfile, cpp_warn_invalid_utf8))
4216 buffer->cur = base;
4217 const uchar *end = _cpp_warn_invalid_utf8 (pfile);
4218 buffer->cur = base + 1;
4219 no_warn_cnt = end - buffer->cur;
4222 else if (c >= utf8_continuation
4223 && CPP_OPTION (pfile, cpp_warn_invalid_utf8))
4225 if (no_warn_cnt)
4226 --no_warn_cnt;
4227 else
4229 buffer->cur = base;
4230 _cpp_warn_invalid_utf8 (pfile);
4231 buffer->cur = base + 1;
4234 create_literal (pfile, result, base, buffer->cur - base, CPP_OTHER);
4235 break;
4240 /* Potentially convert the location of the token to a range. */
4241 if (result->src_loc >= RESERVED_LOCATION_COUNT
4242 && result->type != CPP_EOF)
4244 /* Ensure that any line notes are processed, so that we have the
4245 correct physical line/column for the end-point of the token even
4246 when a logical line is split via one or more backslashes. */
4247 if (buffer->cur >= buffer->notes[buffer->cur_note].pos
4248 && !pfile->overlaid_buffer)
4249 _cpp_process_line_notes (pfile, false);
4251 source_range tok_range;
4252 tok_range.m_start = result->src_loc;
4253 tok_range.m_finish
4254 = linemap_position_for_column (pfile->line_table,
4255 CPP_BUF_COLUMN (buffer, buffer->cur));
4257 result->src_loc = COMBINE_LOCATION_DATA (pfile->line_table,
4258 result->src_loc,
4259 tok_range, NULL);
4262 return result;
4265 /* An upper bound on the number of bytes needed to spell TOKEN.
4266 Does not include preceding whitespace. */
4267 unsigned int
4268 cpp_token_len (const cpp_token *token)
4270 unsigned int len;
4272 switch (TOKEN_SPELL (token))
4274 default: len = 6; break;
4275 case SPELL_LITERAL: len = token->val.str.len; break;
4276 case SPELL_IDENT: len = NODE_LEN (token->val.node.node) * 10; break;
4279 return len;
/* Decode the UTF-8 sequence starting at NAME and store the equivalent
   \U escape (a backslash, 'U' and eight lowercase hex digits, ten
   bytes in total, not NUL-terminated) in BUFFER.  Returns the number
   of bytes the sequence occupies in NAME.  Aborts if a byte after the
   first one is not a UTF-8 continuation byte.  */

static size_t
utf8_to_ucn (unsigned char *buffer, const unsigned char *name)
{
  static const char hex_digits[] = "0123456789abcdef";
  const unsigned char lead = *name;
  unsigned long codepoint;
  unsigned probe;
  int seq_len = 0;
  int nibble;
  int k;

  /* The number of leading one bits in the first byte gives the length
     of the whole sequence.  */
  probe = lead;
  while (probe & 0x80)
    {
      seq_len++;
      probe <<= 1;
    }

  /* The remaining low bits of the first byte are payload; every
     further byte contributes six more bits.  */
  codepoint = lead & (0x7F >> seq_len);
  for (k = 1; k < seq_len; k++)
    {
      const unsigned char trail = name[k];

      /* Ill-formed UTF-8.  */
      if ((trail & 0xC0) != 0x80)
        abort ();

      codepoint = (codepoint << 6) | (trail & 0x3F);
    }

  buffer[0] = '\\';
  buffer[1] = 'U';
  for (nibble = 0; nibble < 8; nibble++)
    buffer[2 + nibble] = hex_digits[(codepoint >> (4 * (7 - nibble))) & 0xF];

  return seq_len;
}
4316 /* Given a token TYPE corresponding to a digraph, return a pointer to
4317 the spelling of the digraph. */
4318 static const unsigned char *
4319 cpp_digraph2name (enum cpp_ttype type)
4321 return digraph_spellings[(int) type - (int) CPP_FIRST_DIGRAPH];
4324 /* Write the spelling of an identifier IDENT, using UCNs, to BUFFER.
4325 The buffer must already contain the enough space to hold the
4326 token's spelling. Returns a pointer to the character after the
4327 last character written. */
4328 unsigned char *
4329 _cpp_spell_ident_ucns (unsigned char *buffer, cpp_hashnode *ident)
4331 size_t i;
4332 const unsigned char *name = NODE_NAME (ident);
4334 for (i = 0; i < NODE_LEN (ident); i++)
4335 if (name[i] & ~0x7F)
4337 i += utf8_to_ucn (buffer, name + i) - 1;
4338 buffer += 10;
4340 else
4341 *buffer++ = name[i];
4343 return buffer;
4346 /* Write the spelling of a token TOKEN to BUFFER. The buffer must
4347 already contain enough space to hold the token's spelling.
4348 Returns a pointer to the character after the last character written.
4349 FORSTRING is true if this is to be the spelling after translation
4350 phase 1 (with the original spelling of extended identifiers), false
4351 if extended identifiers should always be written using UCNs (there is
4352 no option for always writing them in the internal UTF-8 form).
4353 FIXME: Would be nice if we didn't need the PFILE argument. */
4354 unsigned char *
4355 cpp_spell_token (cpp_reader *pfile, const cpp_token *token,
4356 unsigned char *buffer, bool forstring)
4358 switch (TOKEN_SPELL (token))
4360 case SPELL_OPERATOR:
4362 const unsigned char *spelling;
4363 unsigned char c;
/* Digraphs have their own spelling table; named operators are spelled
   like identifiers; everything else comes from token_spellings.  */
4365 if (token->flags & DIGRAPH)
4366 spelling = cpp_digraph2name (token->type);
4367 else if (token->flags & NAMED_OP)
4368 goto spell_ident;
4369 else
4370 spelling = TOKEN_NAME (token);
/* Copy the spelling without its terminating NUL.  */
4372 while ((c = *spelling++) != '\0')
4373 *buffer++ = c;
4375 break;
4377 spell_ident:
4378 case SPELL_IDENT:
4379 if (forstring)
/* Use the spelling node, which is copied byte for byte.  */
4381 memcpy (buffer, NODE_NAME (token->val.node.spelling),
4382 NODE_LEN (token->val.node.spelling));
4383 buffer += NODE_LEN (token->val.node.spelling);
4385 else
4386 buffer = _cpp_spell_ident_ucns (buffer, token->val.node.node);
4387 break;
4389 case SPELL_LITERAL:
4390 memcpy (buffer, token->val.str.text, token->val.str.len);
4391 buffer += token->val.str.len;
4392 break;
/* Tokens without a spelling are reported as an internal error and
   nothing is written.  */
4394 case SPELL_NONE:
4395 cpp_error (pfile, CPP_DL_ICE,
4396 "unspellable token %s", TOKEN_NAME (token));
4397 break;
4400 return buffer;
4403 /* Returns TOKEN spelt as a null-terminated string. The string is
4404 freed when the reader is destroyed. Useful for diagnostics. */
4405 unsigned char *
4406 cpp_token_as_text (cpp_reader *pfile, const cpp_token *token)
4408 unsigned int len = cpp_token_len (token) + 1;
4409 unsigned char *start = _cpp_unaligned_alloc (pfile, len), *end;
4411 end = cpp_spell_token (pfile, token, start, false);
4412 end[0] = '\0';
4414 return start;
4417 /* Returns a pointer to a string which spells the token defined by
4418 TYPE and FLAGS. Used by C front ends, which really should move to
4419 using cpp_token_as_text. */
4420 const char *
4421 cpp_type2name (enum cpp_ttype type, unsigned char flags)
4423 if (flags & DIGRAPH)
4424 return (const char *) cpp_digraph2name (type);
4425 else if (flags & NAMED_OP)
4426 return cpp_named_operator2name (type);
4428 return (const char *) token_spellings[type].name;
4431 /* Writes the spelling of token to FP, without any preceding space.
4432 Separated from cpp_spell_token for efficiency - to avoid stdio
4433 double-buffering. */
4434 void
4435 cpp_output_token (const cpp_token *token, FILE *fp)
4437 switch (TOKEN_SPELL (token))
4439 case SPELL_OPERATOR:
4441 const unsigned char *spelling;
4442 int c;
4444 if (token->flags & DIGRAPH)
4445 spelling = cpp_digraph2name (token->type);
4446 else if (token->flags & NAMED_OP)
4447 goto spell_ident;
4448 else
4449 spelling = TOKEN_NAME (token);
4451 c = *spelling;
4453 putc (c, fp);
4454 while ((c = *++spelling) != '\0');
4456 break;
4458 spell_ident:
4459 case SPELL_IDENT:
4461 size_t i;
4462 const unsigned char * name = NODE_NAME (token->val.node.node);
4464 for (i = 0; i < NODE_LEN (token->val.node.node); i++)
4465 if (name[i] & ~0x7F)
4467 unsigned char buffer[10];
4468 i += utf8_to_ucn (buffer, name + i) - 1;
4469 fwrite (buffer, 1, 10, fp);
4471 else
4472 fputc (NODE_NAME (token->val.node.node)[i], fp);
4474 break;
4476 case SPELL_LITERAL:
4477 if (token->type == CPP_HEADER_NAME)
4478 fputc ('"', fp);
4479 fwrite (token->val.str.text, 1, token->val.str.len, fp);
4480 if (token->type == CPP_HEADER_NAME)
4481 fputc ('"', fp);
4482 break;
4484 case SPELL_NONE:
4485 /* An error, most probably. */
4486 break;
4490 /* Compare two tokens. */
4492 _cpp_equiv_tokens (const cpp_token *a, const cpp_token *b)
4494 if (a->type == b->type && a->flags == b->flags)
4495 switch (TOKEN_SPELL (a))
4497 default: /* Keep compiler happy. */
4498 case SPELL_OPERATOR:
4499 /* token_no is used to track where multiple consecutive ##
4500 tokens were originally located. */
4501 return (a->type != CPP_PASTE || a->val.token_no == b->val.token_no);
4502 case SPELL_NONE:
4503 return (a->type != CPP_MACRO_ARG
4504 || (a->val.macro_arg.arg_no == b->val.macro_arg.arg_no
4505 && a->val.macro_arg.spelling == b->val.macro_arg.spelling));
4506 case SPELL_IDENT:
4507 return (a->val.node.node == b->val.node.node
4508 && a->val.node.spelling == b->val.node.spelling);
4509 case SPELL_LITERAL:
4510 return (a->val.str.len == b->val.str.len
4511 && !memcmp (a->val.str.text, b->val.str.text,
4512 a->val.str.len));
4515 return 0;
4518 /* Returns nonzero if a space should be inserted to avoid an
4519 accidental token paste for output. For simplicity, it is
4520 conservative, and occasionally advises a space where one is not
4521 needed, e.g. "." and ".2". */
4523 cpp_avoid_paste (cpp_reader *pfile, const cpp_token *token1,
4524 const cpp_token *token2)
4526 enum cpp_ttype a = token1->type, b = token2->type;
4527 cppchar_t c;
4529 if (token1->flags & NAMED_OP)
4530 a = CPP_NAME;
4531 if (token2->flags & NAMED_OP)
4532 b = CPP_NAME;
4534 c = EOF;
4535 if (token2->flags & DIGRAPH)
4536 c = digraph_spellings[(int) b - (int) CPP_FIRST_DIGRAPH][0];
4537 else if (token_spellings[b].category == SPELL_OPERATOR)
4538 c = token_spellings[b].name[0];
4540 /* Quickly get everything that can paste with an '='. */
4541 if ((int) a <= (int) CPP_LAST_EQ && c == '=')
4542 return 1;
4544 switch (a)
4546 case CPP_GREATER: return c == '>';
4547 case CPP_LESS: return c == '<' || c == '%' || c == ':';
4548 case CPP_PLUS: return c == '+';
4549 case CPP_MINUS: return c == '-' || c == '>';
4550 case CPP_DIV: return c == '/' || c == '*'; /* Comments. */
4551 case CPP_MOD: return c == ':' || c == '>';
4552 case CPP_AND: return c == '&';
4553 case CPP_OR: return c == '|';
4554 case CPP_COLON: return c == ':' || c == '>';
4555 case CPP_DEREF: return c == '*';
4556 case CPP_DOT: return c == '.' || c == '%' || b == CPP_NUMBER;
4557 case CPP_HASH: return c == '#' || c == '%'; /* Digraph form. */
4558 case CPP_PRAGMA:
4559 case CPP_NAME: return ((b == CPP_NUMBER
4560 && name_p (pfile, &token2->val.str))
4561 || b == CPP_NAME
4562 || b == CPP_CHAR || b == CPP_STRING); /* L */
4563 case CPP_NUMBER: return (b == CPP_NUMBER || b == CPP_NAME
4564 || b == CPP_CHAR
4565 || c == '.' || c == '+' || c == '-');
4566 /* UCNs */
4567 case CPP_OTHER: return ((token1->val.str.text[0] == '\\'
4568 && b == CPP_NAME)
4569 || (CPP_OPTION (pfile, objc)
4570 && token1->val.str.text[0] == '@'
4571 && (b == CPP_NAME || b == CPP_STRING)));
4572 case CPP_LESS_EQ: return c == '>';
4573 case CPP_STRING:
4574 case CPP_WSTRING:
4575 case CPP_UTF8STRING:
4576 case CPP_STRING16:
4577 case CPP_STRING32: return (CPP_OPTION (pfile, user_literals)
4578 && (b == CPP_NAME
4579 || (TOKEN_SPELL (token2) == SPELL_LITERAL
4580 && ISIDST (token2->val.str.text[0]))));
4582 default: break;
4585 return 0;
4588 /* Output all the remaining tokens on the current line, and a newline
4589 character, to FP. Leading whitespace is removed. If there are
4590 macros, special token padding is not performed. */
4591 void
4592 cpp_output_line (cpp_reader *pfile, FILE *fp)
4594 const cpp_token *token;
4596 token = cpp_get_token (pfile);
4597 while (token->type != CPP_EOF)
4599 cpp_output_token (token, fp);
4600 token = cpp_get_token (pfile);
4601 if (token->flags & PREV_WHITE)
4602 putc (' ', fp);
4605 putc ('\n', fp);
4608 /* Return a string representation of all the remaining tokens on the
4609 current line. The result is allocated using xmalloc and must be
4610 freed by the caller. */
4611 unsigned char *
4612 cpp_output_line_to_string (cpp_reader *pfile, const unsigned char *dir_name)
4614 const cpp_token *token;
4615 unsigned int out = dir_name ? ustrlen (dir_name) : 0;
4616 unsigned int alloced = 120 + out;
4617 unsigned char *result = (unsigned char *) xmalloc (alloced);
4619 /* If DIR_NAME is empty, there are no initial contents. */
4620 if (dir_name)
4622 sprintf ((char *) result, "#%s ", dir_name);
4623 out += 2;
4626 token = cpp_get_token (pfile);
4627 while (token->type != CPP_EOF)
4629 unsigned char *last;
4630 /* Include room for a possible space and the terminating nul. */
4631 unsigned int len = cpp_token_len (token) + 2;
4633 if (out + len > alloced)
4635 alloced *= 2;
4636 if (out + len > alloced)
4637 alloced = out + len;
4638 result = (unsigned char *) xrealloc (result, alloced);
4641 last = cpp_spell_token (pfile, token, &result[out], 0);
4642 out = last - result;
4644 token = cpp_get_token (pfile);
4645 if (token->flags & PREV_WHITE)
4646 result[out++] = ' ';
4649 result[out] = '\0';
4650 return result;
/* Memory buffers.  Changing these three constants can have a dramatic
   effect on performance.  The values here are reasonable defaults,
   but might be tuned.  If you adjust them, be sure to test across a
   range of uses of cpplib, including heavy nested function-like macro
   expansion.  Also check the change in peak memory usage (NJAMD is a
   good tool for this).

   MIN_BUFF_SIZE is the smallest buffer new_buff will create.
   BUFF_SIZE_UPPER_BOUND (MIN_SIZE) is the largest free buffer that
   will be reused for a request of MIN_SIZE bytes.
   EXTENDED_BUFF_SIZE (BUFF, MIN_EXTRA) is the size requested when
   BUFF is extended: MIN_EXTRA plus twice the bytes between BUFF's
   cur and limit.  Every macro argument is parenthesized so that an
   expression argument groups as the caller wrote it.  */
#define MIN_BUFF_SIZE 8000
#define BUFF_SIZE_UPPER_BOUND(MIN_SIZE) (MIN_BUFF_SIZE + (MIN_SIZE) * 3 / 2)
#define EXTENDED_BUFF_SIZE(BUFF, MIN_EXTRA) \
        ((MIN_EXTRA) + ((BUFF)->limit - (BUFF)->cur) * 2)

#if MIN_BUFF_SIZE > BUFF_SIZE_UPPER_BOUND (0)
#error BUFF_SIZE_UPPER_BOUND must be at least as large as MIN_BUFF_SIZE!
#endif
4668 /* Create a new allocation buffer. Place the control block at the end
4669 of the buffer, so that buffer overflows will cause immediate chaos. */
4670 static _cpp_buff *
4671 new_buff (size_t len)
4673 _cpp_buff *result;
4674 unsigned char *base;
4676 if (len < MIN_BUFF_SIZE)
4677 len = MIN_BUFF_SIZE;
4678 len = CPP_ALIGN (len);
4680 #ifdef ENABLE_VALGRIND_ANNOTATIONS
4681 /* Valgrind warns about uses of interior pointers, so put _cpp_buff
4682 struct first. */
4683 size_t slen = CPP_ALIGN2 (sizeof (_cpp_buff), 2 * DEFAULT_ALIGNMENT);
4684 base = XNEWVEC (unsigned char, len + slen);
4685 result = (_cpp_buff *) base;
4686 base += slen;
4687 #else
4688 base = XNEWVEC (unsigned char, len + sizeof (_cpp_buff));
4689 result = (_cpp_buff *) (base + len);
4690 #endif
4691 result->base = base;
4692 result->cur = base;
4693 result->limit = base + len;
4694 result->next = NULL;
4695 return result;
4698 /* Place a chain of unwanted allocation buffers on the free list. */
4699 void
4700 _cpp_release_buff (cpp_reader *pfile, _cpp_buff *buff)
4702 _cpp_buff *end = buff;
4704 while (end->next)
4705 end = end->next;
4706 end->next = pfile->free_buffs;
4707 pfile->free_buffs = buff;
4710 /* Return a free buffer of size at least MIN_SIZE. */
4711 _cpp_buff *
4712 _cpp_get_buff (cpp_reader *pfile, size_t min_size)
4714 _cpp_buff *result, **p;
4716 for (p = &pfile->free_buffs;; p = &(*p)->next)
4718 size_t size;
4720 if (*p == NULL)
4721 return new_buff (min_size);
4722 result = *p;
4723 size = result->limit - result->base;
4724 /* Return a buffer that's big enough, but don't waste one that's
4725 way too big. */
4726 if (size >= min_size && size <= BUFF_SIZE_UPPER_BOUND (min_size))
4727 break;
4730 *p = result->next;
4731 result->next = NULL;
4732 result->cur = result->base;
4733 return result;
4736 /* Creates a new buffer with enough space to hold the uncommitted
4737 remaining bytes of BUFF, and at least MIN_EXTRA more bytes. Copies
4738 the excess bytes to the new buffer. Chains the new buffer after
4739 BUFF, and returns the new buffer. */
4740 _cpp_buff *
4741 _cpp_append_extend_buff (cpp_reader *pfile, _cpp_buff *buff, size_t min_extra)
4743 size_t size = EXTENDED_BUFF_SIZE (buff, min_extra);
4744 _cpp_buff *new_buff = _cpp_get_buff (pfile, size);
4746 buff->next = new_buff;
4747 memcpy (new_buff->base, buff->cur, BUFF_ROOM (buff));
4748 return new_buff;
4751 /* Creates a new buffer with enough space to hold the uncommitted
4752 remaining bytes of the buffer pointed to by BUFF, and at least
4753 MIN_EXTRA more bytes. Copies the excess bytes to the new buffer.
4754 Chains the new buffer before the buffer pointed to by BUFF, and
4755 updates the pointer to point to the new buffer. */
4756 void
4757 _cpp_extend_buff (cpp_reader *pfile, _cpp_buff **pbuff, size_t min_extra)
4759 _cpp_buff *new_buff, *old_buff = *pbuff;
4760 size_t size = EXTENDED_BUFF_SIZE (old_buff, min_extra);
4762 new_buff = _cpp_get_buff (pfile, size);
4763 memcpy (new_buff->base, old_buff->cur, BUFF_ROOM (old_buff));
4764 new_buff->next = old_buff;
4765 *pbuff = new_buff;
4768 /* Free a chain of buffers starting at BUFF. */
4769 void
4770 _cpp_free_buff (_cpp_buff *buff)
4772 _cpp_buff *next;
4774 for (; buff; buff = next)
4776 next = buff->next;
4777 #ifdef ENABLE_VALGRIND_ANNOTATIONS
4778 free (buff);
4779 #else
4780 free (buff->base);
4781 #endif
4785 /* Allocate permanent, unaligned storage of length LEN. */
4786 unsigned char *
4787 _cpp_unaligned_alloc (cpp_reader *pfile, size_t len)
4789 _cpp_buff *buff = pfile->u_buff;
4790 unsigned char *result = buff->cur;
4792 if (len > (size_t) (buff->limit - result))
4794 buff = _cpp_get_buff (pfile, len);
4795 buff->next = pfile->u_buff;
4796 pfile->u_buff = buff;
4797 result = buff->cur;
4800 buff->cur = result + len;
4801 return result;
4804 /* Allocate permanent, unaligned storage of length LEN from a_buff.
4805 That buffer is used for growing allocations when saving macro
4806 replacement lists in a #define, and when parsing an answer to an
4807 assertion in #assert, #unassert or #if (and therefore possibly
4808 whilst expanding macros). It therefore must not be used by any
4809 code that they might call: specifically the lexer and the guts of
4810 the macro expander.
4812 All existing other uses clearly fit this restriction: storing
4813 registered pragmas during initialization. */
4814 unsigned char *
4815 _cpp_aligned_alloc (cpp_reader *pfile, size_t len)
4817 _cpp_buff *buff = pfile->a_buff;
4818 unsigned char *result = buff->cur;
4820 if (len > (size_t) (buff->limit - result))
4822 buff = _cpp_get_buff (pfile, len);
4823 buff->next = pfile->a_buff;
4824 pfile->a_buff = buff;
4825 result = buff->cur;
4828 buff->cur = result + len;
4829 return result;
4832 /* Commit or allocate storage from a buffer. */
4834 void *
4835 _cpp_commit_buff (cpp_reader *pfile, size_t size)
4837 void *ptr = BUFF_FRONT (pfile->a_buff);
4839 if (pfile->hash_table->alloc_subobject)
4841 void *copy = pfile->hash_table->alloc_subobject (size);
4842 memcpy (copy, ptr, size);
4843 ptr = copy;
4845 else
4846 BUFF_FRONT (pfile->a_buff) += size;
4848 return ptr;
4851 /* Say which field of TOK is in use. */
4853 enum cpp_token_fld_kind
4854 cpp_token_val_index (const cpp_token *tok)
4856 switch (TOKEN_SPELL (tok))
4858 case SPELL_IDENT:
4859 return CPP_TOKEN_FLD_NODE;
4860 case SPELL_LITERAL:
4861 return CPP_TOKEN_FLD_STR;
4862 case SPELL_OPERATOR:
4863 /* Operands which were originally spelled as ident keep around
4864 the node for the exact spelling. */
4865 if (tok->flags & NAMED_OP)
4866 return CPP_TOKEN_FLD_NODE;
4867 else if (tok->type == CPP_PASTE)
4868 return CPP_TOKEN_FLD_TOKEN_NO;
4869 else
4870 return CPP_TOKEN_FLD_NONE;
4871 case SPELL_NONE:
4872 if (tok->type == CPP_MACRO_ARG)
4873 return CPP_TOKEN_FLD_ARG_NO;
4874 else if (tok->type == CPP_PADDING)
4875 return CPP_TOKEN_FLD_SOURCE;
4876 else if (tok->type == CPP_PRAGMA)
4877 return CPP_TOKEN_FLD_PRAGMA;
4878 /* fall through */
4879 default:
4880 return CPP_TOKEN_FLD_NONE;
4884 /* All tokens lexed in R after calling this function will be forced to
4885 have their location_t to be P, until
4886 cpp_stop_forcing_token_locations is called for R. */
4888 void
4889 cpp_force_token_locations (cpp_reader *r, location_t loc)
4891 r->forced_token_location = loc;
4894 /* Go back to assigning locations naturally for lexed tokens. */
4896 void
4897 cpp_stop_forcing_token_locations (cpp_reader *r)
4899 r->forced_token_location = 0;
/* PEEK points at a backslash.  While that backslash escapes a line
   ending (backslash followed by "\n", "\r" or "\r\n"), step past the
   pair, repeating if another backslash follows.  A step that would
   land at or beyond LIMIT is not taken.  Returns the final
   position, which is PEEK itself when nothing was skipped.  */

static const unsigned char *
do_peek_backslash (const unsigned char *peek, const unsigned char *limit)
{
  for (;;)
    {
      size_t skip;

      if (__builtin_expect (peek[1] == '\n', true))
        skip = 2;
      else if (__builtin_expect (peek[1] == '\r', false))
        /* A "\r\n" pair counts as one line ending.  */
        skip = peek[2] == '\n' ? 3 : 2;
      else
        return peek;

      const unsigned char *after = peek + skip;
      if (!__builtin_expect (after < limit, true))
        return peek;

      peek = after;
      /* The user might be perverse.  */
      if (*peek != '\\')
        return peek;
    }
}
4932 static const unsigned char *
4933 do_peek_next (const unsigned char *peek, const unsigned char *limit)
4935 if (__builtin_expect (*peek == '\\', false))
4936 peek = do_peek_backslash (peek, limit);
4937 return peek;
/* Step backwards from PEEK, never going before BOUND.  Returns NULL
   when PEEK is already at BOUND.  Otherwise returns a pointer to the
   previous character, except that a line ending ("\n", "\r" or
   "\r\n") immediately preceded by a backslash is an escaped line
   break: it is skipped and the search continues from the backslash.
   A line ending that is not escaped, or that sits at BOUND, is
   returned like any other character.  */
static const unsigned char *
do_peek_prev (const unsigned char *peek, const unsigned char *bound)
{
  if (peek == bound)
    return NULL;

  unsigned char c = *--peek;
  /* The second test must be the carriage-return character, not the
     letter 'r': a bare "\r" line ending has to be recognised, and an
     ordinary 'r' must never be taken for one.  */
  if (__builtin_expect (c == '\n', false)
      || __builtin_expect (c == '\r', false))
    {
      if (peek == bound)
        return peek;

      /* Offset from the line ending to the candidate backslash.  */
      int ix = -1;
      if (c == '\n' && peek[ix] == '\r')
        {
          /* "\r\n": look before the carriage return as well.  */
          if (peek + ix == bound)
            return peek;
          ix--;
        }

      if (peek[ix] == '\\')
        return do_peek_prev (peek + ix, bound);

      return peek;
    }
  else
    return peek;
}
4969 /* If PEEK[-1] is identifier MATCH, scan past it and trailing white
4970 space. Otherwise return NULL. */
4972 static const unsigned char *
4973 do_peek_ident (const char *match, const unsigned char *peek,
4974 const unsigned char *limit)
4976 for (; *++match; peek++)
4977 if (*peek != *match)
4979 peek = do_peek_next (peek, limit);
4980 if (*peek != *match)
4981 return NULL;
4984 /* Must now not be looking at an identifier char. */
4985 peek = do_peek_next (peek, limit);
4986 if (ISIDNUM (*peek))
4987 return NULL;
4989 /* Skip control-line whitespace. */
4991 while (*peek == ' ' || *peek == '\t')
4992 peek++;
4993 if (__builtin_expect (*peek == '\\', false))
4995 peek = do_peek_backslash (peek, limit);
4996 if (*peek != '\\')
4997 goto ws;
5000 return peek;
5003 /* Are we looking at a module control line starting as PEEK - 1? */
/* C is the first character of the candidate keyword and PEEK points
   just after it.  LIMIT is passed to the do_peek_* helpers as the
   bound when skipping escaped line breaks.  PFILE is consulted only
   for the rliterals and digraphs options.  Returns true when the
   keyword sequence ("export" optionally, then "import", "__import"
   or "module") is followed by a character acceptable after that
   keyword, false otherwise.  */
5005 static bool
5006 do_peek_module (cpp_reader *pfile, unsigned char c,
5007 const unsigned char *peek, const unsigned char *limit)
/* Set when the keyword matched is import or __import rather than
   module; it selects which following characters are accepted.  */
5009 bool import = false;
5011 if (__builtin_expect (c == 'e', false))
5013 if (!((peek[0] == 'x' || peek[0] == '\\')
5014 && (peek = do_peek_ident ("export", peek, limit))))
5015 return false;
5017 /* export, peek for import or module. No need to peek __import
5018 here. */
5019 if (peek[0] == 'i')
5021 if (!((peek[1] == 'm' || peek[1] == '\\')
5022 && (peek = do_peek_ident ("import", peek + 1, limit))))
5023 return false;
5024 import = true;
5026 else if (peek[0] == 'm')
5028 if (!((peek[1] == 'o' || peek[1] == '\\')
5029 && (peek = do_peek_ident ("module", peek + 1, limit))))
5030 return false;
5032 else
5033 return false;
5035 else if (__builtin_expect (c == 'i', false))
5037 if (!((peek[0] == 'm' || peek[0] == '\\')
5038 && (peek = do_peek_ident ("import", peek, limit))))
5039 return false;
5040 import = true;
5042 else if (__builtin_expect (c == '_', false))
5044 /* Needed for translated includes. */
5045 if (!((peek[0] == '_' || peek[0] == '\\')
5046 && (peek = do_peek_ident ("__import", peek, limit))))
5047 return false;
5048 import = true;
5050 else if (__builtin_expect (c == 'm', false))
5052 if (!((peek[0] == 'o' || peek[0] == '\\')
5053 && (peek = do_peek_ident ("module", peek, limit))))
5054 return false;
5056 else
5057 return false;
5059 /* Peek the next character to see if it's good enough. We'll be at
5060 the first non-whitespace char, including skipping an escaped
5061 newline. */
5062 /* ... import followed by identifier, ':', '<' or header-name
5063 preprocessing tokens, or module followed by identifier, ':' or
5064 ';' preprocessing tokens. */
/* P is the character being classified; PEEK is left pointing at the
   character after it.  */
5065 unsigned char p = *peek++;
5067 /* A character literal is ... single quotes, ... optionally preceded
5068 by u8, u, U, or L */
5069 /* A string-literal is a ... double quotes, optionally prefixed by
5070 R, u8, u8R, u, uR, U, UR, L, or LR */
5071 if (p == 'u')
5073 peek = do_peek_next (peek, limit);
5074 if (*peek == '8')
5076 peek++;
5077 goto peek_u8;
5079 goto peek_u;
5081 else if (p == 'U' || p == 'L')
5083 peek_u8:
5084 peek = do_peek_next (peek, limit);
5085 peek_u:
/* An encoding prefix directly followed by a quote is a literal, not
   an identifier, so this is not a module control line.  */
5086 if (*peek == '\"' || *peek == '\'')
5087 return false;
5089 if (*peek == 'R')
5090 goto peek_R;
5091 /* Identifier. Ok. */
5093 else if (p == 'R')
5095 peek_R:
/* NOTE(review): when reached through "goto peek_R", PEEK still
   points at the 'R' itself, whereas the p == 'R' path has already
   stepped past it.  Confirm that the '"' test below is meant to look
   at that position in the prefixed (uR, UR, LR, u8R) case.  */
5096 if (CPP_OPTION (pfile, rliterals))
5098 peek = do_peek_next (peek, limit);
5099 if (*peek == '\"')
5100 return false;
5102 /* Identifier. Ok. */
/* When the upper-case letters are contiguous in the execution
   character set ('Z' - 'A' == 25) the identifier-start test is done
   with explicit range checks; otherwise ISIDST is used.  */
5104 else if ('Z' - 'A' == 25
5105 ? ((p >= 'A' && p <= 'Z') || (p >= 'a' && p <= 'z') || p == '_')
5106 : ISIDST (p))
5108 /* Identifier. Ok. */
5110 else if (p == '<')
5112 /* Maybe angle header, ok for import. Reject
5113 '<=', '<<' digraph:'<:'. */
5114 if (!import)
5115 return false;
5116 peek = do_peek_next (peek, limit);
5117 if (*peek == '=' || *peek == '<'
5118 || (*peek == ':' && CPP_OPTION (pfile, digraphs)))
5119 return false;
5121 else if (p == ';')
5123 /* SEMICOLON, ok for module. */
5124 if (import)
5125 return false;
5127 else if (p == '"')
5129 /* STRING, ok for import. */
5130 if (!import)
5131 return false;
5133 else if (p == ':')
5135 /* Maybe COLON, ok. Reject '::', digraph:':>'. */
5136 peek = do_peek_next (peek, limit);
5137 if (*peek == ':' || (*peek == '>' && CPP_OPTION (pfile, digraphs)))
5138 return false;
5140 else
5141 /* FIXME: Detect a unicode character, excluding those not
5142 permitted as the initial character. [lex.name]/1. I presume
5143 we need to check the \[uU] spellings, and directly using
5144 Unicode in say UTF8 form? Or perhaps we do the phase-1
5145 conversion of UTF8 to universal-character-names? */
5146 return false;
5148 return true;
5151 /* Directives-only scanning. Somewhat more relaxed than correct
5152 parsing -- some ill-formed programs will not be rejected. */
/* Scans the current buffer of PFILE character by character and
   repeats while pfile->buffer is non-null; each completed buffer is
   popped with _cpp_pop_buffer.  DATA is passed through to CB
   unchanged.  CB is invoked with one of:
     CPP_DO_print    - a line count, a start pointer and a byte
                       length, for text that contained no directive
                       or module control line;
     CPP_DO_location - pfile->line_table->highest_line, after a
                       directive or module control line was handled
                       and more input remains in the buffer;
     CPP_DO_token    - a token and its location, for each token of a
                       module control line.  */
5154 void
5155 cpp_directive_only_process (cpp_reader *pfile,
5156 void *data,
5157 void (*cb) (cpp_reader *, CPP_DO_task, void *, ...))
5159 bool module_p = CPP_OPTION (pfile, module_directives);
/* Scanning resumes here, from buffer->next_line, after every
   directive or module control line.  */
5163 restart:
5164 /* Buffer initialization, but no line cleaning. */
5165 cpp_buffer *buffer = pfile->buffer;
5166 buffer->cur_note = buffer->notes_used = 0;
5167 buffer->cur = buffer->line_base = buffer->next_line;
5168 buffer->need_line = false;
5169 /* Files always end in a newline or carriage return. We rely on this for
5170 character peeking safety. */
5171 gcc_assert (buffer->rlimit[0] == '\n' || buffer->rlimit[0] == '\r');
/* BASE is the start of the text not yet handed to CB; LINE_COUNT
   counts the line breaks seen since BASE; LINE_START is the start of
   the current physical line.  */
5173 const unsigned char *base = buffer->cur;
5174 unsigned line_count = 0;
5175 const unsigned char *line_start = base;
/* BOL stays true until a character other than whitespace or a
   comment is seen on the current line.  RAW is set by the backwards
   scan at quote_peek when a '"' is preceded by an R prefix, and is
   cleared once the literal has been consumed.  */
5177 bool bol = true;
5178 bool raw = false;
/* Low-water mark: the bound given to do_peek_prev, so backwards
   peeking never crosses it.  It is moved forward after each comment
   and each literal.  */
5180 const unsigned char *lwm = base;
5181 for (const unsigned char *pos = base, *limit = buffer->rlimit;
5182 pos < limit;)
5184 unsigned char c = *pos++;
5185 /* This matches the switch in _cpp_lex_direct. */
5186 switch (c)
5188 case ' ': case '\t': case '\f': case '\v':
5189 /* Whitespace, do nothing. */
5190 break;
5192 case '\r': /* MAC line ending, or Windows \r\n */
5193 if (*pos == '\n')
5194 pos++;
5195 /* FALLTHROUGH */
5197 case '\n':
5198 bol = true;
5200 next_line:
5201 CPP_INCREMENT_LINE (pfile, 0);
5202 line_count++;
5203 line_start = pos;
5204 break;
5206 case '\\':
5207 /* <backslash><newline> is removed, and doesn't undo any
5208 preceding escape or whatnot. */
5209 if (*pos == '\n')
5211 pos++;
5212 goto next_line;
5214 else if (*pos == '\r')
5216 if (pos[1] == '\n')
5217 pos++;
5218 pos++;
5219 goto next_line;
5221 goto dflt;
5223 case '#':
5224 if (bol)
5226 /* Line directive. */
/* Hand everything scanned before the '#' to CB first, unless
   skipping.  */
5227 if (pos - 1 > base && !pfile->state.skipping)
5228 cb (pfile, CPP_DO_print, data,
5229 line_count, base, pos - 1 - base);
5231 /* Prep things for directive handling. */
5232 buffer->next_line = pos;
5233 buffer->need_line = true;
5234 bool ok = _cpp_get_fresh_line (pfile);
5235 gcc_checking_assert (ok);
5237 /* Ensure proper column numbering for generated
5238 error messages. */
5239 buffer->line_base -= pos - line_start;
/* The second argument is true when the '#' was not the first
   character of its line.  */
5241 _cpp_handle_directive (pfile, line_start + 1 != pos);
5243 /* Sanitize the line settings. Duplicate #include's can
5244 mess things up. */
5245 // FIXME: Necessary?
5246 pfile->line_table->highest_location
5247 = pfile->line_table->highest_line;
5249 if (!pfile->state.skipping
5250 && pfile->buffer->next_line < pfile->buffer->rlimit)
5251 cb (pfile, CPP_DO_location, data,
5252 pfile->line_table->highest_line);
5254 goto restart;
5256 goto dflt;
5258 case '/':
5260 const unsigned char *peek = do_peek_next (pos, limit);
5261 if (!(*peek == '/' || *peek == '*'))
5262 goto dflt;
5264 /* Line or block comment */
/* STAR records that the previous character was a '*' inside a block
   comment; ESC records that the previous character was a
   backslash.  */
5265 bool is_block = *peek == '*';
5266 bool star = false;
5267 bool esc = false;
5268 location_t sloc
5269 = linemap_position_for_column (pfile->line_table,
5270 pos - line_start);
5272 while (pos < limit)
5274 char c = *pos++;
5275 switch (c)
5277 case '\\':
5278 esc = true;
5279 break;
5281 case '\r':
5282 if (*pos == '\n')
5283 pos++;
5284 /* FALLTHROUGH */
5286 case '\n':
5288 CPP_INCREMENT_LINE (pfile, 0);
5289 line_count++;
5290 line_start = pos;
/* An unescaped line break ends a line comment.  */
5291 if (!esc && !is_block)
5293 bol = true;
5294 goto done_comment;
5297 if (!esc)
5298 star = false;
5299 esc = false;
5300 break;
5302 case '*':
/* The '*' at PEEK is the comment opener itself and must not also
   count as the start of the closing sequence.  */
5303 if (pos > peek)
5304 star = is_block;
5305 esc = false;
5306 break;
5308 case '/':
5309 if (star)
5310 goto done_comment;
5311 /* FALLTHROUGH */
5313 default:
5314 star = false;
5315 esc = false;
5316 break;
5319 if (pos < limit || is_block)
5320 cpp_error_with_line (pfile, CPP_DL_ERROR, sloc, 0,
5321 "unterminated comment");
5322 done_comment:
5323 lwm = pos;
5324 break;
5327 case '\'':
5328 if (!CPP_OPTION (pfile, digit_separators))
5329 goto delimited_string;
5331 /* Possibly a number punctuator. */
5332 if (!ISIDNUM (*do_peek_next (pos, limit)))
5333 goto delimited_string;
5335 goto quote_peek;
5337 case '\"':
5338 if (!CPP_OPTION (pfile, rliterals))
5339 goto delimited_string;
5341 quote_peek:
5343 /* For ' see if it's a number punctuator
5344 \.?<digit>(<digit>|<identifier-nondigit>
5345 |'<digit>|'<nondigit>|[eEpP]<sign>|\.)* */
5346 /* For " see if it's a raw string
5347 {U,L,u,u8}R. This includes CPP_NUMBER detection,
5348 because that could be 0e+R. */
/* The scan below walks backwards from the quote towards LWM.  */
5349 const unsigned char *peek = pos - 1;
5350 bool quote_first = c == '"';
5351 bool quote_eight = false;
5352 bool maybe_number_start = false;
5353 bool want_number = false;
5355 while ((peek = do_peek_prev (peek, lwm)))
5357 unsigned char p = *peek;
5358 if (quote_first)
5360 if (!raw)
5362 if (p != 'R')
5363 break;
5364 raw = true;
5365 continue;
5368 quote_first = false;
5369 if (p == 'L' || p == 'U' || p == 'u')
5371 else if (p == '8')
5372 quote_eight = true;
5373 else
5374 goto second_raw;
5376 else if (quote_eight)
5378 if (p != 'u')
5380 raw = false;
5381 break;
5383 quote_eight = false;
5385 else if (c == '"')
5387 second_raw:;
5388 if (!want_number && ISIDNUM (p))
5390 raw = false;
5391 break;
5395 if (ISDIGIT (p))
5396 maybe_number_start = true;
5397 else if (p == '.')
5398 want_number = true;
5399 else if (ISIDNUM (p))
5400 maybe_number_start = false;
5401 else if (p == '+' || p == '-')
5403 if (const unsigned char *peek_prev
5404 = do_peek_prev (peek, lwm))
5406 p = *peek_prev;
5407 if (p == 'e' || p == 'E'
5408 || p == 'p' || p == 'P')
5410 want_number = true;
5411 maybe_number_start = false;
5413 else
5414 break;
5416 else
5417 break;
5419 else if (p == '\'' || p == '\"')
5421 /* If this is lwm, this must be the end of a
5422 previous string. So this is a trailing
5423 literal type, (a) if those are allowed,
5424 and (b) maybe_start is false. Otherwise
5425 this must be a CPP_NUMBER because we've
5426 met another ', and we'd have checked that
5427 in its own right. */
5428 if (peek == lwm && CPP_OPTION (pfile, uliterals))
5430 if (!maybe_number_start && !want_number)
5431 /* Must be a literal type. */
5432 raw = false;
5434 else if (p == '\''
5435 && CPP_OPTION (pfile, digit_separators))
5436 maybe_number_start = true;
5437 break;
5439 else if (c == '\'')
5440 break;
5441 else if (!quote_first && !quote_eight)
5442 break;
5445 if (maybe_number_start)
5447 if (c == '\'')
5448 /* A CPP NUMBER. */
5449 goto dflt;
5450 raw = false;
5453 goto delimited_string;
5456 delimited_string:
5458 /* (Possibly raw) string or char literal. */
/* END is the quote character that closes the literal.  ESC counts
   consecutive backslashes; an odd count means the next character is
   escaped.  */
5459 unsigned char end = c;
5460 int delim_len = -1;
5461 const unsigned char *delim = NULL;
5462 location_t sloc = linemap_position_for_column (pfile->line_table,
5463 pos - line_start);
5464 int esc = 0;
5466 if (raw)
5468 /* There can be no line breaks in the delimiter. */
5469 delim = pos;
5470 for (delim_len = 0; (c = *pos++) != '('; delim_len++)
5472 if (delim_len == 16)
5474 cpp_error_with_line (pfile, CPP_DL_ERROR,
5475 sloc, 0,
5476 "raw string delimiter"
5477 " longer than %d"
5478 " characters",
5479 delim_len);
5480 raw = false;
5481 pos = delim;
5482 break;
5484 if (strchr (") \\\t\v\f\n", c))
5486 cpp_error_with_line (pfile, CPP_DL_ERROR,
5487 sloc, 0,
5488 "invalid character '%c'"
5489 " in raw string"
5490 " delimiter", c);
5491 raw = false;
5492 pos = delim;
5493 break;
5495 if (pos >= limit)
5496 goto bad_string;
5500 while (pos < limit)
5502 char c = *pos++;
5503 switch (c)
5505 case '\\':
5506 if (!raw)
5507 esc++;
5508 break;
5510 case '\r':
5511 if (*pos == '\n')
5512 pos++;
5513 /* FALLTHROUGH */
5515 case '\n':
5517 CPP_INCREMENT_LINE (pfile, 0);
5518 line_count++;
5519 line_start = pos;
5521 if (esc)
5522 esc--;
5523 break;
5525 case ')':
/* A raw string ends at ')' followed by the same delimiter and the
   closing quote.  */
5526 if (raw
5527 && pos + delim_len + 1 < limit
5528 && pos[delim_len] == end
5529 && !memcmp (delim, pos, delim_len))
5531 pos += delim_len + 1;
5532 raw = false;
5533 goto done_string;
5535 break;
5537 default:
5538 if (!raw && !(esc & 1) && c == end)
5539 goto done_string;
5540 esc = 0;
5541 break;
5544 bad_string:
5545 cpp_error_with_line (pfile, CPP_DL_ERROR, sloc, 0,
5546 "unterminated literal");
5548 done_string:
5549 raw = false;
5550 lwm = pos - 1;
5552 goto dflt;
5554 case '_':
5555 case 'e':
5556 case 'i':
5557 case 'm':
5558 if (bol && module_p && !pfile->state.skipping
5559 && do_peek_module (pfile, c, pos, limit))
5561 /* We've seen the start of a module control line.
5562 Start up the tokenizer. */
5563 pos--; /* Backup over the first character. */
5565 /* Backup over whitespace to start of line. */
5566 while (pos > line_start
5567 && (pos[-1] == ' ' || pos[-1] == '\t'))
5568 pos--;
5570 if (pos > base)
5571 cb (pfile, CPP_DO_print, data, line_count, base, pos - base);
5573 /* Prep things for directive handling. */
5574 buffer->next_line = pos;
5575 buffer->need_line = true;
5577 /* Now get tokens until the PRAGMA_EOL. */
5580 location_t spelling;
5581 const cpp_token *tok
5582 = cpp_get_token_with_location (pfile, &spelling);
5584 gcc_assert (pfile->state.in_deferred_pragma
5585 || tok->type == CPP_PRAGMA_EOL);
5586 cb (pfile, CPP_DO_token, data, tok, spelling);
5588 while (pfile->state.in_deferred_pragma);
5590 if (pfile->buffer->next_line < pfile->buffer->rlimit)
5591 cb (pfile, CPP_DO_location, data,
5592 pfile->line_table->highest_line);
5594 pfile->mi_valid = false;
5595 goto restart;
5597 goto dflt;
5599 default:
5600 dflt:
5601 bol = false;
5602 pfile->mi_valid = false;
5603 break;
/* End of buffer: hand whatever text remains since BASE to CB, unless
   skipping.  */
5607 if (buffer->rlimit > base && !pfile->state.skipping)
5609 const unsigned char *limit = buffer->rlimit;
5610 /* If the file was not newline terminated, add rlimit, which is
5611 guaranteed to point to a newline, to the end of our range. */
5612 if (limit[-1] != '\n')
5614 limit++;
5615 CPP_INCREMENT_LINE (pfile, 0);
5616 line_count++;
5618 cb (pfile, CPP_DO_print, data, line_count, base, limit - base);
5621 _cpp_pop_buffer (pfile);
5623 while (pfile->buffer);