libstdc++: Fix std::runtime_format deviations from the spec [PR113320]
[official-gcc.git] / libcpp / lex.cc
blob5aa379980cf2c5d897d842deebc4913bd5c263b1
1 /* CPP Library - lexical analysis.
2 Copyright (C) 2000-2024 Free Software Foundation, Inc.
3 Contributed by Per Bothner, 1994-95.
4 Based on CCCP program by Paul Rubin, June 1986
5 Adapted to ANSI C, Richard Stallman, Jan 1987
6 Broken out to separate file, Zack Weinberg, Mar 2000
8 This program is free software; you can redistribute it and/or modify it
9 under the terms of the GNU General Public License as published by the
10 Free Software Foundation; either version 3, or (at your option) any
11 later version.
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
18 You should have received a copy of the GNU General Public License
19 along with this program; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "cpplib.h"
25 #include "internal.h"
/* Spelling category recorded for each token type in the
   token_spellings table (see the CATEGORY field of struct
   token_spelling and the TOKEN_SPELL macro).  */
enum spell_type
{
  SPELL_OPERATOR = 0,
  SPELL_IDENT,
  SPELL_LITERAL,
  SPELL_NONE
};
35 struct token_spelling
37 enum spell_type category;
38 const unsigned char *name;
41 static const unsigned char *const digraph_spellings[] =
42 { UC"%:", UC"%:%:", UC"<:", UC":>", UC"<%", UC"%>" };
44 #define OP(e, s) { SPELL_OPERATOR, UC s },
45 #define TK(e, s) { SPELL_ ## s, UC #e },
46 static const struct token_spelling token_spellings[N_TTYPES] = { TTYPE_TABLE };
47 #undef OP
48 #undef TK
50 #define TOKEN_SPELL(token) (token_spellings[(token)->type].category)
51 #define TOKEN_NAME(token) (token_spellings[(token)->type].name)
/* ISO 10646 defines the UCS codespace as the range 0-0x10FFFF inclusive.  */
#define UCS_LIMIT 0x10FFFF
56 static void add_line_note (cpp_buffer *, const uchar *, unsigned int);
57 static int skip_line_comment (cpp_reader *);
58 static void skip_whitespace (cpp_reader *, cppchar_t);
59 static void lex_string (cpp_reader *, cpp_token *, const uchar *);
60 static void save_comment (cpp_reader *, cpp_token *, const uchar *, cppchar_t);
61 static void store_comment (cpp_reader *, cpp_token *);
62 static void create_literal (cpp_reader *, cpp_token *, const uchar *,
63 unsigned int, enum cpp_ttype);
64 static bool warn_in_comment (cpp_reader *, _cpp_line_note *);
65 static int name_p (cpp_reader *, const cpp_string *);
66 static tokenrun *next_tokenrun (tokenrun *);
68 static _cpp_buff *new_buff (size_t);
71 /* Utility routine:
73 Compares, the token TOKEN to the NUL-terminated string STRING.
74 TOKEN must be a CPP_NAME. Returns 1 for equal, 0 for unequal. */
75 int
76 cpp_ideq (const cpp_token *token, const char *string)
78 if (token->type != CPP_NAME)
79 return 0;
81 return !ustrcmp (NODE_NAME (token->val.node.node), (const uchar *) string);
84 /* Record a note TYPE at byte POS into the current cleaned logical
85 line. */
86 static void
87 add_line_note (cpp_buffer *buffer, const uchar *pos, unsigned int type)
89 if (buffer->notes_used == buffer->notes_cap)
91 buffer->notes_cap = buffer->notes_cap * 2 + 200;
92 buffer->notes = XRESIZEVEC (_cpp_line_note, buffer->notes,
93 buffer->notes_cap);
96 buffer->notes[buffer->notes_used].pos = pos;
97 buffer->notes[buffer->notes_used].type = type;
98 buffer->notes_used++;
/* Fast path to find line special characters using optimized character
   scanning algorithms.  Anything complicated falls back to the slow
   path below.  Since this loop is very hot it's worth doing these kinds
   of optimizations.

   One of the paths through the ifdefs should provide

     const uchar *search_line_fast (const uchar *s, const uchar *end);

   Between S and END, search for \n, \r, \\, ?.  Return a pointer to
   the found character.

   Note that the last character of the buffer is *always* a newline,
   as forced by _cpp_convert_input.  This fact can be used to avoid
   explicitly looking for the end of the buffer.  */
/* Configure gives us an ifdef test; turn it into a value usable in
   ordinary `if' conditions.  */
#ifndef WORDS_BIGENDIAN
#define WORDS_BIGENDIAN 0
#endif
/* We'd like the largest integer that fits into a register.  There's nothing
   in <stdint.h> that gives us that.  For most hosts this is unsigned long,
   but MS decided on an LLP64 model.  Thankfully when building with GCC we
   can get the "real" word size.  */
#ifdef __GNUC__
typedef unsigned int word_type __attribute__((__mode__(__word__)));
#else
typedef unsigned long word_type;
#endif
133 /* The code below is only expecting sizes 4 or 8.
134 Die at compile-time if this expectation is violated. */
135 typedef char check_word_type_size
136 [(sizeof(word_type) == 8 || sizeof(word_type) == 4) * 2 - 1];
138 /* Return X with the first N bytes forced to values that won't match one
139 of the interesting characters. Note that NUL is not interesting. */
141 static inline word_type
142 acc_char_mask_misalign (word_type val, unsigned int n)
144 word_type mask = -1;
145 if (WORDS_BIGENDIAN)
146 mask >>= n * 8;
147 else
148 mask <<= n * 8;
149 return val & mask;
152 /* Return X replicated to all byte positions within WORD_TYPE. */
154 static inline word_type
155 acc_char_replicate (uchar x)
157 word_type ret;
159 ret = (x << 24) | (x << 16) | (x << 8) | x;
160 if (sizeof(word_type) == 8)
161 ret = (ret << 16 << 16) | ret;
162 return ret;
165 /* Return non-zero if some byte of VAL is (probably) C. */
167 static inline word_type
168 acc_char_cmp (word_type val, word_type c)
170 #if defined(__GNUC__) && defined(__alpha__)
171 /* We can get exact results using a compare-bytes instruction.
172 Get (val == c) via (0 >= (val ^ c)). */
173 return __builtin_alpha_cmpbge (0, val ^ c);
174 #else
175 word_type magic = 0x7efefefeU;
176 if (sizeof(word_type) == 8)
177 magic = (magic << 16 << 16) | 0xfefefefeU;
178 magic |= 1;
180 val ^= c;
181 return ((val + magic) ^ ~val) & ~magic;
182 #endif
185 /* Given the result of acc_char_cmp is non-zero, return the index of
186 the found character. If this was a false positive, return -1. */
188 static inline int
189 acc_char_index (word_type cmp ATTRIBUTE_UNUSED,
190 word_type val ATTRIBUTE_UNUSED)
192 #if defined(__GNUC__) && defined(__alpha__) && !WORDS_BIGENDIAN
193 /* The cmpbge instruction sets *bits* of the result corresponding to
194 matches in the bytes with no false positives. */
195 return __builtin_ctzl (cmp);
196 #else
197 unsigned int i;
199 /* ??? It would be nice to force unrolling here,
200 and have all of these constants folded. */
201 for (i = 0; i < sizeof(word_type); ++i)
203 uchar c;
204 if (WORDS_BIGENDIAN)
205 c = (val >> (sizeof(word_type) - i - 1) * 8) & 0xff;
206 else
207 c = (val >> i * 8) & 0xff;
209 if (c == '\n' || c == '\r' || c == '\\' || c == '?')
210 return i;
213 return -1;
214 #endif
217 /* A version of the fast scanner using bit fiddling techniques.
219 For 32-bit words, one would normally perform 16 comparisons and
220 16 branches. With this algorithm one performs 24 arithmetic
221 operations and one branch. Whether this is faster with a 32-bit
222 word size is going to be somewhat system dependent.
224 For 64-bit words, we eliminate twice the number of comparisons
225 and branches without increasing the number of arithmetic operations.
226 It's almost certainly going to be a win with 64-bit word size. */
228 static const uchar * search_line_acc_char (const uchar *, const uchar *)
229 ATTRIBUTE_UNUSED;
231 static const uchar *
232 search_line_acc_char (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
234 const word_type repl_nl = acc_char_replicate ('\n');
235 const word_type repl_cr = acc_char_replicate ('\r');
236 const word_type repl_bs = acc_char_replicate ('\\');
237 const word_type repl_qm = acc_char_replicate ('?');
239 unsigned int misalign;
240 const word_type *p;
241 word_type val, t;
243 /* Align the buffer. Mask out any bytes from before the beginning. */
244 p = (word_type *)((uintptr_t)s & -sizeof(word_type));
245 val = *p;
246 misalign = (uintptr_t)s & (sizeof(word_type) - 1);
247 if (misalign)
248 val = acc_char_mask_misalign (val, misalign);
250 /* Main loop. */
251 while (1)
253 t = acc_char_cmp (val, repl_nl);
254 t |= acc_char_cmp (val, repl_cr);
255 t |= acc_char_cmp (val, repl_bs);
256 t |= acc_char_cmp (val, repl_qm);
258 if (__builtin_expect (t != 0, 0))
260 int i = acc_char_index (t, val);
261 if (i >= 0)
262 return (const uchar *)p + i;
265 val = *++p;
269 /* Disable on Solaris 2/x86 until the following problem can be properly
270 autoconfed:
272 The Solaris 10+ assembler tags objects with the instruction set
273 extensions used, so SSE4.2 executables cannot run on machines that
274 don't support that extension. */
276 #if (GCC_VERSION >= 4005) && (__GNUC__ >= 5 || !defined(__PIC__)) && (defined(__i386__) || defined(__x86_64__)) && !(defined(__sun__) && defined(__svr4__))
278 /* Replicated character data to be shared between implementations.
279 Recall that outside of a context with vector support we can't
280 define compatible vector types, therefore these are all defined
281 in terms of raw characters. */
282 static const char repl_chars[4][16] __attribute__((aligned(16))) = {
283 { '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
284 '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n' },
285 { '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
286 '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r' },
287 { '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
288 '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\' },
289 { '?', '?', '?', '?', '?', '?', '?', '?',
290 '?', '?', '?', '?', '?', '?', '?', '?' },
293 /* A version of the fast scanner using MMX vectorized byte compare insns.
295 This uses the PMOVMSKB instruction which was introduced with "MMX2",
296 which was packaged into SSE1; it is also present in the AMD MMX
297 extension. Mark the function as using "sse" so that we emit a real
298 "emms" instruction, rather than the 3dNOW "femms" instruction. */
300 static const uchar *
301 #ifndef __SSE__
302 __attribute__((__target__("sse")))
303 #endif
304 search_line_mmx (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
306 typedef char v8qi __attribute__ ((__vector_size__ (8)));
307 typedef int __m64 __attribute__ ((__vector_size__ (8), __may_alias__));
309 const v8qi repl_nl = *(const v8qi *)repl_chars[0];
310 const v8qi repl_cr = *(const v8qi *)repl_chars[1];
311 const v8qi repl_bs = *(const v8qi *)repl_chars[2];
312 const v8qi repl_qm = *(const v8qi *)repl_chars[3];
314 unsigned int misalign, found, mask;
315 const v8qi *p;
316 v8qi data, t, c;
318 /* Align the source pointer. While MMX doesn't generate unaligned data
319 faults, this allows us to safely scan to the end of the buffer without
320 reading beyond the end of the last page. */
321 misalign = (uintptr_t)s & 7;
322 p = (const v8qi *)((uintptr_t)s & -8);
323 data = *p;
325 /* Create a mask for the bytes that are valid within the first
326 16-byte block. The Idea here is that the AND with the mask
327 within the loop is "free", since we need some AND or TEST
328 insn in order to set the flags for the branch anyway. */
329 mask = -1u << misalign;
331 /* Main loop processing 8 bytes at a time. */
332 goto start;
335 data = *++p;
336 mask = -1;
338 start:
339 t = __builtin_ia32_pcmpeqb(data, repl_nl);
340 c = __builtin_ia32_pcmpeqb(data, repl_cr);
341 t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
342 c = __builtin_ia32_pcmpeqb(data, repl_bs);
343 t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
344 c = __builtin_ia32_pcmpeqb(data, repl_qm);
345 t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
346 found = __builtin_ia32_pmovmskb (t);
347 found &= mask;
349 while (!found);
351 __builtin_ia32_emms ();
353 /* FOUND contains 1 in bits for which we matched a relevant
354 character. Conversion to the byte index is trivial. */
355 found = __builtin_ctz(found);
356 return (const uchar *)p + found;
359 /* A version of the fast scanner using SSE2 vectorized byte compare insns. */
361 static const uchar *
362 #ifndef __SSE2__
363 __attribute__((__target__("sse2")))
364 #endif
365 search_line_sse2 (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
367 typedef char v16qi __attribute__ ((__vector_size__ (16)));
369 const v16qi repl_nl = *(const v16qi *)repl_chars[0];
370 const v16qi repl_cr = *(const v16qi *)repl_chars[1];
371 const v16qi repl_bs = *(const v16qi *)repl_chars[2];
372 const v16qi repl_qm = *(const v16qi *)repl_chars[3];
374 unsigned int misalign, found, mask;
375 const v16qi *p;
376 v16qi data, t;
378 /* Align the source pointer. */
379 misalign = (uintptr_t)s & 15;
380 p = (const v16qi *)((uintptr_t)s & -16);
381 data = *p;
383 /* Create a mask for the bytes that are valid within the first
384 16-byte block. The Idea here is that the AND with the mask
385 within the loop is "free", since we need some AND or TEST
386 insn in order to set the flags for the branch anyway. */
387 mask = -1u << misalign;
389 /* Main loop processing 16 bytes at a time. */
390 goto start;
393 data = *++p;
394 mask = -1;
396 start:
397 t = data == repl_nl;
398 t |= data == repl_cr;
399 t |= data == repl_bs;
400 t |= data == repl_qm;
401 found = __builtin_ia32_pmovmskb128 (t);
402 found &= mask;
404 while (!found);
406 /* FOUND contains 1 in bits for which we matched a relevant
407 character. Conversion to the byte index is trivial. */
408 found = __builtin_ctz(found);
409 return (const uchar *)p + found;
#ifdef HAVE_SSE4
/* A version of the fast scanner using SSE 4.2 vectorized string insns.
   Returns a pointer to the first \n, \r, \\ or ? at or after S.  END is
   used only to decide whether an unaligned 16-byte read at S could
   cross into a following page.  */

static const uchar *
#ifndef __SSE4_2__
__attribute__((__target__("sse4.2")))
#endif
search_line_sse42 (const uchar *s, const uchar *end)
{
  typedef char v16qi __attribute__ ((__vector_size__ (16)));
  static const v16qi search = { '\n', '\r', '?', '\\' };

  uintptr_t si = (uintptr_t)s;
  uintptr_t index;

  /* Check for unaligned input.  */
  if (si & 15)
    {
      v16qi sv;

      if (__builtin_expect (end - s < 16, 0)
	  && __builtin_expect ((si & 0xfff) > 0xff0, 0))
	{
	  /* There are less than 16 bytes left in the buffer, and less
	     than 16 bytes left on the page.  Reading 16 bytes at this
	     point might generate a spurious page fault.  Defer to the
	     SSE2 implementation, which already handles alignment.  */
	  return search_line_sse2 (s, end);
	}

      /* ??? The builtin doesn't understand that the PCMPESTRI read from
	 memory need not be aligned.  */
      sv = __builtin_ia32_loaddqu ((const char *) s);
      index = __builtin_ia32_pcmpestri128 (search, 4, sv, 16, 0);

      if (__builtin_expect (index < 16, 0))
	goto found;

      /* Advance the pointer to an aligned address.  We will re-scan a
	 few bytes, but we no longer need care for reading past the
	 end of a page, since we're guaranteed a match.  */
      s = (const uchar *)((si + 15) & -16);
    }

  /* Main loop, processing 16 bytes at a time.  */
#ifdef __GCC_ASM_FLAG_OUTPUTS__
  while (1)
    {
      char f;

      /* By using inline assembly instead of the builtin,
	 we can use the result, as well as the flags set.  */
      __asm ("%vpcmpestri\t$0, %2, %3"
	     : "=c"(index), "=@ccc"(f)
	     : "m"(*s), "x"(search), "a"(4), "d"(16));
      if (f)
	break;

      s += 16;
    }
#else
  s -= 16;
  /* By doing the whole loop in inline assembly,
     we can make proper use of the flags set.  */
  __asm (      ".balign 16\n"
	"0: add $16, %1\n"
	" %vpcmpestri\t$0, (%1), %2\n"
	" jnc 0b"
	: "=&c"(index), "+r"(s)
	: "x"(search), "a"(4), "d"(16));
#endif

 found:
  return s + index;
}

#else
/* Work around out-dated assemblers without sse4 support.  */
#define search_line_sse42 search_line_sse2
#endif
493 /* Check the CPU capabilities. */
495 #include "../gcc/config/i386/cpuid.h"
497 typedef const uchar * (*search_line_fast_type) (const uchar *, const uchar *);
498 static search_line_fast_type search_line_fast;
500 #define HAVE_init_vectorized_lexer 1
501 static inline void
502 init_vectorized_lexer (void)
504 unsigned dummy, ecx = 0, edx = 0;
505 search_line_fast_type impl = search_line_acc_char;
506 int minimum = 0;
508 #if defined(__SSE4_2__)
509 minimum = 3;
510 #elif defined(__SSE2__)
511 minimum = 2;
512 #elif defined(__SSE__)
513 minimum = 1;
514 #endif
516 if (minimum == 3)
517 impl = search_line_sse42;
518 else if (__get_cpuid (1, &dummy, &dummy, &ecx, &edx) || minimum == 2)
520 if (minimum == 3 || (ecx & bit_SSE4_2))
521 impl = search_line_sse42;
522 else if (minimum == 2 || (edx & bit_SSE2))
523 impl = search_line_sse2;
524 else if (minimum == 1 || (edx & bit_SSE))
525 impl = search_line_mmx;
527 else if (__get_cpuid (0x80000001, &dummy, &dummy, &dummy, &edx))
529 if (minimum == 1
530 || (edx & (bit_MMXEXT | bit_CMOV)) == (bit_MMXEXT | bit_CMOV))
531 impl = search_line_mmx;
534 search_line_fast = impl;
537 #elif (GCC_VERSION >= 4005) && defined(_ARCH_PWR8) && defined(__ALTIVEC__)
539 /* A vection of the fast scanner using AltiVec vectorized byte compares
540 and VSX unaligned loads (when VSX is available). This is otherwise
541 the same as the AltiVec version. */
543 ATTRIBUTE_NO_SANITIZE_UNDEFINED
544 static const uchar *
545 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
547 typedef __attribute__((altivec(vector))) unsigned char vc;
549 const vc repl_nl = {
550 '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
551 '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
553 const vc repl_cr = {
554 '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
555 '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
557 const vc repl_bs = {
558 '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
559 '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
561 const vc repl_qm = {
562 '?', '?', '?', '?', '?', '?', '?', '?',
563 '?', '?', '?', '?', '?', '?', '?', '?',
565 const vc zero = { 0 };
567 vc data, t;
569 /* Main loop processing 16 bytes at a time. */
572 vc m_nl, m_cr, m_bs, m_qm;
574 data = __builtin_vec_vsx_ld (0, s);
575 s += 16;
577 m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
578 m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
579 m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
580 m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
581 t = (m_nl | m_cr) | (m_bs | m_qm);
583 /* T now contains 0xff in bytes for which we matched one of the relevant
584 characters. We want to exit the loop if any byte in T is non-zero.
585 Below is the expansion of vec_any_ne(t, zero). */
587 while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));
589 /* Restore s to to point to the 16 bytes we just processed. */
590 s -= 16;
593 #define N (sizeof(vc) / sizeof(long))
595 union {
596 vc v;
597 /* Statically assert that N is 2 or 4. */
598 unsigned long l[(N == 2 || N == 4) ? N : -1];
599 } u;
600 unsigned long l, i = 0;
602 u.v = t;
604 /* Find the first word of T that is non-zero. */
605 switch (N)
607 case 4:
608 l = u.l[i++];
609 if (l != 0)
610 break;
611 s += sizeof(unsigned long);
612 l = u.l[i++];
613 if (l != 0)
614 break;
615 s += sizeof(unsigned long);
616 /* FALLTHRU */
617 case 2:
618 l = u.l[i++];
619 if (l != 0)
620 break;
621 s += sizeof(unsigned long);
622 l = u.l[i];
625 /* L now contains 0xff in bytes for which we matched one of the
626 relevant characters. We can find the byte index by finding
627 its bit index and dividing by 8. */
628 #ifdef __BIG_ENDIAN__
629 l = __builtin_clzl(l) >> 3;
630 #else
631 l = __builtin_ctzl(l) >> 3;
632 #endif
633 return s + l;
635 #undef N
639 #elif (GCC_VERSION >= 4005) && defined(__ALTIVEC__) && defined (__BIG_ENDIAN__)
641 /* A vection of the fast scanner using AltiVec vectorized byte compares.
642 This cannot be used for little endian because vec_lvsl/lvsr are
643 deprecated for little endian and the code won't work properly. */
644 /* ??? Unfortunately, attribute(target("altivec")) is not yet supported,
645 so we can't compile this function without -maltivec on the command line
646 (or implied by some other switch). */
648 static const uchar *
649 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
651 typedef __attribute__((altivec(vector))) unsigned char vc;
653 const vc repl_nl = {
654 '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
655 '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
657 const vc repl_cr = {
658 '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
659 '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
661 const vc repl_bs = {
662 '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
663 '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
665 const vc repl_qm = {
666 '?', '?', '?', '?', '?', '?', '?', '?',
667 '?', '?', '?', '?', '?', '?', '?', '?',
669 const vc ones = {
670 -1, -1, -1, -1, -1, -1, -1, -1,
671 -1, -1, -1, -1, -1, -1, -1, -1,
673 const vc zero = { 0 };
675 vc data, mask, t;
677 /* Altivec loads automatically mask addresses with -16. This lets us
678 issue the first load as early as possible. */
679 data = __builtin_vec_ld(0, (const vc *)s);
681 /* Discard bytes before the beginning of the buffer. Do this by
682 beginning with all ones and shifting in zeros according to the
683 mis-alignment. The LVSR instruction pulls the exact shift we
684 want from the address. */
685 mask = __builtin_vec_lvsr(0, s);
686 mask = __builtin_vec_perm(zero, ones, mask);
687 data &= mask;
689 /* While altivec loads mask addresses, we still need to align S so
690 that the offset we compute at the end is correct. */
691 s = (const uchar *)((uintptr_t)s & -16);
693 /* Main loop processing 16 bytes at a time. */
694 goto start;
697 vc m_nl, m_cr, m_bs, m_qm;
699 s += 16;
700 data = __builtin_vec_ld(0, (const vc *)s);
702 start:
703 m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
704 m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
705 m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
706 m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
707 t = (m_nl | m_cr) | (m_bs | m_qm);
709 /* T now contains 0xff in bytes for which we matched one of the relevant
710 characters. We want to exit the loop if any byte in T is non-zero.
711 Below is the expansion of vec_any_ne(t, zero). */
713 while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));
716 #define N (sizeof(vc) / sizeof(long))
718 union {
719 vc v;
720 /* Statically assert that N is 2 or 4. */
721 unsigned long l[(N == 2 || N == 4) ? N : -1];
722 } u;
723 unsigned long l, i = 0;
725 u.v = t;
727 /* Find the first word of T that is non-zero. */
728 switch (N)
730 case 4:
731 l = u.l[i++];
732 if (l != 0)
733 break;
734 s += sizeof(unsigned long);
735 l = u.l[i++];
736 if (l != 0)
737 break;
738 s += sizeof(unsigned long);
739 /* FALLTHROUGH */
740 case 2:
741 l = u.l[i++];
742 if (l != 0)
743 break;
744 s += sizeof(unsigned long);
745 l = u.l[i];
748 /* L now contains 0xff in bytes for which we matched one of the
749 relevant characters. We can find the byte index by finding
750 its bit index and dividing by 8. */
751 l = __builtin_clzl(l) >> 3;
752 return s + l;
754 #undef N
758 #elif defined (__ARM_NEON) && defined (__ARM_64BIT_STATE)
759 #include "arm_neon.h"
761 /* This doesn't have to be the exact page size, but no system may use
762 a size smaller than this. ARMv8 requires a minimum page size of
763 4k. The impact of being conservative here is a small number of
764 cases will take the slightly slower entry path into the main
765 loop. */
767 #define AARCH64_MIN_PAGE_SIZE 4096
769 static const uchar *
770 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
772 const uint8x16_t repl_nl = vdupq_n_u8 ('\n');
773 const uint8x16_t repl_cr = vdupq_n_u8 ('\r');
774 const uint8x16_t repl_bs = vdupq_n_u8 ('\\');
775 const uint8x16_t repl_qm = vdupq_n_u8 ('?');
776 const uint8x16_t xmask = (uint8x16_t) vdupq_n_u64 (0x8040201008040201ULL);
778 #ifdef __ARM_BIG_ENDIAN
779 const int16x8_t shift = {8, 8, 8, 8, 0, 0, 0, 0};
780 #else
781 const int16x8_t shift = {0, 0, 0, 0, 8, 8, 8, 8};
782 #endif
784 unsigned int found;
785 const uint8_t *p;
786 uint8x16_t data;
787 uint8x16_t t;
788 uint16x8_t m;
789 uint8x16_t u, v, w;
791 /* Align the source pointer. */
792 p = (const uint8_t *)((uintptr_t)s & -16);
794 /* Assuming random string start positions, with a 4k page size we'll take
795 the slow path about 0.37% of the time. */
796 if (__builtin_expect ((AARCH64_MIN_PAGE_SIZE
797 - (((uintptr_t) s) & (AARCH64_MIN_PAGE_SIZE - 1)))
798 < 16, 0))
800 /* Slow path: the string starts near a possible page boundary. */
801 uint32_t misalign, mask;
803 misalign = (uintptr_t)s & 15;
804 mask = (-1u << misalign) & 0xffff;
805 data = vld1q_u8 (p);
806 t = vceqq_u8 (data, repl_nl);
807 u = vceqq_u8 (data, repl_cr);
808 v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
809 w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
810 t = vorrq_u8 (v, w);
811 t = vandq_u8 (t, xmask);
812 m = vpaddlq_u8 (t);
813 m = vshlq_u16 (m, shift);
814 found = vaddvq_u16 (m);
815 found &= mask;
816 if (found)
817 return (const uchar*)p + __builtin_ctz (found);
819 else
821 data = vld1q_u8 ((const uint8_t *) s);
822 t = vceqq_u8 (data, repl_nl);
823 u = vceqq_u8 (data, repl_cr);
824 v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
825 w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
826 t = vorrq_u8 (v, w);
827 if (__builtin_expect (vpaddd_u64 ((uint64x2_t)t) != 0, 0))
828 goto done;
833 p += 16;
834 data = vld1q_u8 (p);
835 t = vceqq_u8 (data, repl_nl);
836 u = vceqq_u8 (data, repl_cr);
837 v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
838 w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
839 t = vorrq_u8 (v, w);
840 } while (!vpaddd_u64 ((uint64x2_t)t));
842 done:
843 /* Now that we've found the terminating substring, work out precisely where
844 we need to stop. */
845 t = vandq_u8 (t, xmask);
846 m = vpaddlq_u8 (t);
847 m = vshlq_u16 (m, shift);
848 found = vaddvq_u16 (m);
849 return (((((uintptr_t) p) < (uintptr_t) s) ? s : (const uchar *)p)
850 + __builtin_ctz (found));
853 #elif defined (__ARM_NEON)
854 #include "arm_neon.h"
856 static const uchar *
857 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
859 const uint8x16_t repl_nl = vdupq_n_u8 ('\n');
860 const uint8x16_t repl_cr = vdupq_n_u8 ('\r');
861 const uint8x16_t repl_bs = vdupq_n_u8 ('\\');
862 const uint8x16_t repl_qm = vdupq_n_u8 ('?');
863 const uint8x16_t xmask = (uint8x16_t) vdupq_n_u64 (0x8040201008040201ULL);
865 unsigned int misalign, found, mask;
866 const uint8_t *p;
867 uint8x16_t data;
869 /* Align the source pointer. */
870 misalign = (uintptr_t)s & 15;
871 p = (const uint8_t *)((uintptr_t)s & -16);
872 data = vld1q_u8 (p);
874 /* Create a mask for the bytes that are valid within the first
875 16-byte block. The Idea here is that the AND with the mask
876 within the loop is "free", since we need some AND or TEST
877 insn in order to set the flags for the branch anyway. */
878 mask = (-1u << misalign) & 0xffff;
880 /* Main loop, processing 16 bytes at a time. */
881 goto start;
885 uint8x8_t l;
886 uint16x4_t m;
887 uint32x2_t n;
888 uint8x16_t t, u, v, w;
890 p += 16;
891 data = vld1q_u8 (p);
892 mask = 0xffff;
894 start:
895 t = vceqq_u8 (data, repl_nl);
896 u = vceqq_u8 (data, repl_cr);
897 v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
898 w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
899 t = vandq_u8 (vorrq_u8 (v, w), xmask);
900 l = vpadd_u8 (vget_low_u8 (t), vget_high_u8 (t));
901 m = vpaddl_u8 (l);
902 n = vpaddl_u16 (m);
904 found = vget_lane_u32 ((uint32x2_t) vorr_u64 ((uint64x1_t) n,
905 vshr_n_u64 ((uint64x1_t) n, 24)), 0);
906 found &= mask;
908 while (!found);
910 /* FOUND contains 1 in bits for which we matched a relevant
911 character. Conversion to the byte index is trivial. */
912 found = __builtin_ctz (found);
913 return (const uchar *)p + found;
916 #else
918 /* We only have one accelerated alternative. Use a direct call so that
919 we encourage inlining. */
921 #define search_line_fast search_line_acc_char
923 #endif
/* Initialize the lexer if needed.  Only configurations that define
   HAVE_init_vectorized_lexer have anything to set up; elsewhere this
   is a no-op.  */

void
_cpp_init_lexer (void)
{
#ifdef HAVE_init_vectorized_lexer
  init_vectorized_lexer ();
#endif
}
935 /* Returns with a logical line that contains no escaped newlines or
936 trigraphs. This is a time-critical inner loop. */
937 void
938 _cpp_clean_line (cpp_reader *pfile)
940 cpp_buffer *buffer;
941 const uchar *s;
942 uchar c, *d, *p;
944 buffer = pfile->buffer;
945 buffer->cur_note = buffer->notes_used = 0;
946 buffer->cur = buffer->line_base = buffer->next_line;
947 buffer->need_line = false;
948 s = buffer->next_line;
950 if (!buffer->from_stage3)
952 const uchar *pbackslash = NULL;
954 /* Fast path. This is the common case of an un-escaped line with
955 no trigraphs. The primary win here is by not writing any
956 data back to memory until we have to. */
957 while (1)
959 /* Perform an optimized search for \n, \r, \\, ?. */
960 s = search_line_fast (s, buffer->rlimit);
962 c = *s;
963 if (c == '\\')
965 /* Record the location of the backslash and continue. */
966 pbackslash = s++;
968 else if (__builtin_expect (c == '?', 0))
970 if (__builtin_expect (s[1] == '?', false)
971 && _cpp_trigraph_map[s[2]])
973 /* Have a trigraph. We may or may not have to convert
974 it. Add a line note regardless, for -Wtrigraphs. */
975 add_line_note (buffer, s, s[2]);
976 if (CPP_OPTION (pfile, trigraphs))
978 /* We do, and that means we have to switch to the
979 slow path. */
980 d = (uchar *) s;
981 *d = _cpp_trigraph_map[s[2]];
982 s += 2;
983 goto slow_path;
986 /* Not a trigraph. Continue on fast-path. */
987 s++;
989 else
990 break;
993 /* This must be \r or \n. We're either done, or we'll be forced
994 to write back to the buffer and continue on the slow path. */
995 d = (uchar *) s;
997 if (__builtin_expect (s == buffer->rlimit, false))
998 goto done;
1000 /* DOS line ending? */
1001 if (__builtin_expect (c == '\r', false) && s[1] == '\n')
1003 s++;
1004 if (s == buffer->rlimit)
1005 goto done;
1008 if (__builtin_expect (pbackslash == NULL, true))
1009 goto done;
1011 /* Check for escaped newline. */
1012 p = d;
1013 while (is_nvspace (p[-1]))
1014 p--;
1015 if (p - 1 != pbackslash)
1016 goto done;
1018 /* Have an escaped newline; process it and proceed to
1019 the slow path. */
1020 add_line_note (buffer, p - 1, p != d ? ' ' : '\\');
1021 d = p - 2;
1022 buffer->next_line = p - 1;
1024 slow_path:
1025 while (1)
1027 c = *++s;
1028 *++d = c;
1030 if (c == '\n' || c == '\r')
1032 /* Handle DOS line endings. */
1033 if (c == '\r' && s != buffer->rlimit && s[1] == '\n')
1034 s++;
1035 if (s == buffer->rlimit)
1036 break;
1038 /* Escaped? */
1039 p = d;
1040 while (p != buffer->next_line && is_nvspace (p[-1]))
1041 p--;
1042 if (p == buffer->next_line || p[-1] != '\\')
1043 break;
1045 add_line_note (buffer, p - 1, p != d ? ' ': '\\');
1046 d = p - 2;
1047 buffer->next_line = p - 1;
1049 else if (c == '?' && s[1] == '?' && _cpp_trigraph_map[s[2]])
1051 /* Add a note regardless, for the benefit of -Wtrigraphs. */
1052 add_line_note (buffer, d, s[2]);
1053 if (CPP_OPTION (pfile, trigraphs))
1055 *d = _cpp_trigraph_map[s[2]];
1056 s += 2;
1061 else
1063 while (*s != '\n' && *s != '\r')
1064 s++;
1065 d = (uchar *) s;
1067 /* Handle DOS line endings. */
1068 if (*s == '\r' && s + 1 != buffer->rlimit && s[1] == '\n')
1069 s++;
1072 done:
1073 *d = '\n';
1074 /* A sentinel note that should never be processed. */
1075 add_line_note (buffer, d + 1, '\n');
1076 buffer->next_line = s + 1;
1079 template <bool lexing_raw_string>
1080 static bool get_fresh_line_impl (cpp_reader *pfile);
1082 /* Return true if the trigraph indicated by NOTE should be warned
1083 about in a comment. */
1084 static bool
1085 warn_in_comment (cpp_reader *pfile, _cpp_line_note *note)
1087 const uchar *p;
1089 /* Within comments we don't warn about trigraphs, unless the
1090 trigraph forms an escaped newline, as that may change
1091 behavior. */
1092 if (note->type != '/')
1093 return false;
1095 /* If -trigraphs, then this was an escaped newline iff the next note
1096 is coincident. */
1097 if (CPP_OPTION (pfile, trigraphs))
1098 return note[1].pos == note->pos;
1100 /* Otherwise, see if this forms an escaped newline. */
1101 p = note->pos + 3;
1102 while (is_nvspace (*p))
1103 p++;
1105 /* There might have been escaped newlines between the trigraph and the
1106 newline we found. Hence the position test. */
1107 return (*p == '\n' && p < note[1].pos);
1110 /* Process the notes created by add_line_note as far as the current
1111 location. */
1112 void
1113 _cpp_process_line_notes (cpp_reader *pfile, int in_comment)
1115 cpp_buffer *buffer = pfile->buffer;
1117 for (;;)
1119 _cpp_line_note *note = &buffer->notes[buffer->cur_note];
1120 unsigned int col;
1122 if (note->pos > buffer->cur)
1123 break;
1125 buffer->cur_note++;
1126 col = CPP_BUF_COLUMN (buffer, note->pos + 1);
1128 if (note->type == '\\' || note->type == ' ')
1130 if (note->type == ' ' && !in_comment)
1131 cpp_error_with_line (pfile, CPP_DL_WARNING, pfile->line_table->highest_line, col,
1132 "backslash and newline separated by space");
1134 if (buffer->next_line > buffer->rlimit)
1136 cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line, col,
1137 "backslash-newline at end of file");
1138 /* Prevent "no newline at end of file" warning. */
1139 buffer->next_line = buffer->rlimit;
1142 buffer->line_base = note->pos;
1143 CPP_INCREMENT_LINE (pfile, 0);
1145 else if (_cpp_trigraph_map[note->type])
1147 if (CPP_OPTION (pfile, warn_trigraphs)
1148 && (!in_comment || warn_in_comment (pfile, note)))
1150 if (CPP_OPTION (pfile, trigraphs))
1151 cpp_warning_with_line (pfile, CPP_W_TRIGRAPHS,
1152 pfile->line_table->highest_line, col,
1153 "trigraph ??%c converted to %c",
1154 note->type,
1155 (int) _cpp_trigraph_map[note->type]);
1156 else
1158 cpp_warning_with_line
1159 (pfile, CPP_W_TRIGRAPHS,
1160 pfile->line_table->highest_line, col,
1161 "trigraph ??%c ignored, use -trigraphs to enable",
1162 note->type);
1166 else if (note->type == 0)
1167 /* Already processed in lex_raw_string. */;
1168 else
1169 abort ();
1173 namespace bidi {
1174 enum class kind {
1175 NONE, LRE, RLE, LRO, RLO, LRI, RLI, FSI, PDF, PDI, LTR, RTL
1178 /* All the UTF-8 encodings of bidi characters start with E2. */
1179 constexpr uchar utf8_start = 0xe2;
1181 struct context
1183 context () {}
1184 context (location_t loc, kind k, bool pdf, bool ucn)
1185 : m_loc (loc), m_kind (k), m_pdf (pdf), m_ucn (ucn)
1189 kind get_pop_kind () const
1191 return m_pdf ? kind::PDF : kind::PDI;
1193 bool ucn_p () const
1195 return m_ucn;
1198 location_t m_loc;
1199 kind m_kind;
1200 unsigned m_pdf : 1;
1201 unsigned m_ucn : 1;
1204 /* A vector holding currently open bidi contexts. We use a char for
1205 each context, its LSB is 1 if it represents a PDF context, 0 if it
1206 represents a PDI context. The next bit is 1 if this context was open
1207 by a bidi character written as a UCN, and 0 when it was UTF-8. */
1208 semi_embedded_vec <context, 16> vec;
1210 /* Close the whole comment/identifier/string literal/character constant
1211 context. */
1212 void on_close ()
1214 vec.truncate (0);
1217 /* Pop the last element in the vector. */
1218 void pop ()
1220 unsigned int len = vec.count ();
1221 gcc_checking_assert (len > 0);
1222 vec.truncate (len - 1);
1225 /* Return the pop kind of the context of the Ith element. */
1226 kind pop_kind_at (unsigned int i)
1228 return vec[i].get_pop_kind ();
1231 /* Return the pop kind of the context that is currently opened. */
1232 kind current_ctx ()
1234 unsigned int len = vec.count ();
1235 if (len == 0)
1236 return kind::NONE;
1237 return vec[len - 1].get_pop_kind ();
1240 /* Return true if the current context comes from a UCN origin, that is,
1241 the bidi char which started this bidi context was written as a UCN. */
1242 bool current_ctx_ucn_p ()
1244 unsigned int len = vec.count ();
1245 gcc_checking_assert (len > 0);
1246 return vec[len - 1].m_ucn;
1249 location_t current_ctx_loc ()
1251 unsigned int len = vec.count ();
1252 gcc_checking_assert (len > 0);
1253 return vec[len - 1].m_loc;
1256 /* We've read a bidi char, update the current vector as necessary.
1257 LOC is only valid when K is not kind::NONE. */
1258 void on_char (kind k, bool ucn_p, location_t loc)
1260 switch (k)
1262 case kind::LRE:
1263 case kind::RLE:
1264 case kind::LRO:
1265 case kind::RLO:
1266 vec.push (context (loc, k, true, ucn_p));
1267 break;
1268 case kind::LRI:
1269 case kind::RLI:
1270 case kind::FSI:
1271 vec.push (context (loc, k, false, ucn_p));
1272 break;
1273 /* PDF terminates the scope of the last LRE, RLE, LRO, or RLO
1274 whose scope has not yet been terminated. */
1275 case kind::PDF:
1276 if (current_ctx () == kind::PDF)
1277 pop ();
1278 break;
1279 /* PDI terminates the scope of the last LRI, RLI, or FSI whose
1280 scope has not yet been terminated, as well as the scopes of
1281 any subsequent LREs, RLEs, LROs, or RLOs whose scopes have not
1282 yet been terminated. */
1283 case kind::PDI:
1284 for (int i = vec.count () - 1; i >= 0; --i)
1285 if (pop_kind_at (i) == kind::PDI)
1287 vec.truncate (i);
1288 break;
1290 break;
1291 case kind::LTR:
1292 case kind::RTL:
1293 /* These aren't popped by a PDF/PDI. */
1294 break;
1295 ATTR_LIKELY case kind::NONE:
1296 break;
1297 default:
1298 abort ();
1302 /* Return a descriptive string for K. */
1303 const char *to_str (kind k)
1305 switch (k)
1307 case kind::LRE:
1308 return "U+202A (LEFT-TO-RIGHT EMBEDDING)";
1309 case kind::RLE:
1310 return "U+202B (RIGHT-TO-LEFT EMBEDDING)";
1311 case kind::LRO:
1312 return "U+202D (LEFT-TO-RIGHT OVERRIDE)";
1313 case kind::RLO:
1314 return "U+202E (RIGHT-TO-LEFT OVERRIDE)";
1315 case kind::LRI:
1316 return "U+2066 (LEFT-TO-RIGHT ISOLATE)";
1317 case kind::RLI:
1318 return "U+2067 (RIGHT-TO-LEFT ISOLATE)";
1319 case kind::FSI:
1320 return "U+2068 (FIRST STRONG ISOLATE)";
1321 case kind::PDF:
1322 return "U+202C (POP DIRECTIONAL FORMATTING)";
1323 case kind::PDI:
1324 return "U+2069 (POP DIRECTIONAL ISOLATE)";
1325 case kind::LTR:
1326 return "U+200E (LEFT-TO-RIGHT MARK)";
1327 case kind::RTL:
1328 return "U+200F (RIGHT-TO-LEFT MARK)";
1329 default:
1330 abort ();
1335 /* Get location_t for the range of bytes [START, START + NUM_BYTES)
1336 within the current line in FILE, with the caret at START. */
1338 static location_t
1339 get_location_for_byte_range_in_cur_line (cpp_reader *pfile,
1340 const unsigned char *const start,
1341 size_t num_bytes)
1343 gcc_checking_assert (num_bytes > 0);
1345 /* CPP_BUF_COLUMN and linemap_position_for_column both refer
1346 to offsets in bytes, but CPP_BUF_COLUMN is 0-based,
1347 whereas linemap_position_for_column is 1-based. */
1349 /* Get 0-based offsets within the line. */
1350 size_t start_offset = CPP_BUF_COLUMN (pfile->buffer, start);
1351 size_t end_offset = start_offset + num_bytes - 1;
1353 /* Now convert to location_t, where "columns" are 1-based byte offsets. */
1354 location_t start_loc = linemap_position_for_column (pfile->line_table,
1355 start_offset + 1);
1356 location_t end_loc = linemap_position_for_column (pfile->line_table,
1357 end_offset + 1);
1359 if (start_loc == end_loc)
1360 return start_loc;
1362 source_range src_range;
1363 src_range.m_start = start_loc;
1364 src_range.m_finish = end_loc;
1365 location_t combined_loc
1366 = pfile->line_table->get_or_create_combined_loc (start_loc,
1367 src_range,
1368 nullptr,
1370 return combined_loc;
1373 /* Parse a sequence of 3 bytes starting with P and return its bidi code. */
1375 static bidi::kind
1376 get_bidi_utf8_1 (const unsigned char *const p)
1378 gcc_checking_assert (p[0] == bidi::utf8_start);
1380 if (p[1] == 0x80)
1381 switch (p[2])
1383 case 0xaa:
1384 return bidi::kind::LRE;
1385 case 0xab:
1386 return bidi::kind::RLE;
1387 case 0xac:
1388 return bidi::kind::PDF;
1389 case 0xad:
1390 return bidi::kind::LRO;
1391 case 0xae:
1392 return bidi::kind::RLO;
1393 case 0x8e:
1394 return bidi::kind::LTR;
1395 case 0x8f:
1396 return bidi::kind::RTL;
1397 default:
1398 break;
1400 else if (p[1] == 0x81)
1401 switch (p[2])
1403 case 0xa6:
1404 return bidi::kind::LRI;
1405 case 0xa7:
1406 return bidi::kind::RLI;
1407 case 0xa8:
1408 return bidi::kind::FSI;
1409 case 0xa9:
1410 return bidi::kind::PDI;
1411 default:
1412 break;
1415 return bidi::kind::NONE;
1418 /* Parse a sequence of 3 bytes starting with P and return its bidi code.
1419 If the kind is not NONE, write the location to *OUT.*/
1421 static bidi::kind
1422 get_bidi_utf8 (cpp_reader *pfile, const unsigned char *const p, location_t *out)
1424 bidi::kind result = get_bidi_utf8_1 (p);
1425 if (result != bidi::kind::NONE)
1427 /* We have a sequence of 3 bytes starting at P. */
1428 *out = get_location_for_byte_range_in_cur_line (pfile, p, 3);
1430 return result;
1433 /* Parse a UCN where P points just past \u or \U and return its bidi code. */
1435 static bidi::kind
1436 get_bidi_ucn_1 (const unsigned char *p, bool is_U, const unsigned char **end)
1438 /* 6.4.3 Universal Character Names
1439 \u hex-quad
1440 \U hex-quad hex-quad
1441 \u { simple-hexadecimal-digit-sequence }
1442 where \unnnn means \U0000nnnn. */
1444 *end = p + 4;
1445 if (is_U)
1447 if (p[0] != '0' || p[1] != '0' || p[2] != '0' || p[3] != '0')
1448 return bidi::kind::NONE;
1449 /* Skip 4B so we can treat \u and \U the same below. */
1450 p += 4;
1451 *end += 4;
1453 else if (p[0] == '{')
1455 p++;
1456 while (*p == '0')
1457 p++;
1458 if (p[0] != '2'
1459 || p[1] != '0'
1460 || !ISXDIGIT (p[2])
1461 || !ISXDIGIT (p[3])
1462 || p[4] != '}')
1463 return bidi::kind::NONE;
1464 *end = p + 5;
1467 /* All code points we are looking for start with 20xx. */
1468 if (p[0] != '2' || p[1] != '0')
1469 return bidi::kind::NONE;
1470 else if (p[2] == '2')
1471 switch (p[3])
1473 case 'a':
1474 case 'A':
1475 return bidi::kind::LRE;
1476 case 'b':
1477 case 'B':
1478 return bidi::kind::RLE;
1479 case 'c':
1480 case 'C':
1481 return bidi::kind::PDF;
1482 case 'd':
1483 case 'D':
1484 return bidi::kind::LRO;
1485 case 'e':
1486 case 'E':
1487 return bidi::kind::RLO;
1488 default:
1489 break;
1491 else if (p[2] == '6')
1492 switch (p[3])
1494 case '6':
1495 return bidi::kind::LRI;
1496 case '7':
1497 return bidi::kind::RLI;
1498 case '8':
1499 return bidi::kind::FSI;
1500 case '9':
1501 return bidi::kind::PDI;
1502 default:
1503 break;
1505 else if (p[2] == '0')
1506 switch (p[3])
1508 case 'e':
1509 case 'E':
1510 return bidi::kind::LTR;
1511 case 'f':
1512 case 'F':
1513 return bidi::kind::RTL;
1514 default:
1515 break;
1518 return bidi::kind::NONE;
1521 /* Parse a UCN where P points just past \u or \U and return its bidi code.
1522 If the kind is not NONE, write the location to *OUT. */
1524 static bidi::kind
1525 get_bidi_ucn (cpp_reader *pfile, const unsigned char *p, bool is_U,
1526 location_t *out)
1528 const unsigned char *end;
1529 bidi::kind result = get_bidi_ucn_1 (p, is_U, &end);
1530 if (result != bidi::kind::NONE)
1532 const unsigned char *start = p - 2;
1533 size_t num_bytes = end - start;
1534 *out = get_location_for_byte_range_in_cur_line (pfile, start, num_bytes);
1536 return result;
1539 /* Parse a named universal character escape where P points just past \N and
1540 return its bidi code. If the kind is not NONE, write the location to
1541 *OUT. */
1543 static bidi::kind
1544 get_bidi_named (cpp_reader *pfile, const unsigned char *p, location_t *out)
1546 bidi::kind result = bidi::kind::NONE;
1547 if (*p != '{')
1548 return bidi::kind::NONE;
1549 if (strncmp ((const char *) (p + 1), "LEFT-TO-RIGHT ", 14) == 0)
1551 if (strncmp ((const char *) (p + 15), "MARK}", 5) == 0)
1552 result = bidi::kind::LTR;
1553 else if (strncmp ((const char *) (p + 15), "EMBEDDING}", 10) == 0)
1554 result = bidi::kind::LRE;
1555 else if (strncmp ((const char *) (p + 15), "OVERRIDE}", 9) == 0)
1556 result = bidi::kind::LRO;
1557 else if (strncmp ((const char *) (p + 15), "ISOLATE}", 8) == 0)
1558 result = bidi::kind::LRI;
1560 else if (strncmp ((const char *) (p + 1), "RIGHT-TO-LEFT ", 14) == 0)
1562 if (strncmp ((const char *) (p + 15), "MARK}", 5) == 0)
1563 result = bidi::kind::RTL;
1564 else if (strncmp ((const char *) (p + 15), "EMBEDDING}", 10) == 0)
1565 result = bidi::kind::RLE;
1566 else if (strncmp ((const char *) (p + 15), "OVERRIDE}", 9) == 0)
1567 result = bidi::kind::RLO;
1568 else if (strncmp ((const char *) (p + 15), "ISOLATE}", 8) == 0)
1569 result = bidi::kind::RLI;
1571 else if (strncmp ((const char *) (p + 1), "POP DIRECTIONAL ", 16) == 0)
1573 if (strncmp ((const char *) (p + 16), "FORMATTING}", 11) == 0)
1574 result = bidi::kind::PDF;
1575 else if (strncmp ((const char *) (p + 16), "ISOLATE}", 8) == 0)
1576 result = bidi::kind::PDI;
1578 else if (strncmp ((const char *) (p + 1), "FIRST STRONG ISOLATE}", 21) == 0)
1579 result = bidi::kind::FSI;
1580 if (result != bidi::kind::NONE)
1581 *out = get_location_for_byte_range_in_cur_line (pfile, p - 2,
1582 (strchr ((const char *)
1583 (p + 1), '}')
1584 - (const char *) p)
1585 + 3);
1586 return result;
1589 /* Subclass of rich_location for reporting on unpaired UTF-8
1590 bidirectional control character(s).
1591 Escape the source lines on output, and show all unclosed
1592 bidi context, labelling everything. */
1594 class unpaired_bidi_rich_location : public rich_location
1596 public:
1597 class custom_range_label : public range_label
1599 public:
1600 label_text get_text (unsigned range_idx) const final override
1602 /* range 0 is the primary location; each subsequent range i + 1
1603 is for bidi::vec[i]. */
1604 if (range_idx > 0)
1606 const bidi::context &ctxt (bidi::vec[range_idx - 1]);
1607 return label_text::borrow (bidi::to_str (ctxt.m_kind));
1609 else
1610 return label_text::borrow (_("end of bidirectional context"));
1614 unpaired_bidi_rich_location (cpp_reader *pfile, location_t loc)
1615 : rich_location (pfile->line_table, loc, &m_custom_label)
1617 set_escape_on_output (true);
1618 for (unsigned i = 0; i < bidi::vec.count (); i++)
1619 add_range (bidi::vec[i].m_loc,
1620 SHOW_RANGE_WITHOUT_CARET,
1621 &m_custom_label);
1624 private:
1625 custom_range_label m_custom_label;
1628 /* We're closing a bidi context, that is, we've encountered a newline,
1629 are closing a C-style comment, or are at the end of a string literal,
1630 character constant, or identifier. Warn if this context was not
1631 properly terminated by a PDI or PDF. P points to the last character
1632 in this context. */
1634 static void
1635 maybe_warn_bidi_on_close (cpp_reader *pfile, const uchar *p)
1637 const auto warn_bidi = CPP_OPTION (pfile, cpp_warn_bidirectional);
1638 if (bidi::vec.count () > 0
1639 && (warn_bidi & bidirectional_unpaired
1640 && (!bidi::current_ctx_ucn_p ()
1641 || (warn_bidi & bidirectional_ucn))))
1643 const location_t loc
1644 = linemap_position_for_column (pfile->line_table,
1645 CPP_BUF_COLUMN (pfile->buffer, p));
1646 unpaired_bidi_rich_location rich_loc (pfile, loc);
1647 /* cpp_callbacks doesn't yet have a way to handle singular vs plural
1648 forms of a diagnostic, so fake it for now. */
1649 if (bidi::vec.count () > 1)
1650 cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
1651 "unpaired UTF-8 bidirectional control characters "
1652 "detected");
1653 else
1654 cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
1655 "unpaired UTF-8 bidirectional control character "
1656 "detected");
1658 /* We're done with this context. */
1659 bidi::on_close ();
1662 /* We're at the beginning or in the middle of an identifier/comment/string
1663 literal/character constant. Warn if we've encountered a bidi character.
1664 KIND says which bidi control character it was; UCN_P is true iff this bidi
1665 control character was written as a UCN. LOC is the location of the
1666 character, but is only valid if KIND != bidi::kind::NONE. */
1668 static void
1669 maybe_warn_bidi_on_char (cpp_reader *pfile, bidi::kind kind,
1670 bool ucn_p, location_t loc)
1672 if (__builtin_expect (kind == bidi::kind::NONE, 1))
1673 return;
1675 const auto warn_bidi = CPP_OPTION (pfile, cpp_warn_bidirectional);
1677 if (warn_bidi & (bidirectional_unpaired|bidirectional_any))
1679 rich_location rich_loc (pfile->line_table, loc);
1680 rich_loc.set_escape_on_output (true);
1682 /* It seems excessive to warn about a PDI/PDF that is closing
1683 an opened context because we've already warned about the
1684 opening character. Except warn when we have a UCN x UTF-8
1685 mismatch, if UCN checking is enabled. */
1686 if (kind == bidi::current_ctx ())
1688 if (warn_bidi == (bidirectional_unpaired|bidirectional_ucn)
1689 && bidi::current_ctx_ucn_p () != ucn_p)
1691 rich_loc.add_range (bidi::current_ctx_loc ());
1692 cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
1693 "UTF-8 vs UCN mismatch when closing "
1694 "a context by \"%s\"", bidi::to_str (kind));
1697 else if (warn_bidi & bidirectional_any
1698 && (!ucn_p || (warn_bidi & bidirectional_ucn)))
1700 if (kind == bidi::kind::PDF || kind == bidi::kind::PDI)
1701 cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
1702 "\"%s\" is closing an unopened context",
1703 bidi::to_str (kind));
1704 else
1705 cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
1706 "found problematic Unicode character \"%s\"",
1707 bidi::to_str (kind));
1710 /* We're done with this context. */
1711 bidi::on_char (kind, ucn_p, loc);
1714 static const cppchar_t utf8_continuation = 0x80;
1715 static const cppchar_t utf8_signifier = 0xC0;
1717 /* Emit -Winvalid-utf8 warning on invalid UTF-8 character starting
1718 at PFILE->buffer->cur. Return a pointer after the diagnosed
1719 invalid character. */
1721 static const uchar *
1722 _cpp_warn_invalid_utf8 (cpp_reader *pfile)
1724 cpp_buffer *buffer = pfile->buffer;
1725 const uchar *cur = buffer->cur;
1726 bool pedantic = (CPP_PEDANTIC (pfile)
1727 && CPP_OPTION (pfile, cpp_warn_invalid_utf8) == 2);
1729 if (cur[0] < utf8_signifier
1730 || cur[1] < utf8_continuation || cur[1] >= utf8_signifier)
1732 if (pedantic)
1733 cpp_error_with_line (pfile, CPP_DL_PEDWARN,
1734 pfile->line_table->highest_line,
1735 CPP_BUF_COL (buffer),
1736 "invalid UTF-8 character <%x>",
1737 cur[0]);
1738 else
1739 cpp_warning_with_line (pfile, CPP_W_INVALID_UTF8,
1740 pfile->line_table->highest_line,
1741 CPP_BUF_COL (buffer),
1742 "invalid UTF-8 character <%x>",
1743 cur[0]);
1744 return cur + 1;
1746 else if (cur[2] < utf8_continuation || cur[2] >= utf8_signifier)
1748 if (pedantic)
1749 cpp_error_with_line (pfile, CPP_DL_PEDWARN,
1750 pfile->line_table->highest_line,
1751 CPP_BUF_COL (buffer),
1752 "invalid UTF-8 character <%x><%x>",
1753 cur[0], cur[1]);
1754 else
1755 cpp_warning_with_line (pfile, CPP_W_INVALID_UTF8,
1756 pfile->line_table->highest_line,
1757 CPP_BUF_COL (buffer),
1758 "invalid UTF-8 character <%x><%x>",
1759 cur[0], cur[1]);
1760 return cur + 2;
1762 else if (cur[3] < utf8_continuation || cur[3] >= utf8_signifier)
1764 if (pedantic)
1765 cpp_error_with_line (pfile, CPP_DL_PEDWARN,
1766 pfile->line_table->highest_line,
1767 CPP_BUF_COL (buffer),
1768 "invalid UTF-8 character <%x><%x><%x>",
1769 cur[0], cur[1], cur[2]);
1770 else
1771 cpp_warning_with_line (pfile, CPP_W_INVALID_UTF8,
1772 pfile->line_table->highest_line,
1773 CPP_BUF_COL (buffer),
1774 "invalid UTF-8 character <%x><%x><%x>",
1775 cur[0], cur[1], cur[2]);
1776 return cur + 3;
1778 else
1780 if (pedantic)
1781 cpp_error_with_line (pfile, CPP_DL_PEDWARN,
1782 pfile->line_table->highest_line,
1783 CPP_BUF_COL (buffer),
1784 "invalid UTF-8 character <%x><%x><%x><%x>",
1785 cur[0], cur[1], cur[2], cur[3]);
1786 else
1787 cpp_warning_with_line (pfile, CPP_W_INVALID_UTF8,
1788 pfile->line_table->highest_line,
1789 CPP_BUF_COL (buffer),
1790 "invalid UTF-8 character <%x><%x><%x><%x>",
1791 cur[0], cur[1], cur[2], cur[3]);
1792 return cur + 4;
1796 /* Helper function of *skip_*_comment and lex*_string. For C,
1797 character at CUR[-1] with MSB set handle -Wbidi-chars* and
1798 -Winvalid-utf8 diagnostics and return pointer to first character
1799 that should be processed next. */
1801 static inline const uchar *
1802 _cpp_handle_multibyte_utf8 (cpp_reader *pfile, uchar c,
1803 const uchar *cur, bool warn_bidi_p,
1804 bool warn_invalid_utf8_p)
1806 /* If this is a beginning of a UTF-8 encoding, it might be
1807 a bidirectional control character. */
1808 if (c == bidi::utf8_start && warn_bidi_p)
1810 location_t loc;
1811 bidi::kind kind = get_bidi_utf8 (pfile, cur - 1, &loc);
1812 maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
1814 if (!warn_invalid_utf8_p)
1815 return cur;
1816 if (c >= utf8_signifier)
1818 cppchar_t s;
1819 const uchar *pstr = cur - 1;
1820 if (_cpp_valid_utf8 (pfile, &pstr, pfile->buffer->rlimit, 0, NULL, &s)
1821 && s <= UCS_LIMIT)
1822 return pstr;
1824 pfile->buffer->cur = cur - 1;
1825 return _cpp_warn_invalid_utf8 (pfile);
1828 /* Skip a C-style block comment. We find the end of the comment by
1829 seeing if an asterisk is before every '/' we encounter. Returns
1830 nonzero if comment terminated by EOF, zero otherwise.
1832 Buffer->cur points to the initial asterisk of the comment. */
1833 bool
1834 _cpp_skip_block_comment (cpp_reader *pfile)
1836 cpp_buffer *buffer = pfile->buffer;
1837 const uchar *cur = buffer->cur;
1838 uchar c;
1839 const bool warn_bidi_p = pfile->warn_bidi_p ();
1840 const bool warn_invalid_utf8_p = CPP_OPTION (pfile, cpp_warn_invalid_utf8);
1841 const bool warn_bidi_or_invalid_utf8_p = warn_bidi_p | warn_invalid_utf8_p;
1843 cur++;
1844 if (*cur == '/')
1845 cur++;
1847 for (;;)
1849 /* People like decorating comments with '*', so check for '/'
1850 instead for efficiency. */
1851 c = *cur++;
1853 if (c == '/')
1855 if (cur[-2] == '*')
1857 if (warn_bidi_p)
1858 maybe_warn_bidi_on_close (pfile, cur);
1859 break;
1862 /* Warn about potential nested comments, but not if the '/'
1863 comes immediately before the true comment delimiter.
1864 Don't bother to get it right across escaped newlines. */
1865 if (CPP_OPTION (pfile, warn_comments)
1866 && cur[0] == '*' && cur[1] != '/')
1868 buffer->cur = cur;
1869 cpp_warning_with_line (pfile, CPP_W_COMMENTS,
1870 pfile->line_table->highest_line,
1871 CPP_BUF_COL (buffer),
1872 "\"/*\" within comment");
1875 else if (c == '\n')
1877 unsigned int cols;
1878 buffer->cur = cur - 1;
1879 if (warn_bidi_p)
1880 maybe_warn_bidi_on_close (pfile, cur);
1881 _cpp_process_line_notes (pfile, true);
1882 if (buffer->next_line >= buffer->rlimit)
1883 return true;
1884 _cpp_clean_line (pfile);
1886 cols = buffer->next_line - buffer->line_base;
1887 CPP_INCREMENT_LINE (pfile, cols);
1889 cur = buffer->cur;
1891 else if (__builtin_expect (c >= utf8_continuation, 0)
1892 && warn_bidi_or_invalid_utf8_p)
1893 cur = _cpp_handle_multibyte_utf8 (pfile, c, cur, warn_bidi_p,
1894 warn_invalid_utf8_p);
1897 buffer->cur = cur;
1898 _cpp_process_line_notes (pfile, true);
1899 return false;
1902 /* Skip a C++ line comment, leaving buffer->cur pointing to the
1903 terminating newline. Handles escaped newlines. Returns nonzero
1904 if a multiline comment. */
1905 static int
1906 skip_line_comment (cpp_reader *pfile)
1908 cpp_buffer *buffer = pfile->buffer;
1909 location_t orig_line = pfile->line_table->highest_line;
1910 const bool warn_bidi_p = pfile->warn_bidi_p ();
1911 const bool warn_invalid_utf8_p = CPP_OPTION (pfile, cpp_warn_invalid_utf8);
1912 const bool warn_bidi_or_invalid_utf8_p = warn_bidi_p | warn_invalid_utf8_p;
1914 if (!warn_bidi_or_invalid_utf8_p)
1915 while (*buffer->cur != '\n')
1916 buffer->cur++;
1917 else if (!warn_invalid_utf8_p)
1919 while (*buffer->cur != '\n'
1920 && *buffer->cur != bidi::utf8_start)
1921 buffer->cur++;
1922 if (__builtin_expect (*buffer->cur == bidi::utf8_start, 0))
1924 while (*buffer->cur != '\n')
1926 if (__builtin_expect (*buffer->cur == bidi::utf8_start, 0))
1928 location_t loc;
1929 bidi::kind kind = get_bidi_utf8 (pfile, buffer->cur, &loc);
1930 maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
1932 buffer->cur++;
1934 maybe_warn_bidi_on_close (pfile, buffer->cur);
1937 else
1939 while (*buffer->cur != '\n')
1941 if (*buffer->cur < utf8_continuation)
1943 buffer->cur++;
1944 continue;
1946 buffer->cur
1947 = _cpp_handle_multibyte_utf8 (pfile, *buffer->cur, buffer->cur + 1,
1948 warn_bidi_p, warn_invalid_utf8_p);
1950 if (warn_bidi_p)
1951 maybe_warn_bidi_on_close (pfile, buffer->cur);
1954 _cpp_process_line_notes (pfile, true);
1955 return orig_line != pfile->line_table->highest_line;
1958 /* Skips whitespace, saving the next non-whitespace character. */
1959 static void
1960 skip_whitespace (cpp_reader *pfile, cppchar_t c)
1962 cpp_buffer *buffer = pfile->buffer;
1963 bool saw_NUL = false;
1967 /* Horizontal space always OK. */
1968 if (c == ' ' || c == '\t')
1970 /* Just \f \v or \0 left. */
1971 else if (c == '\0')
1972 saw_NUL = true;
1973 else if (pfile->state.in_directive && CPP_PEDANTIC (pfile))
1974 cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line,
1975 CPP_BUF_COL (buffer),
1976 "%s in preprocessing directive",
1977 c == '\f' ? "form feed" : "vertical tab");
1979 c = *buffer->cur++;
1981 /* We only want non-vertical space, i.e. ' ' \t \f \v \0. */
1982 while (is_nvspace (c));
1984 if (saw_NUL)
1986 encoding_rich_location rich_loc (pfile);
1987 cpp_error_at (pfile, CPP_DL_WARNING, &rich_loc,
1988 "null character(s) ignored");
1991 buffer->cur--;
1994 /* See if the characters of a number token are valid in a name (no
1995 '.', '+' or '-'). */
1996 static int
1997 name_p (cpp_reader *pfile, const cpp_string *string)
1999 unsigned int i;
2001 for (i = 0; i < string->len; i++)
2002 if (!is_idchar (string->text[i]))
2003 return 0;
2005 return 1;
2008 /* After parsing an identifier or other sequence, produce a warning about
2009 sequences not in NFC/NFKC. */
2010 static void
2011 warn_about_normalization (cpp_reader *pfile,
2012 const cpp_token *token,
2013 const struct normalize_state *s,
2014 bool identifier)
2016 if (CPP_OPTION (pfile, warn_normalize) < NORMALIZE_STATE_RESULT (s)
2017 && !pfile->state.skipping)
2019 location_t loc = token->src_loc;
2021 /* If possible, create a location range for the token. */
2022 if (loc >= RESERVED_LOCATION_COUNT
2023 && token->type != CPP_EOF
2024 /* There must be no line notes to process. */
2025 && (!(pfile->buffer->cur
2026 >= pfile->buffer->notes[pfile->buffer->cur_note].pos
2027 && !pfile->overlaid_buffer)))
2029 source_range tok_range;
2030 tok_range.m_start = loc;
2031 tok_range.m_finish
2032 = linemap_position_for_column (pfile->line_table,
2033 CPP_BUF_COLUMN (pfile->buffer,
2034 pfile->buffer->cur));
2035 loc = pfile->line_table->get_or_create_combined_loc (loc, tok_range,
2036 nullptr, 0);
2039 encoding_rich_location rich_loc (pfile, loc);
2041 /* Make sure that the token is printed using UCNs, even
2042 if we'd otherwise happily print UTF-8. */
2043 unsigned char *buf = XNEWVEC (unsigned char, cpp_token_len (token));
2044 size_t sz;
2046 sz = cpp_spell_token (pfile, token, buf, false) - buf;
2047 if (NORMALIZE_STATE_RESULT (s) == normalized_C)
2048 cpp_warning_at (pfile, CPP_W_NORMALIZE, &rich_loc,
2049 "`%.*s' is not in NFKC", (int) sz, buf);
2050 else if (identifier && CPP_OPTION (pfile, xid_identifiers))
2051 cpp_pedwarning_at (pfile, CPP_W_NORMALIZE, &rich_loc,
2052 "`%.*s' is not in NFC", (int) sz, buf);
2053 else
2054 cpp_warning_at (pfile, CPP_W_NORMALIZE, &rich_loc,
2055 "`%.*s' is not in NFC", (int) sz, buf);
2056 free (buf);
2060 /* Returns TRUE if the byte sequence starting at buffer->cur is a valid
2061 extended character in an identifier. If FIRST is TRUE, then the character
2062 must be valid at the beginning of an identifier as well. If the return
2063 value is TRUE, then pfile->buffer->cur has been moved to point to the next
2064 byte after the extended character. */
2066 static bool
2067 forms_identifier_p (cpp_reader *pfile, int first,
2068 struct normalize_state *state)
2070 cpp_buffer *buffer = pfile->buffer;
2071 const bool warn_bidi_p = pfile->warn_bidi_p ();
2073 if (*buffer->cur == '$')
2075 if (!CPP_OPTION (pfile, dollars_in_ident))
2076 return false;
2078 buffer->cur++;
2079 if (CPP_OPTION (pfile, warn_dollars) && !pfile->state.skipping)
2081 CPP_OPTION (pfile, warn_dollars) = 0;
2082 cpp_error (pfile, CPP_DL_PEDWARN, "'$' in identifier or number");
2085 return true;
2088 /* Is this a syntactically valid UCN or a valid UTF-8 char? */
2089 if (CPP_OPTION (pfile, extended_identifiers))
2091 cppchar_t s;
2092 if (*buffer->cur >= utf8_signifier)
2094 if (__builtin_expect (*buffer->cur == bidi::utf8_start, 0)
2095 && warn_bidi_p)
2097 location_t loc;
2098 bidi::kind kind = get_bidi_utf8 (pfile, buffer->cur, &loc);
2099 maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
2101 if (_cpp_valid_utf8 (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
2102 state, &s))
2103 return true;
2105 else if (*buffer->cur == '\\'
2106 && (buffer->cur[1] == 'u'
2107 || buffer->cur[1] == 'U'
2108 || buffer->cur[1] == 'N'))
2110 buffer->cur += 2;
2111 if (warn_bidi_p)
2113 location_t loc;
2114 bidi::kind kind;
2115 if (buffer->cur[-1] == 'N')
2116 kind = get_bidi_named (pfile, buffer->cur, &loc);
2117 else
2118 kind = get_bidi_ucn (pfile, buffer->cur,
2119 buffer->cur[-1] == 'U', &loc);
2120 maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/true, loc);
2122 if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
2123 state, &s, NULL, NULL))
2124 return true;
2125 buffer->cur -= 2;
2129 return false;
2132 /* Helper function to issue error about improper __VA_OPT__ use. */
2133 static void
2134 maybe_va_opt_error (cpp_reader *pfile)
2136 if (CPP_PEDANTIC (pfile) && !CPP_OPTION (pfile, va_opt))
2138 /* __VA_OPT__ should not be accepted at all, but allow it in
2139 system headers. */
2140 if (!_cpp_in_system_header (pfile))
2142 if (CPP_OPTION (pfile, cplusplus))
2143 cpp_error (pfile, CPP_DL_PEDWARN,
2144 "__VA_OPT__ is not available until C++20");
2145 else
2146 cpp_error (pfile, CPP_DL_PEDWARN,
2147 "__VA_OPT__ is not available until C23");
2150 else if (!pfile->state.va_args_ok)
2152 /* __VA_OPT__ should only appear in the replacement list of a
2153 variadic macro. */
2154 cpp_error (pfile, CPP_DL_PEDWARN,
2155 "__VA_OPT__ can only appear in the expansion"
2156 " of a C++20 variadic macro");
2160 /* Helper function to perform diagnostics that are needed (rarely)
2161 when an identifier is lexed. */
2162 static void
2163 identifier_diagnostics_on_lex (cpp_reader *pfile, cpp_hashnode *node)
2165 if (__builtin_expect (!(node->flags & NODE_DIAGNOSTIC)
2166 || pfile->state.skipping, 1))
2167 return;
2169 /* It is allowed to poison the same identifier twice. */
2170 if ((node->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
2172 cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
2173 NODE_NAME (node));
2174 const auto data = (cpp_hashnode_extra *)
2175 ht_lookup (pfile->extra_hash_table, node->ident, HT_NO_INSERT);
2176 if (data && data->poisoned_loc)
2177 cpp_error_at (pfile, CPP_DL_NOTE, data->poisoned_loc, "poisoned here");
2180 /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
2181 replacement list of a variadic macro. */
2182 if (node == pfile->spec_nodes.n__VA_ARGS__
2183 && !pfile->state.va_args_ok)
2185 if (CPP_OPTION (pfile, cplusplus))
2186 cpp_error (pfile, CPP_DL_PEDWARN,
2187 "__VA_ARGS__ can only appear in the expansion"
2188 " of a C++11 variadic macro");
2189 else
2190 cpp_error (pfile, CPP_DL_PEDWARN,
2191 "__VA_ARGS__ can only appear in the expansion"
2192 " of a C99 variadic macro");
2195 /* __VA_OPT__ should only appear in the replacement list of a
2196 variadic macro. */
2197 if (node == pfile->spec_nodes.n__VA_OPT__)
2198 maybe_va_opt_error (pfile);
2200 /* For -Wc++-compat, warn about use of C++ named operators. */
2201 if (node->flags & NODE_WARN_OPERATOR)
2202 cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
2203 "identifier \"%s\" is a special operator name in C++",
2204 NODE_NAME (node));
2207 /* Helper function to get the cpp_hashnode of the identifier BASE. */
2208 static cpp_hashnode *
2209 lex_identifier_intern (cpp_reader *pfile, const uchar *base)
2211 cpp_hashnode *result;
2212 const uchar *cur;
2213 unsigned int len;
2214 unsigned int hash = HT_HASHSTEP (0, *base);
2216 cur = base + 1;
2217 while (ISIDNUM (*cur))
2219 hash = HT_HASHSTEP (hash, *cur);
2220 cur++;
2222 len = cur - base;
2223 hash = HT_HASHFINISH (hash, len);
2224 result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
2225 base, len, hash, HT_ALLOC));
2226 identifier_diagnostics_on_lex (pfile, result);
2227 return result;
2230 /* Get the cpp_hashnode of an identifier specified by NAME in
2231 the current cpp_reader object. If none is found, NULL is returned. */
2232 cpp_hashnode *
2233 _cpp_lex_identifier (cpp_reader *pfile, const char *name)
2235 cpp_hashnode *result;
2236 result = lex_identifier_intern (pfile, (uchar *) name);
2237 return result;
2240 /* Lex an identifier starting at BASE. BUFFER->CUR is expected to point
2241 one past the first character at BASE, which may be a (possibly multi-byte)
2242 character if STARTS_UCN is true. */
2243 static cpp_hashnode *
2244 lex_identifier (cpp_reader *pfile, const uchar *base, bool starts_ucn,
2245 struct normalize_state *nst, cpp_hashnode **spelling)
2247 cpp_hashnode *result;
2248 const uchar *cur;
2249 unsigned int len;
2250 unsigned int hash = HT_HASHSTEP (0, *base);
2251 const bool warn_bidi_p = pfile->warn_bidi_p ();
2253 cur = pfile->buffer->cur;
2254 if (! starts_ucn)
2256 while (ISIDNUM (*cur))
2258 hash = HT_HASHSTEP (hash, *cur);
2259 cur++;
2261 NORMALIZE_STATE_UPDATE_IDNUM (nst, *(cur - 1));
2263 pfile->buffer->cur = cur;
2264 if (starts_ucn || forms_identifier_p (pfile, false, nst))
2266 /* Slower version for identifiers containing UCNs
2267 or extended chars (including $). */
2268 do {
2269 while (ISIDNUM (*pfile->buffer->cur))
2271 NORMALIZE_STATE_UPDATE_IDNUM (nst, *pfile->buffer->cur);
2272 pfile->buffer->cur++;
2274 } while (forms_identifier_p (pfile, false, nst));
2275 if (warn_bidi_p)
2276 maybe_warn_bidi_on_close (pfile, pfile->buffer->cur);
2277 result = _cpp_interpret_identifier (pfile, base,
2278 pfile->buffer->cur - base);
2279 *spelling = cpp_lookup (pfile, base, pfile->buffer->cur - base);
2281 else
2283 len = cur - base;
2284 hash = HT_HASHFINISH (hash, len);
2286 result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
2287 base, len, hash, HT_ALLOC));
2288 *spelling = result;
2291 return result;
2294 /* Struct to hold the return value of the scan_cur_identifier () helper
2295 function below. */
2297 struct scan_id_result
2299 cpp_hashnode *node;
2300 normalize_state nst;
2302 scan_id_result ()
2303 : node (nullptr)
2305 nst = INITIAL_NORMALIZE_STATE;
2308 explicit operator bool () const { return node; }
2311 /* Helper function to scan an entire identifier beginning at
2312 pfile->buffer->cur, and possibly containing extended characters (UCNs
2313 and/or UTF-8). Returns the cpp_hashnode for the identifier on success, or
2314 else nullptr, as well as a normalize_state so that normalization warnings
2315 may be issued once the token lexing is complete. */
2317 static scan_id_result
2318 scan_cur_identifier (cpp_reader *pfile)
2320 const auto buffer = pfile->buffer;
2321 const auto begin = buffer->cur;
2322 scan_id_result result;
2323 if (ISIDST (*buffer->cur))
2325 ++buffer->cur;
2326 cpp_hashnode *ignore;
2327 result.node = lex_identifier (pfile, begin, false, &result.nst, &ignore);
2329 else if (forms_identifier_p (pfile, true, &result.nst))
2331 /* buffer->cur has been moved already by the call
2332 to forms_identifier_p. */
2333 cpp_hashnode *ignore;
2334 result.node = lex_identifier (pfile, begin, true, &result.nst, &ignore);
2336 return result;
2339 /* Lex a number to NUMBER starting at BUFFER->CUR - 1. */
2340 static void
2341 lex_number (cpp_reader *pfile, cpp_string *number,
2342 struct normalize_state *nst)
2344 const uchar *cur;
2345 const uchar *base;
2346 uchar *dest;
2348 base = pfile->buffer->cur - 1;
2351 const uchar *adj_digit_sep = NULL;
2352 cur = pfile->buffer->cur;
2354 /* N.B. ISIDNUM does not include $. */
2355 while (ISIDNUM (*cur)
2356 || (*cur == '.' && !DIGIT_SEP (cur[-1]))
2357 || DIGIT_SEP (*cur)
2358 || (VALID_SIGN (*cur, cur[-1]) && !DIGIT_SEP (cur[-2])))
2360 NORMALIZE_STATE_UPDATE_IDNUM (nst, *cur);
2361 /* Adjacent digit separators do not form part of the pp-number syntax.
2362 However, they can safely be diagnosed here as an error, since '' is
2363 not a valid preprocessing token. */
2364 if (DIGIT_SEP (*cur) && DIGIT_SEP (cur[-1]) && !adj_digit_sep)
2365 adj_digit_sep = cur;
2366 cur++;
2368 /* A number can't end with a digit separator. */
2369 while (cur > pfile->buffer->cur && DIGIT_SEP (cur[-1]))
2370 --cur;
2371 if (adj_digit_sep && adj_digit_sep < cur)
2372 cpp_error (pfile, CPP_DL_ERROR, "adjacent digit separators");
2374 pfile->buffer->cur = cur;
2376 while (forms_identifier_p (pfile, false, nst));
2378 number->len = cur - base;
2379 dest = _cpp_unaligned_alloc (pfile, number->len + 1);
2380 memcpy (dest, base, number->len);
2381 dest[number->len] = '\0';
2382 number->text = dest;
2385 /* Create a token of type TYPE with a literal spelling. */
2386 static void
2387 create_literal (cpp_reader *pfile, cpp_token *token, const uchar *base,
2388 unsigned int len, enum cpp_ttype type)
2390 token->type = type;
2391 token->val.str.len = len;
2392 token->val.str.text = cpp_alloc_token_string (pfile, base, len);
2395 /* Like create_literal(), but construct it from two separate strings
2396 which are concatenated. LEN2 may be 0 if no second string is
2397 required. */
2398 static void
2399 create_literal2 (cpp_reader *pfile, cpp_token *token, const uchar *base1,
2400 unsigned int len1, const uchar *base2, unsigned int len2,
2401 enum cpp_ttype type)
2403 token->type = type;
2404 token->val.str.len = len1 + len2;
2405 uchar *const dest = _cpp_unaligned_alloc (pfile, len1 + len2 + 1);
2406 memcpy (dest, base1, len1);
2407 if (len2)
2408 memcpy (dest+len1, base2, len2);
2409 dest[len1 + len2] = 0;
2410 token->val.str.text = dest;
2413 const uchar *
2414 cpp_alloc_token_string (cpp_reader *pfile,
2415 const unsigned char *ptr, unsigned len)
2417 uchar *dest = _cpp_unaligned_alloc (pfile, len + 1);
2419 dest[len] = 0;
2420 memcpy (dest, ptr, len);
2421 return dest;
2424 /* A pair of raw buffer pointers. The currently open one is [1], the
2425 first one is [0]. Used for string literal lexing. */
2426 struct lit_accum {
2427 _cpp_buff *first;
2428 _cpp_buff *last;
2429 const uchar *rpos;
2430 size_t accum;
2432 lit_accum ()
2433 : first (NULL), last (NULL), rpos (0), accum (0)
2437 void append (cpp_reader *, const uchar *, size_t);
2439 void read_begin (cpp_reader *);
2440 bool reading_p () const
2442 return rpos != NULL;
2444 char read_char ()
2446 char c = *rpos++;
2447 if (rpos == BUFF_FRONT (last))
2448 rpos = NULL;
2449 return c;
2452 void create_literal2 (cpp_reader *pfile, cpp_token *token,
2453 const uchar *base1, unsigned int len1,
2454 const uchar *base2, unsigned int len2,
2455 enum cpp_ttype type);
2458 /* Subroutine of lex_raw_string: Append LEN chars from BASE to the buffer
2459 sequence from *FIRST_BUFF_P to LAST_BUFF_P. */
2461 void
2462 lit_accum::append (cpp_reader *pfile, const uchar *base, size_t len)
2464 if (!last)
2465 /* Starting. */
2466 first = last = _cpp_get_buff (pfile, len);
2467 else if (len > BUFF_ROOM (last))
2469 /* There is insufficient room in the buffer. Copy what we can,
2470 and then either extend or create a new one. */
2471 size_t room = BUFF_ROOM (last);
2472 memcpy (BUFF_FRONT (last), base, room);
2473 BUFF_FRONT (last) += room;
2474 base += room;
2475 len -= room;
2476 accum += room;
2478 gcc_checking_assert (!rpos);
2480 last = _cpp_append_extend_buff (pfile, last, len);
2483 memcpy (BUFF_FRONT (last), base, len);
2484 BUFF_FRONT (last) += len;
2485 accum += len;
2488 void
2489 lit_accum::read_begin (cpp_reader *pfile)
2491 /* We never accumulate more than 4 chars to read. */
2492 if (BUFF_ROOM (last) < 4)
2494 last = _cpp_append_extend_buff (pfile, last, 4);
2495 rpos = BUFF_FRONT (last);
2498 /* Helper function to check if a string format macro, say from inttypes.h, is
2499 placed touching a string literal, in which case it could be parsed as a C++11
2500 user-defined string literal thus breaking the program. Return TRUE if the
2501 UDL should be ignored for now and preserved for potential macro
2502 expansion. */
2504 static bool
2505 maybe_ignore_udl_macro_suffix (cpp_reader *pfile, location_t src_loc,
2506 const uchar *suffix_begin, cpp_hashnode *node)
2508 /* User-defined literals outside of namespace std must start with a single
2509 underscore, so assume anything of that form really is a UDL suffix.
2510 We don't need to worry about UDLs defined inside namespace std because
2511 their names are reserved, so cannot be used as macro names in valid
2512 programs. */
2513 if ((suffix_begin[0] == '_' && suffix_begin[1] != '_')
2514 || !cpp_macro_p (node))
2515 return false;
2517 /* Maybe raise a warning here; caller should arrange not to consume
2518 the tokens. */
2519 if (CPP_OPTION (pfile, warn_literal_suffix) && !pfile->state.skipping)
2520 cpp_warning_with_line (pfile, CPP_W_LITERAL_SUFFIX, src_loc, 0,
2521 "invalid suffix on literal; C++11 requires a space "
2522 "between literal and string macro");
2523 return true;
2526 /* Like create_literal2(), but also prepend all the accumulated data from
2527 the lit_accum struct. */
2528 void
2529 lit_accum::create_literal2 (cpp_reader *pfile, cpp_token *token,
2530 const uchar *base1, unsigned int len1,
2531 const uchar *base2, unsigned int len2,
2532 enum cpp_ttype type)
2534 const unsigned int tot_len = accum + len1 + len2;
2535 uchar *dest = _cpp_unaligned_alloc (pfile, tot_len + 1);
2536 token->type = type;
2537 token->val.str.len = tot_len;
2538 token->val.str.text = dest;
2539 for (_cpp_buff *buf = first; buf; buf = buf->next)
2541 size_t len = BUFF_FRONT (buf) - buf->base;
2542 memcpy (dest, buf->base, len);
2543 dest += len;
2545 memcpy (dest, base1, len1);
2546 dest += len1;
2547 if (len2)
2548 memcpy (dest, base2, len2);
2549 dest += len2;
2550 *dest = '\0';
2553 /* Lexes a raw string. The stored string contains the spelling,
2554 including double quotes, delimiter string, '(' and ')', any leading
2555 'L', 'u', 'U' or 'u8' and 'R' modifier. The created token contains
2556 the type of the literal, or CPP_OTHER if it was not properly
2557 terminated.
2559 BASE is the start of the token. Updates pfile->buffer->cur to just
2560 after the lexed string.
2562 The spelling is NUL-terminated, but it is not guaranteed that this
2563 is the first NUL since embedded NULs are preserved. */
2565 static void
2566 lex_raw_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
2568 const uchar *pos = base;
2569 const bool warn_bidi_p = pfile->warn_bidi_p ();
2570 const bool warn_invalid_utf8_p = CPP_OPTION (pfile, cpp_warn_invalid_utf8);
2571 const bool warn_bidi_or_invalid_utf8_p = warn_bidi_p | warn_invalid_utf8_p;
2573 /* 'tis a pity this information isn't passed down from the lexer's
2574 initial categorization of the token. */
2575 enum cpp_ttype type = CPP_STRING;
2577 if (*pos == 'L')
2579 type = CPP_WSTRING;
2580 pos++;
2582 else if (*pos == 'U')
2584 type = CPP_STRING32;
2585 pos++;
2587 else if (*pos == 'u')
2589 if (pos[1] == '8')
2591 type = CPP_UTF8STRING;
2592 pos++;
2594 else
2595 type = CPP_STRING16;
2596 pos++;
2599 gcc_checking_assert (pos[0] == 'R' && pos[1] == '"');
2600 pos += 2;
2602 _cpp_line_note *note = &pfile->buffer->notes[pfile->buffer->cur_note];
2604 /* Skip notes before the ". */
2605 while (note->pos < pos)
2606 ++note;
2608 lit_accum accum;
2610 uchar prefix[17];
2611 unsigned prefix_len = 0;
2612 enum Phase
2614 PHASE_PREFIX = -2,
2615 PHASE_NONE = -1,
2616 PHASE_SUFFIX = 0
2617 } phase = PHASE_PREFIX;
2619 for (;;)
2621 gcc_checking_assert (note->pos >= pos);
2623 /* Undo any escaped newlines and trigraphs. */
2624 if (!accum.reading_p () && note->pos == pos)
2625 switch (note->type)
2627 case '\\':
2628 case ' ':
2629 /* Restore backslash followed by newline. */
2630 accum.append (pfile, base, pos - base);
2631 base = pos;
2632 accum.read_begin (pfile);
2633 accum.append (pfile, UC"\\", 1);
2635 after_backslash:
2636 if (note->type == ' ')
2637 /* GNU backslash whitespace newline extension. FIXME
2638 could be any sequence of non-vertical space. When we
2639 can properly restore any such sequence, we should
2640 mark this note as handled so _cpp_process_line_notes
2641 doesn't warn. */
2642 accum.append (pfile, UC" ", 1);
2644 accum.append (pfile, UC"\n", 1);
2645 note++;
2646 break;
2648 case '\n':
2649 /* This can happen for ??/<NEWLINE> when trigraphs are not
2650 being interpretted. */
2651 gcc_checking_assert (!CPP_OPTION (pfile, trigraphs));
2652 note->type = 0;
2653 note++;
2654 break;
2656 default:
2657 gcc_checking_assert (_cpp_trigraph_map[note->type]);
2659 /* Don't warn about this trigraph in
2660 _cpp_process_line_notes, since trigraphs show up as
2661 trigraphs in raw strings. */
2662 uchar type = note->type;
2663 note->type = 0;
2665 if (CPP_OPTION (pfile, trigraphs))
2667 accum.append (pfile, base, pos - base);
2668 base = pos;
2669 accum.read_begin (pfile);
2670 accum.append (pfile, UC"??", 2);
2671 accum.append (pfile, &type, 1);
2673 /* ??/ followed by newline gets two line notes, one for
2674 the trigraph and one for the backslash/newline. */
2675 if (type == '/' && note[1].pos == pos)
2677 note++;
2678 gcc_assert (note->type == '\\' || note->type == ' ');
2679 goto after_backslash;
2681 /* Skip the replacement character. */
2682 base = ++pos;
2685 note++;
2686 break;
2689 /* Now get a char to process. Either from an expanded note, or
2690 from the line buffer. */
2691 bool read_note = accum.reading_p ();
2692 char c = read_note ? accum.read_char () : *pos++;
2694 if (phase == PHASE_PREFIX)
2696 if (c == '(')
2698 /* Done. */
2699 phase = PHASE_NONE;
2700 prefix[prefix_len++] = '"';
2702 else if (prefix_len < 16
2703 /* Prefix chars are any of the basic character set,
2704 [lex.charset] except for '
2705 ()\\\t\v\f\n'. Optimized for a contiguous
2706 alphabet. */
2707 /* Unlike a switch, this collapses down to one or
2708 two shift and bitmask operations on an ASCII
2709 system, with an outlier or two. */
2710 && (('Z' - 'A' == 25
2711 ? ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'))
2712 : ISIDST (c))
2713 || (c >= '0' && c <= '9')
2714 || c == '_' || c == '{' || c == '}'
2715 || c == '[' || c == ']' || c == '#'
2716 || c == '<' || c == '>' || c == '%'
2717 || c == ':' || c == ';' || c == '.' || c == '?'
2718 || c == '*' || c == '+' || c == '-' || c == '/'
2719 || c == '^' || c == '&' || c == '|' || c == '~'
2720 || c == '!' || c == '=' || c == ','
2721 || c == '"' || c == '\''))
2722 prefix[prefix_len++] = c;
2723 else
2725 /* Something is wrong. */
2726 int col = CPP_BUF_COLUMN (pfile->buffer, pos) + read_note;
2727 if (prefix_len == 16)
2728 cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
2729 col, "raw string delimiter longer "
2730 "than 16 characters");
2731 else if (c == '\n')
2732 cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
2733 col, "invalid new-line in raw "
2734 "string delimiter");
2735 else
2736 cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
2737 col, "invalid character '%c' in "
2738 "raw string delimiter", c);
2739 type = CPP_OTHER;
2740 phase = PHASE_NONE;
2741 /* Continue until we get a close quote, that's probably
2742 the best failure mode. */
2743 prefix_len = 0;
2745 if (c != '\n')
2746 continue;
2749 if (phase != PHASE_NONE)
2751 if (prefix[phase] != c)
2752 phase = PHASE_NONE;
2753 else if (unsigned (phase + 1) == prefix_len)
2754 break;
2755 else
2757 phase = Phase (phase + 1);
2758 continue;
2762 if (!prefix_len && c == '"')
2763 /* Failure mode lexing. */
2764 goto out;
2765 else if (prefix_len && c == ')')
2766 phase = PHASE_SUFFIX;
2767 else if (!read_note && c == '\n')
2769 pos--;
2770 pfile->buffer->cur = pos;
2771 if ((pfile->state.in_directive || pfile->state.parsing_args)
2772 && pfile->buffer->next_line >= pfile->buffer->rlimit)
2774 cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, 0,
2775 "unterminated raw string");
2776 type = CPP_OTHER;
2777 goto out;
2780 accum.append (pfile, base, pos - base + 1);
2781 _cpp_process_line_notes (pfile, false);
2783 if (pfile->buffer->next_line < pfile->buffer->rlimit)
2784 CPP_INCREMENT_LINE (pfile, 0);
2785 pfile->buffer->need_line = true;
2787 if (!get_fresh_line_impl<true> (pfile))
2789 /* We ran out of file and failed to get a line. */
2790 location_t src_loc = token->src_loc;
2791 token->type = CPP_EOF;
2792 /* Tell the compiler the line number of the EOF token. */
2793 token->src_loc = pfile->line_table->highest_line;
2794 token->flags = BOL;
2795 if (accum.first)
2796 _cpp_release_buff (pfile, accum.first);
2797 cpp_error_with_line (pfile, CPP_DL_ERROR, src_loc, 0,
2798 "unterminated raw string");
2800 /* Now pop the buffer that get_fresh_line_impl() did not. Popping
2801 is not safe if processing a directive, however this cannot
2802 happen as we already checked above that a line would be
2803 available, and get_fresh_line_impl() can't fail in this
2804 case. */
2805 gcc_assert (!pfile->state.in_directive);
2806 _cpp_pop_buffer (pfile);
2808 return;
2811 pos = base = pfile->buffer->cur;
2812 note = &pfile->buffer->notes[pfile->buffer->cur_note];
2814 else if (__builtin_expect ((unsigned char) c >= utf8_continuation, 0)
2815 && warn_bidi_or_invalid_utf8_p)
2816 pos = _cpp_handle_multibyte_utf8 (pfile, c, pos, warn_bidi_p,
2817 warn_invalid_utf8_p);
2820 if (warn_bidi_p)
2821 maybe_warn_bidi_on_close (pfile, pos);
2823 if (CPP_OPTION (pfile, user_literals))
2825 const uchar *const suffix_begin = pos;
2826 pfile->buffer->cur = pos;
2828 if (const auto sr = scan_cur_identifier (pfile))
2830 if (maybe_ignore_udl_macro_suffix (pfile, token->src_loc,
2831 suffix_begin, sr.node))
2832 pfile->buffer->cur = suffix_begin;
2833 else
2835 type = cpp_userdef_string_add_type (type);
2836 accum.create_literal2 (pfile, token, base, suffix_begin - base,
2837 NODE_NAME (sr.node), NODE_LEN (sr.node),
2838 type);
2839 if (accum.first)
2840 _cpp_release_buff (pfile, accum.first);
2841 warn_about_normalization (pfile, token, &sr.nst, true);
2842 return;
2847 out:
2848 pfile->buffer->cur = pos;
2849 if (!accum.accum)
2850 create_literal (pfile, token, base, pos - base, type);
2851 else
2853 accum.create_literal2 (pfile, token, base, pos - base, nullptr, 0, type);
2854 _cpp_release_buff (pfile, accum.first);
2858 /* Lexes a string, character constant, or angle-bracketed header file
2859 name. The stored string contains the spelling, including opening
2860 quote and any leading 'L', 'u', 'U' or 'u8' and optional
2861 'R' modifier. It returns the type of the literal, or CPP_OTHER
2862 if it was not properly terminated, or CPP_LESS for an unterminated
2863 header name which must be relexed as normal tokens.
2865 The spelling is NUL-terminated, but it is not guaranteed that this
2866 is the first NUL since embedded NULs are preserved. */
2867 static void
2868 lex_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
2870 bool saw_NUL = false;
2871 const uchar *cur;
2872 cppchar_t terminator;
2873 enum cpp_ttype type;
2875 cur = base;
2876 terminator = *cur++;
2877 if (terminator == 'L' || terminator == 'U')
2878 terminator = *cur++;
2879 else if (terminator == 'u')
2881 terminator = *cur++;
2882 if (terminator == '8')
2883 terminator = *cur++;
2885 if (terminator == 'R')
2887 lex_raw_string (pfile, token, base);
2888 return;
2890 if (terminator == '"')
2891 type = (*base == 'L' ? CPP_WSTRING :
2892 *base == 'U' ? CPP_STRING32 :
2893 *base == 'u' ? (base[1] == '8' ? CPP_UTF8STRING : CPP_STRING16)
2894 : CPP_STRING);
2895 else if (terminator == '\'')
2896 type = (*base == 'L' ? CPP_WCHAR :
2897 *base == 'U' ? CPP_CHAR32 :
2898 *base == 'u' ? (base[1] == '8' ? CPP_UTF8CHAR : CPP_CHAR16)
2899 : CPP_CHAR);
2900 else
2901 terminator = '>', type = CPP_HEADER_NAME;
2903 const bool warn_bidi_p = pfile->warn_bidi_p ();
2904 const bool warn_invalid_utf8_p = CPP_OPTION (pfile, cpp_warn_invalid_utf8);
2905 const bool warn_bidi_or_invalid_utf8_p = warn_bidi_p | warn_invalid_utf8_p;
2906 for (;;)
2908 cppchar_t c = *cur++;
2910 /* In #include-style directives, terminators are not escapable. */
2911 if (c == '\\' && !pfile->state.angled_headers && *cur != '\n')
2913 if ((cur[0] == 'u' || cur[0] == 'U' || cur[0] == 'N') && warn_bidi_p)
2915 location_t loc;
2916 bidi::kind kind;
2917 if (cur[0] == 'N')
2918 kind = get_bidi_named (pfile, cur + 1, &loc);
2919 else
2920 kind = get_bidi_ucn (pfile, cur + 1, cur[0] == 'U', &loc);
2921 maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/true, loc);
2923 cur++;
2925 else if (c == terminator)
2927 if (warn_bidi_p)
2928 maybe_warn_bidi_on_close (pfile, cur - 1);
2929 break;
2931 else if (c == '\n')
2933 cur--;
2934 /* Unmatched quotes always yield undefined behavior, but
2935 greedy lexing means that what appears to be an unterminated
2936 header name may actually be a legitimate sequence of tokens. */
2937 if (terminator == '>')
2939 token->type = CPP_LESS;
2940 return;
2942 type = CPP_OTHER;
2943 break;
2945 else if (c == '\0')
2946 saw_NUL = true;
2947 else if (__builtin_expect (c >= utf8_continuation, 0)
2948 && warn_bidi_or_invalid_utf8_p)
2949 cur = _cpp_handle_multibyte_utf8 (pfile, c, cur, warn_bidi_p,
2950 warn_invalid_utf8_p);
2953 if (saw_NUL && !pfile->state.skipping)
2954 cpp_error (pfile, CPP_DL_WARNING,
2955 "null character(s) preserved in literal");
2957 if (type == CPP_OTHER && CPP_OPTION (pfile, lang) != CLK_ASM)
2958 cpp_error (pfile, CPP_DL_PEDWARN, "missing terminating %c character",
2959 (int) terminator);
2961 pfile->buffer->cur = cur;
2962 const uchar *const suffix_begin = cur;
2964 if (CPP_OPTION (pfile, user_literals))
2966 if (const auto sr = scan_cur_identifier (pfile))
2968 if (maybe_ignore_udl_macro_suffix (pfile, token->src_loc,
2969 suffix_begin, sr.node))
2970 pfile->buffer->cur = suffix_begin;
2971 else
2973 /* Grab user defined literal suffix. */
2974 type = cpp_userdef_char_add_type (type);
2975 type = cpp_userdef_string_add_type (type);
2976 create_literal2 (pfile, token, base, suffix_begin - base,
2977 NODE_NAME (sr.node), NODE_LEN (sr.node), type);
2978 warn_about_normalization (pfile, token, &sr.nst, true);
2979 return;
2983 else if (CPP_OPTION (pfile, cpp_warn_cxx11_compat)
2984 && !pfile->state.skipping)
2986 const auto sr = scan_cur_identifier (pfile);
2987 /* Maybe raise a warning, but do not consume the tokens. */
2988 pfile->buffer->cur = suffix_begin;
2989 if (sr && cpp_macro_p (sr.node))
2990 cpp_warning_with_line (pfile, CPP_W_CXX11_COMPAT,
2991 token->src_loc, 0, "C++11 requires a space "
2992 "between string literal and macro");
2995 create_literal (pfile, token, base, cur - base, type);
2998 /* Return the comment table. The client may not make any assumption
2999 about the ordering of the table. */
3000 cpp_comment_table *
3001 cpp_get_comments (cpp_reader *pfile)
3003 return &pfile->comments;
3006 /* Append a comment to the end of the comment table. */
3007 static void
3008 store_comment (cpp_reader *pfile, cpp_token *token)
3010 int len;
3012 if (pfile->comments.allocated == 0)
3014 pfile->comments.allocated = 256;
3015 pfile->comments.entries = (cpp_comment *) xmalloc
3016 (pfile->comments.allocated * sizeof (cpp_comment));
3019 if (pfile->comments.count == pfile->comments.allocated)
3021 pfile->comments.allocated *= 2;
3022 pfile->comments.entries = (cpp_comment *) xrealloc
3023 (pfile->comments.entries,
3024 pfile->comments.allocated * sizeof (cpp_comment));
3027 len = token->val.str.len;
3029 /* Copy comment. Note, token may not be NULL terminated. */
3030 pfile->comments.entries[pfile->comments.count].comment =
3031 (char *) xmalloc (sizeof (char) * (len + 1));
3032 memcpy (pfile->comments.entries[pfile->comments.count].comment,
3033 token->val.str.text, len);
3034 pfile->comments.entries[pfile->comments.count].comment[len] = '\0';
3036 /* Set source location. */
3037 pfile->comments.entries[pfile->comments.count].sloc = token->src_loc;
3039 /* Increment the count of entries in the comment table. */
3040 pfile->comments.count++;
3043 /* The stored comment includes the comment start and any terminator. */
3044 static void
3045 save_comment (cpp_reader *pfile, cpp_token *token, const unsigned char *from,
3046 cppchar_t type)
3048 unsigned char *buffer;
3049 unsigned int len, clen, i;
3051 len = pfile->buffer->cur - from + 1; /* + 1 for the initial '/'. */
3053 /* C++ comments probably (not definitely) have moved past a new
3054 line, which we don't want to save in the comment. */
3055 if (is_vspace (pfile->buffer->cur[-1]))
3056 len--;
3058 /* If we are currently in a directive or in argument parsing, then
3059 we need to store all C++ comments as C comments internally, and
3060 so we need to allocate a little extra space in that case.
3062 Note that the only time we encounter a directive here is
3063 when we are saving comments in a "#define". */
3064 clen = ((pfile->state.in_directive || pfile->state.parsing_args)
3065 && type == '/') ? len + 2 : len;
3067 buffer = _cpp_unaligned_alloc (pfile, clen);
3069 token->type = CPP_COMMENT;
3070 token->val.str.len = clen;
3071 token->val.str.text = buffer;
3073 buffer[0] = '/';
3074 memcpy (buffer + 1, from, len - 1);
3076 /* Finish conversion to a C comment, if necessary. */
3077 if ((pfile->state.in_directive || pfile->state.parsing_args) && type == '/')
3079 buffer[1] = '*';
3080 buffer[clen - 2] = '*';
3081 buffer[clen - 1] = '/';
3082 /* As there can be in a C++ comments illegal sequences for C comments
3083 we need to filter them out. */
3084 for (i = 2; i < (clen - 2); i++)
3085 if (buffer[i] == '/' && (buffer[i - 1] == '*' || buffer[i + 1] == '*'))
3086 buffer[i] = '|';
3089 /* Finally store this comment for use by clients of libcpp. */
3090 store_comment (pfile, token);
3093 /* Returns true if comment at COMMENT_START is a recognized FALLTHROUGH
3094 comment. */
3096 static bool
3097 fallthrough_comment_p (cpp_reader *pfile, const unsigned char *comment_start)
3099 const unsigned char *from = comment_start + 1;
3101 switch (CPP_OPTION (pfile, cpp_warn_implicit_fallthrough))
3103 /* For both -Wimplicit-fallthrough=0 and -Wimplicit-fallthrough=5 we
3104 don't recognize any comments. The latter only checks attributes,
3105 the former doesn't warn. */
3106 case 0:
3107 default:
3108 return false;
3109 /* -Wimplicit-fallthrough=1 considers any comment, no matter what
3110 content it has. */
3111 case 1:
3112 return true;
3113 case 2:
3114 /* -Wimplicit-fallthrough=2 looks for (case insensitive)
3115 .*falls?[ \t-]*thr(u|ough).* regex. */
3116 for (; (size_t) (pfile->buffer->cur - from) >= sizeof "fallthru" - 1;
3117 from++)
3119 /* Is there anything like strpbrk with upper boundary, or
3120 memchr looking for 2 characters rather than just one? */
3121 if (from[0] != 'f' && from[0] != 'F')
3122 continue;
3123 if (from[1] != 'a' && from[1] != 'A')
3124 continue;
3125 if (from[2] != 'l' && from[2] != 'L')
3126 continue;
3127 if (from[3] != 'l' && from[3] != 'L')
3128 continue;
3129 from += sizeof "fall" - 1;
3130 if (from[0] == 's' || from[0] == 'S')
3131 from++;
3132 while (*from == ' ' || *from == '\t' || *from == '-')
3133 from++;
3134 if (from[0] != 't' && from[0] != 'T')
3135 continue;
3136 if (from[1] != 'h' && from[1] != 'H')
3137 continue;
3138 if (from[2] != 'r' && from[2] != 'R')
3139 continue;
3140 if (from[3] == 'u' || from[3] == 'U')
3141 return true;
3142 if (from[3] != 'o' && from[3] != 'O')
3143 continue;
3144 if (from[4] != 'u' && from[4] != 'U')
3145 continue;
3146 if (from[5] != 'g' && from[5] != 'G')
3147 continue;
3148 if (from[6] != 'h' && from[6] != 'H')
3149 continue;
3150 return true;
3152 return false;
3153 case 3:
3154 case 4:
3155 break;
3158 /* Whole comment contents:
3159 -fallthrough
3160 @fallthrough@
3162 if (*from == '-' || *from == '@')
3164 size_t len = sizeof "fallthrough" - 1;
3165 if ((size_t) (pfile->buffer->cur - from - 1) < len)
3166 return false;
3167 if (memcmp (from + 1, "fallthrough", len))
3168 return false;
3169 if (*from == '@')
3171 if (from[len + 1] != '@')
3172 return false;
3173 len++;
3175 from += 1 + len;
3177 /* Whole comment contents (regex):
3178 lint -fallthrough[ \t]*
3180 else if (*from == 'l')
3182 size_t len = sizeof "int -fallthrough" - 1;
3183 if ((size_t) (pfile->buffer->cur - from - 1) < len)
3184 return false;
3185 if (memcmp (from + 1, "int -fallthrough", len))
3186 return false;
3187 from += 1 + len;
3188 while (*from == ' ' || *from == '\t')
3189 from++;
3191 /* Whole comment contents (regex):
3192 [ \t]*FALLTHR(U|OUGH)[ \t]*
3194 else if (CPP_OPTION (pfile, cpp_warn_implicit_fallthrough) == 4)
3196 while (*from == ' ' || *from == '\t')
3197 from++;
3198 if ((size_t) (pfile->buffer->cur - from) < sizeof "FALLTHRU" - 1)
3199 return false;
3200 if (memcmp (from, "FALLTHR", sizeof "FALLTHR" - 1))
3201 return false;
3202 from += sizeof "FALLTHR" - 1;
3203 if (*from == 'U')
3204 from++;
3205 else if ((size_t) (pfile->buffer->cur - from) < sizeof "OUGH" - 1)
3206 return false;
3207 else if (memcmp (from, "OUGH", sizeof "OUGH" - 1))
3208 return false;
3209 else
3210 from += sizeof "OUGH" - 1;
3211 while (*from == ' ' || *from == '\t')
3212 from++;
3214 /* Whole comment contents (regex):
3215 [ \t.!]*(ELSE,? |INTENTIONAL(LY)? )?FALL(S | |-)?THR(OUGH|U)[ \t.!]*(-[^\n\r]*)?
3216 [ \t.!]*(Else,? |Intentional(ly)? )?Fall((s | |-)[Tt]|t)hr(ough|u)[ \t.!]*(-[^\n\r]*)?
3217 [ \t.!]*([Ee]lse,? |[Ii]ntentional(ly)? )?fall(s | |-)?thr(ough|u)[ \t.!]*(-[^\n\r]*)?
3219 else
3221 while (*from == ' ' || *from == '\t' || *from == '.' || *from == '!')
3222 from++;
3223 unsigned char f = *from;
3224 bool all_upper = false;
3225 if (f == 'E' || f == 'e')
3227 if ((size_t) (pfile->buffer->cur - from)
3228 < sizeof "else fallthru" - 1)
3229 return false;
3230 if (f == 'E' && memcmp (from + 1, "LSE", sizeof "LSE" - 1) == 0)
3231 all_upper = true;
3232 else if (memcmp (from + 1, "lse", sizeof "lse" - 1))
3233 return false;
3234 from += sizeof "else" - 1;
3235 if (*from == ',')
3236 from++;
3237 if (*from != ' ')
3238 return false;
3239 from++;
3240 if (all_upper && *from == 'f')
3241 return false;
3242 if (f == 'e' && *from == 'F')
3243 return false;
3244 f = *from;
3246 else if (f == 'I' || f == 'i')
3248 if ((size_t) (pfile->buffer->cur - from)
3249 < sizeof "intentional fallthru" - 1)
3250 return false;
3251 if (f == 'I' && memcmp (from + 1, "NTENTIONAL",
3252 sizeof "NTENTIONAL" - 1) == 0)
3253 all_upper = true;
3254 else if (memcmp (from + 1, "ntentional",
3255 sizeof "ntentional" - 1))
3256 return false;
3257 from += sizeof "intentional" - 1;
3258 if (*from == ' ')
3260 from++;
3261 if (all_upper && *from == 'f')
3262 return false;
3264 else if (all_upper)
3266 if (memcmp (from, "LY F", sizeof "LY F" - 1))
3267 return false;
3268 from += sizeof "LY " - 1;
3270 else
3272 if (memcmp (from, "ly ", sizeof "ly " - 1))
3273 return false;
3274 from += sizeof "ly " - 1;
3276 if (f == 'i' && *from == 'F')
3277 return false;
3278 f = *from;
3280 if (f != 'F' && f != 'f')
3281 return false;
3282 if ((size_t) (pfile->buffer->cur - from) < sizeof "fallthru" - 1)
3283 return false;
3284 if (f == 'F' && memcmp (from + 1, "ALL", sizeof "ALL" - 1) == 0)
3285 all_upper = true;
3286 else if (all_upper)
3287 return false;
3288 else if (memcmp (from + 1, "all", sizeof "all" - 1))
3289 return false;
3290 from += sizeof "fall" - 1;
3291 if (*from == (all_upper ? 'S' : 's') && from[1] == ' ')
3292 from += 2;
3293 else if (*from == ' ' || *from == '-')
3294 from++;
3295 else if (*from != (all_upper ? 'T' : 't'))
3296 return false;
3297 if ((f == 'f' || *from != 'T') && (all_upper || *from != 't'))
3298 return false;
3299 if ((size_t) (pfile->buffer->cur - from) < sizeof "thru" - 1)
3300 return false;
3301 if (memcmp (from + 1, all_upper ? "HRU" : "hru", sizeof "hru" - 1))
3303 if ((size_t) (pfile->buffer->cur - from) < sizeof "through" - 1)
3304 return false;
3305 if (memcmp (from + 1, all_upper ? "HROUGH" : "hrough",
3306 sizeof "hrough" - 1))
3307 return false;
3308 from += sizeof "through" - 1;
3310 else
3311 from += sizeof "thru" - 1;
3312 while (*from == ' ' || *from == '\t' || *from == '.' || *from == '!')
3313 from++;
3314 if (*from == '-')
3316 from++;
3317 if (*comment_start == '*')
3321 while (*from && *from != '*'
3322 && *from != '\n' && *from != '\r')
3323 from++;
3324 if (*from != '*' || from[1] == '/')
3325 break;
3326 from++;
3328 while (1);
3330 else
3331 while (*from && *from != '\n' && *from != '\r')
3332 from++;
3335 /* C block comment. */
3336 if (*comment_start == '*')
3338 if (*from != '*' || from[1] != '/')
3339 return false;
3341 /* C++ line comment. */
3342 else if (*from != '\n')
3343 return false;
3345 return true;
3348 /* Allocate COUNT tokens for RUN. */
3349 void
3350 _cpp_init_tokenrun (tokenrun *run, unsigned int count)
3352 run->base = XNEWVEC (cpp_token, count);
3353 run->limit = run->base + count;
3354 run->next = NULL;
3357 /* Returns the next tokenrun, or creates one if there is none. */
3358 static tokenrun *
3359 next_tokenrun (tokenrun *run)
3361 if (run->next == NULL)
3363 run->next = XNEW (tokenrun);
3364 run->next->prev = run;
3365 _cpp_init_tokenrun (run->next, 250);
3368 return run->next;
3371 /* Return the number of not yet processed token in a given
3372 context. */
3374 _cpp_remaining_tokens_num_in_context (cpp_context *context)
3376 if (context->tokens_kind == TOKENS_KIND_DIRECT)
3377 return (LAST (context).token - FIRST (context).token);
3378 else if (context->tokens_kind == TOKENS_KIND_INDIRECT
3379 || context->tokens_kind == TOKENS_KIND_EXTENDED)
3380 return (LAST (context).ptoken - FIRST (context).ptoken);
3381 else
3382 abort ();
3385 /* Returns the token present at index INDEX in a given context. If
3386 INDEX is zero, the next token to be processed is returned. */
3387 static const cpp_token*
3388 _cpp_token_from_context_at (cpp_context *context, int index)
3390 if (context->tokens_kind == TOKENS_KIND_DIRECT)
3391 return &(FIRST (context).token[index]);
3392 else if (context->tokens_kind == TOKENS_KIND_INDIRECT
3393 || context->tokens_kind == TOKENS_KIND_EXTENDED)
3394 return FIRST (context).ptoken[index];
3395 else
3396 abort ();
3399 /* Look ahead in the input stream. */
3400 const cpp_token *
3401 cpp_peek_token (cpp_reader *pfile, int index)
3403 cpp_context *context = pfile->context;
3404 const cpp_token *peektok;
3405 int count;
3407 /* First, scan through any pending cpp_context objects. */
3408 while (context->prev)
3410 ptrdiff_t sz = _cpp_remaining_tokens_num_in_context (context);
3412 if (index < (int) sz)
3413 return _cpp_token_from_context_at (context, index);
3414 index -= (int) sz;
3415 context = context->prev;
3418 /* We will have to read some new tokens after all (and do so
3419 without invalidating preceding tokens). */
3420 count = index;
3421 pfile->keep_tokens++;
3423 /* For peeked tokens temporarily disable line_change reporting,
3424 until the tokens are parsed for real. */
3425 void (*line_change) (cpp_reader *, const cpp_token *, int)
3426 = pfile->cb.line_change;
3427 pfile->cb.line_change = NULL;
3431 peektok = _cpp_lex_token (pfile);
3432 if (peektok->type == CPP_EOF)
3434 index--;
3435 break;
3437 else if (peektok->type == CPP_PRAGMA)
3439 /* Don't peek past a pragma. */
3440 if (peektok == &pfile->directive_result)
3441 /* Save the pragma in the buffer. */
3442 *pfile->cur_token++ = *peektok;
3443 index--;
3444 break;
3447 while (index--);
3449 _cpp_backup_tokens_direct (pfile, count - index);
3450 pfile->keep_tokens--;
3451 pfile->cb.line_change = line_change;
3453 return peektok;
3456 /* Allocate a single token that is invalidated at the same time as the
3457 rest of the tokens on the line. Has its line and col set to the
3458 same as the last lexed token, so that diagnostics appear in the
3459 right place. */
3460 cpp_token *
3461 _cpp_temp_token (cpp_reader *pfile)
3463 cpp_token *old, *result;
3464 ptrdiff_t sz = pfile->cur_run->limit - pfile->cur_token;
3465 ptrdiff_t la = (ptrdiff_t) pfile->lookaheads;
3467 old = pfile->cur_token - 1;
3468 /* Any pre-existing lookaheads must not be clobbered. */
3469 if (la)
3471 if (sz <= la)
3473 tokenrun *next = next_tokenrun (pfile->cur_run);
3475 if (sz < la)
3476 memmove (next->base + 1, next->base,
3477 (la - sz) * sizeof (cpp_token));
3479 next->base[0] = pfile->cur_run->limit[-1];
3482 if (sz > 1)
3483 memmove (pfile->cur_token + 1, pfile->cur_token,
3484 MIN (la, sz - 1) * sizeof (cpp_token));
3487 if (!sz && pfile->cur_token == pfile->cur_run->limit)
3489 pfile->cur_run = next_tokenrun (pfile->cur_run);
3490 pfile->cur_token = pfile->cur_run->base;
3493 result = pfile->cur_token++;
3494 result->src_loc = old->src_loc;
3495 return result;
3498 /* We're at the beginning of a logical line (so not in
3499 directives-mode) and RESULT is a CPP_NAME with NODE_MODULE set. See
3500 if we should enter deferred_pragma mode to tokenize the rest of the
3501 line as a module control-line. */
3503 static void
3504 cpp_maybe_module_directive (cpp_reader *pfile, cpp_token *result)
3506 unsigned backup = 0; /* Tokens we peeked. */
3507 cpp_hashnode *node = result->val.node.node;
3508 cpp_token *peek = result;
3509 cpp_token *keyword = peek;
3510 cpp_hashnode *(&n_modules)[spec_nodes::M_HWM][2] = pfile->spec_nodes.n_modules;
3511 int header_count = 0;
3513 /* Make sure the incoming state is as we expect it. This way we
3514 can restore it using constants. */
3515 gcc_checking_assert (!pfile->state.in_deferred_pragma
3516 && !pfile->state.skipping
3517 && !pfile->state.parsing_args
3518 && !pfile->state.angled_headers
3519 && (pfile->state.save_comments
3520 == !CPP_OPTION (pfile, discard_comments)));
3522 /* Enter directives mode sufficiently for peeking. We don't have
3523 to actually set in_directive. */
3524 pfile->state.in_deferred_pragma = true;
3526 /* These two fields are needed to process tokenization in deferred
3527 pragma mode. They are not used outside deferred pragma mode or
3528 directives mode. */
3529 pfile->state.pragma_allow_expansion = true;
3530 pfile->directive_line = result->src_loc;
3532 /* Saving comments is incompatible with directives mode. */
3533 pfile->state.save_comments = 0;
3535 if (node == n_modules[spec_nodes::M_EXPORT][0])
3537 peek = _cpp_lex_direct (pfile);
3538 keyword = peek;
3539 backup++;
3540 if (keyword->type != CPP_NAME)
3541 goto not_module;
3542 node = keyword->val.node.node;
3543 if (!(node->flags & NODE_MODULE))
3544 goto not_module;
3547 if (node == n_modules[spec_nodes::M__IMPORT][0])
3548 /* __import */
3549 header_count = backup + 2 + 16;
3550 else if (node == n_modules[spec_nodes::M_IMPORT][0])
3551 /* import */
3552 header_count = backup + 2 + (CPP_OPTION (pfile, preprocessed) ? 16 : 0);
3553 else if (node == n_modules[spec_nodes::M_MODULE][0])
3554 ; /* module */
3555 else
3556 goto not_module;
3558 /* We've seen [export] {module|import|__import}. Check the next token. */
3559 if (header_count)
3560 /* After '{,__}import' a header name may appear. */
3561 pfile->state.angled_headers = true;
3562 peek = _cpp_lex_direct (pfile);
3563 backup++;
3565 /* ... import followed by identifier, ':', '<' or
3566 header-name preprocessing tokens, or module
3567 followed by cpp-identifier, ':' or ';' preprocessing
3568 tokens. C++ keywords are not yet relevant. */
3569 if (peek->type == CPP_NAME
3570 || peek->type == CPP_COLON
3571 || (header_count
3572 ? (peek->type == CPP_LESS
3573 || (peek->type == CPP_STRING && peek->val.str.text[0] != 'R')
3574 || peek->type == CPP_HEADER_NAME)
3575 : peek->type == CPP_SEMICOLON))
3577 pfile->state.pragma_allow_expansion = !CPP_OPTION (pfile, preprocessed);
3578 if (!pfile->state.pragma_allow_expansion)
3579 pfile->state.prevent_expansion++;
3581 if (!header_count && linemap_included_from
3582 (LINEMAPS_LAST_ORDINARY_MAP (pfile->line_table)))
3583 cpp_error_with_line (pfile, CPP_DL_ERROR, keyword->src_loc, 0,
3584 "module control-line cannot be in included file");
3586 /* The first one or two tokens cannot be macro names. */
3587 for (int ix = backup; ix--;)
3589 cpp_token *tok = ix ? keyword : result;
3590 cpp_hashnode *node = tok->val.node.node;
3592 /* Don't attempt to expand the token. */
3593 tok->flags |= NO_EXPAND;
3594 if (_cpp_defined_macro_p (node)
3595 && _cpp_maybe_notify_macro_use (pfile, node, tok->src_loc)
3596 && !cpp_fun_like_macro_p (node))
3597 cpp_error_with_line (pfile, CPP_DL_ERROR, tok->src_loc, 0,
3598 "module control-line \"%s\" cannot be"
3599 " an object-like macro",
3600 NODE_NAME (node));
3603 /* Map to underbar variants. */
3604 keyword->val.node.node = n_modules[header_count
3605 ? spec_nodes::M_IMPORT
3606 : spec_nodes::M_MODULE][1];
3607 if (backup != 1)
3608 result->val.node.node = n_modules[spec_nodes::M_EXPORT][1];
3610 /* Maybe tell the tokenizer we expect a header-name down the
3611 road. */
3612 pfile->state.directive_file_token = header_count;
3614 else
3616 not_module:
3617 /* Drop out of directive mode. */
3618 /* We aaserted save_comments had this value upon entry. */
3619 pfile->state.save_comments
3620 = !CPP_OPTION (pfile, discard_comments);
3621 pfile->state.in_deferred_pragma = false;
3622 /* Do not let this remain on. */
3623 pfile->state.angled_headers = false;
3626 /* In either case we want to backup the peeked tokens. */
3627 if (backup)
3629 /* If we saw EOL, we should drop it, because this isn't a module
3630 control-line after all. */
3631 bool eol = peek->type == CPP_PRAGMA_EOL;
3632 if (!eol || backup > 1)
3634 /* Put put the peeked tokens back */
3635 _cpp_backup_tokens_direct (pfile, backup);
3636 /* But if the last one was an EOL, forget it. */
3637 if (eol)
3638 pfile->lookaheads--;
3643 /* Lex a token into RESULT (external interface). Takes care of issues
3644 like directive handling, token lookahead, multiple include
3645 optimization and skipping. */
3646 const cpp_token *
3647 _cpp_lex_token (cpp_reader *pfile)
3649 cpp_token *result;
3651 for (;;)
3653 if (pfile->cur_token == pfile->cur_run->limit)
3655 pfile->cur_run = next_tokenrun (pfile->cur_run);
3656 pfile->cur_token = pfile->cur_run->base;
3658 /* We assume that the current token is somewhere in the current
3659 run. */
3660 if (pfile->cur_token < pfile->cur_run->base
3661 || pfile->cur_token >= pfile->cur_run->limit)
3662 abort ();
3664 if (pfile->lookaheads)
3666 pfile->lookaheads--;
3667 result = pfile->cur_token++;
3669 else
3670 result = _cpp_lex_direct (pfile);
3672 if (result->flags & BOL)
3674 /* Is this a directive. If _cpp_handle_directive returns
3675 false, it is an assembler #. */
3676 if (result->type == CPP_HASH
3677 /* 6.10.3 p 11: Directives in a list of macro arguments
3678 gives undefined behavior. This implementation
3679 handles the directive as normal. */
3680 && pfile->state.parsing_args != 1)
3682 if (_cpp_handle_directive (pfile, result->flags & PREV_WHITE))
3684 if (pfile->directive_result.type == CPP_PADDING)
3685 continue;
3686 result = &pfile->directive_result;
3689 else if (pfile->state.in_deferred_pragma)
3690 result = &pfile->directive_result;
3691 else if (result->type == CPP_NAME
3692 && (result->val.node.node->flags & NODE_MODULE)
3693 && !pfile->state.skipping
3694 /* Unlike regular directives, we do not deal with
3695 tokenizing module directives as macro arguments.
3696 That's not permitted. */
3697 && !pfile->state.parsing_args)
3699 /* P1857. Before macro expansion, At start of logical
3700 line ... */
3701 /* We don't have to consider lookaheads at this point. */
3702 gcc_checking_assert (!pfile->lookaheads);
3704 cpp_maybe_module_directive (pfile, result);
3707 if (pfile->cb.line_change && !pfile->state.skipping)
3708 pfile->cb.line_change (pfile, result, pfile->state.parsing_args);
3711 /* We don't skip tokens in directives. */
3712 if (pfile->state.in_directive || pfile->state.in_deferred_pragma)
3713 break;
3715 /* Outside a directive, invalidate controlling macros. At file
3716 EOF, _cpp_lex_direct takes care of popping the buffer, so we never
3717 get here and MI optimization works. */
3718 pfile->mi_valid = false;
3720 if (!pfile->state.skipping || result->type == CPP_EOF)
3721 break;
3724 return result;
3727 /* Returns true if a fresh line has been loaded. */
3728 template <bool lexing_raw_string>
3729 static bool
3730 get_fresh_line_impl (cpp_reader *pfile)
3732 /* We can't get a new line until we leave the current directive, unless we
3733 are lexing a raw string, in which case it will be OK as long as we don't
3734 pop the current buffer. */
3735 if (!lexing_raw_string && pfile->state.in_directive)
3736 return false;
3738 for (;;)
3740 cpp_buffer *buffer = pfile->buffer;
3742 if (!buffer->need_line)
3743 return true;
3745 if (buffer->next_line < buffer->rlimit)
3747 _cpp_clean_line (pfile);
3748 return true;
3751 /* We can't change buffers until we leave the current directive. */
3752 if (lexing_raw_string && pfile->state.in_directive)
3753 return false;
3755 /* First, get out of parsing arguments state. */
3756 if (pfile->state.parsing_args)
3757 return false;
3759 /* End of buffer. Non-empty files should end in a newline. */
3760 if (buffer->buf != buffer->rlimit
3761 && buffer->next_line > buffer->rlimit
3762 && !buffer->from_stage3)
3764 /* Clip to buffer size. */
3765 buffer->next_line = buffer->rlimit;
3768 if (buffer->prev && !buffer->return_at_eof)
3769 _cpp_pop_buffer (pfile);
3770 else
3772 /* End of translation. Do not pop the buffer yet. Increment
3773 line number so that the EOF token is on a line of its own
3774 (_cpp_lex_direct doesn't increment in that case, because
3775 it's hard for it to distinguish this special case). */
3776 CPP_INCREMENT_LINE (pfile, 0);
3777 return false;
3782 bool
3783 _cpp_get_fresh_line (cpp_reader *pfile)
3785 return get_fresh_line_impl<false> (pfile);
/* Set result->type to THEN_TYPE and consume one character if the next
   character in BUFFER is CHAR; otherwise set result->type to
   ELSE_TYPE and consume nothing.  Expects variables named `result'
   and `buffer' to be in scope at the point of use.  */
#define IF_NEXT_IS(CHAR, THEN_TYPE, ELSE_TYPE)          \
  do                                                    \
    {                                                   \
      result->type = (ELSE_TYPE);                       \
      if (*buffer->cur == (CHAR))                       \
        buffer->cur++, result->type = (THEN_TYPE);      \
    }                                                   \
  while (0)

3798 /* Lex a token into pfile->cur_token, which is also incremented, to
3799 get diagnostics pointing to the correct location.
3801 Does not handle issues such as token lookahead, multiple-include
3802 optimization, directives, skipping etc. This function is only
3803 suitable for use by _cpp_lex_token, and in special cases like
3804 lex_expansion_token which doesn't care for any of these issues.
3806 When meeting a newline, returns CPP_EOF if parsing a directive,
3807 otherwise returns to the start of the token buffer if permissible.
3808 Returns the location of the lexed token. */
3809 cpp_token *
3810 _cpp_lex_direct (cpp_reader *pfile)
3812 cppchar_t c = 0;
3813 cpp_buffer *buffer;
3814 const unsigned char *comment_start;
3815 bool fallthrough_comment = false;
3816 cpp_token *result = pfile->cur_token++;
3818 fresh_line:
3819 result->flags = 0;
3820 buffer = pfile->buffer;
3821 if (buffer->need_line)
3823 if (pfile->state.in_deferred_pragma)
3825 /* This can happen in cases like:
3826 #define loop(x) whatever
3827 #pragma omp loop
3828 where when trying to expand loop we need to peek
3829 next token after loop, but aren't still in_deferred_pragma
3830 mode but are in in_directive mode, so buffer->need_line
3831 is set, a CPP_EOF is peeked. */
3832 result->type = CPP_PRAGMA_EOL;
3833 pfile->state.in_deferred_pragma = false;
3834 if (!pfile->state.pragma_allow_expansion)
3835 pfile->state.prevent_expansion--;
3836 result->src_loc = pfile->line_table->highest_line;
3837 return result;
3839 if (!_cpp_get_fresh_line (pfile))
3841 result->type = CPP_EOF;
3842 /* Not a real EOF in a directive or arg parsing -- we refuse
3843 to advance to the next file now, and will once we're out
3844 of those modes. */
3845 if (!pfile->state.in_directive && !pfile->state.parsing_args)
3847 /* Tell the compiler the line number of the EOF token. */
3848 result->src_loc = pfile->line_table->highest_line;
3849 result->flags = BOL;
3850 /* Now pop the buffer that _cpp_get_fresh_line did not. */
3851 _cpp_pop_buffer (pfile);
3853 else if (c == 0)
3854 result->src_loc = pfile->line_table->highest_line;
3855 return result;
3857 if (buffer != pfile->buffer)
3858 fallthrough_comment = false;
3859 if (!pfile->keep_tokens)
3861 pfile->cur_run = &pfile->base_run;
3862 result = pfile->base_run.base;
3863 pfile->cur_token = result + 1;
3865 result->flags = BOL;
3866 if (pfile->state.parsing_args == 2)
3867 result->flags |= PREV_WHITE;
3869 buffer = pfile->buffer;
3870 update_tokens_line:
3871 result->src_loc = pfile->line_table->highest_line;
3873 skipped_white:
3874 if (buffer->cur >= buffer->notes[buffer->cur_note].pos
3875 && !pfile->overlaid_buffer)
3877 _cpp_process_line_notes (pfile, false);
3878 result->src_loc = pfile->line_table->highest_line;
3880 c = *buffer->cur++;
3882 if (pfile->forced_token_location)
3883 result->src_loc = pfile->forced_token_location;
3884 else
3885 result->src_loc = linemap_position_for_column (pfile->line_table,
3886 CPP_BUF_COLUMN (buffer, buffer->cur));
3888 switch (c)
3890 case ' ': case '\t': case '\f': case '\v': case '\0':
3891 result->flags |= PREV_WHITE;
3892 skip_whitespace (pfile, c);
3893 goto skipped_white;
3895 case '\n':
3896 /* Increment the line, unless this is the last line ... */
3897 if (buffer->cur < buffer->rlimit
3898 /* ... or this is a #include, (where _cpp_stack_file needs to
3899 unwind by one line) ... */
3900 || (pfile->state.in_directive > 1
3901 /* ... except traditional-cpp increments this elsewhere. */
3902 && !CPP_OPTION (pfile, traditional)))
3903 CPP_INCREMENT_LINE (pfile, 0);
3904 buffer->need_line = true;
3905 if (pfile->state.in_deferred_pragma)
3907 /* Produce the PRAGMA_EOL on this line. File reading
3908 ensures there is always a \n at end of the buffer, thus
3909 in a deferred pragma we always see CPP_PRAGMA_EOL before
3910 any CPP_EOF. */
3911 result->type = CPP_PRAGMA_EOL;
3912 result->flags &= ~PREV_WHITE;
3913 pfile->state.in_deferred_pragma = false;
3914 if (!pfile->state.pragma_allow_expansion)
3915 pfile->state.prevent_expansion--;
3916 return result;
3918 goto fresh_line;
3920 case '0': case '1': case '2': case '3': case '4':
3921 case '5': case '6': case '7': case '8': case '9':
3923 struct normalize_state nst = INITIAL_NORMALIZE_STATE;
3924 result->type = CPP_NUMBER;
3925 lex_number (pfile, &result->val.str, &nst);
3926 warn_about_normalization (pfile, result, &nst, false);
3927 break;
3930 case 'L':
3931 case 'u':
3932 case 'U':
3933 case 'R':
3934 /* 'L', 'u', 'U', 'u8' or 'R' may introduce wide characters,
3935 wide strings or raw strings. */
3936 if (c == 'L' || CPP_OPTION (pfile, rliterals)
3937 || (c != 'R' && CPP_OPTION (pfile, uliterals)))
3939 if ((*buffer->cur == '\'' && c != 'R')
3940 || *buffer->cur == '"'
3941 || (*buffer->cur == 'R'
3942 && c != 'R'
3943 && buffer->cur[1] == '"'
3944 && CPP_OPTION (pfile, rliterals))
3945 || (*buffer->cur == '8'
3946 && c == 'u'
3947 && ((buffer->cur[1] == '"' || (buffer->cur[1] == '\''
3948 && CPP_OPTION (pfile, utf8_char_literals)))
3949 || (buffer->cur[1] == 'R' && buffer->cur[2] == '"'
3950 && CPP_OPTION (pfile, rliterals)))))
3952 lex_string (pfile, result, buffer->cur - 1);
3953 break;
3956 /* Fall through. */
3958 case '_':
3959 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
3960 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
3961 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
3962 case 's': case 't': case 'v': case 'w': case 'x':
3963 case 'y': case 'z':
3964 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
3965 case 'G': case 'H': case 'I': case 'J': case 'K':
3966 case 'M': case 'N': case 'O': case 'P': case 'Q':
3967 case 'S': case 'T': case 'V': case 'W': case 'X':
3968 case 'Y': case 'Z':
3969 result->type = CPP_NAME;
3971 struct normalize_state nst = INITIAL_NORMALIZE_STATE;
3972 const auto node = lex_identifier (pfile, buffer->cur - 1, false, &nst,
3973 &result->val.node.spelling);
3974 result->val.node.node = node;
3975 identifier_diagnostics_on_lex (pfile, node);
3976 warn_about_normalization (pfile, result, &nst, true);
3979 /* Convert named operators to their proper types. */
3980 if (result->val.node.node->flags & NODE_OPERATOR)
3982 result->flags |= NAMED_OP;
3983 result->type = (enum cpp_ttype) result->val.node.node->directive_index;
3986 /* Signal FALLTHROUGH comment followed by another token. */
3987 if (fallthrough_comment)
3988 result->flags |= PREV_FALLTHROUGH;
3989 break;
3991 case '\'':
3992 case '"':
3993 lex_string (pfile, result, buffer->cur - 1);
3994 break;
3996 case '/':
3997 /* A potential block or line comment. */
3998 comment_start = buffer->cur;
3999 c = *buffer->cur;
4001 if (c == '*')
4003 if (_cpp_skip_block_comment (pfile))
4004 cpp_error (pfile, CPP_DL_ERROR, "unterminated comment");
4006 else if (c == '/' && ! CPP_OPTION (pfile, traditional))
4008 /* Don't warn for system headers. */
4009 if (_cpp_in_system_header (pfile))
4011 /* Warn about comments if pedantically GNUC89, and not
4012 in system headers. */
4013 else if (CPP_OPTION (pfile, lang) == CLK_GNUC89
4014 && CPP_PEDANTIC (pfile)
4015 && ! buffer->warned_cplusplus_comments)
4017 if (cpp_error (pfile, CPP_DL_PEDWARN,
4018 "C++ style comments are not allowed in ISO C90"))
4019 cpp_error (pfile, CPP_DL_NOTE,
4020 "(this will be reported only once per input file)");
4021 buffer->warned_cplusplus_comments = 1;
4023 /* Or if specifically desired via -Wc90-c99-compat. */
4024 else if (CPP_OPTION (pfile, cpp_warn_c90_c99_compat) > 0
4025 && ! CPP_OPTION (pfile, cplusplus)
4026 && ! buffer->warned_cplusplus_comments)
4028 if (cpp_error (pfile, CPP_DL_WARNING,
4029 "C++ style comments are incompatible with C90"))
4030 cpp_error (pfile, CPP_DL_NOTE,
4031 "(this will be reported only once per input file)");
4032 buffer->warned_cplusplus_comments = 1;
4034 /* In C89/C94, C++ style comments are forbidden. */
4035 else if ((CPP_OPTION (pfile, lang) == CLK_STDC89
4036 || CPP_OPTION (pfile, lang) == CLK_STDC94))
4038 /* But don't be confused about valid code such as
4039 - // immediately followed by *,
4040 - // in a preprocessing directive,
4041 - // in an #if 0 block. */
4042 if (buffer->cur[1] == '*'
4043 || pfile->state.in_directive
4044 || pfile->state.skipping)
4046 result->type = CPP_DIV;
4047 break;
4049 else if (! buffer->warned_cplusplus_comments)
4051 if (cpp_error (pfile, CPP_DL_ERROR,
4052 "C++ style comments are not allowed in "
4053 "ISO C90"))
4054 cpp_error (pfile, CPP_DL_NOTE,
4055 "(this will be reported only once per input "
4056 "file)");
4057 buffer->warned_cplusplus_comments = 1;
4060 if (skip_line_comment (pfile) && CPP_OPTION (pfile, warn_comments))
4061 cpp_warning (pfile, CPP_W_COMMENTS, "multi-line comment");
4063 else if (c == '=')
4065 buffer->cur++;
4066 result->type = CPP_DIV_EQ;
4067 break;
4069 else
4071 result->type = CPP_DIV;
4072 break;
4075 if (fallthrough_comment_p (pfile, comment_start))
4076 fallthrough_comment = true;
4078 if (pfile->cb.comment)
4080 size_t len = pfile->buffer->cur - comment_start;
4081 pfile->cb.comment (pfile, result->src_loc, comment_start - 1,
4082 len + 1);
4085 if (!pfile->state.save_comments)
4087 result->flags |= PREV_WHITE;
4088 goto update_tokens_line;
4091 if (fallthrough_comment)
4092 result->flags |= PREV_FALLTHROUGH;
4094 /* Save the comment as a token in its own right. */
4095 save_comment (pfile, result, comment_start, c);
4096 break;
4098 case '<':
4099 if (pfile->state.angled_headers)
4101 lex_string (pfile, result, buffer->cur - 1);
4102 if (result->type != CPP_LESS)
4103 break;
4106 result->type = CPP_LESS;
4107 if (*buffer->cur == '=')
4109 buffer->cur++, result->type = CPP_LESS_EQ;
4110 if (*buffer->cur == '>'
4111 && CPP_OPTION (pfile, cplusplus)
4112 && CPP_OPTION (pfile, lang) >= CLK_GNUCXX20)
4113 buffer->cur++, result->type = CPP_SPACESHIP;
4115 else if (*buffer->cur == '<')
4117 buffer->cur++;
4118 IF_NEXT_IS ('=', CPP_LSHIFT_EQ, CPP_LSHIFT);
4120 else if (CPP_OPTION (pfile, digraphs))
4122 if (*buffer->cur == ':')
4124 /* C++11 [2.5/3 lex.pptoken], "Otherwise, if the next
4125 three characters are <:: and the subsequent character
4126 is neither : nor >, the < is treated as a preprocessor
4127 token by itself". */
4128 if (CPP_OPTION (pfile, cplusplus)
4129 && CPP_OPTION (pfile, lang) != CLK_CXX98
4130 && CPP_OPTION (pfile, lang) != CLK_GNUCXX
4131 && buffer->cur[1] == ':'
4132 && buffer->cur[2] != ':' && buffer->cur[2] != '>')
4133 break;
4135 buffer->cur++;
4136 result->flags |= DIGRAPH;
4137 result->type = CPP_OPEN_SQUARE;
4139 else if (*buffer->cur == '%')
4141 buffer->cur++;
4142 result->flags |= DIGRAPH;
4143 result->type = CPP_OPEN_BRACE;
4146 break;
4148 case '>':
4149 result->type = CPP_GREATER;
4150 if (*buffer->cur == '=')
4151 buffer->cur++, result->type = CPP_GREATER_EQ;
4152 else if (*buffer->cur == '>')
4154 buffer->cur++;
4155 IF_NEXT_IS ('=', CPP_RSHIFT_EQ, CPP_RSHIFT);
4157 break;
4159 case '%':
4160 result->type = CPP_MOD;
4161 if (*buffer->cur == '=')
4162 buffer->cur++, result->type = CPP_MOD_EQ;
4163 else if (CPP_OPTION (pfile, digraphs))
4165 if (*buffer->cur == ':')
4167 buffer->cur++;
4168 result->flags |= DIGRAPH;
4169 result->type = CPP_HASH;
4170 if (*buffer->cur == '%' && buffer->cur[1] == ':')
4171 buffer->cur += 2, result->type = CPP_PASTE, result->val.token_no = 0;
4173 else if (*buffer->cur == '>')
4175 buffer->cur++;
4176 result->flags |= DIGRAPH;
4177 result->type = CPP_CLOSE_BRACE;
4180 break;
4182 case '.':
4183 result->type = CPP_DOT;
4184 if (ISDIGIT (*buffer->cur))
4186 struct normalize_state nst = INITIAL_NORMALIZE_STATE;
4187 result->type = CPP_NUMBER;
4188 lex_number (pfile, &result->val.str, &nst);
4189 warn_about_normalization (pfile, result, &nst, false);
4191 else if (*buffer->cur == '.' && buffer->cur[1] == '.')
4192 buffer->cur += 2, result->type = CPP_ELLIPSIS;
4193 else if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
4194 buffer->cur++, result->type = CPP_DOT_STAR;
4195 break;
4197 case '+':
4198 result->type = CPP_PLUS;
4199 if (*buffer->cur == '+')
4200 buffer->cur++, result->type = CPP_PLUS_PLUS;
4201 else if (*buffer->cur == '=')
4202 buffer->cur++, result->type = CPP_PLUS_EQ;
4203 break;
4205 case '-':
4206 result->type = CPP_MINUS;
4207 if (*buffer->cur == '>')
4209 buffer->cur++;
4210 result->type = CPP_DEREF;
4211 if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
4212 buffer->cur++, result->type = CPP_DEREF_STAR;
4214 else if (*buffer->cur == '-')
4215 buffer->cur++, result->type = CPP_MINUS_MINUS;
4216 else if (*buffer->cur == '=')
4217 buffer->cur++, result->type = CPP_MINUS_EQ;
4218 break;
4220 case '&':
4221 result->type = CPP_AND;
4222 if (*buffer->cur == '&')
4223 buffer->cur++, result->type = CPP_AND_AND;
4224 else if (*buffer->cur == '=')
4225 buffer->cur++, result->type = CPP_AND_EQ;
4226 break;
4228 case '|':
4229 result->type = CPP_OR;
4230 if (*buffer->cur == '|')
4231 buffer->cur++, result->type = CPP_OR_OR;
4232 else if (*buffer->cur == '=')
4233 buffer->cur++, result->type = CPP_OR_EQ;
4234 break;
4236 case ':':
4237 result->type = CPP_COLON;
4238 if (*buffer->cur == ':' && CPP_OPTION (pfile, scope))
4239 buffer->cur++, result->type = CPP_SCOPE;
4240 else if (*buffer->cur == '>' && CPP_OPTION (pfile, digraphs))
4242 buffer->cur++;
4243 result->flags |= DIGRAPH;
4244 result->type = CPP_CLOSE_SQUARE;
4246 break;
4248 case '*': IF_NEXT_IS ('=', CPP_MULT_EQ, CPP_MULT); break;
4249 case '=': IF_NEXT_IS ('=', CPP_EQ_EQ, CPP_EQ); break;
4250 case '!': IF_NEXT_IS ('=', CPP_NOT_EQ, CPP_NOT); break;
4251 case '^': IF_NEXT_IS ('=', CPP_XOR_EQ, CPP_XOR); break;
4252 case '#': IF_NEXT_IS ('#', CPP_PASTE, CPP_HASH); result->val.token_no = 0; break;
4254 case '?': result->type = CPP_QUERY; break;
4255 case '~': result->type = CPP_COMPL; break;
4256 case ',': result->type = CPP_COMMA; break;
4257 case '(': result->type = CPP_OPEN_PAREN; break;
4258 case ')': result->type = CPP_CLOSE_PAREN; break;
4259 case '[': result->type = CPP_OPEN_SQUARE; break;
4260 case ']': result->type = CPP_CLOSE_SQUARE; break;
4261 case '{': result->type = CPP_OPEN_BRACE; break;
4262 case '}': result->type = CPP_CLOSE_BRACE; break;
4263 case ';': result->type = CPP_SEMICOLON; break;
4265 /* @ is a punctuator in Objective-C. */
4266 case '@': result->type = CPP_ATSIGN; break;
4268 default:
4270 const uchar *base = --buffer->cur;
4271 static int no_warn_cnt;
4273 /* Check for an extended identifier ($ or UCN or UTF-8). */
4274 struct normalize_state nst = INITIAL_NORMALIZE_STATE;
4275 if (forms_identifier_p (pfile, true, &nst))
4277 result->type = CPP_NAME;
4278 const auto node = lex_identifier (pfile, base, true, &nst,
4279 &result->val.node.spelling);
4280 result->val.node.node = node;
4281 identifier_diagnostics_on_lex (pfile, node);
4282 warn_about_normalization (pfile, result, &nst, true);
4283 break;
4286 /* Otherwise this will form a CPP_OTHER token. Parse valid UTF-8 as a
4287 single token. */
4288 buffer->cur++;
4289 if (c >= utf8_signifier)
4291 const uchar *pstr = base;
4292 cppchar_t s;
4293 if (_cpp_valid_utf8 (pfile, &pstr, buffer->rlimit, 0, NULL, &s))
4295 if (s > UCS_LIMIT && CPP_OPTION (pfile, cpp_warn_invalid_utf8))
4297 buffer->cur = base;
4298 _cpp_warn_invalid_utf8 (pfile);
4300 buffer->cur = pstr;
4302 else if (CPP_OPTION (pfile, cpp_warn_invalid_utf8))
4304 buffer->cur = base;
4305 const uchar *end = _cpp_warn_invalid_utf8 (pfile);
4306 buffer->cur = base + 1;
4307 no_warn_cnt = end - buffer->cur;
4310 else if (c >= utf8_continuation
4311 && CPP_OPTION (pfile, cpp_warn_invalid_utf8))
4313 if (no_warn_cnt)
4314 --no_warn_cnt;
4315 else
4317 buffer->cur = base;
4318 _cpp_warn_invalid_utf8 (pfile);
4319 buffer->cur = base + 1;
4322 create_literal (pfile, result, base, buffer->cur - base, CPP_OTHER);
4323 break;
4328 /* Potentially convert the location of the token to a range. */
4329 if (result->src_loc >= RESERVED_LOCATION_COUNT
4330 && result->type != CPP_EOF)
4332 /* Ensure that any line notes are processed, so that we have the
4333 correct physical line/column for the end-point of the token even
4334 when a logical line is split via one or more backslashes. */
4335 if (buffer->cur >= buffer->notes[buffer->cur_note].pos
4336 && !pfile->overlaid_buffer)
4337 _cpp_process_line_notes (pfile, false);
4339 source_range tok_range;
4340 tok_range.m_start = result->src_loc;
4341 tok_range.m_finish
4342 = linemap_position_for_column (pfile->line_table,
4343 CPP_BUF_COLUMN (buffer, buffer->cur));
4345 result->src_loc
4346 = pfile->line_table->get_or_create_combined_loc (result->src_loc,
4347 tok_range, nullptr, 0);
4350 return result;
4353 /* An upper bound on the number of bytes needed to spell TOKEN.
4354 Does not include preceding whitespace. */
4355 unsigned int
4356 cpp_token_len (const cpp_token *token)
4358 unsigned int len;
4360 switch (TOKEN_SPELL (token))
4362 default: len = 6; break;
4363 case SPELL_LITERAL: len = token->val.str.len; break;
4364 case SPELL_IDENT: len = NODE_LEN (token->val.node.node) * 10; break;
4367 return len;
/* Parse UTF-8 out of NAME and place a \U escape in BUFFER.
   Return the number of bytes read out of NAME.  (There are always
   10 bytes written to BUFFER.)

   NAME must point at the lead byte of a multi-byte sequence.  The
   escape is written as a backslash, 'U' and eight lower-case hex
   digits, with no terminating NUL.  Aborts if a trailing byte is not
   of the form 10xxxxxx.  */

static size_t
utf8_to_ucn (unsigned char *buffer, const unsigned char *name)
{
  int j;
  int ucn_len = 0;
  int ucn_len_c;
  unsigned t;
  unsigned long utf32;

  /* Compute the length of the UTF-8 sequence.  */
  for (t = *name; t & 0x80; t <<= 1)
    ucn_len++;

  /* The lead byte contributes the bits below its length marker; each
     following byte contributes six more.  */
  utf32 = *name & (0x7F >> ucn_len);
  for (ucn_len_c = 1; ucn_len_c < ucn_len; ucn_len_c++)
    {
      utf32 = (utf32 << 6) | (*++name & 0x3F);

      /* Ill-formed UTF-8.  */
      if ((*name & ~0x3F) != 0x80)
        abort ();
    }

  *buffer++ = '\\';
  *buffer++ = 'U';
  for (j = 7; j >= 0; j--)
    *buffer++ = "0123456789abcdef"[(utf32 >> (4 * j)) & 0xF];
  return ucn_len;
}

4404 /* Given a token TYPE corresponding to a digraph, return a pointer to
4405 the spelling of the digraph. */
4406 static const unsigned char *
4407 cpp_digraph2name (enum cpp_ttype type)
4409 return digraph_spellings[(int) type - (int) CPP_FIRST_DIGRAPH];
4412 /* Write the spelling of an identifier IDENT, using UCNs, to BUFFER.
4413 The buffer must already contain enough space to hold the
4414 token's spelling. Returns a pointer to the character after the
4415 last character written. */
4416 unsigned char *
4417 _cpp_spell_ident_ucns (unsigned char *buffer, cpp_hashnode *ident)
4419 size_t i;
4420 const unsigned char *name = NODE_NAME (ident);
4422 for (i = 0; i < NODE_LEN (ident); i++)
4423 if (name[i] & ~0x7F)
4425 i += utf8_to_ucn (buffer, name + i) - 1;
4426 buffer += 10;
4428 else
4429 *buffer++ = name[i];
4431 return buffer;
4434 /* Write the spelling of a token TOKEN to BUFFER. The buffer must
4435 already contain enough space to hold the token's spelling.
4436 Returns a pointer to the character after the last character written.
4437 FORSTRING is true if this is to be the spelling after translation
4438 phase 1 (with the original spelling of extended identifiers), false
4439 if extended identifiers should always be written using UCNs (there is
4440 no option for always writing them in the internal UTF-8 form).
4441 FIXME: Would be nice if we didn't need the PFILE argument. */
4442 unsigned char *
4443 cpp_spell_token (cpp_reader *pfile, const cpp_token *token,
4444 unsigned char *buffer, bool forstring)
4446 switch (TOKEN_SPELL (token))
4448 case SPELL_OPERATOR:
4450 const unsigned char *spelling;
4451 unsigned char c;
4453 if (token->flags & DIGRAPH)
4454 spelling = cpp_digraph2name (token->type);
4455 else if (token->flags & NAMED_OP)
4456 goto spell_ident;
4457 else
4458 spelling = TOKEN_NAME (token);
4460 while ((c = *spelling++) != '\0')
4461 *buffer++ = c;
4463 break;
4465 spell_ident:
4466 case SPELL_IDENT:
4467 if (forstring)
4469 memcpy (buffer, NODE_NAME (token->val.node.spelling),
4470 NODE_LEN (token->val.node.spelling));
4471 buffer += NODE_LEN (token->val.node.spelling);
4473 else
4474 buffer = _cpp_spell_ident_ucns (buffer, token->val.node.node);
4475 break;
4477 case SPELL_LITERAL:
4478 memcpy (buffer, token->val.str.text, token->val.str.len);
4479 buffer += token->val.str.len;
4480 break;
4482 case SPELL_NONE:
4483 cpp_error (pfile, CPP_DL_ICE,
4484 "unspellable token %s", TOKEN_NAME (token));
4485 break;
4488 return buffer;
4491 /* Returns TOKEN spelt as a null-terminated string. The string is
4492 freed when the reader is destroyed. Useful for diagnostics. */
4493 unsigned char *
4494 cpp_token_as_text (cpp_reader *pfile, const cpp_token *token)
4496 unsigned int len = cpp_token_len (token) + 1;
4497 unsigned char *start = _cpp_unaligned_alloc (pfile, len), *end;
4499 end = cpp_spell_token (pfile, token, start, false);
4500 end[0] = '\0';
4502 return start;
4505 /* Returns a pointer to a string which spells the token defined by
4506 TYPE and FLAGS. Used by C front ends, which really should move to
4507 using cpp_token_as_text. */
4508 const char *
4509 cpp_type2name (enum cpp_ttype type, unsigned char flags)
4511 if (flags & DIGRAPH)
4512 return (const char *) cpp_digraph2name (type);
4513 else if (flags & NAMED_OP)
4514 return cpp_named_operator2name (type);
4516 return (const char *) token_spellings[type].name;
4519 /* Writes the spelling of token to FP, without any preceding space.
4520 Separated from cpp_spell_token for efficiency - to avoid stdio
4521 double-buffering. */
4522 void
4523 cpp_output_token (const cpp_token *token, FILE *fp)
4525 switch (TOKEN_SPELL (token))
4527 case SPELL_OPERATOR:
4529 const unsigned char *spelling;
4530 int c;
4532 if (token->flags & DIGRAPH)
4533 spelling = cpp_digraph2name (token->type);
4534 else if (token->flags & NAMED_OP)
4535 goto spell_ident;
4536 else
4537 spelling = TOKEN_NAME (token);
4539 c = *spelling;
4541 putc (c, fp);
4542 while ((c = *++spelling) != '\0');
4544 break;
4546 spell_ident:
4547 case SPELL_IDENT:
4549 size_t i;
4550 const unsigned char * name = NODE_NAME (token->val.node.node);
4552 for (i = 0; i < NODE_LEN (token->val.node.node); i++)
4553 if (name[i] & ~0x7F)
4555 unsigned char buffer[10];
4556 i += utf8_to_ucn (buffer, name + i) - 1;
4557 fwrite (buffer, 1, 10, fp);
4559 else
4560 fputc (NODE_NAME (token->val.node.node)[i], fp);
4562 break;
4564 case SPELL_LITERAL:
4565 if (token->type == CPP_HEADER_NAME)
4566 fputc ('"', fp);
4567 fwrite (token->val.str.text, 1, token->val.str.len, fp);
4568 if (token->type == CPP_HEADER_NAME)
4569 fputc ('"', fp);
4570 break;
4572 case SPELL_NONE:
4573 /* An error, most probably. */
4574 break;
4578 /* Compare two tokens. */
4580 _cpp_equiv_tokens (const cpp_token *a, const cpp_token *b)
4582 if (a->type == b->type && a->flags == b->flags)
4583 switch (TOKEN_SPELL (a))
4585 default: /* Keep compiler happy. */
4586 case SPELL_OPERATOR:
4587 /* token_no is used to track where multiple consecutive ##
4588 tokens were originally located. */
4589 return (a->type != CPP_PASTE || a->val.token_no == b->val.token_no);
4590 case SPELL_NONE:
4591 return (a->type != CPP_MACRO_ARG
4592 || (a->val.macro_arg.arg_no == b->val.macro_arg.arg_no
4593 && a->val.macro_arg.spelling == b->val.macro_arg.spelling));
4594 case SPELL_IDENT:
4595 return (a->val.node.node == b->val.node.node
4596 && a->val.node.spelling == b->val.node.spelling);
4597 case SPELL_LITERAL:
4598 return (a->val.str.len == b->val.str.len
4599 && !memcmp (a->val.str.text, b->val.str.text,
4600 a->val.str.len));
4603 return 0;
4606 /* Returns nonzero if a space should be inserted to avoid an
4607 accidental token paste for output. For simplicity, it is
4608 conservative, and occasionally advises a space where one is not
4609 needed, e.g. "." and ".2". */
4611 cpp_avoid_paste (cpp_reader *pfile, const cpp_token *token1,
4612 const cpp_token *token2)
4614 enum cpp_ttype a = token1->type, b = token2->type;
4615 cppchar_t c;
4617 if (token1->flags & NAMED_OP)
4618 a = CPP_NAME;
4619 if (token2->flags & NAMED_OP)
4620 b = CPP_NAME;
4622 c = EOF;
4623 if (token2->flags & DIGRAPH)
4624 c = digraph_spellings[(int) b - (int) CPP_FIRST_DIGRAPH][0];
4625 else if (token_spellings[b].category == SPELL_OPERATOR)
4626 c = token_spellings[b].name[0];
4628 /* Quickly get everything that can paste with an '='. */
4629 if ((int) a <= (int) CPP_LAST_EQ && c == '=')
4630 return 1;
4632 switch (a)
4634 case CPP_GREATER: return c == '>';
4635 case CPP_LESS: return c == '<' || c == '%' || c == ':';
4636 case CPP_PLUS: return c == '+';
4637 case CPP_MINUS: return c == '-' || c == '>';
4638 case CPP_DIV: return c == '/' || c == '*'; /* Comments. */
4639 case CPP_MOD: return c == ':' || c == '>';
4640 case CPP_AND: return c == '&';
4641 case CPP_OR: return c == '|';
4642 case CPP_COLON: return c == ':' || c == '>';
4643 case CPP_DEREF: return c == '*';
4644 case CPP_DOT: return c == '.' || c == '%' || b == CPP_NUMBER;
4645 case CPP_HASH: return c == '#' || c == '%'; /* Digraph form. */
4646 case CPP_PRAGMA:
4647 case CPP_NAME: return ((b == CPP_NUMBER
4648 && name_p (pfile, &token2->val.str))
4649 || b == CPP_NAME
4650 || b == CPP_CHAR || b == CPP_STRING); /* L */
4651 case CPP_NUMBER: return (b == CPP_NUMBER || b == CPP_NAME
4652 || b == CPP_CHAR
4653 || c == '.' || c == '+' || c == '-');
4654 /* UCNs */
4655 case CPP_OTHER: return ((token1->val.str.text[0] == '\\'
4656 && b == CPP_NAME)
4657 || (CPP_OPTION (pfile, objc)
4658 && token1->val.str.text[0] == '@'
4659 && (b == CPP_NAME || b == CPP_STRING)));
4660 case CPP_LESS_EQ: return c == '>';
4661 case CPP_STRING:
4662 case CPP_WSTRING:
4663 case CPP_UTF8STRING:
4664 case CPP_STRING16:
4665 case CPP_STRING32: return (CPP_OPTION (pfile, user_literals)
4666 && (b == CPP_NAME
4667 || (TOKEN_SPELL (token2) == SPELL_LITERAL
4668 && ISIDST (token2->val.str.text[0]))));
4670 default: break;
4673 return 0;
4676 /* Output all the remaining tokens on the current line, and a newline
4677 character, to FP. Leading whitespace is removed. If there are
4678 macros, special token padding is not performed. */
4679 void
4680 cpp_output_line (cpp_reader *pfile, FILE *fp)
4682 const cpp_token *token;
4684 token = cpp_get_token (pfile);
4685 while (token->type != CPP_EOF)
4687 cpp_output_token (token, fp);
4688 token = cpp_get_token (pfile);
4689 if (token->flags & PREV_WHITE)
4690 putc (' ', fp);
4693 putc ('\n', fp);
4696 /* Return a string representation of all the remaining tokens on the
4697 current line. The result is allocated using xmalloc and must be
4698 freed by the caller. */
4699 unsigned char *
4700 cpp_output_line_to_string (cpp_reader *pfile, const unsigned char *dir_name)
4702 const cpp_token *token;
4703 unsigned int out = dir_name ? ustrlen (dir_name) : 0;
4704 unsigned int alloced = 120 + out;
4705 unsigned char *result = (unsigned char *) xmalloc (alloced);
4707 /* If DIR_NAME is empty, there are no initial contents. */
4708 if (dir_name)
4710 sprintf ((char *) result, "#%s ", dir_name);
4711 out += 2;
4714 token = cpp_get_token (pfile);
4715 while (token->type != CPP_EOF)
4717 unsigned char *last;
4718 /* Include room for a possible space and the terminating nul. */
4719 unsigned int len = cpp_token_len (token) + 2;
4721 if (out + len > alloced)
4723 alloced *= 2;
4724 if (out + len > alloced)
4725 alloced = out + len;
4726 result = (unsigned char *) xrealloc (result, alloced);
4729 last = cpp_spell_token (pfile, token, &result[out], 0);
4730 out = last - result;
4732 token = cpp_get_token (pfile);
4733 if (token->flags & PREV_WHITE)
4734 result[out++] = ' ';
4737 result[out] = '\0';
4738 return result;
/* Memory buffers.  Changing these three constants can have a dramatic
   effect on performance.  The values here are reasonable defaults,
   but might be tuned.  If you adjust them, be sure to test across a
   range of uses of cpplib, including heavy nested function-like macro
   expansion.  Also check the change in peak memory usage (NJAMD is a
   good tool for this).

   MIN_BUFF_SIZE is the smallest buffer new_buff will allocate.
   BUFF_SIZE_UPPER_BOUND (MIN_SIZE) is the largest free buffer that
   _cpp_get_buff will hand out for a request of MIN_SIZE bytes.
   EXTENDED_BUFF_SIZE (BUFF, MIN_EXTRA) is the size requested when BUFF
   is extended: MIN_EXTRA plus twice the room left in BUFF.  Every
   macro argument is parenthesized so that an argument containing
   operators expands as a unit.  */
#define MIN_BUFF_SIZE 8000
#define BUFF_SIZE_UPPER_BOUND(MIN_SIZE) (MIN_BUFF_SIZE + (MIN_SIZE) * 3 / 2)
#define EXTENDED_BUFF_SIZE(BUFF, MIN_EXTRA) \
        ((MIN_EXTRA) + ((BUFF)->limit - (BUFF)->cur) * 2)

#if MIN_BUFF_SIZE > BUFF_SIZE_UPPER_BOUND (0)
#error BUFF_SIZE_UPPER_BOUND must be at least as large as MIN_BUFF_SIZE!
#endif
4756 /* Create a new allocation buffer. Place the control block at the end
4757 of the buffer, so that buffer overflows will cause immediate chaos. */
4758 static _cpp_buff *
4759 new_buff (size_t len)
4761 _cpp_buff *result;
4762 unsigned char *base;
4764 if (len < MIN_BUFF_SIZE)
4765 len = MIN_BUFF_SIZE;
4766 len = CPP_ALIGN (len);
4768 #ifdef ENABLE_VALGRIND_WORKAROUNDS
4769 /* Valgrind warns about uses of interior pointers, so put _cpp_buff
4770 struct first. */
4771 size_t slen = CPP_ALIGN2 (sizeof (_cpp_buff), 2 * DEFAULT_ALIGNMENT);
4772 base = XNEWVEC (unsigned char, len + slen);
4773 result = (_cpp_buff *) base;
4774 base += slen;
4775 #else
4776 base = XNEWVEC (unsigned char, len + sizeof (_cpp_buff));
4777 result = (_cpp_buff *) (base + len);
4778 #endif
4779 result->base = base;
4780 result->cur = base;
4781 result->limit = base + len;
4782 result->next = NULL;
4783 return result;
4786 /* Place a chain of unwanted allocation buffers on the free list. */
4787 void
4788 _cpp_release_buff (cpp_reader *pfile, _cpp_buff *buff)
4790 _cpp_buff *end = buff;
4792 while (end->next)
4793 end = end->next;
4794 end->next = pfile->free_buffs;
4795 pfile->free_buffs = buff;
4798 /* Return a free buffer of size at least MIN_SIZE. */
4799 _cpp_buff *
4800 _cpp_get_buff (cpp_reader *pfile, size_t min_size)
4802 _cpp_buff *result, **p;
4804 for (p = &pfile->free_buffs;; p = &(*p)->next)
4806 size_t size;
4808 if (*p == NULL)
4809 return new_buff (min_size);
4810 result = *p;
4811 size = result->limit - result->base;
4812 /* Return a buffer that's big enough, but don't waste one that's
4813 way too big. */
4814 if (size >= min_size && size <= BUFF_SIZE_UPPER_BOUND (min_size))
4815 break;
4818 *p = result->next;
4819 result->next = NULL;
4820 result->cur = result->base;
4821 return result;
4824 /* Creates a new buffer with enough space to hold the uncommitted
4825 remaining bytes of BUFF, and at least MIN_EXTRA more bytes. Copies
4826 the excess bytes to the new buffer. Chains the new buffer after
4827 BUFF, and returns the new buffer. */
4828 _cpp_buff *
4829 _cpp_append_extend_buff (cpp_reader *pfile, _cpp_buff *buff, size_t min_extra)
4831 size_t size = EXTENDED_BUFF_SIZE (buff, min_extra);
4832 _cpp_buff *new_buff = _cpp_get_buff (pfile, size);
4834 buff->next = new_buff;
4835 memcpy (new_buff->base, buff->cur, BUFF_ROOM (buff));
4836 return new_buff;
4839 /* Creates a new buffer with enough space to hold the uncommitted
4840 remaining bytes of the buffer pointed to by BUFF, and at least
4841 MIN_EXTRA more bytes. Copies the excess bytes to the new buffer.
4842 Chains the new buffer before the buffer pointed to by BUFF, and
4843 updates the pointer to point to the new buffer. */
4844 void
4845 _cpp_extend_buff (cpp_reader *pfile, _cpp_buff **pbuff, size_t min_extra)
4847 _cpp_buff *new_buff, *old_buff = *pbuff;
4848 size_t size = EXTENDED_BUFF_SIZE (old_buff, min_extra);
4850 new_buff = _cpp_get_buff (pfile, size);
4851 memcpy (new_buff->base, old_buff->cur, BUFF_ROOM (old_buff));
4852 new_buff->next = old_buff;
4853 *pbuff = new_buff;
4856 /* Free a chain of buffers starting at BUFF. */
4857 void
4858 _cpp_free_buff (_cpp_buff *buff)
4860 _cpp_buff *next;
4862 for (; buff; buff = next)
4864 next = buff->next;
4865 #ifdef ENABLE_VALGRIND_WORKAROUNDS
4866 free (buff);
4867 #else
4868 free (buff->base);
4869 #endif
4873 /* Allocate permanent, unaligned storage of length LEN. */
4874 unsigned char *
4875 _cpp_unaligned_alloc (cpp_reader *pfile, size_t len)
4877 _cpp_buff *buff = pfile->u_buff;
4878 unsigned char *result = buff->cur;
4880 if (len > (size_t) (buff->limit - result))
4882 buff = _cpp_get_buff (pfile, len);
4883 buff->next = pfile->u_buff;
4884 pfile->u_buff = buff;
4885 result = buff->cur;
4888 buff->cur = result + len;
4889 return result;
4892 /* Allocate permanent, unaligned storage of length LEN from a_buff.
4893 That buffer is used for growing allocations when saving macro
4894 replacement lists in a #define, and when parsing an answer to an
4895 assertion in #assert, #unassert or #if (and therefore possibly
4896 whilst expanding macros). It therefore must not be used by any
4897 code that they might call: specifically the lexer and the guts of
4898 the macro expander.
4900 All existing other uses clearly fit this restriction: storing
4901 registered pragmas during initialization. */
4902 unsigned char *
4903 _cpp_aligned_alloc (cpp_reader *pfile, size_t len)
4905 _cpp_buff *buff = pfile->a_buff;
4906 unsigned char *result = buff->cur;
4908 if (len > (size_t) (buff->limit - result))
4910 buff = _cpp_get_buff (pfile, len);
4911 buff->next = pfile->a_buff;
4912 pfile->a_buff = buff;
4913 result = buff->cur;
4916 buff->cur = result + len;
4917 return result;
4920 /* Commit or allocate storage from a buffer. */
4922 void *
4923 _cpp_commit_buff (cpp_reader *pfile, size_t size)
4925 void *ptr = BUFF_FRONT (pfile->a_buff);
4927 if (pfile->hash_table->alloc_subobject)
4929 void *copy = pfile->hash_table->alloc_subobject (size);
4930 memcpy (copy, ptr, size);
4931 ptr = copy;
4933 else
4934 BUFF_FRONT (pfile->a_buff) += size;
4936 return ptr;
4939 /* Say which field of TOK is in use. */
4941 enum cpp_token_fld_kind
4942 cpp_token_val_index (const cpp_token *tok)
4944 switch (TOKEN_SPELL (tok))
4946 case SPELL_IDENT:
4947 return CPP_TOKEN_FLD_NODE;
4948 case SPELL_LITERAL:
4949 return CPP_TOKEN_FLD_STR;
4950 case SPELL_OPERATOR:
4951 /* Operands which were originally spelled as ident keep around
4952 the node for the exact spelling. */
4953 if (tok->flags & NAMED_OP)
4954 return CPP_TOKEN_FLD_NODE;
4955 else if (tok->type == CPP_PASTE)
4956 return CPP_TOKEN_FLD_TOKEN_NO;
4957 else
4958 return CPP_TOKEN_FLD_NONE;
4959 case SPELL_NONE:
4960 if (tok->type == CPP_MACRO_ARG)
4961 return CPP_TOKEN_FLD_ARG_NO;
4962 else if (tok->type == CPP_PADDING)
4963 return CPP_TOKEN_FLD_SOURCE;
4964 else if (tok->type == CPP_PRAGMA)
4965 return CPP_TOKEN_FLD_PRAGMA;
4966 /* fall through */
4967 default:
4968 return CPP_TOKEN_FLD_NONE;
4972 /* All tokens lexed in R after calling this function will be forced to
4973 have their location_t to be P, until
4974 cpp_stop_forcing_token_locations is called for R. */
4976 void
4977 cpp_force_token_locations (cpp_reader *r, location_t loc)
4979 r->forced_token_location = loc;
4982 /* Go back to assigning locations naturally for lexed tokens. */
4984 void
4985 cpp_stop_forcing_token_locations (cpp_reader *r)
4987 r->forced_token_location = 0;
/* PEEK points at a backslash.  If that backslash escapes an end of
   line ("\n", "\r" or "\r\n"), look past it, and keep going while the
   character found there is another backslash that escapes an end of
   line.  Never advance to LIMIT or beyond.  Returns the resulting
   position, which is PEEK itself when nothing could be skipped.  */

static const unsigned char *
do_peek_backslash (const unsigned char *peek, const unsigned char *limit)
{
  for (;;)
    {
      const unsigned char *after;

      if (peek[1] == '\n')
        after = peek + 2;
      else if (peek[1] == '\r')
        /* A lone carriage return, or a carriage return / line feed
           pair.  */
        after = peek + (peek[2] == '\n' ? 3 : 2);
      else
        return peek;

      if (!(after < limit))
        return peek;

      peek = after;

      /* The user might be perverse and follow one escaped end of line
         with another.  */
      if (*peek != '\\')
        return peek;
    }
}
5020 static const unsigned char *
5021 do_peek_next (const unsigned char *peek, const unsigned char *limit)
5023 if (__builtin_expect (*peek == '\\', false))
5024 peek = do_peek_backslash (peek, limit);
5025 return peek;
/* Step backwards from PEEK to the previous character, never going
   below BOUND.  Returns NULL if PEEK is already at BOUND.  Otherwise
   returns a pointer to the previous character, except that when that
   character is an end of line ("\n", "\r" or "\r\n") preceded by a
   backslash, the escaped end of line is skipped and the search
   continues before the backslash.  An end of line that is not escaped,
   or whose possible backslash would lie below BOUND, is returned as
   it is.  */
static const unsigned char *
do_peek_prev (const unsigned char *peek, const unsigned char *bound)
{
  if (peek == bound)
    return NULL;

  unsigned char c = *--peek;

  /* Anything other than an end-of-line character is the answer.  The
     second test is for a carriage return; comparing against the letter
     'r' would skip an escaped letter and miss an escaped "\r".  */
  if (!(__builtin_expect (c == '\n', false)
        || __builtin_expect (c == '\r', false)))
    return peek;

  /* Nothing before the end of line that may be inspected.  */
  if (peek == bound)
    return peek;

  /* IX is the offset from PEEK at which a backslash would make this an
     escaped end of line.  */
  int ix = -1;
  if (c == '\n' && peek[ix] == '\r')
    {
      /* A "\r\n" pair: the backslash, if any, is one further back.  */
      if (peek + ix == bound)
        return peek;
      ix--;
    }

  if (peek[ix] == '\\')
    return do_peek_prev (peek + ix, bound);

  return peek;
}
5057 /* If PEEK[-1] is identifier MATCH, scan past it and trailing white
5058 space. Otherwise return NULL. */
5060 static const unsigned char *
5061 do_peek_ident (const char *match, const unsigned char *peek,
5062 const unsigned char *limit)
5064 for (; *++match; peek++)
5065 if (*peek != *match)
5067 peek = do_peek_next (peek, limit);
5068 if (*peek != *match)
5069 return NULL;
5072 /* Must now not be looking at an identifier char. */
5073 peek = do_peek_next (peek, limit);
5074 if (ISIDNUM (*peek))
5075 return NULL;
5077 /* Skip control-line whitespace. */
5079 while (*peek == ' ' || *peek == '\t')
5080 peek++;
5081 if (__builtin_expect (*peek == '\\', false))
5083 peek = do_peek_backslash (peek, limit);
5084 if (*peek != '\\')
5085 goto ws;
5088 return peek;
5091 /* Are we looking at a module control line starting as PEEK - 1? */
5093 static bool
5094 do_peek_module (cpp_reader *pfile, unsigned char c,
5095 const unsigned char *peek, const unsigned char *limit)
5097 bool import = false;
5099 if (__builtin_expect (c == 'e', false))
5101 if (!((peek[0] == 'x' || peek[0] == '\\')
5102 && (peek = do_peek_ident ("export", peek, limit))))
5103 return false;
5105 /* export, peek for import or module. No need to peek __import
5106 here. */
5107 if (peek[0] == 'i')
5109 if (!((peek[1] == 'm' || peek[1] == '\\')
5110 && (peek = do_peek_ident ("import", peek + 1, limit))))
5111 return false;
5112 import = true;
5114 else if (peek[0] == 'm')
5116 if (!((peek[1] == 'o' || peek[1] == '\\')
5117 && (peek = do_peek_ident ("module", peek + 1, limit))))
5118 return false;
5120 else
5121 return false;
5123 else if (__builtin_expect (c == 'i', false))
5125 if (!((peek[0] == 'm' || peek[0] == '\\')
5126 && (peek = do_peek_ident ("import", peek, limit))))
5127 return false;
5128 import = true;
5130 else if (__builtin_expect (c == '_', false))
5132 /* Needed for translated includes. */
5133 if (!((peek[0] == '_' || peek[0] == '\\')
5134 && (peek = do_peek_ident ("__import", peek, limit))))
5135 return false;
5136 import = true;
5138 else if (__builtin_expect (c == 'm', false))
5140 if (!((peek[0] == 'o' || peek[0] == '\\')
5141 && (peek = do_peek_ident ("module", peek, limit))))
5142 return false;
5144 else
5145 return false;
5147 /* Peek the next character to see if it's good enough. We'll be at
5148 the first non-whitespace char, including skipping an escaped
5149 newline. */
5150 /* ... import followed by identifier, ':', '<' or header-name
5151 preprocessing tokens, or module followed by identifier, ':' or
5152 ';' preprocessing tokens. */
5153 unsigned char p = *peek++;
5155 /* A character literal is ... single quotes, ... optionally preceded
5156 by u8, u, U, or L */
5157 /* A string-literal is a ... double quotes, optionally prefixed by
5158 R, u8, u8R, u, uR, U, UR, L, or LR */
5159 if (p == 'u')
5161 peek = do_peek_next (peek, limit);
5162 if (*peek == '8')
5164 peek++;
5165 goto peek_u8;
5167 goto peek_u;
5169 else if (p == 'U' || p == 'L')
5171 peek_u8:
5172 peek = do_peek_next (peek, limit);
5173 peek_u:
5174 if (*peek == '\"' || *peek == '\'')
5175 return false;
5177 if (*peek == 'R')
5178 goto peek_R;
5179 /* Identifier. Ok. */
5181 else if (p == 'R')
5183 peek_R:
5184 if (CPP_OPTION (pfile, rliterals))
5186 peek = do_peek_next (peek, limit);
5187 if (*peek == '\"')
5188 return false;
5190 /* Identifier. Ok. */
5192 else if ('Z' - 'A' == 25
5193 ? ((p >= 'A' && p <= 'Z') || (p >= 'a' && p <= 'z') || p == '_')
5194 : ISIDST (p))
5196 /* Identifier. Ok. */
5198 else if (p == '<')
5200 /* Maybe angle header, ok for import. Reject
5201 '<=', '<<' digraph:'<:'. */
5202 if (!import)
5203 return false;
5204 peek = do_peek_next (peek, limit);
5205 if (*peek == '=' || *peek == '<'
5206 || (*peek == ':' && CPP_OPTION (pfile, digraphs)))
5207 return false;
5209 else if (p == ';')
5211 /* SEMICOLON, ok for module. */
5212 if (import)
5213 return false;
5215 else if (p == '"')
5217 /* STRING, ok for import. */
5218 if (!import)
5219 return false;
5221 else if (p == ':')
5223 /* Maybe COLON, ok. Reject '::', digraph:':>'. */
5224 peek = do_peek_next (peek, limit);
5225 if (*peek == ':' || (*peek == '>' && CPP_OPTION (pfile, digraphs)))
5226 return false;
5228 else
5229 /* FIXME: Detect a unicode character, excluding those not
5230 permitted as the initial character. [lex.name]/1. I presume
5231 we need to check the \[uU] spellings, and directly using
5232 Unicode in say UTF8 form? Or perhaps we do the phase-1
5233 conversion of UTF8 to universal-character-names? */
5234 return false;
5236 return true;
5239 /* Directives-only scanning. Somewhat more relaxed than correct
5240 parsing -- some ill-formed programs will not be rejected. */
/* Scans the buffers of PFILE character by character until no buffer
   is left.  Ordinary text is not tokenized; it is handed to CB in
   ranges.  DATA is passed through to CB unchanged.  CB is called with
   one of three tasks, as the calls below show:
     CPP_DO_print     a line count, the start of a text range and the
                      range's length;
     CPP_DO_location  the line table's highest_line;
     CPP_DO_token     a token of a module control line and its
                      spelling location.
   Comments and string/character literals are skipped over so that
   their contents cannot be mistaken for directives.  */
5242 void
5243 cpp_directive_only_process (cpp_reader *pfile,
5244 void *data,
5245 void (*cb) (cpp_reader *, CPP_DO_task, void *, ...))
5247 bool module_p = CPP_OPTION (pfile, module_directives);
/* Scanning restarts here after every directive and every module
   control line.  NOTE(review): presumably because handling them can
   change pfile->buffer and its next_line -- confirm.  */
5251 restart:
5252 /* Buffer initialization, but no line cleaning. */
5253 cpp_buffer *buffer = pfile->buffer;
5254 buffer->cur_note = buffer->notes_used = 0;
5255 buffer->cur = buffer->line_base = buffer->next_line;
5256 buffer->need_line = false;
5257 /* Files always end in a newline or carriage return. We rely on this for
5258 character peeking safety. */
5259 gcc_assert (buffer->rlimit[0] == '\n' || buffer->rlimit[0] == '\r');
/* BASE is the start of the text that has not yet been passed to CB
   with CPP_DO_print; LINE_COUNT counts the line ends seen since BASE
   and is passed along with it.  LINE_START is the start of the
   current physical line.  */
5261 const unsigned char *base = buffer->cur;
5262 unsigned line_count = 0;
5263 const unsigned char *line_start = base;
/* BOL stays true until a character other than whitespace or a
   comment is seen on the line (it is cleared at dflt).  RAW is set
   when an 'R' raw-string prefix is found before a double quote.  */
5265 bool bol = true;
5266 bool raw = false;
/* LWM (low-water mark) bounds the backward scans done with
   do_peek_prev; it is moved forward after each comment and literal.  */
5268 const unsigned char *lwm = base;
5269 for (const unsigned char *pos = base, *limit = buffer->rlimit;
5270 pos < limit;)
5272 unsigned char c = *pos++;
5273 /* This matches the switch in _cpp_lex_direct. */
5274 switch (c)
5276 case ' ': case '\t': case '\f': case '\v':
5277 /* Whitespace, do nothing. */
5278 break;
5280 case '\r': /* MAC line ending, or Windows \r\n */
5281 if (*pos == '\n')
5282 pos++;
5283 /* FALLTHROUGH */
5285 case '\n':
5286 bol = true;
/* Also the target for escaped line ends, which count as a line but
   leave BOL as it was.  */
5288 next_line:
5289 CPP_INCREMENT_LINE (pfile, 0);
5290 line_count++;
5291 line_start = pos;
5292 break;
5294 case '\\':
5295 /* <backslash><newline> is removed, and doesn't undo any
5296 preceeding escape or whatnot. */
5297 if (*pos == '\n')
5299 pos++;
5300 goto next_line;
5302 else if (*pos == '\r')
5304 if (pos[1] == '\n')
5305 pos++;
5306 pos++;
5307 goto next_line;
5309 goto dflt;
/* A '#' seen while BOL is true starts a directive: the pending text
   before it is passed to CB, the line is handed to
   _cpp_handle_directive, and scanning restarts.  */
5311 case '#':
5312 if (bol)
5314 /* Line directive. */
5315 if (pos - 1 > base && !pfile->state.skipping)
5316 cb (pfile, CPP_DO_print, data,
5317 line_count, base, pos - 1 - base);
5319 /* Prep things for directive handling. */
5320 buffer->next_line = pos;
5321 buffer->need_line = true;
5322 bool ok = _cpp_get_fresh_line (pfile);
5323 gcc_checking_assert (ok);
5325 /* Ensure proper column numbering for generated
5326 error messages. */
5327 buffer->line_base -= pos - line_start;
5329 _cpp_handle_directive (pfile, line_start + 1 != pos);
5331 /* Sanitize the line settings. Duplicate #include's can
5332 mess things up. */
5333 // FIXME: Necessary?
5334 pfile->line_table->highest_location
5335 = pfile->line_table->highest_line;
5337 if (!pfile->state.skipping
5338 && pfile->buffer->next_line < pfile->buffer->rlimit)
5339 cb (pfile, CPP_DO_location, data,
5340 pfile->line_table->highest_line);
5342 goto restart;
5344 goto dflt;
/* A '/' is only interesting when the next character (looking past an
   escaped line end) makes it the start of a comment.  */
5346 case '/':
5348 const unsigned char *peek = do_peek_next (pos, limit);
5349 if (!(*peek == '/' || *peek == '*'))
5350 goto dflt;
5352 /* Line or block comment */
/* STAR records that the previous character was a '*' inside a block
   comment; ESC that it was a backslash.  */
5353 bool is_block = *peek == '*';
5354 bool star = false;
5355 bool esc = false;
5356 location_t sloc
5357 = linemap_position_for_column (pfile->line_table,
5358 pos - line_start);
5360 while (pos < limit)
5362 char c = *pos++;
5363 switch (c)
5365 case '\\':
5366 esc = true;
5367 break;
5369 case '\r':
5370 if (*pos == '\n')
5371 pos++;
5372 /* FALLTHROUGH */
5374 case '\n':
5376 CPP_INCREMENT_LINE (pfile, 0);
5377 line_count++;
5378 line_start = pos;
/* An unescaped line end terminates a line comment.  */
5379 if (!esc && !is_block)
5381 bol = true;
5382 goto done_comment;
5385 if (!esc)
5386 star = false;
5387 esc = false;
5388 break;
5390 case '*':
5391 if (pos > peek)
5392 star = is_block;
5393 esc = false;
5394 break;
5396 case '/':
5397 if (star)
5398 goto done_comment;
5399 /* FALLTHROUGH */
5401 default:
5402 star = false;
5403 esc = false;
5404 break;
/* Reached only when the scan above ran to LIMIT without finding the
   end of the comment.  */
5407 if (pos < limit || is_block)
5408 cpp_error_with_line (pfile, CPP_DL_ERROR, sloc, 0,
5409 "unterminated comment");
5410 done_comment:
5411 lwm = pos;
5412 break;
5415 case '\'':
5416 if (!CPP_OPTION (pfile, digit_separators))
5417 goto delimited_string;
5419 /* Possibly a number punctuator. */
5420 if (!ISIDNUM (*do_peek_next (pos, limit)))
5421 goto delimited_string;
5423 goto quote_peek;
5425 case '\"':
5426 if (!CPP_OPTION (pfile, rliterals))
5427 goto delimited_string;
/* Scan backwards from the quote, no further than LWM, to decide
   whether it is a digit separator inside a number, the start of a raw
   string, or an ordinary literal.  */
5429 quote_peek:
5431 /* For ' see if it's a number punctuator
5432 \.?<digit>(<digit>|<identifier-nondigit>
5433 |'<digit>|'<nondigit>|[eEpP]<sign>|\.)* */
5434 /* For " see if it's a raw string
5435 {U,L,u,u8}R. This includes CPP_NUMBER detection,
5436 because that could be 0e+R. */
5437 const unsigned char *peek = pos - 1;
5438 bool quote_first = c == '"';
5439 bool quote_eight = false;
5440 bool maybe_number_start = false;
5441 bool want_number = false;
5443 while ((peek = do_peek_prev (peek, lwm)))
5445 unsigned char p = *peek;
5446 if (quote_first)
5448 if (!raw)
5450 if (p != 'R')
5451 break;
5452 raw = true;
5453 continue;
5456 quote_first = false;
5457 if (p == 'L' || p == 'U' || p == 'u')
5459 else if (p == '8')
5460 quote_eight = true;
5461 else
5462 goto second_raw;
5464 else if (quote_eight)
5466 if (p != 'u')
5468 raw = false;
5469 break;
5471 quote_eight = false;
5473 else if (c == '"')
5475 second_raw:;
5476 if (!want_number && ISIDNUM (p))
5478 raw = false;
5479 break;
5483 if (ISDIGIT (p))
5484 maybe_number_start = true;
5485 else if (p == '.')
5486 want_number = true;
5487 else if (ISIDNUM (p))
5488 maybe_number_start = false;
5489 else if (p == '+' || p == '-')
5491 if (const unsigned char *peek_prev
5492 = do_peek_prev (peek, lwm))
5494 p = *peek_prev;
5495 if (p == 'e' || p == 'E'
5496 || p == 'p' || p == 'P')
5498 want_number = true;
5499 maybe_number_start = false;
5501 else
5502 break;
5504 else
5505 break;
5507 else if (p == '\'' || p == '\"')
5509 /* If this is lwm, this must be the end of a
5510 previous string. So this is a trailing
5511 literal type, (a) if those are allowed,
5512 and (b) maybe_start is false. Otherwise
5513 this must be a CPP_NUMBER because we've
5514 met another ', and we'd have checked that
5515 in its own right. */
5516 if (peek == lwm && CPP_OPTION (pfile, uliterals))
5518 if (!maybe_number_start && !want_number)
5519 /* Must be a literal type. */
5520 raw = false;
5522 else if (p == '\''
5523 && CPP_OPTION (pfile, digit_separators))
5524 maybe_number_start = true;
5525 break;
5527 else if (c == '\'')
5528 break;
5529 else if (!quote_first && !quote_eight)
5530 break;
5533 if (maybe_number_start)
5535 if (c == '\'')
5536 /* A CPP NUMBER. */
5537 goto dflt;
5538 raw = false;
5541 goto delimited_string;
5544 delimited_string:
5546 /* (Possibly raw) string or char literal. */
/* END is the quote character that closes the literal.  ESC counts the
   backslashes seen since the last ordinary character; it is only
   updated for non-raw literals.  */
5547 unsigned char end = c;
5548 int delim_len = -1;
5549 const unsigned char *delim = NULL;
5550 location_t sloc = linemap_position_for_column (pfile->line_table,
5551 pos - line_start);
5552 int esc = 0;
5554 if (raw)
5556 /* There can be no line breaks in the delimiter. */
5557 delim = pos;
5558 for (delim_len = 0; (c = *pos++) != '('; delim_len++)
5560 if (delim_len == 16)
5562 cpp_error_with_line (pfile, CPP_DL_ERROR,
5563 sloc, 0,
5564 "raw string delimiter"
5565 " longer than %d"
5566 " characters",
5567 delim_len);
5568 raw = false;
5569 pos = delim;
5570 break;
5572 if (strchr (") \\\t\v\f\n", c))
5574 cpp_error_with_line (pfile, CPP_DL_ERROR,
5575 sloc, 0,
5576 "invalid character '%c'"
5577 " in raw string"
5578 " delimiter", c);
5579 raw = false;
5580 pos = delim;
5581 break;
5583 if (pos >= limit)
5584 goto bad_string;
5588 while (pos < limit)
5590 char c = *pos++;
5591 switch (c)
5593 case '\\':
5594 if (!raw)
5595 esc++;
5596 break;
5598 case '\r':
5599 if (*pos == '\n')
5600 pos++;
5601 /* FALLTHROUGH */
5603 case '\n':
5605 CPP_INCREMENT_LINE (pfile, 0);
5606 line_count++;
5607 line_start = pos;
5609 if (esc)
5610 esc--;
5611 break;
/* A raw string ends at ')' followed by the delimiter and the closing
   quote.  */
5613 case ')':
5614 if (raw
5615 && pos + delim_len + 1 < limit
5616 && pos[delim_len] == end
5617 && !memcmp (delim, pos, delim_len))
5619 pos += delim_len + 1;
5620 raw = false;
5621 goto done_string;
5623 break;
/* A non-raw literal ends at END unless an odd number of backslashes
   precedes it.  */
5625 default:
5626 if (!raw && !(esc & 1) && c == end)
5627 goto done_string;
5628 esc = 0;
5629 break;
5632 bad_string:
5633 cpp_error_with_line (pfile, CPP_DL_ERROR, sloc, 0,
5634 "unterminated literal");
5636 done_string:
5637 raw = false;
5638 lwm = pos - 1;
5640 goto dflt;
/* These are the first letters of "__import", "export", "import" and
   "module"; at BOL they may start a module control line.  */
5642 case '_':
5643 case 'e':
5644 case 'i':
5645 case 'm':
5646 if (bol && module_p && !pfile->state.skipping
5647 && do_peek_module (pfile, c, pos, limit))
5649 /* We've seen the start of a module control line.
5650 Start up the tokenizer. */
5651 pos--; /* Backup over the first character. */
5653 /* Backup over whitespace to start of line. */
5654 while (pos > line_start
5655 && (pos[-1] == ' ' || pos[-1] == '\t'))
5656 pos--;
5658 if (pos > base)
5659 cb (pfile, CPP_DO_print, data, line_count, base, pos - base);
5661 /* Prep things for directive handling. */
5662 buffer->next_line = pos;
5663 buffer->need_line = true;
5665 /* Now get tokens until the PRAGMA_EOL. */
5668 location_t spelling;
5669 const cpp_token *tok
5670 = cpp_get_token_with_location (pfile, &spelling);
5672 gcc_assert (pfile->state.in_deferred_pragma
5673 || tok->type == CPP_PRAGMA_EOL);
5674 cb (pfile, CPP_DO_token, data, tok, spelling);
5676 while (pfile->state.in_deferred_pragma);
5678 if (pfile->buffer->next_line < pfile->buffer->rlimit)
5679 cb (pfile, CPP_DO_location, data,
5680 pfile->line_table->highest_line);
5682 pfile->mi_valid = false;
5683 goto restart;
5685 goto dflt;
/* Any other character means the line no longer consists of only
   whitespace and comments.  */
5687 default:
5688 dflt:
5689 bol = false;
5690 pfile->mi_valid = false;
5691 break;
/* The buffer has been scanned to its end: pass whatever text is still
   pending to CB, then pop the buffer and continue with the next one,
   if any.  */
5695 if (buffer->rlimit > base && !pfile->state.skipping)
5697 const unsigned char *limit = buffer->rlimit;
5698 /* If the file was not newline terminated, add rlimit, which is
5699 guaranteed to point to a newline, to the end of our range. */
5700 if (limit[-1] != '\n')
5702 limit++;
5703 CPP_INCREMENT_LINE (pfile, 0);
5704 line_count++;
5706 cb (pfile, CPP_DO_print, data, line_count, base, limit - base);
5709 _cpp_pop_buffer (pfile);
5711 while (pfile->buffer);