PR111048: Set arg_npatterns correctly.
[official-gcc.git] / libcpp / lex.cc
blob8dea4d3d4ebcb8bbb3bad1af7c6e837dd5d88264
/* CPP Library - lexical analysis.
   Copyright (C) 2000-2023 Free Software Foundation, Inc.
   Contributed by Per Bothner, 1994-95.
   Based on CCCP program by Paul Rubin, June 1986
   Adapted to ANSI C, Richard Stallman, Jan 1987
   Broken out to separate file, Zack Weinberg, Mar 2000

This program is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the
Free Software Foundation; either version 3, or (at your option) any
later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
22 #include "config.h"
23 #include "system.h"
24 #include "cpplib.h"
25 #include "internal.h"
/* Spelling category stored for each token type in token_spellings.  */
enum spell_type
{
  SPELL_OPERATOR = 0,
  SPELL_IDENT,
  SPELL_LITERAL,
  SPELL_NONE
};

/* One entry of the token_spellings table: the spelling category of a
   token type together with its name.  */
struct token_spelling
{
  enum spell_type category;
  const unsigned char *name;
};
41 static const unsigned char *const digraph_spellings[] =
42 { UC"%:", UC"%:%:", UC"<:", UC":>", UC"<%", UC"%>" };
44 #define OP(e, s) { SPELL_OPERATOR, UC s },
45 #define TK(e, s) { SPELL_ ## s, UC #e },
46 static const struct token_spelling token_spellings[N_TTYPES] = { TTYPE_TABLE };
47 #undef OP
48 #undef TK
50 #define TOKEN_SPELL(token) (token_spellings[(token)->type].category)
51 #define TOKEN_NAME(token) (token_spellings[(token)->type].name)
/* ISO 10646 defines the UCS codespace as the range 0-0x10FFFF inclusive.  */
#define UCS_LIMIT 0x10FFFF
56 static void add_line_note (cpp_buffer *, const uchar *, unsigned int);
57 static int skip_line_comment (cpp_reader *);
58 static void skip_whitespace (cpp_reader *, cppchar_t);
59 static void lex_string (cpp_reader *, cpp_token *, const uchar *);
60 static void save_comment (cpp_reader *, cpp_token *, const uchar *, cppchar_t);
61 static void store_comment (cpp_reader *, cpp_token *);
62 static void create_literal (cpp_reader *, cpp_token *, const uchar *,
63 unsigned int, enum cpp_ttype);
64 static bool warn_in_comment (cpp_reader *, _cpp_line_note *);
65 static int name_p (cpp_reader *, const cpp_string *);
66 static tokenrun *next_tokenrun (tokenrun *);
68 static _cpp_buff *new_buff (size_t);
71 /* Utility routine:
73 Compares, the token TOKEN to the NUL-terminated string STRING.
74 TOKEN must be a CPP_NAME. Returns 1 for equal, 0 for unequal. */
75 int
76 cpp_ideq (const cpp_token *token, const char *string)
78 if (token->type != CPP_NAME)
79 return 0;
81 return !ustrcmp (NODE_NAME (token->val.node.node), (const uchar *) string);
84 /* Record a note TYPE at byte POS into the current cleaned logical
85 line. */
86 static void
87 add_line_note (cpp_buffer *buffer, const uchar *pos, unsigned int type)
89 if (buffer->notes_used == buffer->notes_cap)
91 buffer->notes_cap = buffer->notes_cap * 2 + 200;
92 buffer->notes = XRESIZEVEC (_cpp_line_note, buffer->notes,
93 buffer->notes_cap);
96 buffer->notes[buffer->notes_used].pos = pos;
97 buffer->notes[buffer->notes_used].type = type;
98 buffer->notes_used++;
/* Fast path to find line special characters using optimized character
   scanning algorithms.  Anything complicated falls back to the slow
   path below.  Since this loop is very hot it's worth doing these kinds
   of optimizations.

   One of the paths through the ifdefs should provide

     const uchar *search_line_fast (const uchar *s, const uchar *end);

   Between S and END, search for \n, \r, \\, ?.  Return a pointer to
   the found character.

   Note that the last character of the buffer is *always* a newline,
   as forced by _cpp_convert_input.  This fact can be used to avoid
   explicitly looking for the end of the buffer.  */
/* Configure gives us an ifdef test.  Turn it into a value that can be
   used in ordinary `if' conditions: 0 when configure did not define it.  */
#ifndef WORDS_BIGENDIAN
#define WORDS_BIGENDIAN 0
#endif
/* We'd like the largest integer that fits into a register.  There's nothing
   in <stdint.h> that gives us that.  For most hosts this is unsigned long,
   but MS decided on an LLP64 model.  Thankfully when building with GCC we
   can get the "real" word size.  */
#ifdef __GNUC__
typedef unsigned int word_type __attribute__((__mode__(__word__)));
#else
typedef unsigned long word_type;
#endif

/* The code below is only expecting sizes 4 or 8.
   Die at compile-time if this expectation is violated: the array
   bound evaluates to -1 for any other size.  */
typedef char check_word_type_size
  [(sizeof(word_type) == 8 || sizeof(word_type) == 4) * 2 - 1];
138 /* Return X with the first N bytes forced to values that won't match one
139 of the interesting characters. Note that NUL is not interesting. */
141 static inline word_type
142 acc_char_mask_misalign (word_type val, unsigned int n)
144 word_type mask = -1;
145 if (WORDS_BIGENDIAN)
146 mask >>= n * 8;
147 else
148 mask <<= n * 8;
149 return val & mask;
152 /* Return X replicated to all byte positions within WORD_TYPE. */
154 static inline word_type
155 acc_char_replicate (uchar x)
157 word_type ret;
159 ret = (x << 24) | (x << 16) | (x << 8) | x;
160 if (sizeof(word_type) == 8)
161 ret = (ret << 16 << 16) | ret;
162 return ret;
165 /* Return non-zero if some byte of VAL is (probably) C. */
167 static inline word_type
168 acc_char_cmp (word_type val, word_type c)
170 #if defined(__GNUC__) && defined(__alpha__)
171 /* We can get exact results using a compare-bytes instruction.
172 Get (val == c) via (0 >= (val ^ c)). */
173 return __builtin_alpha_cmpbge (0, val ^ c);
174 #else
175 word_type magic = 0x7efefefeU;
176 if (sizeof(word_type) == 8)
177 magic = (magic << 16 << 16) | 0xfefefefeU;
178 magic |= 1;
180 val ^= c;
181 return ((val + magic) ^ ~val) & ~magic;
182 #endif
185 /* Given the result of acc_char_cmp is non-zero, return the index of
186 the found character. If this was a false positive, return -1. */
188 static inline int
189 acc_char_index (word_type cmp ATTRIBUTE_UNUSED,
190 word_type val ATTRIBUTE_UNUSED)
192 #if defined(__GNUC__) && defined(__alpha__) && !WORDS_BIGENDIAN
193 /* The cmpbge instruction sets *bits* of the result corresponding to
194 matches in the bytes with no false positives. */
195 return __builtin_ctzl (cmp);
196 #else
197 unsigned int i;
199 /* ??? It would be nice to force unrolling here,
200 and have all of these constants folded. */
201 for (i = 0; i < sizeof(word_type); ++i)
203 uchar c;
204 if (WORDS_BIGENDIAN)
205 c = (val >> (sizeof(word_type) - i - 1) * 8) & 0xff;
206 else
207 c = (val >> i * 8) & 0xff;
209 if (c == '\n' || c == '\r' || c == '\\' || c == '?')
210 return i;
213 return -1;
214 #endif
217 /* A version of the fast scanner using bit fiddling techniques.
219 For 32-bit words, one would normally perform 16 comparisons and
220 16 branches. With this algorithm one performs 24 arithmetic
221 operations and one branch. Whether this is faster with a 32-bit
222 word size is going to be somewhat system dependent.
224 For 64-bit words, we eliminate twice the number of comparisons
225 and branches without increasing the number of arithmetic operations.
226 It's almost certainly going to be a win with 64-bit word size. */
228 static const uchar * search_line_acc_char (const uchar *, const uchar *)
229 ATTRIBUTE_UNUSED;
231 static const uchar *
232 search_line_acc_char (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
234 const word_type repl_nl = acc_char_replicate ('\n');
235 const word_type repl_cr = acc_char_replicate ('\r');
236 const word_type repl_bs = acc_char_replicate ('\\');
237 const word_type repl_qm = acc_char_replicate ('?');
239 unsigned int misalign;
240 const word_type *p;
241 word_type val, t;
243 /* Align the buffer. Mask out any bytes from before the beginning. */
244 p = (word_type *)((uintptr_t)s & -sizeof(word_type));
245 val = *p;
246 misalign = (uintptr_t)s & (sizeof(word_type) - 1);
247 if (misalign)
248 val = acc_char_mask_misalign (val, misalign);
250 /* Main loop. */
251 while (1)
253 t = acc_char_cmp (val, repl_nl);
254 t |= acc_char_cmp (val, repl_cr);
255 t |= acc_char_cmp (val, repl_bs);
256 t |= acc_char_cmp (val, repl_qm);
258 if (__builtin_expect (t != 0, 0))
260 int i = acc_char_index (t, val);
261 if (i >= 0)
262 return (const uchar *)p + i;
265 val = *++p;
269 /* Disable on Solaris 2/x86 until the following problem can be properly
270 autoconfed:
272 The Solaris 10+ assembler tags objects with the instruction set
273 extensions used, so SSE4.2 executables cannot run on machines that
274 don't support that extension. */
276 #if (GCC_VERSION >= 4005) && (__GNUC__ >= 5 || !defined(__PIC__)) && (defined(__i386__) || defined(__x86_64__)) && !(defined(__sun__) && defined(__svr4__))
278 /* Replicated character data to be shared between implementations.
279 Recall that outside of a context with vector support we can't
280 define compatible vector types, therefore these are all defined
281 in terms of raw characters. */
282 static const char repl_chars[4][16] __attribute__((aligned(16))) = {
283 { '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
284 '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n' },
285 { '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
286 '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r' },
287 { '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
288 '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\' },
289 { '?', '?', '?', '?', '?', '?', '?', '?',
290 '?', '?', '?', '?', '?', '?', '?', '?' },
293 /* A version of the fast scanner using MMX vectorized byte compare insns.
295 This uses the PMOVMSKB instruction which was introduced with "MMX2",
296 which was packaged into SSE1; it is also present in the AMD MMX
297 extension. Mark the function as using "sse" so that we emit a real
298 "emms" instruction, rather than the 3dNOW "femms" instruction. */
300 static const uchar *
301 #ifndef __SSE__
302 __attribute__((__target__("sse")))
303 #endif
304 search_line_mmx (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
306 typedef char v8qi __attribute__ ((__vector_size__ (8)));
307 typedef int __m64 __attribute__ ((__vector_size__ (8), __may_alias__));
309 const v8qi repl_nl = *(const v8qi *)repl_chars[0];
310 const v8qi repl_cr = *(const v8qi *)repl_chars[1];
311 const v8qi repl_bs = *(const v8qi *)repl_chars[2];
312 const v8qi repl_qm = *(const v8qi *)repl_chars[3];
314 unsigned int misalign, found, mask;
315 const v8qi *p;
316 v8qi data, t, c;
318 /* Align the source pointer. While MMX doesn't generate unaligned data
319 faults, this allows us to safely scan to the end of the buffer without
320 reading beyond the end of the last page. */
321 misalign = (uintptr_t)s & 7;
322 p = (const v8qi *)((uintptr_t)s & -8);
323 data = *p;
325 /* Create a mask for the bytes that are valid within the first
326 16-byte block. The Idea here is that the AND with the mask
327 within the loop is "free", since we need some AND or TEST
328 insn in order to set the flags for the branch anyway. */
329 mask = -1u << misalign;
331 /* Main loop processing 8 bytes at a time. */
332 goto start;
335 data = *++p;
336 mask = -1;
338 start:
339 t = __builtin_ia32_pcmpeqb(data, repl_nl);
340 c = __builtin_ia32_pcmpeqb(data, repl_cr);
341 t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
342 c = __builtin_ia32_pcmpeqb(data, repl_bs);
343 t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
344 c = __builtin_ia32_pcmpeqb(data, repl_qm);
345 t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
346 found = __builtin_ia32_pmovmskb (t);
347 found &= mask;
349 while (!found);
351 __builtin_ia32_emms ();
353 /* FOUND contains 1 in bits for which we matched a relevant
354 character. Conversion to the byte index is trivial. */
355 found = __builtin_ctz(found);
356 return (const uchar *)p + found;
359 /* A version of the fast scanner using SSE2 vectorized byte compare insns. */
361 static const uchar *
362 #ifndef __SSE2__
363 __attribute__((__target__("sse2")))
364 #endif
365 search_line_sse2 (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
367 typedef char v16qi __attribute__ ((__vector_size__ (16)));
369 const v16qi repl_nl = *(const v16qi *)repl_chars[0];
370 const v16qi repl_cr = *(const v16qi *)repl_chars[1];
371 const v16qi repl_bs = *(const v16qi *)repl_chars[2];
372 const v16qi repl_qm = *(const v16qi *)repl_chars[3];
374 unsigned int misalign, found, mask;
375 const v16qi *p;
376 v16qi data, t;
378 /* Align the source pointer. */
379 misalign = (uintptr_t)s & 15;
380 p = (const v16qi *)((uintptr_t)s & -16);
381 data = *p;
383 /* Create a mask for the bytes that are valid within the first
384 16-byte block. The Idea here is that the AND with the mask
385 within the loop is "free", since we need some AND or TEST
386 insn in order to set the flags for the branch anyway. */
387 mask = -1u << misalign;
389 /* Main loop processing 16 bytes at a time. */
390 goto start;
393 data = *++p;
394 mask = -1;
396 start:
397 t = data == repl_nl;
398 t |= data == repl_cr;
399 t |= data == repl_bs;
400 t |= data == repl_qm;
401 found = __builtin_ia32_pmovmskb128 (t);
402 found &= mask;
404 while (!found);
406 /* FOUND contains 1 in bits for which we matched a relevant
407 character. Conversion to the byte index is trivial. */
408 found = __builtin_ctz(found);
409 return (const uchar *)p + found;
#ifdef HAVE_SSE4
/* A version of the fast scanner using SSE 4.2 vectorized string insns.  */

static const uchar *
#ifndef __SSE4_2__
__attribute__((__target__("sse4.2")))
#endif
search_line_sse42 (const uchar *s, const uchar *end)
{
  typedef char v16qi __attribute__ ((__vector_size__ (16)));
  static const v16qi search = { '\n', '\r', '?', '\\' };

  uintptr_t si = (uintptr_t)s;
  uintptr_t index;

  /* Check for unaligned input.  */
  if (si & 15)
    {
      v16qi sv;

      if (__builtin_expect (end - s < 16, 0)
          && __builtin_expect ((si & 0xfff) > 0xff0, 0))
        {
          /* There are less than 16 bytes left in the buffer, and less
             than 16 bytes left on the page.  Reading 16 bytes at this
             point might generate a spurious page fault.  Defer to the
             SSE2 implementation, which already handles alignment.  */
          return search_line_sse2 (s, end);
        }

      /* ??? The builtin doesn't understand that the PCMPESTRI read from
         memory need not be aligned.  */
      sv = __builtin_ia32_loaddqu ((const char *) s);
      index = __builtin_ia32_pcmpestri128 (search, 4, sv, 16, 0);

      if (__builtin_expect (index < 16, 0))
        goto found;

      /* Advance the pointer to an aligned address.  We will re-scan a
         few bytes, but we no longer need care for reading past the
         end of a page, since we're guaranteed a match.  */
      s = (const uchar *)((si + 15) & -16);
    }

  /* Main loop, processing 16 bytes at a time.  */
#ifdef __GCC_ASM_FLAG_OUTPUTS__
  while (1)
    {
      char f;

      /* By using inline assembly instead of the builtin,
         we can use the result, as well as the flags set.  */
      __asm ("%vpcmpestri\t$0, %2, %3"
             : "=c"(index), "=@ccc"(f)
             : "m"(*s), "x"(search), "a"(4), "d"(16));
      if (f)
        break;

      s += 16;
    }
#else
  /* The asm loop below adds 16 before the first compare.  */
  s -= 16;
  /* By doing the whole loop in inline assembly,
     we can make proper use of the flags set.  */
  __asm (      ".balign 16\n"
        "0: add $16, %1\n"
        " %vpcmpestri\t$0, (%1), %2\n"
        " jnc 0b"
        : "=&c"(index), "+r"(s)
        : "x"(search), "a"(4), "d"(16));
#endif

 found:
  return s + index;
}

#else
/* Work around out-dated assemblers without sse4 support.  */
#define search_line_sse42 search_line_sse2
#endif
493 /* Check the CPU capabilities. */
495 #include "../gcc/config/i386/cpuid.h"
497 typedef const uchar * (*search_line_fast_type) (const uchar *, const uchar *);
498 static search_line_fast_type search_line_fast;
500 #define HAVE_init_vectorized_lexer 1
501 static inline void
502 init_vectorized_lexer (void)
504 unsigned dummy, ecx = 0, edx = 0;
505 search_line_fast_type impl = search_line_acc_char;
506 int minimum = 0;
508 #if defined(__SSE4_2__)
509 minimum = 3;
510 #elif defined(__SSE2__)
511 minimum = 2;
512 #elif defined(__SSE__)
513 minimum = 1;
514 #endif
516 if (minimum == 3)
517 impl = search_line_sse42;
518 else if (__get_cpuid (1, &dummy, &dummy, &ecx, &edx) || minimum == 2)
520 if (minimum == 3 || (ecx & bit_SSE4_2))
521 impl = search_line_sse42;
522 else if (minimum == 2 || (edx & bit_SSE2))
523 impl = search_line_sse2;
524 else if (minimum == 1 || (edx & bit_SSE))
525 impl = search_line_mmx;
527 else if (__get_cpuid (0x80000001, &dummy, &dummy, &dummy, &edx))
529 if (minimum == 1
530 || (edx & (bit_MMXEXT | bit_CMOV)) == (bit_MMXEXT | bit_CMOV))
531 impl = search_line_mmx;
534 search_line_fast = impl;
537 #elif (GCC_VERSION >= 4005) && defined(_ARCH_PWR8) && defined(__ALTIVEC__)
539 /* A vection of the fast scanner using AltiVec vectorized byte compares
540 and VSX unaligned loads (when VSX is available). This is otherwise
541 the same as the AltiVec version. */
543 ATTRIBUTE_NO_SANITIZE_UNDEFINED
544 static const uchar *
545 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
547 typedef __attribute__((altivec(vector))) unsigned char vc;
549 const vc repl_nl = {
550 '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
551 '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
553 const vc repl_cr = {
554 '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
555 '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
557 const vc repl_bs = {
558 '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
559 '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
561 const vc repl_qm = {
562 '?', '?', '?', '?', '?', '?', '?', '?',
563 '?', '?', '?', '?', '?', '?', '?', '?',
565 const vc zero = { 0 };
567 vc data, t;
569 /* Main loop processing 16 bytes at a time. */
572 vc m_nl, m_cr, m_bs, m_qm;
574 data = __builtin_vec_vsx_ld (0, s);
575 s += 16;
577 m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
578 m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
579 m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
580 m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
581 t = (m_nl | m_cr) | (m_bs | m_qm);
583 /* T now contains 0xff in bytes for which we matched one of the relevant
584 characters. We want to exit the loop if any byte in T is non-zero.
585 Below is the expansion of vec_any_ne(t, zero). */
587 while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));
589 /* Restore s to to point to the 16 bytes we just processed. */
590 s -= 16;
593 #define N (sizeof(vc) / sizeof(long))
595 union {
596 vc v;
597 /* Statically assert that N is 2 or 4. */
598 unsigned long l[(N == 2 || N == 4) ? N : -1];
599 } u;
600 unsigned long l, i = 0;
602 u.v = t;
604 /* Find the first word of T that is non-zero. */
605 switch (N)
607 case 4:
608 l = u.l[i++];
609 if (l != 0)
610 break;
611 s += sizeof(unsigned long);
612 l = u.l[i++];
613 if (l != 0)
614 break;
615 s += sizeof(unsigned long);
616 /* FALLTHRU */
617 case 2:
618 l = u.l[i++];
619 if (l != 0)
620 break;
621 s += sizeof(unsigned long);
622 l = u.l[i];
625 /* L now contains 0xff in bytes for which we matched one of the
626 relevant characters. We can find the byte index by finding
627 its bit index and dividing by 8. */
628 #ifdef __BIG_ENDIAN__
629 l = __builtin_clzl(l) >> 3;
630 #else
631 l = __builtin_ctzl(l) >> 3;
632 #endif
633 return s + l;
635 #undef N
639 #elif (GCC_VERSION >= 4005) && defined(__ALTIVEC__) && defined (__BIG_ENDIAN__)
641 /* A vection of the fast scanner using AltiVec vectorized byte compares.
642 This cannot be used for little endian because vec_lvsl/lvsr are
643 deprecated for little endian and the code won't work properly. */
644 /* ??? Unfortunately, attribute(target("altivec")) is not yet supported,
645 so we can't compile this function without -maltivec on the command line
646 (or implied by some other switch). */
648 static const uchar *
649 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
651 typedef __attribute__((altivec(vector))) unsigned char vc;
653 const vc repl_nl = {
654 '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
655 '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
657 const vc repl_cr = {
658 '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
659 '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
661 const vc repl_bs = {
662 '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
663 '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
665 const vc repl_qm = {
666 '?', '?', '?', '?', '?', '?', '?', '?',
667 '?', '?', '?', '?', '?', '?', '?', '?',
669 const vc ones = {
670 -1, -1, -1, -1, -1, -1, -1, -1,
671 -1, -1, -1, -1, -1, -1, -1, -1,
673 const vc zero = { 0 };
675 vc data, mask, t;
677 /* Altivec loads automatically mask addresses with -16. This lets us
678 issue the first load as early as possible. */
679 data = __builtin_vec_ld(0, (const vc *)s);
681 /* Discard bytes before the beginning of the buffer. Do this by
682 beginning with all ones and shifting in zeros according to the
683 mis-alignment. The LVSR instruction pulls the exact shift we
684 want from the address. */
685 mask = __builtin_vec_lvsr(0, s);
686 mask = __builtin_vec_perm(zero, ones, mask);
687 data &= mask;
689 /* While altivec loads mask addresses, we still need to align S so
690 that the offset we compute at the end is correct. */
691 s = (const uchar *)((uintptr_t)s & -16);
693 /* Main loop processing 16 bytes at a time. */
694 goto start;
697 vc m_nl, m_cr, m_bs, m_qm;
699 s += 16;
700 data = __builtin_vec_ld(0, (const vc *)s);
702 start:
703 m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
704 m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
705 m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
706 m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
707 t = (m_nl | m_cr) | (m_bs | m_qm);
709 /* T now contains 0xff in bytes for which we matched one of the relevant
710 characters. We want to exit the loop if any byte in T is non-zero.
711 Below is the expansion of vec_any_ne(t, zero). */
713 while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));
716 #define N (sizeof(vc) / sizeof(long))
718 union {
719 vc v;
720 /* Statically assert that N is 2 or 4. */
721 unsigned long l[(N == 2 || N == 4) ? N : -1];
722 } u;
723 unsigned long l, i = 0;
725 u.v = t;
727 /* Find the first word of T that is non-zero. */
728 switch (N)
730 case 4:
731 l = u.l[i++];
732 if (l != 0)
733 break;
734 s += sizeof(unsigned long);
735 l = u.l[i++];
736 if (l != 0)
737 break;
738 s += sizeof(unsigned long);
739 /* FALLTHROUGH */
740 case 2:
741 l = u.l[i++];
742 if (l != 0)
743 break;
744 s += sizeof(unsigned long);
745 l = u.l[i];
748 /* L now contains 0xff in bytes for which we matched one of the
749 relevant characters. We can find the byte index by finding
750 its bit index and dividing by 8. */
751 l = __builtin_clzl(l) >> 3;
752 return s + l;
754 #undef N
758 #elif defined (__ARM_NEON) && defined (__ARM_64BIT_STATE)
759 #include "arm_neon.h"
761 /* This doesn't have to be the exact page size, but no system may use
762 a size smaller than this. ARMv8 requires a minimum page size of
763 4k. The impact of being conservative here is a small number of
764 cases will take the slightly slower entry path into the main
765 loop. */
767 #define AARCH64_MIN_PAGE_SIZE 4096
769 static const uchar *
770 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
772 const uint8x16_t repl_nl = vdupq_n_u8 ('\n');
773 const uint8x16_t repl_cr = vdupq_n_u8 ('\r');
774 const uint8x16_t repl_bs = vdupq_n_u8 ('\\');
775 const uint8x16_t repl_qm = vdupq_n_u8 ('?');
776 const uint8x16_t xmask = (uint8x16_t) vdupq_n_u64 (0x8040201008040201ULL);
778 #ifdef __ARM_BIG_ENDIAN
779 const int16x8_t shift = {8, 8, 8, 8, 0, 0, 0, 0};
780 #else
781 const int16x8_t shift = {0, 0, 0, 0, 8, 8, 8, 8};
782 #endif
784 unsigned int found;
785 const uint8_t *p;
786 uint8x16_t data;
787 uint8x16_t t;
788 uint16x8_t m;
789 uint8x16_t u, v, w;
791 /* Align the source pointer. */
792 p = (const uint8_t *)((uintptr_t)s & -16);
794 /* Assuming random string start positions, with a 4k page size we'll take
795 the slow path about 0.37% of the time. */
796 if (__builtin_expect ((AARCH64_MIN_PAGE_SIZE
797 - (((uintptr_t) s) & (AARCH64_MIN_PAGE_SIZE - 1)))
798 < 16, 0))
800 /* Slow path: the string starts near a possible page boundary. */
801 uint32_t misalign, mask;
803 misalign = (uintptr_t)s & 15;
804 mask = (-1u << misalign) & 0xffff;
805 data = vld1q_u8 (p);
806 t = vceqq_u8 (data, repl_nl);
807 u = vceqq_u8 (data, repl_cr);
808 v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
809 w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
810 t = vorrq_u8 (v, w);
811 t = vandq_u8 (t, xmask);
812 m = vpaddlq_u8 (t);
813 m = vshlq_u16 (m, shift);
814 found = vaddvq_u16 (m);
815 found &= mask;
816 if (found)
817 return (const uchar*)p + __builtin_ctz (found);
819 else
821 data = vld1q_u8 ((const uint8_t *) s);
822 t = vceqq_u8 (data, repl_nl);
823 u = vceqq_u8 (data, repl_cr);
824 v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
825 w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
826 t = vorrq_u8 (v, w);
827 if (__builtin_expect (vpaddd_u64 ((uint64x2_t)t) != 0, 0))
828 goto done;
833 p += 16;
834 data = vld1q_u8 (p);
835 t = vceqq_u8 (data, repl_nl);
836 u = vceqq_u8 (data, repl_cr);
837 v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
838 w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
839 t = vorrq_u8 (v, w);
840 } while (!vpaddd_u64 ((uint64x2_t)t));
842 done:
843 /* Now that we've found the terminating substring, work out precisely where
844 we need to stop. */
845 t = vandq_u8 (t, xmask);
846 m = vpaddlq_u8 (t);
847 m = vshlq_u16 (m, shift);
848 found = vaddvq_u16 (m);
849 return (((((uintptr_t) p) < (uintptr_t) s) ? s : (const uchar *)p)
850 + __builtin_ctz (found));
853 #elif defined (__ARM_NEON)
854 #include "arm_neon.h"
856 static const uchar *
857 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
859 const uint8x16_t repl_nl = vdupq_n_u8 ('\n');
860 const uint8x16_t repl_cr = vdupq_n_u8 ('\r');
861 const uint8x16_t repl_bs = vdupq_n_u8 ('\\');
862 const uint8x16_t repl_qm = vdupq_n_u8 ('?');
863 const uint8x16_t xmask = (uint8x16_t) vdupq_n_u64 (0x8040201008040201ULL);
865 unsigned int misalign, found, mask;
866 const uint8_t *p;
867 uint8x16_t data;
869 /* Align the source pointer. */
870 misalign = (uintptr_t)s & 15;
871 p = (const uint8_t *)((uintptr_t)s & -16);
872 data = vld1q_u8 (p);
874 /* Create a mask for the bytes that are valid within the first
875 16-byte block. The Idea here is that the AND with the mask
876 within the loop is "free", since we need some AND or TEST
877 insn in order to set the flags for the branch anyway. */
878 mask = (-1u << misalign) & 0xffff;
880 /* Main loop, processing 16 bytes at a time. */
881 goto start;
885 uint8x8_t l;
886 uint16x4_t m;
887 uint32x2_t n;
888 uint8x16_t t, u, v, w;
890 p += 16;
891 data = vld1q_u8 (p);
892 mask = 0xffff;
894 start:
895 t = vceqq_u8 (data, repl_nl);
896 u = vceqq_u8 (data, repl_cr);
897 v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
898 w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
899 t = vandq_u8 (vorrq_u8 (v, w), xmask);
900 l = vpadd_u8 (vget_low_u8 (t), vget_high_u8 (t));
901 m = vpaddl_u8 (l);
902 n = vpaddl_u16 (m);
904 found = vget_lane_u32 ((uint32x2_t) vorr_u64 ((uint64x1_t) n,
905 vshr_n_u64 ((uint64x1_t) n, 24)), 0);
906 found &= mask;
908 while (!found);
910 /* FOUND contains 1 in bits for which we matched a relevant
911 character. Conversion to the byte index is trivial. */
912 found = __builtin_ctz (found);
913 return (const uchar *)p + found;
916 #else
918 /* We only have one accelerated alternative. Use a direct call so that
919 we encourage inlining. */
921 #define search_line_fast search_line_acc_char
923 #endif
/* Initialize the lexer if needed.  Only configurations that define
   HAVE_init_vectorized_lexer have anything to set up.  */

void
_cpp_init_lexer (void)
{
#ifdef HAVE_init_vectorized_lexer
  init_vectorized_lexer ();
#endif
}
935 /* Returns with a logical line that contains no escaped newlines or
936 trigraphs. This is a time-critical inner loop. */
937 void
938 _cpp_clean_line (cpp_reader *pfile)
940 cpp_buffer *buffer;
941 const uchar *s;
942 uchar c, *d, *p;
944 buffer = pfile->buffer;
945 buffer->cur_note = buffer->notes_used = 0;
946 buffer->cur = buffer->line_base = buffer->next_line;
947 buffer->need_line = false;
948 s = buffer->next_line;
950 if (!buffer->from_stage3)
952 const uchar *pbackslash = NULL;
954 /* Fast path. This is the common case of an un-escaped line with
955 no trigraphs. The primary win here is by not writing any
956 data back to memory until we have to. */
957 while (1)
959 /* Perform an optimized search for \n, \r, \\, ?. */
960 s = search_line_fast (s, buffer->rlimit);
962 c = *s;
963 if (c == '\\')
965 /* Record the location of the backslash and continue. */
966 pbackslash = s++;
968 else if (__builtin_expect (c == '?', 0))
970 if (__builtin_expect (s[1] == '?', false)
971 && _cpp_trigraph_map[s[2]])
973 /* Have a trigraph. We may or may not have to convert
974 it. Add a line note regardless, for -Wtrigraphs. */
975 add_line_note (buffer, s, s[2]);
976 if (CPP_OPTION (pfile, trigraphs))
978 /* We do, and that means we have to switch to the
979 slow path. */
980 d = (uchar *) s;
981 *d = _cpp_trigraph_map[s[2]];
982 s += 2;
983 goto slow_path;
986 /* Not a trigraph. Continue on fast-path. */
987 s++;
989 else
990 break;
993 /* This must be \r or \n. We're either done, or we'll be forced
994 to write back to the buffer and continue on the slow path. */
995 d = (uchar *) s;
997 if (__builtin_expect (s == buffer->rlimit, false))
998 goto done;
1000 /* DOS line ending? */
1001 if (__builtin_expect (c == '\r', false) && s[1] == '\n')
1003 s++;
1004 if (s == buffer->rlimit)
1005 goto done;
1008 if (__builtin_expect (pbackslash == NULL, true))
1009 goto done;
1011 /* Check for escaped newline. */
1012 p = d;
1013 while (is_nvspace (p[-1]))
1014 p--;
1015 if (p - 1 != pbackslash)
1016 goto done;
1018 /* Have an escaped newline; process it and proceed to
1019 the slow path. */
1020 add_line_note (buffer, p - 1, p != d ? ' ' : '\\');
1021 d = p - 2;
1022 buffer->next_line = p - 1;
1024 slow_path:
1025 while (1)
1027 c = *++s;
1028 *++d = c;
1030 if (c == '\n' || c == '\r')
1032 /* Handle DOS line endings. */
1033 if (c == '\r' && s != buffer->rlimit && s[1] == '\n')
1034 s++;
1035 if (s == buffer->rlimit)
1036 break;
1038 /* Escaped? */
1039 p = d;
1040 while (p != buffer->next_line && is_nvspace (p[-1]))
1041 p--;
1042 if (p == buffer->next_line || p[-1] != '\\')
1043 break;
1045 add_line_note (buffer, p - 1, p != d ? ' ': '\\');
1046 d = p - 2;
1047 buffer->next_line = p - 1;
1049 else if (c == '?' && s[1] == '?' && _cpp_trigraph_map[s[2]])
1051 /* Add a note regardless, for the benefit of -Wtrigraphs. */
1052 add_line_note (buffer, d, s[2]);
1053 if (CPP_OPTION (pfile, trigraphs))
1055 *d = _cpp_trigraph_map[s[2]];
1056 s += 2;
1061 else
1063 while (*s != '\n' && *s != '\r')
1064 s++;
1065 d = (uchar *) s;
1067 /* Handle DOS line endings. */
1068 if (*s == '\r' && s + 1 != buffer->rlimit && s[1] == '\n')
1069 s++;
1072 done:
1073 *d = '\n';
1074 /* A sentinel note that should never be processed. */
1075 add_line_note (buffer, d + 1, '\n');
1076 buffer->next_line = s + 1;
1079 template <bool lexing_raw_string>
1080 static bool get_fresh_line_impl (cpp_reader *pfile);
1082 /* Return true if the trigraph indicated by NOTE should be warned
1083 about in a comment. */
1084 static bool
1085 warn_in_comment (cpp_reader *pfile, _cpp_line_note *note)
1087 const uchar *p;
1089 /* Within comments we don't warn about trigraphs, unless the
1090 trigraph forms an escaped newline, as that may change
1091 behavior. */
1092 if (note->type != '/')
1093 return false;
1095 /* If -trigraphs, then this was an escaped newline iff the next note
1096 is coincident. */
1097 if (CPP_OPTION (pfile, trigraphs))
1098 return note[1].pos == note->pos;
1100 /* Otherwise, see if this forms an escaped newline. */
1101 p = note->pos + 3;
1102 while (is_nvspace (*p))
1103 p++;
1105 /* There might have been escaped newlines between the trigraph and the
1106 newline we found. Hence the position test. */
1107 return (*p == '\n' && p < note[1].pos);
1110 /* Process the notes created by add_line_note as far as the current
1111 location. */
1112 void
1113 _cpp_process_line_notes (cpp_reader *pfile, int in_comment)
1115 cpp_buffer *buffer = pfile->buffer;
1117 for (;;)
1119 _cpp_line_note *note = &buffer->notes[buffer->cur_note];
1120 unsigned int col;
1122 if (note->pos > buffer->cur)
1123 break;
1125 buffer->cur_note++;
1126 col = CPP_BUF_COLUMN (buffer, note->pos + 1);
1128 if (note->type == '\\' || note->type == ' ')
1130 if (note->type == ' ' && !in_comment)
1131 cpp_error_with_line (pfile, CPP_DL_WARNING, pfile->line_table->highest_line, col,
1132 "backslash and newline separated by space");
1134 if (buffer->next_line > buffer->rlimit)
1136 cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line, col,
1137 "backslash-newline at end of file");
1138 /* Prevent "no newline at end of file" warning. */
1139 buffer->next_line = buffer->rlimit;
1142 buffer->line_base = note->pos;
1143 CPP_INCREMENT_LINE (pfile, 0);
1145 else if (_cpp_trigraph_map[note->type])
1147 if (CPP_OPTION (pfile, warn_trigraphs)
1148 && (!in_comment || warn_in_comment (pfile, note)))
1150 if (CPP_OPTION (pfile, trigraphs))
1151 cpp_warning_with_line (pfile, CPP_W_TRIGRAPHS,
1152 pfile->line_table->highest_line, col,
1153 "trigraph ??%c converted to %c",
1154 note->type,
1155 (int) _cpp_trigraph_map[note->type]);
1156 else
1158 cpp_warning_with_line
1159 (pfile, CPP_W_TRIGRAPHS,
1160 pfile->line_table->highest_line, col,
1161 "trigraph ??%c ignored, use -trigraphs to enable",
1162 note->type);
1166 else if (note->type == 0)
1167 /* Already processed in lex_raw_string. */;
1168 else
1169 abort ();
1173 namespace bidi {
1174 enum class kind {
1175 NONE, LRE, RLE, LRO, RLO, LRI, RLI, FSI, PDF, PDI, LTR, RTL
1178 /* All the UTF-8 encodings of bidi characters start with E2. */
1179 constexpr uchar utf8_start = 0xe2;
1181 struct context
1183 context () {}
1184 context (location_t loc, kind k, bool pdf, bool ucn)
1185 : m_loc (loc), m_kind (k), m_pdf (pdf), m_ucn (ucn)
1189 kind get_pop_kind () const
1191 return m_pdf ? kind::PDF : kind::PDI;
1193 bool ucn_p () const
1195 return m_ucn;
1198 location_t m_loc;
1199 kind m_kind;
1200 unsigned m_pdf : 1;
1201 unsigned m_ucn : 1;
1204 /* A vector holding currently open bidi contexts. We use a char for
1205 each context, its LSB is 1 if it represents a PDF context, 0 if it
1206 represents a PDI context. The next bit is 1 if this context was open
1207 by a bidi character written as a UCN, and 0 when it was UTF-8. */
1208 semi_embedded_vec <context, 16> vec;
1210 /* Close the whole comment/identifier/string literal/character constant
1211 context. */
1212 void on_close ()
1214 vec.truncate (0);
1217 /* Pop the last element in the vector. */
1218 void pop ()
1220 unsigned int len = vec.count ();
1221 gcc_checking_assert (len > 0);
1222 vec.truncate (len - 1);
1225 /* Return the pop kind of the context of the Ith element. */
1226 kind pop_kind_at (unsigned int i)
1228 return vec[i].get_pop_kind ();
1231 /* Return the pop kind of the context that is currently opened. */
1232 kind current_ctx ()
1234 unsigned int len = vec.count ();
1235 if (len == 0)
1236 return kind::NONE;
1237 return vec[len - 1].get_pop_kind ();
1240 /* Return true if the current context comes from a UCN origin, that is,
1241 the bidi char which started this bidi context was written as a UCN. */
1242 bool current_ctx_ucn_p ()
1244 unsigned int len = vec.count ();
1245 gcc_checking_assert (len > 0);
1246 return vec[len - 1].m_ucn;
1249 location_t current_ctx_loc ()
1251 unsigned int len = vec.count ();
1252 gcc_checking_assert (len > 0);
1253 return vec[len - 1].m_loc;
1256 /* We've read a bidi char, update the current vector as necessary.
1257 LOC is only valid when K is not kind::NONE. */
1258 void on_char (kind k, bool ucn_p, location_t loc)
1260 switch (k)
1262 case kind::LRE:
1263 case kind::RLE:
1264 case kind::LRO:
1265 case kind::RLO:
1266 vec.push (context (loc, k, true, ucn_p));
1267 break;
1268 case kind::LRI:
1269 case kind::RLI:
1270 case kind::FSI:
1271 vec.push (context (loc, k, false, ucn_p));
1272 break;
1273 /* PDF terminates the scope of the last LRE, RLE, LRO, or RLO
1274 whose scope has not yet been terminated. */
1275 case kind::PDF:
1276 if (current_ctx () == kind::PDF)
1277 pop ();
1278 break;
1279 /* PDI terminates the scope of the last LRI, RLI, or FSI whose
1280 scope has not yet been terminated, as well as the scopes of
1281 any subsequent LREs, RLEs, LROs, or RLOs whose scopes have not
1282 yet been terminated. */
1283 case kind::PDI:
1284 for (int i = vec.count () - 1; i >= 0; --i)
1285 if (pop_kind_at (i) == kind::PDI)
1287 vec.truncate (i);
1288 break;
1290 break;
1291 case kind::LTR:
1292 case kind::RTL:
1293 /* These aren't popped by a PDF/PDI. */
1294 break;
1295 ATTR_LIKELY case kind::NONE:
1296 break;
1297 default:
1298 abort ();
1302 /* Return a descriptive string for K. */
1303 const char *to_str (kind k)
1305 switch (k)
1307 case kind::LRE:
1308 return "U+202A (LEFT-TO-RIGHT EMBEDDING)";
1309 case kind::RLE:
1310 return "U+202B (RIGHT-TO-LEFT EMBEDDING)";
1311 case kind::LRO:
1312 return "U+202D (LEFT-TO-RIGHT OVERRIDE)";
1313 case kind::RLO:
1314 return "U+202E (RIGHT-TO-LEFT OVERRIDE)";
1315 case kind::LRI:
1316 return "U+2066 (LEFT-TO-RIGHT ISOLATE)";
1317 case kind::RLI:
1318 return "U+2067 (RIGHT-TO-LEFT ISOLATE)";
1319 case kind::FSI:
1320 return "U+2068 (FIRST STRONG ISOLATE)";
1321 case kind::PDF:
1322 return "U+202C (POP DIRECTIONAL FORMATTING)";
1323 case kind::PDI:
1324 return "U+2069 (POP DIRECTIONAL ISOLATE)";
1325 case kind::LTR:
1326 return "U+200E (LEFT-TO-RIGHT MARK)";
1327 case kind::RTL:
1328 return "U+200F (RIGHT-TO-LEFT MARK)";
1329 default:
1330 abort ();
1335 /* Get location_t for the range of bytes [START, START + NUM_BYTES)
1336 within the current line in FILE, with the caret at START. */
1338 static location_t
1339 get_location_for_byte_range_in_cur_line (cpp_reader *pfile,
1340 const unsigned char *const start,
1341 size_t num_bytes)
1343 gcc_checking_assert (num_bytes > 0);
1345 /* CPP_BUF_COLUMN and linemap_position_for_column both refer
1346 to offsets in bytes, but CPP_BUF_COLUMN is 0-based,
1347 whereas linemap_position_for_column is 1-based. */
1349 /* Get 0-based offsets within the line. */
1350 size_t start_offset = CPP_BUF_COLUMN (pfile->buffer, start);
1351 size_t end_offset = start_offset + num_bytes - 1;
1353 /* Now convert to location_t, where "columns" are 1-based byte offsets. */
1354 location_t start_loc = linemap_position_for_column (pfile->line_table,
1355 start_offset + 1);
1356 location_t end_loc = linemap_position_for_column (pfile->line_table,
1357 end_offset + 1);
1359 if (start_loc == end_loc)
1360 return start_loc;
1362 source_range src_range;
1363 src_range.m_start = start_loc;
1364 src_range.m_finish = end_loc;
1365 location_t combined_loc = COMBINE_LOCATION_DATA (pfile->line_table,
1366 start_loc,
1367 src_range,
1368 NULL,
1370 return combined_loc;
1373 /* Parse a sequence of 3 bytes starting with P and return its bidi code. */
1375 static bidi::kind
1376 get_bidi_utf8_1 (const unsigned char *const p)
1378 gcc_checking_assert (p[0] == bidi::utf8_start);
1380 if (p[1] == 0x80)
1381 switch (p[2])
1383 case 0xaa:
1384 return bidi::kind::LRE;
1385 case 0xab:
1386 return bidi::kind::RLE;
1387 case 0xac:
1388 return bidi::kind::PDF;
1389 case 0xad:
1390 return bidi::kind::LRO;
1391 case 0xae:
1392 return bidi::kind::RLO;
1393 case 0x8e:
1394 return bidi::kind::LTR;
1395 case 0x8f:
1396 return bidi::kind::RTL;
1397 default:
1398 break;
1400 else if (p[1] == 0x81)
1401 switch (p[2])
1403 case 0xa6:
1404 return bidi::kind::LRI;
1405 case 0xa7:
1406 return bidi::kind::RLI;
1407 case 0xa8:
1408 return bidi::kind::FSI;
1409 case 0xa9:
1410 return bidi::kind::PDI;
1411 default:
1412 break;
1415 return bidi::kind::NONE;
1418 /* Parse a sequence of 3 bytes starting with P and return its bidi code.
1419 If the kind is not NONE, write the location to *OUT.*/
1421 static bidi::kind
1422 get_bidi_utf8 (cpp_reader *pfile, const unsigned char *const p, location_t *out)
1424 bidi::kind result = get_bidi_utf8_1 (p);
1425 if (result != bidi::kind::NONE)
1427 /* We have a sequence of 3 bytes starting at P. */
1428 *out = get_location_for_byte_range_in_cur_line (pfile, p, 3);
1430 return result;
1433 /* Parse a UCN where P points just past \u or \U and return its bidi code. */
1435 static bidi::kind
1436 get_bidi_ucn_1 (const unsigned char *p, bool is_U, const unsigned char **end)
1438 /* 6.4.3 Universal Character Names
1439 \u hex-quad
1440 \U hex-quad hex-quad
1441 \u { simple-hexadecimal-digit-sequence }
1442 where \unnnn means \U0000nnnn. */
1444 *end = p + 4;
1445 if (is_U)
1447 if (p[0] != '0' || p[1] != '0' || p[2] != '0' || p[3] != '0')
1448 return bidi::kind::NONE;
1449 /* Skip 4B so we can treat \u and \U the same below. */
1450 p += 4;
1451 *end += 4;
1453 else if (p[0] == '{')
1455 p++;
1456 while (*p == '0')
1457 p++;
1458 if (p[0] != '2'
1459 || p[1] != '0'
1460 || !ISXDIGIT (p[2])
1461 || !ISXDIGIT (p[3])
1462 || p[4] != '}')
1463 return bidi::kind::NONE;
1464 *end = p + 5;
1467 /* All code points we are looking for start with 20xx. */
1468 if (p[0] != '2' || p[1] != '0')
1469 return bidi::kind::NONE;
1470 else if (p[2] == '2')
1471 switch (p[3])
1473 case 'a':
1474 case 'A':
1475 return bidi::kind::LRE;
1476 case 'b':
1477 case 'B':
1478 return bidi::kind::RLE;
1479 case 'c':
1480 case 'C':
1481 return bidi::kind::PDF;
1482 case 'd':
1483 case 'D':
1484 return bidi::kind::LRO;
1485 case 'e':
1486 case 'E':
1487 return bidi::kind::RLO;
1488 default:
1489 break;
1491 else if (p[2] == '6')
1492 switch (p[3])
1494 case '6':
1495 return bidi::kind::LRI;
1496 case '7':
1497 return bidi::kind::RLI;
1498 case '8':
1499 return bidi::kind::FSI;
1500 case '9':
1501 return bidi::kind::PDI;
1502 default:
1503 break;
1505 else if (p[2] == '0')
1506 switch (p[3])
1508 case 'e':
1509 case 'E':
1510 return bidi::kind::LTR;
1511 case 'f':
1512 case 'F':
1513 return bidi::kind::RTL;
1514 default:
1515 break;
1518 return bidi::kind::NONE;
1521 /* Parse a UCN where P points just past \u or \U and return its bidi code.
1522 If the kind is not NONE, write the location to *OUT. */
1524 static bidi::kind
1525 get_bidi_ucn (cpp_reader *pfile, const unsigned char *p, bool is_U,
1526 location_t *out)
1528 const unsigned char *end;
1529 bidi::kind result = get_bidi_ucn_1 (p, is_U, &end);
1530 if (result != bidi::kind::NONE)
1532 const unsigned char *start = p - 2;
1533 size_t num_bytes = end - start;
1534 *out = get_location_for_byte_range_in_cur_line (pfile, start, num_bytes);
1536 return result;
1539 /* Parse a named universal character escape where P points just past \N and
1540 return its bidi code. If the kind is not NONE, write the location to
1541 *OUT. */
1543 static bidi::kind
1544 get_bidi_named (cpp_reader *pfile, const unsigned char *p, location_t *out)
1546 bidi::kind result = bidi::kind::NONE;
1547 if (*p != '{')
1548 return bidi::kind::NONE;
1549 if (strncmp ((const char *) (p + 1), "LEFT-TO-RIGHT ", 14) == 0)
1551 if (strncmp ((const char *) (p + 15), "MARK}", 5) == 0)
1552 result = bidi::kind::LTR;
1553 else if (strncmp ((const char *) (p + 15), "EMBEDDING}", 10) == 0)
1554 result = bidi::kind::LRE;
1555 else if (strncmp ((const char *) (p + 15), "OVERRIDE}", 9) == 0)
1556 result = bidi::kind::LRO;
1557 else if (strncmp ((const char *) (p + 15), "ISOLATE}", 8) == 0)
1558 result = bidi::kind::LRI;
1560 else if (strncmp ((const char *) (p + 1), "RIGHT-TO-LEFT ", 14) == 0)
1562 if (strncmp ((const char *) (p + 15), "MARK}", 5) == 0)
1563 result = bidi::kind::RTL;
1564 else if (strncmp ((const char *) (p + 15), "EMBEDDING}", 10) == 0)
1565 result = bidi::kind::RLE;
1566 else if (strncmp ((const char *) (p + 15), "OVERRIDE}", 9) == 0)
1567 result = bidi::kind::RLO;
1568 else if (strncmp ((const char *) (p + 15), "ISOLATE}", 8) == 0)
1569 result = bidi::kind::RLI;
1571 else if (strncmp ((const char *) (p + 1), "POP DIRECTIONAL ", 16) == 0)
1573 if (strncmp ((const char *) (p + 16), "FORMATTING}", 11) == 0)
1574 result = bidi::kind::PDF;
1575 else if (strncmp ((const char *) (p + 16), "ISOLATE}", 8) == 0)
1576 result = bidi::kind::PDI;
1578 else if (strncmp ((const char *) (p + 1), "FIRST STRONG ISOLATE}", 21) == 0)
1579 result = bidi::kind::FSI;
1580 if (result != bidi::kind::NONE)
1581 *out = get_location_for_byte_range_in_cur_line (pfile, p - 2,
1582 (strchr ((const char *)
1583 (p + 1), '}')
1584 - (const char *) p)
1585 + 3);
1586 return result;
1589 /* Subclass of rich_location for reporting on unpaired UTF-8
1590 bidirectional control character(s).
1591 Escape the source lines on output, and show all unclosed
1592 bidi context, labelling everything. */
1594 class unpaired_bidi_rich_location : public rich_location
1596 public:
1597 class custom_range_label : public range_label
1599 public:
1600 label_text get_text (unsigned range_idx) const final override
1602 /* range 0 is the primary location; each subsequent range i + 1
1603 is for bidi::vec[i]. */
1604 if (range_idx > 0)
1606 const bidi::context &ctxt (bidi::vec[range_idx - 1]);
1607 return label_text::borrow (bidi::to_str (ctxt.m_kind));
1609 else
1610 return label_text::borrow (_("end of bidirectional context"));
1614 unpaired_bidi_rich_location (cpp_reader *pfile, location_t loc)
1615 : rich_location (pfile->line_table, loc, &m_custom_label)
1617 set_escape_on_output (true);
1618 for (unsigned i = 0; i < bidi::vec.count (); i++)
1619 add_range (bidi::vec[i].m_loc,
1620 SHOW_RANGE_WITHOUT_CARET,
1621 &m_custom_label);
1624 private:
1625 custom_range_label m_custom_label;
1628 /* We're closing a bidi context, that is, we've encountered a newline,
1629 are closing a C-style comment, or are at the end of a string literal,
1630 character constant, or identifier. Warn if this context was not
1631 properly terminated by a PDI or PDF. P points to the last character
1632 in this context. */
1634 static void
1635 maybe_warn_bidi_on_close (cpp_reader *pfile, const uchar *p)
1637 const auto warn_bidi = CPP_OPTION (pfile, cpp_warn_bidirectional);
1638 if (bidi::vec.count () > 0
1639 && (warn_bidi & bidirectional_unpaired
1640 && (!bidi::current_ctx_ucn_p ()
1641 || (warn_bidi & bidirectional_ucn))))
1643 const location_t loc
1644 = linemap_position_for_column (pfile->line_table,
1645 CPP_BUF_COLUMN (pfile->buffer, p));
1646 unpaired_bidi_rich_location rich_loc (pfile, loc);
1647 /* cpp_callbacks doesn't yet have a way to handle singular vs plural
1648 forms of a diagnostic, so fake it for now. */
1649 if (bidi::vec.count () > 1)
1650 cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
1651 "unpaired UTF-8 bidirectional control characters "
1652 "detected");
1653 else
1654 cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
1655 "unpaired UTF-8 bidirectional control character "
1656 "detected");
1658 /* We're done with this context. */
1659 bidi::on_close ();
1662 /* We're at the beginning or in the middle of an identifier/comment/string
1663 literal/character constant. Warn if we've encountered a bidi character.
1664 KIND says which bidi control character it was; UCN_P is true iff this bidi
1665 control character was written as a UCN. LOC is the location of the
1666 character, but is only valid if KIND != bidi::kind::NONE. */
1668 static void
1669 maybe_warn_bidi_on_char (cpp_reader *pfile, bidi::kind kind,
1670 bool ucn_p, location_t loc)
1672 if (__builtin_expect (kind == bidi::kind::NONE, 1))
1673 return;
1675 const auto warn_bidi = CPP_OPTION (pfile, cpp_warn_bidirectional);
1677 if (warn_bidi & (bidirectional_unpaired|bidirectional_any))
1679 rich_location rich_loc (pfile->line_table, loc);
1680 rich_loc.set_escape_on_output (true);
1682 /* It seems excessive to warn about a PDI/PDF that is closing
1683 an opened context because we've already warned about the
1684 opening character. Except warn when we have a UCN x UTF-8
1685 mismatch, if UCN checking is enabled. */
1686 if (kind == bidi::current_ctx ())
1688 if (warn_bidi == (bidirectional_unpaired|bidirectional_ucn)
1689 && bidi::current_ctx_ucn_p () != ucn_p)
1691 rich_loc.add_range (bidi::current_ctx_loc ());
1692 cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
1693 "UTF-8 vs UCN mismatch when closing "
1694 "a context by \"%s\"", bidi::to_str (kind));
1697 else if (warn_bidi & bidirectional_any
1698 && (!ucn_p || (warn_bidi & bidirectional_ucn)))
1700 if (kind == bidi::kind::PDF || kind == bidi::kind::PDI)
1701 cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
1702 "\"%s\" is closing an unopened context",
1703 bidi::to_str (kind));
1704 else
1705 cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
1706 "found problematic Unicode character \"%s\"",
1707 bidi::to_str (kind));
1710 /* We're done with this context. */
1711 bidi::on_char (kind, ucn_p, loc);
/* Byte-value thresholds used when inspecting UTF-8 input in this file:
   a byte in [utf8_continuation, utf8_signifier) is accepted as a
   trailing byte, and a byte >= utf8_signifier is treated as starting a
   multi-byte sequence.  */
static const cppchar_t utf8_continuation = 0x80;
static const cppchar_t utf8_signifier = 0xC0;
1717 /* Emit -Winvalid-utf8 warning on invalid UTF-8 character starting
1718 at PFILE->buffer->cur. Return a pointer after the diagnosed
1719 invalid character. */
1721 static const uchar *
1722 _cpp_warn_invalid_utf8 (cpp_reader *pfile)
1724 cpp_buffer *buffer = pfile->buffer;
1725 const uchar *cur = buffer->cur;
1726 bool pedantic = (CPP_PEDANTIC (pfile)
1727 && CPP_OPTION (pfile, cpp_warn_invalid_utf8) == 2);
1729 if (cur[0] < utf8_signifier
1730 || cur[1] < utf8_continuation || cur[1] >= utf8_signifier)
1732 if (pedantic)
1733 cpp_error_with_line (pfile, CPP_DL_PEDWARN,
1734 pfile->line_table->highest_line,
1735 CPP_BUF_COL (buffer),
1736 "invalid UTF-8 character <%x>",
1737 cur[0]);
1738 else
1739 cpp_warning_with_line (pfile, CPP_W_INVALID_UTF8,
1740 pfile->line_table->highest_line,
1741 CPP_BUF_COL (buffer),
1742 "invalid UTF-8 character <%x>",
1743 cur[0]);
1744 return cur + 1;
1746 else if (cur[2] < utf8_continuation || cur[2] >= utf8_signifier)
1748 if (pedantic)
1749 cpp_error_with_line (pfile, CPP_DL_PEDWARN,
1750 pfile->line_table->highest_line,
1751 CPP_BUF_COL (buffer),
1752 "invalid UTF-8 character <%x><%x>",
1753 cur[0], cur[1]);
1754 else
1755 cpp_warning_with_line (pfile, CPP_W_INVALID_UTF8,
1756 pfile->line_table->highest_line,
1757 CPP_BUF_COL (buffer),
1758 "invalid UTF-8 character <%x><%x>",
1759 cur[0], cur[1]);
1760 return cur + 2;
1762 else if (cur[3] < utf8_continuation || cur[3] >= utf8_signifier)
1764 if (pedantic)
1765 cpp_error_with_line (pfile, CPP_DL_PEDWARN,
1766 pfile->line_table->highest_line,
1767 CPP_BUF_COL (buffer),
1768 "invalid UTF-8 character <%x><%x><%x>",
1769 cur[0], cur[1], cur[2]);
1770 else
1771 cpp_warning_with_line (pfile, CPP_W_INVALID_UTF8,
1772 pfile->line_table->highest_line,
1773 CPP_BUF_COL (buffer),
1774 "invalid UTF-8 character <%x><%x><%x>",
1775 cur[0], cur[1], cur[2]);
1776 return cur + 3;
1778 else
1780 if (pedantic)
1781 cpp_error_with_line (pfile, CPP_DL_PEDWARN,
1782 pfile->line_table->highest_line,
1783 CPP_BUF_COL (buffer),
1784 "invalid UTF-8 character <%x><%x><%x><%x>",
1785 cur[0], cur[1], cur[2], cur[3]);
1786 else
1787 cpp_warning_with_line (pfile, CPP_W_INVALID_UTF8,
1788 pfile->line_table->highest_line,
1789 CPP_BUF_COL (buffer),
1790 "invalid UTF-8 character <%x><%x><%x><%x>",
1791 cur[0], cur[1], cur[2], cur[3]);
1792 return cur + 4;
1796 /* Helper function of *skip_*_comment and lex*_string. For C,
1797 character at CUR[-1] with MSB set handle -Wbidi-chars* and
1798 -Winvalid-utf8 diagnostics and return pointer to first character
1799 that should be processed next. */
1801 static inline const uchar *
1802 _cpp_handle_multibyte_utf8 (cpp_reader *pfile, uchar c,
1803 const uchar *cur, bool warn_bidi_p,
1804 bool warn_invalid_utf8_p)
1806 /* If this is a beginning of a UTF-8 encoding, it might be
1807 a bidirectional control character. */
1808 if (c == bidi::utf8_start && warn_bidi_p)
1810 location_t loc;
1811 bidi::kind kind = get_bidi_utf8 (pfile, cur - 1, &loc);
1812 maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
1814 if (!warn_invalid_utf8_p)
1815 return cur;
1816 if (c >= utf8_signifier)
1818 cppchar_t s;
1819 const uchar *pstr = cur - 1;
1820 if (_cpp_valid_utf8 (pfile, &pstr, pfile->buffer->rlimit, 0, NULL, &s)
1821 && s <= UCS_LIMIT)
1822 return pstr;
1824 pfile->buffer->cur = cur - 1;
1825 return _cpp_warn_invalid_utf8 (pfile);
1828 /* Skip a C-style block comment. We find the end of the comment by
1829 seeing if an asterisk is before every '/' we encounter. Returns
1830 nonzero if comment terminated by EOF, zero otherwise.
1832 Buffer->cur points to the initial asterisk of the comment. */
1833 bool
1834 _cpp_skip_block_comment (cpp_reader *pfile)
1836 cpp_buffer *buffer = pfile->buffer;
1837 const uchar *cur = buffer->cur;
1838 uchar c;
1839 const bool warn_bidi_p = pfile->warn_bidi_p ();
1840 const bool warn_invalid_utf8_p = CPP_OPTION (pfile, cpp_warn_invalid_utf8);
1841 const bool warn_bidi_or_invalid_utf8_p = warn_bidi_p | warn_invalid_utf8_p;
1843 cur++;
1844 if (*cur == '/')
1845 cur++;
1847 for (;;)
1849 /* People like decorating comments with '*', so check for '/'
1850 instead for efficiency. */
1851 c = *cur++;
1853 if (c == '/')
1855 if (cur[-2] == '*')
1857 if (warn_bidi_p)
1858 maybe_warn_bidi_on_close (pfile, cur);
1859 break;
1862 /* Warn about potential nested comments, but not if the '/'
1863 comes immediately before the true comment delimiter.
1864 Don't bother to get it right across escaped newlines. */
1865 if (CPP_OPTION (pfile, warn_comments)
1866 && cur[0] == '*' && cur[1] != '/')
1868 buffer->cur = cur;
1869 cpp_warning_with_line (pfile, CPP_W_COMMENTS,
1870 pfile->line_table->highest_line,
1871 CPP_BUF_COL (buffer),
1872 "\"/*\" within comment");
1875 else if (c == '\n')
1877 unsigned int cols;
1878 buffer->cur = cur - 1;
1879 if (warn_bidi_p)
1880 maybe_warn_bidi_on_close (pfile, cur);
1881 _cpp_process_line_notes (pfile, true);
1882 if (buffer->next_line >= buffer->rlimit)
1883 return true;
1884 _cpp_clean_line (pfile);
1886 cols = buffer->next_line - buffer->line_base;
1887 CPP_INCREMENT_LINE (pfile, cols);
1889 cur = buffer->cur;
1891 else if (__builtin_expect (c >= utf8_continuation, 0)
1892 && warn_bidi_or_invalid_utf8_p)
1893 cur = _cpp_handle_multibyte_utf8 (pfile, c, cur, warn_bidi_p,
1894 warn_invalid_utf8_p);
1897 buffer->cur = cur;
1898 _cpp_process_line_notes (pfile, true);
1899 return false;
1902 /* Skip a C++ line comment, leaving buffer->cur pointing to the
1903 terminating newline. Handles escaped newlines. Returns nonzero
1904 if a multiline comment. */
1905 static int
1906 skip_line_comment (cpp_reader *pfile)
1908 cpp_buffer *buffer = pfile->buffer;
1909 location_t orig_line = pfile->line_table->highest_line;
1910 const bool warn_bidi_p = pfile->warn_bidi_p ();
1911 const bool warn_invalid_utf8_p = CPP_OPTION (pfile, cpp_warn_invalid_utf8);
1912 const bool warn_bidi_or_invalid_utf8_p = warn_bidi_p | warn_invalid_utf8_p;
1914 if (!warn_bidi_or_invalid_utf8_p)
1915 while (*buffer->cur != '\n')
1916 buffer->cur++;
1917 else if (!warn_invalid_utf8_p)
1919 while (*buffer->cur != '\n'
1920 && *buffer->cur != bidi::utf8_start)
1921 buffer->cur++;
1922 if (__builtin_expect (*buffer->cur == bidi::utf8_start, 0))
1924 while (*buffer->cur != '\n')
1926 if (__builtin_expect (*buffer->cur == bidi::utf8_start, 0))
1928 location_t loc;
1929 bidi::kind kind = get_bidi_utf8 (pfile, buffer->cur, &loc);
1930 maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
1932 buffer->cur++;
1934 maybe_warn_bidi_on_close (pfile, buffer->cur);
1937 else
1939 while (*buffer->cur != '\n')
1941 if (*buffer->cur < utf8_continuation)
1943 buffer->cur++;
1944 continue;
1946 buffer->cur
1947 = _cpp_handle_multibyte_utf8 (pfile, *buffer->cur, buffer->cur + 1,
1948 warn_bidi_p, warn_invalid_utf8_p);
1950 if (warn_bidi_p)
1951 maybe_warn_bidi_on_close (pfile, buffer->cur);
1954 _cpp_process_line_notes (pfile, true);
1955 return orig_line != pfile->line_table->highest_line;
1958 /* Skips whitespace, saving the next non-whitespace character. */
1959 static void
1960 skip_whitespace (cpp_reader *pfile, cppchar_t c)
1962 cpp_buffer *buffer = pfile->buffer;
1963 bool saw_NUL = false;
1967 /* Horizontal space always OK. */
1968 if (c == ' ' || c == '\t')
1970 /* Just \f \v or \0 left. */
1971 else if (c == '\0')
1972 saw_NUL = true;
1973 else if (pfile->state.in_directive && CPP_PEDANTIC (pfile))
1974 cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line,
1975 CPP_BUF_COL (buffer),
1976 "%s in preprocessing directive",
1977 c == '\f' ? "form feed" : "vertical tab");
1979 c = *buffer->cur++;
1981 /* We only want non-vertical space, i.e. ' ' \t \f \v \0. */
1982 while (is_nvspace (c));
1984 if (saw_NUL)
1986 encoding_rich_location rich_loc (pfile);
1987 cpp_error_at (pfile, CPP_DL_WARNING, &rich_loc,
1988 "null character(s) ignored");
1991 buffer->cur--;
1994 /* See if the characters of a number token are valid in a name (no
1995 '.', '+' or '-'). */
1996 static int
1997 name_p (cpp_reader *pfile, const cpp_string *string)
1999 unsigned int i;
2001 for (i = 0; i < string->len; i++)
2002 if (!is_idchar (string->text[i]))
2003 return 0;
2005 return 1;
2008 /* After parsing an identifier or other sequence, produce a warning about
2009 sequences not in NFC/NFKC. */
2010 static void
2011 warn_about_normalization (cpp_reader *pfile,
2012 const cpp_token *token,
2013 const struct normalize_state *s,
2014 bool identifier)
2016 if (CPP_OPTION (pfile, warn_normalize) < NORMALIZE_STATE_RESULT (s)
2017 && !pfile->state.skipping)
2019 location_t loc = token->src_loc;
2021 /* If possible, create a location range for the token. */
2022 if (loc >= RESERVED_LOCATION_COUNT
2023 && token->type != CPP_EOF
2024 /* There must be no line notes to process. */
2025 && (!(pfile->buffer->cur
2026 >= pfile->buffer->notes[pfile->buffer->cur_note].pos
2027 && !pfile->overlaid_buffer)))
2029 source_range tok_range;
2030 tok_range.m_start = loc;
2031 tok_range.m_finish
2032 = linemap_position_for_column (pfile->line_table,
2033 CPP_BUF_COLUMN (pfile->buffer,
2034 pfile->buffer->cur));
2035 loc = COMBINE_LOCATION_DATA (pfile->line_table,
2036 loc, tok_range, NULL, 0);
2039 encoding_rich_location rich_loc (pfile, loc);
2041 /* Make sure that the token is printed using UCNs, even
2042 if we'd otherwise happily print UTF-8. */
2043 unsigned char *buf = XNEWVEC (unsigned char, cpp_token_len (token));
2044 size_t sz;
2046 sz = cpp_spell_token (pfile, token, buf, false) - buf;
2047 if (NORMALIZE_STATE_RESULT (s) == normalized_C)
2048 cpp_warning_at (pfile, CPP_W_NORMALIZE, &rich_loc,
2049 "`%.*s' is not in NFKC", (int) sz, buf);
2050 else if (identifier && CPP_OPTION (pfile, xid_identifiers))
2051 cpp_pedwarning_at (pfile, CPP_W_NORMALIZE, &rich_loc,
2052 "`%.*s' is not in NFC", (int) sz, buf);
2053 else
2054 cpp_warning_at (pfile, CPP_W_NORMALIZE, &rich_loc,
2055 "`%.*s' is not in NFC", (int) sz, buf);
2056 free (buf);
2060 /* Returns TRUE if the byte sequence starting at buffer->cur is a valid
2061 extended character in an identifier. If FIRST is TRUE, then the character
2062 must be valid at the beginning of an identifier as well. If the return
2063 value is TRUE, then pfile->buffer->cur has been moved to point to the next
2064 byte after the extended character. */
2066 static bool
2067 forms_identifier_p (cpp_reader *pfile, int first,
2068 struct normalize_state *state)
2070 cpp_buffer *buffer = pfile->buffer;
2071 const bool warn_bidi_p = pfile->warn_bidi_p ();
2073 if (*buffer->cur == '$')
2075 if (!CPP_OPTION (pfile, dollars_in_ident))
2076 return false;
2078 buffer->cur++;
2079 if (CPP_OPTION (pfile, warn_dollars) && !pfile->state.skipping)
2081 CPP_OPTION (pfile, warn_dollars) = 0;
2082 cpp_error (pfile, CPP_DL_PEDWARN, "'$' in identifier or number");
2085 return true;
2088 /* Is this a syntactically valid UCN or a valid UTF-8 char? */
2089 if (CPP_OPTION (pfile, extended_identifiers))
2091 cppchar_t s;
2092 if (*buffer->cur >= utf8_signifier)
2094 if (__builtin_expect (*buffer->cur == bidi::utf8_start, 0)
2095 && warn_bidi_p)
2097 location_t loc;
2098 bidi::kind kind = get_bidi_utf8 (pfile, buffer->cur, &loc);
2099 maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
2101 if (_cpp_valid_utf8 (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
2102 state, &s))
2103 return true;
2105 else if (*buffer->cur == '\\'
2106 && (buffer->cur[1] == 'u'
2107 || buffer->cur[1] == 'U'
2108 || buffer->cur[1] == 'N'))
2110 buffer->cur += 2;
2111 if (warn_bidi_p)
2113 location_t loc;
2114 bidi::kind kind;
2115 if (buffer->cur[-1] == 'N')
2116 kind = get_bidi_named (pfile, buffer->cur, &loc);
2117 else
2118 kind = get_bidi_ucn (pfile, buffer->cur,
2119 buffer->cur[-1] == 'U', &loc);
2120 maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/true, loc);
2122 if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
2123 state, &s, NULL, NULL))
2124 return true;
2125 buffer->cur -= 2;
2129 return false;
2132 /* Helper function to issue error about improper __VA_OPT__ use. */
2133 static void
2134 maybe_va_opt_error (cpp_reader *pfile)
2136 if (CPP_PEDANTIC (pfile) && !CPP_OPTION (pfile, va_opt))
2138 /* __VA_OPT__ should not be accepted at all, but allow it in
2139 system headers. */
2140 if (!_cpp_in_system_header (pfile))
2142 if (CPP_OPTION (pfile, cplusplus))
2143 cpp_error (pfile, CPP_DL_PEDWARN,
2144 "__VA_OPT__ is not available until C++20");
2145 else
2146 cpp_error (pfile, CPP_DL_PEDWARN,
2147 "__VA_OPT__ is not available until C2X");
2150 else if (!pfile->state.va_args_ok)
2152 /* __VA_OPT__ should only appear in the replacement list of a
2153 variadic macro. */
2154 cpp_error (pfile, CPP_DL_PEDWARN,
2155 "__VA_OPT__ can only appear in the expansion"
2156 " of a C++20 variadic macro");
2160 /* Helper function to perform diagnostics that are needed (rarely)
2161 when an identifier is lexed. */
2162 static void
2163 identifier_diagnostics_on_lex (cpp_reader *pfile, cpp_hashnode *node)
2165 if (__builtin_expect (!(node->flags & NODE_DIAGNOSTIC)
2166 || pfile->state.skipping, 1))
2167 return;
2169 /* It is allowed to poison the same identifier twice. */
2170 if ((node->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
2171 cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
2172 NODE_NAME (node));
2174 /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
2175 replacement list of a variadic macro. */
2176 if (node == pfile->spec_nodes.n__VA_ARGS__
2177 && !pfile->state.va_args_ok)
2179 if (CPP_OPTION (pfile, cplusplus))
2180 cpp_error (pfile, CPP_DL_PEDWARN,
2181 "__VA_ARGS__ can only appear in the expansion"
2182 " of a C++11 variadic macro");
2183 else
2184 cpp_error (pfile, CPP_DL_PEDWARN,
2185 "__VA_ARGS__ can only appear in the expansion"
2186 " of a C99 variadic macro");
2189 /* __VA_OPT__ should only appear in the replacement list of a
2190 variadic macro. */
2191 if (node == pfile->spec_nodes.n__VA_OPT__)
2192 maybe_va_opt_error (pfile);
2194 /* For -Wc++-compat, warn about use of C++ named operators. */
2195 if (node->flags & NODE_WARN_OPERATOR)
2196 cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
2197 "identifier \"%s\" is a special operator name in C++",
2198 NODE_NAME (node));
2201 /* Helper function to get the cpp_hashnode of the identifier BASE. */
2202 static cpp_hashnode *
2203 lex_identifier_intern (cpp_reader *pfile, const uchar *base)
2205 cpp_hashnode *result;
2206 const uchar *cur;
2207 unsigned int len;
2208 unsigned int hash = HT_HASHSTEP (0, *base);
2210 cur = base + 1;
2211 while (ISIDNUM (*cur))
2213 hash = HT_HASHSTEP (hash, *cur);
2214 cur++;
2216 len = cur - base;
2217 hash = HT_HASHFINISH (hash, len);
2218 result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
2219 base, len, hash, HT_ALLOC));
2220 identifier_diagnostics_on_lex (pfile, result);
2221 return result;
2224 /* Get the cpp_hashnode of an identifier specified by NAME in
2225 the current cpp_reader object. If none is found, NULL is returned. */
2226 cpp_hashnode *
2227 _cpp_lex_identifier (cpp_reader *pfile, const char *name)
2229 cpp_hashnode *result;
2230 result = lex_identifier_intern (pfile, (uchar *) name);
2231 return result;
2234 /* Lex an identifier starting at BASE. BUFFER->CUR is expected to point
2235 one past the first character at BASE, which may be a (possibly multi-byte)
2236 character if STARTS_UCN is true. */
2237 static cpp_hashnode *
2238 lex_identifier (cpp_reader *pfile, const uchar *base, bool starts_ucn,
2239 struct normalize_state *nst, cpp_hashnode **spelling)
2241 cpp_hashnode *result;
2242 const uchar *cur;
2243 unsigned int len;
2244 unsigned int hash = HT_HASHSTEP (0, *base);
2245 const bool warn_bidi_p = pfile->warn_bidi_p ();
2247 cur = pfile->buffer->cur;
2248 if (! starts_ucn)
2250 while (ISIDNUM (*cur))
2252 hash = HT_HASHSTEP (hash, *cur);
2253 cur++;
2255 NORMALIZE_STATE_UPDATE_IDNUM (nst, *(cur - 1));
2257 pfile->buffer->cur = cur;
2258 if (starts_ucn || forms_identifier_p (pfile, false, nst))
2260 /* Slower version for identifiers containing UCNs
2261 or extended chars (including $). */
2262 do {
2263 while (ISIDNUM (*pfile->buffer->cur))
2265 NORMALIZE_STATE_UPDATE_IDNUM (nst, *pfile->buffer->cur);
2266 pfile->buffer->cur++;
2268 } while (forms_identifier_p (pfile, false, nst));
2269 if (warn_bidi_p)
2270 maybe_warn_bidi_on_close (pfile, pfile->buffer->cur);
2271 result = _cpp_interpret_identifier (pfile, base,
2272 pfile->buffer->cur - base);
2273 *spelling = cpp_lookup (pfile, base, pfile->buffer->cur - base);
2275 else
2277 len = cur - base;
2278 hash = HT_HASHFINISH (hash, len);
2280 result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
2281 base, len, hash, HT_ALLOC));
2282 *spelling = result;
2285 return result;
2288 /* Struct to hold the return value of the scan_cur_identifier () helper
2289 function below. */
2291 struct scan_id_result
2293 cpp_hashnode *node;
2294 normalize_state nst;
2296 scan_id_result ()
2297 : node (nullptr)
2299 nst = INITIAL_NORMALIZE_STATE;
2302 explicit operator bool () const { return node; }
2305 /* Helper function to scan an entire identifier beginning at
2306 pfile->buffer->cur, and possibly containing extended characters (UCNs
2307 and/or UTF-8). Returns the cpp_hashnode for the identifier on success, or
2308 else nullptr, as well as a normalize_state so that normalization warnings
2309 may be issued once the token lexing is complete. */
2311 static scan_id_result
2312 scan_cur_identifier (cpp_reader *pfile)
2314 const auto buffer = pfile->buffer;
2315 const auto begin = buffer->cur;
2316 scan_id_result result;
2317 if (ISIDST (*buffer->cur))
2319 ++buffer->cur;
2320 cpp_hashnode *ignore;
2321 result.node = lex_identifier (pfile, begin, false, &result.nst, &ignore);
2323 else if (forms_identifier_p (pfile, true, &result.nst))
2325 /* buffer->cur has been moved already by the call
2326 to forms_identifier_p. */
2327 cpp_hashnode *ignore;
2328 result.node = lex_identifier (pfile, begin, true, &result.nst, &ignore);
2330 return result;
2333 /* Lex a number to NUMBER starting at BUFFER->CUR - 1. */
2334 static void
2335 lex_number (cpp_reader *pfile, cpp_string *number,
2336 struct normalize_state *nst)
2338 const uchar *cur;
2339 const uchar *base;
2340 uchar *dest;
2342 base = pfile->buffer->cur - 1;
2345 const uchar *adj_digit_sep = NULL;
2346 cur = pfile->buffer->cur;
2348 /* N.B. ISIDNUM does not include $. */
2349 while (ISIDNUM (*cur)
2350 || (*cur == '.' && !DIGIT_SEP (cur[-1]))
2351 || DIGIT_SEP (*cur)
2352 || (VALID_SIGN (*cur, cur[-1]) && !DIGIT_SEP (cur[-2])))
2354 NORMALIZE_STATE_UPDATE_IDNUM (nst, *cur);
2355 /* Adjacent digit separators do not form part of the pp-number syntax.
2356 However, they can safely be diagnosed here as an error, since '' is
2357 not a valid preprocessing token. */
2358 if (DIGIT_SEP (*cur) && DIGIT_SEP (cur[-1]) && !adj_digit_sep)
2359 adj_digit_sep = cur;
2360 cur++;
2362 /* A number can't end with a digit separator. */
2363 while (cur > pfile->buffer->cur && DIGIT_SEP (cur[-1]))
2364 --cur;
2365 if (adj_digit_sep && adj_digit_sep < cur)
2366 cpp_error (pfile, CPP_DL_ERROR, "adjacent digit separators");
2368 pfile->buffer->cur = cur;
2370 while (forms_identifier_p (pfile, false, nst));
2372 number->len = cur - base;
2373 dest = _cpp_unaligned_alloc (pfile, number->len + 1);
2374 memcpy (dest, base, number->len);
2375 dest[number->len] = '\0';
2376 number->text = dest;
2379 /* Create a token of type TYPE with a literal spelling. */
2380 static void
2381 create_literal (cpp_reader *pfile, cpp_token *token, const uchar *base,
2382 unsigned int len, enum cpp_ttype type)
2384 token->type = type;
2385 token->val.str.len = len;
2386 token->val.str.text = cpp_alloc_token_string (pfile, base, len);
2389 /* Like create_literal(), but construct it from two separate strings
2390 which are concatenated. LEN2 may be 0 if no second string is
2391 required. */
2392 static void
2393 create_literal2 (cpp_reader *pfile, cpp_token *token, const uchar *base1,
2394 unsigned int len1, const uchar *base2, unsigned int len2,
2395 enum cpp_ttype type)
2397 token->type = type;
2398 token->val.str.len = len1 + len2;
2399 uchar *const dest = _cpp_unaligned_alloc (pfile, len1 + len2 + 1);
2400 memcpy (dest, base1, len1);
2401 if (len2)
2402 memcpy (dest+len1, base2, len2);
2403 dest[len1 + len2] = 0;
2404 token->val.str.text = dest;
2407 const uchar *
2408 cpp_alloc_token_string (cpp_reader *pfile,
2409 const unsigned char *ptr, unsigned len)
2411 uchar *dest = _cpp_unaligned_alloc (pfile, len + 1);
2413 dest[len] = 0;
2414 memcpy (dest, ptr, len);
2415 return dest;
2418 /* A pair of raw buffer pointers. The currently open one is [1], the
2419 first one is [0]. Used for string literal lexing. */
2420 struct lit_accum {
2421 _cpp_buff *first;
2422 _cpp_buff *last;
2423 const uchar *rpos;
2424 size_t accum;
2426 lit_accum ()
2427 : first (NULL), last (NULL), rpos (0), accum (0)
2431 void append (cpp_reader *, const uchar *, size_t);
2433 void read_begin (cpp_reader *);
2434 bool reading_p () const
2436 return rpos != NULL;
2438 char read_char ()
2440 char c = *rpos++;
2441 if (rpos == BUFF_FRONT (last))
2442 rpos = NULL;
2443 return c;
2446 void create_literal2 (cpp_reader *pfile, cpp_token *token,
2447 const uchar *base1, unsigned int len1,
2448 const uchar *base2, unsigned int len2,
2449 enum cpp_ttype type);
2452 /* Subroutine of lex_raw_string: Append LEN chars from BASE to the buffer
2453 sequence from *FIRST_BUFF_P to LAST_BUFF_P. */
2455 void
2456 lit_accum::append (cpp_reader *pfile, const uchar *base, size_t len)
2458 if (!last)
2459 /* Starting. */
2460 first = last = _cpp_get_buff (pfile, len);
2461 else if (len > BUFF_ROOM (last))
2463 /* There is insufficient room in the buffer. Copy what we can,
2464 and then either extend or create a new one. */
2465 size_t room = BUFF_ROOM (last);
2466 memcpy (BUFF_FRONT (last), base, room);
2467 BUFF_FRONT (last) += room;
2468 base += room;
2469 len -= room;
2470 accum += room;
2472 gcc_checking_assert (!rpos);
2474 last = _cpp_append_extend_buff (pfile, last, len);
2477 memcpy (BUFF_FRONT (last), base, len);
2478 BUFF_FRONT (last) += len;
2479 accum += len;
2482 void
2483 lit_accum::read_begin (cpp_reader *pfile)
2485 /* We never accumulate more than 4 chars to read. */
2486 if (BUFF_ROOM (last) < 4)
2488 last = _cpp_append_extend_buff (pfile, last, 4);
2489 rpos = BUFF_FRONT (last);
2492 /* Helper function to check if a string format macro, say from inttypes.h, is
2493 placed touching a string literal, in which case it could be parsed as a C++11
2494 user-defined string literal thus breaking the program. Return TRUE if the
2495 UDL should be ignored for now and preserved for potential macro
2496 expansion. */
2498 static bool
2499 maybe_ignore_udl_macro_suffix (cpp_reader *pfile, location_t src_loc,
2500 const uchar *suffix_begin, cpp_hashnode *node)
2502 /* User-defined literals outside of namespace std must start with a single
2503 underscore, so assume anything of that form really is a UDL suffix.
2504 We don't need to worry about UDLs defined inside namespace std because
2505 their names are reserved, so cannot be used as macro names in valid
2506 programs. */
2507 if ((suffix_begin[0] == '_' && suffix_begin[1] != '_')
2508 || !cpp_macro_p (node))
2509 return false;
2511 /* Maybe raise a warning here; caller should arrange not to consume
2512 the tokens. */
2513 if (CPP_OPTION (pfile, warn_literal_suffix) && !pfile->state.skipping)
2514 cpp_warning_with_line (pfile, CPP_W_LITERAL_SUFFIX, src_loc, 0,
2515 "invalid suffix on literal; C++11 requires a space "
2516 "between literal and string macro");
2517 return true;
2520 /* Like create_literal2(), but also prepend all the accumulated data from
2521 the lit_accum struct. */
2522 void
2523 lit_accum::create_literal2 (cpp_reader *pfile, cpp_token *token,
2524 const uchar *base1, unsigned int len1,
2525 const uchar *base2, unsigned int len2,
2526 enum cpp_ttype type)
2528 const unsigned int tot_len = accum + len1 + len2;
2529 uchar *dest = _cpp_unaligned_alloc (pfile, tot_len + 1);
2530 token->type = type;
2531 token->val.str.len = tot_len;
2532 token->val.str.text = dest;
2533 for (_cpp_buff *buf = first; buf; buf = buf->next)
2535 size_t len = BUFF_FRONT (buf) - buf->base;
2536 memcpy (dest, buf->base, len);
2537 dest += len;
2539 memcpy (dest, base1, len1);
2540 dest += len1;
2541 if (len2)
2542 memcpy (dest, base2, len2);
2543 dest += len2;
2544 *dest = '\0';
2547 /* Lexes a raw string. The stored string contains the spelling,
2548 including double quotes, delimiter string, '(' and ')', any leading
2549 'L', 'u', 'U' or 'u8' and 'R' modifier. The created token contains
2550 the type of the literal, or CPP_OTHER if it was not properly
2551 terminated.
2553 BASE is the start of the token. Updates pfile->buffer->cur to just
2554 after the lexed string.
2556 The spelling is NUL-terminated, but it is not guaranteed that this
2557 is the first NUL since embedded NULs are preserved. */
2559 static void
2560 lex_raw_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
2562 const uchar *pos = base;
2563 const bool warn_bidi_p = pfile->warn_bidi_p ();
2564 const bool warn_invalid_utf8_p = CPP_OPTION (pfile, cpp_warn_invalid_utf8);
2565 const bool warn_bidi_or_invalid_utf8_p = warn_bidi_p | warn_invalid_utf8_p;
2567 /* 'tis a pity this information isn't passed down from the lexer's
2568 initial categorization of the token. */
2569 enum cpp_ttype type = CPP_STRING;
2571 if (*pos == 'L')
2573 type = CPP_WSTRING;
2574 pos++;
2576 else if (*pos == 'U')
2578 type = CPP_STRING32;
2579 pos++;
2581 else if (*pos == 'u')
2583 if (pos[1] == '8')
2585 type = CPP_UTF8STRING;
2586 pos++;
2588 else
2589 type = CPP_STRING16;
2590 pos++;
2593 gcc_checking_assert (pos[0] == 'R' && pos[1] == '"');
2594 pos += 2;
2596 _cpp_line_note *note = &pfile->buffer->notes[pfile->buffer->cur_note];
2598 /* Skip notes before the ". */
2599 while (note->pos < pos)
2600 ++note;
2602 lit_accum accum;
2604 uchar prefix[17];
2605 unsigned prefix_len = 0;
2606 enum Phase
2608 PHASE_PREFIX = -2,
2609 PHASE_NONE = -1,
2610 PHASE_SUFFIX = 0
2611 } phase = PHASE_PREFIX;
2613 for (;;)
2615 gcc_checking_assert (note->pos >= pos);
2617 /* Undo any escaped newlines and trigraphs. */
2618 if (!accum.reading_p () && note->pos == pos)
2619 switch (note->type)
2621 case '\\':
2622 case ' ':
2623 /* Restore backslash followed by newline. */
2624 accum.append (pfile, base, pos - base);
2625 base = pos;
2626 accum.read_begin (pfile);
2627 accum.append (pfile, UC"\\", 1);
2629 after_backslash:
2630 if (note->type == ' ')
2631 /* GNU backslash whitespace newline extension. FIXME
2632 could be any sequence of non-vertical space. When we
2633 can properly restore any such sequence, we should
2634 mark this note as handled so _cpp_process_line_notes
2635 doesn't warn. */
2636 accum.append (pfile, UC" ", 1);
2638 accum.append (pfile, UC"\n", 1);
2639 note++;
2640 break;
2642 case '\n':
2643 /* This can happen for ??/<NEWLINE> when trigraphs are not
2644 being interpretted. */
2645 gcc_checking_assert (!CPP_OPTION (pfile, trigraphs));
2646 note->type = 0;
2647 note++;
2648 break;
2650 default:
2651 gcc_checking_assert (_cpp_trigraph_map[note->type]);
2653 /* Don't warn about this trigraph in
2654 _cpp_process_line_notes, since trigraphs show up as
2655 trigraphs in raw strings. */
2656 uchar type = note->type;
2657 note->type = 0;
2659 if (CPP_OPTION (pfile, trigraphs))
2661 accum.append (pfile, base, pos - base);
2662 base = pos;
2663 accum.read_begin (pfile);
2664 accum.append (pfile, UC"??", 2);
2665 accum.append (pfile, &type, 1);
2667 /* ??/ followed by newline gets two line notes, one for
2668 the trigraph and one for the backslash/newline. */
2669 if (type == '/' && note[1].pos == pos)
2671 note++;
2672 gcc_assert (note->type == '\\' || note->type == ' ');
2673 goto after_backslash;
2675 /* Skip the replacement character. */
2676 base = ++pos;
2679 note++;
2680 break;
2683 /* Now get a char to process. Either from an expanded note, or
2684 from the line buffer. */
2685 bool read_note = accum.reading_p ();
2686 char c = read_note ? accum.read_char () : *pos++;
2688 if (phase == PHASE_PREFIX)
2690 if (c == '(')
2692 /* Done. */
2693 phase = PHASE_NONE;
2694 prefix[prefix_len++] = '"';
2696 else if (prefix_len < 16
2697 /* Prefix chars are any of the basic character set,
2698 [lex.charset] except for '
2699 ()\\\t\v\f\n'. Optimized for a contiguous
2700 alphabet. */
2701 /* Unlike a switch, this collapses down to one or
2702 two shift and bitmask operations on an ASCII
2703 system, with an outlier or two. */
2704 && (('Z' - 'A' == 25
2705 ? ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'))
2706 : ISIDST (c))
2707 || (c >= '0' && c <= '9')
2708 || c == '_' || c == '{' || c == '}'
2709 || c == '[' || c == ']' || c == '#'
2710 || c == '<' || c == '>' || c == '%'
2711 || c == ':' || c == ';' || c == '.' || c == '?'
2712 || c == '*' || c == '+' || c == '-' || c == '/'
2713 || c == '^' || c == '&' || c == '|' || c == '~'
2714 || c == '!' || c == '=' || c == ','
2715 || c == '"' || c == '\''))
2716 prefix[prefix_len++] = c;
2717 else
2719 /* Something is wrong. */
2720 int col = CPP_BUF_COLUMN (pfile->buffer, pos) + read_note;
2721 if (prefix_len == 16)
2722 cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
2723 col, "raw string delimiter longer "
2724 "than 16 characters");
2725 else if (c == '\n')
2726 cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
2727 col, "invalid new-line in raw "
2728 "string delimiter");
2729 else
2730 cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
2731 col, "invalid character '%c' in "
2732 "raw string delimiter", c);
2733 type = CPP_OTHER;
2734 phase = PHASE_NONE;
2735 /* Continue until we get a close quote, that's probably
2736 the best failure mode. */
2737 prefix_len = 0;
2739 if (c != '\n')
2740 continue;
2743 if (phase != PHASE_NONE)
2745 if (prefix[phase] != c)
2746 phase = PHASE_NONE;
2747 else if (unsigned (phase + 1) == prefix_len)
2748 break;
2749 else
2751 phase = Phase (phase + 1);
2752 continue;
2756 if (!prefix_len && c == '"')
2757 /* Failure mode lexing. */
2758 goto out;
2759 else if (prefix_len && c == ')')
2760 phase = PHASE_SUFFIX;
2761 else if (!read_note && c == '\n')
2763 pos--;
2764 pfile->buffer->cur = pos;
2765 if ((pfile->state.in_directive || pfile->state.parsing_args)
2766 && pfile->buffer->next_line >= pfile->buffer->rlimit)
2768 cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, 0,
2769 "unterminated raw string");
2770 type = CPP_OTHER;
2771 goto out;
2774 accum.append (pfile, base, pos - base + 1);
2775 _cpp_process_line_notes (pfile, false);
2777 if (pfile->buffer->next_line < pfile->buffer->rlimit)
2778 CPP_INCREMENT_LINE (pfile, 0);
2779 pfile->buffer->need_line = true;
2781 if (!get_fresh_line_impl<true> (pfile))
2783 /* We ran out of file and failed to get a line. */
2784 location_t src_loc = token->src_loc;
2785 token->type = CPP_EOF;
2786 /* Tell the compiler the line number of the EOF token. */
2787 token->src_loc = pfile->line_table->highest_line;
2788 token->flags = BOL;
2789 if (accum.first)
2790 _cpp_release_buff (pfile, accum.first);
2791 cpp_error_with_line (pfile, CPP_DL_ERROR, src_loc, 0,
2792 "unterminated raw string");
2794 /* Now pop the buffer that get_fresh_line_impl() did not. Popping
2795 is not safe if processing a directive, however this cannot
2796 happen as we already checked above that a line would be
2797 available, and get_fresh_line_impl() can't fail in this
2798 case. */
2799 gcc_assert (!pfile->state.in_directive);
2800 _cpp_pop_buffer (pfile);
2802 return;
2805 pos = base = pfile->buffer->cur;
2806 note = &pfile->buffer->notes[pfile->buffer->cur_note];
2808 else if (__builtin_expect ((unsigned char) c >= utf8_continuation, 0)
2809 && warn_bidi_or_invalid_utf8_p)
2810 pos = _cpp_handle_multibyte_utf8 (pfile, c, pos, warn_bidi_p,
2811 warn_invalid_utf8_p);
2814 if (warn_bidi_p)
2815 maybe_warn_bidi_on_close (pfile, pos);
2817 if (CPP_OPTION (pfile, user_literals))
2819 const uchar *const suffix_begin = pos;
2820 pfile->buffer->cur = pos;
2822 if (const auto sr = scan_cur_identifier (pfile))
2824 if (maybe_ignore_udl_macro_suffix (pfile, token->src_loc,
2825 suffix_begin, sr.node))
2826 pfile->buffer->cur = suffix_begin;
2827 else
2829 type = cpp_userdef_string_add_type (type);
2830 accum.create_literal2 (pfile, token, base, suffix_begin - base,
2831 NODE_NAME (sr.node), NODE_LEN (sr.node),
2832 type);
2833 if (accum.first)
2834 _cpp_release_buff (pfile, accum.first);
2835 warn_about_normalization (pfile, token, &sr.nst, true);
2836 return;
2841 out:
2842 pfile->buffer->cur = pos;
2843 if (!accum.accum)
2844 create_literal (pfile, token, base, pos - base, type);
2845 else
2847 accum.create_literal2 (pfile, token, base, pos - base, nullptr, 0, type);
2848 _cpp_release_buff (pfile, accum.first);
2852 /* Lexes a string, character constant, or angle-bracketed header file
2853 name. The stored string contains the spelling, including opening
2854 quote and any leading 'L', 'u', 'U' or 'u8' and optional
2855 'R' modifier. It returns the type of the literal, or CPP_OTHER
2856 if it was not properly terminated, or CPP_LESS for an unterminated
2857 header name which must be relexed as normal tokens.
2859 The spelling is NUL-terminated, but it is not guaranteed that this
2860 is the first NUL since embedded NULs are preserved. */
2861 static void
2862 lex_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
2864 bool saw_NUL = false;
2865 const uchar *cur;
2866 cppchar_t terminator;
2867 enum cpp_ttype type;
2869 cur = base;
2870 terminator = *cur++;
2871 if (terminator == 'L' || terminator == 'U')
2872 terminator = *cur++;
2873 else if (terminator == 'u')
2875 terminator = *cur++;
2876 if (terminator == '8')
2877 terminator = *cur++;
2879 if (terminator == 'R')
2881 lex_raw_string (pfile, token, base);
2882 return;
2884 if (terminator == '"')
2885 type = (*base == 'L' ? CPP_WSTRING :
2886 *base == 'U' ? CPP_STRING32 :
2887 *base == 'u' ? (base[1] == '8' ? CPP_UTF8STRING : CPP_STRING16)
2888 : CPP_STRING);
2889 else if (terminator == '\'')
2890 type = (*base == 'L' ? CPP_WCHAR :
2891 *base == 'U' ? CPP_CHAR32 :
2892 *base == 'u' ? (base[1] == '8' ? CPP_UTF8CHAR : CPP_CHAR16)
2893 : CPP_CHAR);
2894 else
2895 terminator = '>', type = CPP_HEADER_NAME;
2897 const bool warn_bidi_p = pfile->warn_bidi_p ();
2898 const bool warn_invalid_utf8_p = CPP_OPTION (pfile, cpp_warn_invalid_utf8);
2899 const bool warn_bidi_or_invalid_utf8_p = warn_bidi_p | warn_invalid_utf8_p;
2900 for (;;)
2902 cppchar_t c = *cur++;
2904 /* In #include-style directives, terminators are not escapable. */
2905 if (c == '\\' && !pfile->state.angled_headers && *cur != '\n')
2907 if ((cur[0] == 'u' || cur[0] == 'U' || cur[0] == 'N') && warn_bidi_p)
2909 location_t loc;
2910 bidi::kind kind;
2911 if (cur[0] == 'N')
2912 kind = get_bidi_named (pfile, cur + 1, &loc);
2913 else
2914 kind = get_bidi_ucn (pfile, cur + 1, cur[0] == 'U', &loc);
2915 maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/true, loc);
2917 cur++;
2919 else if (c == terminator)
2921 if (warn_bidi_p)
2922 maybe_warn_bidi_on_close (pfile, cur - 1);
2923 break;
2925 else if (c == '\n')
2927 cur--;
2928 /* Unmatched quotes always yield undefined behavior, but
2929 greedy lexing means that what appears to be an unterminated
2930 header name may actually be a legitimate sequence of tokens. */
2931 if (terminator == '>')
2933 token->type = CPP_LESS;
2934 return;
2936 type = CPP_OTHER;
2937 break;
2939 else if (c == '\0')
2940 saw_NUL = true;
2941 else if (__builtin_expect (c >= utf8_continuation, 0)
2942 && warn_bidi_or_invalid_utf8_p)
2943 cur = _cpp_handle_multibyte_utf8 (pfile, c, cur, warn_bidi_p,
2944 warn_invalid_utf8_p);
2947 if (saw_NUL && !pfile->state.skipping)
2948 cpp_error (pfile, CPP_DL_WARNING,
2949 "null character(s) preserved in literal");
2951 if (type == CPP_OTHER && CPP_OPTION (pfile, lang) != CLK_ASM)
2952 cpp_error (pfile, CPP_DL_PEDWARN, "missing terminating %c character",
2953 (int) terminator);
2955 pfile->buffer->cur = cur;
2956 const uchar *const suffix_begin = cur;
2958 if (CPP_OPTION (pfile, user_literals))
2960 if (const auto sr = scan_cur_identifier (pfile))
2962 if (maybe_ignore_udl_macro_suffix (pfile, token->src_loc,
2963 suffix_begin, sr.node))
2964 pfile->buffer->cur = suffix_begin;
2965 else
2967 /* Grab user defined literal suffix. */
2968 type = cpp_userdef_char_add_type (type);
2969 type = cpp_userdef_string_add_type (type);
2970 create_literal2 (pfile, token, base, suffix_begin - base,
2971 NODE_NAME (sr.node), NODE_LEN (sr.node), type);
2972 warn_about_normalization (pfile, token, &sr.nst, true);
2973 return;
2977 else if (CPP_OPTION (pfile, cpp_warn_cxx11_compat)
2978 && !pfile->state.skipping)
2980 const auto sr = scan_cur_identifier (pfile);
2981 /* Maybe raise a warning, but do not consume the tokens. */
2982 pfile->buffer->cur = suffix_begin;
2983 if (sr && cpp_macro_p (sr.node))
2984 cpp_warning_with_line (pfile, CPP_W_CXX11_COMPAT,
2985 token->src_loc, 0, "C++11 requires a space "
2986 "between string literal and macro");
2989 create_literal (pfile, token, base, cur - base, type);
2992 /* Return the comment table. The client may not make any assumption
2993 about the ordering of the table. */
2994 cpp_comment_table *
2995 cpp_get_comments (cpp_reader *pfile)
2997 return &pfile->comments;
3000 /* Append a comment to the end of the comment table. */
3001 static void
3002 store_comment (cpp_reader *pfile, cpp_token *token)
3004 int len;
3006 if (pfile->comments.allocated == 0)
3008 pfile->comments.allocated = 256;
3009 pfile->comments.entries = (cpp_comment *) xmalloc
3010 (pfile->comments.allocated * sizeof (cpp_comment));
3013 if (pfile->comments.count == pfile->comments.allocated)
3015 pfile->comments.allocated *= 2;
3016 pfile->comments.entries = (cpp_comment *) xrealloc
3017 (pfile->comments.entries,
3018 pfile->comments.allocated * sizeof (cpp_comment));
3021 len = token->val.str.len;
3023 /* Copy comment. Note, token may not be NULL terminated. */
3024 pfile->comments.entries[pfile->comments.count].comment =
3025 (char *) xmalloc (sizeof (char) * (len + 1));
3026 memcpy (pfile->comments.entries[pfile->comments.count].comment,
3027 token->val.str.text, len);
3028 pfile->comments.entries[pfile->comments.count].comment[len] = '\0';
3030 /* Set source location. */
3031 pfile->comments.entries[pfile->comments.count].sloc = token->src_loc;
3033 /* Increment the count of entries in the comment table. */
3034 pfile->comments.count++;
3037 /* The stored comment includes the comment start and any terminator. */
3038 static void
3039 save_comment (cpp_reader *pfile, cpp_token *token, const unsigned char *from,
3040 cppchar_t type)
3042 unsigned char *buffer;
3043 unsigned int len, clen, i;
3045 len = pfile->buffer->cur - from + 1; /* + 1 for the initial '/'. */
3047 /* C++ comments probably (not definitely) have moved past a new
3048 line, which we don't want to save in the comment. */
3049 if (is_vspace (pfile->buffer->cur[-1]))
3050 len--;
3052 /* If we are currently in a directive or in argument parsing, then
3053 we need to store all C++ comments as C comments internally, and
3054 so we need to allocate a little extra space in that case.
3056 Note that the only time we encounter a directive here is
3057 when we are saving comments in a "#define". */
3058 clen = ((pfile->state.in_directive || pfile->state.parsing_args)
3059 && type == '/') ? len + 2 : len;
3061 buffer = _cpp_unaligned_alloc (pfile, clen);
3063 token->type = CPP_COMMENT;
3064 token->val.str.len = clen;
3065 token->val.str.text = buffer;
3067 buffer[0] = '/';
3068 memcpy (buffer + 1, from, len - 1);
3070 /* Finish conversion to a C comment, if necessary. */
3071 if ((pfile->state.in_directive || pfile->state.parsing_args) && type == '/')
3073 buffer[1] = '*';
3074 buffer[clen - 2] = '*';
3075 buffer[clen - 1] = '/';
3076 /* As there can be in a C++ comments illegal sequences for C comments
3077 we need to filter them out. */
3078 for (i = 2; i < (clen - 2); i++)
3079 if (buffer[i] == '/' && (buffer[i - 1] == '*' || buffer[i + 1] == '*'))
3080 buffer[i] = '|';
3083 /* Finally store this comment for use by clients of libcpp. */
3084 store_comment (pfile, token);
3087 /* Returns true if comment at COMMENT_START is a recognized FALLTHROUGH
3088 comment. */
3090 static bool
3091 fallthrough_comment_p (cpp_reader *pfile, const unsigned char *comment_start)
3093 const unsigned char *from = comment_start + 1;
3095 switch (CPP_OPTION (pfile, cpp_warn_implicit_fallthrough))
3097 /* For both -Wimplicit-fallthrough=0 and -Wimplicit-fallthrough=5 we
3098 don't recognize any comments. The latter only checks attributes,
3099 the former doesn't warn. */
3100 case 0:
3101 default:
3102 return false;
3103 /* -Wimplicit-fallthrough=1 considers any comment, no matter what
3104 content it has. */
3105 case 1:
3106 return true;
3107 case 2:
3108 /* -Wimplicit-fallthrough=2 looks for (case insensitive)
3109 .*falls?[ \t-]*thr(u|ough).* regex. */
3110 for (; (size_t) (pfile->buffer->cur - from) >= sizeof "fallthru" - 1;
3111 from++)
3113 /* Is there anything like strpbrk with upper boundary, or
3114 memchr looking for 2 characters rather than just one? */
3115 if (from[0] != 'f' && from[0] != 'F')
3116 continue;
3117 if (from[1] != 'a' && from[1] != 'A')
3118 continue;
3119 if (from[2] != 'l' && from[2] != 'L')
3120 continue;
3121 if (from[3] != 'l' && from[3] != 'L')
3122 continue;
3123 from += sizeof "fall" - 1;
3124 if (from[0] == 's' || from[0] == 'S')
3125 from++;
3126 while (*from == ' ' || *from == '\t' || *from == '-')
3127 from++;
3128 if (from[0] != 't' && from[0] != 'T')
3129 continue;
3130 if (from[1] != 'h' && from[1] != 'H')
3131 continue;
3132 if (from[2] != 'r' && from[2] != 'R')
3133 continue;
3134 if (from[3] == 'u' || from[3] == 'U')
3135 return true;
3136 if (from[3] != 'o' && from[3] != 'O')
3137 continue;
3138 if (from[4] != 'u' && from[4] != 'U')
3139 continue;
3140 if (from[5] != 'g' && from[5] != 'G')
3141 continue;
3142 if (from[6] != 'h' && from[6] != 'H')
3143 continue;
3144 return true;
3146 return false;
3147 case 3:
3148 case 4:
3149 break;
3152 /* Whole comment contents:
3153 -fallthrough
3154 @fallthrough@
3156 if (*from == '-' || *from == '@')
3158 size_t len = sizeof "fallthrough" - 1;
3159 if ((size_t) (pfile->buffer->cur - from - 1) < len)
3160 return false;
3161 if (memcmp (from + 1, "fallthrough", len))
3162 return false;
3163 if (*from == '@')
3165 if (from[len + 1] != '@')
3166 return false;
3167 len++;
3169 from += 1 + len;
3171 /* Whole comment contents (regex):
3172 lint -fallthrough[ \t]*
3174 else if (*from == 'l')
3176 size_t len = sizeof "int -fallthrough" - 1;
3177 if ((size_t) (pfile->buffer->cur - from - 1) < len)
3178 return false;
3179 if (memcmp (from + 1, "int -fallthrough", len))
3180 return false;
3181 from += 1 + len;
3182 while (*from == ' ' || *from == '\t')
3183 from++;
3185 /* Whole comment contents (regex):
3186 [ \t]*FALLTHR(U|OUGH)[ \t]*
3188 else if (CPP_OPTION (pfile, cpp_warn_implicit_fallthrough) == 4)
3190 while (*from == ' ' || *from == '\t')
3191 from++;
3192 if ((size_t) (pfile->buffer->cur - from) < sizeof "FALLTHRU" - 1)
3193 return false;
3194 if (memcmp (from, "FALLTHR", sizeof "FALLTHR" - 1))
3195 return false;
3196 from += sizeof "FALLTHR" - 1;
3197 if (*from == 'U')
3198 from++;
3199 else if ((size_t) (pfile->buffer->cur - from) < sizeof "OUGH" - 1)
3200 return false;
3201 else if (memcmp (from, "OUGH", sizeof "OUGH" - 1))
3202 return false;
3203 else
3204 from += sizeof "OUGH" - 1;
3205 while (*from == ' ' || *from == '\t')
3206 from++;
3208 /* Whole comment contents (regex):
3209 [ \t.!]*(ELSE,? |INTENTIONAL(LY)? )?FALL(S | |-)?THR(OUGH|U)[ \t.!]*(-[^\n\r]*)?
3210 [ \t.!]*(Else,? |Intentional(ly)? )?Fall((s | |-)[Tt]|t)hr(ough|u)[ \t.!]*(-[^\n\r]*)?
3211 [ \t.!]*([Ee]lse,? |[Ii]ntentional(ly)? )?fall(s | |-)?thr(ough|u)[ \t.!]*(-[^\n\r]*)?
3213 else
3215 while (*from == ' ' || *from == '\t' || *from == '.' || *from == '!')
3216 from++;
3217 unsigned char f = *from;
3218 bool all_upper = false;
3219 if (f == 'E' || f == 'e')
3221 if ((size_t) (pfile->buffer->cur - from)
3222 < sizeof "else fallthru" - 1)
3223 return false;
3224 if (f == 'E' && memcmp (from + 1, "LSE", sizeof "LSE" - 1) == 0)
3225 all_upper = true;
3226 else if (memcmp (from + 1, "lse", sizeof "lse" - 1))
3227 return false;
3228 from += sizeof "else" - 1;
3229 if (*from == ',')
3230 from++;
3231 if (*from != ' ')
3232 return false;
3233 from++;
3234 if (all_upper && *from == 'f')
3235 return false;
3236 if (f == 'e' && *from == 'F')
3237 return false;
3238 f = *from;
3240 else if (f == 'I' || f == 'i')
3242 if ((size_t) (pfile->buffer->cur - from)
3243 < sizeof "intentional fallthru" - 1)
3244 return false;
3245 if (f == 'I' && memcmp (from + 1, "NTENTIONAL",
3246 sizeof "NTENTIONAL" - 1) == 0)
3247 all_upper = true;
3248 else if (memcmp (from + 1, "ntentional",
3249 sizeof "ntentional" - 1))
3250 return false;
3251 from += sizeof "intentional" - 1;
3252 if (*from == ' ')
3254 from++;
3255 if (all_upper && *from == 'f')
3256 return false;
3258 else if (all_upper)
3260 if (memcmp (from, "LY F", sizeof "LY F" - 1))
3261 return false;
3262 from += sizeof "LY " - 1;
3264 else
3266 if (memcmp (from, "ly ", sizeof "ly " - 1))
3267 return false;
3268 from += sizeof "ly " - 1;
3270 if (f == 'i' && *from == 'F')
3271 return false;
3272 f = *from;
3274 if (f != 'F' && f != 'f')
3275 return false;
3276 if ((size_t) (pfile->buffer->cur - from) < sizeof "fallthru" - 1)
3277 return false;
3278 if (f == 'F' && memcmp (from + 1, "ALL", sizeof "ALL" - 1) == 0)
3279 all_upper = true;
3280 else if (all_upper)
3281 return false;
3282 else if (memcmp (from + 1, "all", sizeof "all" - 1))
3283 return false;
3284 from += sizeof "fall" - 1;
3285 if (*from == (all_upper ? 'S' : 's') && from[1] == ' ')
3286 from += 2;
3287 else if (*from == ' ' || *from == '-')
3288 from++;
3289 else if (*from != (all_upper ? 'T' : 't'))
3290 return false;
3291 if ((f == 'f' || *from != 'T') && (all_upper || *from != 't'))
3292 return false;
3293 if ((size_t) (pfile->buffer->cur - from) < sizeof "thru" - 1)
3294 return false;
3295 if (memcmp (from + 1, all_upper ? "HRU" : "hru", sizeof "hru" - 1))
3297 if ((size_t) (pfile->buffer->cur - from) < sizeof "through" - 1)
3298 return false;
3299 if (memcmp (from + 1, all_upper ? "HROUGH" : "hrough",
3300 sizeof "hrough" - 1))
3301 return false;
3302 from += sizeof "through" - 1;
3304 else
3305 from += sizeof "thru" - 1;
3306 while (*from == ' ' || *from == '\t' || *from == '.' || *from == '!')
3307 from++;
3308 if (*from == '-')
3310 from++;
3311 if (*comment_start == '*')
3315 while (*from && *from != '*'
3316 && *from != '\n' && *from != '\r')
3317 from++;
3318 if (*from != '*' || from[1] == '/')
3319 break;
3320 from++;
3322 while (1);
3324 else
3325 while (*from && *from != '\n' && *from != '\r')
3326 from++;
3329 /* C block comment. */
3330 if (*comment_start == '*')
3332 if (*from != '*' || from[1] != '/')
3333 return false;
3335 /* C++ line comment. */
3336 else if (*from != '\n')
3337 return false;
3339 return true;
3342 /* Allocate COUNT tokens for RUN. */
3343 void
3344 _cpp_init_tokenrun (tokenrun *run, unsigned int count)
3346 run->base = XNEWVEC (cpp_token, count);
3347 run->limit = run->base + count;
3348 run->next = NULL;
3351 /* Returns the next tokenrun, or creates one if there is none. */
3352 static tokenrun *
3353 next_tokenrun (tokenrun *run)
3355 if (run->next == NULL)
3357 run->next = XNEW (tokenrun);
3358 run->next->prev = run;
3359 _cpp_init_tokenrun (run->next, 250);
3362 return run->next;
3365 /* Return the number of not yet processed token in a given
3366 context. */
3368 _cpp_remaining_tokens_num_in_context (cpp_context *context)
3370 if (context->tokens_kind == TOKENS_KIND_DIRECT)
3371 return (LAST (context).token - FIRST (context).token);
3372 else if (context->tokens_kind == TOKENS_KIND_INDIRECT
3373 || context->tokens_kind == TOKENS_KIND_EXTENDED)
3374 return (LAST (context).ptoken - FIRST (context).ptoken);
3375 else
3376 abort ();
3379 /* Returns the token present at index INDEX in a given context. If
3380 INDEX is zero, the next token to be processed is returned. */
3381 static const cpp_token*
3382 _cpp_token_from_context_at (cpp_context *context, int index)
3384 if (context->tokens_kind == TOKENS_KIND_DIRECT)
3385 return &(FIRST (context).token[index]);
3386 else if (context->tokens_kind == TOKENS_KIND_INDIRECT
3387 || context->tokens_kind == TOKENS_KIND_EXTENDED)
3388 return FIRST (context).ptoken[index];
3389 else
3390 abort ();
3393 /* Look ahead in the input stream. */
3394 const cpp_token *
3395 cpp_peek_token (cpp_reader *pfile, int index)
3397 cpp_context *context = pfile->context;
3398 const cpp_token *peektok;
3399 int count;
3401 /* First, scan through any pending cpp_context objects. */
3402 while (context->prev)
3404 ptrdiff_t sz = _cpp_remaining_tokens_num_in_context (context);
3406 if (index < (int) sz)
3407 return _cpp_token_from_context_at (context, index);
3408 index -= (int) sz;
3409 context = context->prev;
3412 /* We will have to read some new tokens after all (and do so
3413 without invalidating preceding tokens). */
3414 count = index;
3415 pfile->keep_tokens++;
3417 /* For peeked tokens temporarily disable line_change reporting,
3418 until the tokens are parsed for real. */
3419 void (*line_change) (cpp_reader *, const cpp_token *, int)
3420 = pfile->cb.line_change;
3421 pfile->cb.line_change = NULL;
3425 peektok = _cpp_lex_token (pfile);
3426 if (peektok->type == CPP_EOF)
3428 index--;
3429 break;
3431 else if (peektok->type == CPP_PRAGMA)
3433 /* Don't peek past a pragma. */
3434 if (peektok == &pfile->directive_result)
3435 /* Save the pragma in the buffer. */
3436 *pfile->cur_token++ = *peektok;
3437 index--;
3438 break;
3441 while (index--);
3443 _cpp_backup_tokens_direct (pfile, count - index);
3444 pfile->keep_tokens--;
3445 pfile->cb.line_change = line_change;
3447 return peektok;
3450 /* Allocate a single token that is invalidated at the same time as the
3451 rest of the tokens on the line. Has its line and col set to the
3452 same as the last lexed token, so that diagnostics appear in the
3453 right place. */
3454 cpp_token *
3455 _cpp_temp_token (cpp_reader *pfile)
3457 cpp_token *old, *result;
3458 ptrdiff_t sz = pfile->cur_run->limit - pfile->cur_token;
3459 ptrdiff_t la = (ptrdiff_t) pfile->lookaheads;
3461 old = pfile->cur_token - 1;
3462 /* Any pre-existing lookaheads must not be clobbered. */
3463 if (la)
3465 if (sz <= la)
3467 tokenrun *next = next_tokenrun (pfile->cur_run);
3469 if (sz < la)
3470 memmove (next->base + 1, next->base,
3471 (la - sz) * sizeof (cpp_token));
3473 next->base[0] = pfile->cur_run->limit[-1];
3476 if (sz > 1)
3477 memmove (pfile->cur_token + 1, pfile->cur_token,
3478 MIN (la, sz - 1) * sizeof (cpp_token));
3481 if (!sz && pfile->cur_token == pfile->cur_run->limit)
3483 pfile->cur_run = next_tokenrun (pfile->cur_run);
3484 pfile->cur_token = pfile->cur_run->base;
3487 result = pfile->cur_token++;
3488 result->src_loc = old->src_loc;
3489 return result;
3492 /* We're at the beginning of a logical line (so not in
3493 directives-mode) and RESULT is a CPP_NAME with NODE_MODULE set. See
3494 if we should enter deferred_pragma mode to tokenize the rest of the
3495 line as a module control-line. */
3497 static void
3498 cpp_maybe_module_directive (cpp_reader *pfile, cpp_token *result)
3500 unsigned backup = 0; /* Tokens we peeked. */
3501 cpp_hashnode *node = result->val.node.node;
3502 cpp_token *peek = result;
3503 cpp_token *keyword = peek;
3504 cpp_hashnode *(&n_modules)[spec_nodes::M_HWM][2] = pfile->spec_nodes.n_modules;
3505 int header_count = 0;
3507 /* Make sure the incoming state is as we expect it. This way we
3508 can restore it using constants. */
3509 gcc_checking_assert (!pfile->state.in_deferred_pragma
3510 && !pfile->state.skipping
3511 && !pfile->state.parsing_args
3512 && !pfile->state.angled_headers
3513 && (pfile->state.save_comments
3514 == !CPP_OPTION (pfile, discard_comments)));
3516 /* Enter directives mode sufficiently for peeking. We don't have
3517 to actually set in_directive. */
3518 pfile->state.in_deferred_pragma = true;
3520 /* These two fields are needed to process tokenization in deferred
3521 pragma mode. They are not used outside deferred pragma mode or
3522 directives mode. */
3523 pfile->state.pragma_allow_expansion = true;
3524 pfile->directive_line = result->src_loc;
3526 /* Saving comments is incompatible with directives mode. */
3527 pfile->state.save_comments = 0;
3529 if (node == n_modules[spec_nodes::M_EXPORT][0])
3531 peek = _cpp_lex_direct (pfile);
3532 keyword = peek;
3533 backup++;
3534 if (keyword->type != CPP_NAME)
3535 goto not_module;
3536 node = keyword->val.node.node;
3537 if (!(node->flags & NODE_MODULE))
3538 goto not_module;
3541 if (node == n_modules[spec_nodes::M__IMPORT][0])
3542 /* __import */
3543 header_count = backup + 2 + 16;
3544 else if (node == n_modules[spec_nodes::M_IMPORT][0])
3545 /* import */
3546 header_count = backup + 2 + (CPP_OPTION (pfile, preprocessed) ? 16 : 0);
3547 else if (node == n_modules[spec_nodes::M_MODULE][0])
3548 ; /* module */
3549 else
3550 goto not_module;
3552 /* We've seen [export] {module|import|__import}. Check the next token. */
3553 if (header_count)
3554 /* After '{,__}import' a header name may appear. */
3555 pfile->state.angled_headers = true;
3556 peek = _cpp_lex_direct (pfile);
3557 backup++;
3559 /* ... import followed by identifier, ':', '<' or
3560 header-name preprocessing tokens, or module
3561 followed by cpp-identifier, ':' or ';' preprocessing
3562 tokens. C++ keywords are not yet relevant. */
3563 if (peek->type == CPP_NAME
3564 || peek->type == CPP_COLON
3565 || (header_count
3566 ? (peek->type == CPP_LESS
3567 || (peek->type == CPP_STRING && peek->val.str.text[0] != 'R')
3568 || peek->type == CPP_HEADER_NAME)
3569 : peek->type == CPP_SEMICOLON))
3571 pfile->state.pragma_allow_expansion = !CPP_OPTION (pfile, preprocessed);
3572 if (!pfile->state.pragma_allow_expansion)
3573 pfile->state.prevent_expansion++;
3575 if (!header_count && linemap_included_from
3576 (LINEMAPS_LAST_ORDINARY_MAP (pfile->line_table)))
3577 cpp_error_with_line (pfile, CPP_DL_ERROR, keyword->src_loc, 0,
3578 "module control-line cannot be in included file");
3580 /* The first one or two tokens cannot be macro names. */
3581 for (int ix = backup; ix--;)
3583 cpp_token *tok = ix ? keyword : result;
3584 cpp_hashnode *node = tok->val.node.node;
3586 /* Don't attempt to expand the token. */
3587 tok->flags |= NO_EXPAND;
3588 if (_cpp_defined_macro_p (node)
3589 && _cpp_maybe_notify_macro_use (pfile, node, tok->src_loc)
3590 && !cpp_fun_like_macro_p (node))
3591 cpp_error_with_line (pfile, CPP_DL_ERROR, tok->src_loc, 0,
3592 "module control-line \"%s\" cannot be"
3593 " an object-like macro",
3594 NODE_NAME (node));
3597 /* Map to underbar variants. */
3598 keyword->val.node.node = n_modules[header_count
3599 ? spec_nodes::M_IMPORT
3600 : spec_nodes::M_MODULE][1];
3601 if (backup != 1)
3602 result->val.node.node = n_modules[spec_nodes::M_EXPORT][1];
3604 /* Maybe tell the tokenizer we expect a header-name down the
3605 road. */
3606 pfile->state.directive_file_token = header_count;
3608 else
3610 not_module:
3611 /* Drop out of directive mode. */
3612 /* We aaserted save_comments had this value upon entry. */
3613 pfile->state.save_comments
3614 = !CPP_OPTION (pfile, discard_comments);
3615 pfile->state.in_deferred_pragma = false;
3616 /* Do not let this remain on. */
3617 pfile->state.angled_headers = false;
3620 /* In either case we want to backup the peeked tokens. */
3621 if (backup)
3623 /* If we saw EOL, we should drop it, because this isn't a module
3624 control-line after all. */
3625 bool eol = peek->type == CPP_PRAGMA_EOL;
3626 if (!eol || backup > 1)
3628 /* Put put the peeked tokens back */
3629 _cpp_backup_tokens_direct (pfile, backup);
3630 /* But if the last one was an EOL, forget it. */
3631 if (eol)
3632 pfile->lookaheads--;
3637 /* Lex a token into RESULT (external interface). Takes care of issues
3638 like directive handling, token lookahead, multiple include
3639 optimization and skipping. */
3640 const cpp_token *
3641 _cpp_lex_token (cpp_reader *pfile)
3643 cpp_token *result;
3645 for (;;)
3647 if (pfile->cur_token == pfile->cur_run->limit)
3649 pfile->cur_run = next_tokenrun (pfile->cur_run);
3650 pfile->cur_token = pfile->cur_run->base;
3652 /* We assume that the current token is somewhere in the current
3653 run. */
3654 if (pfile->cur_token < pfile->cur_run->base
3655 || pfile->cur_token >= pfile->cur_run->limit)
3656 abort ();
3658 if (pfile->lookaheads)
3660 pfile->lookaheads--;
3661 result = pfile->cur_token++;
3663 else
3664 result = _cpp_lex_direct (pfile);
3666 if (result->flags & BOL)
3668 /* Is this a directive. If _cpp_handle_directive returns
3669 false, it is an assembler #. */
3670 if (result->type == CPP_HASH
3671 /* 6.10.3 p 11: Directives in a list of macro arguments
3672 gives undefined behavior. This implementation
3673 handles the directive as normal. */
3674 && pfile->state.parsing_args != 1)
3676 if (_cpp_handle_directive (pfile, result->flags & PREV_WHITE))
3678 if (pfile->directive_result.type == CPP_PADDING)
3679 continue;
3680 result = &pfile->directive_result;
3683 else if (pfile->state.in_deferred_pragma)
3684 result = &pfile->directive_result;
3685 else if (result->type == CPP_NAME
3686 && (result->val.node.node->flags & NODE_MODULE)
3687 && !pfile->state.skipping
3688 /* Unlike regular directives, we do not deal with
3689 tokenizing module directives as macro arguments.
3690 That's not permitted. */
3691 && !pfile->state.parsing_args)
3693 /* P1857. Before macro expansion, At start of logical
3694 line ... */
3695 /* We don't have to consider lookaheads at this point. */
3696 gcc_checking_assert (!pfile->lookaheads);
3698 cpp_maybe_module_directive (pfile, result);
3701 if (pfile->cb.line_change && !pfile->state.skipping)
3702 pfile->cb.line_change (pfile, result, pfile->state.parsing_args);
3705 /* We don't skip tokens in directives. */
3706 if (pfile->state.in_directive || pfile->state.in_deferred_pragma)
3707 break;
3709 /* Outside a directive, invalidate controlling macros. At file
3710 EOF, _cpp_lex_direct takes care of popping the buffer, so we never
3711 get here and MI optimization works. */
3712 pfile->mi_valid = false;
3714 if (!pfile->state.skipping || result->type == CPP_EOF)
3715 break;
3718 return result;
3721 /* Returns true if a fresh line has been loaded. */
3722 template <bool lexing_raw_string>
3723 static bool
3724 get_fresh_line_impl (cpp_reader *pfile)
3726 /* We can't get a new line until we leave the current directive, unless we
3727 are lexing a raw string, in which case it will be OK as long as we don't
3728 pop the current buffer. */
3729 if (!lexing_raw_string && pfile->state.in_directive)
3730 return false;
3732 for (;;)
3734 cpp_buffer *buffer = pfile->buffer;
3736 if (!buffer->need_line)
3737 return true;
3739 if (buffer->next_line < buffer->rlimit)
3741 _cpp_clean_line (pfile);
3742 return true;
3745 /* We can't change buffers until we leave the current directive. */
3746 if (lexing_raw_string && pfile->state.in_directive)
3747 return false;
3749 /* First, get out of parsing arguments state. */
3750 if (pfile->state.parsing_args)
3751 return false;
3753 /* End of buffer. Non-empty files should end in a newline. */
3754 if (buffer->buf != buffer->rlimit
3755 && buffer->next_line > buffer->rlimit
3756 && !buffer->from_stage3)
3758 /* Clip to buffer size. */
3759 buffer->next_line = buffer->rlimit;
3762 if (buffer->prev && !buffer->return_at_eof)
3763 _cpp_pop_buffer (pfile);
3764 else
3766 /* End of translation. Do not pop the buffer yet. Increment
3767 line number so that the EOF token is on a line of its own
3768 (_cpp_lex_direct doesn't increment in that case, because
3769 it's hard for it to distinguish this special case). */
3770 CPP_INCREMENT_LINE (pfile, 0);
3771 return false;
3776 bool
3777 _cpp_get_fresh_line (cpp_reader *pfile)
3779 return get_fresh_line_impl<false> (pfile);
/* Set result->type to THEN_TYPE and step past the next character of
   buffer if that character is CHAR; otherwise set result->type to
   ELSE_TYPE and leave buffer->cur alone.  Expects variables named
   `buffer' and `result' to be in scope at the point of use.  */
#define IF_NEXT_IS(CHAR, THEN_TYPE, ELSE_TYPE)          \
  do                                                    \
    {                                                   \
      result->type = (ELSE_TYPE);                       \
      if (*buffer->cur == (CHAR))                       \
        buffer->cur++, result->type = (THEN_TYPE);      \
    }                                                   \
  while (0)
3792 /* Lex a token into pfile->cur_token, which is also incremented, to
3793 get diagnostics pointing to the correct location.
3795 Does not handle issues such as token lookahead, multiple-include
3796 optimization, directives, skipping etc. This function is only
3797 suitable for use by _cpp_lex_token, and in special cases like
3798 lex_expansion_token which doesn't care for any of these issues.
3800 When meeting a newline, returns CPP_EOF if parsing a directive,
3801 otherwise returns to the start of the token buffer if permissible.
3802 Returns the location of the lexed token. */
3803 cpp_token *
3804 _cpp_lex_direct (cpp_reader *pfile)
3806 cppchar_t c;
3807 cpp_buffer *buffer;
3808 const unsigned char *comment_start;
3809 bool fallthrough_comment = false;
3810 cpp_token *result = pfile->cur_token++;
3812 fresh_line:
3813 result->flags = 0;
3814 buffer = pfile->buffer;
3815 if (buffer->need_line)
3817 if (pfile->state.in_deferred_pragma)
3819 /* This can happen in cases like:
3820 #define loop(x) whatever
3821 #pragma omp loop
3822 where when trying to expand loop we need to peek
3823 next token after loop, but aren't still in_deferred_pragma
3824 mode but are in in_directive mode, so buffer->need_line
3825 is set, a CPP_EOF is peeked. */
3826 result->type = CPP_PRAGMA_EOL;
3827 pfile->state.in_deferred_pragma = false;
3828 if (!pfile->state.pragma_allow_expansion)
3829 pfile->state.prevent_expansion--;
3830 return result;
3832 if (!_cpp_get_fresh_line (pfile))
3834 result->type = CPP_EOF;
3835 /* Not a real EOF in a directive or arg parsing -- we refuse
3836 to advance to the next file now, and will once we're out
3837 of those modes. */
3838 if (!pfile->state.in_directive && !pfile->state.parsing_args)
3840 /* Tell the compiler the line number of the EOF token. */
3841 result->src_loc = pfile->line_table->highest_line;
3842 result->flags = BOL;
3843 /* Now pop the buffer that _cpp_get_fresh_line did not. */
3844 _cpp_pop_buffer (pfile);
3846 return result;
3848 if (buffer != pfile->buffer)
3849 fallthrough_comment = false;
3850 if (!pfile->keep_tokens)
3852 pfile->cur_run = &pfile->base_run;
3853 result = pfile->base_run.base;
3854 pfile->cur_token = result + 1;
3856 result->flags = BOL;
3857 if (pfile->state.parsing_args == 2)
3858 result->flags |= PREV_WHITE;
3860 buffer = pfile->buffer;
3861 update_tokens_line:
3862 result->src_loc = pfile->line_table->highest_line;
3864 skipped_white:
3865 if (buffer->cur >= buffer->notes[buffer->cur_note].pos
3866 && !pfile->overlaid_buffer)
3868 _cpp_process_line_notes (pfile, false);
3869 result->src_loc = pfile->line_table->highest_line;
3871 c = *buffer->cur++;
3873 if (pfile->forced_token_location)
3874 result->src_loc = pfile->forced_token_location;
3875 else
3876 result->src_loc = linemap_position_for_column (pfile->line_table,
3877 CPP_BUF_COLUMN (buffer, buffer->cur));
3879 switch (c)
3881 case ' ': case '\t': case '\f': case '\v': case '\0':
3882 result->flags |= PREV_WHITE;
3883 skip_whitespace (pfile, c);
3884 goto skipped_white;
3886 case '\n':
3887 /* Increment the line, unless this is the last line ... */
3888 if (buffer->cur < buffer->rlimit
3889 /* ... or this is a #include, (where _cpp_stack_file needs to
3890 unwind by one line) ... */
3891 || (pfile->state.in_directive > 1
3892 /* ... except traditional-cpp increments this elsewhere. */
3893 && !CPP_OPTION (pfile, traditional)))
3894 CPP_INCREMENT_LINE (pfile, 0);
3895 buffer->need_line = true;
3896 if (pfile->state.in_deferred_pragma)
3898 /* Produce the PRAGMA_EOL on this line. File reading
3899 ensures there is always a \n at end of the buffer, thus
3900 in a deferred pragma we always see CPP_PRAGMA_EOL before
3901 any CPP_EOF. */
3902 result->type = CPP_PRAGMA_EOL;
3903 result->flags &= ~PREV_WHITE;
3904 pfile->state.in_deferred_pragma = false;
3905 if (!pfile->state.pragma_allow_expansion)
3906 pfile->state.prevent_expansion--;
3907 return result;
3909 goto fresh_line;
3911 case '0': case '1': case '2': case '3': case '4':
3912 case '5': case '6': case '7': case '8': case '9':
3914 struct normalize_state nst = INITIAL_NORMALIZE_STATE;
3915 result->type = CPP_NUMBER;
3916 lex_number (pfile, &result->val.str, &nst);
3917 warn_about_normalization (pfile, result, &nst, false);
3918 break;
3921 case 'L':
3922 case 'u':
3923 case 'U':
3924 case 'R':
3925 /* 'L', 'u', 'U', 'u8' or 'R' may introduce wide characters,
3926 wide strings or raw strings. */
3927 if (c == 'L' || CPP_OPTION (pfile, rliterals)
3928 || (c != 'R' && CPP_OPTION (pfile, uliterals)))
3930 if ((*buffer->cur == '\'' && c != 'R')
3931 || *buffer->cur == '"'
3932 || (*buffer->cur == 'R'
3933 && c != 'R'
3934 && buffer->cur[1] == '"'
3935 && CPP_OPTION (pfile, rliterals))
3936 || (*buffer->cur == '8'
3937 && c == 'u'
3938 && ((buffer->cur[1] == '"' || (buffer->cur[1] == '\''
3939 && CPP_OPTION (pfile, utf8_char_literals)))
3940 || (buffer->cur[1] == 'R' && buffer->cur[2] == '"'
3941 && CPP_OPTION (pfile, rliterals)))))
3943 lex_string (pfile, result, buffer->cur - 1);
3944 break;
3947 /* Fall through. */
3949 case '_':
3950 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
3951 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
3952 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
3953 case 's': case 't': case 'v': case 'w': case 'x':
3954 case 'y': case 'z':
3955 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
3956 case 'G': case 'H': case 'I': case 'J': case 'K':
3957 case 'M': case 'N': case 'O': case 'P': case 'Q':
3958 case 'S': case 'T': case 'V': case 'W': case 'X':
3959 case 'Y': case 'Z':
3960 result->type = CPP_NAME;
3962 struct normalize_state nst = INITIAL_NORMALIZE_STATE;
3963 const auto node = lex_identifier (pfile, buffer->cur - 1, false, &nst,
3964 &result->val.node.spelling);
3965 result->val.node.node = node;
3966 identifier_diagnostics_on_lex (pfile, node);
3967 warn_about_normalization (pfile, result, &nst, true);
3970 /* Convert named operators to their proper types. */
3971 if (result->val.node.node->flags & NODE_OPERATOR)
3973 result->flags |= NAMED_OP;
3974 result->type = (enum cpp_ttype) result->val.node.node->directive_index;
3977 /* Signal FALLTHROUGH comment followed by another token. */
3978 if (fallthrough_comment)
3979 result->flags |= PREV_FALLTHROUGH;
3980 break;
3982 case '\'':
3983 case '"':
3984 lex_string (pfile, result, buffer->cur - 1);
3985 break;
3987 case '/':
3988 /* A potential block or line comment. */
3989 comment_start = buffer->cur;
3990 c = *buffer->cur;
3992 if (c == '*')
3994 if (_cpp_skip_block_comment (pfile))
3995 cpp_error (pfile, CPP_DL_ERROR, "unterminated comment");
3997 else if (c == '/' && ! CPP_OPTION (pfile, traditional))
3999 /* Don't warn for system headers. */
4000 if (_cpp_in_system_header (pfile))
4002 /* Warn about comments if pedantically GNUC89, and not
4003 in system headers. */
4004 else if (CPP_OPTION (pfile, lang) == CLK_GNUC89
4005 && CPP_PEDANTIC (pfile)
4006 && ! buffer->warned_cplusplus_comments)
4008 if (cpp_error (pfile, CPP_DL_PEDWARN,
4009 "C++ style comments are not allowed in ISO C90"))
4010 cpp_error (pfile, CPP_DL_NOTE,
4011 "(this will be reported only once per input file)");
4012 buffer->warned_cplusplus_comments = 1;
4014 /* Or if specifically desired via -Wc90-c99-compat. */
4015 else if (CPP_OPTION (pfile, cpp_warn_c90_c99_compat) > 0
4016 && ! CPP_OPTION (pfile, cplusplus)
4017 && ! buffer->warned_cplusplus_comments)
4019 if (cpp_error (pfile, CPP_DL_WARNING,
4020 "C++ style comments are incompatible with C90"))
4021 cpp_error (pfile, CPP_DL_NOTE,
4022 "(this will be reported only once per input file)");
4023 buffer->warned_cplusplus_comments = 1;
4025 /* In C89/C94, C++ style comments are forbidden. */
4026 else if ((CPP_OPTION (pfile, lang) == CLK_STDC89
4027 || CPP_OPTION (pfile, lang) == CLK_STDC94))
4029 /* But don't be confused about valid code such as
4030 - // immediately followed by *,
4031 - // in a preprocessing directive,
4032 - // in an #if 0 block. */
4033 if (buffer->cur[1] == '*'
4034 || pfile->state.in_directive
4035 || pfile->state.skipping)
4037 result->type = CPP_DIV;
4038 break;
4040 else if (! buffer->warned_cplusplus_comments)
4042 if (cpp_error (pfile, CPP_DL_ERROR,
4043 "C++ style comments are not allowed in "
4044 "ISO C90"))
4045 cpp_error (pfile, CPP_DL_NOTE,
4046 "(this will be reported only once per input "
4047 "file)");
4048 buffer->warned_cplusplus_comments = 1;
4051 if (skip_line_comment (pfile) && CPP_OPTION (pfile, warn_comments))
4052 cpp_warning (pfile, CPP_W_COMMENTS, "multi-line comment");
4054 else if (c == '=')
4056 buffer->cur++;
4057 result->type = CPP_DIV_EQ;
4058 break;
4060 else
4062 result->type = CPP_DIV;
4063 break;
4066 if (fallthrough_comment_p (pfile, comment_start))
4067 fallthrough_comment = true;
4069 if (pfile->cb.comment)
4071 size_t len = pfile->buffer->cur - comment_start;
4072 pfile->cb.comment (pfile, result->src_loc, comment_start - 1,
4073 len + 1);
4076 if (!pfile->state.save_comments)
4078 result->flags |= PREV_WHITE;
4079 goto update_tokens_line;
4082 if (fallthrough_comment)
4083 result->flags |= PREV_FALLTHROUGH;
4085 /* Save the comment as a token in its own right. */
4086 save_comment (pfile, result, comment_start, c);
4087 break;
4089 case '<':
4090 if (pfile->state.angled_headers)
4092 lex_string (pfile, result, buffer->cur - 1);
4093 if (result->type != CPP_LESS)
4094 break;
4097 result->type = CPP_LESS;
4098 if (*buffer->cur == '=')
4100 buffer->cur++, result->type = CPP_LESS_EQ;
4101 if (*buffer->cur == '>'
4102 && CPP_OPTION (pfile, cplusplus)
4103 && CPP_OPTION (pfile, lang) >= CLK_GNUCXX20)
4104 buffer->cur++, result->type = CPP_SPACESHIP;
4106 else if (*buffer->cur == '<')
4108 buffer->cur++;
4109 IF_NEXT_IS ('=', CPP_LSHIFT_EQ, CPP_LSHIFT);
4111 else if (CPP_OPTION (pfile, digraphs))
4113 if (*buffer->cur == ':')
4115 /* C++11 [2.5/3 lex.pptoken], "Otherwise, if the next
4116 three characters are <:: and the subsequent character
4117 is neither : nor >, the < is treated as a preprocessor
4118 token by itself". */
4119 if (CPP_OPTION (pfile, cplusplus)
4120 && CPP_OPTION (pfile, lang) != CLK_CXX98
4121 && CPP_OPTION (pfile, lang) != CLK_GNUCXX
4122 && buffer->cur[1] == ':'
4123 && buffer->cur[2] != ':' && buffer->cur[2] != '>')
4124 break;
4126 buffer->cur++;
4127 result->flags |= DIGRAPH;
4128 result->type = CPP_OPEN_SQUARE;
4130 else if (*buffer->cur == '%')
4132 buffer->cur++;
4133 result->flags |= DIGRAPH;
4134 result->type = CPP_OPEN_BRACE;
4137 break;
4139 case '>':
4140 result->type = CPP_GREATER;
4141 if (*buffer->cur == '=')
4142 buffer->cur++, result->type = CPP_GREATER_EQ;
4143 else if (*buffer->cur == '>')
4145 buffer->cur++;
4146 IF_NEXT_IS ('=', CPP_RSHIFT_EQ, CPP_RSHIFT);
4148 break;
4150 case '%':
4151 result->type = CPP_MOD;
4152 if (*buffer->cur == '=')
4153 buffer->cur++, result->type = CPP_MOD_EQ;
4154 else if (CPP_OPTION (pfile, digraphs))
4156 if (*buffer->cur == ':')
4158 buffer->cur++;
4159 result->flags |= DIGRAPH;
4160 result->type = CPP_HASH;
4161 if (*buffer->cur == '%' && buffer->cur[1] == ':')
4162 buffer->cur += 2, result->type = CPP_PASTE, result->val.token_no = 0;
4164 else if (*buffer->cur == '>')
4166 buffer->cur++;
4167 result->flags |= DIGRAPH;
4168 result->type = CPP_CLOSE_BRACE;
4171 break;
4173 case '.':
4174 result->type = CPP_DOT;
4175 if (ISDIGIT (*buffer->cur))
4177 struct normalize_state nst = INITIAL_NORMALIZE_STATE;
4178 result->type = CPP_NUMBER;
4179 lex_number (pfile, &result->val.str, &nst);
4180 warn_about_normalization (pfile, result, &nst, false);
4182 else if (*buffer->cur == '.' && buffer->cur[1] == '.')
4183 buffer->cur += 2, result->type = CPP_ELLIPSIS;
4184 else if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
4185 buffer->cur++, result->type = CPP_DOT_STAR;
4186 break;
4188 case '+':
4189 result->type = CPP_PLUS;
4190 if (*buffer->cur == '+')
4191 buffer->cur++, result->type = CPP_PLUS_PLUS;
4192 else if (*buffer->cur == '=')
4193 buffer->cur++, result->type = CPP_PLUS_EQ;
4194 break;
4196 case '-':
4197 result->type = CPP_MINUS;
4198 if (*buffer->cur == '>')
4200 buffer->cur++;
4201 result->type = CPP_DEREF;
4202 if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
4203 buffer->cur++, result->type = CPP_DEREF_STAR;
4205 else if (*buffer->cur == '-')
4206 buffer->cur++, result->type = CPP_MINUS_MINUS;
4207 else if (*buffer->cur == '=')
4208 buffer->cur++, result->type = CPP_MINUS_EQ;
4209 break;
4211 case '&':
4212 result->type = CPP_AND;
4213 if (*buffer->cur == '&')
4214 buffer->cur++, result->type = CPP_AND_AND;
4215 else if (*buffer->cur == '=')
4216 buffer->cur++, result->type = CPP_AND_EQ;
4217 break;
4219 case '|':
4220 result->type = CPP_OR;
4221 if (*buffer->cur == '|')
4222 buffer->cur++, result->type = CPP_OR_OR;
4223 else if (*buffer->cur == '=')
4224 buffer->cur++, result->type = CPP_OR_EQ;
4225 break;
4227 case ':':
4228 result->type = CPP_COLON;
4229 if (*buffer->cur == ':' && CPP_OPTION (pfile, scope))
4230 buffer->cur++, result->type = CPP_SCOPE;
4231 else if (*buffer->cur == '>' && CPP_OPTION (pfile, digraphs))
4233 buffer->cur++;
4234 result->flags |= DIGRAPH;
4235 result->type = CPP_CLOSE_SQUARE;
4237 break;
4239 case '*': IF_NEXT_IS ('=', CPP_MULT_EQ, CPP_MULT); break;
4240 case '=': IF_NEXT_IS ('=', CPP_EQ_EQ, CPP_EQ); break;
4241 case '!': IF_NEXT_IS ('=', CPP_NOT_EQ, CPP_NOT); break;
4242 case '^': IF_NEXT_IS ('=', CPP_XOR_EQ, CPP_XOR); break;
4243 case '#': IF_NEXT_IS ('#', CPP_PASTE, CPP_HASH); result->val.token_no = 0; break;
4245 case '?': result->type = CPP_QUERY; break;
4246 case '~': result->type = CPP_COMPL; break;
4247 case ',': result->type = CPP_COMMA; break;
4248 case '(': result->type = CPP_OPEN_PAREN; break;
4249 case ')': result->type = CPP_CLOSE_PAREN; break;
4250 case '[': result->type = CPP_OPEN_SQUARE; break;
4251 case ']': result->type = CPP_CLOSE_SQUARE; break;
4252 case '{': result->type = CPP_OPEN_BRACE; break;
4253 case '}': result->type = CPP_CLOSE_BRACE; break;
4254 case ';': result->type = CPP_SEMICOLON; break;
4256 /* @ is a punctuator in Objective-C. */
4257 case '@': result->type = CPP_ATSIGN; break;
4259 default:
4261 const uchar *base = --buffer->cur;
4262 static int no_warn_cnt;
4264 /* Check for an extended identifier ($ or UCN or UTF-8). */
4265 struct normalize_state nst = INITIAL_NORMALIZE_STATE;
4266 if (forms_identifier_p (pfile, true, &nst))
4268 result->type = CPP_NAME;
4269 const auto node = lex_identifier (pfile, base, true, &nst,
4270 &result->val.node.spelling);
4271 result->val.node.node = node;
4272 identifier_diagnostics_on_lex (pfile, node);
4273 warn_about_normalization (pfile, result, &nst, true);
4274 break;
4277 /* Otherwise this will form a CPP_OTHER token. Parse valid UTF-8 as a
4278 single token. */
4279 buffer->cur++;
4280 if (c >= utf8_signifier)
4282 const uchar *pstr = base;
4283 cppchar_t s;
4284 if (_cpp_valid_utf8 (pfile, &pstr, buffer->rlimit, 0, NULL, &s))
4286 if (s > UCS_LIMIT && CPP_OPTION (pfile, cpp_warn_invalid_utf8))
4288 buffer->cur = base;
4289 _cpp_warn_invalid_utf8 (pfile);
4291 buffer->cur = pstr;
4293 else if (CPP_OPTION (pfile, cpp_warn_invalid_utf8))
4295 buffer->cur = base;
4296 const uchar *end = _cpp_warn_invalid_utf8 (pfile);
4297 buffer->cur = base + 1;
4298 no_warn_cnt = end - buffer->cur;
4301 else if (c >= utf8_continuation
4302 && CPP_OPTION (pfile, cpp_warn_invalid_utf8))
4304 if (no_warn_cnt)
4305 --no_warn_cnt;
4306 else
4308 buffer->cur = base;
4309 _cpp_warn_invalid_utf8 (pfile);
4310 buffer->cur = base + 1;
4313 create_literal (pfile, result, base, buffer->cur - base, CPP_OTHER);
4314 break;
4319 /* Potentially convert the location of the token to a range. */
4320 if (result->src_loc >= RESERVED_LOCATION_COUNT
4321 && result->type != CPP_EOF)
4323 /* Ensure that any line notes are processed, so that we have the
4324 correct physical line/column for the end-point of the token even
4325 when a logical line is split via one or more backslashes. */
4326 if (buffer->cur >= buffer->notes[buffer->cur_note].pos
4327 && !pfile->overlaid_buffer)
4328 _cpp_process_line_notes (pfile, false);
4330 source_range tok_range;
4331 tok_range.m_start = result->src_loc;
4332 tok_range.m_finish
4333 = linemap_position_for_column (pfile->line_table,
4334 CPP_BUF_COLUMN (buffer, buffer->cur));
4336 result->src_loc = COMBINE_LOCATION_DATA (pfile->line_table,
4337 result->src_loc,
4338 tok_range, NULL, 0);
4341 return result;
4344 /* An upper bound on the number of bytes needed to spell TOKEN.
4345 Does not include preceding whitespace. */
4346 unsigned int
4347 cpp_token_len (const cpp_token *token)
4349 unsigned int len;
4351 switch (TOKEN_SPELL (token))
4353 default: len = 6; break;
4354 case SPELL_LITERAL: len = token->val.str.len; break;
4355 case SPELL_IDENT: len = NODE_LEN (token->val.node.node) * 10; break;
4358 return len;
/* Parse one UTF-8 sequence starting at NAME and place the matching
   \U escape (backslash, 'U', eight lowercase hex digits) in BUFFER.
   Return the number of bytes read out of NAME.  (There are always
   10 bytes written to BUFFER, with no terminating NUL.)  Aborts if a
   trailing byte is not a UTF-8 continuation byte.  */

static size_t
utf8_to_ucn (unsigned char *buffer, const unsigned char *name)
{
  int j;
  int ucn_len = 0;
  int ucn_len_c;
  unsigned t;
  unsigned long utf32;

  /* Compute the length of the UTF-8 sequence: the number of leading
     one bits in the first byte.  */
  for (t = *name; t & 0x80; t <<= 1)
    ucn_len++;

  /* Payload bits of the lead byte, then six bits per trailing byte.  */
  utf32 = *name & (0x7F >> ucn_len);
  for (ucn_len_c = 1; ucn_len_c < ucn_len; ucn_len_c++)
    {
      utf32 = (utf32 << 6) | (*++name & 0x3F);

      /* Ill-formed UTF-8.  */
      if ((*name & ~0x3F) != 0x80)
        abort ();
    }

  *buffer++ = '\\';
  *buffer++ = 'U';
  for (j = 7; j >= 0; j--)
    *buffer++ = "0123456789abcdef"[(utf32 >> (4 * j)) & 0xF];
  return ucn_len;
}
4395 /* Given a token TYPE corresponding to a digraph, return a pointer to
4396 the spelling of the digraph. */
4397 static const unsigned char *
4398 cpp_digraph2name (enum cpp_ttype type)
4400 return digraph_spellings[(int) type - (int) CPP_FIRST_DIGRAPH];
4403 /* Write the spelling of an identifier IDENT, using UCNs, to BUFFER.
4404 The buffer must already contain enough space to hold the
4405 token's spelling. Returns a pointer to the character after the
4406 last character written. */
4407 unsigned char *
4408 _cpp_spell_ident_ucns (unsigned char *buffer, cpp_hashnode *ident)
4410 size_t i;
4411 const unsigned char *name = NODE_NAME (ident);
4413 for (i = 0; i < NODE_LEN (ident); i++)
4414 if (name[i] & ~0x7F)
4416 i += utf8_to_ucn (buffer, name + i) - 1;
4417 buffer += 10;
4419 else
4420 *buffer++ = name[i];
4422 return buffer;
4425 /* Write the spelling of a token TOKEN to BUFFER. The buffer must
4426 already contain enough space to hold the token's spelling.
4427 Returns a pointer to the character after the last character written.
4428 FORSTRING is true if this is to be the spelling after translation
4429 phase 1 (with the original spelling of extended identifiers), false
4430 if extended identifiers should always be written using UCNs (there is
4431 no option for always writing them in the internal UTF-8 form).
4432 FIXME: Would be nice if we didn't need the PFILE argument. */
4433 unsigned char *
4434 cpp_spell_token (cpp_reader *pfile, const cpp_token *token,
4435 unsigned char *buffer, bool forstring)
4437 switch (TOKEN_SPELL (token))
4439 case SPELL_OPERATOR:
4441 const unsigned char *spelling;
4442 unsigned char c;
4444 if (token->flags & DIGRAPH)
4445 spelling = cpp_digraph2name (token->type);
4446 else if (token->flags & NAMED_OP)
4447 goto spell_ident;
4448 else
4449 spelling = TOKEN_NAME (token);
4451 while ((c = *spelling++) != '\0')
4452 *buffer++ = c;
4454 break;
4456 spell_ident:
4457 case SPELL_IDENT:
4458 if (forstring)
4460 memcpy (buffer, NODE_NAME (token->val.node.spelling),
4461 NODE_LEN (token->val.node.spelling));
4462 buffer += NODE_LEN (token->val.node.spelling);
4464 else
4465 buffer = _cpp_spell_ident_ucns (buffer, token->val.node.node);
4466 break;
4468 case SPELL_LITERAL:
4469 memcpy (buffer, token->val.str.text, token->val.str.len);
4470 buffer += token->val.str.len;
4471 break;
4473 case SPELL_NONE:
4474 cpp_error (pfile, CPP_DL_ICE,
4475 "unspellable token %s", TOKEN_NAME (token));
4476 break;
4479 return buffer;
4482 /* Returns TOKEN spelt as a null-terminated string. The string is
4483 freed when the reader is destroyed. Useful for diagnostics. */
4484 unsigned char *
4485 cpp_token_as_text (cpp_reader *pfile, const cpp_token *token)
4487 unsigned int len = cpp_token_len (token) + 1;
4488 unsigned char *start = _cpp_unaligned_alloc (pfile, len), *end;
4490 end = cpp_spell_token (pfile, token, start, false);
4491 end[0] = '\0';
4493 return start;
4496 /* Returns a pointer to a string which spells the token defined by
4497 TYPE and FLAGS. Used by C front ends, which really should move to
4498 using cpp_token_as_text. */
4499 const char *
4500 cpp_type2name (enum cpp_ttype type, unsigned char flags)
4502 if (flags & DIGRAPH)
4503 return (const char *) cpp_digraph2name (type);
4504 else if (flags & NAMED_OP)
4505 return cpp_named_operator2name (type);
4507 return (const char *) token_spellings[type].name;
4510 /* Writes the spelling of token to FP, without any preceding space.
4511 Separated from cpp_spell_token for efficiency - to avoid stdio
4512 double-buffering. */
4513 void
4514 cpp_output_token (const cpp_token *token, FILE *fp)
4516 switch (TOKEN_SPELL (token))
4518 case SPELL_OPERATOR:
4520 const unsigned char *spelling;
4521 int c;
4523 if (token->flags & DIGRAPH)
4524 spelling = cpp_digraph2name (token->type);
4525 else if (token->flags & NAMED_OP)
4526 goto spell_ident;
4527 else
4528 spelling = TOKEN_NAME (token);
4530 c = *spelling;
4532 putc (c, fp);
4533 while ((c = *++spelling) != '\0');
4535 break;
4537 spell_ident:
4538 case SPELL_IDENT:
4540 size_t i;
4541 const unsigned char * name = NODE_NAME (token->val.node.node);
4543 for (i = 0; i < NODE_LEN (token->val.node.node); i++)
4544 if (name[i] & ~0x7F)
4546 unsigned char buffer[10];
4547 i += utf8_to_ucn (buffer, name + i) - 1;
4548 fwrite (buffer, 1, 10, fp);
4550 else
4551 fputc (NODE_NAME (token->val.node.node)[i], fp);
4553 break;
4555 case SPELL_LITERAL:
4556 if (token->type == CPP_HEADER_NAME)
4557 fputc ('"', fp);
4558 fwrite (token->val.str.text, 1, token->val.str.len, fp);
4559 if (token->type == CPP_HEADER_NAME)
4560 fputc ('"', fp);
4561 break;
4563 case SPELL_NONE:
4564 /* An error, most probably. */
4565 break;
4569 /* Compare two tokens. */
4571 _cpp_equiv_tokens (const cpp_token *a, const cpp_token *b)
4573 if (a->type == b->type && a->flags == b->flags)
4574 switch (TOKEN_SPELL (a))
4576 default: /* Keep compiler happy. */
4577 case SPELL_OPERATOR:
4578 /* token_no is used to track where multiple consecutive ##
4579 tokens were originally located. */
4580 return (a->type != CPP_PASTE || a->val.token_no == b->val.token_no);
4581 case SPELL_NONE:
4582 return (a->type != CPP_MACRO_ARG
4583 || (a->val.macro_arg.arg_no == b->val.macro_arg.arg_no
4584 && a->val.macro_arg.spelling == b->val.macro_arg.spelling));
4585 case SPELL_IDENT:
4586 return (a->val.node.node == b->val.node.node
4587 && a->val.node.spelling == b->val.node.spelling);
4588 case SPELL_LITERAL:
4589 return (a->val.str.len == b->val.str.len
4590 && !memcmp (a->val.str.text, b->val.str.text,
4591 a->val.str.len));
4594 return 0;
4597 /* Returns nonzero if a space should be inserted to avoid an
4598 accidental token paste for output. For simplicity, it is
4599 conservative, and occasionally advises a space where one is not
4600 needed, e.g. "." and ".2". */
4602 cpp_avoid_paste (cpp_reader *pfile, const cpp_token *token1,
4603 const cpp_token *token2)
4605 enum cpp_ttype a = token1->type, b = token2->type;
4606 cppchar_t c;
4608 if (token1->flags & NAMED_OP)
4609 a = CPP_NAME;
4610 if (token2->flags & NAMED_OP)
4611 b = CPP_NAME;
4613 c = EOF;
4614 if (token2->flags & DIGRAPH)
4615 c = digraph_spellings[(int) b - (int) CPP_FIRST_DIGRAPH][0];
4616 else if (token_spellings[b].category == SPELL_OPERATOR)
4617 c = token_spellings[b].name[0];
4619 /* Quickly get everything that can paste with an '='. */
4620 if ((int) a <= (int) CPP_LAST_EQ && c == '=')
4621 return 1;
4623 switch (a)
4625 case CPP_GREATER: return c == '>';
4626 case CPP_LESS: return c == '<' || c == '%' || c == ':';
4627 case CPP_PLUS: return c == '+';
4628 case CPP_MINUS: return c == '-' || c == '>';
4629 case CPP_DIV: return c == '/' || c == '*'; /* Comments. */
4630 case CPP_MOD: return c == ':' || c == '>';
4631 case CPP_AND: return c == '&';
4632 case CPP_OR: return c == '|';
4633 case CPP_COLON: return c == ':' || c == '>';
4634 case CPP_DEREF: return c == '*';
4635 case CPP_DOT: return c == '.' || c == '%' || b == CPP_NUMBER;
4636 case CPP_HASH: return c == '#' || c == '%'; /* Digraph form. */
4637 case CPP_PRAGMA:
4638 case CPP_NAME: return ((b == CPP_NUMBER
4639 && name_p (pfile, &token2->val.str))
4640 || b == CPP_NAME
4641 || b == CPP_CHAR || b == CPP_STRING); /* L */
4642 case CPP_NUMBER: return (b == CPP_NUMBER || b == CPP_NAME
4643 || b == CPP_CHAR
4644 || c == '.' || c == '+' || c == '-');
4645 /* UCNs */
4646 case CPP_OTHER: return ((token1->val.str.text[0] == '\\'
4647 && b == CPP_NAME)
4648 || (CPP_OPTION (pfile, objc)
4649 && token1->val.str.text[0] == '@'
4650 && (b == CPP_NAME || b == CPP_STRING)));
4651 case CPP_LESS_EQ: return c == '>';
4652 case CPP_STRING:
4653 case CPP_WSTRING:
4654 case CPP_UTF8STRING:
4655 case CPP_STRING16:
4656 case CPP_STRING32: return (CPP_OPTION (pfile, user_literals)
4657 && (b == CPP_NAME
4658 || (TOKEN_SPELL (token2) == SPELL_LITERAL
4659 && ISIDST (token2->val.str.text[0]))));
4661 default: break;
4664 return 0;
4667 /* Output all the remaining tokens on the current line, and a newline
4668 character, to FP. Leading whitespace is removed. If there are
4669 macros, special token padding is not performed. */
4670 void
4671 cpp_output_line (cpp_reader *pfile, FILE *fp)
4673 const cpp_token *token;
4675 token = cpp_get_token (pfile);
4676 while (token->type != CPP_EOF)
4678 cpp_output_token (token, fp);
4679 token = cpp_get_token (pfile);
4680 if (token->flags & PREV_WHITE)
4681 putc (' ', fp);
4684 putc ('\n', fp);
4687 /* Return a string representation of all the remaining tokens on the
4688 current line. The result is allocated using xmalloc and must be
4689 freed by the caller. */
4690 unsigned char *
4691 cpp_output_line_to_string (cpp_reader *pfile, const unsigned char *dir_name)
4693 const cpp_token *token;
4694 unsigned int out = dir_name ? ustrlen (dir_name) : 0;
4695 unsigned int alloced = 120 + out;
4696 unsigned char *result = (unsigned char *) xmalloc (alloced);
4698 /* If DIR_NAME is empty, there are no initial contents. */
4699 if (dir_name)
4701 sprintf ((char *) result, "#%s ", dir_name);
4702 out += 2;
4705 token = cpp_get_token (pfile);
4706 while (token->type != CPP_EOF)
4708 unsigned char *last;
4709 /* Include room for a possible space and the terminating nul. */
4710 unsigned int len = cpp_token_len (token) + 2;
4712 if (out + len > alloced)
4714 alloced *= 2;
4715 if (out + len > alloced)
4716 alloced = out + len;
4717 result = (unsigned char *) xrealloc (result, alloced);
4720 last = cpp_spell_token (pfile, token, &result[out], 0);
4721 out = last - result;
4723 token = cpp_get_token (pfile);
4724 if (token->flags & PREV_WHITE)
4725 result[out++] = ' ';
4728 result[out] = '\0';
4729 return result;
/* Memory buffers.  Changing these three constants can have a dramatic
   effect on performance.  The values here are reasonable defaults,
   but might be tuned.  If you adjust them, be sure to test across a
   range of uses of cpplib, including heavy nested function-like macro
   expansion.  Also check the change in peak memory usage (NJAMD is a
   good tool for this).

   MIN_BUFF_SIZE is the smallest buffer new_buff will create.
   BUFF_SIZE_UPPER_BOUND (MIN_SIZE) is the largest free buffer that
   _cpp_get_buff will hand out for a request of MIN_SIZE bytes.
   EXTENDED_BUFF_SIZE (BUFF, MIN_EXTRA) is the size requested when
   extending BUFF: MIN_EXTRA plus twice the room left in BUFF.  Every
   macro argument is parenthesized so that any expression may be
   passed.  */
#define MIN_BUFF_SIZE 8000
#define BUFF_SIZE_UPPER_BOUND(MIN_SIZE) (MIN_BUFF_SIZE + (MIN_SIZE) * 3 / 2)
#define EXTENDED_BUFF_SIZE(BUFF, MIN_EXTRA) \
        ((MIN_EXTRA) + ((BUFF)->limit - (BUFF)->cur) * 2)

#if MIN_BUFF_SIZE > BUFF_SIZE_UPPER_BOUND (0)
#error BUFF_SIZE_UPPER_BOUND must be at least as large as MIN_BUFF_SIZE!
#endif
4747 /* Create a new allocation buffer. Place the control block at the end
4748 of the buffer, so that buffer overflows will cause immediate chaos. */
4749 static _cpp_buff *
4750 new_buff (size_t len)
4752 _cpp_buff *result;
4753 unsigned char *base;
4755 if (len < MIN_BUFF_SIZE)
4756 len = MIN_BUFF_SIZE;
4757 len = CPP_ALIGN (len);
4759 #ifdef ENABLE_VALGRIND_ANNOTATIONS
4760 /* Valgrind warns about uses of interior pointers, so put _cpp_buff
4761 struct first. */
4762 size_t slen = CPP_ALIGN2 (sizeof (_cpp_buff), 2 * DEFAULT_ALIGNMENT);
4763 base = XNEWVEC (unsigned char, len + slen);
4764 result = (_cpp_buff *) base;
4765 base += slen;
4766 #else
4767 base = XNEWVEC (unsigned char, len + sizeof (_cpp_buff));
4768 result = (_cpp_buff *) (base + len);
4769 #endif
4770 result->base = base;
4771 result->cur = base;
4772 result->limit = base + len;
4773 result->next = NULL;
4774 return result;
4777 /* Place a chain of unwanted allocation buffers on the free list. */
4778 void
4779 _cpp_release_buff (cpp_reader *pfile, _cpp_buff *buff)
4781 _cpp_buff *end = buff;
4783 while (end->next)
4784 end = end->next;
4785 end->next = pfile->free_buffs;
4786 pfile->free_buffs = buff;
4789 /* Return a free buffer of size at least MIN_SIZE. */
4790 _cpp_buff *
4791 _cpp_get_buff (cpp_reader *pfile, size_t min_size)
4793 _cpp_buff *result, **p;
4795 for (p = &pfile->free_buffs;; p = &(*p)->next)
4797 size_t size;
4799 if (*p == NULL)
4800 return new_buff (min_size);
4801 result = *p;
4802 size = result->limit - result->base;
4803 /* Return a buffer that's big enough, but don't waste one that's
4804 way too big. */
4805 if (size >= min_size && size <= BUFF_SIZE_UPPER_BOUND (min_size))
4806 break;
4809 *p = result->next;
4810 result->next = NULL;
4811 result->cur = result->base;
4812 return result;
4815 /* Creates a new buffer with enough space to hold the uncommitted
4816 remaining bytes of BUFF, and at least MIN_EXTRA more bytes. Copies
4817 the excess bytes to the new buffer. Chains the new buffer after
4818 BUFF, and returns the new buffer. */
4819 _cpp_buff *
4820 _cpp_append_extend_buff (cpp_reader *pfile, _cpp_buff *buff, size_t min_extra)
4822 size_t size = EXTENDED_BUFF_SIZE (buff, min_extra);
4823 _cpp_buff *new_buff = _cpp_get_buff (pfile, size);
4825 buff->next = new_buff;
4826 memcpy (new_buff->base, buff->cur, BUFF_ROOM (buff));
4827 return new_buff;
4830 /* Creates a new buffer with enough space to hold the uncommitted
4831 remaining bytes of the buffer pointed to by BUFF, and at least
4832 MIN_EXTRA more bytes. Copies the excess bytes to the new buffer.
4833 Chains the new buffer before the buffer pointed to by BUFF, and
4834 updates the pointer to point to the new buffer. */
4835 void
4836 _cpp_extend_buff (cpp_reader *pfile, _cpp_buff **pbuff, size_t min_extra)
4838 _cpp_buff *new_buff, *old_buff = *pbuff;
4839 size_t size = EXTENDED_BUFF_SIZE (old_buff, min_extra);
4841 new_buff = _cpp_get_buff (pfile, size);
4842 memcpy (new_buff->base, old_buff->cur, BUFF_ROOM (old_buff));
4843 new_buff->next = old_buff;
4844 *pbuff = new_buff;
4847 /* Free a chain of buffers starting at BUFF. */
4848 void
4849 _cpp_free_buff (_cpp_buff *buff)
4851 _cpp_buff *next;
4853 for (; buff; buff = next)
4855 next = buff->next;
4856 #ifdef ENABLE_VALGRIND_ANNOTATIONS
4857 free (buff);
4858 #else
4859 free (buff->base);
4860 #endif
4864 /* Allocate permanent, unaligned storage of length LEN. */
4865 unsigned char *
4866 _cpp_unaligned_alloc (cpp_reader *pfile, size_t len)
4868 _cpp_buff *buff = pfile->u_buff;
4869 unsigned char *result = buff->cur;
4871 if (len > (size_t) (buff->limit - result))
4873 buff = _cpp_get_buff (pfile, len);
4874 buff->next = pfile->u_buff;
4875 pfile->u_buff = buff;
4876 result = buff->cur;
4879 buff->cur = result + len;
4880 return result;
4883 /* Allocate permanent, unaligned storage of length LEN from a_buff.
4884 That buffer is used for growing allocations when saving macro
4885 replacement lists in a #define, and when parsing an answer to an
4886 assertion in #assert, #unassert or #if (and therefore possibly
4887 whilst expanding macros). It therefore must not be used by any
4888 code that they might call: specifically the lexer and the guts of
4889 the macro expander.
4891 All existing other uses clearly fit this restriction: storing
4892 registered pragmas during initialization. */
4893 unsigned char *
4894 _cpp_aligned_alloc (cpp_reader *pfile, size_t len)
4896 _cpp_buff *buff = pfile->a_buff;
4897 unsigned char *result = buff->cur;
4899 if (len > (size_t) (buff->limit - result))
4901 buff = _cpp_get_buff (pfile, len);
4902 buff->next = pfile->a_buff;
4903 pfile->a_buff = buff;
4904 result = buff->cur;
4907 buff->cur = result + len;
4908 return result;
4911 /* Commit or allocate storage from a buffer. */
4913 void *
4914 _cpp_commit_buff (cpp_reader *pfile, size_t size)
4916 void *ptr = BUFF_FRONT (pfile->a_buff);
4918 if (pfile->hash_table->alloc_subobject)
4920 void *copy = pfile->hash_table->alloc_subobject (size);
4921 memcpy (copy, ptr, size);
4922 ptr = copy;
4924 else
4925 BUFF_FRONT (pfile->a_buff) += size;
4927 return ptr;
4930 /* Say which field of TOK is in use. */
4932 enum cpp_token_fld_kind
4933 cpp_token_val_index (const cpp_token *tok)
4935 switch (TOKEN_SPELL (tok))
4937 case SPELL_IDENT:
4938 return CPP_TOKEN_FLD_NODE;
4939 case SPELL_LITERAL:
4940 return CPP_TOKEN_FLD_STR;
4941 case SPELL_OPERATOR:
4942 /* Operands which were originally spelled as ident keep around
4943 the node for the exact spelling. */
4944 if (tok->flags & NAMED_OP)
4945 return CPP_TOKEN_FLD_NODE;
4946 else if (tok->type == CPP_PASTE)
4947 return CPP_TOKEN_FLD_TOKEN_NO;
4948 else
4949 return CPP_TOKEN_FLD_NONE;
4950 case SPELL_NONE:
4951 if (tok->type == CPP_MACRO_ARG)
4952 return CPP_TOKEN_FLD_ARG_NO;
4953 else if (tok->type == CPP_PADDING)
4954 return CPP_TOKEN_FLD_SOURCE;
4955 else if (tok->type == CPP_PRAGMA)
4956 return CPP_TOKEN_FLD_PRAGMA;
4957 /* fall through */
4958 default:
4959 return CPP_TOKEN_FLD_NONE;
4963 /* All tokens lexed in R after calling this function will be forced to
4964 have their location_t to be P, until
4965 cpp_stop_forcing_token_locations is called for R. */
4967 void
4968 cpp_force_token_locations (cpp_reader *r, location_t loc)
4970 r->forced_token_location = loc;
4973 /* Go back to assigning locations naturally for lexed tokens. */
4975 void
4976 cpp_stop_forcing_token_locations (cpp_reader *r)
4978 r->forced_token_location = 0;
/* PEEK points at a backslash.  If it escapes an end of line (LF, CR
   or CR-LF), look past it, repeating for any further escaped line
   ending that follows immediately.  Never advance to LIMIT or beyond:
   if the character after the line ending would not be below LIMIT,
   stay on the backslash.  Returns the resulting position.  */

static const unsigned char *
do_peek_backslash (const unsigned char *peek, const unsigned char *limit)
{
  for (;;)
    {
      const unsigned char *after;

      if (__builtin_expect (peek[1] == '\n', true))
        after = peek + 2;
      else if (__builtin_expect (peek[1] == '\r', false))
        /* Either a lone CR or a CR-LF pair.  */
        after = peek + (peek[2] == '\n' ? 3 : 2);
      else
        /* Not an escaped line ending at all.  */
        return peek;

      if (!__builtin_expect (after < limit, true))
        return peek;

      peek = after;

      /* The user might be perverse and escape several line endings in
         a row.  */
      if (*peek != '\\')
        return peek;
    }
}
5011 static const unsigned char *
5012 do_peek_next (const unsigned char *peek, const unsigned char *limit)
5014 if (__builtin_expect (*peek == '\\', false))
5015 peek = do_peek_backslash (peek, limit);
5016 return peek;
/* Step backwards from PEEK to the previous character, without going
   below BOUND.  Returns NULL if PEEK is already at BOUND.  If the
   previous character is a line ending (LF, CR or CR-LF) that is
   escaped by a backslash, the escaped line ending is skipped and the
   search continues from the backslash.  Otherwise a pointer to the
   previous character is returned.  */
static const unsigned char *
do_peek_prev (const unsigned char *peek, const unsigned char *bound)
{
  if (peek == bound)
    return NULL;

  unsigned char c = *--peek;
  /* Test for a carriage return, not the letter 'r', so that an
     ordinary 'r' after a backslash is not taken for an escaped line
     ending.  */
  if (__builtin_expect (c == '\n', false)
      || __builtin_expect (c == '\r', false))
    {
      /* Nothing before the line ending can be inspected.  */
      if (peek == bound)
        return peek;

      int ix = -1;
      if (c == '\n' && peek[ix] == '\r')
        {
          /* CR-LF pair: look one character further back for the
             backslash.  */
          if (peek + ix == bound)
            return peek;
          ix--;
        }

      if (peek[ix] == '\\')
        /* Escaped line ending: keep going from the backslash.  */
        return do_peek_prev (peek + ix, bound);

      return peek;
    }
  else
    return peek;
}
5048 /* If PEEK[-1] is identifier MATCH, scan past it and trailing white
5049 space. Otherwise return NULL. */
5051 static const unsigned char *
5052 do_peek_ident (const char *match, const unsigned char *peek,
5053 const unsigned char *limit)
5055 for (; *++match; peek++)
5056 if (*peek != *match)
5058 peek = do_peek_next (peek, limit);
5059 if (*peek != *match)
5060 return NULL;
5063 /* Must now not be looking at an identifier char. */
5064 peek = do_peek_next (peek, limit);
5065 if (ISIDNUM (*peek))
5066 return NULL;
5068 /* Skip control-line whitespace. */
5070 while (*peek == ' ' || *peek == '\t')
5071 peek++;
5072 if (__builtin_expect (*peek == '\\', false))
5074 peek = do_peek_backslash (peek, limit);
5075 if (*peek != '\\')
5076 goto ws;
5079 return peek;
5082 /* Are we looking at a module control line starting as PEEK - 1? */
5084 static bool
5085 do_peek_module (cpp_reader *pfile, unsigned char c,
5086 const unsigned char *peek, const unsigned char *limit)
5088 bool import = false;
5090 if (__builtin_expect (c == 'e', false))
5092 if (!((peek[0] == 'x' || peek[0] == '\\')
5093 && (peek = do_peek_ident ("export", peek, limit))))
5094 return false;
5096 /* export, peek for import or module. No need to peek __import
5097 here. */
5098 if (peek[0] == 'i')
5100 if (!((peek[1] == 'm' || peek[1] == '\\')
5101 && (peek = do_peek_ident ("import", peek + 1, limit))))
5102 return false;
5103 import = true;
5105 else if (peek[0] == 'm')
5107 if (!((peek[1] == 'o' || peek[1] == '\\')
5108 && (peek = do_peek_ident ("module", peek + 1, limit))))
5109 return false;
5111 else
5112 return false;
5114 else if (__builtin_expect (c == 'i', false))
5116 if (!((peek[0] == 'm' || peek[0] == '\\')
5117 && (peek = do_peek_ident ("import", peek, limit))))
5118 return false;
5119 import = true;
5121 else if (__builtin_expect (c == '_', false))
5123 /* Needed for translated includes. */
5124 if (!((peek[0] == '_' || peek[0] == '\\')
5125 && (peek = do_peek_ident ("__import", peek, limit))))
5126 return false;
5127 import = true;
5129 else if (__builtin_expect (c == 'm', false))
5131 if (!((peek[0] == 'o' || peek[0] == '\\')
5132 && (peek = do_peek_ident ("module", peek, limit))))
5133 return false;
5135 else
5136 return false;
5138 /* Peek the next character to see if it's good enough. We'll be at
5139 the first non-whitespace char, including skipping an escaped
5140 newline. */
5141 /* ... import followed by identifier, ':', '<' or header-name
5142 preprocessing tokens, or module followed by identifier, ':' or
5143 ';' preprocessing tokens. */
5144 unsigned char p = *peek++;
5146 /* A character literal is ... single quotes, ... optionally preceded
5147 by u8, u, U, or L */
5148 /* A string-literal is a ... double quotes, optionally prefixed by
5149 R, u8, u8R, u, uR, U, UR, L, or LR */
5150 if (p == 'u')
5152 peek = do_peek_next (peek, limit);
5153 if (*peek == '8')
5155 peek++;
5156 goto peek_u8;
5158 goto peek_u;
5160 else if (p == 'U' || p == 'L')
5162 peek_u8:
5163 peek = do_peek_next (peek, limit);
5164 peek_u:
5165 if (*peek == '\"' || *peek == '\'')
5166 return false;
5168 if (*peek == 'R')
5169 goto peek_R;
5170 /* Identifier. Ok. */
5172 else if (p == 'R')
5174 peek_R:
5175 if (CPP_OPTION (pfile, rliterals))
5177 peek = do_peek_next (peek, limit);
5178 if (*peek == '\"')
5179 return false;
5181 /* Identifier. Ok. */
5183 else if ('Z' - 'A' == 25
5184 ? ((p >= 'A' && p <= 'Z') || (p >= 'a' && p <= 'z') || p == '_')
5185 : ISIDST (p))
5187 /* Identifier. Ok. */
5189 else if (p == '<')
5191 /* Maybe angle header, ok for import. Reject
5192 '<=', '<<' digraph:'<:'. */
5193 if (!import)
5194 return false;
5195 peek = do_peek_next (peek, limit);
5196 if (*peek == '=' || *peek == '<'
5197 || (*peek == ':' && CPP_OPTION (pfile, digraphs)))
5198 return false;
5200 else if (p == ';')
5202 /* SEMICOLON, ok for module. */
5203 if (import)
5204 return false;
5206 else if (p == '"')
5208 /* STRING, ok for import. */
5209 if (!import)
5210 return false;
5212 else if (p == ':')
5214 /* Maybe COLON, ok. Reject '::', digraph:':>'. */
5215 peek = do_peek_next (peek, limit);
5216 if (*peek == ':' || (*peek == '>' && CPP_OPTION (pfile, digraphs)))
5217 return false;
5219 else
5220 /* FIXME: Detect a unicode character, excluding those not
5221 permitted as the initial character. [lex.name]/1. I presume
5222 we need to check the \[uU] spellings, and directly using
5223 Unicode in say UTF8 form? Or perhaps we do the phase-1
5224 conversion of UTF8 to universal-character-names? */
5225 return false;
5227 return true;
5230 /* Directives-only scanning. Somewhat more relaxed than correct
5231 parsing -- some ill-formed programs will not be rejected. */
5233 void
5234 cpp_directive_only_process (cpp_reader *pfile,
5235 void *data,
5236 void (*cb) (cpp_reader *, CPP_DO_task, void *, ...))
5238 bool module_p = CPP_OPTION (pfile, module_directives);
5242 restart:
5243 /* Buffer initialization, but no line cleaning. */
5244 cpp_buffer *buffer = pfile->buffer;
5245 buffer->cur_note = buffer->notes_used = 0;
5246 buffer->cur = buffer->line_base = buffer->next_line;
5247 buffer->need_line = false;
5248 /* Files always end in a newline or carriage return. We rely on this for
5249 character peeking safety. */
5250 gcc_assert (buffer->rlimit[0] == '\n' || buffer->rlimit[0] == '\r');
5252 const unsigned char *base = buffer->cur;
5253 unsigned line_count = 0;
5254 const unsigned char *line_start = base;
5256 bool bol = true;
5257 bool raw = false;
5259 const unsigned char *lwm = base;
5260 for (const unsigned char *pos = base, *limit = buffer->rlimit;
5261 pos < limit;)
5263 unsigned char c = *pos++;
5264 /* This matches the switch in _cpp_lex_direct. */
5265 switch (c)
5267 case ' ': case '\t': case '\f': case '\v':
5268 /* Whitespace, do nothing. */
5269 break;
5271 case '\r': /* MAC line ending, or Windows \r\n */
5272 if (*pos == '\n')
5273 pos++;
5274 /* FALLTHROUGH */
5276 case '\n':
5277 bol = true;
5279 next_line:
5280 CPP_INCREMENT_LINE (pfile, 0);
5281 line_count++;
5282 line_start = pos;
5283 break;
5285 case '\\':
5286 /* <backslash><newline> is removed, and doesn't undo any
5287 preceding escape or whatnot. */
5288 if (*pos == '\n')
5290 pos++;
5291 goto next_line;
5293 else if (*pos == '\r')
5295 if (pos[1] == '\n')
5296 pos++;
5297 pos++;
5298 goto next_line;
5300 goto dflt;
5302 case '#':
5303 if (bol)
5305 /* Line directive. */
5306 if (pos - 1 > base && !pfile->state.skipping)
5307 cb (pfile, CPP_DO_print, data,
5308 line_count, base, pos - 1 - base);
5310 /* Prep things for directive handling. */
5311 buffer->next_line = pos;
5312 buffer->need_line = true;
5313 bool ok = _cpp_get_fresh_line (pfile);
5314 gcc_checking_assert (ok);
5316 /* Ensure proper column numbering for generated
5317 error messages. */
5318 buffer->line_base -= pos - line_start;
5320 _cpp_handle_directive (pfile, line_start + 1 != pos);
5322 /* Sanitize the line settings. Duplicate #include's can
5323 mess things up. */
5324 // FIXME: Necessary?
5325 pfile->line_table->highest_location
5326 = pfile->line_table->highest_line;
5328 if (!pfile->state.skipping
5329 && pfile->buffer->next_line < pfile->buffer->rlimit)
5330 cb (pfile, CPP_DO_location, data,
5331 pfile->line_table->highest_line);
5333 goto restart;
5335 goto dflt;
5337 case '/':
5339 const unsigned char *peek = do_peek_next (pos, limit);
5340 if (!(*peek == '/' || *peek == '*'))
5341 goto dflt;
5343 /* Line or block comment */
5344 bool is_block = *peek == '*';
5345 bool star = false;
5346 bool esc = false;
5347 location_t sloc
5348 = linemap_position_for_column (pfile->line_table,
5349 pos - line_start);
5351 while (pos < limit)
5353 char c = *pos++;
5354 switch (c)
5356 case '\\':
5357 esc = true;
5358 break;
5360 case '\r':
5361 if (*pos == '\n')
5362 pos++;
5363 /* FALLTHROUGH */
5365 case '\n':
5367 CPP_INCREMENT_LINE (pfile, 0);
5368 line_count++;
5369 line_start = pos;
5370 if (!esc && !is_block)
5372 bol = true;
5373 goto done_comment;
5376 if (!esc)
5377 star = false;
5378 esc = false;
5379 break;
5381 case '*':
5382 if (pos > peek)
5383 star = is_block;
5384 esc = false;
5385 break;
5387 case '/':
5388 if (star)
5389 goto done_comment;
5390 /* FALLTHROUGH */
5392 default:
5393 star = false;
5394 esc = false;
5395 break;
5398 if (pos < limit || is_block)
5399 cpp_error_with_line (pfile, CPP_DL_ERROR, sloc, 0,
5400 "unterminated comment");
5401 done_comment:
5402 lwm = pos;
5403 break;
5406 case '\'':
5407 if (!CPP_OPTION (pfile, digit_separators))
5408 goto delimited_string;
5410 /* Possibly a number punctuator. */
5411 if (!ISIDNUM (*do_peek_next (pos, limit)))
5412 goto delimited_string;
5414 goto quote_peek;
5416 case '\"':
5417 if (!CPP_OPTION (pfile, rliterals))
5418 goto delimited_string;
5420 quote_peek:
5422 /* For ' see if it's a number punctuator
5423 \.?<digit>(<digit>|<identifier-nondigit>
5424 |'<digit>|'<nondigit>|[eEpP]<sign>|\.)* */
5425 /* For " see if it's a raw string
5426 {U,L,u,u8}R. This includes CPP_NUMBER detection,
5427 because that could be 0e+R. */
5428 const unsigned char *peek = pos - 1;
5429 bool quote_first = c == '"';
5430 bool quote_eight = false;
5431 bool maybe_number_start = false;
5432 bool want_number = false;
5434 while ((peek = do_peek_prev (peek, lwm)))
5436 unsigned char p = *peek;
5437 if (quote_first)
5439 if (!raw)
5441 if (p != 'R')
5442 break;
5443 raw = true;
5444 continue;
5447 quote_first = false;
5448 if (p == 'L' || p == 'U' || p == 'u')
5450 else if (p == '8')
5451 quote_eight = true;
5452 else
5453 goto second_raw;
5455 else if (quote_eight)
5457 if (p != 'u')
5459 raw = false;
5460 break;
5462 quote_eight = false;
5464 else if (c == '"')
5466 second_raw:;
5467 if (!want_number && ISIDNUM (p))
5469 raw = false;
5470 break;
5474 if (ISDIGIT (p))
5475 maybe_number_start = true;
5476 else if (p == '.')
5477 want_number = true;
5478 else if (ISIDNUM (p))
5479 maybe_number_start = false;
5480 else if (p == '+' || p == '-')
5482 if (const unsigned char *peek_prev
5483 = do_peek_prev (peek, lwm))
5485 p = *peek_prev;
5486 if (p == 'e' || p == 'E'
5487 || p == 'p' || p == 'P')
5489 want_number = true;
5490 maybe_number_start = false;
5492 else
5493 break;
5495 else
5496 break;
5498 else if (p == '\'' || p == '\"')
5500 /* If this is lwm, this must be the end of a
5501 previous string. So this is a trailing
5502 literal type, (a) if those are allowed,
5503 and (b) maybe_start is false. Otherwise
5504 this must be a CPP_NUMBER because we've
5505 met another ', and we'd have checked that
5506 in its own right. */
5507 if (peek == lwm && CPP_OPTION (pfile, uliterals))
5509 if (!maybe_number_start && !want_number)
5510 /* Must be a literal type. */
5511 raw = false;
5513 else if (p == '\''
5514 && CPP_OPTION (pfile, digit_separators))
5515 maybe_number_start = true;
5516 break;
5518 else if (c == '\'')
5519 break;
5520 else if (!quote_first && !quote_eight)
5521 break;
5524 if (maybe_number_start)
5526 if (c == '\'')
5527 /* A CPP NUMBER. */
5528 goto dflt;
5529 raw = false;
5532 goto delimited_string;
5535 delimited_string:
5537 /* (Possibly raw) string or char literal. */
5538 unsigned char end = c;
5539 int delim_len = -1;
5540 const unsigned char *delim = NULL;
5541 location_t sloc = linemap_position_for_column (pfile->line_table,
5542 pos - line_start);
5543 int esc = 0;
5545 if (raw)
5547 /* There can be no line breaks in the delimiter. */
5548 delim = pos;
5549 for (delim_len = 0; (c = *pos++) != '('; delim_len++)
5551 if (delim_len == 16)
5553 cpp_error_with_line (pfile, CPP_DL_ERROR,
5554 sloc, 0,
5555 "raw string delimiter"
5556 " longer than %d"
5557 " characters",
5558 delim_len);
5559 raw = false;
5560 pos = delim;
5561 break;
5563 if (strchr (") \\\t\v\f\n", c))
5565 cpp_error_with_line (pfile, CPP_DL_ERROR,
5566 sloc, 0,
5567 "invalid character '%c'"
5568 " in raw string"
5569 " delimiter", c);
5570 raw = false;
5571 pos = delim;
5572 break;
5574 if (pos >= limit)
5575 goto bad_string;
5579 while (pos < limit)
5581 char c = *pos++;
5582 switch (c)
5584 case '\\':
5585 if (!raw)
5586 esc++;
5587 break;
5589 case '\r':
5590 if (*pos == '\n')
5591 pos++;
5592 /* FALLTHROUGH */
5594 case '\n':
5596 CPP_INCREMENT_LINE (pfile, 0);
5597 line_count++;
5598 line_start = pos;
5600 if (esc)
5601 esc--;
5602 break;
5604 case ')':
5605 if (raw
5606 && pos + delim_len + 1 < limit
5607 && pos[delim_len] == end
5608 && !memcmp (delim, pos, delim_len))
5610 pos += delim_len + 1;
5611 raw = false;
5612 goto done_string;
5614 break;
5616 default:
5617 if (!raw && !(esc & 1) && c == end)
5618 goto done_string;
5619 esc = 0;
5620 break;
5623 bad_string:
5624 cpp_error_with_line (pfile, CPP_DL_ERROR, sloc, 0,
5625 "unterminated literal");
5627 done_string:
5628 raw = false;
5629 lwm = pos - 1;
5631 goto dflt;
5633 case '_':
5634 case 'e':
5635 case 'i':
5636 case 'm':
5637 if (bol && module_p && !pfile->state.skipping
5638 && do_peek_module (pfile, c, pos, limit))
5640 /* We've seen the start of a module control line.
5641 Start up the tokenizer. */
5642 pos--; /* Backup over the first character. */
5644 /* Backup over whitespace to start of line. */
5645 while (pos > line_start
5646 && (pos[-1] == ' ' || pos[-1] == '\t'))
5647 pos--;
5649 if (pos > base)
5650 cb (pfile, CPP_DO_print, data, line_count, base, pos - base);
5652 /* Prep things for directive handling. */
5653 buffer->next_line = pos;
5654 buffer->need_line = true;
5656 /* Now get tokens until the PRAGMA_EOL. */
5659 location_t spelling;
5660 const cpp_token *tok
5661 = cpp_get_token_with_location (pfile, &spelling);
5663 gcc_assert (pfile->state.in_deferred_pragma
5664 || tok->type == CPP_PRAGMA_EOL);
5665 cb (pfile, CPP_DO_token, data, tok, spelling);
5667 while (pfile->state.in_deferred_pragma);
5669 if (pfile->buffer->next_line < pfile->buffer->rlimit)
5670 cb (pfile, CPP_DO_location, data,
5671 pfile->line_table->highest_line);
5673 pfile->mi_valid = false;
5674 goto restart;
5676 goto dflt;
5678 default:
5679 dflt:
5680 bol = false;
5681 pfile->mi_valid = false;
5682 break;
5686 if (buffer->rlimit > base && !pfile->state.skipping)
5688 const unsigned char *limit = buffer->rlimit;
5689 /* If the file was not newline terminated, add rlimit, which is
5690 guaranteed to point to a newline, to the end of our range. */
5691 if (limit[-1] != '\n')
5693 limit++;
5694 CPP_INCREMENT_LINE (pfile, 0);
5695 line_count++;
5697 cb (pfile, CPP_DO_print, data, line_count, base, limit - base);
5700 _cpp_pop_buffer (pfile);
5702 while (pfile->buffer);