[Ada] Do not perform useless work in Check_No_Parts_Violations
[official-gcc.git] / libcpp / lex.c
blob3618fa5d7370c1df5ada1cb81e450ac0b7db54c9
1 /* CPP Library - lexical analysis.
2 Copyright (C) 2000-2021 Free Software Foundation, Inc.
3 Contributed by Per Bothner, 1994-95.
4 Based on CCCP program by Paul Rubin, June 1986
5 Adapted to ANSI C, Richard Stallman, Jan 1987
6 Broken out to separate file, Zack Weinberg, Mar 2000
8 This program is free software; you can redistribute it and/or modify it
9 under the terms of the GNU General Public License as published by the
10 Free Software Foundation; either version 3, or (at your option) any
11 later version.
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
18 You should have received a copy of the GNU General Public License
19 along with this program; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "cpplib.h"
25 #include "internal.h"
/* Spelling categories recorded for every token type in the
   token_spellings table.  OP entries of TTYPE_TABLE are always
   SPELL_OPERATOR; TK entries name their own category.  */
enum spell_type
{
  SPELL_OPERATOR = 0,
  SPELL_IDENT = 1,
  SPELL_LITERAL = 2,
  SPELL_NONE = 3
};
35 struct token_spelling
37 enum spell_type category;
38 const unsigned char *name;
41 static const unsigned char *const digraph_spellings[] =
42 { UC"%:", UC"%:%:", UC"<:", UC":>", UC"<%", UC"%>" };
44 #define OP(e, s) { SPELL_OPERATOR, UC s },
45 #define TK(e, s) { SPELL_ ## s, UC #e },
46 static const struct token_spelling token_spellings[N_TTYPES] = { TTYPE_TABLE };
47 #undef OP
48 #undef TK
50 #define TOKEN_SPELL(token) (token_spellings[(token)->type].category)
51 #define TOKEN_NAME(token) (token_spellings[(token)->type].name)
53 static void add_line_note (cpp_buffer *, const uchar *, unsigned int);
54 static int skip_line_comment (cpp_reader *);
55 static void skip_whitespace (cpp_reader *, cppchar_t);
56 static void lex_string (cpp_reader *, cpp_token *, const uchar *);
57 static void save_comment (cpp_reader *, cpp_token *, const uchar *, cppchar_t);
58 static void store_comment (cpp_reader *, cpp_token *);
59 static void create_literal (cpp_reader *, cpp_token *, const uchar *,
60 unsigned int, enum cpp_ttype);
61 static bool warn_in_comment (cpp_reader *, _cpp_line_note *);
62 static int name_p (cpp_reader *, const cpp_string *);
63 static tokenrun *next_tokenrun (tokenrun *);
65 static _cpp_buff *new_buff (size_t);
68 /* Utility routine:
70 Compares, the token TOKEN to the NUL-terminated string STRING.
71 TOKEN must be a CPP_NAME. Returns 1 for equal, 0 for unequal. */
72 int
73 cpp_ideq (const cpp_token *token, const char *string)
75 if (token->type != CPP_NAME)
76 return 0;
78 return !ustrcmp (NODE_NAME (token->val.node.node), (const uchar *) string);
81 /* Record a note TYPE at byte POS into the current cleaned logical
82 line. */
83 static void
84 add_line_note (cpp_buffer *buffer, const uchar *pos, unsigned int type)
86 if (buffer->notes_used == buffer->notes_cap)
88 buffer->notes_cap = buffer->notes_cap * 2 + 200;
89 buffer->notes = XRESIZEVEC (_cpp_line_note, buffer->notes,
90 buffer->notes_cap);
93 buffer->notes[buffer->notes_used].pos = pos;
94 buffer->notes[buffer->notes_used].type = type;
95 buffer->notes_used++;
99 /* Fast path to find line special characters using optimized character
100 scanning algorithms. Anything complicated falls back to the slow
101 path below. Since this loop is very hot it's worth doing these kinds
102 of optimizations.
104 One of the paths through the ifdefs should provide
106 const uchar *search_line_fast (const uchar *s, const uchar *end);
108 Between S and END, search for \n, \r, \\, ?. Return a pointer to
109 the found character.
111 Note that the last character of the buffer is *always* a newline,
112 as forced by _cpp_convert_input. This fact can be used to avoid
113 explicitly looking for the end of the buffer. */
/* Configure defines WORDS_BIGENDIAN on big-endian hosts only; give it
   a value of zero elsewhere so it can be used in plain C conditions.  */
#if !defined (WORDS_BIGENDIAN)
#define WORDS_BIGENDIAN 0
#endif

/* We'd like the largest integer that fits into a register.  There's
   nothing in <stdint.h> that gives us that.  For most hosts this is
   unsigned long, but MS decided on an LLP64 model.  Thankfully when
   building with GCC we can ask for the "real" word size.  */
#ifndef __GNUC__
typedef unsigned long word_type;
#else
typedef unsigned int word_type __attribute__((__mode__(__word__)));
#endif

/* The code below only expects word sizes of 4 or 8 bytes.  Any other
   size yields a negative array bound and so fails to compile.  */
typedef char check_word_type_size
  [(sizeof (word_type) == 4 || sizeof (word_type) == 8) ? 1 : -1];
135 /* Return X with the first N bytes forced to values that won't match one
136 of the interesting characters. Note that NUL is not interesting. */
138 static inline word_type
139 acc_char_mask_misalign (word_type val, unsigned int n)
141 word_type mask = -1;
142 if (WORDS_BIGENDIAN)
143 mask >>= n * 8;
144 else
145 mask <<= n * 8;
146 return val & mask;
149 /* Return X replicated to all byte positions within WORD_TYPE. */
151 static inline word_type
152 acc_char_replicate (uchar x)
154 word_type ret;
156 ret = (x << 24) | (x << 16) | (x << 8) | x;
157 if (sizeof(word_type) == 8)
158 ret = (ret << 16 << 16) | ret;
159 return ret;
162 /* Return non-zero if some byte of VAL is (probably) C. */
164 static inline word_type
165 acc_char_cmp (word_type val, word_type c)
167 #if defined(__GNUC__) && defined(__alpha__)
168 /* We can get exact results using a compare-bytes instruction.
169 Get (val == c) via (0 >= (val ^ c)). */
170 return __builtin_alpha_cmpbge (0, val ^ c);
171 #else
172 word_type magic = 0x7efefefeU;
173 if (sizeof(word_type) == 8)
174 magic = (magic << 16 << 16) | 0xfefefefeU;
175 magic |= 1;
177 val ^= c;
178 return ((val + magic) ^ ~val) & ~magic;
179 #endif
182 /* Given the result of acc_char_cmp is non-zero, return the index of
183 the found character. If this was a false positive, return -1. */
185 static inline int
186 acc_char_index (word_type cmp ATTRIBUTE_UNUSED,
187 word_type val ATTRIBUTE_UNUSED)
189 #if defined(__GNUC__) && defined(__alpha__) && !WORDS_BIGENDIAN
190 /* The cmpbge instruction sets *bits* of the result corresponding to
191 matches in the bytes with no false positives. */
192 return __builtin_ctzl (cmp);
193 #else
194 unsigned int i;
196 /* ??? It would be nice to force unrolling here,
197 and have all of these constants folded. */
198 for (i = 0; i < sizeof(word_type); ++i)
200 uchar c;
201 if (WORDS_BIGENDIAN)
202 c = (val >> (sizeof(word_type) - i - 1) * 8) & 0xff;
203 else
204 c = (val >> i * 8) & 0xff;
206 if (c == '\n' || c == '\r' || c == '\\' || c == '?')
207 return i;
210 return -1;
211 #endif
214 /* A version of the fast scanner using bit fiddling techniques.
216 For 32-bit words, one would normally perform 16 comparisons and
217 16 branches. With this algorithm one performs 24 arithmetic
218 operations and one branch. Whether this is faster with a 32-bit
219 word size is going to be somewhat system dependent.
221 For 64-bit words, we eliminate twice the number of comparisons
222 and branches without increasing the number of arithmetic operations.
223 It's almost certainly going to be a win with 64-bit word size. */
225 static const uchar * search_line_acc_char (const uchar *, const uchar *)
226 ATTRIBUTE_UNUSED;
228 static const uchar *
229 search_line_acc_char (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
231 const word_type repl_nl = acc_char_replicate ('\n');
232 const word_type repl_cr = acc_char_replicate ('\r');
233 const word_type repl_bs = acc_char_replicate ('\\');
234 const word_type repl_qm = acc_char_replicate ('?');
236 unsigned int misalign;
237 const word_type *p;
238 word_type val, t;
240 /* Align the buffer. Mask out any bytes from before the beginning. */
241 p = (word_type *)((uintptr_t)s & -sizeof(word_type));
242 val = *p;
243 misalign = (uintptr_t)s & (sizeof(word_type) - 1);
244 if (misalign)
245 val = acc_char_mask_misalign (val, misalign);
247 /* Main loop. */
248 while (1)
250 t = acc_char_cmp (val, repl_nl);
251 t |= acc_char_cmp (val, repl_cr);
252 t |= acc_char_cmp (val, repl_bs);
253 t |= acc_char_cmp (val, repl_qm);
255 if (__builtin_expect (t != 0, 0))
257 int i = acc_char_index (t, val);
258 if (i >= 0)
259 return (const uchar *)p + i;
262 val = *++p;
266 /* Disable on Solaris 2/x86 until the following problem can be properly
267 autoconfed:
269 The Solaris 10+ assembler tags objects with the instruction set
270 extensions used, so SSE4.2 executables cannot run on machines that
271 don't support that extension. */
273 #if (GCC_VERSION >= 4005) && (__GNUC__ >= 5 || !defined(__PIC__)) && (defined(__i386__) || defined(__x86_64__)) && !(defined(__sun__) && defined(__svr4__))
/* Replicated character data to be shared between implementations.
   Recall that outside of a context with vector support we can't
   define compatible vector types, therefore these are all defined
   in terms of raw characters.  Row order: \n, \r, \\, ?.  */
#define REPL_CHARS_ROW(c) \
  { c, c, c, c, c, c, c, c, c, c, c, c, c, c, c, c }
static const char repl_chars[4][16] __attribute__((aligned(16))) = {
  REPL_CHARS_ROW ('\n'),
  REPL_CHARS_ROW ('\r'),
  REPL_CHARS_ROW ('\\'),
  REPL_CHARS_ROW ('?')
};
#undef REPL_CHARS_ROW
290 /* A version of the fast scanner using MMX vectorized byte compare insns.
292 This uses the PMOVMSKB instruction which was introduced with "MMX2",
293 which was packaged into SSE1; it is also present in the AMD MMX
294 extension. Mark the function as using "sse" so that we emit a real
295 "emms" instruction, rather than the 3dNOW "femms" instruction. */
297 static const uchar *
298 #ifndef __SSE__
299 __attribute__((__target__("sse")))
300 #endif
301 search_line_mmx (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
303 typedef char v8qi __attribute__ ((__vector_size__ (8)));
304 typedef int __m64 __attribute__ ((__vector_size__ (8), __may_alias__));
306 const v8qi repl_nl = *(const v8qi *)repl_chars[0];
307 const v8qi repl_cr = *(const v8qi *)repl_chars[1];
308 const v8qi repl_bs = *(const v8qi *)repl_chars[2];
309 const v8qi repl_qm = *(const v8qi *)repl_chars[3];
311 unsigned int misalign, found, mask;
312 const v8qi *p;
313 v8qi data, t, c;
315 /* Align the source pointer. While MMX doesn't generate unaligned data
316 faults, this allows us to safely scan to the end of the buffer without
317 reading beyond the end of the last page. */
318 misalign = (uintptr_t)s & 7;
319 p = (const v8qi *)((uintptr_t)s & -8);
320 data = *p;
322 /* Create a mask for the bytes that are valid within the first
323 16-byte block. The Idea here is that the AND with the mask
324 within the loop is "free", since we need some AND or TEST
325 insn in order to set the flags for the branch anyway. */
326 mask = -1u << misalign;
328 /* Main loop processing 8 bytes at a time. */
329 goto start;
332 data = *++p;
333 mask = -1;
335 start:
336 t = __builtin_ia32_pcmpeqb(data, repl_nl);
337 c = __builtin_ia32_pcmpeqb(data, repl_cr);
338 t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
339 c = __builtin_ia32_pcmpeqb(data, repl_bs);
340 t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
341 c = __builtin_ia32_pcmpeqb(data, repl_qm);
342 t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
343 found = __builtin_ia32_pmovmskb (t);
344 found &= mask;
346 while (!found);
348 __builtin_ia32_emms ();
350 /* FOUND contains 1 in bits for which we matched a relevant
351 character. Conversion to the byte index is trivial. */
352 found = __builtin_ctz(found);
353 return (const uchar *)p + found;
356 /* A version of the fast scanner using SSE2 vectorized byte compare insns. */
358 static const uchar *
359 #ifndef __SSE2__
360 __attribute__((__target__("sse2")))
361 #endif
362 search_line_sse2 (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
364 typedef char v16qi __attribute__ ((__vector_size__ (16)));
366 const v16qi repl_nl = *(const v16qi *)repl_chars[0];
367 const v16qi repl_cr = *(const v16qi *)repl_chars[1];
368 const v16qi repl_bs = *(const v16qi *)repl_chars[2];
369 const v16qi repl_qm = *(const v16qi *)repl_chars[3];
371 unsigned int misalign, found, mask;
372 const v16qi *p;
373 v16qi data, t;
375 /* Align the source pointer. */
376 misalign = (uintptr_t)s & 15;
377 p = (const v16qi *)((uintptr_t)s & -16);
378 data = *p;
380 /* Create a mask for the bytes that are valid within the first
381 16-byte block. The Idea here is that the AND with the mask
382 within the loop is "free", since we need some AND or TEST
383 insn in order to set the flags for the branch anyway. */
384 mask = -1u << misalign;
386 /* Main loop processing 16 bytes at a time. */
387 goto start;
390 data = *++p;
391 mask = -1;
393 start:
394 t = data == repl_nl;
395 t |= data == repl_cr;
396 t |= data == repl_bs;
397 t |= data == repl_qm;
398 found = __builtin_ia32_pmovmskb128 (t);
399 found &= mask;
401 while (!found);
403 /* FOUND contains 1 in bits for which we matched a relevant
404 character. Conversion to the byte index is trivial. */
405 found = __builtin_ctz(found);
406 return (const uchar *)p + found;
409 #ifdef HAVE_SSE4
410 /* A version of the fast scanner using SSE 4.2 vectorized string insns. */
412 static const uchar *
413 #ifndef __SSE4_2__
414 __attribute__((__target__("sse4.2")))
415 #endif
416 search_line_sse42 (const uchar *s, const uchar *end)
418 typedef char v16qi __attribute__ ((__vector_size__ (16)));
419 static const v16qi search = { '\n', '\r', '?', '\\' };
421 uintptr_t si = (uintptr_t)s;
422 uintptr_t index;
424 /* Check for unaligned input. */
425 if (si & 15)
427 v16qi sv;
429 if (__builtin_expect (end - s < 16, 0)
430 && __builtin_expect ((si & 0xfff) > 0xff0, 0))
432 /* There are less than 16 bytes left in the buffer, and less
433 than 16 bytes left on the page. Reading 16 bytes at this
434 point might generate a spurious page fault. Defer to the
435 SSE2 implementation, which already handles alignment. */
436 return search_line_sse2 (s, end);
439 /* ??? The builtin doesn't understand that the PCMPESTRI read from
440 memory need not be aligned. */
441 sv = __builtin_ia32_loaddqu ((const char *) s);
442 index = __builtin_ia32_pcmpestri128 (search, 4, sv, 16, 0);
444 if (__builtin_expect (index < 16, 0))
445 goto found;
447 /* Advance the pointer to an aligned address. We will re-scan a
448 few bytes, but we no longer need care for reading past the
449 end of a page, since we're guaranteed a match. */
450 s = (const uchar *)((si + 15) & -16);
453 /* Main loop, processing 16 bytes at a time. */
454 #ifdef __GCC_ASM_FLAG_OUTPUTS__
455 while (1)
457 char f;
459 /* By using inline assembly instead of the builtin,
460 we can use the result, as well as the flags set. */
461 __asm ("%vpcmpestri\t$0, %2, %3"
462 : "=c"(index), "=@ccc"(f)
463 : "m"(*s), "x"(search), "a"(4), "d"(16));
464 if (f)
465 break;
467 s += 16;
469 #else
470 s -= 16;
471 /* By doing the whole loop in inline assembly,
472 we can make proper use of the flags set. */
473 __asm ( ".balign 16\n"
474 "0: add $16, %1\n"
475 " %vpcmpestri\t$0, (%1), %2\n"
476 " jnc 0b"
477 : "=&c"(index), "+r"(s)
478 : "x"(search), "a"(4), "d"(16));
479 #endif
481 found:
482 return s + index;
485 #else
486 /* Work around out-dated assemblers without sse4 support. */
487 #define search_line_sse42 search_line_sse2
488 #endif
490 /* Check the CPU capabilities. */
492 #include "../gcc/config/i386/cpuid.h"
494 typedef const uchar * (*search_line_fast_type) (const uchar *, const uchar *);
495 static search_line_fast_type search_line_fast;
497 #define HAVE_init_vectorized_lexer 1
498 static inline void
499 init_vectorized_lexer (void)
501 unsigned dummy, ecx = 0, edx = 0;
502 search_line_fast_type impl = search_line_acc_char;
503 int minimum = 0;
505 #if defined(__SSE4_2__)
506 minimum = 3;
507 #elif defined(__SSE2__)
508 minimum = 2;
509 #elif defined(__SSE__)
510 minimum = 1;
511 #endif
513 if (minimum == 3)
514 impl = search_line_sse42;
515 else if (__get_cpuid (1, &dummy, &dummy, &ecx, &edx) || minimum == 2)
517 if (minimum == 3 || (ecx & bit_SSE4_2))
518 impl = search_line_sse42;
519 else if (minimum == 2 || (edx & bit_SSE2))
520 impl = search_line_sse2;
521 else if (minimum == 1 || (edx & bit_SSE))
522 impl = search_line_mmx;
524 else if (__get_cpuid (0x80000001, &dummy, &dummy, &dummy, &edx))
526 if (minimum == 1
527 || (edx & (bit_MMXEXT | bit_CMOV)) == (bit_MMXEXT | bit_CMOV))
528 impl = search_line_mmx;
531 search_line_fast = impl;
534 #elif (GCC_VERSION >= 4005) && defined(_ARCH_PWR8) && defined(__ALTIVEC__)
536 /* A vection of the fast scanner using AltiVec vectorized byte compares
537 and VSX unaligned loads (when VSX is available). This is otherwise
538 the same as the AltiVec version. */
540 ATTRIBUTE_NO_SANITIZE_UNDEFINED
541 static const uchar *
542 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
544 typedef __attribute__((altivec(vector))) unsigned char vc;
546 const vc repl_nl = {
547 '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
548 '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
550 const vc repl_cr = {
551 '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
552 '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
554 const vc repl_bs = {
555 '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
556 '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
558 const vc repl_qm = {
559 '?', '?', '?', '?', '?', '?', '?', '?',
560 '?', '?', '?', '?', '?', '?', '?', '?',
562 const vc zero = { 0 };
564 vc data, t;
566 /* Main loop processing 16 bytes at a time. */
569 vc m_nl, m_cr, m_bs, m_qm;
571 data = __builtin_vec_vsx_ld (0, s);
572 s += 16;
574 m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
575 m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
576 m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
577 m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
578 t = (m_nl | m_cr) | (m_bs | m_qm);
580 /* T now contains 0xff in bytes for which we matched one of the relevant
581 characters. We want to exit the loop if any byte in T is non-zero.
582 Below is the expansion of vec_any_ne(t, zero). */
584 while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));
586 /* Restore s to to point to the 16 bytes we just processed. */
587 s -= 16;
590 #define N (sizeof(vc) / sizeof(long))
592 union {
593 vc v;
594 /* Statically assert that N is 2 or 4. */
595 unsigned long l[(N == 2 || N == 4) ? N : -1];
596 } u;
597 unsigned long l, i = 0;
599 u.v = t;
601 /* Find the first word of T that is non-zero. */
602 switch (N)
604 case 4:
605 l = u.l[i++];
606 if (l != 0)
607 break;
608 s += sizeof(unsigned long);
609 l = u.l[i++];
610 if (l != 0)
611 break;
612 s += sizeof(unsigned long);
613 /* FALLTHRU */
614 case 2:
615 l = u.l[i++];
616 if (l != 0)
617 break;
618 s += sizeof(unsigned long);
619 l = u.l[i];
622 /* L now contains 0xff in bytes for which we matched one of the
623 relevant characters. We can find the byte index by finding
624 its bit index and dividing by 8. */
625 #ifdef __BIG_ENDIAN__
626 l = __builtin_clzl(l) >> 3;
627 #else
628 l = __builtin_ctzl(l) >> 3;
629 #endif
630 return s + l;
632 #undef N
636 #elif (GCC_VERSION >= 4005) && defined(__ALTIVEC__) && defined (__BIG_ENDIAN__)
638 /* A vection of the fast scanner using AltiVec vectorized byte compares.
639 This cannot be used for little endian because vec_lvsl/lvsr are
640 deprecated for little endian and the code won't work properly. */
641 /* ??? Unfortunately, attribute(target("altivec")) is not yet supported,
642 so we can't compile this function without -maltivec on the command line
643 (or implied by some other switch). */
645 static const uchar *
646 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
648 typedef __attribute__((altivec(vector))) unsigned char vc;
650 const vc repl_nl = {
651 '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
652 '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
654 const vc repl_cr = {
655 '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
656 '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
658 const vc repl_bs = {
659 '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
660 '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
662 const vc repl_qm = {
663 '?', '?', '?', '?', '?', '?', '?', '?',
664 '?', '?', '?', '?', '?', '?', '?', '?',
666 const vc ones = {
667 -1, -1, -1, -1, -1, -1, -1, -1,
668 -1, -1, -1, -1, -1, -1, -1, -1,
670 const vc zero = { 0 };
672 vc data, mask, t;
674 /* Altivec loads automatically mask addresses with -16. This lets us
675 issue the first load as early as possible. */
676 data = __builtin_vec_ld(0, (const vc *)s);
678 /* Discard bytes before the beginning of the buffer. Do this by
679 beginning with all ones and shifting in zeros according to the
680 mis-alignment. The LVSR instruction pulls the exact shift we
681 want from the address. */
682 mask = __builtin_vec_lvsr(0, s);
683 mask = __builtin_vec_perm(zero, ones, mask);
684 data &= mask;
686 /* While altivec loads mask addresses, we still need to align S so
687 that the offset we compute at the end is correct. */
688 s = (const uchar *)((uintptr_t)s & -16);
690 /* Main loop processing 16 bytes at a time. */
691 goto start;
694 vc m_nl, m_cr, m_bs, m_qm;
696 s += 16;
697 data = __builtin_vec_ld(0, (const vc *)s);
699 start:
700 m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
701 m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
702 m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
703 m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
704 t = (m_nl | m_cr) | (m_bs | m_qm);
706 /* T now contains 0xff in bytes for which we matched one of the relevant
707 characters. We want to exit the loop if any byte in T is non-zero.
708 Below is the expansion of vec_any_ne(t, zero). */
710 while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));
713 #define N (sizeof(vc) / sizeof(long))
715 union {
716 vc v;
717 /* Statically assert that N is 2 or 4. */
718 unsigned long l[(N == 2 || N == 4) ? N : -1];
719 } u;
720 unsigned long l, i = 0;
722 u.v = t;
724 /* Find the first word of T that is non-zero. */
725 switch (N)
727 case 4:
728 l = u.l[i++];
729 if (l != 0)
730 break;
731 s += sizeof(unsigned long);
732 l = u.l[i++];
733 if (l != 0)
734 break;
735 s += sizeof(unsigned long);
736 /* FALLTHROUGH */
737 case 2:
738 l = u.l[i++];
739 if (l != 0)
740 break;
741 s += sizeof(unsigned long);
742 l = u.l[i];
745 /* L now contains 0xff in bytes for which we matched one of the
746 relevant characters. We can find the byte index by finding
747 its bit index and dividing by 8. */
748 l = __builtin_clzl(l) >> 3;
749 return s + l;
751 #undef N
755 #elif defined (__ARM_NEON) && defined (__ARM_64BIT_STATE)
756 #include "arm_neon.h"
758 /* This doesn't have to be the exact page size, but no system may use
759 a size smaller than this. ARMv8 requires a minimum page size of
760 4k. The impact of being conservative here is a small number of
761 cases will take the slightly slower entry path into the main
762 loop. */
764 #define AARCH64_MIN_PAGE_SIZE 4096
766 static const uchar *
767 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
769 const uint8x16_t repl_nl = vdupq_n_u8 ('\n');
770 const uint8x16_t repl_cr = vdupq_n_u8 ('\r');
771 const uint8x16_t repl_bs = vdupq_n_u8 ('\\');
772 const uint8x16_t repl_qm = vdupq_n_u8 ('?');
773 const uint8x16_t xmask = (uint8x16_t) vdupq_n_u64 (0x8040201008040201ULL);
775 #ifdef __ARM_BIG_ENDIAN
776 const int16x8_t shift = {8, 8, 8, 8, 0, 0, 0, 0};
777 #else
778 const int16x8_t shift = {0, 0, 0, 0, 8, 8, 8, 8};
779 #endif
781 unsigned int found;
782 const uint8_t *p;
783 uint8x16_t data;
784 uint8x16_t t;
785 uint16x8_t m;
786 uint8x16_t u, v, w;
788 /* Align the source pointer. */
789 p = (const uint8_t *)((uintptr_t)s & -16);
791 /* Assuming random string start positions, with a 4k page size we'll take
792 the slow path about 0.37% of the time. */
793 if (__builtin_expect ((AARCH64_MIN_PAGE_SIZE
794 - (((uintptr_t) s) & (AARCH64_MIN_PAGE_SIZE - 1)))
795 < 16, 0))
797 /* Slow path: the string starts near a possible page boundary. */
798 uint32_t misalign, mask;
800 misalign = (uintptr_t)s & 15;
801 mask = (-1u << misalign) & 0xffff;
802 data = vld1q_u8 (p);
803 t = vceqq_u8 (data, repl_nl);
804 u = vceqq_u8 (data, repl_cr);
805 v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
806 w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
807 t = vorrq_u8 (v, w);
808 t = vandq_u8 (t, xmask);
809 m = vpaddlq_u8 (t);
810 m = vshlq_u16 (m, shift);
811 found = vaddvq_u16 (m);
812 found &= mask;
813 if (found)
814 return (const uchar*)p + __builtin_ctz (found);
816 else
818 data = vld1q_u8 ((const uint8_t *) s);
819 t = vceqq_u8 (data, repl_nl);
820 u = vceqq_u8 (data, repl_cr);
821 v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
822 w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
823 t = vorrq_u8 (v, w);
824 if (__builtin_expect (vpaddd_u64 ((uint64x2_t)t) != 0, 0))
825 goto done;
830 p += 16;
831 data = vld1q_u8 (p);
832 t = vceqq_u8 (data, repl_nl);
833 u = vceqq_u8 (data, repl_cr);
834 v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
835 w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
836 t = vorrq_u8 (v, w);
837 } while (!vpaddd_u64 ((uint64x2_t)t));
839 done:
840 /* Now that we've found the terminating substring, work out precisely where
841 we need to stop. */
842 t = vandq_u8 (t, xmask);
843 m = vpaddlq_u8 (t);
844 m = vshlq_u16 (m, shift);
845 found = vaddvq_u16 (m);
846 return (((((uintptr_t) p) < (uintptr_t) s) ? s : (const uchar *)p)
847 + __builtin_ctz (found));
850 #elif defined (__ARM_NEON)
851 #include "arm_neon.h"
853 static const uchar *
854 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
856 const uint8x16_t repl_nl = vdupq_n_u8 ('\n');
857 const uint8x16_t repl_cr = vdupq_n_u8 ('\r');
858 const uint8x16_t repl_bs = vdupq_n_u8 ('\\');
859 const uint8x16_t repl_qm = vdupq_n_u8 ('?');
860 const uint8x16_t xmask = (uint8x16_t) vdupq_n_u64 (0x8040201008040201ULL);
862 unsigned int misalign, found, mask;
863 const uint8_t *p;
864 uint8x16_t data;
866 /* Align the source pointer. */
867 misalign = (uintptr_t)s & 15;
868 p = (const uint8_t *)((uintptr_t)s & -16);
869 data = vld1q_u8 (p);
871 /* Create a mask for the bytes that are valid within the first
872 16-byte block. The Idea here is that the AND with the mask
873 within the loop is "free", since we need some AND or TEST
874 insn in order to set the flags for the branch anyway. */
875 mask = (-1u << misalign) & 0xffff;
877 /* Main loop, processing 16 bytes at a time. */
878 goto start;
882 uint8x8_t l;
883 uint16x4_t m;
884 uint32x2_t n;
885 uint8x16_t t, u, v, w;
887 p += 16;
888 data = vld1q_u8 (p);
889 mask = 0xffff;
891 start:
892 t = vceqq_u8 (data, repl_nl);
893 u = vceqq_u8 (data, repl_cr);
894 v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
895 w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
896 t = vandq_u8 (vorrq_u8 (v, w), xmask);
897 l = vpadd_u8 (vget_low_u8 (t), vget_high_u8 (t));
898 m = vpaddl_u8 (l);
899 n = vpaddl_u16 (m);
901 found = vget_lane_u32 ((uint32x2_t) vorr_u64 ((uint64x1_t) n,
902 vshr_n_u64 ((uint64x1_t) n, 24)), 0);
903 found &= mask;
905 while (!found);
907 /* FOUND contains 1 in bits for which we matched a relevant
908 character. Conversion to the byte index is trivial. */
909 found = __builtin_ctz (found);
910 return (const uchar *)p + found;
913 #else
915 /* We only have one accelerated alternative. Use a direct call so that
916 we encourage inlining. */
918 #define search_line_fast search_line_acc_char
920 #endif
/* Initialize the lexer if needed.  The only work is selecting the
   vectorized line scanner, and only on hosts that define
   HAVE_init_vectorized_lexer.  */

void
_cpp_init_lexer (void)
{
#if defined (HAVE_init_vectorized_lexer)
  init_vectorized_lexer ();
#endif
}
932 /* Returns with a logical line that contains no escaped newlines or
933 trigraphs. This is a time-critical inner loop. */
934 void
935 _cpp_clean_line (cpp_reader *pfile)
937 cpp_buffer *buffer;
938 const uchar *s;
939 uchar c, *d, *p;
941 buffer = pfile->buffer;
942 buffer->cur_note = buffer->notes_used = 0;
943 buffer->cur = buffer->line_base = buffer->next_line;
944 buffer->need_line = false;
945 s = buffer->next_line;
947 if (!buffer->from_stage3)
949 const uchar *pbackslash = NULL;
951 /* Fast path. This is the common case of an un-escaped line with
952 no trigraphs. The primary win here is by not writing any
953 data back to memory until we have to. */
954 while (1)
956 /* Perform an optimized search for \n, \r, \\, ?. */
957 s = search_line_fast (s, buffer->rlimit);
959 c = *s;
960 if (c == '\\')
962 /* Record the location of the backslash and continue. */
963 pbackslash = s++;
965 else if (__builtin_expect (c == '?', 0))
967 if (__builtin_expect (s[1] == '?', false)
968 && _cpp_trigraph_map[s[2]])
970 /* Have a trigraph. We may or may not have to convert
971 it. Add a line note regardless, for -Wtrigraphs. */
972 add_line_note (buffer, s, s[2]);
973 if (CPP_OPTION (pfile, trigraphs))
975 /* We do, and that means we have to switch to the
976 slow path. */
977 d = (uchar *) s;
978 *d = _cpp_trigraph_map[s[2]];
979 s += 2;
980 goto slow_path;
983 /* Not a trigraph. Continue on fast-path. */
984 s++;
986 else
987 break;
990 /* This must be \r or \n. We're either done, or we'll be forced
991 to write back to the buffer and continue on the slow path. */
992 d = (uchar *) s;
994 if (__builtin_expect (s == buffer->rlimit, false))
995 goto done;
997 /* DOS line ending? */
998 if (__builtin_expect (c == '\r', false) && s[1] == '\n')
1000 s++;
1001 if (s == buffer->rlimit)
1002 goto done;
1005 if (__builtin_expect (pbackslash == NULL, true))
1006 goto done;
1008 /* Check for escaped newline. */
1009 p = d;
1010 while (is_nvspace (p[-1]))
1011 p--;
1012 if (p - 1 != pbackslash)
1013 goto done;
1015 /* Have an escaped newline; process it and proceed to
1016 the slow path. */
1017 add_line_note (buffer, p - 1, p != d ? ' ' : '\\');
1018 d = p - 2;
1019 buffer->next_line = p - 1;
1021 slow_path:
1022 while (1)
1024 c = *++s;
1025 *++d = c;
1027 if (c == '\n' || c == '\r')
1029 /* Handle DOS line endings. */
1030 if (c == '\r' && s != buffer->rlimit && s[1] == '\n')
1031 s++;
1032 if (s == buffer->rlimit)
1033 break;
1035 /* Escaped? */
1036 p = d;
1037 while (p != buffer->next_line && is_nvspace (p[-1]))
1038 p--;
1039 if (p == buffer->next_line || p[-1] != '\\')
1040 break;
1042 add_line_note (buffer, p - 1, p != d ? ' ': '\\');
1043 d = p - 2;
1044 buffer->next_line = p - 1;
1046 else if (c == '?' && s[1] == '?' && _cpp_trigraph_map[s[2]])
1048 /* Add a note regardless, for the benefit of -Wtrigraphs. */
1049 add_line_note (buffer, d, s[2]);
1050 if (CPP_OPTION (pfile, trigraphs))
1052 *d = _cpp_trigraph_map[s[2]];
1053 s += 2;
1058 else
1060 while (*s != '\n' && *s != '\r')
1061 s++;
1062 d = (uchar *) s;
1064 /* Handle DOS line endings. */
1065 if (*s == '\r' && s + 1 != buffer->rlimit && s[1] == '\n')
1066 s++;
1069 done:
1070 *d = '\n';
1071 /* A sentinel note that should never be processed. */
1072 add_line_note (buffer, d + 1, '\n');
1073 buffer->next_line = s + 1;
1076 /* Return true if the trigraph indicated by NOTE should be warned
1077 about in a comment. */
1078 static bool
1079 warn_in_comment (cpp_reader *pfile, _cpp_line_note *note)
1081 const uchar *p;
1083 /* Within comments we don't warn about trigraphs, unless the
1084 trigraph forms an escaped newline, as that may change
1085 behavior. */
1086 if (note->type != '/')
1087 return false;
1089 /* If -trigraphs, then this was an escaped newline iff the next note
1090 is coincident. */
1091 if (CPP_OPTION (pfile, trigraphs))
1092 return note[1].pos == note->pos;
1094 /* Otherwise, see if this forms an escaped newline. */
1095 p = note->pos + 3;
1096 while (is_nvspace (*p))
1097 p++;
1099 /* There might have been escaped newlines between the trigraph and the
1100 newline we found. Hence the position test. */
1101 return (*p == '\n' && p < note[1].pos);
1104 /* Process the notes created by add_line_note as far as the current
1105 location. */
1106 void
1107 _cpp_process_line_notes (cpp_reader *pfile, int in_comment)
1109 cpp_buffer *buffer = pfile->buffer;
1111 for (;;)
1113 _cpp_line_note *note = &buffer->notes[buffer->cur_note];
1114 unsigned int col;
1116 if (note->pos > buffer->cur)
1117 break;
1119 buffer->cur_note++;
1120 col = CPP_BUF_COLUMN (buffer, note->pos + 1);
1122 if (note->type == '\\' || note->type == ' ')
1124 if (note->type == ' ' && !in_comment)
1125 cpp_error_with_line (pfile, CPP_DL_WARNING, pfile->line_table->highest_line, col,
1126 "backslash and newline separated by space");
1128 if (buffer->next_line > buffer->rlimit)
1130 cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line, col,
1131 "backslash-newline at end of file");
1132 /* Prevent "no newline at end of file" warning. */
1133 buffer->next_line = buffer->rlimit;
1136 buffer->line_base = note->pos;
1137 CPP_INCREMENT_LINE (pfile, 0);
1139 else if (_cpp_trigraph_map[note->type])
1141 if (CPP_OPTION (pfile, warn_trigraphs)
1142 && (!in_comment || warn_in_comment (pfile, note)))
1144 if (CPP_OPTION (pfile, trigraphs))
1145 cpp_warning_with_line (pfile, CPP_W_TRIGRAPHS,
1146 pfile->line_table->highest_line, col,
1147 "trigraph ??%c converted to %c",
1148 note->type,
1149 (int) _cpp_trigraph_map[note->type]);
1150 else
1152 cpp_warning_with_line
1153 (pfile, CPP_W_TRIGRAPHS,
1154 pfile->line_table->highest_line, col,
1155 "trigraph ??%c ignored, use -trigraphs to enable",
1156 note->type);
1160 else if (note->type == 0)
1161 /* Already processed in lex_raw_string. */;
1162 else
1163 abort ();
1167 /* Skip a C-style block comment. We find the end of the comment by
1168 seeing if an asterisk is before every '/' we encounter. Returns
1169 nonzero if comment terminated by EOF, zero otherwise.
1171 Buffer->cur points to the initial asterisk of the comment. */
1172 bool
1173 _cpp_skip_block_comment (cpp_reader *pfile)
1175 cpp_buffer *buffer = pfile->buffer;
1176 const uchar *cur = buffer->cur;
1177 uchar c;
1179 cur++;
1180 if (*cur == '/')
1181 cur++;
1183 for (;;)
1185 /* People like decorating comments with '*', so check for '/'
1186 instead for efficiency. */
1187 c = *cur++;
1189 if (c == '/')
1191 if (cur[-2] == '*')
1192 break;
1194 /* Warn about potential nested comments, but not if the '/'
1195 comes immediately before the true comment delimiter.
1196 Don't bother to get it right across escaped newlines. */
1197 if (CPP_OPTION (pfile, warn_comments)
1198 && cur[0] == '*' && cur[1] != '/')
1200 buffer->cur = cur;
1201 cpp_warning_with_line (pfile, CPP_W_COMMENTS,
1202 pfile->line_table->highest_line,
1203 CPP_BUF_COL (buffer),
1204 "\"/*\" within comment");
1207 else if (c == '\n')
1209 unsigned int cols;
1210 buffer->cur = cur - 1;
1211 _cpp_process_line_notes (pfile, true);
1212 if (buffer->next_line >= buffer->rlimit)
1213 return true;
1214 _cpp_clean_line (pfile);
1216 cols = buffer->next_line - buffer->line_base;
1217 CPP_INCREMENT_LINE (pfile, cols);
1219 cur = buffer->cur;
1223 buffer->cur = cur;
1224 _cpp_process_line_notes (pfile, true);
1225 return false;
1228 /* Skip a C++ line comment, leaving buffer->cur pointing to the
1229 terminating newline. Handles escaped newlines. Returns nonzero
1230 if a multiline comment. */
1231 static int
1232 skip_line_comment (cpp_reader *pfile)
1234 cpp_buffer *buffer = pfile->buffer;
1235 location_t orig_line = pfile->line_table->highest_line;
1237 while (*buffer->cur != '\n')
1238 buffer->cur++;
1240 _cpp_process_line_notes (pfile, true);
1241 return orig_line != pfile->line_table->highest_line;
1244 /* Skips whitespace, saving the next non-whitespace character. */
1245 static void
1246 skip_whitespace (cpp_reader *pfile, cppchar_t c)
1248 cpp_buffer *buffer = pfile->buffer;
1249 bool saw_NUL = false;
1253 /* Horizontal space always OK. */
1254 if (c == ' ' || c == '\t')
1256 /* Just \f \v or \0 left. */
1257 else if (c == '\0')
1258 saw_NUL = true;
1259 else if (pfile->state.in_directive && CPP_PEDANTIC (pfile))
1260 cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line,
1261 CPP_BUF_COL (buffer),
1262 "%s in preprocessing directive",
1263 c == '\f' ? "form feed" : "vertical tab");
1265 c = *buffer->cur++;
1267 /* We only want non-vertical space, i.e. ' ' \t \f \v \0. */
1268 while (is_nvspace (c));
1270 if (saw_NUL)
1271 cpp_error (pfile, CPP_DL_WARNING, "null character(s) ignored");
1273 buffer->cur--;
1276 /* See if the characters of a number token are valid in a name (no
1277 '.', '+' or '-'). */
1278 static int
1279 name_p (cpp_reader *pfile, const cpp_string *string)
1281 unsigned int i;
1283 for (i = 0; i < string->len; i++)
1284 if (!is_idchar (string->text[i]))
1285 return 0;
1287 return 1;
1290 /* After parsing an identifier or other sequence, produce a warning about
1291 sequences not in NFC/NFKC. */
1292 static void
1293 warn_about_normalization (cpp_reader *pfile,
1294 const cpp_token *token,
1295 const struct normalize_state *s)
1297 if (CPP_OPTION (pfile, warn_normalize) < NORMALIZE_STATE_RESULT (s)
1298 && !pfile->state.skipping)
1300 /* Make sure that the token is printed using UCNs, even
1301 if we'd otherwise happily print UTF-8. */
1302 unsigned char *buf = XNEWVEC (unsigned char, cpp_token_len (token));
1303 size_t sz;
1305 sz = cpp_spell_token (pfile, token, buf, false) - buf;
1306 if (NORMALIZE_STATE_RESULT (s) == normalized_C)
1307 cpp_warning_with_line (pfile, CPP_W_NORMALIZE, token->src_loc, 0,
1308 "`%.*s' is not in NFKC", (int) sz, buf);
1309 else
1310 cpp_warning_with_line (pfile, CPP_W_NORMALIZE, token->src_loc, 0,
1311 "`%.*s' is not in NFC", (int) sz, buf);
1312 free (buf);
1316 static const cppchar_t utf8_signifier = 0xC0;
1318 /* Returns TRUE if the sequence starting at buffer->cur is valid in
1319 an identifier. FIRST is TRUE if this starts an identifier. */
1320 static bool
1321 forms_identifier_p (cpp_reader *pfile, int first,
1322 struct normalize_state *state)
1324 cpp_buffer *buffer = pfile->buffer;
1326 if (*buffer->cur == '$')
1328 if (!CPP_OPTION (pfile, dollars_in_ident))
1329 return false;
1331 buffer->cur++;
1332 if (CPP_OPTION (pfile, warn_dollars) && !pfile->state.skipping)
1334 CPP_OPTION (pfile, warn_dollars) = 0;
1335 cpp_error (pfile, CPP_DL_PEDWARN, "'$' in identifier or number");
1338 return true;
1341 /* Is this a syntactically valid UCN or a valid UTF-8 char? */
1342 if (CPP_OPTION (pfile, extended_identifiers))
1344 cppchar_t s;
1345 if (*buffer->cur >= utf8_signifier)
1347 if (_cpp_valid_utf8 (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
1348 state, &s))
1349 return true;
1351 else if (*buffer->cur == '\\'
1352 && (buffer->cur[1] == 'u' || buffer->cur[1] == 'U'))
1354 buffer->cur += 2;
1355 if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
1356 state, &s, NULL, NULL))
1357 return true;
1358 buffer->cur -= 2;
1362 return false;
1365 /* Helper function to issue error about improper __VA_OPT__ use. */
1366 static void
1367 maybe_va_opt_error (cpp_reader *pfile)
1369 if (CPP_PEDANTIC (pfile) && !CPP_OPTION (pfile, va_opt))
1371 /* __VA_OPT__ should not be accepted at all, but allow it in
1372 system headers. */
1373 if (!_cpp_in_system_header (pfile))
1374 cpp_error (pfile, CPP_DL_PEDWARN,
1375 "__VA_OPT__ is not available until C++20");
1377 else if (!pfile->state.va_args_ok)
1379 /* __VA_OPT__ should only appear in the replacement list of a
1380 variadic macro. */
1381 cpp_error (pfile, CPP_DL_PEDWARN,
1382 "__VA_OPT__ can only appear in the expansion"
1383 " of a C++20 variadic macro");
1387 /* Helper function to get the cpp_hashnode of the identifier BASE. */
1388 static cpp_hashnode *
1389 lex_identifier_intern (cpp_reader *pfile, const uchar *base)
1391 cpp_hashnode *result;
1392 const uchar *cur;
1393 unsigned int len;
1394 unsigned int hash = HT_HASHSTEP (0, *base);
1396 cur = base + 1;
1397 while (ISIDNUM (*cur))
1399 hash = HT_HASHSTEP (hash, *cur);
1400 cur++;
1402 len = cur - base;
1403 hash = HT_HASHFINISH (hash, len);
1404 result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
1405 base, len, hash, HT_ALLOC));
1407 /* Rarely, identifiers require diagnostics when lexed. */
1408 if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
1409 && !pfile->state.skipping, 0))
1411 /* It is allowed to poison the same identifier twice. */
1412 if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
1413 cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
1414 NODE_NAME (result));
1416 /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
1417 replacement list of a variadic macro. */
1418 if (result == pfile->spec_nodes.n__VA_ARGS__
1419 && !pfile->state.va_args_ok)
1421 if (CPP_OPTION (pfile, cplusplus))
1422 cpp_error (pfile, CPP_DL_PEDWARN,
1423 "__VA_ARGS__ can only appear in the expansion"
1424 " of a C++11 variadic macro");
1425 else
1426 cpp_error (pfile, CPP_DL_PEDWARN,
1427 "__VA_ARGS__ can only appear in the expansion"
1428 " of a C99 variadic macro");
1431 if (result == pfile->spec_nodes.n__VA_OPT__)
1432 maybe_va_opt_error (pfile);
1434 /* For -Wc++-compat, warn about use of C++ named operators. */
1435 if (result->flags & NODE_WARN_OPERATOR)
1436 cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
1437 "identifier \"%s\" is a special operator name in C++",
1438 NODE_NAME (result));
1441 return result;
1444 /* Get the cpp_hashnode of an identifier specified by NAME in
1445 the current cpp_reader object. If none is found, NULL is returned. */
1446 cpp_hashnode *
1447 _cpp_lex_identifier (cpp_reader *pfile, const char *name)
1449 cpp_hashnode *result;
1450 result = lex_identifier_intern (pfile, (uchar *) name);
1451 return result;
1454 /* Lex an identifier starting at BUFFER->CUR - 1. */
1455 static cpp_hashnode *
1456 lex_identifier (cpp_reader *pfile, const uchar *base, bool starts_ucn,
1457 struct normalize_state *nst, cpp_hashnode **spelling)
1459 cpp_hashnode *result;
1460 const uchar *cur;
1461 unsigned int len;
1462 unsigned int hash = HT_HASHSTEP (0, *base);
1464 cur = pfile->buffer->cur;
1465 if (! starts_ucn)
1467 while (ISIDNUM (*cur))
1469 hash = HT_HASHSTEP (hash, *cur);
1470 cur++;
1472 NORMALIZE_STATE_UPDATE_IDNUM (nst, *(cur - 1));
1474 pfile->buffer->cur = cur;
1475 if (starts_ucn || forms_identifier_p (pfile, false, nst))
1477 /* Slower version for identifiers containing UCNs
1478 or extended chars (including $). */
1479 do {
1480 while (ISIDNUM (*pfile->buffer->cur))
1482 NORMALIZE_STATE_UPDATE_IDNUM (nst, *pfile->buffer->cur);
1483 pfile->buffer->cur++;
1485 } while (forms_identifier_p (pfile, false, nst));
1486 result = _cpp_interpret_identifier (pfile, base,
1487 pfile->buffer->cur - base);
1488 *spelling = cpp_lookup (pfile, base, pfile->buffer->cur - base);
1490 else
1492 len = cur - base;
1493 hash = HT_HASHFINISH (hash, len);
1495 result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
1496 base, len, hash, HT_ALLOC));
1497 *spelling = result;
1500 /* Rarely, identifiers require diagnostics when lexed. */
1501 if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
1502 && !pfile->state.skipping, 0))
1504 /* It is allowed to poison the same identifier twice. */
1505 if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
1506 cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
1507 NODE_NAME (result));
1509 /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
1510 replacement list of a variadic macro. */
1511 if (result == pfile->spec_nodes.n__VA_ARGS__
1512 && !pfile->state.va_args_ok)
1514 if (CPP_OPTION (pfile, cplusplus))
1515 cpp_error (pfile, CPP_DL_PEDWARN,
1516 "__VA_ARGS__ can only appear in the expansion"
1517 " of a C++11 variadic macro");
1518 else
1519 cpp_error (pfile, CPP_DL_PEDWARN,
1520 "__VA_ARGS__ can only appear in the expansion"
1521 " of a C99 variadic macro");
1524 /* __VA_OPT__ should only appear in the replacement list of a
1525 variadic macro. */
1526 if (result == pfile->spec_nodes.n__VA_OPT__)
1527 maybe_va_opt_error (pfile);
1529 /* For -Wc++-compat, warn about use of C++ named operators. */
1530 if (result->flags & NODE_WARN_OPERATOR)
1531 cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
1532 "identifier \"%s\" is a special operator name in C++",
1533 NODE_NAME (result));
1536 return result;
1539 /* Lex a number to NUMBER starting at BUFFER->CUR - 1. */
1540 static void
1541 lex_number (cpp_reader *pfile, cpp_string *number,
1542 struct normalize_state *nst)
1544 const uchar *cur;
1545 const uchar *base;
1546 uchar *dest;
1548 base = pfile->buffer->cur - 1;
1551 const uchar *adj_digit_sep = NULL;
1552 cur = pfile->buffer->cur;
1554 /* N.B. ISIDNUM does not include $. */
1555 while (ISIDNUM (*cur)
1556 || (*cur == '.' && !DIGIT_SEP (cur[-1]))
1557 || DIGIT_SEP (*cur)
1558 || (VALID_SIGN (*cur, cur[-1]) && !DIGIT_SEP (cur[-2])))
1560 NORMALIZE_STATE_UPDATE_IDNUM (nst, *cur);
1561 /* Adjacent digit separators do not form part of the pp-number syntax.
1562 However, they can safely be diagnosed here as an error, since '' is
1563 not a valid preprocessing token. */
1564 if (DIGIT_SEP (*cur) && DIGIT_SEP (cur[-1]) && !adj_digit_sep)
1565 adj_digit_sep = cur;
1566 cur++;
1568 /* A number can't end with a digit separator. */
1569 while (cur > pfile->buffer->cur && DIGIT_SEP (cur[-1]))
1570 --cur;
1571 if (adj_digit_sep && adj_digit_sep < cur)
1572 cpp_error (pfile, CPP_DL_ERROR, "adjacent digit separators");
1574 pfile->buffer->cur = cur;
1576 while (forms_identifier_p (pfile, false, nst));
1578 number->len = cur - base;
1579 dest = _cpp_unaligned_alloc (pfile, number->len + 1);
1580 memcpy (dest, base, number->len);
1581 dest[number->len] = '\0';
1582 number->text = dest;
1585 /* Create a token of type TYPE with a literal spelling. */
1586 static void
1587 create_literal (cpp_reader *pfile, cpp_token *token, const uchar *base,
1588 unsigned int len, enum cpp_ttype type)
1590 token->type = type;
1591 token->val.str.len = len;
1592 token->val.str.text = cpp_alloc_token_string (pfile, base, len);
1595 const uchar *
1596 cpp_alloc_token_string (cpp_reader *pfile,
1597 const unsigned char *ptr, unsigned len)
1599 uchar *dest = _cpp_unaligned_alloc (pfile, len + 1);
1601 dest[len] = 0;
1602 memcpy (dest, ptr, len);
1603 return dest;
1606 /* A pair of raw buffer pointers. The currently open one is [1], the
1607 first one is [0]. Used for string literal lexing. */
1608 struct lit_accum {
1609 _cpp_buff *first;
1610 _cpp_buff *last;
1611 const uchar *rpos;
1612 size_t accum;
1614 lit_accum ()
1615 : first (NULL), last (NULL), rpos (0), accum (0)
1619 void append (cpp_reader *, const uchar *, size_t);
1621 void read_begin (cpp_reader *);
1622 bool reading_p () const
1624 return rpos != NULL;
1626 char read_char ()
1628 char c = *rpos++;
1629 if (rpos == BUFF_FRONT (last))
1630 rpos = NULL;
1631 return c;
1635 /* Subroutine of lex_raw_string: Append LEN chars from BASE to the buffer
1636 sequence from *FIRST_BUFF_P to LAST_BUFF_P. */
1638 void
1639 lit_accum::append (cpp_reader *pfile, const uchar *base, size_t len)
1641 if (!last)
1642 /* Starting. */
1643 first = last = _cpp_get_buff (pfile, len);
1644 else if (len > BUFF_ROOM (last))
1646 /* There is insufficient room in the buffer. Copy what we can,
1647 and then either extend or create a new one. */
1648 size_t room = BUFF_ROOM (last);
1649 memcpy (BUFF_FRONT (last), base, room);
1650 BUFF_FRONT (last) += room;
1651 base += room;
1652 len -= room;
1653 accum += room;
1655 gcc_checking_assert (!rpos);
1657 last = _cpp_append_extend_buff (pfile, last, len);
1660 memcpy (BUFF_FRONT (last), base, len);
1661 BUFF_FRONT (last) += len;
1662 accum += len;
1665 void
1666 lit_accum::read_begin (cpp_reader *pfile)
1668 /* We never accumulate more than 4 chars to read. */
1669 if (BUFF_ROOM (last) < 4)
1671 last = _cpp_append_extend_buff (pfile, last, 4);
1672 rpos = BUFF_FRONT (last);
1675 /* Returns true if a macro has been defined.
1676 This might not work if compile with -save-temps,
1677 or preprocess separately from compilation. */
1679 static bool
1680 is_macro(cpp_reader *pfile, const uchar *base)
1682 const uchar *cur = base;
1683 if (! ISIDST (*cur))
1684 return false;
1685 unsigned int hash = HT_HASHSTEP (0, *cur);
1686 ++cur;
1687 while (ISIDNUM (*cur))
1689 hash = HT_HASHSTEP (hash, *cur);
1690 ++cur;
1692 hash = HT_HASHFINISH (hash, cur - base);
1694 cpp_hashnode *result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
1695 base, cur - base, hash, HT_NO_INSERT));
1697 return result && cpp_macro_p (result);
1700 /* Returns true if a literal suffix does not have the expected form
1701 and is defined as a macro. */
1703 static bool
1704 is_macro_not_literal_suffix(cpp_reader *pfile, const uchar *base)
1706 /* User-defined literals outside of namespace std must start with a single
1707 underscore, so assume anything of that form really is a UDL suffix.
1708 We don't need to worry about UDLs defined inside namespace std because
1709 their names are reserved, so cannot be used as macro names in valid
1710 programs. */
1711 if (base[0] == '_' && base[1] != '_')
1712 return false;
1713 return is_macro (pfile, base);
1716 /* Lexes a raw string. The stored string contains the spelling,
1717 including double quotes, delimiter string, '(' and ')', any leading
1718 'L', 'u', 'U' or 'u8' and 'R' modifier. The created token contains
1719 the type of the literal, or CPP_OTHER if it was not properly
1720 terminated.
1722 BASE is the start of the token. Updates pfile->buffer->cur to just
1723 after the lexed string.
1725 The spelling is NUL-terminated, but it is not guaranteed that this
1726 is the first NUL since embedded NULs are preserved. */
1728 static void
1729 lex_raw_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
1731 const uchar *pos = base;
1733 /* 'tis a pity this information isn't passed down from the lexer's
1734 initial categorization of the token. */
1735 enum cpp_ttype type = CPP_STRING;
/* Derive the literal type from the encoding prefix and step POS over it. */
1737 if (*pos == 'L')
1739 type = CPP_WSTRING;
1740 pos++;
1742 else if (*pos == 'U')
1744 type = CPP_STRING32;
1745 pos++;
1747 else if (*pos == 'u')
1749 if (pos[1] == '8')
1751 type = CPP_UTF8STRING;
1752 pos++;
1754 else
1755 type = CPP_STRING16;
1756 pos++;
1759 gcc_checking_assert (pos[0] == 'R' && pos[1] == '"');
1760 pos += 2;
1762 _cpp_line_note *note = &pfile->buffer->notes[pfile->buffer->cur_note];
1764 /* Skip notes before the ". */
1765 while (note->pos < pos)
1766 ++note;
1768 lit_accum accum;
/* PREFIX collects the delimiter (at most 16 characters) followed by the
   closing '"' stored when '(' is seen, hence 17 entries. */
1770 uchar prefix[17];
1771 unsigned prefix_len = 0;
/* PHASE_PREFIX: still collecting the delimiter.  PHASE_NONE: inside the
   string body.  Values >= PHASE_SUFFIX index into PREFIX while the text
   after a ')' is being matched against the delimiter and closing '"'. */
1772 enum Phase
1774 PHASE_PREFIX = -2,
1775 PHASE_NONE = -1,
1776 PHASE_SUFFIX = 0
1777 } phase = PHASE_PREFIX;
1779 for (;;)
1781 gcc_checking_assert (note->pos >= pos);
1783 /* Undo any escaped newlines and trigraphs. */
1784 if (!accum.reading_p () && note->pos == pos)
1785 switch (note->type)
1787 case '\\':
1788 case ' ':
1789 /* Restore backslash followed by newline. */
1790 accum.append (pfile, base, pos - base);
1791 base = pos;
1792 accum.read_begin (pfile);
1793 accum.append (pfile, UC"\\", 1);
/* Also reached by goto from the ??/ trigraph case below. */
1795 after_backslash:
1796 if (note->type == ' ')
1797 /* GNU backslash whitespace newline extension. FIXME
1798 could be any sequence of non-vertical space. When we
1799 can properly restore any such sequence, we should
1800 mark this note as handled so _cpp_process_line_notes
1801 doesn't warn. */
1802 accum.append (pfile, UC" ", 1);
1804 accum.append (pfile, UC"\n", 1);
1805 note++;
1806 break;
1808 case '\n':
1809 /* This can happen for ??/<NEWLINE> when trigraphs are not
1810 being interpreted. */
1811 gcc_checking_assert (!CPP_OPTION (pfile, trigraphs));
1812 note->type = 0;
1813 note++;
1814 break;
1816 default:
1817 gcc_checking_assert (_cpp_trigraph_map[note->type]);
1819 /* Don't warn about this trigraph in
1820 _cpp_process_line_notes, since trigraphs show up as
1821 trigraphs in raw strings. */
1822 uchar type = note->type;
1823 note->type = 0;
1825 if (CPP_OPTION (pfile, trigraphs))
1827 accum.append (pfile, base, pos - base);
1828 base = pos;
1829 accum.read_begin (pfile);
1830 accum.append (pfile, UC"??", 2);
1831 accum.append (pfile, &type, 1);
1833 /* ??/ followed by newline gets two line notes, one for
1834 the trigraph and one for the backslash/newline. */
1835 if (type == '/' && note[1].pos == pos)
1837 note++;
1838 gcc_assert (note->type == '\\' || note->type == ' ');
1839 goto after_backslash;
1841 /* Skip the replacement character. */
1842 base = ++pos;
1845 note++;
1846 break;
1849 /* Now get a char to process. Either from an expanded note, or
1850 from the line buffer. */
1851 bool read_note = accum.reading_p ();
1852 char c = read_note ? accum.read_char () : *pos++;
1854 if (phase == PHASE_PREFIX)
1856 if (c == '(')
1858 /* Done. */
1859 phase = PHASE_NONE;
1860 prefix[prefix_len++] = '"';
1862 else if (prefix_len < 16
1863 /* Prefix chars are any of the basic character set,
1864 [lex.charset] except for '
1865 ()\\\t\v\f\n'. Optimized for a contiguous
1866 alphabet. */
1867 /* Unlike a switch, this collapses down to one or
1868 two shift and bitmask operations on an ASCII
1869 system, with an outlier or two. */
1870 && (('Z' - 'A' == 25
1871 ? ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'))
1872 : ISIDST (c))
1873 || (c >= '0' && c <= '9')
1874 || c == '_' || c == '{' || c == '}'
1875 || c == '[' || c == ']' || c == '#'
1876 || c == '<' || c == '>' || c == '%'
1877 || c == ':' || c == ';' || c == '.' || c == '?'
1878 || c == '*' || c == '+' || c == '-' || c == '/'
1879 || c == '^' || c == '&' || c == '|' || c == '~'
1880 || c == '!' || c == '=' || c == ','
1881 || c == '"' || c == '\''))
1882 prefix[prefix_len++] = c;
1883 else
1885 /* Something is wrong. */
1886 int col = CPP_BUF_COLUMN (pfile->buffer, pos) + read_note;
1887 if (prefix_len == 16)
1888 cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
1889 col, "raw string delimiter longer "
1890 "than 16 characters");
1891 else if (c == '\n')
1892 cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
1893 col, "invalid new-line in raw "
1894 "string delimiter");
1895 else
1896 cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
1897 col, "invalid character '%c' in "
1898 "raw string delimiter", c);
1899 type = CPP_OTHER;
1900 phase = PHASE_NONE;
1901 /* Continue until we get a close quote, that's probably
1902 the best failure mode. */
1903 prefix_len = 0;
/* A newline falls through so that the end-of-line handling below runs. */
1905 if (c != '\n')
1906 continue;
/* Matching the closing delimiter: a mismatch returns to the body, and a
   match of the final PREFIX entry ends the literal. */
1909 if (phase != PHASE_NONE)
1911 if (prefix[phase] != c)
1912 phase = PHASE_NONE;
1913 else if (unsigned (phase + 1) == prefix_len)
1914 break;
1915 else
1917 phase = Phase (phase + 1);
1918 continue;
1922 if (!prefix_len && c == '"')
1923 /* Failure mode lexing. */
1924 goto out;
1925 else if (prefix_len && c == ')')
1926 phase = PHASE_SUFFIX;
1927 else if (!read_note && c == '\n')
1929 pos--;
1930 pfile->buffer->cur = pos;
1931 if (pfile->state.in_directive
1932 || (pfile->state.parsing_args
1933 && pfile->buffer->next_line >= pfile->buffer->rlimit))
1935 cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, 0,
1936 "unterminated raw string");
1937 type = CPP_OTHER;
1938 goto out;
/* Save the line lexed so far, newline included, before fetching the next. */
1941 accum.append (pfile, base, pos - base + 1);
1942 _cpp_process_line_notes (pfile, false);
1944 if (pfile->buffer->next_line < pfile->buffer->rlimit)
1945 CPP_INCREMENT_LINE (pfile, 0);
1946 pfile->buffer->need_line = true;
1948 if (!_cpp_get_fresh_line (pfile))
1950 /* We ran out of file and failed to get a line. */
1951 location_t src_loc = token->src_loc;
1952 token->type = CPP_EOF;
1953 /* Tell the compiler the line number of the EOF token. */
1954 token->src_loc = pfile->line_table->highest_line;
1955 token->flags = BOL;
1956 if (accum.first)
1957 _cpp_release_buff (pfile, accum.first);
1958 cpp_error_with_line (pfile, CPP_DL_ERROR, src_loc, 0,
1959 "unterminated raw string");
1960 /* Now pop the buffer that _cpp_get_fresh_line did not. */
1961 _cpp_pop_buffer (pfile);
1962 return;
1965 pos = base = pfile->buffer->cur;
1966 note = &pfile->buffer->notes[pfile->buffer->cur_note];
1970 if (CPP_OPTION (pfile, user_literals))
1972 /* If a string format macro, say from inttypes.h, is placed touching
1973 a string literal it could be parsed as a C++11 user-defined string
1974 literal thus breaking the program. */
1975 if (is_macro_not_literal_suffix (pfile, pos))
1977 /* Raise a warning, but do not consume subsequent tokens. */
1978 if (CPP_OPTION (pfile, warn_literal_suffix) && !pfile->state.skipping)
1979 cpp_warning_with_line (pfile, CPP_W_LITERAL_SUFFIX,
1980 token->src_loc, 0,
1981 "invalid suffix on literal; C++11 requires "
1982 "a space between literal and string macro");
1984 /* Grab user defined literal suffix. */
1985 else if (ISIDST (*pos))
1987 type = cpp_userdef_string_add_type (type);
1988 ++pos;
1990 while (ISIDNUM (*pos))
1991 ++pos;
1995 out:
1996 pfile->buffer->cur = pos;
/* With nothing accumulated the spelling is simply BASE..POS; otherwise it
   is the accumulated buffers followed by the remaining BASE..POS text. */
1997 if (!accum.accum)
1998 create_literal (pfile, token, base, pos - base, type);
1999 else
2001 size_t extra_len = pos - base;
2002 uchar *dest = _cpp_unaligned_alloc (pfile, accum.accum + extra_len + 1);
2004 token->type = type;
2005 token->val.str.len = accum.accum + extra_len;
2006 token->val.str.text = dest;
2007 for (_cpp_buff *buf = accum.first; buf; buf = buf->next)
2009 size_t len = BUFF_FRONT (buf) - buf->base;
2010 memcpy (dest, buf->base, len);
2011 dest += len;
2013 _cpp_release_buff (pfile, accum.first);
2014 memcpy (dest, base, extra_len);
2015 dest[extra_len] = '\0';
2019 /* Lexes a string, character constant, or angle-bracketed header file
2020 name. The stored string contains the spelling, including opening
2021 quote and any leading 'L', 'u', 'U' or 'u8' and optional
2022 'R' modifier. It returns the type of the literal, or CPP_OTHER
2023 if it was not properly terminated, or CPP_LESS for an unterminated
2024 header name which must be relexed as normal tokens.
2026 The spelling is NUL-terminated, but it is not guaranteed that this
2027 is the first NUL since embedded NULs are preserved. */
2028 static void
2029 lex_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
2031 bool saw_NUL = false;
2032 const uchar *cur;
2033 cppchar_t terminator;
2034 enum cpp_ttype type;
2036 cur = base;
2037 terminator = *cur++;
2038 if (terminator == 'L' || terminator == 'U')
2039 terminator = *cur++;
2040 else if (terminator == 'u')
2042 terminator = *cur++;
2043 if (terminator == '8')
2044 terminator = *cur++;
2046 if (terminator == 'R')
2048 lex_raw_string (pfile, token, base);
2049 return;
2051 if (terminator == '"')
2052 type = (*base == 'L' ? CPP_WSTRING :
2053 *base == 'U' ? CPP_STRING32 :
2054 *base == 'u' ? (base[1] == '8' ? CPP_UTF8STRING : CPP_STRING16)
2055 : CPP_STRING);
2056 else if (terminator == '\'')
2057 type = (*base == 'L' ? CPP_WCHAR :
2058 *base == 'U' ? CPP_CHAR32 :
2059 *base == 'u' ? (base[1] == '8' ? CPP_UTF8CHAR : CPP_CHAR16)
2060 : CPP_CHAR);
2061 else
2062 terminator = '>', type = CPP_HEADER_NAME;
2064 for (;;)
2066 cppchar_t c = *cur++;
2068 /* In #include-style directives, terminators are not escapable. */
2069 if (c == '\\' && !pfile->state.angled_headers && *cur != '\n')
2070 cur++;
2071 else if (c == terminator)
2072 break;
2073 else if (c == '\n')
2075 cur--;
2076 /* Unmatched quotes always yield undefined behavior, but
2077 greedy lexing means that what appears to be an unterminated
2078 header name may actually be a legitimate sequence of tokens. */
2079 if (terminator == '>')
2081 token->type = CPP_LESS;
2082 return;
2084 type = CPP_OTHER;
2085 break;
2087 else if (c == '\0')
2088 saw_NUL = true;
2091 if (saw_NUL && !pfile->state.skipping)
2092 cpp_error (pfile, CPP_DL_WARNING,
2093 "null character(s) preserved in literal");
2095 if (type == CPP_OTHER && CPP_OPTION (pfile, lang) != CLK_ASM)
2096 cpp_error (pfile, CPP_DL_PEDWARN, "missing terminating %c character",
2097 (int) terminator);
2099 if (CPP_OPTION (pfile, user_literals))
2101 /* If a string format macro, say from inttypes.h, is placed touching
2102 a string literal it could be parsed as a C++11 user-defined string
2103 literal thus breaking the program. */
2104 if (is_macro_not_literal_suffix (pfile, cur))
2106 /* Raise a warning, but do not consume subsequent tokens. */
2107 if (CPP_OPTION (pfile, warn_literal_suffix) && !pfile->state.skipping)
2108 cpp_warning_with_line (pfile, CPP_W_LITERAL_SUFFIX,
2109 token->src_loc, 0,
2110 "invalid suffix on literal; C++11 requires "
2111 "a space between literal and string macro");
2113 /* Grab user defined literal suffix. */
2114 else if (ISIDST (*cur))
2116 type = cpp_userdef_char_add_type (type);
2117 type = cpp_userdef_string_add_type (type);
2118 ++cur;
2120 while (ISIDNUM (*cur))
2121 ++cur;
2124 else if (CPP_OPTION (pfile, cpp_warn_cxx11_compat)
2125 && is_macro (pfile, cur)
2126 && !pfile->state.skipping)
2127 cpp_warning_with_line (pfile, CPP_W_CXX11_COMPAT,
2128 token->src_loc, 0, "C++11 requires a space "
2129 "between string literal and macro");
2131 pfile->buffer->cur = cur;
2132 create_literal (pfile, token, base, cur - base, type);
2135 /* Return the comment table. The client may not make any assumption
2136 about the ordering of the table. */
2137 cpp_comment_table *
2138 cpp_get_comments (cpp_reader *pfile)
2140 return &pfile->comments;
2143 /* Append a comment to the end of the comment table. */
2144 static void
2145 store_comment (cpp_reader *pfile, cpp_token *token)
2147 int len;
2149 if (pfile->comments.allocated == 0)
2151 pfile->comments.allocated = 256;
2152 pfile->comments.entries = (cpp_comment *) xmalloc
2153 (pfile->comments.allocated * sizeof (cpp_comment));
2156 if (pfile->comments.count == pfile->comments.allocated)
2158 pfile->comments.allocated *= 2;
2159 pfile->comments.entries = (cpp_comment *) xrealloc
2160 (pfile->comments.entries,
2161 pfile->comments.allocated * sizeof (cpp_comment));
2164 len = token->val.str.len;
2166 /* Copy comment. Note, token may not be NULL terminated. */
2167 pfile->comments.entries[pfile->comments.count].comment =
2168 (char *) xmalloc (sizeof (char) * (len + 1));
2169 memcpy (pfile->comments.entries[pfile->comments.count].comment,
2170 token->val.str.text, len);
2171 pfile->comments.entries[pfile->comments.count].comment[len] = '\0';
2173 /* Set source location. */
2174 pfile->comments.entries[pfile->comments.count].sloc = token->src_loc;
2176 /* Increment the count of entries in the comment table. */
2177 pfile->comments.count++;
2180 /* The stored comment includes the comment start and any terminator. */
2181 static void
2182 save_comment (cpp_reader *pfile, cpp_token *token, const unsigned char *from,
2183 cppchar_t type)
2185 unsigned char *buffer;
2186 unsigned int len, clen, i;
2188 len = pfile->buffer->cur - from + 1; /* + 1 for the initial '/'. */
2190 /* C++ comments probably (not definitely) have moved past a new
2191 line, which we don't want to save in the comment. */
2192 if (is_vspace (pfile->buffer->cur[-1]))
2193 len--;
2195 /* If we are currently in a directive or in argument parsing, then
2196 we need to store all C++ comments as C comments internally, and
2197 so we need to allocate a little extra space in that case.
2199 Note that the only time we encounter a directive here is
2200 when we are saving comments in a "#define". */
2201 clen = ((pfile->state.in_directive || pfile->state.parsing_args)
2202 && type == '/') ? len + 2 : len;
2204 buffer = _cpp_unaligned_alloc (pfile, clen);
2206 token->type = CPP_COMMENT;
2207 token->val.str.len = clen;
2208 token->val.str.text = buffer;
2210 buffer[0] = '/';
2211 memcpy (buffer + 1, from, len - 1);
2213 /* Finish conversion to a C comment, if necessary. */
2214 if ((pfile->state.in_directive || pfile->state.parsing_args) && type == '/')
2216 buffer[1] = '*';
2217 buffer[clen - 2] = '*';
2218 buffer[clen - 1] = '/';
2219 /* As there can be in a C++ comments illegal sequences for C comments
2220 we need to filter them out. */
2221 for (i = 2; i < (clen - 2); i++)
2222 if (buffer[i] == '/' && (buffer[i - 1] == '*' || buffer[i + 1] == '*'))
2223 buffer[i] = '|';
2226 /* Finally store this comment for use by clients of libcpp. */
2227 store_comment (pfile, token);
2230 /* Returns true if comment at COMMENT_START is a recognized FALLTHROUGH
2231 comment. */
2233 static bool
2234 fallthrough_comment_p (cpp_reader *pfile, const unsigned char *comment_start)
2236 const unsigned char *from = comment_start + 1;
2238 switch (CPP_OPTION (pfile, cpp_warn_implicit_fallthrough))
2240 /* For both -Wimplicit-fallthrough=0 and -Wimplicit-fallthrough=5 we
2241 don't recognize any comments. The latter only checks attributes,
2242 the former doesn't warn. */
2243 case 0:
2244 default:
2245 return false;
2246 /* -Wimplicit-fallthrough=1 considers any comment, no matter what
2247 content it has. */
2248 case 1:
2249 return true;
2250 case 2:
2251 /* -Wimplicit-fallthrough=2 looks for (case insensitive)
2252 .*falls?[ \t-]*thr(u|ough).* regex. */
2253 for (; (size_t) (pfile->buffer->cur - from) >= sizeof "fallthru" - 1;
2254 from++)
2256 /* Is there anything like strpbrk with upper boundary, or
2257 memchr looking for 2 characters rather than just one? */
2258 if (from[0] != 'f' && from[0] != 'F')
2259 continue;
2260 if (from[1] != 'a' && from[1] != 'A')
2261 continue;
2262 if (from[2] != 'l' && from[2] != 'L')
2263 continue;
2264 if (from[3] != 'l' && from[3] != 'L')
2265 continue;
2266 from += sizeof "fall" - 1;
2267 if (from[0] == 's' || from[0] == 'S')
2268 from++;
2269 while (*from == ' ' || *from == '\t' || *from == '-')
2270 from++;
2271 if (from[0] != 't' && from[0] != 'T')
2272 continue;
2273 if (from[1] != 'h' && from[1] != 'H')
2274 continue;
2275 if (from[2] != 'r' && from[2] != 'R')
2276 continue;
2277 if (from[3] == 'u' || from[3] == 'U')
2278 return true;
2279 if (from[3] != 'o' && from[3] != 'O')
2280 continue;
2281 if (from[4] != 'u' && from[4] != 'U')
2282 continue;
2283 if (from[5] != 'g' && from[5] != 'G')
2284 continue;
2285 if (from[6] != 'h' && from[6] != 'H')
2286 continue;
2287 return true;
2289 return false;
2290 case 3:
2291 case 4:
2292 break;
2295 /* Whole comment contents:
2296 -fallthrough
2297 @fallthrough@
2299 if (*from == '-' || *from == '@')
2301 size_t len = sizeof "fallthrough" - 1;
2302 if ((size_t) (pfile->buffer->cur - from - 1) < len)
2303 return false;
2304 if (memcmp (from + 1, "fallthrough", len))
2305 return false;
2306 if (*from == '@')
2308 if (from[len + 1] != '@')
2309 return false;
2310 len++;
2312 from += 1 + len;
2314 /* Whole comment contents (regex):
2315 lint -fallthrough[ \t]*
2317 else if (*from == 'l')
2319 size_t len = sizeof "int -fallthrough" - 1;
2320 if ((size_t) (pfile->buffer->cur - from - 1) < len)
2321 return false;
2322 if (memcmp (from + 1, "int -fallthrough", len))
2323 return false;
2324 from += 1 + len;
2325 while (*from == ' ' || *from == '\t')
2326 from++;
2328 /* Whole comment contents (regex):
2329 [ \t]*FALLTHR(U|OUGH)[ \t]*
2331 else if (CPP_OPTION (pfile, cpp_warn_implicit_fallthrough) == 4)
2333 while (*from == ' ' || *from == '\t')
2334 from++;
2335 if ((size_t) (pfile->buffer->cur - from) < sizeof "FALLTHRU" - 1)
2336 return false;
2337 if (memcmp (from, "FALLTHR", sizeof "FALLTHR" - 1))
2338 return false;
2339 from += sizeof "FALLTHR" - 1;
2340 if (*from == 'U')
2341 from++;
2342 else if ((size_t) (pfile->buffer->cur - from) < sizeof "OUGH" - 1)
2343 return false;
2344 else if (memcmp (from, "OUGH", sizeof "OUGH" - 1))
2345 return false;
2346 else
2347 from += sizeof "OUGH" - 1;
2348 while (*from == ' ' || *from == '\t')
2349 from++;
2351 /* Whole comment contents (regex):
2352 [ \t.!]*(ELSE,? |INTENTIONAL(LY)? )?FALL(S | |-)?THR(OUGH|U)[ \t.!]*(-[^\n\r]*)?
2353 [ \t.!]*(Else,? |Intentional(ly)? )?Fall((s | |-)[Tt]|t)hr(ough|u)[ \t.!]*(-[^\n\r]*)?
2354 [ \t.!]*([Ee]lse,? |[Ii]ntentional(ly)? )?fall(s | |-)?thr(ough|u)[ \t.!]*(-[^\n\r]*)?
2356 else
2358 while (*from == ' ' || *from == '\t' || *from == '.' || *from == '!')
2359 from++;
2360 unsigned char f = *from;
2361 bool all_upper = false;
2362 if (f == 'E' || f == 'e')
2364 if ((size_t) (pfile->buffer->cur - from)
2365 < sizeof "else fallthru" - 1)
2366 return false;
2367 if (f == 'E' && memcmp (from + 1, "LSE", sizeof "LSE" - 1) == 0)
2368 all_upper = true;
2369 else if (memcmp (from + 1, "lse", sizeof "lse" - 1))
2370 return false;
2371 from += sizeof "else" - 1;
2372 if (*from == ',')
2373 from++;
2374 if (*from != ' ')
2375 return false;
2376 from++;
2377 if (all_upper && *from == 'f')
2378 return false;
2379 if (f == 'e' && *from == 'F')
2380 return false;
2381 f = *from;
2383 else if (f == 'I' || f == 'i')
2385 if ((size_t) (pfile->buffer->cur - from)
2386 < sizeof "intentional fallthru" - 1)
2387 return false;
2388 if (f == 'I' && memcmp (from + 1, "NTENTIONAL",
2389 sizeof "NTENTIONAL" - 1) == 0)
2390 all_upper = true;
2391 else if (memcmp (from + 1, "ntentional",
2392 sizeof "ntentional" - 1))
2393 return false;
2394 from += sizeof "intentional" - 1;
2395 if (*from == ' ')
2397 from++;
2398 if (all_upper && *from == 'f')
2399 return false;
2401 else if (all_upper)
2403 if (memcmp (from, "LY F", sizeof "LY F" - 1))
2404 return false;
2405 from += sizeof "LY " - 1;
2407 else
2409 if (memcmp (from, "ly ", sizeof "ly " - 1))
2410 return false;
2411 from += sizeof "ly " - 1;
2413 if (f == 'i' && *from == 'F')
2414 return false;
2415 f = *from;
2417 if (f != 'F' && f != 'f')
2418 return false;
2419 if ((size_t) (pfile->buffer->cur - from) < sizeof "fallthru" - 1)
2420 return false;
2421 if (f == 'F' && memcmp (from + 1, "ALL", sizeof "ALL" - 1) == 0)
2422 all_upper = true;
2423 else if (all_upper)
2424 return false;
2425 else if (memcmp (from + 1, "all", sizeof "all" - 1))
2426 return false;
2427 from += sizeof "fall" - 1;
2428 if (*from == (all_upper ? 'S' : 's') && from[1] == ' ')
2429 from += 2;
2430 else if (*from == ' ' || *from == '-')
2431 from++;
2432 else if (*from != (all_upper ? 'T' : 't'))
2433 return false;
2434 if ((f == 'f' || *from != 'T') && (all_upper || *from != 't'))
2435 return false;
2436 if ((size_t) (pfile->buffer->cur - from) < sizeof "thru" - 1)
2437 return false;
2438 if (memcmp (from + 1, all_upper ? "HRU" : "hru", sizeof "hru" - 1))
2440 if ((size_t) (pfile->buffer->cur - from) < sizeof "through" - 1)
2441 return false;
2442 if (memcmp (from + 1, all_upper ? "HROUGH" : "hrough",
2443 sizeof "hrough" - 1))
2444 return false;
2445 from += sizeof "through" - 1;
2447 else
2448 from += sizeof "thru" - 1;
2449 while (*from == ' ' || *from == '\t' || *from == '.' || *from == '!')
2450 from++;
2451 if (*from == '-')
2453 from++;
2454 if (*comment_start == '*')
2458 while (*from && *from != '*'
2459 && *from != '\n' && *from != '\r')
2460 from++;
2461 if (*from != '*' || from[1] == '/')
2462 break;
2463 from++;
2465 while (1);
2467 else
2468 while (*from && *from != '\n' && *from != '\r')
2469 from++;
2472 /* C block comment. */
2473 if (*comment_start == '*')
2475 if (*from != '*' || from[1] != '/')
2476 return false;
2478 /* C++ line comment. */
2479 else if (*from != '\n')
2480 return false;
2482 return true;
2485 /* Allocate COUNT tokens for RUN. */
2486 void
2487 _cpp_init_tokenrun (tokenrun *run, unsigned int count)
2489 run->base = XNEWVEC (cpp_token, count);
2490 run->limit = run->base + count;
2491 run->next = NULL;
2494 /* Returns the next tokenrun, or creates one if there is none. */
2495 static tokenrun *
2496 next_tokenrun (tokenrun *run)
2498 if (run->next == NULL)
2500 run->next = XNEW (tokenrun);
2501 run->next->prev = run;
2502 _cpp_init_tokenrun (run->next, 250);
2505 return run->next;
2508 /* Return the number of not yet processed token in a given
2509 context. */
2511 _cpp_remaining_tokens_num_in_context (cpp_context *context)
2513 if (context->tokens_kind == TOKENS_KIND_DIRECT)
2514 return (LAST (context).token - FIRST (context).token);
2515 else if (context->tokens_kind == TOKENS_KIND_INDIRECT
2516 || context->tokens_kind == TOKENS_KIND_EXTENDED)
2517 return (LAST (context).ptoken - FIRST (context).ptoken);
2518 else
2519 abort ();
2522 /* Returns the token present at index INDEX in a given context. If
2523 INDEX is zero, the next token to be processed is returned. */
2524 static const cpp_token*
2525 _cpp_token_from_context_at (cpp_context *context, int index)
2527 if (context->tokens_kind == TOKENS_KIND_DIRECT)
2528 return &(FIRST (context).token[index]);
2529 else if (context->tokens_kind == TOKENS_KIND_INDIRECT
2530 || context->tokens_kind == TOKENS_KIND_EXTENDED)
2531 return FIRST (context).ptoken[index];
2532 else
2533 abort ();
2536 /* Look ahead in the input stream. */
2537 const cpp_token *
2538 cpp_peek_token (cpp_reader *pfile, int index)
2540 cpp_context *context = pfile->context;
2541 const cpp_token *peektok;
2542 int count;
2544 /* First, scan through any pending cpp_context objects. */
2545 while (context->prev)
2547 ptrdiff_t sz = _cpp_remaining_tokens_num_in_context (context);
2549 if (index < (int) sz)
2550 return _cpp_token_from_context_at (context, index);
2551 index -= (int) sz;
2552 context = context->prev;
2555 /* We will have to read some new tokens after all (and do so
2556 without invalidating preceding tokens). */
2557 count = index;
2558 pfile->keep_tokens++;
2560 /* For peeked tokens temporarily disable line_change reporting,
2561 until the tokens are parsed for real. */
2562 void (*line_change) (cpp_reader *, const cpp_token *, int)
2563 = pfile->cb.line_change;
2564 pfile->cb.line_change = NULL;
2568 peektok = _cpp_lex_token (pfile);
2569 if (peektok->type == CPP_EOF)
2571 index--;
2572 break;
2574 else if (peektok->type == CPP_PRAGMA)
2576 /* Don't peek past a pragma. */
2577 if (peektok == &pfile->directive_result)
2578 /* Save the pragma in the buffer. */
2579 *pfile->cur_token++ = *peektok;
2580 index--;
2581 break;
2584 while (index--);
2586 _cpp_backup_tokens_direct (pfile, count - index);
2587 pfile->keep_tokens--;
2588 pfile->cb.line_change = line_change;
2590 return peektok;
2593 /* Allocate a single token that is invalidated at the same time as the
2594 rest of the tokens on the line. Has its line and col set to the
2595 same as the last lexed token, so that diagnostics appear in the
2596 right place. */
2597 cpp_token *
2598 _cpp_temp_token (cpp_reader *pfile)
2600 cpp_token *old, *result;
2601 ptrdiff_t sz = pfile->cur_run->limit - pfile->cur_token;
2602 ptrdiff_t la = (ptrdiff_t) pfile->lookaheads;
2604 old = pfile->cur_token - 1;
2605 /* Any pre-existing lookaheads must not be clobbered. */
2606 if (la)
2608 if (sz <= la)
2610 tokenrun *next = next_tokenrun (pfile->cur_run);
2612 if (sz < la)
2613 memmove (next->base + 1, next->base,
2614 (la - sz) * sizeof (cpp_token));
2616 next->base[0] = pfile->cur_run->limit[-1];
2619 if (sz > 1)
2620 memmove (pfile->cur_token + 1, pfile->cur_token,
2621 MIN (la, sz - 1) * sizeof (cpp_token));
2624 if (!sz && pfile->cur_token == pfile->cur_run->limit)
2626 pfile->cur_run = next_tokenrun (pfile->cur_run);
2627 pfile->cur_token = pfile->cur_run->base;
2630 result = pfile->cur_token++;
2631 result->src_loc = old->src_loc;
2632 return result;
2635 /* We're at the beginning of a logical line (so not in
2636 directives-mode) and RESULT is a CPP_NAME with NODE_MODULE set. See
2637 if we should enter deferred_pragma mode to tokenize the rest of the
2638 line as a module control-line. */
2640 static void
2641 cpp_maybe_module_directive (cpp_reader *pfile, cpp_token *result)
2643 unsigned backup = 0; /* Tokens we peeked. */
2644 cpp_hashnode *node = result->val.node.node;
2645 cpp_token *peek = result;
2646 cpp_token *keyword = peek;
2647 cpp_hashnode *(&n_modules)[spec_nodes::M_HWM][2] = pfile->spec_nodes.n_modules;
2648 int header_count = 0;
2650 /* Make sure the incoming state is as we expect it. This way we
2651 can restore it using constants. */
2652 gcc_checking_assert (!pfile->state.in_deferred_pragma
2653 && !pfile->state.skipping
2654 && !pfile->state.parsing_args
2655 && !pfile->state.angled_headers
2656 && (pfile->state.save_comments
2657 == !CPP_OPTION (pfile, discard_comments)));
2659 /* Enter directives mode sufficiently for peeking. We don't have
2660 to actually set in_directive. */
2661 pfile->state.in_deferred_pragma = true;
2663 /* These two fields are needed to process tokenization in deferred
2664 pragma mode. They are not used outside deferred pragma mode or
2665 directives mode. */
2666 pfile->state.pragma_allow_expansion = true;
2667 pfile->directive_line = result->src_loc;
2669 /* Saving comments is incompatible with directives mode. */
2670 pfile->state.save_comments = 0;
2672 if (node == n_modules[spec_nodes::M_EXPORT][0])
2674 peek = _cpp_lex_direct (pfile);
2675 keyword = peek;
2676 backup++;
2677 if (keyword->type != CPP_NAME)
2678 goto not_module;
2679 node = keyword->val.node.node;
2680 if (!(node->flags & NODE_MODULE))
2681 goto not_module;
2684 if (node == n_modules[spec_nodes::M__IMPORT][0])
2685 /* __import */
2686 header_count = backup + 2 + 16;
2687 else if (node == n_modules[spec_nodes::M_IMPORT][0])
2688 /* import */
2689 header_count = backup + 2 + (CPP_OPTION (pfile, preprocessed) ? 16 : 0);
2690 else if (node == n_modules[spec_nodes::M_MODULE][0])
2691 ; /* module */
2692 else
2693 goto not_module;
2695 /* We've seen [export] {module|import|__import}. Check the next token. */
2696 if (header_count)
2697 /* After '{,__}import' a header name may appear. */
2698 pfile->state.angled_headers = true;
2699 peek = _cpp_lex_direct (pfile);
2700 backup++;
2702 /* ... import followed by identifier, ':', '<' or
2703 header-name preprocessing tokens, or module
2704 followed by cpp-identifier, ':' or ';' preprocessing
2705 tokens. C++ keywords are not yet relevant. */
2706 if (peek->type == CPP_NAME
2707 || peek->type == CPP_COLON
2708 || (header_count
2709 ? (peek->type == CPP_LESS
2710 || (peek->type == CPP_STRING && peek->val.str.text[0] != 'R')
2711 || peek->type == CPP_HEADER_NAME)
2712 : peek->type == CPP_SEMICOLON))
2714 pfile->state.pragma_allow_expansion = !CPP_OPTION (pfile, preprocessed);
2715 if (!pfile->state.pragma_allow_expansion)
2716 pfile->state.prevent_expansion++;
2718 if (!header_count && linemap_included_from
2719 (LINEMAPS_LAST_ORDINARY_MAP (pfile->line_table)))
2720 cpp_error_with_line (pfile, CPP_DL_ERROR, keyword->src_loc, 0,
2721 "module control-line cannot be in included file");
2723 /* The first one or two tokens cannot be macro names. */
2724 for (int ix = backup; ix--;)
2726 cpp_token *tok = ix ? keyword : result;
2727 cpp_hashnode *node = tok->val.node.node;
2729 /* Don't attempt to expand the token. */
2730 tok->flags |= NO_EXPAND;
2731 if (_cpp_defined_macro_p (node)
2732 && _cpp_maybe_notify_macro_use (pfile, node, tok->src_loc)
2733 && !cpp_fun_like_macro_p (node))
2734 cpp_error_with_line (pfile, CPP_DL_ERROR, tok->src_loc, 0,
2735 "module control-line \"%s\" cannot be"
2736 " an object-like macro",
2737 NODE_NAME (node));
2740 /* Map to underbar variants. */
2741 keyword->val.node.node = n_modules[header_count
2742 ? spec_nodes::M_IMPORT
2743 : spec_nodes::M_MODULE][1];
2744 if (backup != 1)
2745 result->val.node.node = n_modules[spec_nodes::M_EXPORT][1];
2747 /* Maybe tell the tokenizer we expect a header-name down the
2748 road. */
2749 pfile->state.directive_file_token = header_count;
2751 else
2753 not_module:
2754 /* Drop out of directive mode. */
2755 /* We aaserted save_comments had this value upon entry. */
2756 pfile->state.save_comments
2757 = !CPP_OPTION (pfile, discard_comments);
2758 pfile->state.in_deferred_pragma = false;
2759 /* Do not let this remain on. */
2760 pfile->state.angled_headers = false;
2763 /* In either case we want to backup the peeked tokens. */
2764 if (backup)
2766 /* If we saw EOL, we should drop it, because this isn't a module
2767 control-line after all. */
2768 bool eol = peek->type == CPP_PRAGMA_EOL;
2769 if (!eol || backup > 1)
2771 /* Put put the peeked tokens back */
2772 _cpp_backup_tokens_direct (pfile, backup);
2773 /* But if the last one was an EOL, forget it. */
2774 if (eol)
2775 pfile->lookaheads--;
2780 /* Lex a token into RESULT (external interface). Takes care of issues
2781 like directive handling, token lookahead, multiple include
2782 optimization and skipping. */
2783 const cpp_token *
2784 _cpp_lex_token (cpp_reader *pfile)
2786 cpp_token *result;
2788 for (;;)
2790 if (pfile->cur_token == pfile->cur_run->limit)
2792 pfile->cur_run = next_tokenrun (pfile->cur_run);
2793 pfile->cur_token = pfile->cur_run->base;
2795 /* We assume that the current token is somewhere in the current
2796 run. */
2797 if (pfile->cur_token < pfile->cur_run->base
2798 || pfile->cur_token >= pfile->cur_run->limit)
2799 abort ();
2801 if (pfile->lookaheads)
2803 pfile->lookaheads--;
2804 result = pfile->cur_token++;
2806 else
2807 result = _cpp_lex_direct (pfile);
2809 if (result->flags & BOL)
2811 /* Is this a directive. If _cpp_handle_directive returns
2812 false, it is an assembler #. */
2813 if (result->type == CPP_HASH
2814 /* 6.10.3 p 11: Directives in a list of macro arguments
2815 gives undefined behavior. This implementation
2816 handles the directive as normal. */
2817 && pfile->state.parsing_args != 1)
2819 if (_cpp_handle_directive (pfile, result->flags & PREV_WHITE))
2821 if (pfile->directive_result.type == CPP_PADDING)
2822 continue;
2823 result = &pfile->directive_result;
2826 else if (pfile->state.in_deferred_pragma)
2827 result = &pfile->directive_result;
2828 else if (result->type == CPP_NAME
2829 && (result->val.node.node->flags & NODE_MODULE)
2830 && !pfile->state.skipping
2831 /* Unlike regular directives, we do not deal with
2832 tokenizing module directives as macro arguments.
2833 That's not permitted. */
2834 && !pfile->state.parsing_args)
2836 /* P1857. Before macro expansion, At start of logical
2837 line ... */
2838 /* We don't have to consider lookaheads at this point. */
2839 gcc_checking_assert (!pfile->lookaheads);
2841 cpp_maybe_module_directive (pfile, result);
2844 if (pfile->cb.line_change && !pfile->state.skipping)
2845 pfile->cb.line_change (pfile, result, pfile->state.parsing_args);
2848 /* We don't skip tokens in directives. */
2849 if (pfile->state.in_directive || pfile->state.in_deferred_pragma)
2850 break;
2852 /* Outside a directive, invalidate controlling macros. At file
2853 EOF, _cpp_lex_direct takes care of popping the buffer, so we never
2854 get here and MI optimization works. */
2855 pfile->mi_valid = false;
2857 if (!pfile->state.skipping || result->type == CPP_EOF)
2858 break;
2861 return result;
2864 /* Returns true if a fresh line has been loaded. */
2865 bool
2866 _cpp_get_fresh_line (cpp_reader *pfile)
2868 /* We can't get a new line until we leave the current directive. */
2869 if (pfile->state.in_directive)
2870 return false;
2872 for (;;)
2874 cpp_buffer *buffer = pfile->buffer;
2876 if (!buffer->need_line)
2877 return true;
2879 if (buffer->next_line < buffer->rlimit)
2881 _cpp_clean_line (pfile);
2882 return true;
2885 /* First, get out of parsing arguments state. */
2886 if (pfile->state.parsing_args)
2887 return false;
2889 /* End of buffer. Non-empty files should end in a newline. */
2890 if (buffer->buf != buffer->rlimit
2891 && buffer->next_line > buffer->rlimit
2892 && !buffer->from_stage3)
2894 /* Clip to buffer size. */
2895 buffer->next_line = buffer->rlimit;
2898 if (buffer->prev && !buffer->return_at_eof)
2899 _cpp_pop_buffer (pfile);
2900 else
2902 /* End of translation. Do not pop the buffer yet. Increment
2903 line number so that the EOF token is on a line of its own
2904 (_cpp_lex_direct doesn't increment in that case, because
2905 it's hard for it to distinguish this special case). */
2906 CPP_INCREMENT_LINE (pfile, 0);
2907 return false;
/* Set result->type to THEN_TYPE and consume one character if the next
   unread character in the buffer is CHAR; otherwise set it to
   ELSE_TYPE and consume nothing.  Expects variables named `result' and
   `buffer' to be in scope at the point of use.  */
#define IF_NEXT_IS(CHAR, THEN_TYPE, ELSE_TYPE)          \
  do                                                    \
    {                                                   \
      result->type = ELSE_TYPE;                         \
      if (*buffer->cur == (CHAR))                       \
        buffer->cur++, result->type = THEN_TYPE;        \
    }                                                   \
  while (0)
2921 /* Lex a token into pfile->cur_token, which is also incremented, to
2922 get diagnostics pointing to the correct location.
2924 Does not handle issues such as token lookahead, multiple-include
2925 optimization, directives, skipping etc. This function is only
2926 suitable for use by _cpp_lex_token, and in special cases like
2927 lex_expansion_token which doesn't care for any of these issues.
2929 When meeting a newline, returns CPP_EOF if parsing a directive,
2930 otherwise returns to the start of the token buffer if permissible.
2931 Returns the location of the lexed token. */
2932 cpp_token *
2933 _cpp_lex_direct (cpp_reader *pfile)
2935 cppchar_t c;
2936 cpp_buffer *buffer;
2937 const unsigned char *comment_start;
2938 bool fallthrough_comment = false;
2939 cpp_token *result = pfile->cur_token++;
2941 fresh_line:
2942 result->flags = 0;
2943 buffer = pfile->buffer;
2944 if (buffer->need_line)
2946 gcc_assert (!pfile->state.in_deferred_pragma);
2947 if (!_cpp_get_fresh_line (pfile))
2949 result->type = CPP_EOF;
2950 /* Not a real EOF in a directive or arg parsing -- we refuse
2951 to advance to the next file now, and will once we're out
2952 of those modes. */
2953 if (!pfile->state.in_directive && !pfile->state.parsing_args)
2955 /* Tell the compiler the line number of the EOF token. */
2956 result->src_loc = pfile->line_table->highest_line;
2957 result->flags = BOL;
2958 /* Now pop the buffer that _cpp_get_fresh_line did not. */
2959 _cpp_pop_buffer (pfile);
2961 return result;
2963 if (buffer != pfile->buffer)
2964 fallthrough_comment = false;
2965 if (!pfile->keep_tokens)
2967 pfile->cur_run = &pfile->base_run;
2968 result = pfile->base_run.base;
2969 pfile->cur_token = result + 1;
2971 result->flags = BOL;
2972 if (pfile->state.parsing_args == 2)
2973 result->flags |= PREV_WHITE;
2975 buffer = pfile->buffer;
2976 update_tokens_line:
2977 result->src_loc = pfile->line_table->highest_line;
2979 skipped_white:
2980 if (buffer->cur >= buffer->notes[buffer->cur_note].pos
2981 && !pfile->overlaid_buffer)
2983 _cpp_process_line_notes (pfile, false);
2984 result->src_loc = pfile->line_table->highest_line;
2986 c = *buffer->cur++;
2988 if (pfile->forced_token_location)
2989 result->src_loc = pfile->forced_token_location;
2990 else
2991 result->src_loc = linemap_position_for_column (pfile->line_table,
2992 CPP_BUF_COLUMN (buffer, buffer->cur));
2994 switch (c)
2996 case ' ': case '\t': case '\f': case '\v': case '\0':
2997 result->flags |= PREV_WHITE;
2998 skip_whitespace (pfile, c);
2999 goto skipped_white;
3001 case '\n':
3002 /* Increment the line, unless this is the last line ... */
3003 if (buffer->cur < buffer->rlimit
3004 /* ... or this is a #include, (where _cpp_stack_file needs to
3005 unwind by one line) ... */
3006 || (pfile->state.in_directive > 1
3007 /* ... except traditional-cpp increments this elsewhere. */
3008 && !CPP_OPTION (pfile, traditional)))
3009 CPP_INCREMENT_LINE (pfile, 0);
3010 buffer->need_line = true;
3011 if (pfile->state.in_deferred_pragma)
3013 /* Produce the PRAGMA_EOL on this line. File reading
3014 ensures there is always a \n at end of the buffer, thus
3015 in a deferred pragma we always see CPP_PRAGMA_EOL before
3016 any CPP_EOF. */
3017 result->type = CPP_PRAGMA_EOL;
3018 result->flags &= ~PREV_WHITE;
3019 pfile->state.in_deferred_pragma = false;
3020 if (!pfile->state.pragma_allow_expansion)
3021 pfile->state.prevent_expansion--;
3022 return result;
3024 goto fresh_line;
3026 case '0': case '1': case '2': case '3': case '4':
3027 case '5': case '6': case '7': case '8': case '9':
3029 struct normalize_state nst = INITIAL_NORMALIZE_STATE;
3030 result->type = CPP_NUMBER;
3031 lex_number (pfile, &result->val.str, &nst);
3032 warn_about_normalization (pfile, result, &nst);
3033 break;
3036 case 'L':
3037 case 'u':
3038 case 'U':
3039 case 'R':
3040 /* 'L', 'u', 'U', 'u8' or 'R' may introduce wide characters,
3041 wide strings or raw strings. */
3042 if (c == 'L' || CPP_OPTION (pfile, rliterals)
3043 || (c != 'R' && CPP_OPTION (pfile, uliterals)))
3045 if ((*buffer->cur == '\'' && c != 'R')
3046 || *buffer->cur == '"'
3047 || (*buffer->cur == 'R'
3048 && c != 'R'
3049 && buffer->cur[1] == '"'
3050 && CPP_OPTION (pfile, rliterals))
3051 || (*buffer->cur == '8'
3052 && c == 'u'
3053 && ((buffer->cur[1] == '"' || (buffer->cur[1] == '\''
3054 && CPP_OPTION (pfile, utf8_char_literals)))
3055 || (buffer->cur[1] == 'R' && buffer->cur[2] == '"'
3056 && CPP_OPTION (pfile, rliterals)))))
3058 lex_string (pfile, result, buffer->cur - 1);
3059 break;
3062 /* Fall through. */
3064 case '_':
3065 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
3066 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
3067 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
3068 case 's': case 't': case 'v': case 'w': case 'x':
3069 case 'y': case 'z':
3070 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
3071 case 'G': case 'H': case 'I': case 'J': case 'K':
3072 case 'M': case 'N': case 'O': case 'P': case 'Q':
3073 case 'S': case 'T': case 'V': case 'W': case 'X':
3074 case 'Y': case 'Z':
3075 result->type = CPP_NAME;
3077 struct normalize_state nst = INITIAL_NORMALIZE_STATE;
3078 result->val.node.node = lex_identifier (pfile, buffer->cur - 1, false,
3079 &nst,
3080 &result->val.node.spelling);
3081 warn_about_normalization (pfile, result, &nst);
3084 /* Convert named operators to their proper types. */
3085 if (result->val.node.node->flags & NODE_OPERATOR)
3087 result->flags |= NAMED_OP;
3088 result->type = (enum cpp_ttype) result->val.node.node->directive_index;
3091 /* Signal FALLTHROUGH comment followed by another token. */
3092 if (fallthrough_comment)
3093 result->flags |= PREV_FALLTHROUGH;
3094 break;
3096 case '\'':
3097 case '"':
3098 lex_string (pfile, result, buffer->cur - 1);
3099 break;
3101 case '/':
3102 /* A potential block or line comment. */
3103 comment_start = buffer->cur;
3104 c = *buffer->cur;
3106 if (c == '*')
3108 if (_cpp_skip_block_comment (pfile))
3109 cpp_error (pfile, CPP_DL_ERROR, "unterminated comment");
3111 else if (c == '/' && ! CPP_OPTION (pfile, traditional))
3113 /* Don't warn for system headers. */
3114 if (_cpp_in_system_header (pfile))
3116 /* Warn about comments if pedantically GNUC89, and not
3117 in system headers. */
3118 else if (CPP_OPTION (pfile, lang) == CLK_GNUC89
3119 && CPP_PEDANTIC (pfile)
3120 && ! buffer->warned_cplusplus_comments)
3122 if (cpp_error (pfile, CPP_DL_PEDWARN,
3123 "C++ style comments are not allowed in ISO C90"))
3124 cpp_error (pfile, CPP_DL_NOTE,
3125 "(this will be reported only once per input file)");
3126 buffer->warned_cplusplus_comments = 1;
3128 /* Or if specifically desired via -Wc90-c99-compat. */
3129 else if (CPP_OPTION (pfile, cpp_warn_c90_c99_compat) > 0
3130 && ! CPP_OPTION (pfile, cplusplus)
3131 && ! buffer->warned_cplusplus_comments)
3133 if (cpp_error (pfile, CPP_DL_WARNING,
3134 "C++ style comments are incompatible with C90"))
3135 cpp_error (pfile, CPP_DL_NOTE,
3136 "(this will be reported only once per input file)");
3137 buffer->warned_cplusplus_comments = 1;
3139 /* In C89/C94, C++ style comments are forbidden. */
3140 else if ((CPP_OPTION (pfile, lang) == CLK_STDC89
3141 || CPP_OPTION (pfile, lang) == CLK_STDC94))
3143 /* But don't be confused about valid code such as
3144 - // immediately followed by *,
3145 - // in a preprocessing directive,
3146 - // in an #if 0 block. */
3147 if (buffer->cur[1] == '*'
3148 || pfile->state.in_directive
3149 || pfile->state.skipping)
3151 result->type = CPP_DIV;
3152 break;
3154 else if (! buffer->warned_cplusplus_comments)
3156 if (cpp_error (pfile, CPP_DL_ERROR,
3157 "C++ style comments are not allowed in "
3158 "ISO C90"))
3159 cpp_error (pfile, CPP_DL_NOTE,
3160 "(this will be reported only once per input "
3161 "file)");
3162 buffer->warned_cplusplus_comments = 1;
3165 if (skip_line_comment (pfile) && CPP_OPTION (pfile, warn_comments))
3166 cpp_warning (pfile, CPP_W_COMMENTS, "multi-line comment");
3168 else if (c == '=')
3170 buffer->cur++;
3171 result->type = CPP_DIV_EQ;
3172 break;
3174 else
3176 result->type = CPP_DIV;
3177 break;
3180 if (fallthrough_comment_p (pfile, comment_start))
3181 fallthrough_comment = true;
3183 if (pfile->cb.comment)
3185 size_t len = pfile->buffer->cur - comment_start;
3186 pfile->cb.comment (pfile, result->src_loc, comment_start - 1,
3187 len + 1);
3190 if (!pfile->state.save_comments)
3192 result->flags |= PREV_WHITE;
3193 goto update_tokens_line;
3196 if (fallthrough_comment)
3197 result->flags |= PREV_FALLTHROUGH;
3199 /* Save the comment as a token in its own right. */
3200 save_comment (pfile, result, comment_start, c);
3201 break;
3203 case '<':
3204 if (pfile->state.angled_headers)
3206 lex_string (pfile, result, buffer->cur - 1);
3207 if (result->type != CPP_LESS)
3208 break;
3211 result->type = CPP_LESS;
3212 if (*buffer->cur == '=')
3214 buffer->cur++, result->type = CPP_LESS_EQ;
3215 if (*buffer->cur == '>'
3216 && CPP_OPTION (pfile, cplusplus)
3217 && CPP_OPTION (pfile, lang) >= CLK_GNUCXX20)
3218 buffer->cur++, result->type = CPP_SPACESHIP;
3220 else if (*buffer->cur == '<')
3222 buffer->cur++;
3223 IF_NEXT_IS ('=', CPP_LSHIFT_EQ, CPP_LSHIFT);
3225 else if (CPP_OPTION (pfile, digraphs))
3227 if (*buffer->cur == ':')
3229 /* C++11 [2.5/3 lex.pptoken], "Otherwise, if the next
3230 three characters are <:: and the subsequent character
3231 is neither : nor >, the < is treated as a preprocessor
3232 token by itself". */
3233 if (CPP_OPTION (pfile, cplusplus)
3234 && CPP_OPTION (pfile, lang) != CLK_CXX98
3235 && CPP_OPTION (pfile, lang) != CLK_GNUCXX
3236 && buffer->cur[1] == ':'
3237 && buffer->cur[2] != ':' && buffer->cur[2] != '>')
3238 break;
3240 buffer->cur++;
3241 result->flags |= DIGRAPH;
3242 result->type = CPP_OPEN_SQUARE;
3244 else if (*buffer->cur == '%')
3246 buffer->cur++;
3247 result->flags |= DIGRAPH;
3248 result->type = CPP_OPEN_BRACE;
3251 break;
3253 case '>':
3254 result->type = CPP_GREATER;
3255 if (*buffer->cur == '=')
3256 buffer->cur++, result->type = CPP_GREATER_EQ;
3257 else if (*buffer->cur == '>')
3259 buffer->cur++;
3260 IF_NEXT_IS ('=', CPP_RSHIFT_EQ, CPP_RSHIFT);
3262 break;
3264 case '%':
3265 result->type = CPP_MOD;
3266 if (*buffer->cur == '=')
3267 buffer->cur++, result->type = CPP_MOD_EQ;
3268 else if (CPP_OPTION (pfile, digraphs))
3270 if (*buffer->cur == ':')
3272 buffer->cur++;
3273 result->flags |= DIGRAPH;
3274 result->type = CPP_HASH;
3275 if (*buffer->cur == '%' && buffer->cur[1] == ':')
3276 buffer->cur += 2, result->type = CPP_PASTE, result->val.token_no = 0;
3278 else if (*buffer->cur == '>')
3280 buffer->cur++;
3281 result->flags |= DIGRAPH;
3282 result->type = CPP_CLOSE_BRACE;
3285 break;
3287 case '.':
3288 result->type = CPP_DOT;
3289 if (ISDIGIT (*buffer->cur))
3291 struct normalize_state nst = INITIAL_NORMALIZE_STATE;
3292 result->type = CPP_NUMBER;
3293 lex_number (pfile, &result->val.str, &nst);
3294 warn_about_normalization (pfile, result, &nst);
3296 else if (*buffer->cur == '.' && buffer->cur[1] == '.')
3297 buffer->cur += 2, result->type = CPP_ELLIPSIS;
3298 else if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
3299 buffer->cur++, result->type = CPP_DOT_STAR;
3300 break;
3302 case '+':
3303 result->type = CPP_PLUS;
3304 if (*buffer->cur == '+')
3305 buffer->cur++, result->type = CPP_PLUS_PLUS;
3306 else if (*buffer->cur == '=')
3307 buffer->cur++, result->type = CPP_PLUS_EQ;
3308 break;
3310 case '-':
3311 result->type = CPP_MINUS;
3312 if (*buffer->cur == '>')
3314 buffer->cur++;
3315 result->type = CPP_DEREF;
3316 if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
3317 buffer->cur++, result->type = CPP_DEREF_STAR;
3319 else if (*buffer->cur == '-')
3320 buffer->cur++, result->type = CPP_MINUS_MINUS;
3321 else if (*buffer->cur == '=')
3322 buffer->cur++, result->type = CPP_MINUS_EQ;
3323 break;
3325 case '&':
3326 result->type = CPP_AND;
3327 if (*buffer->cur == '&')
3328 buffer->cur++, result->type = CPP_AND_AND;
3329 else if (*buffer->cur == '=')
3330 buffer->cur++, result->type = CPP_AND_EQ;
3331 break;
3333 case '|':
3334 result->type = CPP_OR;
3335 if (*buffer->cur == '|')
3336 buffer->cur++, result->type = CPP_OR_OR;
3337 else if (*buffer->cur == '=')
3338 buffer->cur++, result->type = CPP_OR_EQ;
3339 break;
3341 case ':':
3342 result->type = CPP_COLON;
3343 if (*buffer->cur == ':' && CPP_OPTION (pfile, scope))
3344 buffer->cur++, result->type = CPP_SCOPE;
3345 else if (*buffer->cur == '>' && CPP_OPTION (pfile, digraphs))
3347 buffer->cur++;
3348 result->flags |= DIGRAPH;
3349 result->type = CPP_CLOSE_SQUARE;
3351 break;
3353 case '*': IF_NEXT_IS ('=', CPP_MULT_EQ, CPP_MULT); break;
3354 case '=': IF_NEXT_IS ('=', CPP_EQ_EQ, CPP_EQ); break;
3355 case '!': IF_NEXT_IS ('=', CPP_NOT_EQ, CPP_NOT); break;
3356 case '^': IF_NEXT_IS ('=', CPP_XOR_EQ, CPP_XOR); break;
3357 case '#': IF_NEXT_IS ('#', CPP_PASTE, CPP_HASH); result->val.token_no = 0; break;
3359 case '?': result->type = CPP_QUERY; break;
3360 case '~': result->type = CPP_COMPL; break;
3361 case ',': result->type = CPP_COMMA; break;
3362 case '(': result->type = CPP_OPEN_PAREN; break;
3363 case ')': result->type = CPP_CLOSE_PAREN; break;
3364 case '[': result->type = CPP_OPEN_SQUARE; break;
3365 case ']': result->type = CPP_CLOSE_SQUARE; break;
3366 case '{': result->type = CPP_OPEN_BRACE; break;
3367 case '}': result->type = CPP_CLOSE_BRACE; break;
3368 case ';': result->type = CPP_SEMICOLON; break;
3370 /* @ is a punctuator in Objective-C. */
3371 case '@': result->type = CPP_ATSIGN; break;
3373 default:
3375 const uchar *base = --buffer->cur;
3377 /* Check for an extended identifier ($ or UCN or UTF-8). */
3378 struct normalize_state nst = INITIAL_NORMALIZE_STATE;
3379 if (forms_identifier_p (pfile, true, &nst))
3381 result->type = CPP_NAME;
3382 result->val.node.node = lex_identifier (pfile, base, true, &nst,
3383 &result->val.node.spelling);
3384 warn_about_normalization (pfile, result, &nst);
3385 break;
3388 /* Otherwise this will form a CPP_OTHER token. Parse valid UTF-8 as a
3389 single token. */
3390 buffer->cur++;
3391 if (c >= utf8_signifier)
3393 const uchar *pstr = base;
3394 cppchar_t s;
3395 if (_cpp_valid_utf8 (pfile, &pstr, buffer->rlimit, 0, NULL, &s))
3396 buffer->cur = pstr;
3398 create_literal (pfile, result, base, buffer->cur - base, CPP_OTHER);
3399 break;
3404 /* Potentially convert the location of the token to a range. */
3405 if (result->src_loc >= RESERVED_LOCATION_COUNT
3406 && result->type != CPP_EOF)
3408 /* Ensure that any line notes are processed, so that we have the
3409 correct physical line/column for the end-point of the token even
3410 when a logical line is split via one or more backslashes. */
3411 if (buffer->cur >= buffer->notes[buffer->cur_note].pos
3412 && !pfile->overlaid_buffer)
3413 _cpp_process_line_notes (pfile, false);
3415 source_range tok_range;
3416 tok_range.m_start = result->src_loc;
3417 tok_range.m_finish
3418 = linemap_position_for_column (pfile->line_table,
3419 CPP_BUF_COLUMN (buffer, buffer->cur));
3421 result->src_loc = COMBINE_LOCATION_DATA (pfile->line_table,
3422 result->src_loc,
3423 tok_range, NULL);
3426 return result;
3429 /* An upper bound on the number of bytes needed to spell TOKEN.
3430 Does not include preceding whitespace. */
3431 unsigned int
3432 cpp_token_len (const cpp_token *token)
3434 unsigned int len;
3436 switch (TOKEN_SPELL (token))
3438 default: len = 6; break;
3439 case SPELL_LITERAL: len = token->val.str.len; break;
3440 case SPELL_IDENT: len = NODE_LEN (token->val.node.node) * 10; break;
3443 return len;
/* Decode the single UTF-8 sequence that starts at NAME and store it in
   BUFFER as a \U escape followed by eight lowercase hexadecimal
   digits.  Exactly 10 bytes are always written to BUFFER (no
   terminating NUL).  Returns the number of bytes of NAME that made up
   the sequence.  Aborts if a continuation byte is ill-formed.  */

static size_t
utf8_to_ucn (unsigned char *buffer, const unsigned char *name)
{
  static const char hex_digits[] = "0123456789abcdef";
  int seq_len = 0;
  unsigned lead;
  unsigned long code_point;
  int k;
  int shift;

  /* The count of leading one bits in the first byte is the length of
     the sequence.  */
  lead = *name;
  while (lead & 0x80)
    {
      seq_len++;
      lead <<= 1;
    }

  /* Whatever follows the length prefix in the first byte is payload.  */
  code_point = *name & (0x7F >> seq_len);

  /* Each continuation byte contributes six further bits.  */
  for (k = 1; k < seq_len; k++)
    {
      name++;
      code_point = (code_point << 6) | (*name & 0x3F);

      /* Ill-formed UTF-8: a continuation byte must look like 10xxxxxx.  */
      if ((*name & 0xC0) != 0x80)
	abort ();
    }

  buffer[0] = '\\';
  buffer[1] = 'U';
  for (k = 0, shift = 28; shift >= 0; k++, shift -= 4)
    buffer[2 + k] = hex_digits[(code_point >> shift) & 0xF];

  return seq_len;
}
3480 /* Given a token TYPE corresponding to a digraph, return a pointer to
3481 the spelling of the digraph. */
3482 static const unsigned char *
3483 cpp_digraph2name (enum cpp_ttype type)
3485 return digraph_spellings[(int) type - (int) CPP_FIRST_DIGRAPH];
3488 /* Write the spelling of an identifier IDENT, using UCNs, to BUFFER.
3489 The buffer must already contain the enough space to hold the
3490 token's spelling. Returns a pointer to the character after the
3491 last character written. */
3492 unsigned char *
3493 _cpp_spell_ident_ucns (unsigned char *buffer, cpp_hashnode *ident)
3495 size_t i;
3496 const unsigned char *name = NODE_NAME (ident);
3498 for (i = 0; i < NODE_LEN (ident); i++)
3499 if (name[i] & ~0x7F)
3501 i += utf8_to_ucn (buffer, name + i) - 1;
3502 buffer += 10;
3504 else
3505 *buffer++ = name[i];
3507 return buffer;
3510 /* Write the spelling of a token TOKEN to BUFFER. The buffer must
3511 already contain the enough space to hold the token's spelling.
3512 Returns a pointer to the character after the last character written.
3513 FORSTRING is true if this is to be the spelling after translation
3514 phase 1 (with the original spelling of extended identifiers), false
3515 if extended identifiers should always be written using UCNs (there is
3516 no option for always writing them in the internal UTF-8 form).
3517 FIXME: Would be nice if we didn't need the PFILE argument. */
3518 unsigned char *
3519 cpp_spell_token (cpp_reader *pfile, const cpp_token *token,
3520 unsigned char *buffer, bool forstring)
3522 switch (TOKEN_SPELL (token))
3524 case SPELL_OPERATOR:
3526 const unsigned char *spelling;
3527 unsigned char c;
3529 if (token->flags & DIGRAPH)
3530 spelling = cpp_digraph2name (token->type);
3531 else if (token->flags & NAMED_OP)
3532 goto spell_ident;
3533 else
3534 spelling = TOKEN_NAME (token);
3536 while ((c = *spelling++) != '\0')
3537 *buffer++ = c;
3539 break;
3541 spell_ident:
3542 case SPELL_IDENT:
3543 if (forstring)
3545 memcpy (buffer, NODE_NAME (token->val.node.spelling),
3546 NODE_LEN (token->val.node.spelling));
3547 buffer += NODE_LEN (token->val.node.spelling);
3549 else
3550 buffer = _cpp_spell_ident_ucns (buffer, token->val.node.node);
3551 break;
3553 case SPELL_LITERAL:
3554 memcpy (buffer, token->val.str.text, token->val.str.len);
3555 buffer += token->val.str.len;
3556 break;
3558 case SPELL_NONE:
3559 cpp_error (pfile, CPP_DL_ICE,
3560 "unspellable token %s", TOKEN_NAME (token));
3561 break;
3564 return buffer;
3567 /* Returns TOKEN spelt as a null-terminated string. The string is
3568 freed when the reader is destroyed. Useful for diagnostics. */
3569 unsigned char *
3570 cpp_token_as_text (cpp_reader *pfile, const cpp_token *token)
3572 unsigned int len = cpp_token_len (token) + 1;
3573 unsigned char *start = _cpp_unaligned_alloc (pfile, len), *end;
3575 end = cpp_spell_token (pfile, token, start, false);
3576 end[0] = '\0';
3578 return start;
3581 /* Returns a pointer to a string which spells the token defined by
3582 TYPE and FLAGS. Used by C front ends, which really should move to
3583 using cpp_token_as_text. */
3584 const char *
3585 cpp_type2name (enum cpp_ttype type, unsigned char flags)
3587 if (flags & DIGRAPH)
3588 return (const char *) cpp_digraph2name (type);
3589 else if (flags & NAMED_OP)
3590 return cpp_named_operator2name (type);
3592 return (const char *) token_spellings[type].name;
3595 /* Writes the spelling of token to FP, without any preceding space.
3596 Separated from cpp_spell_token for efficiency - to avoid stdio
3597 double-buffering. */
3598 void
3599 cpp_output_token (const cpp_token *token, FILE *fp)
3601 switch (TOKEN_SPELL (token))
3603 case SPELL_OPERATOR:
3605 const unsigned char *spelling;
3606 int c;
3608 if (token->flags & DIGRAPH)
3609 spelling = cpp_digraph2name (token->type);
3610 else if (token->flags & NAMED_OP)
3611 goto spell_ident;
3612 else
3613 spelling = TOKEN_NAME (token);
3615 c = *spelling;
3617 putc (c, fp);
3618 while ((c = *++spelling) != '\0');
3620 break;
3622 spell_ident:
3623 case SPELL_IDENT:
3625 size_t i;
3626 const unsigned char * name = NODE_NAME (token->val.node.node);
3628 for (i = 0; i < NODE_LEN (token->val.node.node); i++)
3629 if (name[i] & ~0x7F)
3631 unsigned char buffer[10];
3632 i += utf8_to_ucn (buffer, name + i) - 1;
3633 fwrite (buffer, 1, 10, fp);
3635 else
3636 fputc (NODE_NAME (token->val.node.node)[i], fp);
3638 break;
3640 case SPELL_LITERAL:
3641 if (token->type == CPP_HEADER_NAME)
3642 fputc ('"', fp);
3643 fwrite (token->val.str.text, 1, token->val.str.len, fp);
3644 if (token->type == CPP_HEADER_NAME)
3645 fputc ('"', fp);
3646 break;
3648 case SPELL_NONE:
3649 /* An error, most probably. */
3650 break;
3654 /* Compare two tokens. */
3656 _cpp_equiv_tokens (const cpp_token *a, const cpp_token *b)
3658 if (a->type == b->type && a->flags == b->flags)
3659 switch (TOKEN_SPELL (a))
3661 default: /* Keep compiler happy. */
3662 case SPELL_OPERATOR:
3663 /* token_no is used to track where multiple consecutive ##
3664 tokens were originally located. */
3665 return (a->type != CPP_PASTE || a->val.token_no == b->val.token_no);
3666 case SPELL_NONE:
3667 return (a->type != CPP_MACRO_ARG
3668 || (a->val.macro_arg.arg_no == b->val.macro_arg.arg_no
3669 && a->val.macro_arg.spelling == b->val.macro_arg.spelling));
3670 case SPELL_IDENT:
3671 return (a->val.node.node == b->val.node.node
3672 && a->val.node.spelling == b->val.node.spelling);
3673 case SPELL_LITERAL:
3674 return (a->val.str.len == b->val.str.len
3675 && !memcmp (a->val.str.text, b->val.str.text,
3676 a->val.str.len));
3679 return 0;
3682 /* Returns nonzero if a space should be inserted to avoid an
3683 accidental token paste for output. For simplicity, it is
3684 conservative, and occasionally advises a space where one is not
3685 needed, e.g. "." and ".2". */
3687 cpp_avoid_paste (cpp_reader *pfile, const cpp_token *token1,
3688 const cpp_token *token2)
3690 enum cpp_ttype a = token1->type, b = token2->type;
3691 cppchar_t c;
3693 if (token1->flags & NAMED_OP)
3694 a = CPP_NAME;
3695 if (token2->flags & NAMED_OP)
3696 b = CPP_NAME;
3698 c = EOF;
3699 if (token2->flags & DIGRAPH)
3700 c = digraph_spellings[(int) b - (int) CPP_FIRST_DIGRAPH][0];
3701 else if (token_spellings[b].category == SPELL_OPERATOR)
3702 c = token_spellings[b].name[0];
3704 /* Quickly get everything that can paste with an '='. */
3705 if ((int) a <= (int) CPP_LAST_EQ && c == '=')
3706 return 1;
3708 switch (a)
3710 case CPP_GREATER: return c == '>';
3711 case CPP_LESS: return c == '<' || c == '%' || c == ':';
3712 case CPP_PLUS: return c == '+';
3713 case CPP_MINUS: return c == '-' || c == '>';
3714 case CPP_DIV: return c == '/' || c == '*'; /* Comments. */
3715 case CPP_MOD: return c == ':' || c == '>';
3716 case CPP_AND: return c == '&';
3717 case CPP_OR: return c == '|';
3718 case CPP_COLON: return c == ':' || c == '>';
3719 case CPP_DEREF: return c == '*';
3720 case CPP_DOT: return c == '.' || c == '%' || b == CPP_NUMBER;
3721 case CPP_HASH: return c == '#' || c == '%'; /* Digraph form. */
3722 case CPP_PRAGMA:
3723 case CPP_NAME: return ((b == CPP_NUMBER
3724 && name_p (pfile, &token2->val.str))
3725 || b == CPP_NAME
3726 || b == CPP_CHAR || b == CPP_STRING); /* L */
3727 case CPP_NUMBER: return (b == CPP_NUMBER || b == CPP_NAME
3728 || b == CPP_CHAR
3729 || c == '.' || c == '+' || c == '-');
3730 /* UCNs */
3731 case CPP_OTHER: return ((token1->val.str.text[0] == '\\'
3732 && b == CPP_NAME)
3733 || (CPP_OPTION (pfile, objc)
3734 && token1->val.str.text[0] == '@'
3735 && (b == CPP_NAME || b == CPP_STRING)));
3736 case CPP_LESS_EQ: return c == '>';
3737 case CPP_STRING:
3738 case CPP_WSTRING:
3739 case CPP_UTF8STRING:
3740 case CPP_STRING16:
3741 case CPP_STRING32: return (CPP_OPTION (pfile, user_literals)
3742 && (b == CPP_NAME
3743 || (TOKEN_SPELL (token2) == SPELL_LITERAL
3744 && ISIDST (token2->val.str.text[0]))));
3746 default: break;
3749 return 0;
3752 /* Output all the remaining tokens on the current line, and a newline
3753 character, to FP. Leading whitespace is removed. If there are
3754 macros, special token padding is not performed. */
3755 void
3756 cpp_output_line (cpp_reader *pfile, FILE *fp)
3758 const cpp_token *token;
3760 token = cpp_get_token (pfile);
3761 while (token->type != CPP_EOF)
3763 cpp_output_token (token, fp);
3764 token = cpp_get_token (pfile);
3765 if (token->flags & PREV_WHITE)
3766 putc (' ', fp);
3769 putc ('\n', fp);
3772 /* Return a string representation of all the remaining tokens on the
3773 current line. The result is allocated using xmalloc and must be
3774 freed by the caller. */
3775 unsigned char *
3776 cpp_output_line_to_string (cpp_reader *pfile, const unsigned char *dir_name)
3778 const cpp_token *token;
3779 unsigned int out = dir_name ? ustrlen (dir_name) : 0;
3780 unsigned int alloced = 120 + out;
3781 unsigned char *result = (unsigned char *) xmalloc (alloced);
3783 /* If DIR_NAME is empty, there are no initial contents. */
3784 if (dir_name)
3786 sprintf ((char *) result, "#%s ", dir_name);
3787 out += 2;
3790 token = cpp_get_token (pfile);
3791 while (token->type != CPP_EOF)
3793 unsigned char *last;
3794 /* Include room for a possible space and the terminating nul. */
3795 unsigned int len = cpp_token_len (token) + 2;
3797 if (out + len > alloced)
3799 alloced *= 2;
3800 if (out + len > alloced)
3801 alloced = out + len;
3802 result = (unsigned char *) xrealloc (result, alloced);
3805 last = cpp_spell_token (pfile, token, &result[out], 0);
3806 out = last - result;
3808 token = cpp_get_token (pfile);
3809 if (token->flags & PREV_WHITE)
3810 result[out++] = ' ';
3813 result[out] = '\0';
3814 return result;
/* Memory buffers.  Changing these three constants can have a dramatic
   effect on performance.  The values here are reasonable defaults,
   but might be tuned.  If you adjust them, be sure to test across a
   range of uses of cpplib, including heavy nested function-like macro
   expansion.  Also check the change in peak memory usage (NJAMD is a
   good tool for this).

   MIN_BUFF_SIZE is the smallest buffer new_buff will create.
   BUFF_SIZE_UPPER_BOUND (MIN_SIZE) is the largest free buffer that
   _cpp_get_buff will hand out for a request of MIN_SIZE bytes.
   EXTENDED_BUFF_SIZE (BUFF, MIN_EXTRA) is the size requested when
   extending BUFF: MIN_EXTRA plus twice the distance from BUFF's cur
   to its limit.  Every macro argument is parenthesized so that an
   argument expression cannot be regrouped by the expansion.  */
#define MIN_BUFF_SIZE 8000
#define BUFF_SIZE_UPPER_BOUND(MIN_SIZE) (MIN_BUFF_SIZE + (MIN_SIZE) * 3 / 2)
#define EXTENDED_BUFF_SIZE(BUFF, MIN_EXTRA) \
	((MIN_EXTRA) + ((BUFF)->limit - (BUFF)->cur) * 2)

#if MIN_BUFF_SIZE > BUFF_SIZE_UPPER_BOUND (0)
#error BUFF_SIZE_UPPER_BOUND must be at least as large as MIN_BUFF_SIZE!
#endif
3832 /* Create a new allocation buffer. Place the control block at the end
3833 of the buffer, so that buffer overflows will cause immediate chaos. */
3834 static _cpp_buff *
3835 new_buff (size_t len)
3837 _cpp_buff *result;
3838 unsigned char *base;
3840 if (len < MIN_BUFF_SIZE)
3841 len = MIN_BUFF_SIZE;
3842 len = CPP_ALIGN (len);
3844 #ifdef ENABLE_VALGRIND_ANNOTATIONS
3845 /* Valgrind warns about uses of interior pointers, so put _cpp_buff
3846 struct first. */
3847 size_t slen = CPP_ALIGN2 (sizeof (_cpp_buff), 2 * DEFAULT_ALIGNMENT);
3848 base = XNEWVEC (unsigned char, len + slen);
3849 result = (_cpp_buff *) base;
3850 base += slen;
3851 #else
3852 base = XNEWVEC (unsigned char, len + sizeof (_cpp_buff));
3853 result = (_cpp_buff *) (base + len);
3854 #endif
3855 result->base = base;
3856 result->cur = base;
3857 result->limit = base + len;
3858 result->next = NULL;
3859 return result;
3862 /* Place a chain of unwanted allocation buffers on the free list. */
3863 void
3864 _cpp_release_buff (cpp_reader *pfile, _cpp_buff *buff)
3866 _cpp_buff *end = buff;
3868 while (end->next)
3869 end = end->next;
3870 end->next = pfile->free_buffs;
3871 pfile->free_buffs = buff;
3874 /* Return a free buffer of size at least MIN_SIZE. */
3875 _cpp_buff *
3876 _cpp_get_buff (cpp_reader *pfile, size_t min_size)
3878 _cpp_buff *result, **p;
3880 for (p = &pfile->free_buffs;; p = &(*p)->next)
3882 size_t size;
3884 if (*p == NULL)
3885 return new_buff (min_size);
3886 result = *p;
3887 size = result->limit - result->base;
3888 /* Return a buffer that's big enough, but don't waste one that's
3889 way too big. */
3890 if (size >= min_size && size <= BUFF_SIZE_UPPER_BOUND (min_size))
3891 break;
3894 *p = result->next;
3895 result->next = NULL;
3896 result->cur = result->base;
3897 return result;
3900 /* Creates a new buffer with enough space to hold the uncommitted
3901 remaining bytes of BUFF, and at least MIN_EXTRA more bytes. Copies
3902 the excess bytes to the new buffer. Chains the new buffer after
3903 BUFF, and returns the new buffer. */
3904 _cpp_buff *
3905 _cpp_append_extend_buff (cpp_reader *pfile, _cpp_buff *buff, size_t min_extra)
3907 size_t size = EXTENDED_BUFF_SIZE (buff, min_extra);
3908 _cpp_buff *new_buff = _cpp_get_buff (pfile, size);
3910 buff->next = new_buff;
3911 memcpy (new_buff->base, buff->cur, BUFF_ROOM (buff));
3912 return new_buff;
3915 /* Creates a new buffer with enough space to hold the uncommitted
3916 remaining bytes of the buffer pointed to by BUFF, and at least
3917 MIN_EXTRA more bytes. Copies the excess bytes to the new buffer.
3918 Chains the new buffer before the buffer pointed to by BUFF, and
3919 updates the pointer to point to the new buffer. */
3920 void
3921 _cpp_extend_buff (cpp_reader *pfile, _cpp_buff **pbuff, size_t min_extra)
3923 _cpp_buff *new_buff, *old_buff = *pbuff;
3924 size_t size = EXTENDED_BUFF_SIZE (old_buff, min_extra);
3926 new_buff = _cpp_get_buff (pfile, size);
3927 memcpy (new_buff->base, old_buff->cur, BUFF_ROOM (old_buff));
3928 new_buff->next = old_buff;
3929 *pbuff = new_buff;
3932 /* Free a chain of buffers starting at BUFF. */
3933 void
3934 _cpp_free_buff (_cpp_buff *buff)
3936 _cpp_buff *next;
3938 for (; buff; buff = next)
3940 next = buff->next;
3941 #ifdef ENABLE_VALGRIND_ANNOTATIONS
3942 free (buff);
3943 #else
3944 free (buff->base);
3945 #endif
3949 /* Allocate permanent, unaligned storage of length LEN. */
3950 unsigned char *
3951 _cpp_unaligned_alloc (cpp_reader *pfile, size_t len)
3953 _cpp_buff *buff = pfile->u_buff;
3954 unsigned char *result = buff->cur;
3956 if (len > (size_t) (buff->limit - result))
3958 buff = _cpp_get_buff (pfile, len);
3959 buff->next = pfile->u_buff;
3960 pfile->u_buff = buff;
3961 result = buff->cur;
3964 buff->cur = result + len;
3965 return result;
3968 /* Allocate permanent, unaligned storage of length LEN from a_buff.
3969 That buffer is used for growing allocations when saving macro
3970 replacement lists in a #define, and when parsing an answer to an
3971 assertion in #assert, #unassert or #if (and therefore possibly
3972 whilst expanding macros). It therefore must not be used by any
3973 code that they might call: specifically the lexer and the guts of
3974 the macro expander.
3976 All existing other uses clearly fit this restriction: storing
3977 registered pragmas during initialization. */
3978 unsigned char *
3979 _cpp_aligned_alloc (cpp_reader *pfile, size_t len)
3981 _cpp_buff *buff = pfile->a_buff;
3982 unsigned char *result = buff->cur;
3984 if (len > (size_t) (buff->limit - result))
3986 buff = _cpp_get_buff (pfile, len);
3987 buff->next = pfile->a_buff;
3988 pfile->a_buff = buff;
3989 result = buff->cur;
3992 buff->cur = result + len;
3993 return result;
3996 /* Commit or allocate storage from a buffer. */
3998 void *
3999 _cpp_commit_buff (cpp_reader *pfile, size_t size)
4001 void *ptr = BUFF_FRONT (pfile->a_buff);
4003 if (pfile->hash_table->alloc_subobject)
4005 void *copy = pfile->hash_table->alloc_subobject (size);
4006 memcpy (copy, ptr, size);
4007 ptr = copy;
4009 else
4010 BUFF_FRONT (pfile->a_buff) += size;
4012 return ptr;
4015 /* Say which field of TOK is in use. */
4017 enum cpp_token_fld_kind
4018 cpp_token_val_index (const cpp_token *tok)
4020 switch (TOKEN_SPELL (tok))
4022 case SPELL_IDENT:
4023 return CPP_TOKEN_FLD_NODE;
4024 case SPELL_LITERAL:
4025 return CPP_TOKEN_FLD_STR;
4026 case SPELL_OPERATOR:
4027 /* Operands which were originally spelled as ident keep around
4028 the node for the exact spelling. */
4029 if (tok->flags & NAMED_OP)
4030 return CPP_TOKEN_FLD_NODE;
4031 else if (tok->type == CPP_PASTE)
4032 return CPP_TOKEN_FLD_TOKEN_NO;
4033 else
4034 return CPP_TOKEN_FLD_NONE;
4035 case SPELL_NONE:
4036 if (tok->type == CPP_MACRO_ARG)
4037 return CPP_TOKEN_FLD_ARG_NO;
4038 else if (tok->type == CPP_PADDING)
4039 return CPP_TOKEN_FLD_SOURCE;
4040 else if (tok->type == CPP_PRAGMA)
4041 return CPP_TOKEN_FLD_PRAGMA;
4042 /* fall through */
4043 default:
4044 return CPP_TOKEN_FLD_NONE;
4048 /* All tokens lexed in R after calling this function will be forced to
4049 have their location_t to be P, until
4050 cpp_stop_forcing_token_locations is called for R. */
4052 void
4053 cpp_force_token_locations (cpp_reader *r, location_t loc)
4055 r->forced_token_location = loc;
4058 /* Go back to assigning locations naturally for lexed tokens. */
4060 void
4061 cpp_stop_forcing_token_locations (cpp_reader *r)
4063 r->forced_token_location = 0;
/* PEEK points at a backslash.  If it escapes an end of line ("\n",
   "\r" or "\r\n"), return a pointer to the first character after the
   escaped line ending, repeating for as long as that character is
   itself a backslash escaping a line ending.  If the backslash does
   not escape a line ending, or the position after the line ending
   would not be below LIMIT, do not advance past that backslash.  */

static const unsigned char *
do_peek_backslash (const unsigned char *peek, const unsigned char *limit)
{
  for (;;)
    {
      const unsigned char *probe;

      if (__builtin_expect (peek[1] == '\n', true))
	probe = peek + 2;
      else if (__builtin_expect (peek[1] == '\r', false))
	/* A lone CR, or a CR LF pair.  */
	probe = peek + (peek[2] == '\n' ? 3 : 2);
      else
	return peek;

      if (!__builtin_expect (probe < limit, true))
	return peek;

      peek = probe;

      /* The user might be perverse and chain escaped newlines.  */
      if (*peek != '\\')
	return peek;
    }
}
/* Return PEEK unless it points at a backslash, in which case return
   whatever do_peek_backslash makes of it.  */

static const unsigned char *
do_peek_next (const unsigned char *peek, const unsigned char *limit)
{
  if (!__builtin_expect (*peek == '\\', false))
    return peek;

  return do_peek_backslash (peek, limit);
}
/* Step back one character from PEEK, never going below BOUND, and
   return a pointer to the character stepped onto.  Returns NULL if
   PEEK is already at BOUND.  If the character stepped onto is a line
   ending ('\n', '\r', or the '\n' of a "\r\n" pair) that is
   immediately preceded by a backslash, the escaped line ending is
   skipped and the step is retried from the backslash.  */

static const unsigned char *
do_peek_prev (const unsigned char *peek, const unsigned char *bound)
{
  if (peek == bound)
    return NULL;

  unsigned char c = *--peek;
  /* Compare against a carriage return, not the letter 'r': an
     ordinary 'r' preceded by a backslash is not an escaped newline.  */
  if (__builtin_expect (c == '\n', false)
      || __builtin_expect (c == '\r', false))
    {
      if (peek == bound)
	return peek;

      /* IX is the offset from PEEK of the character that would have
	 to be the escaping backslash.  */
      int ix = -1;
      if (c == '\n' && peek[ix] == '\r')
	{
	  /* CR LF pair: look in front of the CR as well.  */
	  if (peek + ix == bound)
	    return peek;
	  ix--;
	}

      if (peek[ix] == '\\')
	return do_peek_prev (peek + ix, bound);

      return peek;
    }
  else
    return peek;
}
/* PEEK[-1] has already been matched against the first character of
   identifier MATCH.  If the text at PEEK continues with the rest of
   MATCH (escaped newlines may be interspersed) and is not followed by
   a further identifier character, scan past it and any trailing
   spaces, tabs and escaped newlines, and return the resulting
   position.  Otherwise return NULL.  */

static const unsigned char *
do_peek_ident (const char *match, const unsigned char *peek,
	       const unsigned char *limit)
{
  /* The pre-increment skips the first character of MATCH.  */
  while (*++match)
    {
      if (*peek != *match)
	{
	  /* Retry after looking past an escaped newline.  */
	  peek = do_peek_next (peek, limit);
	  if (*peek != *match)
	    return NULL;
	}
      peek++;
    }

  /* Must now not be looking at an identifier char.  */
  peek = do_peek_next (peek, limit);
  if (ISIDNUM (*peek))
    return NULL;

  /* Skip control-line whitespace.  */
  for (;;)
    {
      while (*peek == ' ' || *peek == '\t')
	peek++;

      if (!__builtin_expect (*peek == '\\', false))
	break;

      peek = do_peek_backslash (peek, limit);
      /* Still on a backslash: it did not escape a newline that could
	 be skipped, so stop here.  */
      if (*peek == '\\')
	break;
    }

  return peek;
}
4167 /* Are we looking at a module control line starting as PEEK - 1? */
4169 static bool
4170 do_peek_module (cpp_reader *pfile, unsigned char c,
4171 const unsigned char *peek, const unsigned char *limit)
4173 bool import = false;
4175 if (__builtin_expect (c == 'e', false))
4177 if (!((peek[0] == 'x' || peek[0] == '\\')
4178 && (peek = do_peek_ident ("export", peek, limit))))
4179 return false;
4181 /* export, peek for import or module. No need to peek __import
4182 here. */
4183 if (peek[0] == 'i')
4185 if (!((peek[1] == 'm' || peek[1] == '\\')
4186 && (peek = do_peek_ident ("import", peek + 1, limit))))
4187 return false;
4188 import = true;
4190 else if (peek[0] == 'm')
4192 if (!((peek[1] == 'o' || peek[1] == '\\')
4193 && (peek = do_peek_ident ("module", peek + 1, limit))))
4194 return false;
4196 else
4197 return false;
4199 else if (__builtin_expect (c == 'i', false))
4201 if (!((peek[0] == 'm' || peek[0] == '\\')
4202 && (peek = do_peek_ident ("import", peek, limit))))
4203 return false;
4204 import = true;
4206 else if (__builtin_expect (c == '_', false))
4208 /* Needed for translated includes. */
4209 if (!((peek[0] == '_' || peek[0] == '\\')
4210 && (peek = do_peek_ident ("__import", peek, limit))))
4211 return false;
4212 import = true;
4214 else if (__builtin_expect (c == 'm', false))
4216 if (!((peek[0] == 'o' || peek[0] == '\\')
4217 && (peek = do_peek_ident ("module", peek, limit))))
4218 return false;
4220 else
4221 return false;
4223 /* Peek the next character to see if it's good enough. We'll be at
4224 the first non-whitespace char, including skipping an escaped
4225 newline. */
4226 /* ... import followed by identifier, ':', '<' or header-name
4227 preprocessing tokens, or module followed by identifier, ':' or
4228 ';' preprocessing tokens. */
4229 unsigned char p = *peek++;
4231 /* A character literal is ... single quotes, ... optionally preceded
4232 by u8, u, U, or L */
4233 /* A string-literal is a ... double quotes, optionally prefixed by
4234 R, u8, u8R, u, uR, U, UR, L, or LR */
4235 if (p == 'u')
4237 peek = do_peek_next (peek, limit);
4238 if (*peek == '8')
4240 peek++;
4241 goto peek_u8;
4243 goto peek_u;
4245 else if (p == 'U' || p == 'L')
4247 peek_u8:
4248 peek = do_peek_next (peek, limit);
4249 peek_u:
4250 if (*peek == '\"' || *peek == '\'')
4251 return false;
4253 if (*peek == 'R')
4254 goto peek_R;
4255 /* Identifier. Ok. */
4257 else if (p == 'R')
4259 peek_R:
4260 if (CPP_OPTION (pfile, rliterals))
4262 peek = do_peek_next (peek, limit);
4263 if (*peek == '\"')
4264 return false;
4266 /* Identifier. Ok. */
4268 else if ('Z' - 'A' == 25
4269 ? ((p >= 'A' && p <= 'Z') || (p >= 'a' && p <= 'z') || p == '_')
4270 : ISIDST (p))
4272 /* Identifier. Ok. */
4274 else if (p == '<')
4276 /* Maybe angle header, ok for import. Reject
4277 '<=', '<<' digraph:'<:'. */
4278 if (!import)
4279 return false;
4280 peek = do_peek_next (peek, limit);
4281 if (*peek == '=' || *peek == '<'
4282 || (*peek == ':' && CPP_OPTION (pfile, digraphs)))
4283 return false;
4285 else if (p == ';')
4287 /* SEMICOLON, ok for module. */
4288 if (import)
4289 return false;
4291 else if (p == '"')
4293 /* STRING, ok for import. */
4294 if (!import)
4295 return false;
4297 else if (p == ':')
4299 /* Maybe COLON, ok. Reject '::', digraph:':>'. */
4300 peek = do_peek_next (peek, limit);
4301 if (*peek == ':' || (*peek == '>' && CPP_OPTION (pfile, digraphs)))
4302 return false;
4304 else
4305 /* FIXME: Detect a unicode character, excluding those not
4306 permitted as the initial character. [lex.name]/1. I presume
4307 we need to check the \[uU] spellings, and directly using
4308 Unicode in say UTF8 form? Or perhaps we do the phase-1
4309 conversion of UTF8 to universal-character-names? */
4310 return false;
4312 return true;
4315 /* Directives-only scanning. Somewhat more relaxed than correct
4316 parsing -- some ill-formed programs will not be rejected. */
4318 void
4319 cpp_directive_only_process (cpp_reader *pfile,
4320 void *data,
4321 void (*cb) (cpp_reader *, CPP_DO_task, void *, ...))
4323 bool module_p = CPP_OPTION (pfile, module_directives);
4327 restart:
4328 /* Buffer initialization, but no line cleaning. */
4329 cpp_buffer *buffer = pfile->buffer;
4330 buffer->cur_note = buffer->notes_used = 0;
4331 buffer->cur = buffer->line_base = buffer->next_line;
4332 buffer->need_line = false;
4333 /* Files always end in a newline or carriage return. We rely on this for
4334 character peeking safety. */
4335 gcc_assert (buffer->rlimit[0] == '\n' || buffer->rlimit[0] == '\r');
4337 const unsigned char *base = buffer->cur;
4338 unsigned line_count = 0;
4339 const unsigned char *line_start = base;
4341 bool bol = true;
4342 bool raw = false;
4344 const unsigned char *lwm = base;
4345 for (const unsigned char *pos = base, *limit = buffer->rlimit;
4346 pos < limit;)
4348 unsigned char c = *pos++;
4349 /* This matches the switch in _cpp_lex_direct. */
4350 switch (c)
4352 case ' ': case '\t': case '\f': case '\v':
4353 /* Whitespace, do nothing. */
4354 break;
4356 case '\r': /* MAC line ending, or Windows \r\n */
4357 if (*pos == '\n')
4358 pos++;
4359 /* FALLTHROUGH */
4361 case '\n':
4362 bol = true;
4364 next_line:
4365 CPP_INCREMENT_LINE (pfile, 0);
4366 line_count++;
4367 line_start = pos;
4368 break;
4370 case '\\':
4371 /* <backslash><newline> is removed, and doesn't undo any
4372 preceding escape or whatnot. */
4373 if (*pos == '\n')
4375 pos++;
4376 goto next_line;
4378 else if (*pos == '\r')
4380 if (pos[1] == '\n')
4381 pos++;
4382 pos++;
4383 goto next_line;
4385 goto dflt;
4387 case '#':
4388 if (bol)
4390 /* Line directive. */
4391 if (pos - 1 > base && !pfile->state.skipping)
4392 cb (pfile, CPP_DO_print, data,
4393 line_count, base, pos - 1 - base);
4395 /* Prep things for directive handling. */
4396 buffer->next_line = pos;
4397 buffer->need_line = true;
4398 bool ok = _cpp_get_fresh_line (pfile);
4399 gcc_checking_assert (ok);
4401 /* Ensure proper column numbering for generated
4402 error messages. */
4403 buffer->line_base -= pos - line_start;
4405 _cpp_handle_directive (pfile, line_start + 1 != pos);
4407 /* Sanitize the line settings. Duplicate #include's can
4408 mess things up. */
4409 // FIXME: Necessary?
4410 pfile->line_table->highest_location
4411 = pfile->line_table->highest_line;
4413 if (!pfile->state.skipping
4414 && pfile->buffer->next_line < pfile->buffer->rlimit)
4415 cb (pfile, CPP_DO_location, data,
4416 pfile->line_table->highest_line);
4418 goto restart;
4420 goto dflt;
4422 case '/':
4424 const unsigned char *peek = do_peek_next (pos, limit);
4425 if (!(*peek == '/' || *peek == '*'))
4426 goto dflt;
4428 /* Line or block comment */
4429 bool is_block = *peek == '*';
4430 bool star = false;
4431 bool esc = false;
4432 location_t sloc
4433 = linemap_position_for_column (pfile->line_table,
4434 pos - line_start);
4436 while (pos < limit)
4438 char c = *pos++;
4439 switch (c)
4441 case '\\':
4442 esc = true;
4443 break;
4445 case '\r':
4446 if (*pos == '\n')
4447 pos++;
4448 /* FALLTHROUGH */
4450 case '\n':
4452 CPP_INCREMENT_LINE (pfile, 0);
4453 line_count++;
4454 line_start = pos;
4455 if (!esc && !is_block)
4457 bol = true;
4458 goto done_comment;
4461 if (!esc)
4462 star = false;
4463 esc = false;
4464 break;
4466 case '*':
4467 if (pos > peek && !esc)
4468 star = is_block;
4469 esc = false;
4470 break;
4472 case '/':
4473 if (star)
4474 goto done_comment;
4475 /* FALLTHROUGH */
4477 default:
4478 star = false;
4479 esc = false;
4480 break;
4483 if (pos < limit || is_block)
4484 cpp_error_with_line (pfile, CPP_DL_ERROR, sloc, 0,
4485 "unterminated comment");
4486 done_comment:
4487 lwm = pos;
4488 break;
4491 case '\'':
4492 if (!CPP_OPTION (pfile, digit_separators))
4493 goto delimited_string;
4495 /* Possibly a number punctuator. */
4496 if (!ISIDNUM (*do_peek_next (pos, limit)))
4497 goto delimited_string;
4499 goto quote_peek;
4501 case '\"':
4502 if (!CPP_OPTION (pfile, rliterals))
4503 goto delimited_string;
4505 quote_peek:
4507 /* For ' see if it's a number punctuator
4508 \.?<digit>(<digit>|<identifier-nondigit>
4509 |'<digit>|'<nondigit>|[eEpP]<sign>|\.)* */
4510 /* For " see if it's a raw string
4511 {U,L,u,u8}R. This includes CPP_NUMBER detection,
4512 because that could be 0e+R. */
4513 const unsigned char *peek = pos - 1;
4514 bool quote_first = c == '"';
4515 bool quote_eight = false;
4516 bool maybe_number_start = false;
4517 bool want_number = false;
4519 while ((peek = do_peek_prev (peek, lwm)))
4521 unsigned char p = *peek;
4522 if (quote_first)
4524 if (!raw)
4526 if (p != 'R')
4527 break;
4528 raw = true;
4529 continue;
4532 quote_first = false;
4533 if (p == 'L' || p == 'U' || p == 'u')
4535 else if (p == '8')
4536 quote_eight = true;
4537 else
4538 goto second_raw;
4540 else if (quote_eight)
4542 if (p != 'u')
4544 raw = false;
4545 break;
4547 quote_eight = false;
4549 else if (c == '"')
4551 second_raw:;
4552 if (!want_number && ISIDNUM (p))
4554 raw = false;
4555 break;
4559 if (ISDIGIT (p))
4560 maybe_number_start = true;
4561 else if (p == '.')
4562 want_number = true;
4563 else if (ISIDNUM (p))
4564 maybe_number_start = false;
4565 else if (p == '+' || p == '-')
4567 if (const unsigned char *peek_prev
4568 = do_peek_prev (peek, lwm))
4570 p = *peek_prev;
4571 if (p == 'e' || p == 'E'
4572 || p == 'p' || p == 'P')
4574 want_number = true;
4575 maybe_number_start = false;
4577 else
4578 break;
4580 else
4581 break;
4583 else if (p == '\'' || p == '\"')
4585 /* If this is lwm, this must be the end of a
4586 previous string. So this is a trailing
4587 literal type, (a) if those are allowed,
4588 and (b) maybe_start is false. Otherwise
4589 this must be a CPP_NUMBER because we've
4590 met another ', and we'd have checked that
4591 in its own right. */
4592 if (peek == lwm && CPP_OPTION (pfile, uliterals))
4594 if (!maybe_number_start && !want_number)
4595 /* Must be a literal type. */
4596 raw = false;
4598 else if (p == '\''
4599 && CPP_OPTION (pfile, digit_separators))
4600 maybe_number_start = true;
4601 break;
4603 else if (c == '\'')
4604 break;
4605 else if (!quote_first && !quote_eight)
4606 break;
4609 if (maybe_number_start)
4611 if (c == '\'')
4612 /* A CPP NUMBER. */
4613 goto dflt;
4614 raw = false;
4617 goto delimited_string;
4620 delimited_string:
4622 /* (Possibly raw) string or char literal. */
4623 unsigned char end = c;
4624 int delim_len = -1;
4625 const unsigned char *delim = NULL;
4626 location_t sloc = linemap_position_for_column (pfile->line_table,
4627 pos - line_start);
4628 int esc = 0;
4630 if (raw)
4632 /* There can be no line breaks in the delimiter. */
4633 delim = pos;
4634 for (delim_len = 0; (c = *pos++) != '('; delim_len++)
4636 if (delim_len == 16)
4638 cpp_error_with_line (pfile, CPP_DL_ERROR,
4639 sloc, 0,
4640 "raw string delimiter"
4641 " longer than %d"
4642 " characters",
4643 delim_len);
4644 raw = false;
4645 pos = delim;
4646 break;
4648 if (strchr (") \\\t\v\f\n", c))
4650 cpp_error_with_line (pfile, CPP_DL_ERROR,
4651 sloc, 0,
4652 "invalid character '%c'"
4653 " in raw string"
4654 " delimiter", c);
4655 raw = false;
4656 pos = delim;
4657 break;
4659 if (pos >= limit)
4660 goto bad_string;
4664 while (pos < limit)
4666 char c = *pos++;
4667 switch (c)
4669 case '\\':
4670 if (!raw)
4671 esc++;
4672 break;
4674 case '\r':
4675 if (*pos == '\n')
4676 pos++;
4677 /* FALLTHROUGH */
4679 case '\n':
4681 CPP_INCREMENT_LINE (pfile, 0);
4682 line_count++;
4683 line_start = pos;
4685 if (esc)
4686 esc--;
4687 break;
4689 case ')':
4690 if (raw
4691 && pos + delim_len + 1 < limit
4692 && pos[delim_len] == end
4693 && !memcmp (delim, pos, delim_len))
4695 pos += delim_len + 1;
4696 raw = false;
4697 goto done_string;
4699 break;
4701 default:
4702 if (!raw && !(esc & 1) && c == end)
4703 goto done_string;
4704 esc = 0;
4705 break;
4708 bad_string:
4709 cpp_error_with_line (pfile, CPP_DL_ERROR, sloc, 0,
4710 "unterminated literal");
4712 done_string:
4713 raw = false;
4714 lwm = pos - 1;
4716 goto dflt;
4718 case '_':
4719 case 'e':
4720 case 'i':
4721 case 'm':
4722 if (bol && module_p && !pfile->state.skipping
4723 && do_peek_module (pfile, c, pos, limit))
4725 /* We've seen the start of a module control line.
4726 Start up the tokenizer. */
4727 pos--; /* Backup over the first character. */
4729 /* Backup over whitespace to start of line. */
4730 while (pos > line_start
4731 && (pos[-1] == ' ' || pos[-1] == '\t'))
4732 pos--;
4734 if (pos > base)
4735 cb (pfile, CPP_DO_print, data, line_count, base, pos - base);
4737 /* Prep things for directive handling. */
4738 buffer->next_line = pos;
4739 buffer->need_line = true;
4741 /* Now get tokens until the PRAGMA_EOL. */
4744 location_t spelling;
4745 const cpp_token *tok
4746 = cpp_get_token_with_location (pfile, &spelling);
4748 gcc_assert (pfile->state.in_deferred_pragma
4749 || tok->type == CPP_PRAGMA_EOL);
4750 cb (pfile, CPP_DO_token, data, tok, spelling);
4752 while (pfile->state.in_deferred_pragma);
4754 if (pfile->buffer->next_line < pfile->buffer->rlimit)
4755 cb (pfile, CPP_DO_location, data,
4756 pfile->line_table->highest_line);
4758 pfile->mi_valid = false;
4759 goto restart;
4761 goto dflt;
4763 default:
4764 dflt:
4765 bol = false;
4766 pfile->mi_valid = false;
4767 break;
4771 if (buffer->rlimit > base && !pfile->state.skipping)
4773 const unsigned char *limit = buffer->rlimit;
4774 /* If the file was not newline terminated, add rlimit, which is
4775 guaranteed to point to a newline, to the end of our range. */
4776 if (limit[-1] != '\n')
4778 limit++;
4779 CPP_INCREMENT_LINE (pfile, 0);
4780 line_count++;
4782 cb (pfile, CPP_DO_print, data, line_count, base, limit - base);
4785 _cpp_pop_buffer (pfile);
4787 while (pfile->buffer);