* gcc-plugin.h (enum plugin_event): Add PLUGIN_ALL_IPA_PASSES_START,
[official-gcc.git] / libcpp / lex.c
blob55bffa9a326e71a1b1d28b6d2122300401d94342
/* CPP Library - lexical analysis.
   Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2007, 2008, 2009
   Free Software Foundation, Inc.
   Contributed by Per Bothner, 1994-95.
   Based on CCCP program by Paul Rubin, June 1986
   Adapted to ANSI C, Richard Stallman, Jan 1987
   Broken out to separate file, Zack Weinberg, Mar 2000

This program is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the
Free Software Foundation; either version 3, or (at your option) any
later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
23 #include "config.h"
24 #include "system.h"
25 #include "cpplib.h"
26 #include "internal.h"
/* Spelling category of a token type.  One of these is stored in the
   `category' field of every struct token_spelling.  */
enum spell_type
{
  SPELL_OPERATOR = 0,
  SPELL_IDENT,
  SPELL_LITERAL,
  SPELL_NONE
};
36 struct token_spelling
38 enum spell_type category;
39 const unsigned char *name;
42 static const unsigned char *const digraph_spellings[] =
43 { UC"%:", UC"%:%:", UC"<:", UC":>", UC"<%", UC"%>" };
45 #define OP(e, s) { SPELL_OPERATOR, UC s },
46 #define TK(e, s) { SPELL_ ## s, UC #e },
47 static const struct token_spelling token_spellings[N_TTYPES] = { TTYPE_TABLE };
48 #undef OP
49 #undef TK
51 #define TOKEN_SPELL(token) (token_spellings[(token)->type].category)
52 #define TOKEN_NAME(token) (token_spellings[(token)->type].name)
54 static void add_line_note (cpp_buffer *, const uchar *, unsigned int);
55 static int skip_line_comment (cpp_reader *);
56 static void skip_whitespace (cpp_reader *, cppchar_t);
57 static void lex_string (cpp_reader *, cpp_token *, const uchar *);
58 static void save_comment (cpp_reader *, cpp_token *, const uchar *, cppchar_t);
59 static void store_comment (cpp_reader *, cpp_token *);
60 static void create_literal (cpp_reader *, cpp_token *, const uchar *,
61 unsigned int, enum cpp_ttype);
62 static bool warn_in_comment (cpp_reader *, _cpp_line_note *);
63 static int name_p (cpp_reader *, const cpp_string *);
64 static tokenrun *next_tokenrun (tokenrun *);
66 static _cpp_buff *new_buff (size_t);
69 /* Utility routine:
71 Compares, the token TOKEN to the NUL-terminated string STRING.
72 TOKEN must be a CPP_NAME. Returns 1 for equal, 0 for unequal. */
73 int
74 cpp_ideq (const cpp_token *token, const char *string)
76 if (token->type != CPP_NAME)
77 return 0;
79 return !ustrcmp (NODE_NAME (token->val.node.node), (const uchar *) string);
82 /* Record a note TYPE at byte POS into the current cleaned logical
83 line. */
84 static void
85 add_line_note (cpp_buffer *buffer, const uchar *pos, unsigned int type)
87 if (buffer->notes_used == buffer->notes_cap)
89 buffer->notes_cap = buffer->notes_cap * 2 + 200;
90 buffer->notes = XRESIZEVEC (_cpp_line_note, buffer->notes,
91 buffer->notes_cap);
94 buffer->notes[buffer->notes_used].pos = pos;
95 buffer->notes[buffer->notes_used].type = type;
96 buffer->notes_used++;
99 /* Returns with a logical line that contains no escaped newlines or
100 trigraphs. This is a time-critical inner loop. */
101 void
102 _cpp_clean_line (cpp_reader *pfile)
104 cpp_buffer *buffer;
105 const uchar *s;
106 uchar c, *d, *p;
108 buffer = pfile->buffer;
109 buffer->cur_note = buffer->notes_used = 0;
110 buffer->cur = buffer->line_base = buffer->next_line;
111 buffer->need_line = false;
112 s = buffer->next_line - 1;
114 if (!buffer->from_stage3)
116 const uchar *pbackslash = NULL;
118 /* Short circuit for the common case of an un-escaped line with
119 no trigraphs. The primary win here is by not writing any
120 data back to memory until we have to. */
121 for (;;)
123 c = *++s;
124 if (__builtin_expect (c == '\n', false)
125 || __builtin_expect (c == '\r', false))
127 d = (uchar *) s;
129 if (__builtin_expect (s == buffer->rlimit, false))
130 goto done;
132 /* DOS line ending? */
133 if (__builtin_expect (c == '\r', false)
134 && s[1] == '\n')
136 s++;
137 if (s == buffer->rlimit)
138 goto done;
141 if (__builtin_expect (pbackslash == NULL, true))
142 goto done;
144 /* Check for escaped newline. */
145 p = d;
146 while (is_nvspace (p[-1]))
147 p--;
148 if (p - 1 != pbackslash)
149 goto done;
151 /* Have an escaped newline; process it and proceed to
152 the slow path. */
153 add_line_note (buffer, p - 1, p != d ? ' ' : '\\');
154 d = p - 2;
155 buffer->next_line = p - 1;
156 break;
158 if (__builtin_expect (c == '\\', false))
159 pbackslash = s;
160 else if (__builtin_expect (c == '?', false)
161 && __builtin_expect (s[1] == '?', false)
162 && _cpp_trigraph_map[s[2]])
164 /* Have a trigraph. We may or may not have to convert
165 it. Add a line note regardless, for -Wtrigraphs. */
166 add_line_note (buffer, s, s[2]);
167 if (CPP_OPTION (pfile, trigraphs))
169 /* We do, and that means we have to switch to the
170 slow path. */
171 d = (uchar *) s;
172 *d = _cpp_trigraph_map[s[2]];
173 s += 2;
174 break;
180 for (;;)
182 c = *++s;
183 *++d = c;
185 if (c == '\n' || c == '\r')
187 /* Handle DOS line endings. */
188 if (c == '\r' && s != buffer->rlimit && s[1] == '\n')
189 s++;
190 if (s == buffer->rlimit)
191 break;
193 /* Escaped? */
194 p = d;
195 while (p != buffer->next_line && is_nvspace (p[-1]))
196 p--;
197 if (p == buffer->next_line || p[-1] != '\\')
198 break;
200 add_line_note (buffer, p - 1, p != d ? ' ': '\\');
201 d = p - 2;
202 buffer->next_line = p - 1;
204 else if (c == '?' && s[1] == '?' && _cpp_trigraph_map[s[2]])
206 /* Add a note regardless, for the benefit of -Wtrigraphs. */
207 add_line_note (buffer, d, s[2]);
208 if (CPP_OPTION (pfile, trigraphs))
210 *d = _cpp_trigraph_map[s[2]];
211 s += 2;
216 else
219 s++;
220 while (*s != '\n' && *s != '\r');
221 d = (uchar *) s;
223 /* Handle DOS line endings. */
224 if (*s == '\r' && s != buffer->rlimit && s[1] == '\n')
225 s++;
228 done:
229 *d = '\n';
230 /* A sentinel note that should never be processed. */
231 add_line_note (buffer, d + 1, '\n');
232 buffer->next_line = s + 1;
235 /* Return true if the trigraph indicated by NOTE should be warned
236 about in a comment. */
237 static bool
238 warn_in_comment (cpp_reader *pfile, _cpp_line_note *note)
240 const uchar *p;
242 /* Within comments we don't warn about trigraphs, unless the
243 trigraph forms an escaped newline, as that may change
244 behavior. */
245 if (note->type != '/')
246 return false;
248 /* If -trigraphs, then this was an escaped newline iff the next note
249 is coincident. */
250 if (CPP_OPTION (pfile, trigraphs))
251 return note[1].pos == note->pos;
253 /* Otherwise, see if this forms an escaped newline. */
254 p = note->pos + 3;
255 while (is_nvspace (*p))
256 p++;
258 /* There might have been escaped newlines between the trigraph and the
259 newline we found. Hence the position test. */
260 return (*p == '\n' && p < note[1].pos);
263 /* Process the notes created by add_line_note as far as the current
264 location. */
265 void
266 _cpp_process_line_notes (cpp_reader *pfile, int in_comment)
268 cpp_buffer *buffer = pfile->buffer;
270 for (;;)
272 _cpp_line_note *note = &buffer->notes[buffer->cur_note];
273 unsigned int col;
275 if (note->pos > buffer->cur)
276 break;
278 buffer->cur_note++;
279 col = CPP_BUF_COLUMN (buffer, note->pos + 1);
281 if (note->type == '\\' || note->type == ' ')
283 if (note->type == ' ' && !in_comment)
284 cpp_error_with_line (pfile, CPP_DL_WARNING, pfile->line_table->highest_line, col,
285 "backslash and newline separated by space");
287 if (buffer->next_line > buffer->rlimit)
289 cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line, col,
290 "backslash-newline at end of file");
291 /* Prevent "no newline at end of file" warning. */
292 buffer->next_line = buffer->rlimit;
295 buffer->line_base = note->pos;
296 CPP_INCREMENT_LINE (pfile, 0);
298 else if (_cpp_trigraph_map[note->type])
300 if (CPP_OPTION (pfile, warn_trigraphs)
301 && (!in_comment || warn_in_comment (pfile, note)))
303 if (CPP_OPTION (pfile, trigraphs))
304 cpp_error_with_line (pfile, CPP_DL_WARNING, pfile->line_table->highest_line, col,
305 "trigraph ??%c converted to %c",
306 note->type,
307 (int) _cpp_trigraph_map[note->type]);
308 else
310 cpp_error_with_line
311 (pfile, CPP_DL_WARNING, pfile->line_table->highest_line, col,
312 "trigraph ??%c ignored, use -trigraphs to enable",
313 note->type);
317 else
318 abort ();
322 /* Skip a C-style block comment. We find the end of the comment by
323 seeing if an asterisk is before every '/' we encounter. Returns
324 nonzero if comment terminated by EOF, zero otherwise.
326 Buffer->cur points to the initial asterisk of the comment. */
327 bool
328 _cpp_skip_block_comment (cpp_reader *pfile)
330 cpp_buffer *buffer = pfile->buffer;
331 const uchar *cur = buffer->cur;
332 uchar c;
334 cur++;
335 if (*cur == '/')
336 cur++;
338 for (;;)
340 /* People like decorating comments with '*', so check for '/'
341 instead for efficiency. */
342 c = *cur++;
344 if (c == '/')
346 if (cur[-2] == '*')
347 break;
349 /* Warn about potential nested comments, but not if the '/'
350 comes immediately before the true comment delimiter.
351 Don't bother to get it right across escaped newlines. */
352 if (CPP_OPTION (pfile, warn_comments)
353 && cur[0] == '*' && cur[1] != '/')
355 buffer->cur = cur;
356 cpp_error_with_line (pfile, CPP_DL_WARNING,
357 pfile->line_table->highest_line, CPP_BUF_COL (buffer),
358 "\"/*\" within comment");
361 else if (c == '\n')
363 unsigned int cols;
364 buffer->cur = cur - 1;
365 _cpp_process_line_notes (pfile, true);
366 if (buffer->next_line >= buffer->rlimit)
367 return true;
368 _cpp_clean_line (pfile);
370 cols = buffer->next_line - buffer->line_base;
371 CPP_INCREMENT_LINE (pfile, cols);
373 cur = buffer->cur;
377 buffer->cur = cur;
378 _cpp_process_line_notes (pfile, true);
379 return false;
382 /* Skip a C++ line comment, leaving buffer->cur pointing to the
383 terminating newline. Handles escaped newlines. Returns nonzero
384 if a multiline comment. */
385 static int
386 skip_line_comment (cpp_reader *pfile)
388 cpp_buffer *buffer = pfile->buffer;
389 source_location orig_line = pfile->line_table->highest_line;
391 while (*buffer->cur != '\n')
392 buffer->cur++;
394 _cpp_process_line_notes (pfile, true);
395 return orig_line != pfile->line_table->highest_line;
398 /* Skips whitespace, saving the next non-whitespace character. */
399 static void
400 skip_whitespace (cpp_reader *pfile, cppchar_t c)
402 cpp_buffer *buffer = pfile->buffer;
403 bool saw_NUL = false;
407 /* Horizontal space always OK. */
408 if (c == ' ' || c == '\t')
410 /* Just \f \v or \0 left. */
411 else if (c == '\0')
412 saw_NUL = true;
413 else if (pfile->state.in_directive && CPP_PEDANTIC (pfile))
414 cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line,
415 CPP_BUF_COL (buffer),
416 "%s in preprocessing directive",
417 c == '\f' ? "form feed" : "vertical tab");
419 c = *buffer->cur++;
421 /* We only want non-vertical space, i.e. ' ' \t \f \v \0. */
422 while (is_nvspace (c));
424 if (saw_NUL)
425 cpp_error (pfile, CPP_DL_WARNING, "null character(s) ignored");
427 buffer->cur--;
430 /* See if the characters of a number token are valid in a name (no
431 '.', '+' or '-'). */
432 static int
433 name_p (cpp_reader *pfile, const cpp_string *string)
435 unsigned int i;
437 for (i = 0; i < string->len; i++)
438 if (!is_idchar (string->text[i]))
439 return 0;
441 return 1;
444 /* After parsing an identifier or other sequence, produce a warning about
445 sequences not in NFC/NFKC. */
446 static void
447 warn_about_normalization (cpp_reader *pfile,
448 const cpp_token *token,
449 const struct normalize_state *s)
451 if (CPP_OPTION (pfile, warn_normalize) < NORMALIZE_STATE_RESULT (s)
452 && !pfile->state.skipping)
454 /* Make sure that the token is printed using UCNs, even
455 if we'd otherwise happily print UTF-8. */
456 unsigned char *buf = XNEWVEC (unsigned char, cpp_token_len (token));
457 size_t sz;
459 sz = cpp_spell_token (pfile, token, buf, false) - buf;
460 if (NORMALIZE_STATE_RESULT (s) == normalized_C)
461 cpp_error_with_line (pfile, CPP_DL_WARNING, token->src_loc, 0,
462 "`%.*s' is not in NFKC", (int) sz, buf);
463 else
464 cpp_error_with_line (pfile, CPP_DL_WARNING, token->src_loc, 0,
465 "`%.*s' is not in NFC", (int) sz, buf);
469 /* Returns TRUE if the sequence starting at buffer->cur is invalid in
470 an identifier. FIRST is TRUE if this starts an identifier. */
471 static bool
472 forms_identifier_p (cpp_reader *pfile, int first,
473 struct normalize_state *state)
475 cpp_buffer *buffer = pfile->buffer;
477 if (*buffer->cur == '$')
479 if (!CPP_OPTION (pfile, dollars_in_ident))
480 return false;
482 buffer->cur++;
483 if (CPP_OPTION (pfile, warn_dollars) && !pfile->state.skipping)
485 CPP_OPTION (pfile, warn_dollars) = 0;
486 cpp_error (pfile, CPP_DL_PEDWARN, "'$' in identifier or number");
489 return true;
492 /* Is this a syntactically valid UCN? */
493 if (CPP_OPTION (pfile, extended_identifiers)
494 && *buffer->cur == '\\'
495 && (buffer->cur[1] == 'u' || buffer->cur[1] == 'U'))
497 buffer->cur += 2;
498 if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
499 state))
500 return true;
501 buffer->cur -= 2;
504 return false;
507 /* Lex an identifier starting at BUFFER->CUR - 1. */
508 static cpp_hashnode *
509 lex_identifier (cpp_reader *pfile, const uchar *base, bool starts_ucn,
510 struct normalize_state *nst)
512 cpp_hashnode *result;
513 const uchar *cur;
514 unsigned int len;
515 unsigned int hash = HT_HASHSTEP (0, *base);
517 cur = pfile->buffer->cur;
518 if (! starts_ucn)
519 while (ISIDNUM (*cur))
521 hash = HT_HASHSTEP (hash, *cur);
522 cur++;
524 pfile->buffer->cur = cur;
525 if (starts_ucn || forms_identifier_p (pfile, false, nst))
527 /* Slower version for identifiers containing UCNs (or $). */
528 do {
529 while (ISIDNUM (*pfile->buffer->cur))
531 pfile->buffer->cur++;
532 NORMALIZE_STATE_UPDATE_IDNUM (nst);
534 } while (forms_identifier_p (pfile, false, nst));
535 result = _cpp_interpret_identifier (pfile, base,
536 pfile->buffer->cur - base);
538 else
540 len = cur - base;
541 hash = HT_HASHFINISH (hash, len);
543 result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
544 base, len, hash, HT_ALLOC));
547 /* Rarely, identifiers require diagnostics when lexed. */
548 if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
549 && !pfile->state.skipping, 0))
551 /* It is allowed to poison the same identifier twice. */
552 if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
553 cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
554 NODE_NAME (result));
556 /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
557 replacement list of a variadic macro. */
558 if (result == pfile->spec_nodes.n__VA_ARGS__
559 && !pfile->state.va_args_ok)
560 cpp_error (pfile, CPP_DL_PEDWARN,
561 "__VA_ARGS__ can only appear in the expansion"
562 " of a C99 variadic macro");
564 /* For -Wc++-compat, warn about use of C++ named operators. */
565 if (result->flags & NODE_WARN_OPERATOR)
566 cpp_error (pfile, CPP_DL_WARNING,
567 "identifier \"%s\" is a special operator name in C++",
568 NODE_NAME (result));
571 return result;
574 /* Lex a number to NUMBER starting at BUFFER->CUR - 1. */
575 static void
576 lex_number (cpp_reader *pfile, cpp_string *number,
577 struct normalize_state *nst)
579 const uchar *cur;
580 const uchar *base;
581 uchar *dest;
583 base = pfile->buffer->cur - 1;
586 cur = pfile->buffer->cur;
588 /* N.B. ISIDNUM does not include $. */
589 while (ISIDNUM (*cur) || *cur == '.' || VALID_SIGN (*cur, cur[-1]))
591 cur++;
592 NORMALIZE_STATE_UPDATE_IDNUM (nst);
595 pfile->buffer->cur = cur;
597 while (forms_identifier_p (pfile, false, nst));
599 number->len = cur - base;
600 dest = _cpp_unaligned_alloc (pfile, number->len + 1);
601 memcpy (dest, base, number->len);
602 dest[number->len] = '\0';
603 number->text = dest;
606 /* Create a token of type TYPE with a literal spelling. */
607 static void
608 create_literal (cpp_reader *pfile, cpp_token *token, const uchar *base,
609 unsigned int len, enum cpp_ttype type)
611 uchar *dest = _cpp_unaligned_alloc (pfile, len + 1);
613 memcpy (dest, base, len);
614 dest[len] = '\0';
615 token->type = type;
616 token->val.str.len = len;
617 token->val.str.text = dest;
620 /* Lexes a raw string. The stored string contains the spelling, including
621 double quotes, delimiter string, '[' and ']', any leading
622 'L', 'u', 'U' or 'u8' and 'R' modifier. It returns the type of the
623 literal, or CPP_OTHER if it was not properly terminated.
625 The spelling is NUL-terminated, but it is not guaranteed that this
626 is the first NUL since embedded NULs are preserved. */
628 static void
629 lex_raw_string (cpp_reader *pfile, cpp_token *token, const uchar *base,
630 const uchar *cur)
632 source_location saw_NUL = 0;
633 const uchar *raw_prefix;
634 unsigned int raw_prefix_len = 0;
635 enum cpp_ttype type;
636 size_t total_len = 0;
637 _cpp_buff *first_buff = NULL, *last_buff = NULL;
639 type = (*base == 'L' ? CPP_WSTRING :
640 *base == 'U' ? CPP_STRING32 :
641 *base == 'u' ? (base[1] == '8' ? CPP_UTF8STRING : CPP_STRING16)
642 : CPP_STRING);
644 raw_prefix = cur + 1;
645 while (raw_prefix_len < 16)
647 switch (raw_prefix[raw_prefix_len])
649 case ' ': case '[': case ']': case '\t':
650 case '\v': case '\f': case '\n': default:
651 break;
652 /* Basic source charset except the above chars. */
653 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
654 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
655 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
656 case 's': case 't': case 'u': case 'v': case 'w': case 'x':
657 case 'y': case 'z':
658 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
659 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
660 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
661 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
662 case 'Y': case 'Z':
663 case '0': case '1': case '2': case '3': case '4': case '5':
664 case '6': case '7': case '8': case '9':
665 case '_': case '{': case '}': case '#': case '(': case ')':
666 case '<': case '>': case '%': case ':': case ';': case '.':
667 case '?': case '*': case '+': case '-': case '/': case '^':
668 case '&': case '|': case '~': case '!': case '=': case ',':
669 case '\\': case '"': case '\'':
670 raw_prefix_len++;
671 continue;
673 break;
676 if (raw_prefix[raw_prefix_len] != '[')
678 int col = CPP_BUF_COLUMN (pfile->buffer, raw_prefix + raw_prefix_len)
679 + 1;
680 if (raw_prefix_len == 16)
681 cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, col,
682 "raw string delimiter longer than 16 characters");
683 else
684 cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, col,
685 "invalid character '%c' in raw string delimiter",
686 (int) raw_prefix[raw_prefix_len]);
687 pfile->buffer->cur = raw_prefix - 1;
688 create_literal (pfile, token, base, raw_prefix - 1 - base, CPP_OTHER);
689 return;
692 cur = raw_prefix + raw_prefix_len + 1;
693 for (;;)
695 cppchar_t c = *cur++;
697 if (c == ']'
698 && strncmp ((const char *) cur, (const char *) raw_prefix,
699 raw_prefix_len) == 0
700 && cur[raw_prefix_len] == '"')
702 cur += raw_prefix_len + 1;
703 break;
705 else if (c == '\n')
707 if (pfile->state.in_directive
708 || pfile->state.parsing_args
709 || pfile->state.in_deferred_pragma)
711 cur--;
712 type = CPP_OTHER;
713 cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, 0,
714 "unterminated raw string");
715 break;
718 /* raw strings allow embedded non-escaped newlines, which
719 complicates this routine a lot. */
720 if (first_buff == NULL)
722 total_len = cur - base;
723 first_buff = last_buff = _cpp_get_buff (pfile, total_len);
724 memcpy (BUFF_FRONT (last_buff), base, total_len);
725 raw_prefix = BUFF_FRONT (last_buff) + (raw_prefix - base);
726 BUFF_FRONT (last_buff) += total_len;
728 else
730 size_t len = cur - base;
731 size_t cur_len = len > BUFF_ROOM (last_buff)
732 ? BUFF_ROOM (last_buff) : len;
734 total_len += len;
735 memcpy (BUFF_FRONT (last_buff), base, cur_len);
736 BUFF_FRONT (last_buff) += cur_len;
737 if (len > cur_len)
739 last_buff = _cpp_append_extend_buff (pfile, last_buff,
740 len - cur_len);
741 memcpy (BUFF_FRONT (last_buff), base + cur_len,
742 len - cur_len);
743 BUFF_FRONT (last_buff) += len - cur_len;
747 if (pfile->buffer->cur < pfile->buffer->rlimit)
748 CPP_INCREMENT_LINE (pfile, 0);
749 pfile->buffer->need_line = true;
751 if (!_cpp_get_fresh_line (pfile))
753 source_location src_loc = token->src_loc;
754 token->type = CPP_EOF;
755 /* Tell the compiler the line number of the EOF token. */
756 token->src_loc = pfile->line_table->highest_line;
757 token->flags = BOL;
758 if (first_buff != NULL)
759 _cpp_release_buff (pfile, first_buff);
760 cpp_error_with_line (pfile, CPP_DL_ERROR, src_loc, 0,
761 "unterminated raw string");
762 return;
765 cur = base = pfile->buffer->cur;
767 else if (c == '\0' && !saw_NUL)
768 LINEMAP_POSITION_FOR_COLUMN (saw_NUL, pfile->line_table,
769 CPP_BUF_COLUMN (pfile->buffer, cur));
772 if (saw_NUL && !pfile->state.skipping)
773 cpp_error_with_line (pfile, CPP_DL_WARNING, saw_NUL, 0,
774 "null character(s) preserved in literal");
776 pfile->buffer->cur = cur;
777 if (first_buff == NULL)
778 create_literal (pfile, token, base, cur - base, type);
779 else
781 uchar *dest = _cpp_unaligned_alloc (pfile, total_len + (cur - base) + 1);
783 token->type = type;
784 token->val.str.len = total_len + (cur - base);
785 token->val.str.text = dest;
786 last_buff = first_buff;
787 while (last_buff != NULL)
789 memcpy (dest, last_buff->base,
790 BUFF_FRONT (last_buff) - last_buff->base);
791 dest += BUFF_FRONT (last_buff) - last_buff->base;
792 last_buff = last_buff->next;
794 _cpp_release_buff (pfile, first_buff);
795 memcpy (dest, base, cur - base);
796 dest[cur - base] = '\0';
800 /* Lexes a string, character constant, or angle-bracketed header file
801 name. The stored string contains the spelling, including opening
802 quote and any leading 'L', 'u', 'U' or 'u8' and optional
803 'R' modifier. It returns the type of the literal, or CPP_OTHER
804 if it was not properly terminated, or CPP_LESS for an unterminated
805 header name which must be relexed as normal tokens.
807 The spelling is NUL-terminated, but it is not guaranteed that this
808 is the first NUL since embedded NULs are preserved. */
809 static void
810 lex_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
812 bool saw_NUL = false;
813 const uchar *cur;
814 cppchar_t terminator;
815 enum cpp_ttype type;
817 cur = base;
818 terminator = *cur++;
819 if (terminator == 'L' || terminator == 'U')
820 terminator = *cur++;
821 else if (terminator == 'u')
823 terminator = *cur++;
824 if (terminator == '8')
825 terminator = *cur++;
827 if (terminator == 'R')
829 lex_raw_string (pfile, token, base, cur);
830 return;
832 if (terminator == '"')
833 type = (*base == 'L' ? CPP_WSTRING :
834 *base == 'U' ? CPP_STRING32 :
835 *base == 'u' ? (base[1] == '8' ? CPP_UTF8STRING : CPP_STRING16)
836 : CPP_STRING);
837 else if (terminator == '\'')
838 type = (*base == 'L' ? CPP_WCHAR :
839 *base == 'U' ? CPP_CHAR32 :
840 *base == 'u' ? CPP_CHAR16 : CPP_CHAR);
841 else
842 terminator = '>', type = CPP_HEADER_NAME;
844 for (;;)
846 cppchar_t c = *cur++;
848 /* In #include-style directives, terminators are not escapable. */
849 if (c == '\\' && !pfile->state.angled_headers && *cur != '\n')
850 cur++;
851 else if (c == terminator)
852 break;
853 else if (c == '\n')
855 cur--;
856 /* Unmatched quotes always yield undefined behavior, but
857 greedy lexing means that what appears to be an unterminated
858 header name may actually be a legitimate sequence of tokens. */
859 if (terminator == '>')
861 token->type = CPP_LESS;
862 return;
864 type = CPP_OTHER;
865 break;
867 else if (c == '\0')
868 saw_NUL = true;
871 if (saw_NUL && !pfile->state.skipping)
872 cpp_error (pfile, CPP_DL_WARNING,
873 "null character(s) preserved in literal");
875 if (type == CPP_OTHER && CPP_OPTION (pfile, lang) != CLK_ASM)
876 cpp_error (pfile, CPP_DL_PEDWARN, "missing terminating %c character",
877 (int) terminator);
879 pfile->buffer->cur = cur;
880 create_literal (pfile, token, base, cur - base, type);
883 /* Return the comment table. The client may not make any assumption
884 about the ordering of the table. */
885 cpp_comment_table *
886 cpp_get_comments (cpp_reader *pfile)
888 return &pfile->comments;
891 /* Append a comment to the end of the comment table. */
892 static void
893 store_comment (cpp_reader *pfile, cpp_token *token)
895 int len;
897 if (pfile->comments.allocated == 0)
899 pfile->comments.allocated = 256;
900 pfile->comments.entries = (cpp_comment *) xmalloc
901 (pfile->comments.allocated * sizeof (cpp_comment));
904 if (pfile->comments.count == pfile->comments.allocated)
906 pfile->comments.allocated *= 2;
907 pfile->comments.entries = (cpp_comment *) xrealloc
908 (pfile->comments.entries,
909 pfile->comments.allocated * sizeof (cpp_comment));
912 len = token->val.str.len;
914 /* Copy comment. Note, token may not be NULL terminated. */
915 pfile->comments.entries[pfile->comments.count].comment =
916 (char *) xmalloc (sizeof (char) * (len + 1));
917 memcpy (pfile->comments.entries[pfile->comments.count].comment,
918 token->val.str.text, len);
919 pfile->comments.entries[pfile->comments.count].comment[len] = '\0';
921 /* Set source location. */
922 pfile->comments.entries[pfile->comments.count].sloc = token->src_loc;
924 /* Increment the count of entries in the comment table. */
925 pfile->comments.count++;
928 /* The stored comment includes the comment start and any terminator. */
929 static void
930 save_comment (cpp_reader *pfile, cpp_token *token, const unsigned char *from,
931 cppchar_t type)
933 unsigned char *buffer;
934 unsigned int len, clen;
936 len = pfile->buffer->cur - from + 1; /* + 1 for the initial '/'. */
938 /* C++ comments probably (not definitely) have moved past a new
939 line, which we don't want to save in the comment. */
940 if (is_vspace (pfile->buffer->cur[-1]))
941 len--;
943 /* If we are currently in a directive, then we need to store all
944 C++ comments as C comments internally, and so we need to
945 allocate a little extra space in that case.
947 Note that the only time we encounter a directive here is
948 when we are saving comments in a "#define". */
949 clen = (pfile->state.in_directive && type == '/') ? len + 2 : len;
951 buffer = _cpp_unaligned_alloc (pfile, clen);
953 token->type = CPP_COMMENT;
954 token->val.str.len = clen;
955 token->val.str.text = buffer;
957 buffer[0] = '/';
958 memcpy (buffer + 1, from, len - 1);
960 /* Finish conversion to a C comment, if necessary. */
961 if (pfile->state.in_directive && type == '/')
963 buffer[1] = '*';
964 buffer[clen - 2] = '*';
965 buffer[clen - 1] = '/';
968 /* Finally store this comment for use by clients of libcpp. */
969 store_comment (pfile, token);
972 /* Allocate COUNT tokens for RUN. */
973 void
974 _cpp_init_tokenrun (tokenrun *run, unsigned int count)
976 run->base = XNEWVEC (cpp_token, count);
977 run->limit = run->base + count;
978 run->next = NULL;
981 /* Returns the next tokenrun, or creates one if there is none. */
982 static tokenrun *
983 next_tokenrun (tokenrun *run)
985 if (run->next == NULL)
987 run->next = XNEW (tokenrun);
988 run->next->prev = run;
989 _cpp_init_tokenrun (run->next, 250);
992 return run->next;
995 /* Look ahead in the input stream. */
996 const cpp_token *
997 cpp_peek_token (cpp_reader *pfile, int index)
999 cpp_context *context = pfile->context;
1000 const cpp_token *peektok;
1001 int count;
1003 /* First, scan through any pending cpp_context objects. */
1004 while (context->prev)
1006 ptrdiff_t sz = (context->direct_p
1007 ? LAST (context).token - FIRST (context).token
1008 : LAST (context).ptoken - FIRST (context).ptoken);
1010 if (index < (int) sz)
1011 return (context->direct_p
1012 ? FIRST (context).token + index
1013 : *(FIRST (context).ptoken + index));
1015 index -= (int) sz;
1016 context = context->prev;
1019 /* We will have to read some new tokens after all (and do so
1020 without invalidating preceding tokens). */
1021 count = index;
1022 pfile->keep_tokens++;
1026 peektok = _cpp_lex_token (pfile);
1027 if (peektok->type == CPP_EOF)
1028 return peektok;
1030 while (index--);
1032 _cpp_backup_tokens_direct (pfile, count + 1);
1033 pfile->keep_tokens--;
1035 return peektok;
1038 /* Allocate a single token that is invalidated at the same time as the
1039 rest of the tokens on the line. Has its line and col set to the
1040 same as the last lexed token, so that diagnostics appear in the
1041 right place. */
1042 cpp_token *
1043 _cpp_temp_token (cpp_reader *pfile)
1045 cpp_token *old, *result;
1046 ptrdiff_t sz = pfile->cur_run->limit - pfile->cur_token;
1047 ptrdiff_t la = (ptrdiff_t) pfile->lookaheads;
1049 old = pfile->cur_token - 1;
1050 /* Any pre-existing lookaheads must not be clobbered. */
1051 if (la)
1053 if (sz <= la)
1055 tokenrun *next = next_tokenrun (pfile->cur_run);
1057 if (sz < la)
1058 memmove (next->base + 1, next->base,
1059 (la - sz) * sizeof (cpp_token));
1061 next->base[0] = pfile->cur_run->limit[-1];
1064 if (sz > 1)
1065 memmove (pfile->cur_token + 1, pfile->cur_token,
1066 MIN (la, sz - 1) * sizeof (cpp_token));
1069 if (!sz && pfile->cur_token == pfile->cur_run->limit)
1071 pfile->cur_run = next_tokenrun (pfile->cur_run);
1072 pfile->cur_token = pfile->cur_run->base;
1075 result = pfile->cur_token++;
1076 result->src_loc = old->src_loc;
1077 return result;
1080 /* Lex a token into RESULT (external interface). Takes care of issues
1081 like directive handling, token lookahead, multiple include
1082 optimization and skipping. */
1083 const cpp_token *
1084 _cpp_lex_token (cpp_reader *pfile)
1086 cpp_token *result;
1088 for (;;)
1090 if (pfile->cur_token == pfile->cur_run->limit)
1092 pfile->cur_run = next_tokenrun (pfile->cur_run);
1093 pfile->cur_token = pfile->cur_run->base;
1095 /* We assume that the current token is somewhere in the current
1096 run. */
1097 if (pfile->cur_token < pfile->cur_run->base
1098 || pfile->cur_token >= pfile->cur_run->limit)
1099 abort ();
1101 if (pfile->lookaheads)
1103 pfile->lookaheads--;
1104 result = pfile->cur_token++;
1106 else
1107 result = _cpp_lex_direct (pfile);
1109 if (result->flags & BOL)
1111 /* Is this a directive. If _cpp_handle_directive returns
1112 false, it is an assembler #. */
1113 if (result->type == CPP_HASH
1114 /* 6.10.3 p 11: Directives in a list of macro arguments
1115 gives undefined behavior. This implementation
1116 handles the directive as normal. */
1117 && pfile->state.parsing_args != 1)
1119 if (_cpp_handle_directive (pfile, result->flags & PREV_WHITE))
1121 if (pfile->directive_result.type == CPP_PADDING)
1122 continue;
1123 result = &pfile->directive_result;
1126 else if (pfile->state.in_deferred_pragma)
1127 result = &pfile->directive_result;
1129 if (pfile->cb.line_change && !pfile->state.skipping)
1130 pfile->cb.line_change (pfile, result, pfile->state.parsing_args);
1133 /* We don't skip tokens in directives. */
1134 if (pfile->state.in_directive || pfile->state.in_deferred_pragma)
1135 break;
1137 /* Outside a directive, invalidate controlling macros. At file
1138 EOF, _cpp_lex_direct takes care of popping the buffer, so we never
1139 get here and MI optimization works. */
1140 pfile->mi_valid = false;
1142 if (!pfile->state.skipping || result->type == CPP_EOF)
1143 break;
1146 return result;
1149 /* Returns true if a fresh line has been loaded. */
1150 bool
1151 _cpp_get_fresh_line (cpp_reader *pfile)
1153 int return_at_eof;
1155 /* We can't get a new line until we leave the current directive. */
1156 if (pfile->state.in_directive)
1157 return false;
1159 for (;;)
1161 cpp_buffer *buffer = pfile->buffer;
1163 if (!buffer->need_line)
1164 return true;
1166 if (buffer->next_line < buffer->rlimit)
1168 _cpp_clean_line (pfile);
1169 return true;
1172 /* First, get out of parsing arguments state. */
1173 if (pfile->state.parsing_args)
1174 return false;
1176 /* End of buffer. Non-empty files should end in a newline. */
1177 if (buffer->buf != buffer->rlimit
1178 && buffer->next_line > buffer->rlimit
1179 && !buffer->from_stage3)
1181 /* Clip to buffer size. */
1182 buffer->next_line = buffer->rlimit;
1185 return_at_eof = buffer->return_at_eof;
1186 _cpp_pop_buffer (pfile);
1187 if (pfile->buffer == NULL || return_at_eof)
1188 return false;
/* Helper for _cpp_lex_direct; relies on the locals `result' and
   `buffer' of the expansion site.  Sets result->type to ELSE_TYPE,
   then, if the next unread character equals CHAR, consumes it and
   sets result->type to THEN_TYPE instead.  */
#define IF_NEXT_IS(CHAR, THEN_TYPE, ELSE_TYPE)  \
  do                                            \
    {                                           \
      result->type = ELSE_TYPE;                 \
      if (*buffer->cur == CHAR)                 \
        {                                       \
          buffer->cur++;                        \
          result->type = THEN_TYPE;             \
        }                                       \
    }                                           \
  while (0)
1201 /* Lex a token into pfile->cur_token, which is also incremented, to
1202 get diagnostics pointing to the correct location.
1204 Does not handle issues such as token lookahead, multiple-include
1205 optimization, directives, skipping etc. This function is only
1206 suitable for use by _cpp_lex_token, and in special cases like
1207 lex_expansion_token which doesn't care for any of these issues.
1209 When meeting a newline, returns CPP_EOF if parsing a directive,
1210 otherwise returns to the start of the token buffer if permissible.
1211 Returns the location of the lexed token. */
1212 cpp_token *
1213 _cpp_lex_direct (cpp_reader *pfile)
1215 cppchar_t c;
1216 cpp_buffer *buffer;
1217 const unsigned char *comment_start;
1218 cpp_token *result = pfile->cur_token++;
1220 fresh_line:
1221 result->flags = 0;
1222 buffer = pfile->buffer;
1223 if (buffer->need_line)
1225 if (pfile->state.in_deferred_pragma)
1227 result->type = CPP_PRAGMA_EOL;
1228 pfile->state.in_deferred_pragma = false;
1229 if (!pfile->state.pragma_allow_expansion)
1230 pfile->state.prevent_expansion--;
1231 return result;
1233 if (!_cpp_get_fresh_line (pfile))
1235 result->type = CPP_EOF;
1236 if (!pfile->state.in_directive)
1238 /* Tell the compiler the line number of the EOF token. */
1239 result->src_loc = pfile->line_table->highest_line;
1240 result->flags = BOL;
1242 return result;
1244 if (!pfile->keep_tokens)
1246 pfile->cur_run = &pfile->base_run;
1247 result = pfile->base_run.base;
1248 pfile->cur_token = result + 1;
1250 result->flags = BOL;
1251 if (pfile->state.parsing_args == 2)
1252 result->flags |= PREV_WHITE;
1254 buffer = pfile->buffer;
1255 update_tokens_line:
1256 result->src_loc = pfile->line_table->highest_line;
1258 skipped_white:
1259 if (buffer->cur >= buffer->notes[buffer->cur_note].pos
1260 && !pfile->overlaid_buffer)
1262 _cpp_process_line_notes (pfile, false);
1263 result->src_loc = pfile->line_table->highest_line;
1265 c = *buffer->cur++;
1267 LINEMAP_POSITION_FOR_COLUMN (result->src_loc, pfile->line_table,
1268 CPP_BUF_COLUMN (buffer, buffer->cur));
1270 switch (c)
1272 case ' ': case '\t': case '\f': case '\v': case '\0':
1273 result->flags |= PREV_WHITE;
1274 skip_whitespace (pfile, c);
1275 goto skipped_white;
1277 case '\n':
1278 if (buffer->cur < buffer->rlimit)
1279 CPP_INCREMENT_LINE (pfile, 0);
1280 buffer->need_line = true;
1281 goto fresh_line;
1283 case '0': case '1': case '2': case '3': case '4':
1284 case '5': case '6': case '7': case '8': case '9':
1286 struct normalize_state nst = INITIAL_NORMALIZE_STATE;
1287 result->type = CPP_NUMBER;
1288 lex_number (pfile, &result->val.str, &nst);
1289 warn_about_normalization (pfile, result, &nst);
1290 break;
1293 case 'L':
1294 case 'u':
1295 case 'U':
1296 case 'R':
1297 /* 'L', 'u', 'U', 'u8' or 'R' may introduce wide characters,
1298 wide strings or raw strings. */
1299 if (c == 'L' || CPP_OPTION (pfile, uliterals))
1301 if ((*buffer->cur == '\'' && c != 'R')
1302 || *buffer->cur == '"'
1303 || (*buffer->cur == 'R'
1304 && c != 'R'
1305 && buffer->cur[1] == '"'
1306 && CPP_OPTION (pfile, uliterals))
1307 || (*buffer->cur == '8'
1308 && c == 'u'
1309 && (buffer->cur[1] == '"'
1310 || (buffer->cur[1] == 'R' && buffer->cur[2] == '"'))))
1312 lex_string (pfile, result, buffer->cur - 1);
1313 break;
1316 /* Fall through. */
1318 case '_':
1319 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
1320 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
1321 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
1322 case 's': case 't': case 'v': case 'w': case 'x':
1323 case 'y': case 'z':
1324 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
1325 case 'G': case 'H': case 'I': case 'J': case 'K':
1326 case 'M': case 'N': case 'O': case 'P': case 'Q':
1327 case 'S': case 'T': case 'V': case 'W': case 'X':
1328 case 'Y': case 'Z':
1329 result->type = CPP_NAME;
1331 struct normalize_state nst = INITIAL_NORMALIZE_STATE;
1332 result->val.node.node = lex_identifier (pfile, buffer->cur - 1, false,
1333 &nst);
1334 warn_about_normalization (pfile, result, &nst);
1337 /* Convert named operators to their proper types. */
1338 if (result->val.node.node->flags & NODE_OPERATOR)
1340 result->flags |= NAMED_OP;
1341 result->type = (enum cpp_ttype) result->val.node.node->directive_index;
1343 break;
1345 case '\'':
1346 case '"':
1347 lex_string (pfile, result, buffer->cur - 1);
1348 break;
1350 case '/':
1351 /* A potential block or line comment. */
1352 comment_start = buffer->cur;
1353 c = *buffer->cur;
1355 if (c == '*')
1357 if (_cpp_skip_block_comment (pfile))
1358 cpp_error (pfile, CPP_DL_ERROR, "unterminated comment");
1360 else if (c == '/' && (CPP_OPTION (pfile, cplusplus_comments)
1361 || cpp_in_system_header (pfile)))
1363 /* Warn about comments only if pedantically GNUC89, and not
1364 in system headers. */
1365 if (CPP_OPTION (pfile, lang) == CLK_GNUC89 && CPP_PEDANTIC (pfile)
1366 && ! buffer->warned_cplusplus_comments)
1368 cpp_error (pfile, CPP_DL_PEDWARN,
1369 "C++ style comments are not allowed in ISO C90");
1370 cpp_error (pfile, CPP_DL_PEDWARN,
1371 "(this will be reported only once per input file)");
1372 buffer->warned_cplusplus_comments = 1;
1375 if (skip_line_comment (pfile) && CPP_OPTION (pfile, warn_comments))
1376 cpp_error (pfile, CPP_DL_WARNING, "multi-line comment");
1378 else if (c == '=')
1380 buffer->cur++;
1381 result->type = CPP_DIV_EQ;
1382 break;
1384 else
1386 result->type = CPP_DIV;
1387 break;
1390 if (!pfile->state.save_comments)
1392 result->flags |= PREV_WHITE;
1393 goto update_tokens_line;
1396 /* Save the comment as a token in its own right. */
1397 save_comment (pfile, result, comment_start, c);
1398 break;
1400 case '<':
1401 if (pfile->state.angled_headers)
1403 lex_string (pfile, result, buffer->cur - 1);
1404 if (result->type != CPP_LESS)
1405 break;
1408 result->type = CPP_LESS;
1409 if (*buffer->cur == '=')
1410 buffer->cur++, result->type = CPP_LESS_EQ;
1411 else if (*buffer->cur == '<')
1413 buffer->cur++;
1414 IF_NEXT_IS ('=', CPP_LSHIFT_EQ, CPP_LSHIFT);
1416 else if (CPP_OPTION (pfile, digraphs))
1418 if (*buffer->cur == ':')
1420 buffer->cur++;
1421 result->flags |= DIGRAPH;
1422 result->type = CPP_OPEN_SQUARE;
1424 else if (*buffer->cur == '%')
1426 buffer->cur++;
1427 result->flags |= DIGRAPH;
1428 result->type = CPP_OPEN_BRACE;
1431 break;
1433 case '>':
1434 result->type = CPP_GREATER;
1435 if (*buffer->cur == '=')
1436 buffer->cur++, result->type = CPP_GREATER_EQ;
1437 else if (*buffer->cur == '>')
1439 buffer->cur++;
1440 IF_NEXT_IS ('=', CPP_RSHIFT_EQ, CPP_RSHIFT);
1442 break;
1444 case '%':
1445 result->type = CPP_MOD;
1446 if (*buffer->cur == '=')
1447 buffer->cur++, result->type = CPP_MOD_EQ;
1448 else if (CPP_OPTION (pfile, digraphs))
1450 if (*buffer->cur == ':')
1452 buffer->cur++;
1453 result->flags |= DIGRAPH;
1454 result->type = CPP_HASH;
1455 if (*buffer->cur == '%' && buffer->cur[1] == ':')
1456 buffer->cur += 2, result->type = CPP_PASTE, result->val.token_no = 0;
1458 else if (*buffer->cur == '>')
1460 buffer->cur++;
1461 result->flags |= DIGRAPH;
1462 result->type = CPP_CLOSE_BRACE;
1465 break;
1467 case '.':
1468 result->type = CPP_DOT;
1469 if (ISDIGIT (*buffer->cur))
1471 struct normalize_state nst = INITIAL_NORMALIZE_STATE;
1472 result->type = CPP_NUMBER;
1473 lex_number (pfile, &result->val.str, &nst);
1474 warn_about_normalization (pfile, result, &nst);
1476 else if (*buffer->cur == '.' && buffer->cur[1] == '.')
1477 buffer->cur += 2, result->type = CPP_ELLIPSIS;
1478 else if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
1479 buffer->cur++, result->type = CPP_DOT_STAR;
1480 break;
1482 case '+':
1483 result->type = CPP_PLUS;
1484 if (*buffer->cur == '+')
1485 buffer->cur++, result->type = CPP_PLUS_PLUS;
1486 else if (*buffer->cur == '=')
1487 buffer->cur++, result->type = CPP_PLUS_EQ;
1488 break;
1490 case '-':
1491 result->type = CPP_MINUS;
1492 if (*buffer->cur == '>')
1494 buffer->cur++;
1495 result->type = CPP_DEREF;
1496 if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
1497 buffer->cur++, result->type = CPP_DEREF_STAR;
1499 else if (*buffer->cur == '-')
1500 buffer->cur++, result->type = CPP_MINUS_MINUS;
1501 else if (*buffer->cur == '=')
1502 buffer->cur++, result->type = CPP_MINUS_EQ;
1503 break;
1505 case '&':
1506 result->type = CPP_AND;
1507 if (*buffer->cur == '&')
1508 buffer->cur++, result->type = CPP_AND_AND;
1509 else if (*buffer->cur == '=')
1510 buffer->cur++, result->type = CPP_AND_EQ;
1511 break;
1513 case '|':
1514 result->type = CPP_OR;
1515 if (*buffer->cur == '|')
1516 buffer->cur++, result->type = CPP_OR_OR;
1517 else if (*buffer->cur == '=')
1518 buffer->cur++, result->type = CPP_OR_EQ;
1519 break;
1521 case ':':
1522 result->type = CPP_COLON;
1523 if (*buffer->cur == ':' && CPP_OPTION (pfile, cplusplus))
1524 buffer->cur++, result->type = CPP_SCOPE;
1525 else if (*buffer->cur == '>' && CPP_OPTION (pfile, digraphs))
1527 buffer->cur++;
1528 result->flags |= DIGRAPH;
1529 result->type = CPP_CLOSE_SQUARE;
1531 break;
1533 case '*': IF_NEXT_IS ('=', CPP_MULT_EQ, CPP_MULT); break;
1534 case '=': IF_NEXT_IS ('=', CPP_EQ_EQ, CPP_EQ); break;
1535 case '!': IF_NEXT_IS ('=', CPP_NOT_EQ, CPP_NOT); break;
1536 case '^': IF_NEXT_IS ('=', CPP_XOR_EQ, CPP_XOR); break;
1537 case '#': IF_NEXT_IS ('#', CPP_PASTE, CPP_HASH); result->val.token_no = 0; break;
1539 case '?': result->type = CPP_QUERY; break;
1540 case '~': result->type = CPP_COMPL; break;
1541 case ',': result->type = CPP_COMMA; break;
1542 case '(': result->type = CPP_OPEN_PAREN; break;
1543 case ')': result->type = CPP_CLOSE_PAREN; break;
1544 case '[': result->type = CPP_OPEN_SQUARE; break;
1545 case ']': result->type = CPP_CLOSE_SQUARE; break;
1546 case '{': result->type = CPP_OPEN_BRACE; break;
1547 case '}': result->type = CPP_CLOSE_BRACE; break;
1548 case ';': result->type = CPP_SEMICOLON; break;
1550 /* @ is a punctuator in Objective-C. */
1551 case '@': result->type = CPP_ATSIGN; break;
1553 case '$':
1554 case '\\':
1556 const uchar *base = --buffer->cur;
1557 struct normalize_state nst = INITIAL_NORMALIZE_STATE;
1559 if (forms_identifier_p (pfile, true, &nst))
1561 result->type = CPP_NAME;
1562 result->val.node.node = lex_identifier (pfile, base, true, &nst);
1563 warn_about_normalization (pfile, result, &nst);
1564 break;
1566 buffer->cur++;
1569 default:
1570 create_literal (pfile, result, buffer->cur - 1, 1, CPP_OTHER);
1571 break;
1574 return result;
1577 /* An upper bound on the number of bytes needed to spell TOKEN.
1578 Does not include preceding whitespace. */
1579 unsigned int
1580 cpp_token_len (const cpp_token *token)
1582 unsigned int len;
1584 switch (TOKEN_SPELL (token))
1586 default: len = 6; break;
1587 case SPELL_LITERAL: len = token->val.str.len; break;
1588 case SPELL_IDENT: len = NODE_LEN (token->val.node.node) * 10; break;
1591 return len;
/* Decode the UTF-8 sequence starting at NAME and store the matching
   \UXXXXXXXX escape (lower-case hex digits) in BUFFER.  Exactly 10
   bytes are written to BUFFER, with no terminating nul.  Returns the
   number of bytes of NAME that were consumed.  Aborts if a
   continuation byte is ill-formed.  */

static size_t
utf8_to_ucn (unsigned char *buffer, const unsigned char *name)
{
  static const char hex_digits[] = "0123456789abcdef";
  unsigned lead = *name;
  unsigned long code_point;
  int seq_len = 0;
  int k;
  int shift;

  /* The number of leading one bits in the first byte is the length of
     the sequence.  */
  while (lead & 0x80)
    {
      seq_len++;
      lead <<= 1;
    }

  /* The payload of the first byte is whatever follows the length
     marker; every continuation byte contributes six more bits.  */
  code_point = name[0] & (0x7F >> seq_len);
  for (k = 1; k < seq_len; k++)
    {
      unsigned char cont = name[k];

      code_point = (code_point << 6) | (cont & 0x3F);

      /* Ill-formed UTF-8: a continuation byte must look like 10xxxxxx.  */
      if ((cont & ~0x3F) != 0x80)
        abort ();
    }

  *buffer++ = '\\';
  *buffer++ = 'U';
  for (shift = 28; shift >= 0; shift -= 4)
    *buffer++ = hex_digits[(code_point >> shift) & 0xF];

  return seq_len;
}
1628 /* Given a token TYPE corresponding to a digraph, return a pointer to
1629 the spelling of the digraph. */
1630 static const unsigned char *
1631 cpp_digraph2name (enum cpp_ttype type)
1633 return digraph_spellings[(int) type - (int) CPP_FIRST_DIGRAPH];
1636 /* Write the spelling of a token TOKEN to BUFFER. The buffer must
1637 already contain the enough space to hold the token's spelling.
1638 Returns a pointer to the character after the last character written.
1639 FORSTRING is true if this is to be the spelling after translation
1640 phase 1 (this is different for UCNs).
1641 FIXME: Would be nice if we didn't need the PFILE argument. */
1642 unsigned char *
1643 cpp_spell_token (cpp_reader *pfile, const cpp_token *token,
1644 unsigned char *buffer, bool forstring)
1646 switch (TOKEN_SPELL (token))
1648 case SPELL_OPERATOR:
1650 const unsigned char *spelling;
1651 unsigned char c;
1653 if (token->flags & DIGRAPH)
1654 spelling = cpp_digraph2name (token->type);
1655 else if (token->flags & NAMED_OP)
1656 goto spell_ident;
1657 else
1658 spelling = TOKEN_NAME (token);
1660 while ((c = *spelling++) != '\0')
1661 *buffer++ = c;
1663 break;
1665 spell_ident:
1666 case SPELL_IDENT:
1667 if (forstring)
1669 memcpy (buffer, NODE_NAME (token->val.node.node),
1670 NODE_LEN (token->val.node.node));
1671 buffer += NODE_LEN (token->val.node.node);
1673 else
1675 size_t i;
1676 const unsigned char * name = NODE_NAME (token->val.node.node);
1678 for (i = 0; i < NODE_LEN (token->val.node.node); i++)
1679 if (name[i] & ~0x7F)
1681 i += utf8_to_ucn (buffer, name + i) - 1;
1682 buffer += 10;
1684 else
1685 *buffer++ = NODE_NAME (token->val.node.node)[i];
1687 break;
1689 case SPELL_LITERAL:
1690 memcpy (buffer, token->val.str.text, token->val.str.len);
1691 buffer += token->val.str.len;
1692 break;
1694 case SPELL_NONE:
1695 cpp_error (pfile, CPP_DL_ICE,
1696 "unspellable token %s", TOKEN_NAME (token));
1697 break;
1700 return buffer;
1703 /* Returns TOKEN spelt as a null-terminated string. The string is
1704 freed when the reader is destroyed. Useful for diagnostics. */
1705 unsigned char *
1706 cpp_token_as_text (cpp_reader *pfile, const cpp_token *token)
1708 unsigned int len = cpp_token_len (token) + 1;
1709 unsigned char *start = _cpp_unaligned_alloc (pfile, len), *end;
1711 end = cpp_spell_token (pfile, token, start, false);
1712 end[0] = '\0';
1714 return start;
1717 /* Returns a pointer to a string which spells the token defined by
1718 TYPE and FLAGS. Used by C front ends, which really should move to
1719 using cpp_token_as_text. */
1720 const char *
1721 cpp_type2name (enum cpp_ttype type, unsigned char flags)
1723 if (flags & DIGRAPH)
1724 return (const char *) cpp_digraph2name (type);
1725 else if (flags & NAMED_OP)
1726 return cpp_named_operator2name (type);
1728 return (const char *) token_spellings[type].name;
1731 /* Writes the spelling of token to FP, without any preceding space.
1732 Separated from cpp_spell_token for efficiency - to avoid stdio
1733 double-buffering. */
1734 void
1735 cpp_output_token (const cpp_token *token, FILE *fp)
1737 switch (TOKEN_SPELL (token))
1739 case SPELL_OPERATOR:
1741 const unsigned char *spelling;
1742 int c;
1744 if (token->flags & DIGRAPH)
1745 spelling = cpp_digraph2name (token->type);
1746 else if (token->flags & NAMED_OP)
1747 goto spell_ident;
1748 else
1749 spelling = TOKEN_NAME (token);
1751 c = *spelling;
1753 putc (c, fp);
1754 while ((c = *++spelling) != '\0');
1756 break;
1758 spell_ident:
1759 case SPELL_IDENT:
1761 size_t i;
1762 const unsigned char * name = NODE_NAME (token->val.node.node);
1764 for (i = 0; i < NODE_LEN (token->val.node.node); i++)
1765 if (name[i] & ~0x7F)
1767 unsigned char buffer[10];
1768 i += utf8_to_ucn (buffer, name + i) - 1;
1769 fwrite (buffer, 1, 10, fp);
1771 else
1772 fputc (NODE_NAME (token->val.node.node)[i], fp);
1774 break;
1776 case SPELL_LITERAL:
1777 fwrite (token->val.str.text, 1, token->val.str.len, fp);
1778 break;
1780 case SPELL_NONE:
1781 /* An error, most probably. */
1782 break;
1786 /* Compare two tokens. */
1788 _cpp_equiv_tokens (const cpp_token *a, const cpp_token *b)
1790 if (a->type == b->type && a->flags == b->flags)
1791 switch (TOKEN_SPELL (a))
1793 default: /* Keep compiler happy. */
1794 case SPELL_OPERATOR:
1795 /* token_no is used to track where multiple consecutive ##
1796 tokens were originally located. */
1797 return (a->type != CPP_PASTE || a->val.token_no == b->val.token_no);
1798 case SPELL_NONE:
1799 return (a->type != CPP_MACRO_ARG
1800 || a->val.macro_arg.arg_no == b->val.macro_arg.arg_no);
1801 case SPELL_IDENT:
1802 return a->val.node.node == b->val.node.node;
1803 case SPELL_LITERAL:
1804 return (a->val.str.len == b->val.str.len
1805 && !memcmp (a->val.str.text, b->val.str.text,
1806 a->val.str.len));
1809 return 0;
1812 /* Returns nonzero if a space should be inserted to avoid an
1813 accidental token paste for output. For simplicity, it is
1814 conservative, and occasionally advises a space where one is not
1815 needed, e.g. "." and ".2". */
1817 cpp_avoid_paste (cpp_reader *pfile, const cpp_token *token1,
1818 const cpp_token *token2)
1820 enum cpp_ttype a = token1->type, b = token2->type;
1821 cppchar_t c;
1823 if (token1->flags & NAMED_OP)
1824 a = CPP_NAME;
1825 if (token2->flags & NAMED_OP)
1826 b = CPP_NAME;
1828 c = EOF;
1829 if (token2->flags & DIGRAPH)
1830 c = digraph_spellings[(int) b - (int) CPP_FIRST_DIGRAPH][0];
1831 else if (token_spellings[b].category == SPELL_OPERATOR)
1832 c = token_spellings[b].name[0];
1834 /* Quickly get everything that can paste with an '='. */
1835 if ((int) a <= (int) CPP_LAST_EQ && c == '=')
1836 return 1;
1838 switch (a)
1840 case CPP_GREATER: return c == '>';
1841 case CPP_LESS: return c == '<' || c == '%' || c == ':';
1842 case CPP_PLUS: return c == '+';
1843 case CPP_MINUS: return c == '-' || c == '>';
1844 case CPP_DIV: return c == '/' || c == '*'; /* Comments. */
1845 case CPP_MOD: return c == ':' || c == '>';
1846 case CPP_AND: return c == '&';
1847 case CPP_OR: return c == '|';
1848 case CPP_COLON: return c == ':' || c == '>';
1849 case CPP_DEREF: return c == '*';
1850 case CPP_DOT: return c == '.' || c == '%' || b == CPP_NUMBER;
1851 case CPP_HASH: return c == '#' || c == '%'; /* Digraph form. */
1852 case CPP_NAME: return ((b == CPP_NUMBER
1853 && name_p (pfile, &token2->val.str))
1854 || b == CPP_NAME
1855 || b == CPP_CHAR || b == CPP_STRING); /* L */
1856 case CPP_NUMBER: return (b == CPP_NUMBER || b == CPP_NAME
1857 || c == '.' || c == '+' || c == '-');
1858 /* UCNs */
1859 case CPP_OTHER: return ((token1->val.str.text[0] == '\\'
1860 && b == CPP_NAME)
1861 || (CPP_OPTION (pfile, objc)
1862 && token1->val.str.text[0] == '@'
1863 && (b == CPP_NAME || b == CPP_STRING)));
1864 default: break;
1867 return 0;
1870 /* Output all the remaining tokens on the current line, and a newline
1871 character, to FP. Leading whitespace is removed. If there are
1872 macros, special token padding is not performed. */
1873 void
1874 cpp_output_line (cpp_reader *pfile, FILE *fp)
1876 const cpp_token *token;
1878 token = cpp_get_token (pfile);
1879 while (token->type != CPP_EOF)
1881 cpp_output_token (token, fp);
1882 token = cpp_get_token (pfile);
1883 if (token->flags & PREV_WHITE)
1884 putc (' ', fp);
1887 putc ('\n', fp);
1890 /* Return a string representation of all the remaining tokens on the
1891 current line. The result is allocated using xmalloc and must be
1892 freed by the caller. */
1893 unsigned char *
1894 cpp_output_line_to_string (cpp_reader *pfile, const unsigned char *dir_name)
1896 const cpp_token *token;
1897 unsigned int out = dir_name ? ustrlen (dir_name) : 0;
1898 unsigned int alloced = 120 + out;
1899 unsigned char *result = (unsigned char *) xmalloc (alloced);
1901 /* If DIR_NAME is empty, there are no initial contents. */
1902 if (dir_name)
1904 sprintf ((char *) result, "#%s ", dir_name);
1905 out += 2;
1908 token = cpp_get_token (pfile);
1909 while (token->type != CPP_EOF)
1911 unsigned char *last;
1912 /* Include room for a possible space and the terminating nul. */
1913 unsigned int len = cpp_token_len (token) + 2;
1915 if (out + len > alloced)
1917 alloced *= 2;
1918 if (out + len > alloced)
1919 alloced = out + len;
1920 result = (unsigned char *) xrealloc (result, alloced);
1923 last = cpp_spell_token (pfile, token, &result[out], 0);
1924 out = last - result;
1926 token = cpp_get_token (pfile);
1927 if (token->flags & PREV_WHITE)
1928 result[out++] = ' ';
1931 result[out] = '\0';
1932 return result;
/* Memory buffers.  Changing these three constants can have a dramatic
   effect on performance.  The values here are reasonable defaults,
   but might be tuned.  If you adjust them, be sure to test across a
   range of uses of cpplib, including heavy nested function-like macro
   expansion.  Also check the change in peak memory usage (NJAMD is a
   good tool for this).

   MIN_BUFF_SIZE is the smallest buffer that is ever created.
   BUFF_SIZE_UPPER_BOUND (MIN_SIZE) is the largest free buffer that is
   still handed out for a request of MIN_SIZE bytes.
   EXTENDED_BUFF_SIZE (BUFF, MIN_EXTRA) is the size requested when
   BUFF is extended: MIN_EXTRA plus twice the room left in BUFF.
   Every macro argument is parenthesized so that arbitrary expressions
   can be passed safely.  */
#define MIN_BUFF_SIZE 8000
#define BUFF_SIZE_UPPER_BOUND(MIN_SIZE) (MIN_BUFF_SIZE + (MIN_SIZE) * 3 / 2)
#define EXTENDED_BUFF_SIZE(BUFF, MIN_EXTRA) \
        ((MIN_EXTRA) + ((BUFF)->limit - (BUFF)->cur) * 2)

#if MIN_BUFF_SIZE > BUFF_SIZE_UPPER_BOUND (0)
#error BUFF_SIZE_UPPER_BOUND must be at least as large as MIN_BUFF_SIZE!
#endif
1950 /* Create a new allocation buffer. Place the control block at the end
1951 of the buffer, so that buffer overflows will cause immediate chaos. */
1952 static _cpp_buff *
1953 new_buff (size_t len)
1955 _cpp_buff *result;
1956 unsigned char *base;
1958 if (len < MIN_BUFF_SIZE)
1959 len = MIN_BUFF_SIZE;
1960 len = CPP_ALIGN (len);
1962 base = XNEWVEC (unsigned char, len + sizeof (_cpp_buff));
1963 result = (_cpp_buff *) (base + len);
1964 result->base = base;
1965 result->cur = base;
1966 result->limit = base + len;
1967 result->next = NULL;
1968 return result;
1971 /* Place a chain of unwanted allocation buffers on the free list. */
1972 void
1973 _cpp_release_buff (cpp_reader *pfile, _cpp_buff *buff)
1975 _cpp_buff *end = buff;
1977 while (end->next)
1978 end = end->next;
1979 end->next = pfile->free_buffs;
1980 pfile->free_buffs = buff;
1983 /* Return a free buffer of size at least MIN_SIZE. */
1984 _cpp_buff *
1985 _cpp_get_buff (cpp_reader *pfile, size_t min_size)
1987 _cpp_buff *result, **p;
1989 for (p = &pfile->free_buffs;; p = &(*p)->next)
1991 size_t size;
1993 if (*p == NULL)
1994 return new_buff (min_size);
1995 result = *p;
1996 size = result->limit - result->base;
1997 /* Return a buffer that's big enough, but don't waste one that's
1998 way too big. */
1999 if (size >= min_size && size <= BUFF_SIZE_UPPER_BOUND (min_size))
2000 break;
2003 *p = result->next;
2004 result->next = NULL;
2005 result->cur = result->base;
2006 return result;
2009 /* Creates a new buffer with enough space to hold the uncommitted
2010 remaining bytes of BUFF, and at least MIN_EXTRA more bytes. Copies
2011 the excess bytes to the new buffer. Chains the new buffer after
2012 BUFF, and returns the new buffer. */
2013 _cpp_buff *
2014 _cpp_append_extend_buff (cpp_reader *pfile, _cpp_buff *buff, size_t min_extra)
2016 size_t size = EXTENDED_BUFF_SIZE (buff, min_extra);
2017 _cpp_buff *new_buff = _cpp_get_buff (pfile, size);
2019 buff->next = new_buff;
2020 memcpy (new_buff->base, buff->cur, BUFF_ROOM (buff));
2021 return new_buff;
2024 /* Creates a new buffer with enough space to hold the uncommitted
2025 remaining bytes of the buffer pointed to by BUFF, and at least
2026 MIN_EXTRA more bytes. Copies the excess bytes to the new buffer.
2027 Chains the new buffer before the buffer pointed to by BUFF, and
2028 updates the pointer to point to the new buffer. */
2029 void
2030 _cpp_extend_buff (cpp_reader *pfile, _cpp_buff **pbuff, size_t min_extra)
2032 _cpp_buff *new_buff, *old_buff = *pbuff;
2033 size_t size = EXTENDED_BUFF_SIZE (old_buff, min_extra);
2035 new_buff = _cpp_get_buff (pfile, size);
2036 memcpy (new_buff->base, old_buff->cur, BUFF_ROOM (old_buff));
2037 new_buff->next = old_buff;
2038 *pbuff = new_buff;
2041 /* Free a chain of buffers starting at BUFF. */
2042 void
2043 _cpp_free_buff (_cpp_buff *buff)
2045 _cpp_buff *next;
2047 for (; buff; buff = next)
2049 next = buff->next;
2050 free (buff->base);
2054 /* Allocate permanent, unaligned storage of length LEN. */
2055 unsigned char *
2056 _cpp_unaligned_alloc (cpp_reader *pfile, size_t len)
2058 _cpp_buff *buff = pfile->u_buff;
2059 unsigned char *result = buff->cur;
2061 if (len > (size_t) (buff->limit - result))
2063 buff = _cpp_get_buff (pfile, len);
2064 buff->next = pfile->u_buff;
2065 pfile->u_buff = buff;
2066 result = buff->cur;
2069 buff->cur = result + len;
2070 return result;
2073 /* Allocate permanent, unaligned storage of length LEN from a_buff.
2074 That buffer is used for growing allocations when saving macro
2075 replacement lists in a #define, and when parsing an answer to an
2076 assertion in #assert, #unassert or #if (and therefore possibly
2077 whilst expanding macros). It therefore must not be used by any
2078 code that they might call: specifically the lexer and the guts of
2079 the macro expander.
2081 All existing other uses clearly fit this restriction: storing
2082 registered pragmas during initialization. */
2083 unsigned char *
2084 _cpp_aligned_alloc (cpp_reader *pfile, size_t len)
2086 _cpp_buff *buff = pfile->a_buff;
2087 unsigned char *result = buff->cur;
2089 if (len > (size_t) (buff->limit - result))
2091 buff = _cpp_get_buff (pfile, len);
2092 buff->next = pfile->a_buff;
2093 pfile->a_buff = buff;
2094 result = buff->cur;
2097 buff->cur = result + len;
2098 return result;
2101 /* Say which field of TOK is in use. */
2103 enum cpp_token_fld_kind
2104 cpp_token_val_index (cpp_token *tok)
2106 switch (TOKEN_SPELL (tok))
2108 case SPELL_IDENT:
2109 return CPP_TOKEN_FLD_NODE;
2110 case SPELL_LITERAL:
2111 return CPP_TOKEN_FLD_STR;
2112 case SPELL_OPERATOR:
2113 if (tok->type == CPP_PASTE)
2114 return CPP_TOKEN_FLD_TOKEN_NO;
2115 else
2116 return CPP_TOKEN_FLD_NONE;
2117 case SPELL_NONE:
2118 if (tok->type == CPP_MACRO_ARG)
2119 return CPP_TOKEN_FLD_ARG_NO;
2120 else if (tok->type == CPP_PADDING)
2121 return CPP_TOKEN_FLD_SOURCE;
2122 else if (tok->type == CPP_PRAGMA)
2123 return CPP_TOKEN_FLD_PRAGMA;
2124 /* else fall through */
2125 default:
2126 return CPP_TOKEN_FLD_NONE;