2003-12-26 Guilhem Lavaux <guilhem@kaffe.org>
[official-gcc.git] / gcc / cpplex.c
blob783732fa444c6063ce8ab6b012c766f021b71e5a
1 /* CPP Library - lexical analysis.
2 Copyright (C) 2000, 2001, 2002, 2003 Free Software Foundation, Inc.
3 Contributed by Per Bothner, 1994-95.
4 Based on CCCP program by Paul Rubin, June 1986
5 Adapted to ANSI C, Richard Stallman, Jan 1987
6 Broken out to separate file, Zack Weinberg, Mar 2000
8 This program is free software; you can redistribute it and/or modify it
9 under the terms of the GNU General Public License as published by the
10 Free Software Foundation; either version 2, or (at your option) any
11 later version.
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
18 You should have received a copy of the GNU General Public License
19 along with this program; if not, write to the Free Software
20 Foundation, 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
22 #include "config.h"
23 #include "system.h"
24 #include "cpplib.h"
25 #include "cpphash.h"
/* How the spelling of a token type is obtained.  cpp_spell_token and
   cpp_output_token switch on this category.  */
27 enum spell_type
29 SPELL_OPERATOR = 0, /* Spelt from the token_spellings name, a digraph spelling, or (NAMED_OP) the identifier node.  */
30 SPELL_IDENT, /* Spelt from the identifier node in val.node.  */
31 SPELL_LITERAL, /* Spelt from the stored text in val.str.  */
32 SPELL_NONE /* No spelling; cpp_spell_token reports an internal error.  */
/* One entry of the token_spellings table, which is indexed by token
   type.  */
35 struct token_spelling
37 enum spell_type category; /* How tokens of this type are spelt.  */
38 const unsigned char *name; /* OP entries: the operator text; TK entries: the stringified enumerator name.  */
/* Spellings of the digraph forms.  cpp_spell_token and cpp_output_token
   index this array with (token type - CPP_FIRST_DIGRAPH) when a token
   has the DIGRAPH flag set.  */
41 static const unsigned char *const digraph_spellings[] =
42 { U"%:", U"%:%:", U"<:", U":>", U"<%", U"%>" };
/* Expand TTYPE_TABLE into the token_spellings array.  An OP entry is
   always SPELL_OPERATOR and stores its spelling S; a TK entry stores
   the category S and the stringified enumerator name.  */
44 #define OP(e, s) { SPELL_OPERATOR, U s },
45 #define TK(e, s) { s, U #e },
46 static const struct token_spelling token_spellings[N_TTYPES] = { TTYPE_TABLE };
47 #undef OP
48 #undef TK
/* Accessors for the table entry selected by TOKEN's type.  */
50 #define TOKEN_SPELL(token) (token_spellings[(token)->type].category)
51 #define TOKEN_NAME(token) (token_spellings[(token)->type].name)
/* Forward declarations of the file-local helpers.  */
53 static void add_line_note (cpp_buffer *, const uchar *, unsigned int);
54 static int skip_line_comment (cpp_reader *);
55 static void skip_whitespace (cpp_reader *, cppchar_t);
56 static cpp_hashnode *lex_identifier (cpp_reader *, const uchar *);
57 static void lex_number (cpp_reader *, cpp_string *);
58 static bool forms_identifier_p (cpp_reader *, int);
59 static void lex_string (cpp_reader *, cpp_token *, const uchar *);
60 static void save_comment (cpp_reader *, cpp_token *, const uchar *, cppchar_t);
61 static void create_literal (cpp_reader *, cpp_token *, const uchar *,
62 unsigned int, enum cpp_ttype);
63 static bool warn_in_comment (cpp_reader *, _cpp_line_note *);
64 static int name_p (cpp_reader *, const cpp_string *);
65 static tokenrun *next_tokenrun (tokenrun *);
/* NOTE(review): no definition of new_buff is visible in this part of
   the file.  */
67 static _cpp_buff *new_buff (size_t);
70 /* Utility routine:
72 Compares, the token TOKEN to the NUL-terminated string STRING.
73 TOKEN must be a CPP_NAME. Returns 1 for equal, 0 for unequal. */
74 int
75 cpp_ideq (const cpp_token *token, const char *string)
77 if (token->type != CPP_NAME)
78 return 0;
80 return !ustrcmp (NODE_NAME (token->val.node), (const uchar *) string);
83 /* Record a note TYPE at byte POS into the current cleaned logical
84 line. */
85 static void
86 add_line_note (cpp_buffer *buffer, const uchar *pos, unsigned int type)
88 if (buffer->notes_used == buffer->notes_cap)
90 buffer->notes_cap = buffer->notes_cap * 2 + 200;
91 buffer->notes = xrealloc (buffer->notes,
92 buffer->notes_cap * sizeof (_cpp_line_note));
95 buffer->notes[buffer->notes_used].pos = pos;
96 buffer->notes[buffer->notes_used].type = type;
97 buffer->notes_used++;
100 /* Returns with a logical line that contains no escaped newlines or
101 trigraphs. This is a time-critical inner loop. */
/* On entry the line starts at buffer->next_line.  On return
   buffer->cur and buffer->line_base point at that start,
   buffer->need_line is false, the cleaned line is terminated by a
   '\n' written at D, buffer->next_line points one past the last input
   byte consumed, and every escaped newline or trigraph met has been
   recorded with add_line_note, followed by a sentinel '\n' note.  */
102 void
103 _cpp_clean_line (cpp_reader *pfile)
105 cpp_buffer *buffer;
106 const uchar *s;
107 uchar c, *d, *p;
109 buffer = pfile->buffer;
110 buffer->cur_note = buffer->notes_used = 0;
111 buffer->cur = buffer->line_base = buffer->next_line;
112 buffer->need_line = false;
/* S is pre-incremented in the loops below, so start one byte early.  */
113 s = buffer->next_line - 1;
/* Buffers flagged from_stage3 take the else branch further down, which
   only searches for the end of the line.  */
115 if (!buffer->from_stage3)
117 /* Short circuit for the common case of an un-escaped line with
118 no trigraphs. The primary win here is by not writing any
119 data back to memory until we have to. */
120 for (;;)
122 c = *++s;
123 if (c == '\n' || c == '\r')
125 d = (uchar *) s;
127 if (s == buffer->rlimit)
128 goto done;
130 /* DOS line ending? */
131 if (c == '\r' && s[1] == '\n')
132 s++;
134 if (s == buffer->rlimit)
135 goto done;
137 /* check for escaped newline */
/* Step P back over trailing non-vertical space; the newline is escaped
   only if a backslash precedes that space.  */
138 p = d;
139 while (p != buffer->next_line && is_nvspace (p[-1]))
140 p--;
141 if (p == buffer->next_line || p[-1] != '\\')
142 goto done;
144 /* Have an escaped newline; process it and proceed to
145 the slow path. */
/* The note type is ' ' when space separated the backslash from the
   newline, and '\\' otherwise.  */
146 add_line_note (buffer, p - 1, p != d ? ' ' : '\\');
147 d = p - 2;
148 buffer->next_line = p - 1;
149 break;
151 if (c == '?' && s[1] == '?' && _cpp_trigraph_map[s[2]])
153 /* Have a trigraph. We may or may not have to convert
154 it. Add a line note regardless, for -Wtrigraphs. */
155 add_line_note (buffer, s, s[2]);
156 if (CPP_OPTION (pfile, trigraphs))
158 /* We do, and that means we have to switch to the
159 slow path. */
160 d = (uchar *) s;
161 *d = _cpp_trigraph_map[s[2]];
162 s += 2;
163 break;
/* Slow path: copy each byte from S down to D, dropping escaped
   newlines and, when the trigraphs option is set, replacing trigraphs
   with the character they stand for.  */
169 for (;;)
171 c = *++s;
172 *++d = c;
174 if (c == '\n' || c == '\r')
176 /* Handle DOS line endings. */
177 if (c == '\r' && s != buffer->rlimit && s[1] == '\n')
178 s++;
179 if (s == buffer->rlimit)
180 break;
182 /* Escaped? */
183 p = d;
184 while (p != buffer->next_line && is_nvspace (p[-1]))
185 p--;
186 if (p == buffer->next_line || p[-1] != '\\')
187 break;
189 add_line_note (buffer, p - 1, p != d ? ' ': '\\');
190 d = p - 2;
191 buffer->next_line = p - 1;
193 else if (c == '?' && s[1] == '?' && _cpp_trigraph_map[s[2]])
195 /* Add a note regardless, for the benefit of -Wtrigraphs. */
196 add_line_note (buffer, d, s[2]);
197 if (CPP_OPTION (pfile, trigraphs))
199 *d = _cpp_trigraph_map[s[2]];
200 s += 2;
/* from_stage3 buffers: scan to the first '\n' or '\r' without looking
   for escaped newlines or trigraphs.  */
205 else
208 s++;
209 while (*s != '\n' && *s != '\r');
210 d = (uchar *) s;
212 /* Handle DOS line endings. */
213 if (*s == '\r' && s != buffer->rlimit && s[1] == '\n')
214 s++;
217 done:
/* Terminate the cleaned line; a "\r" or "\r\n" ending is normalised to
   a single '\n' here.  */
218 *d = '\n';
219 /* A sentinel note that should never be processed. */
220 add_line_note (buffer, d + 1, '\n');
221 buffer->next_line = s + 1;
224 /* Return true if the trigraph indicated by NOTE should be warned
225 about in a comment. */
226 static bool
227 warn_in_comment (cpp_reader *pfile, _cpp_line_note *note)
229 const uchar *p;
231 /* Within comments we don't warn about trigraphs, unless the
232 trigraph forms an escaped newline, as that may change
233 behavior. */
234 if (note->type != '/')
235 return false;
237 /* If -trigraphs, then this was an escaped newline iff the next note
238 is coincident. */
239 if (CPP_OPTION (pfile, trigraphs))
240 return note[1].pos == note->pos;
242 /* Otherwise, see if this forms an escaped newline. */
243 p = note->pos + 3;
244 while (is_nvspace (*p))
245 p++;
247 /* There might have been escaped newlines between the trigraph and the
248 newline we found. Hence the position test. */
249 return (*p == '\n' && p < note[1].pos);
252 /* Process the notes created by add_line_note as far as the current
253 location. */
254 void
255 _cpp_process_line_notes (cpp_reader *pfile, int in_comment)
257 cpp_buffer *buffer = pfile->buffer;
259 for (;;)
261 _cpp_line_note *note = &buffer->notes[buffer->cur_note];
262 unsigned int col;
264 if (note->pos > buffer->cur)
265 break;
267 buffer->cur_note++;
268 col = CPP_BUF_COLUMN (buffer, note->pos + 1);
270 if (note->type == '\\' || note->type == ' ')
272 if (note->type == ' ' && !in_comment)
273 cpp_error_with_line (pfile, CPP_DL_WARNING, pfile->line, col,
274 "backslash and newline separated by space");
276 if (buffer->next_line > buffer->rlimit)
278 cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line, col,
279 "backslash-newline at end of file");
280 /* Prevent "no newline at end of file" warning. */
281 buffer->next_line = buffer->rlimit;
284 buffer->line_base = note->pos;
285 pfile->line++;
287 else if (_cpp_trigraph_map[note->type])
289 if (CPP_OPTION (pfile, warn_trigraphs)
290 && (!in_comment || warn_in_comment (pfile, note)))
292 if (CPP_OPTION (pfile, trigraphs))
293 cpp_error_with_line (pfile, CPP_DL_WARNING, pfile->line, col,
294 "trigraph ??%c converted to %c",
295 note->type,
296 (int) _cpp_trigraph_map[note->type]);
297 else
299 cpp_error_with_line
300 (pfile, CPP_DL_WARNING, pfile->line, col,
301 "trigraph ??%c ignored, use -trigraphs to enable",
302 note->type);
306 else
307 abort ();
311 /* Skip a C-style block comment. We find the end of the comment by
312 seeing if an asterisk is before every '/' we encounter. Returns
313 nonzero if comment terminated by EOF, zero otherwise.
315 Buffer->cur points to the initial asterisk of the comment. */
316 bool
317 _cpp_skip_block_comment (cpp_reader *pfile)
319 cpp_buffer *buffer = pfile->buffer;
320 const uchar *cur = buffer->cur;
321 uchar c;
323 cur++;
324 if (*cur == '/')
325 cur++;
327 for (;;)
329 /* People like decorating comments with '*', so check for '/'
330 instead for efficiency. */
331 c = *cur++;
333 if (c == '/')
335 if (cur[-2] == '*')
336 break;
338 /* Warn about potential nested comments, but not if the '/'
339 comes immediately before the true comment delimiter.
340 Don't bother to get it right across escaped newlines. */
341 if (CPP_OPTION (pfile, warn_comments)
342 && cur[0] == '*' && cur[1] != '/')
344 buffer->cur = cur;
345 cpp_error_with_line (pfile, CPP_DL_WARNING,
346 pfile->line, CPP_BUF_COL (buffer),
347 "\"/*\" within comment");
350 else if (c == '\n')
352 buffer->cur = cur - 1;
353 _cpp_process_line_notes (pfile, true);
354 if (buffer->next_line >= buffer->rlimit)
355 return true;
356 _cpp_clean_line (pfile);
357 pfile->line++;
358 cur = buffer->cur;
362 buffer->cur = cur;
363 _cpp_process_line_notes (pfile, true);
364 return false;
367 /* Skip a C++ line comment, leaving buffer->cur pointing to the
368 terminating newline. Handles escaped newlines. Returns nonzero
369 if a multiline comment. */
370 static int
371 skip_line_comment (cpp_reader *pfile)
373 cpp_buffer *buffer = pfile->buffer;
374 unsigned int orig_line = pfile->line;
376 while (*buffer->cur != '\n')
377 buffer->cur++;
379 _cpp_process_line_notes (pfile, true);
380 return orig_line != pfile->line;
383 /* Skips whitespace, saving the next non-whitespace character. */
384 static void
385 skip_whitespace (cpp_reader *pfile, cppchar_t c)
387 cpp_buffer *buffer = pfile->buffer;
388 bool saw_NUL = false;
392 /* Horizontal space always OK. */
393 if (c == ' ' || c == '\t')
395 /* Just \f \v or \0 left. */
396 else if (c == '\0')
397 saw_NUL = true;
398 else if (pfile->state.in_directive && CPP_PEDANTIC (pfile))
399 cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line,
400 CPP_BUF_COL (buffer),
401 "%s in preprocessing directive",
402 c == '\f' ? "form feed" : "vertical tab");
404 c = *buffer->cur++;
406 /* We only want non-vertical space, i.e. ' ' \t \f \v \0. */
407 while (is_nvspace (c));
409 if (saw_NUL)
410 cpp_error (pfile, CPP_DL_WARNING, "null character(s) ignored");
412 buffer->cur--;
415 /* See if the characters of a number token are valid in a name (no
416 '.', '+' or '-'). */
417 static int
418 name_p (cpp_reader *pfile, const cpp_string *string)
420 unsigned int i;
422 for (i = 0; i < string->len; i++)
423 if (!is_idchar (string->text[i]))
424 return 0;
426 return 1;
429 /* Returns TRUE if the sequence starting at buffer->cur is invalid in
430 an identifier. FIRST is TRUE if this starts an identifier. */
431 static bool
432 forms_identifier_p (cpp_reader *pfile, int first)
434 cpp_buffer *buffer = pfile->buffer;
436 if (*buffer->cur == '$')
438 if (!CPP_OPTION (pfile, dollars_in_ident))
439 return false;
441 buffer->cur++;
442 if (CPP_OPTION (pfile, warn_dollars) && !pfile->state.skipping)
444 CPP_OPTION (pfile, warn_dollars) = 0;
445 cpp_error (pfile, CPP_DL_PEDWARN, "'$' in identifier or number");
448 return true;
451 /* Is this a syntactically valid UCN? */
452 if (0 && *buffer->cur == '\\'
453 && (buffer->cur[1] == 'u' || buffer->cur[1] == 'U'))
455 buffer->cur += 2;
456 if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first))
457 return true;
458 buffer->cur -= 2;
461 return false;
464 /* Lex an identifier starting at BUFFER->CUR - 1. */
465 static cpp_hashnode *
466 lex_identifier (cpp_reader *pfile, const uchar *base)
468 cpp_hashnode *result;
469 const uchar *cur;
473 cur = pfile->buffer->cur;
475 /* N.B. ISIDNUM does not include $. */
476 while (ISIDNUM (*cur))
477 cur++;
479 pfile->buffer->cur = cur;
481 while (forms_identifier_p (pfile, false));
483 result = (cpp_hashnode *)
484 ht_lookup (pfile->hash_table, base, cur - base, HT_ALLOC);
486 /* Rarely, identifiers require diagnostics when lexed. */
487 if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
488 && !pfile->state.skipping, 0))
490 /* It is allowed to poison the same identifier twice. */
491 if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
492 cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
493 NODE_NAME (result));
495 /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
496 replacement list of a variadic macro. */
497 if (result == pfile->spec_nodes.n__VA_ARGS__
498 && !pfile->state.va_args_ok)
499 cpp_error (pfile, CPP_DL_PEDWARN,
500 "__VA_ARGS__ can only appear in the expansion"
501 " of a C99 variadic macro");
504 return result;
507 /* Lex a number to NUMBER starting at BUFFER->CUR - 1. */
508 static void
509 lex_number (cpp_reader *pfile, cpp_string *number)
511 const uchar *cur;
512 const uchar *base;
513 uchar *dest;
515 base = pfile->buffer->cur - 1;
518 cur = pfile->buffer->cur;
520 /* N.B. ISIDNUM does not include $. */
521 while (ISIDNUM (*cur) || *cur == '.' || VALID_SIGN (*cur, cur[-1]))
522 cur++;
524 pfile->buffer->cur = cur;
526 while (forms_identifier_p (pfile, false));
528 number->len = cur - base;
529 dest = _cpp_unaligned_alloc (pfile, number->len + 1);
530 memcpy (dest, base, number->len);
531 dest[number->len] = '\0';
532 number->text = dest;
535 /* Create a token of type TYPE with a literal spelling. */
536 static void
537 create_literal (cpp_reader *pfile, cpp_token *token, const uchar *base,
538 unsigned int len, enum cpp_ttype type)
540 uchar *dest = _cpp_unaligned_alloc (pfile, len + 1);
542 memcpy (dest, base, len);
543 dest[len] = '\0';
544 token->type = type;
545 token->val.str.len = len;
546 token->val.str.text = dest;
549 /* Lexes a string, character constant, or angle-bracketed header file
550 name. The stored string contains the spelling, including opening
551 quote and leading any leading 'L'. It returns the type of the
552 literal, or CPP_OTHER if it was not properly terminated.
554 The spelling is NUL-terminated, but it is not guaranteed that this
555 is the first NUL since embedded NULs are preserved. */
556 static void
557 lex_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
559 bool saw_NUL = false;
560 const uchar *cur;
561 cppchar_t terminator;
562 enum cpp_ttype type;
564 cur = base;
565 terminator = *cur++;
566 if (terminator == 'L')
567 terminator = *cur++;
568 if (terminator == '\"')
569 type = *base == 'L' ? CPP_WSTRING: CPP_STRING;
570 else if (terminator == '\'')
571 type = *base == 'L' ? CPP_WCHAR: CPP_CHAR;
572 else
573 terminator = '>', type = CPP_HEADER_NAME;
575 for (;;)
577 cppchar_t c = *cur++;
579 /* In #include-style directives, terminators are not escapable. */
580 if (c == '\\' && !pfile->state.angled_headers && *cur != '\n')
581 cur++;
582 else if (c == terminator)
583 break;
584 else if (c == '\n')
586 cur--;
587 type = CPP_OTHER;
588 break;
590 else if (c == '\0')
591 saw_NUL = true;
594 if (saw_NUL && !pfile->state.skipping)
595 cpp_error (pfile, CPP_DL_WARNING,
596 "null character(s) preserved in literal");
598 pfile->buffer->cur = cur;
599 create_literal (pfile, token, base, cur - base, type);
602 /* The stored comment includes the comment start and any terminator. */
603 static void
604 save_comment (cpp_reader *pfile, cpp_token *token, const unsigned char *from,
605 cppchar_t type)
607 unsigned char *buffer;
608 unsigned int len, clen;
610 len = pfile->buffer->cur - from + 1; /* + 1 for the initial '/'. */
612 /* C++ comments probably (not definitely) have moved past a new
613 line, which we don't want to save in the comment. */
614 if (is_vspace (pfile->buffer->cur[-1]))
615 len--;
617 /* If we are currently in a directive, then we need to store all
618 C++ comments as C comments internally, and so we need to
619 allocate a little extra space in that case.
621 Note that the only time we encounter a directive here is
622 when we are saving comments in a "#define". */
623 clen = (pfile->state.in_directive && type == '/') ? len + 2 : len;
625 buffer = _cpp_unaligned_alloc (pfile, clen);
627 token->type = CPP_COMMENT;
628 token->val.str.len = clen;
629 token->val.str.text = buffer;
631 buffer[0] = '/';
632 memcpy (buffer + 1, from, len - 1);
634 /* Finish conversion to a C comment, if necessary. */
635 if (pfile->state.in_directive && type == '/')
637 buffer[1] = '*';
638 buffer[clen - 2] = '*';
639 buffer[clen - 1] = '/';
643 /* Allocate COUNT tokens for RUN. */
644 void
645 _cpp_init_tokenrun (tokenrun *run, unsigned int count)
647 run->base = xnewvec (cpp_token, count);
648 run->limit = run->base + count;
649 run->next = NULL;
652 /* Returns the next tokenrun, or creates one if there is none. */
653 static tokenrun *
654 next_tokenrun (tokenrun *run)
656 if (run->next == NULL)
658 run->next = xnew (tokenrun);
659 run->next->prev = run;
660 _cpp_init_tokenrun (run->next, 250);
663 return run->next;
666 /* Allocate a single token that is invalidated at the same time as the
667 rest of the tokens on the line. Has its line and col set to the
668 same as the last lexed token, so that diagnostics appear in the
669 right place. */
670 cpp_token *
671 _cpp_temp_token (cpp_reader *pfile)
673 cpp_token *old, *result;
675 old = pfile->cur_token - 1;
676 if (pfile->cur_token == pfile->cur_run->limit)
678 pfile->cur_run = next_tokenrun (pfile->cur_run);
679 pfile->cur_token = pfile->cur_run->base;
682 result = pfile->cur_token++;
683 result->line = old->line;
684 result->col = old->col;
685 return result;
688 /* Lex a token into RESULT (external interface). Takes care of issues
689 like directive handling, token lookahead, multiple include
690 optimization and skipping. */
691 const cpp_token *
692 _cpp_lex_token (cpp_reader *pfile)
694 cpp_token *result;
696 for (;;)
698 if (pfile->cur_token == pfile->cur_run->limit)
700 pfile->cur_run = next_tokenrun (pfile->cur_run);
701 pfile->cur_token = pfile->cur_run->base;
704 if (pfile->lookaheads)
706 pfile->lookaheads--;
707 result = pfile->cur_token++;
709 else
710 result = _cpp_lex_direct (pfile);
712 if (result->flags & BOL)
714 /* Is this a directive. If _cpp_handle_directive returns
715 false, it is an assembler #. */
716 if (result->type == CPP_HASH
717 /* 6.10.3 p 11: Directives in a list of macro arguments
718 gives undefined behavior. This implementation
719 handles the directive as normal. */
720 && pfile->state.parsing_args != 1
721 && _cpp_handle_directive (pfile, result->flags & PREV_WHITE))
722 continue;
723 if (pfile->cb.line_change && !pfile->state.skipping)
724 pfile->cb.line_change (pfile, result, pfile->state.parsing_args);
727 /* We don't skip tokens in directives. */
728 if (pfile->state.in_directive)
729 break;
731 /* Outside a directive, invalidate controlling macros. At file
732 EOF, _cpp_lex_direct takes care of popping the buffer, so we never
733 get here and MI optimization works. */
734 pfile->mi_valid = false;
736 if (!pfile->state.skipping || result->type == CPP_EOF)
737 break;
740 return result;
743 /* Returns true if a fresh line has been loaded. */
744 bool
745 _cpp_get_fresh_line (cpp_reader *pfile)
747 /* We can't get a new line until we leave the current directive. */
748 if (pfile->state.in_directive)
749 return false;
751 for (;;)
753 cpp_buffer *buffer = pfile->buffer;
755 if (!buffer->need_line)
756 return true;
758 if (buffer->next_line < buffer->rlimit)
760 _cpp_clean_line (pfile);
761 return true;
764 /* First, get out of parsing arguments state. */
765 if (pfile->state.parsing_args)
766 return false;
768 /* End of buffer. Non-empty files should end in a newline. */
769 if (buffer->buf != buffer->rlimit
770 && buffer->next_line > buffer->rlimit
771 && !buffer->from_stage3)
773 /* Only warn once. */
774 buffer->next_line = buffer->rlimit;
775 cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line - 1,
776 CPP_BUF_COLUMN (buffer, buffer->cur),
777 "no newline at end of file");
780 _cpp_pop_buffer (pfile);
781 if (pfile->buffer == NULL)
782 return false;
/* Helper for two-character operators in _cpp_lex_direct.  Sets
   result->type to ELSE_TYPE; if the next input character is CHAR, it
   consumes that character and sets result->type to THEN_TYPE instead.
   Relies on `buffer' and `result' being in scope at the point of use.  */
786 #define IF_NEXT_IS(CHAR, THEN_TYPE, ELSE_TYPE) \
787 do \
789 result->type = ELSE_TYPE; \
790 if (*buffer->cur == CHAR) \
791 buffer->cur++, result->type = THEN_TYPE; \
793 while (0)
795 /* Lex a token into pfile->cur_token, which is also incremented, to
796 get diagnostics pointing to the correct location.
798 Does not handle issues such as token lookahead, multiple-include
799 optimization, directives, skipping etc. This function is only
800 suitable for use by _cpp_lex_token, and in special cases like
801 lex_expansion_token which doesn't care for any of these issues.
803 When meeting a newline, returns CPP_EOF if parsing a directive,
804 otherwise returns to the start of the token buffer if permissible.
805 Returns the location of the lexed token. */
806 cpp_token *
807 _cpp_lex_direct (cpp_reader *pfile)
809 cppchar_t c;
810 cpp_buffer *buffer;
811 const unsigned char *comment_start;
812 cpp_token *result = pfile->cur_token++;
/* Entered at the start of lexing and again after each newline.  */
814 fresh_line:
815 result->flags = 0;
816 buffer = pfile->buffer;
817 if (buffer->need_line)
819 if (!_cpp_get_fresh_line (pfile))
/* No further line is available: hand back a CPP_EOF token.  */
821 result->type = CPP_EOF;
822 if (!pfile->state.in_directive)
824 /* Tell the compiler the line number of the EOF token. */
825 result->line = pfile->line;
826 result->flags = BOL;
828 return result;
/* Unless tokens are being kept, restart at the base of the first token
   run for the new line.  */
830 if (!pfile->keep_tokens)
832 pfile->cur_run = &pfile->base_run;
833 result = pfile->base_run.base;
834 pfile->cur_token = result + 1;
836 result->flags = BOL;
837 if (pfile->state.parsing_args == 2)
838 result->flags |= PREV_WHITE;
/* _cpp_get_fresh_line may have popped a buffer, so reload it.  */
840 buffer = pfile->buffer;
/* Re-entered after a skipped comment to refresh the token's line.  */
841 update_tokens_line:
842 result->line = pfile->line;
/* Re-entered after a run of whitespace has been skipped.  */
844 skipped_white:
/* Process any line notes that have been reached before reading the
   next character.  */
845 if (buffer->cur >= buffer->notes[buffer->cur_note].pos
846 && !pfile->overlaid_buffer)
848 _cpp_process_line_notes (pfile, false);
849 result->line = pfile->line;
851 c = *buffer->cur++;
852 result->col = CPP_BUF_COLUMN (buffer, buffer->cur);
854 switch (c)
856 case ' ': case '\t': case '\f': case '\v': case '\0':
857 result->flags |= PREV_WHITE;
858 skip_whitespace (pfile, c);
859 goto skipped_white;
861 case '\n':
862 pfile->line++;
863 buffer->need_line = true;
864 goto fresh_line;
866 case '0': case '1': case '2': case '3': case '4':
867 case '5': case '6': case '7': case '8': case '9':
868 result->type = CPP_NUMBER;
869 lex_number (pfile, &result->val.str);
870 break;
872 case 'L':
873 /* 'L' may introduce wide characters or strings. */
874 if (*buffer->cur == '\'' || *buffer->cur == '"')
876 lex_string (pfile, result, buffer->cur - 1);
877 break;
879 /* Fall through. */
881 case '_':
882 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
883 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
884 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
885 case 's': case 't': case 'u': case 'v': case 'w': case 'x':
886 case 'y': case 'z':
887 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
888 case 'G': case 'H': case 'I': case 'J': case 'K':
889 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
890 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
891 case 'Y': case 'Z':
892 result->type = CPP_NAME;
893 result->val.node = lex_identifier (pfile, buffer->cur - 1);
895 /* Convert named operators to their proper types. */
/* val.node is left pointing at the identifier node, which is what
   cpp_spell_token and cpp_output_token use to spell NAMED_OP tokens.  */
896 if (result->val.node->flags & NODE_OPERATOR)
898 result->flags |= NAMED_OP;
899 result->type = result->val.node->directive_index;
901 break;
903 case '\'':
904 case '"':
905 lex_string (pfile, result, buffer->cur - 1);
906 break;
908 case '/':
909 /* A potential block or line comment. */
910 comment_start = buffer->cur;
911 c = *buffer->cur;
913 if (c == '*')
915 if (_cpp_skip_block_comment (pfile))
916 cpp_error (pfile, CPP_DL_ERROR, "unterminated comment");
918 else if (c == '/' && (CPP_OPTION (pfile, cplusplus_comments)
919 || CPP_IN_SYSTEM_HEADER (pfile)))
921 /* Warn about comments only if pedantically GNUC89, and not
922 in system headers. */
923 if (CPP_OPTION (pfile, lang) == CLK_GNUC89 && CPP_PEDANTIC (pfile)
924 && ! buffer->warned_cplusplus_comments)
926 cpp_error (pfile, CPP_DL_PEDWARN,
927 "C++ style comments are not allowed in ISO C90");
928 cpp_error (pfile, CPP_DL_PEDWARN,
929 "(this will be reported only once per input file)");
930 buffer->warned_cplusplus_comments = 1;
933 if (skip_line_comment (pfile) && CPP_OPTION (pfile, warn_comments))
934 cpp_error (pfile, CPP_DL_WARNING, "multi-line comment");
936 else if (c == '=')
938 buffer->cur++;
939 result->type = CPP_DIV_EQ;
940 break;
942 else
944 result->type = CPP_DIV;
945 break;
/* Only the two comment branches reach this point.  A comment that is
   not being saved is treated as preceding whitespace.  */
948 if (!pfile->state.save_comments)
950 result->flags |= PREV_WHITE;
951 goto update_tokens_line;
954 /* Save the comment as a token in its own right. */
955 save_comment (pfile, result, comment_start, c);
956 break;
958 case '<':
959 if (pfile->state.angled_headers)
961 lex_string (pfile, result, buffer->cur - 1);
962 break;
965 result->type = CPP_LESS;
966 if (*buffer->cur == '=')
967 buffer->cur++, result->type = CPP_LESS_EQ;
968 else if (*buffer->cur == '<')
970 buffer->cur++;
971 IF_NEXT_IS ('=', CPP_LSHIFT_EQ, CPP_LSHIFT);
973 else if (*buffer->cur == '?' && CPP_OPTION (pfile, cplusplus))
975 buffer->cur++;
976 IF_NEXT_IS ('=', CPP_MIN_EQ, CPP_MIN);
978 else if (CPP_OPTION (pfile, digraphs))
980 if (*buffer->cur == ':')
982 buffer->cur++;
983 result->flags |= DIGRAPH;
984 result->type = CPP_OPEN_SQUARE;
986 else if (*buffer->cur == '%')
988 buffer->cur++;
989 result->flags |= DIGRAPH;
990 result->type = CPP_OPEN_BRACE;
993 break;
995 case '>':
996 result->type = CPP_GREATER;
997 if (*buffer->cur == '=')
998 buffer->cur++, result->type = CPP_GREATER_EQ;
999 else if (*buffer->cur == '>')
1001 buffer->cur++;
1002 IF_NEXT_IS ('=', CPP_RSHIFT_EQ, CPP_RSHIFT);
1004 else if (*buffer->cur == '?' && CPP_OPTION (pfile, cplusplus))
1006 buffer->cur++;
1007 IF_NEXT_IS ('=', CPP_MAX_EQ, CPP_MAX);
1009 break;
1011 case '%':
1012 result->type = CPP_MOD;
1013 if (*buffer->cur == '=')
1014 buffer->cur++, result->type = CPP_MOD_EQ;
1015 else if (CPP_OPTION (pfile, digraphs))
1017 if (*buffer->cur == ':')
1019 buffer->cur++;
1020 result->flags |= DIGRAPH;
1021 result->type = CPP_HASH;
1022 if (*buffer->cur == '%' && buffer->cur[1] == ':')
1023 buffer->cur += 2, result->type = CPP_PASTE;
1025 else if (*buffer->cur == '>')
1027 buffer->cur++;
1028 result->flags |= DIGRAPH;
1029 result->type = CPP_CLOSE_BRACE;
1032 break;
1034 case '.':
1035 result->type = CPP_DOT;
1036 if (ISDIGIT (*buffer->cur))
1038 result->type = CPP_NUMBER;
1039 lex_number (pfile, &result->val.str);
1041 else if (*buffer->cur == '.' && buffer->cur[1] == '.')
1042 buffer->cur += 2, result->type = CPP_ELLIPSIS;
1043 else if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
1044 buffer->cur++, result->type = CPP_DOT_STAR;
1045 break;
1047 case '+':
1048 result->type = CPP_PLUS;
1049 if (*buffer->cur == '+')
1050 buffer->cur++, result->type = CPP_PLUS_PLUS;
1051 else if (*buffer->cur == '=')
1052 buffer->cur++, result->type = CPP_PLUS_EQ;
1053 break;
1055 case '-':
1056 result->type = CPP_MINUS;
1057 if (*buffer->cur == '>')
1059 buffer->cur++;
1060 result->type = CPP_DEREF;
1061 if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
1062 buffer->cur++, result->type = CPP_DEREF_STAR;
1064 else if (*buffer->cur == '-')
1065 buffer->cur++, result->type = CPP_MINUS_MINUS;
1066 else if (*buffer->cur == '=')
1067 buffer->cur++, result->type = CPP_MINUS_EQ;
1068 break;
1070 case '&':
1071 result->type = CPP_AND;
1072 if (*buffer->cur == '&')
1073 buffer->cur++, result->type = CPP_AND_AND;
1074 else if (*buffer->cur == '=')
1075 buffer->cur++, result->type = CPP_AND_EQ;
1076 break;
1078 case '|':
1079 result->type = CPP_OR;
1080 if (*buffer->cur == '|')
1081 buffer->cur++, result->type = CPP_OR_OR;
1082 else if (*buffer->cur == '=')
1083 buffer->cur++, result->type = CPP_OR_EQ;
1084 break;
1086 case ':':
1087 result->type = CPP_COLON;
1088 if (*buffer->cur == ':' && CPP_OPTION (pfile, cplusplus))
1089 buffer->cur++, result->type = CPP_SCOPE;
1090 else if (*buffer->cur == '>' && CPP_OPTION (pfile, digraphs))
1092 buffer->cur++;
1093 result->flags |= DIGRAPH;
1094 result->type = CPP_CLOSE_SQUARE;
1096 break;
1098 case '*': IF_NEXT_IS ('=', CPP_MULT_EQ, CPP_MULT); break;
1099 case '=': IF_NEXT_IS ('=', CPP_EQ_EQ, CPP_EQ); break;
1100 case '!': IF_NEXT_IS ('=', CPP_NOT_EQ, CPP_NOT); break;
1101 case '^': IF_NEXT_IS ('=', CPP_XOR_EQ, CPP_XOR); break;
1102 case '#': IF_NEXT_IS ('#', CPP_PASTE, CPP_HASH); break;
1104 case '?': result->type = CPP_QUERY; break;
1105 case '~': result->type = CPP_COMPL; break;
1106 case ',': result->type = CPP_COMMA; break;
1107 case '(': result->type = CPP_OPEN_PAREN; break;
1108 case ')': result->type = CPP_CLOSE_PAREN; break;
1109 case '[': result->type = CPP_OPEN_SQUARE; break;
1110 case ']': result->type = CPP_CLOSE_SQUARE; break;
1111 case '{': result->type = CPP_OPEN_BRACE; break;
1112 case '}': result->type = CPP_CLOSE_BRACE; break;
1113 case ';': result->type = CPP_SEMICOLON; break;
1115 /* @ is a punctuator in Objective-C. */
1116 case '@': result->type = CPP_ATSIGN; break;
/* Back up so that forms_identifier_p can examine the character itself.
   If it does not start an identifier, step forward again and fall into
   the default case, which emits a one-character CPP_OTHER.  */
1118 case '$':
1119 case '\\':
1121 const uchar *base = --buffer->cur;
1123 if (forms_identifier_p (pfile, true))
1125 result->type = CPP_NAME;
1126 result->val.node = lex_identifier (pfile, base);
1127 break;
1129 buffer->cur++;
1132 default:
1133 create_literal (pfile, result, buffer->cur - 1, 1, CPP_OTHER);
1134 break;
1137 return result;
1140 /* An upper bound on the number of bytes needed to spell TOKEN.
1141 Does not include preceding whitespace. */
1142 unsigned int
1143 cpp_token_len (const cpp_token *token)
1145 unsigned int len;
1147 switch (TOKEN_SPELL (token))
1149 default: len = 4; break;
1150 case SPELL_LITERAL: len = token->val.str.len; break;
1151 case SPELL_IDENT: len = NODE_LEN (token->val.node); break;
1154 return len;
1157 /* Write the spelling of a token TOKEN to BUFFER. The buffer must
1158 already contain the enough space to hold the token's spelling.
1159 Returns a pointer to the character after the last character written.
1160 FIXME: Would be nice if we didn't need the PFILE argument. */
1161 unsigned char *
1162 cpp_spell_token (cpp_reader *pfile, const cpp_token *token,
1163 unsigned char *buffer)
1165 switch (TOKEN_SPELL (token))
1167 case SPELL_OPERATOR:
1169 const unsigned char *spelling;
1170 unsigned char c;
1172 if (token->flags & DIGRAPH)
1173 spelling
1174 = digraph_spellings[(int) token->type - (int) CPP_FIRST_DIGRAPH];
1175 else if (token->flags & NAMED_OP)
1176 goto spell_ident;
1177 else
1178 spelling = TOKEN_NAME (token);
1180 while ((c = *spelling++) != '\0')
1181 *buffer++ = c;
1183 break;
1185 spell_ident:
1186 case SPELL_IDENT:
1187 memcpy (buffer, NODE_NAME (token->val.node), NODE_LEN (token->val.node));
1188 buffer += NODE_LEN (token->val.node);
1189 break;
1191 case SPELL_LITERAL:
1192 memcpy (buffer, token->val.str.text, token->val.str.len);
1193 buffer += token->val.str.len;
1194 break;
1196 case SPELL_NONE:
1197 cpp_error (pfile, CPP_DL_ICE,
1198 "unspellable token %s", TOKEN_NAME (token));
1199 break;
1202 return buffer;
1205 /* Returns TOKEN spelt as a null-terminated string. The string is
1206 freed when the reader is destroyed. Useful for diagnostics. */
1207 unsigned char *
1208 cpp_token_as_text (cpp_reader *pfile, const cpp_token *token)
1210 unsigned int len = cpp_token_len (token) + 1;
1211 unsigned char *start = _cpp_unaligned_alloc (pfile, len), *end;
1213 end = cpp_spell_token (pfile, token, start);
1214 end[0] = '\0';
1216 return start;
1219 /* Used by C front ends, which really should move to using
1220 cpp_token_as_text. */
1221 const char *
1222 cpp_type2name (enum cpp_ttype type)
1224 return (const char *) token_spellings[type].name;
1227 /* Writes the spelling of token to FP, without any preceding space.
1228 Separated from cpp_spell_token for efficiency - to avoid stdio
1229 double-buffering. */
1230 void
1231 cpp_output_token (const cpp_token *token, FILE *fp)
1233 switch (TOKEN_SPELL (token))
1235 case SPELL_OPERATOR:
1237 const unsigned char *spelling;
1238 int c;
1240 if (token->flags & DIGRAPH)
1241 spelling
1242 = digraph_spellings[(int) token->type - (int) CPP_FIRST_DIGRAPH];
1243 else if (token->flags & NAMED_OP)
1244 goto spell_ident;
1245 else
1246 spelling = TOKEN_NAME (token);
1248 c = *spelling;
1250 putc (c, fp);
1251 while ((c = *++spelling) != '\0');
1253 break;
1255 spell_ident:
1256 case SPELL_IDENT:
1257 fwrite (NODE_NAME (token->val.node), 1, NODE_LEN (token->val.node), fp);
1258 break;
1260 case SPELL_LITERAL:
1261 fwrite (token->val.str.text, 1, token->val.str.len, fp);
1262 break;
1264 case SPELL_NONE:
1265 /* An error, most probably. */
1266 break;
1270 /* Compare two tokens. */
1272 _cpp_equiv_tokens (const cpp_token *a, const cpp_token *b)
1274 if (a->type == b->type && a->flags == b->flags)
1275 switch (TOKEN_SPELL (a))
1277 default: /* Keep compiler happy. */
1278 case SPELL_OPERATOR:
1279 return 1;
1280 case SPELL_NONE:
1281 return (a->type != CPP_MACRO_ARG || a->val.arg_no == b->val.arg_no);
1282 case SPELL_IDENT:
1283 return a->val.node == b->val.node;
1284 case SPELL_LITERAL:
1285 return (a->val.str.len == b->val.str.len
1286 && !memcmp (a->val.str.text, b->val.str.text,
1287 a->val.str.len));
1290 return 0;
1293 /* Returns nonzero if a space should be inserted to avoid an
1294 accidental token paste for output. For simplicity, it is
1295 conservative, and occasionally advises a space where one is not
1296 needed, e.g. "." and ".2". */
1298 cpp_avoid_paste (cpp_reader *pfile, const cpp_token *token1,
1299 const cpp_token *token2)
1301 enum cpp_ttype a = token1->type, b = token2->type;
1302 cppchar_t c;
1304 if (token1->flags & NAMED_OP)
1305 a = CPP_NAME;
1306 if (token2->flags & NAMED_OP)
1307 b = CPP_NAME;
1309 c = EOF;
1310 if (token2->flags & DIGRAPH)
1311 c = digraph_spellings[(int) b - (int) CPP_FIRST_DIGRAPH][0];
1312 else if (token_spellings[b].category == SPELL_OPERATOR)
1313 c = token_spellings[b].name[0];
1315 /* Quickly get everything that can paste with an '='. */
1316 if ((int) a <= (int) CPP_LAST_EQ && c == '=')
1317 return 1;
1319 switch (a)
1321 case CPP_GREATER: return c == '>' || c == '?';
1322 case CPP_LESS: return c == '<' || c == '?' || c == '%' || c == ':';
1323 case CPP_PLUS: return c == '+';
1324 case CPP_MINUS: return c == '-' || c == '>';
1325 case CPP_DIV: return c == '/' || c == '*'; /* Comments. */
1326 case CPP_MOD: return c == ':' || c == '>';
1327 case CPP_AND: return c == '&';
1328 case CPP_OR: return c == '|';
1329 case CPP_COLON: return c == ':' || c == '>';
1330 case CPP_DEREF: return c == '*';
1331 case CPP_DOT: return c == '.' || c == '%' || b == CPP_NUMBER;
1332 case CPP_HASH: return c == '#' || c == '%'; /* Digraph form. */
1333 case CPP_NAME: return ((b == CPP_NUMBER
1334 && name_p (pfile, &token2->val.str))
1335 || b == CPP_NAME
1336 || b == CPP_CHAR || b == CPP_STRING); /* L */
1337 case CPP_NUMBER: return (b == CPP_NUMBER || b == CPP_NAME
1338 || c == '.' || c == '+' || c == '-');
1339 /* UCNs */
1340 case CPP_OTHER: return ((token1->val.str.text[0] == '\\'
1341 && b == CPP_NAME)
1342 || (CPP_OPTION (pfile, objc)
1343 && token1->val.str.text[0] == '@'
1344 && (b == CPP_NAME || b == CPP_STRING)));
1345 default: break;
1348 return 0;
1351 /* Output all the remaining tokens on the current line, and a newline
1352 character, to FP. Leading whitespace is removed. If there are
1353 macros, special token padding is not performed. */
1354 void
1355 cpp_output_line (cpp_reader *pfile, FILE *fp)
1357 const cpp_token *token;
1359 token = cpp_get_token (pfile);
1360 while (token->type != CPP_EOF)
1362 cpp_output_token (token, fp);
1363 token = cpp_get_token (pfile);
1364 if (token->flags & PREV_WHITE)
1365 putc (' ', fp);
1368 putc ('\n', fp);
/* Memory buffers.  Changing these three constants can have a dramatic
   effect on performance.  The values here are reasonable defaults,
   but might be tuned.  If you adjust them, be sure to test across a
   range of uses of cpplib, including heavy nested function-like macro
   expansion.  Also check the change in peak memory usage (NJAMD is a
   good tool for this).

   MIN_BUFF_SIZE is the smallest payload new_buff will allocate.
   BUFF_SIZE_UPPER_BOUND (MIN_SIZE) is the largest free buffer that
   _cpp_get_buff will hand out for a request of MIN_SIZE bytes.
   EXTENDED_BUFF_SIZE (BUFF, MIN_EXTRA) is the size requested when
   extending BUFF: MIN_EXTRA plus twice the distance from BUFF's cur
   to its limit.  Every macro argument is parenthesized so that
   expression arguments associate as intended.  */
#define MIN_BUFF_SIZE 8000
#define BUFF_SIZE_UPPER_BOUND(MIN_SIZE) (MIN_BUFF_SIZE + (MIN_SIZE) * 3 / 2)
#define EXTENDED_BUFF_SIZE(BUFF, MIN_EXTRA) \
        ((MIN_EXTRA) + ((BUFF)->limit - (BUFF)->cur) * 2)

#if MIN_BUFF_SIZE > BUFF_SIZE_UPPER_BOUND (0)
#error BUFF_SIZE_UPPER_BOUND must be at least as large as MIN_BUFF_SIZE!
#endif
1386 /* Create a new allocation buffer. Place the control block at the end
1387 of the buffer, so that buffer overflows will cause immediate chaos. */
1388 static _cpp_buff *
1389 new_buff (size_t len)
1391 _cpp_buff *result;
1392 unsigned char *base;
1394 if (len < MIN_BUFF_SIZE)
1395 len = MIN_BUFF_SIZE;
1396 len = CPP_ALIGN (len);
1398 base = xmalloc (len + sizeof (_cpp_buff));
1399 result = (_cpp_buff *) (base + len);
1400 result->base = base;
1401 result->cur = base;
1402 result->limit = base + len;
1403 result->next = NULL;
1404 return result;
1407 /* Place a chain of unwanted allocation buffers on the free list. */
1408 void
1409 _cpp_release_buff (cpp_reader *pfile, _cpp_buff *buff)
1411 _cpp_buff *end = buff;
1413 while (end->next)
1414 end = end->next;
1415 end->next = pfile->free_buffs;
1416 pfile->free_buffs = buff;
1419 /* Return a free buffer of size at least MIN_SIZE. */
1420 _cpp_buff *
1421 _cpp_get_buff (cpp_reader *pfile, size_t min_size)
1423 _cpp_buff *result, **p;
1425 for (p = &pfile->free_buffs;; p = &(*p)->next)
1427 size_t size;
1429 if (*p == NULL)
1430 return new_buff (min_size);
1431 result = *p;
1432 size = result->limit - result->base;
1433 /* Return a buffer that's big enough, but don't waste one that's
1434 way too big. */
1435 if (size >= min_size && size <= BUFF_SIZE_UPPER_BOUND (min_size))
1436 break;
1439 *p = result->next;
1440 result->next = NULL;
1441 result->cur = result->base;
1442 return result;
1445 /* Creates a new buffer with enough space to hold the uncommitted
1446 remaining bytes of BUFF, and at least MIN_EXTRA more bytes. Copies
1447 the excess bytes to the new buffer. Chains the new buffer after
1448 BUFF, and returns the new buffer. */
1449 _cpp_buff *
1450 _cpp_append_extend_buff (cpp_reader *pfile, _cpp_buff *buff, size_t min_extra)
1452 size_t size = EXTENDED_BUFF_SIZE (buff, min_extra);
1453 _cpp_buff *new_buff = _cpp_get_buff (pfile, size);
1455 buff->next = new_buff;
1456 memcpy (new_buff->base, buff->cur, BUFF_ROOM (buff));
1457 return new_buff;
1460 /* Creates a new buffer with enough space to hold the uncommitted
1461 remaining bytes of the buffer pointed to by BUFF, and at least
1462 MIN_EXTRA more bytes. Copies the excess bytes to the new buffer.
1463 Chains the new buffer before the buffer pointed to by BUFF, and
1464 updates the pointer to point to the new buffer. */
1465 void
1466 _cpp_extend_buff (cpp_reader *pfile, _cpp_buff **pbuff, size_t min_extra)
1468 _cpp_buff *new_buff, *old_buff = *pbuff;
1469 size_t size = EXTENDED_BUFF_SIZE (old_buff, min_extra);
1471 new_buff = _cpp_get_buff (pfile, size);
1472 memcpy (new_buff->base, old_buff->cur, BUFF_ROOM (old_buff));
1473 new_buff->next = old_buff;
1474 *pbuff = new_buff;
1477 /* Free a chain of buffers starting at BUFF. */
1478 void
1479 _cpp_free_buff (_cpp_buff *buff)
1481 _cpp_buff *next;
1483 for (; buff; buff = next)
1485 next = buff->next;
1486 free (buff->base);
1490 /* Allocate permanent, unaligned storage of length LEN. */
1491 unsigned char *
1492 _cpp_unaligned_alloc (cpp_reader *pfile, size_t len)
1494 _cpp_buff *buff = pfile->u_buff;
1495 unsigned char *result = buff->cur;
1497 if (len > (size_t) (buff->limit - result))
1499 buff = _cpp_get_buff (pfile, len);
1500 buff->next = pfile->u_buff;
1501 pfile->u_buff = buff;
1502 result = buff->cur;
1505 buff->cur = result + len;
1506 return result;
1509 /* Allocate permanent, unaligned storage of length LEN from a_buff.
1510 That buffer is used for growing allocations when saving macro
1511 replacement lists in a #define, and when parsing an answer to an
1512 assertion in #assert, #unassert or #if (and therefore possibly
1513 whilst expanding macros). It therefore must not be used by any
1514 code that they might call: specifically the lexer and the guts of
1515 the macro expander.
1517 All existing other uses clearly fit this restriction: storing
1518 registered pragmas during initialization. */
1519 unsigned char *
1520 _cpp_aligned_alloc (cpp_reader *pfile, size_t len)
1522 _cpp_buff *buff = pfile->a_buff;
1523 unsigned char *result = buff->cur;
1525 if (len > (size_t) (buff->limit - result))
1527 buff = _cpp_get_buff (pfile, len);
1528 buff->next = pfile->a_buff;
1529 pfile->a_buff = buff;
1530 result = buff->cur;
1533 buff->cur = result + len;
1534 return result;