* cpplib.h (CPP_AT_NAME, CPP_OBJC_STRING): New token types.
[official-gcc.git] / gcc / cpplex.c
blobedb765dc61b47cb9466ea8a1b1ed8e7e4d38c46e
1 /* CPP Library - lexical analysis.
2 Copyright (C) 2000, 2001, 2002, 2003 Free Software Foundation, Inc.
3 Contributed by Per Bothner, 1994-95.
4 Based on CCCP program by Paul Rubin, June 1986
5 Adapted to ANSI C, Richard Stallman, Jan 1987
6 Broken out to separate file, Zack Weinberg, Mar 2000
8 This program is free software; you can redistribute it and/or modify it
9 under the terms of the GNU General Public License as published by the
10 Free Software Foundation; either version 2, or (at your option) any
11 later version.
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
18 You should have received a copy of the GNU General Public License
19 along with this program; if not, write to the Free Software
20 Foundation, 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "tm.h"
26 #include "cpplib.h"
27 #include "cpphash.h"
/* How the spelling of a token is obtained when it is written out.  */
enum spell_type
{
  SPELL_OPERATOR = 0,   /* Fixed text from the spelling table (or a
                           digraph / named-operator alternative).  */
  SPELL_IDENT,          /* Text of the identifier node in val.node.  */
  SPELL_LITERAL,        /* Text stored with the token in val.str.  */
  SPELL_NONE            /* The token has no spelling.  */
};
37 struct token_spelling
39 enum spell_type category;
40 const unsigned char *name;
43 static const unsigned char *const digraph_spellings[] =
44 { U"%:", U"%:%:", U"<:", U":>", U"<%", U"%>" };
46 #define OP(e, s) { SPELL_OPERATOR, U s },
47 #define TK(e, s) { s, U #e },
48 static const struct token_spelling token_spellings[N_TTYPES] = { TTYPE_TABLE };
49 #undef OP
50 #undef TK
52 #define TOKEN_SPELL(token) (token_spellings[(token)->type].category)
53 #define TOKEN_NAME(token) (token_spellings[(token)->type].name)
55 static void add_line_note (cpp_buffer *, const uchar *, unsigned int);
56 static int skip_line_comment (cpp_reader *);
57 static void skip_whitespace (cpp_reader *, cppchar_t);
58 static cpp_hashnode *lex_identifier (cpp_reader *, const uchar *);
59 static void lex_number (cpp_reader *, cpp_string *);
60 static bool forms_identifier_p (cpp_reader *, int);
61 static void lex_string (cpp_reader *, cpp_token *, const uchar *);
62 static void save_comment (cpp_reader *, cpp_token *, const uchar *, cppchar_t);
63 static void create_literal (cpp_reader *, cpp_token *, const uchar *,
64 unsigned int, enum cpp_ttype);
65 static bool warn_in_comment (cpp_reader *, _cpp_line_note *);
66 static int name_p (cpp_reader *, const cpp_string *);
67 static tokenrun *next_tokenrun (tokenrun *);
69 static _cpp_buff *new_buff (size_t);
72 /* Utility routine:
74 Compares, the token TOKEN to the NUL-terminated string STRING.
75 TOKEN must be a CPP_NAME. Returns 1 for equal, 0 for unequal. */
76 int
77 cpp_ideq (const cpp_token *token, const char *string)
79 if (token->type != CPP_NAME)
80 return 0;
82 return !ustrcmp (NODE_NAME (token->val.node), (const uchar *) string);
85 /* Record a note TYPE at byte POS into the current cleaned logical
86 line. */
87 static void
88 add_line_note (cpp_buffer *buffer, const uchar *pos, unsigned int type)
90 if (buffer->notes_used == buffer->notes_cap)
92 buffer->notes_cap = buffer->notes_cap * 2 + 200;
93 buffer->notes = (_cpp_line_note *)
94 xrealloc (buffer->notes, buffer->notes_cap * sizeof (_cpp_line_note));
97 buffer->notes[buffer->notes_used].pos = pos;
98 buffer->notes[buffer->notes_used].type = type;
99 buffer->notes_used++;
102 /* Returns with a logical line that contains no escaped newlines or
103 trigraphs. This is a time-critical inner loop. */
104 void
105 _cpp_clean_line (cpp_reader *pfile)
107 cpp_buffer *buffer;
108 const uchar *s;
109 uchar c, *d, *p;
111 buffer = pfile->buffer;
112 buffer->cur_note = buffer->notes_used = 0;
113 buffer->cur = buffer->line_base = buffer->next_line;
114 buffer->need_line = false;
115 s = buffer->next_line - 1;
117 if (!buffer->from_stage3)
119 d = (uchar *) s;
121 for (;;)
123 c = *++s;
124 *++d = c;
126 if (c == '\n' || c == '\r')
128 /* Handle DOS line endings. */
129 if (c == '\r' && s != buffer->rlimit && s[1] == '\n')
130 s++;
131 if (s == buffer->rlimit)
132 break;
134 /* Escaped? */
135 p = d;
136 while (p != buffer->next_line && is_nvspace (p[-1]))
137 p--;
138 if (p == buffer->next_line || p[-1] != '\\')
139 break;
141 add_line_note (buffer, p - 1, p != d ? ' ': '\\');
142 d = p - 2;
143 buffer->next_line = p - 1;
145 else if (c == '?' && s[1] == '?' && _cpp_trigraph_map[s[2]])
147 /* Add a note regardless, for the benefit of -Wtrigraphs. */
148 add_line_note (buffer, d, s[2]);
149 if (CPP_OPTION (pfile, trigraphs))
151 *d = _cpp_trigraph_map[s[2]];
152 s += 2;
157 else
160 s++;
161 while (*s != '\n' && *s != '\r');
162 d = (uchar *) s;
164 /* Handle DOS line endings. */
165 if (*s == '\r' && s != buffer->rlimit && s[1] == '\n')
166 s++;
169 *d = '\n';
170 /* A sentinel note that should never be processed. */
171 add_line_note (buffer, d + 1, '\n');
172 buffer->next_line = s + 1;
175 /* Return true if the trigraph indicated by NOTE should be warned
176 about in a comment. */
177 static bool
178 warn_in_comment (cpp_reader *pfile, _cpp_line_note *note)
180 const uchar *p;
182 /* Within comments we don't warn about trigraphs, unless the
183 trigraph forms an escaped newline, as that may change
184 behavior. */
185 if (note->type != '/')
186 return false;
188 /* If -trigraphs, then this was an escaped newline iff the next note
189 is coincident. */
190 if (CPP_OPTION (pfile, trigraphs))
191 return note[1].pos == note->pos;
193 /* Otherwise, see if this forms an escaped newline. */
194 p = note->pos + 3;
195 while (is_nvspace (*p))
196 p++;
198 /* There might have been escaped newlines between the trigraph and the
199 newline we found. Hence the position test. */
200 return (*p == '\n' && p < note[1].pos);
203 /* Process the notes created by add_line_note as far as the current
204 location. */
205 void
206 _cpp_process_line_notes (cpp_reader *pfile, int in_comment)
208 cpp_buffer *buffer = pfile->buffer;
210 for (;;)
212 _cpp_line_note *note = &buffer->notes[buffer->cur_note];
213 unsigned int col;
215 if (note->pos > buffer->cur)
216 break;
218 buffer->cur_note++;
219 col = CPP_BUF_COLUMN (buffer, note->pos + 1);
221 if (note->type == '\\' || note->type == ' ')
223 if (note->type == ' ' && !in_comment)
224 cpp_error_with_line (pfile, DL_WARNING, pfile->line, col,
225 "backslash and newline separated by space");
227 if (buffer->next_line > buffer->rlimit)
229 cpp_error_with_line (pfile, DL_PEDWARN, pfile->line, col,
230 "backslash-newline at end of file");
231 /* Prevent "no newline at end of file" warning. */
232 buffer->next_line = buffer->rlimit;
235 buffer->line_base = note->pos;
236 pfile->line++;
238 else if (_cpp_trigraph_map[note->type])
240 if (CPP_OPTION (pfile, warn_trigraphs)
241 && (!in_comment || warn_in_comment (pfile, note)))
243 if (CPP_OPTION (pfile, trigraphs))
244 cpp_error_with_line (pfile, DL_WARNING, pfile->line, col,
245 "trigraph ??%c converted to %c",
246 note->type,
247 (int) _cpp_trigraph_map[note->type]);
248 else
249 cpp_error_with_line (pfile, DL_WARNING, pfile->line, col,
250 "trigraph ??%c ignored",
251 note->type);
254 else
255 abort ();
259 /* Skip a C-style block comment. We find the end of the comment by
260 seeing if an asterisk is before every '/' we encounter. Returns
261 nonzero if comment terminated by EOF, zero otherwise.
263 Buffer->cur points to the initial asterisk of the comment. */
264 bool
265 _cpp_skip_block_comment (cpp_reader *pfile)
267 cpp_buffer *buffer = pfile->buffer;
268 cppchar_t c;
270 buffer->cur++;
271 if (*buffer->cur == '/')
272 buffer->cur++;
274 for (;;)
276 c = *buffer->cur++;
278 /* People like decorating comments with '*', so check for '/'
279 instead for efficiency. */
280 if (c == '/')
282 if (buffer->cur[-2] == '*')
283 break;
285 /* Warn about potential nested comments, but not if the '/'
286 comes immediately before the true comment delimiter.
287 Don't bother to get it right across escaped newlines. */
288 if (CPP_OPTION (pfile, warn_comments)
289 && buffer->cur[0] == '*' && buffer->cur[1] != '/')
290 cpp_error_with_line (pfile, DL_WARNING,
291 pfile->line, CPP_BUF_COL (buffer),
292 "\"/*\" within comment");
294 else if (c == '\n')
296 buffer->cur--;
297 _cpp_process_line_notes (pfile, true);
298 if (buffer->next_line >= buffer->rlimit)
299 return true;
300 _cpp_clean_line (pfile);
301 pfile->line++;
305 _cpp_process_line_notes (pfile, true);
306 return false;
309 /* Skip a C++ line comment, leaving buffer->cur pointing to the
310 terminating newline. Handles escaped newlines. Returns nonzero
311 if a multiline comment. */
312 static int
313 skip_line_comment (cpp_reader *pfile)
315 cpp_buffer *buffer = pfile->buffer;
316 unsigned int orig_line = pfile->line;
318 while (*buffer->cur != '\n')
319 buffer->cur++;
321 _cpp_process_line_notes (pfile, true);
322 return orig_line != pfile->line;
325 /* Skips whitespace, saving the next non-whitespace character. */
326 static void
327 skip_whitespace (cpp_reader *pfile, cppchar_t c)
329 cpp_buffer *buffer = pfile->buffer;
330 bool saw_NUL = false;
334 /* Horizontal space always OK. */
335 if (c == ' ' || c == '\t')
337 /* Just \f \v or \0 left. */
338 else if (c == '\0')
339 saw_NUL = true;
340 else if (pfile->state.in_directive && CPP_PEDANTIC (pfile))
341 cpp_error_with_line (pfile, DL_PEDWARN, pfile->line,
342 CPP_BUF_COL (buffer),
343 "%s in preprocessing directive",
344 c == '\f' ? "form feed" : "vertical tab");
346 c = *buffer->cur++;
348 /* We only want non-vertical space, i.e. ' ' \t \f \v \0. */
349 while (is_nvspace (c));
351 if (saw_NUL)
352 cpp_error (pfile, DL_WARNING, "null character(s) ignored");
354 buffer->cur--;
357 /* See if the characters of a number token are valid in a name (no
358 '.', '+' or '-'). */
359 static int
360 name_p (cpp_reader *pfile, const cpp_string *string)
362 unsigned int i;
364 for (i = 0; i < string->len; i++)
365 if (!is_idchar (string->text[i]))
366 return 0;
368 return 1;
371 /* Returns TRUE if the sequence starting at buffer->cur is invalid in
372 an identifier. FIRST is TRUE if this starts an identifier. */
373 static bool
374 forms_identifier_p (cpp_reader *pfile, int first)
376 cpp_buffer *buffer = pfile->buffer;
378 if (*buffer->cur == '$')
380 if (!CPP_OPTION (pfile, dollars_in_ident))
381 return false;
383 buffer->cur++;
384 if (CPP_OPTION (pfile, warn_dollars) && !pfile->state.skipping)
386 CPP_OPTION (pfile, warn_dollars) = 0;
387 cpp_error (pfile, DL_PEDWARN, "'$' in identifier or number");
390 return true;
393 /* Is this a syntactically valid UCN? */
394 if (0 && *buffer->cur == '\\'
395 && (buffer->cur[1] == 'u' || buffer->cur[1] == 'U'))
397 buffer->cur += 2;
398 if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first))
399 return true;
400 buffer->cur -= 2;
403 return false;
406 /* Lex an identifier starting at BUFFER->CUR - 1. */
407 static cpp_hashnode *
408 lex_identifier (cpp_reader *pfile, const uchar *base)
410 cpp_hashnode *result;
411 const uchar *cur;
415 cur = pfile->buffer->cur;
417 /* N.B. ISIDNUM does not include $. */
418 while (ISIDNUM (*cur))
419 cur++;
421 pfile->buffer->cur = cur;
423 while (forms_identifier_p (pfile, false));
425 result = (cpp_hashnode *)
426 ht_lookup (pfile->hash_table, base, cur - base, HT_ALLOC);
428 /* Rarely, identifiers require diagnostics when lexed. */
429 if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
430 && !pfile->state.skipping, 0))
432 /* It is allowed to poison the same identifier twice. */
433 if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
434 cpp_error (pfile, DL_ERROR, "attempt to use poisoned \"%s\"",
435 NODE_NAME (result));
437 /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
438 replacement list of a variadic macro. */
439 if (result == pfile->spec_nodes.n__VA_ARGS__
440 && !pfile->state.va_args_ok)
441 cpp_error (pfile, DL_PEDWARN,
442 "__VA_ARGS__ can only appear in the expansion"
443 " of a C99 variadic macro");
446 return result;
449 /* Lex a number to NUMBER starting at BUFFER->CUR - 1. */
450 static void
451 lex_number (cpp_reader *pfile, cpp_string *number)
453 const uchar *cur;
454 const uchar *base;
455 uchar *dest;
457 base = pfile->buffer->cur - 1;
460 cur = pfile->buffer->cur;
462 /* N.B. ISIDNUM does not include $. */
463 while (ISIDNUM (*cur) || *cur == '.' || VALID_SIGN (*cur, cur[-1]))
464 cur++;
466 pfile->buffer->cur = cur;
468 while (forms_identifier_p (pfile, false));
470 number->len = cur - base;
471 dest = _cpp_unaligned_alloc (pfile, number->len + 1);
472 memcpy (dest, base, number->len);
473 dest[number->len] = '\0';
474 number->text = dest;
477 /* Create a token of type TYPE with a literal spelling. */
478 static void
479 create_literal (cpp_reader *pfile, cpp_token *token, const uchar *base,
480 unsigned int len, enum cpp_ttype type)
482 uchar *dest = _cpp_unaligned_alloc (pfile, len + 1);
484 memcpy (dest, base, len);
485 dest[len] = '\0';
486 token->type = type;
487 token->val.str.len = len;
488 token->val.str.text = dest;
491 /* Lexes a string, character constant, or angle-bracketed header file
492 name. The stored string contains the spelling, including opening
493 quote and leading any leading 'L'. It returns the type of the
494 literal, or CPP_OTHER if it was not properly terminated.
496 The spelling is NUL-terminated, but it is not guaranteed that this
497 is the first NUL since embedded NULs are preserved. */
498 static void
499 lex_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
501 bool saw_NUL = false;
502 const uchar *cur;
503 cppchar_t terminator;
504 enum cpp_ttype type;
506 cur = base;
507 terminator = *cur++;
508 if (terminator == 'L')
509 terminator = *cur++;
510 if (terminator == '\"')
511 type = *base == 'L' ? CPP_WSTRING: CPP_STRING;
512 else if (terminator == '\'')
513 type = *base == 'L' ? CPP_WCHAR: CPP_CHAR;
514 else
515 terminator = '>', type = CPP_HEADER_NAME;
517 for (;;)
519 cppchar_t c = *cur++;
521 /* In #include-style directives, terminators are not escapable. */
522 if (c == '\\' && !pfile->state.angled_headers && *cur != '\n')
523 cur++;
524 else if (c == terminator)
525 break;
526 else if (c == '\n')
528 cur--;
529 type = CPP_OTHER;
530 break;
532 else if (c == '\0')
533 saw_NUL = true;
536 if (saw_NUL && !pfile->state.skipping)
537 cpp_error (pfile, DL_WARNING, "null character(s) preserved in literal");
539 pfile->buffer->cur = cur;
540 create_literal (pfile, token, base, cur - base, type);
543 /* The stored comment includes the comment start and any terminator. */
544 static void
545 save_comment (cpp_reader *pfile, cpp_token *token, const unsigned char *from,
546 cppchar_t type)
548 unsigned char *buffer;
549 unsigned int len, clen;
551 len = pfile->buffer->cur - from + 1; /* + 1 for the initial '/'. */
553 /* C++ comments probably (not definitely) have moved past a new
554 line, which we don't want to save in the comment. */
555 if (is_vspace (pfile->buffer->cur[-1]))
556 len--;
558 /* If we are currently in a directive, then we need to store all
559 C++ comments as C comments internally, and so we need to
560 allocate a little extra space in that case.
562 Note that the only time we encounter a directive here is
563 when we are saving comments in a "#define". */
564 clen = (pfile->state.in_directive && type == '/') ? len + 2 : len;
566 buffer = _cpp_unaligned_alloc (pfile, clen);
568 token->type = CPP_COMMENT;
569 token->val.str.len = clen;
570 token->val.str.text = buffer;
572 buffer[0] = '/';
573 memcpy (buffer + 1, from, len - 1);
575 /* Finish conversion to a C comment, if necessary. */
576 if (pfile->state.in_directive && type == '/')
578 buffer[1] = '*';
579 buffer[clen - 2] = '*';
580 buffer[clen - 1] = '/';
584 /* Allocate COUNT tokens for RUN. */
585 void
586 _cpp_init_tokenrun (tokenrun *run, unsigned int count)
588 run->base = xnewvec (cpp_token, count);
589 run->limit = run->base + count;
590 run->next = NULL;
593 /* Returns the next tokenrun, or creates one if there is none. */
594 static tokenrun *
595 next_tokenrun (tokenrun *run)
597 if (run->next == NULL)
599 run->next = xnew (tokenrun);
600 run->next->prev = run;
601 _cpp_init_tokenrun (run->next, 250);
604 return run->next;
607 /* Allocate a single token that is invalidated at the same time as the
608 rest of the tokens on the line. Has its line and col set to the
609 same as the last lexed token, so that diagnostics appear in the
610 right place. */
611 cpp_token *
612 _cpp_temp_token (cpp_reader *pfile)
614 cpp_token *old, *result;
616 old = pfile->cur_token - 1;
617 if (pfile->cur_token == pfile->cur_run->limit)
619 pfile->cur_run = next_tokenrun (pfile->cur_run);
620 pfile->cur_token = pfile->cur_run->base;
623 result = pfile->cur_token++;
624 result->line = old->line;
625 result->col = old->col;
626 return result;
629 /* Lex a token into RESULT (external interface). Takes care of issues
630 like directive handling, token lookahead, multiple include
631 optimization and skipping. */
632 const cpp_token *
633 _cpp_lex_token (cpp_reader *pfile)
635 cpp_token *result;
637 for (;;)
639 if (pfile->cur_token == pfile->cur_run->limit)
641 pfile->cur_run = next_tokenrun (pfile->cur_run);
642 pfile->cur_token = pfile->cur_run->base;
645 if (pfile->lookaheads)
647 pfile->lookaheads--;
648 result = pfile->cur_token++;
650 else
651 result = _cpp_lex_direct (pfile);
653 if (result->flags & BOL)
655 /* Is this a directive. If _cpp_handle_directive returns
656 false, it is an assembler #. */
657 if (result->type == CPP_HASH
658 /* 6.10.3 p 11: Directives in a list of macro arguments
659 gives undefined behavior. This implementation
660 handles the directive as normal. */
661 && pfile->state.parsing_args != 1
662 && _cpp_handle_directive (pfile, result->flags & PREV_WHITE))
663 continue;
664 if (pfile->cb.line_change && !pfile->state.skipping)
665 pfile->cb.line_change (pfile, result, pfile->state.parsing_args);
668 /* We don't skip tokens in directives. */
669 if (pfile->state.in_directive)
670 break;
672 /* Outside a directive, invalidate controlling macros. At file
673 EOF, _cpp_lex_direct takes care of popping the buffer, so we never
674 get here and MI optimization works. */
675 pfile->mi_valid = false;
677 if (!pfile->state.skipping || result->type == CPP_EOF)
678 break;
681 return result;
684 /* Returns true if a fresh line has been loaded. */
685 bool
686 _cpp_get_fresh_line (cpp_reader *pfile)
688 /* We can't get a new line until we leave the current directive. */
689 if (pfile->state.in_directive)
690 return false;
692 for (;;)
694 cpp_buffer *buffer = pfile->buffer;
696 if (!buffer->need_line)
697 return true;
699 if (buffer->next_line < buffer->rlimit)
701 _cpp_clean_line (pfile);
702 return true;
705 /* First, get out of parsing arguments state. */
706 if (pfile->state.parsing_args)
707 return false;
709 /* End of buffer. Non-empty files should end in a newline. */
710 if (buffer->buf != buffer->rlimit
711 && buffer->next_line > buffer->rlimit
712 && !buffer->from_stage3)
714 /* Only warn once. */
715 buffer->next_line = buffer->rlimit;
716 cpp_error_with_line (pfile, DL_PEDWARN, pfile->line - 1,
717 CPP_BUF_COLUMN (buffer, buffer->cur),
718 "no newline at end of file");
721 if (!buffer->prev)
722 return false;
724 if (buffer->return_at_eof)
726 _cpp_pop_buffer (pfile);
727 return false;
730 _cpp_pop_buffer (pfile);
/* Helper for _cpp_lex_direct; relies on its locals BUFFER and RESULT.
   If the next character is CHAR, consume it and make the token
   THEN_TYPE; otherwise make the token ELSE_TYPE.  */
#define IF_NEXT_IS(CHAR, THEN_TYPE, ELSE_TYPE)          \
  do                                                    \
    {                                                   \
      if (*buffer->cur == (CHAR))                       \
        {                                               \
          buffer->cur++;                                \
          result->type = (THEN_TYPE);                   \
        }                                               \
      else                                              \
        result->type = (ELSE_TYPE);                     \
    }                                                   \
  while (0)
743 /* Lex a token into pfile->cur_token, which is also incremented, to
744 get diagnostics pointing to the correct location.
746 Does not handle issues such as token lookahead, multiple-include
747 optimization, directives, skipping etc. This function is only
748 suitable for use by _cpp_lex_token, and in special cases like
749 lex_expansion_token which doesn't care for any of these issues.
751 When meeting a newline, returns CPP_EOF if parsing a directive,
752 otherwise returns to the start of the token buffer if permissible.
753 Returns the location of the lexed token. */
754 cpp_token *
755 _cpp_lex_direct (cpp_reader *pfile)
757 cppchar_t c;
758 cpp_buffer *buffer;
759 const unsigned char *comment_start;
760 cpp_token *result = pfile->cur_token++;
762 fresh_line:
763 result->flags = 0;
764 if (pfile->buffer->need_line)
766 if (!_cpp_get_fresh_line (pfile))
768 result->type = CPP_EOF;
769 if (!pfile->state.in_directive)
771 /* Tell the compiler the line number of the EOF token. */
772 result->line = pfile->line;
773 result->flags = BOL;
775 return result;
777 if (!pfile->keep_tokens)
779 pfile->cur_run = &pfile->base_run;
780 result = pfile->base_run.base;
781 pfile->cur_token = result + 1;
783 result->flags = BOL;
784 if (pfile->state.parsing_args == 2)
785 result->flags |= PREV_WHITE;
787 buffer = pfile->buffer;
788 update_tokens_line:
789 result->line = pfile->line;
791 skipped_white:
792 if (buffer->cur >= buffer->notes[buffer->cur_note].pos
793 && !pfile->overlaid_buffer)
795 _cpp_process_line_notes (pfile, false);
796 result->line = pfile->line;
798 c = *buffer->cur++;
799 result->col = CPP_BUF_COLUMN (buffer, buffer->cur);
801 switch (c)
803 case ' ': case '\t': case '\f': case '\v': case '\0':
804 result->flags |= PREV_WHITE;
805 skip_whitespace (pfile, c);
806 goto skipped_white;
808 case '\n':
809 pfile->line++;
810 buffer->need_line = true;
811 goto fresh_line;
813 case '0': case '1': case '2': case '3': case '4':
814 case '5': case '6': case '7': case '8': case '9':
815 result->type = CPP_NUMBER;
816 lex_number (pfile, &result->val.str);
817 break;
819 case 'L':
820 /* 'L' may introduce wide characters or strings. */
821 if (*buffer->cur == '\'' || *buffer->cur == '"')
823 lex_string (pfile, result, buffer->cur - 1);
824 break;
826 /* Fall through. */
828 case '_':
829 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
830 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
831 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
832 case 's': case 't': case 'u': case 'v': case 'w': case 'x':
833 case 'y': case 'z':
834 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
835 case 'G': case 'H': case 'I': case 'J': case 'K':
836 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
837 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
838 case 'Y': case 'Z':
839 result->type = CPP_NAME;
840 result->val.node = lex_identifier (pfile, buffer->cur - 1);
842 /* Convert named operators to their proper types. */
843 if (result->val.node->flags & NODE_OPERATOR)
845 result->flags |= NAMED_OP;
846 result->type = result->val.node->directive_index;
848 break;
850 case '\'':
851 case '"':
852 lex_string (pfile, result, buffer->cur - 1);
853 break;
855 case '/':
856 /* A potential block or line comment. */
857 comment_start = buffer->cur;
858 c = *buffer->cur;
860 if (c == '*')
862 if (_cpp_skip_block_comment (pfile))
863 cpp_error (pfile, DL_ERROR, "unterminated comment");
865 else if (c == '/' && (CPP_OPTION (pfile, cplusplus_comments)
866 || CPP_IN_SYSTEM_HEADER (pfile)))
868 /* Warn about comments only if pedantically GNUC89, and not
869 in system headers. */
870 if (CPP_OPTION (pfile, lang) == CLK_GNUC89 && CPP_PEDANTIC (pfile)
871 && ! buffer->warned_cplusplus_comments)
873 cpp_error (pfile, DL_PEDWARN,
874 "C++ style comments are not allowed in ISO C90");
875 cpp_error (pfile, DL_PEDWARN,
876 "(this will be reported only once per input file)");
877 buffer->warned_cplusplus_comments = 1;
880 if (skip_line_comment (pfile) && CPP_OPTION (pfile, warn_comments))
881 cpp_error (pfile, DL_WARNING, "multi-line comment");
883 else if (c == '=')
885 buffer->cur++;
886 result->type = CPP_DIV_EQ;
887 break;
889 else
891 result->type = CPP_DIV;
892 break;
895 if (!pfile->state.save_comments)
897 result->flags |= PREV_WHITE;
898 goto update_tokens_line;
901 /* Save the comment as a token in its own right. */
902 save_comment (pfile, result, comment_start, c);
903 break;
905 case '<':
906 if (pfile->state.angled_headers)
908 lex_string (pfile, result, buffer->cur - 1);
909 break;
912 result->type = CPP_LESS;
913 if (*buffer->cur == '=')
914 buffer->cur++, result->type = CPP_LESS_EQ;
915 else if (*buffer->cur == '<')
917 buffer->cur++;
918 IF_NEXT_IS ('=', CPP_LSHIFT_EQ, CPP_LSHIFT);
920 else if (*buffer->cur == '?' && CPP_OPTION (pfile, cplusplus))
922 buffer->cur++;
923 IF_NEXT_IS ('=', CPP_MIN_EQ, CPP_MIN);
925 else if (CPP_OPTION (pfile, digraphs))
927 if (*buffer->cur == ':')
929 buffer->cur++;
930 result->flags |= DIGRAPH;
931 result->type = CPP_OPEN_SQUARE;
933 else if (*buffer->cur == '%')
935 buffer->cur++;
936 result->flags |= DIGRAPH;
937 result->type = CPP_OPEN_BRACE;
940 break;
942 case '>':
943 result->type = CPP_GREATER;
944 if (*buffer->cur == '=')
945 buffer->cur++, result->type = CPP_GREATER_EQ;
946 else if (*buffer->cur == '>')
948 buffer->cur++;
949 IF_NEXT_IS ('=', CPP_RSHIFT_EQ, CPP_RSHIFT);
951 else if (*buffer->cur == '?' && CPP_OPTION (pfile, cplusplus))
953 buffer->cur++;
954 IF_NEXT_IS ('=', CPP_MAX_EQ, CPP_MAX);
956 break;
958 case '%':
959 result->type = CPP_MOD;
960 if (*buffer->cur == '=')
961 buffer->cur++, result->type = CPP_MOD_EQ;
962 else if (CPP_OPTION (pfile, digraphs))
964 if (*buffer->cur == ':')
966 buffer->cur++;
967 result->flags |= DIGRAPH;
968 result->type = CPP_HASH;
969 if (*buffer->cur == '%' && buffer->cur[1] == ':')
970 buffer->cur += 2, result->type = CPP_PASTE;
972 else if (*buffer->cur == '>')
974 buffer->cur++;
975 result->flags |= DIGRAPH;
976 result->type = CPP_CLOSE_BRACE;
979 break;
981 case '.':
982 result->type = CPP_DOT;
983 if (ISDIGIT (*buffer->cur))
985 result->type = CPP_NUMBER;
986 lex_number (pfile, &result->val.str);
988 else if (*buffer->cur == '.' && buffer->cur[1] == '.')
989 buffer->cur += 2, result->type = CPP_ELLIPSIS;
990 else if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
991 buffer->cur++, result->type = CPP_DOT_STAR;
992 break;
994 case '+':
995 result->type = CPP_PLUS;
996 if (*buffer->cur == '+')
997 buffer->cur++, result->type = CPP_PLUS_PLUS;
998 else if (*buffer->cur == '=')
999 buffer->cur++, result->type = CPP_PLUS_EQ;
1000 break;
1002 case '-':
1003 result->type = CPP_MINUS;
1004 if (*buffer->cur == '>')
1006 buffer->cur++;
1007 result->type = CPP_DEREF;
1008 if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
1009 buffer->cur++, result->type = CPP_DEREF_STAR;
1011 else if (*buffer->cur == '-')
1012 buffer->cur++, result->type = CPP_MINUS_MINUS;
1013 else if (*buffer->cur == '=')
1014 buffer->cur++, result->type = CPP_MINUS_EQ;
1015 break;
1017 case '&':
1018 result->type = CPP_AND;
1019 if (*buffer->cur == '&')
1020 buffer->cur++, result->type = CPP_AND_AND;
1021 else if (*buffer->cur == '=')
1022 buffer->cur++, result->type = CPP_AND_EQ;
1023 break;
1025 case '|':
1026 result->type = CPP_OR;
1027 if (*buffer->cur == '|')
1028 buffer->cur++, result->type = CPP_OR_OR;
1029 else if (*buffer->cur == '=')
1030 buffer->cur++, result->type = CPP_OR_EQ;
1031 break;
1033 case ':':
1034 result->type = CPP_COLON;
1035 if (*buffer->cur == ':' && CPP_OPTION (pfile, cplusplus))
1036 buffer->cur++, result->type = CPP_SCOPE;
1037 else if (*buffer->cur == '>' && CPP_OPTION (pfile, digraphs))
1039 buffer->cur++;
1040 result->flags |= DIGRAPH;
1041 result->type = CPP_CLOSE_SQUARE;
1043 break;
1045 case '*': IF_NEXT_IS ('=', CPP_MULT_EQ, CPP_MULT); break;
1046 case '=': IF_NEXT_IS ('=', CPP_EQ_EQ, CPP_EQ); break;
1047 case '!': IF_NEXT_IS ('=', CPP_NOT_EQ, CPP_NOT); break;
1048 case '^': IF_NEXT_IS ('=', CPP_XOR_EQ, CPP_XOR); break;
1049 case '#': IF_NEXT_IS ('#', CPP_PASTE, CPP_HASH); break;
1051 case '?': result->type = CPP_QUERY; break;
1052 case '~': result->type = CPP_COMPL; break;
1053 case ',': result->type = CPP_COMMA; break;
1054 case '(': result->type = CPP_OPEN_PAREN; break;
1055 case ')': result->type = CPP_CLOSE_PAREN; break;
1056 case '[': result->type = CPP_OPEN_SQUARE; break;
1057 case ']': result->type = CPP_CLOSE_SQUARE; break;
1058 case '{': result->type = CPP_OPEN_BRACE; break;
1059 case '}': result->type = CPP_CLOSE_BRACE; break;
1060 case ';': result->type = CPP_SEMICOLON; break;
1062 /* @ is a punctuator in Objective-C. */
1063 case '@': result->type = CPP_ATSIGN; break;
1065 case '$':
1066 case '\\':
1068 const uchar *base = --buffer->cur;
1070 if (forms_identifier_p (pfile, true))
1072 result->type = CPP_NAME;
1073 result->val.node = lex_identifier (pfile, base);
1074 break;
1076 buffer->cur++;
1079 default:
1080 create_literal (pfile, result, buffer->cur - 1, 1, CPP_OTHER);
1081 break;
1084 return result;
1087 /* An upper bound on the number of bytes needed to spell TOKEN.
1088 Does not include preceding whitespace. */
1089 unsigned int
1090 cpp_token_len (const cpp_token *token)
1092 unsigned int len;
1094 switch (TOKEN_SPELL (token))
1096 default: len = 4; break;
1097 case SPELL_LITERAL: len = token->val.str.len; break;
1098 case SPELL_IDENT: len = NODE_LEN (token->val.node); break;
1101 return len;
1104 /* Write the spelling of a token TOKEN to BUFFER. The buffer must
1105 already contain the enough space to hold the token's spelling.
1106 Returns a pointer to the character after the last character written.
1107 FIXME: Would be nice if we didn't need the PFILE argument. */
1108 unsigned char *
1109 cpp_spell_token (cpp_reader *pfile, const cpp_token *token,
1110 unsigned char *buffer)
1112 switch (TOKEN_SPELL (token))
1114 case SPELL_OPERATOR:
1116 const unsigned char *spelling;
1117 unsigned char c;
1119 if (token->flags & DIGRAPH)
1120 spelling
1121 = digraph_spellings[(int) token->type - (int) CPP_FIRST_DIGRAPH];
1122 else if (token->flags & NAMED_OP)
1123 goto spell_ident;
1124 else
1125 spelling = TOKEN_NAME (token);
1127 while ((c = *spelling++) != '\0')
1128 *buffer++ = c;
1130 break;
1132 spell_ident:
1133 case SPELL_IDENT:
1134 memcpy (buffer, NODE_NAME (token->val.node), NODE_LEN (token->val.node));
1135 buffer += NODE_LEN (token->val.node);
1136 break;
1138 case SPELL_LITERAL:
1139 memcpy (buffer, token->val.str.text, token->val.str.len);
1140 buffer += token->val.str.len;
1141 break;
1143 case SPELL_NONE:
1144 cpp_error (pfile, DL_ICE, "unspellable token %s", TOKEN_NAME (token));
1145 break;
1148 return buffer;
1151 /* Returns TOKEN spelt as a null-terminated string. The string is
1152 freed when the reader is destroyed. Useful for diagnostics. */
1153 unsigned char *
1154 cpp_token_as_text (cpp_reader *pfile, const cpp_token *token)
1156 unsigned int len = cpp_token_len (token) + 1;
1157 unsigned char *start = _cpp_unaligned_alloc (pfile, len), *end;
1159 end = cpp_spell_token (pfile, token, start);
1160 end[0] = '\0';
1162 return start;
1165 /* Used by C front ends, which really should move to using
1166 cpp_token_as_text. */
1167 const char *
1168 cpp_type2name (enum cpp_ttype type)
1170 return (const char *) token_spellings[type].name;
1173 /* Writes the spelling of token to FP, without any preceding space.
1174 Separated from cpp_spell_token for efficiency - to avoid stdio
1175 double-buffering. */
1176 void
1177 cpp_output_token (const cpp_token *token, FILE *fp)
1179 switch (TOKEN_SPELL (token))
1181 case SPELL_OPERATOR:
1183 const unsigned char *spelling;
1184 int c;
1186 if (token->flags & DIGRAPH)
1187 spelling
1188 = digraph_spellings[(int) token->type - (int) CPP_FIRST_DIGRAPH];
1189 else if (token->flags & NAMED_OP)
1190 goto spell_ident;
1191 else
1192 spelling = TOKEN_NAME (token);
1194 c = *spelling;
1196 putc (c, fp);
1197 while ((c = *++spelling) != '\0');
1199 break;
1201 spell_ident:
1202 case SPELL_IDENT:
1203 fwrite (NODE_NAME (token->val.node), 1, NODE_LEN (token->val.node), fp);
1204 break;
1206 case SPELL_LITERAL:
1207 fwrite (token->val.str.text, 1, token->val.str.len, fp);
1208 break;
1210 case SPELL_NONE:
1211 /* An error, most probably. */
1212 break;
1216 /* Compare two tokens. */
1218 _cpp_equiv_tokens (const cpp_token *a, const cpp_token *b)
1220 if (a->type == b->type && a->flags == b->flags)
1221 switch (TOKEN_SPELL (a))
1223 default: /* Keep compiler happy. */
1224 case SPELL_OPERATOR:
1225 return 1;
1226 case SPELL_NONE:
1227 return (a->type != CPP_MACRO_ARG || a->val.arg_no == b->val.arg_no);
1228 case SPELL_IDENT:
1229 return a->val.node == b->val.node;
1230 case SPELL_LITERAL:
1231 return (a->val.str.len == b->val.str.len
1232 && !memcmp (a->val.str.text, b->val.str.text,
1233 a->val.str.len));
1236 return 0;
1239 /* Returns nonzero if a space should be inserted to avoid an
1240 accidental token paste for output. For simplicity, it is
1241 conservative, and occasionally advises a space where one is not
1242 needed, e.g. "." and ".2". */
1244 cpp_avoid_paste (cpp_reader *pfile, const cpp_token *token1,
1245 const cpp_token *token2)
1247 enum cpp_ttype a = token1->type, b = token2->type;
1248 cppchar_t c;
1250 if (token1->flags & NAMED_OP)
1251 a = CPP_NAME;
1252 if (token2->flags & NAMED_OP)
1253 b = CPP_NAME;
1255 c = EOF;
1256 if (token2->flags & DIGRAPH)
1257 c = digraph_spellings[(int) b - (int) CPP_FIRST_DIGRAPH][0];
1258 else if (token_spellings[b].category == SPELL_OPERATOR)
1259 c = token_spellings[b].name[0];
1261 /* Quickly get everything that can paste with an '='. */
1262 if ((int) a <= (int) CPP_LAST_EQ && c == '=')
1263 return 1;
1265 switch (a)
1267 case CPP_GREATER: return c == '>' || c == '?';
1268 case CPP_LESS: return c == '<' || c == '?' || c == '%' || c == ':';
1269 case CPP_PLUS: return c == '+';
1270 case CPP_MINUS: return c == '-' || c == '>';
1271 case CPP_DIV: return c == '/' || c == '*'; /* Comments. */
1272 case CPP_MOD: return c == ':' || c == '>';
1273 case CPP_AND: return c == '&';
1274 case CPP_OR: return c == '|';
1275 case CPP_COLON: return c == ':' || c == '>';
1276 case CPP_DEREF: return c == '*';
1277 case CPP_DOT: return c == '.' || c == '%' || b == CPP_NUMBER;
1278 case CPP_HASH: return c == '#' || c == '%'; /* Digraph form. */
1279 case CPP_NAME: return ((b == CPP_NUMBER
1280 && name_p (pfile, &token2->val.str))
1281 || b == CPP_NAME
1282 || b == CPP_CHAR || b == CPP_STRING); /* L */
1283 case CPP_NUMBER: return (b == CPP_NUMBER || b == CPP_NAME
1284 || c == '.' || c == '+' || c == '-');
1285 /* UCNs */
1286 case CPP_OTHER: return ((token1->val.str.text[0] == '\\'
1287 && b == CPP_NAME)
1288 || (CPP_OPTION (pfile, objc)
1289 && token1->val.str.text[0] == '@'
1290 && (b == CPP_NAME || b == CPP_STRING)));
1291 default: break;
1294 return 0;
1297 /* Output all the remaining tokens on the current line, and a newline
1298 character, to FP. Leading whitespace is removed. If there are
1299 macros, special token padding is not performed. */
1300 void
1301 cpp_output_line (cpp_reader *pfile, FILE *fp)
1303 const cpp_token *token;
1305 token = cpp_get_token (pfile);
1306 while (token->type != CPP_EOF)
1308 cpp_output_token (token, fp);
1309 token = cpp_get_token (pfile);
1310 if (token->flags & PREV_WHITE)
1311 putc (' ', fp);
1314 putc ('\n', fp);
/* Memory buffers.  Changing these three constants can have a dramatic
   effect on performance.  The values here are reasonable defaults,
   but might be tuned.  If you adjust them, be sure to test across a
   range of uses of cpplib, including heavy nested function-like macro
   expansion.  Also check the change in peak memory usage (NJAMD is a
   good tool for this).

   MIN_BUFF_SIZE is the smallest buffer size used.
   BUFF_SIZE_UPPER_BOUND (MIN_SIZE) is the largest buffer size
   accepted for a request of MIN_SIZE bytes.
   EXTENDED_BUFF_SIZE (BUFF, MIN_EXTRA) is the size requested when
   extending BUFF: MIN_EXTRA plus twice the distance from BUFF->cur
   to BUFF->limit.

   These stay macros because MIN_BUFF_SIZE and BUFF_SIZE_UPPER_BOUND
   are evaluated by the preprocessor in the sanity check below.  Every
   macro argument is parenthesized so that an argument containing a
   low-precedence operator is evaluated as a unit.  */
#define MIN_BUFF_SIZE 8000
#define BUFF_SIZE_UPPER_BOUND(MIN_SIZE) (MIN_BUFF_SIZE + (MIN_SIZE) * 3 / 2)
#define EXTENDED_BUFF_SIZE(BUFF, MIN_EXTRA) \
        ((MIN_EXTRA) + ((BUFF)->limit - (BUFF)->cur) * 2)

#if MIN_BUFF_SIZE > BUFF_SIZE_UPPER_BOUND (0)
#error BUFF_SIZE_UPPER_BOUND must be at least as large as MIN_BUFF_SIZE!
#endif
1332 /* Create a new allocation buffer. Place the control block at the end
1333 of the buffer, so that buffer overflows will cause immediate chaos. */
1334 static _cpp_buff *
1335 new_buff (size_t len)
1337 _cpp_buff *result;
1338 unsigned char *base;
1340 if (len < MIN_BUFF_SIZE)
1341 len = MIN_BUFF_SIZE;
1342 len = CPP_ALIGN (len);
1344 base = xmalloc (len + sizeof (_cpp_buff));
1345 result = (_cpp_buff *) (base + len);
1346 result->base = base;
1347 result->cur = base;
1348 result->limit = base + len;
1349 result->next = NULL;
1350 return result;
1353 /* Place a chain of unwanted allocation buffers on the free list. */
1354 void
1355 _cpp_release_buff (cpp_reader *pfile, _cpp_buff *buff)
1357 _cpp_buff *end = buff;
1359 while (end->next)
1360 end = end->next;
1361 end->next = pfile->free_buffs;
1362 pfile->free_buffs = buff;
1365 /* Return a free buffer of size at least MIN_SIZE. */
1366 _cpp_buff *
1367 _cpp_get_buff (cpp_reader *pfile, size_t min_size)
1369 _cpp_buff *result, **p;
1371 for (p = &pfile->free_buffs;; p = &(*p)->next)
1373 size_t size;
1375 if (*p == NULL)
1376 return new_buff (min_size);
1377 result = *p;
1378 size = result->limit - result->base;
1379 /* Return a buffer that's big enough, but don't waste one that's
1380 way too big. */
1381 if (size >= min_size && size <= BUFF_SIZE_UPPER_BOUND (min_size))
1382 break;
1385 *p = result->next;
1386 result->next = NULL;
1387 result->cur = result->base;
1388 return result;
1391 /* Creates a new buffer with enough space to hold the uncommitted
1392 remaining bytes of BUFF, and at least MIN_EXTRA more bytes. Copies
1393 the excess bytes to the new buffer. Chains the new buffer after
1394 BUFF, and returns the new buffer. */
1395 _cpp_buff *
1396 _cpp_append_extend_buff (cpp_reader *pfile, _cpp_buff *buff, size_t min_extra)
1398 size_t size = EXTENDED_BUFF_SIZE (buff, min_extra);
1399 _cpp_buff *new_buff = _cpp_get_buff (pfile, size);
1401 buff->next = new_buff;
1402 memcpy (new_buff->base, buff->cur, BUFF_ROOM (buff));
1403 return new_buff;
1406 /* Creates a new buffer with enough space to hold the uncommitted
1407 remaining bytes of the buffer pointed to by BUFF, and at least
1408 MIN_EXTRA more bytes. Copies the excess bytes to the new buffer.
1409 Chains the new buffer before the buffer pointed to by BUFF, and
1410 updates the pointer to point to the new buffer. */
1411 void
1412 _cpp_extend_buff (cpp_reader *pfile, _cpp_buff **pbuff, size_t min_extra)
1414 _cpp_buff *new_buff, *old_buff = *pbuff;
1415 size_t size = EXTENDED_BUFF_SIZE (old_buff, min_extra);
1417 new_buff = _cpp_get_buff (pfile, size);
1418 memcpy (new_buff->base, old_buff->cur, BUFF_ROOM (old_buff));
1419 new_buff->next = old_buff;
1420 *pbuff = new_buff;
1423 /* Free a chain of buffers starting at BUFF. */
1424 void
1425 _cpp_free_buff (buff)
1426 _cpp_buff *buff;
1428 _cpp_buff *next;
1430 for (; buff; buff = next)
1432 next = buff->next;
1433 free (buff->base);
1437 /* Allocate permanent, unaligned storage of length LEN. */
1438 unsigned char *
1439 _cpp_unaligned_alloc (cpp_reader *pfile, size_t len)
1441 _cpp_buff *buff = pfile->u_buff;
1442 unsigned char *result = buff->cur;
1444 if (len > (size_t) (buff->limit - result))
1446 buff = _cpp_get_buff (pfile, len);
1447 buff->next = pfile->u_buff;
1448 pfile->u_buff = buff;
1449 result = buff->cur;
1452 buff->cur = result + len;
1453 return result;
1456 /* Allocate permanent, unaligned storage of length LEN from a_buff.
1457 That buffer is used for growing allocations when saving macro
1458 replacement lists in a #define, and when parsing an answer to an
1459 assertion in #assert, #unassert or #if (and therefore possibly
1460 whilst expanding macros). It therefore must not be used by any
1461 code that they might call: specifically the lexer and the guts of
1462 the macro expander.
1464 All existing other uses clearly fit this restriction: storing
1465 registered pragmas during initialization. */
1466 unsigned char *
1467 _cpp_aligned_alloc (cpp_reader *pfile, size_t len)
1469 _cpp_buff *buff = pfile->a_buff;
1470 unsigned char *result = buff->cur;
1472 if (len > (size_t) (buff->limit - result))
1474 buff = _cpp_get_buff (pfile, len);
1475 buff->next = pfile->a_buff;
1476 pfile->a_buff = buff;
1477 result = buff->cur;
1480 buff->cur = result + len;
1481 return result;