* c-parse.in (array_declarator): New. Handle C99 constructs.
[official-gcc.git] / gcc / cpplex.c
blob3185accece0ce1f7df9b9564d511be2990ef71f4
1 /* CPP Library - lexical analysis.
2 Copyright (C) 2000 Free Software Foundation, Inc.
3 Contributed by Per Bothner, 1994-95.
4 Based on CCCP program by Paul Rubin, June 1986
5 Adapted to ANSI C, Richard Stallman, Jan 1987
6 Broken out to separate file, Zack Weinberg, Mar 2000
7 Single-pass line tokenization by Neil Booth, April 2000
9 This program is free software; you can redistribute it and/or modify it
10 under the terms of the GNU General Public License as published by the
11 Free Software Foundation; either version 2, or (at your option) any
12 later version.
14 This program is distributed in the hope that it will be useful,
15 but WITHOUT ANY WARRANTY; without even the implied warranty of
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 GNU General Public License for more details.
19 You should have received a copy of the GNU General Public License
20 along with this program; if not, write to the Free Software
21 Foundation, 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
23 /* This lexer works with a single pass of the file. Recently I
24 re-wrote it to minimize the places where we step backwards in the
25 input stream, to make future changes to support multi-byte
26 character sets fairly straight-forward.
28 There is now only one routine where we do step backwards:
29 skip_escaped_newlines. This routine could probably also be changed
30 so that it doesn't need to step back. One possibility is to use a
31 trick similar to that used in lex_period and lex_percent. Two
32 extra characters might be needed, but skip_escaped_newlines itself
33 would probably be the only place that needs to be aware of that,
34 and changes to the remaining routines would probably only be needed
35 if they process a backslash. */
37 #include "config.h"
38 #include "system.h"
39 #include "cpplib.h"
40 #include "cpphash.h"
42 /* MULTIBYTE_CHARS support only works for native compilers.
43 ??? Ideally what we want is to model widechar support after
44 the current floating point support. */
45 #ifdef CROSS_COMPILE
46 #undef MULTIBYTE_CHARS
47 #endif
49 #ifdef MULTIBYTE_CHARS
50 #include "mbchar.h"
51 #include <locale.h>
52 #endif
/* How a token's spelling is obtained.  Tokens with SPELL_STRING store
   their spelling in the token list, and its length alongside it (the
   lexer routines in this file record it in token->val.str.len).  */
enum spell_type
{
  SPELL_OPERATOR = 0,
  SPELL_CHAR,
  SPELL_IDENT,
  SPELL_STRING,
  SPELL_NONE
};
65 struct token_spelling
67 enum spell_type category;
68 const unsigned char *name;
71 const unsigned char *digraph_spellings [] = {U"%:", U"%:%:", U"<:",
72 U":>", U"<%", U"%>"};
74 #define OP(e, s) { SPELL_OPERATOR, U s },
75 #define TK(e, s) { s, U STRINGX (e) },
76 const struct token_spelling token_spellings [N_TTYPES] = {TTYPE_TABLE };
77 #undef OP
78 #undef TK
80 #define TOKEN_SPELL(token) (token_spellings[(token)->type].category)
81 #define TOKEN_NAME(token) (token_spellings[(token)->type].name)
83 static cppchar_t handle_newline PARAMS ((cpp_buffer *, cppchar_t));
84 static cppchar_t skip_escaped_newlines PARAMS ((cpp_buffer *, cppchar_t));
85 static cppchar_t get_effective_char PARAMS ((cpp_buffer *));
87 static int skip_block_comment PARAMS ((cpp_reader *));
88 static int skip_line_comment PARAMS ((cpp_reader *));
89 static void adjust_column PARAMS ((cpp_reader *));
90 static void skip_whitespace PARAMS ((cpp_reader *, cppchar_t));
91 static cpp_hashnode *parse_identifier PARAMS ((cpp_reader *, cppchar_t));
92 static void parse_number PARAMS ((cpp_reader *, cpp_string *, cppchar_t, int));
93 static int unescaped_terminator_p PARAMS ((cpp_reader *, const U_CHAR *));
94 static void parse_string PARAMS ((cpp_reader *, cpp_token *, cppchar_t));
95 static void unterminated PARAMS ((cpp_reader *, int));
96 static int trigraph_ok PARAMS ((cpp_reader *, cppchar_t));
97 static void save_comment PARAMS ((cpp_reader *, cpp_token *, const U_CHAR *));
98 static void lex_percent PARAMS ((cpp_buffer *, cpp_token *));
99 static void lex_dot PARAMS ((cpp_reader *, cpp_token *));
100 static int name_p PARAMS ((cpp_reader *, const cpp_string *));
101 static int maybe_read_ucs PARAMS ((cpp_reader *, const unsigned char **,
102 const unsigned char *, unsigned int *));
104 static cpp_chunk *new_chunk PARAMS ((unsigned int));
105 static int chunk_suitable PARAMS ((cpp_pool *, cpp_chunk *, unsigned int));
106 static unsigned int hex_digit_value PARAMS ((unsigned int));
108 /* Utility routine:
110 Compares, the token TOKEN to the NUL-terminated string STRING.
111 TOKEN must be a CPP_NAME. Returns 1 for equal, 0 for unequal. */
114 cpp_ideq (token, string)
115 const cpp_token *token;
116 const char *string;
118 if (token->type != CPP_NAME)
119 return 0;
121 return !ustrcmp (NODE_NAME (token->val.node), (const U_CHAR *) string);
124 /* Call when meeting a newline. Returns the character after the newline
125 (or carriage-return newline combination), or EOF. */
126 static cppchar_t
127 handle_newline (buffer, newline_char)
128 cpp_buffer *buffer;
129 cppchar_t newline_char;
131 cppchar_t next = EOF;
133 buffer->col_adjust = 0;
134 buffer->lineno++;
135 buffer->line_base = buffer->cur;
137 /* Handle CR-LF and LF-CR combinations, get the next character. */
138 if (buffer->cur < buffer->rlimit)
140 next = *buffer->cur++;
141 if (next + newline_char == '\r' + '\n')
143 buffer->line_base = buffer->cur;
144 if (buffer->cur < buffer->rlimit)
145 next = *buffer->cur++;
146 else
147 next = EOF;
151 buffer->read_ahead = next;
152 return next;
155 /* Subroutine of skip_escaped_newlines; called when a trigraph is
156 encountered. It warns if necessary, and returns true if the
157 trigraph should be honoured. FROM_CHAR is the third character of a
158 trigraph, and presumed to be the previous character for position
159 reporting. */
160 static int
161 trigraph_ok (pfile, from_char)
162 cpp_reader *pfile;
163 cppchar_t from_char;
165 int accept = CPP_OPTION (pfile, trigraphs);
167 /* Don't warn about trigraphs in comments. */
168 if (CPP_OPTION (pfile, warn_trigraphs) && !pfile->state.lexing_comment)
170 cpp_buffer *buffer = pfile->buffer;
171 if (accept)
172 cpp_warning_with_line (pfile, buffer->lineno, CPP_BUF_COL (buffer) - 2,
173 "trigraph ??%c converted to %c",
174 (int) from_char,
175 (int) _cpp_trigraph_map[from_char]);
176 else if (buffer->cur != buffer->last_Wtrigraphs)
178 buffer->last_Wtrigraphs = buffer->cur;
179 cpp_warning_with_line (pfile, buffer->lineno,
180 CPP_BUF_COL (buffer) - 2,
181 "trigraph ??%c ignored", (int) from_char);
185 return accept;
/* Set the type of the token being built to T and consume the
   read-ahead character.  Assumes local variables buffer and result.  */
#define ACCEPT_CHAR(t) \
  do { result->type = (t); buffer->read_ahead = EOF; } while (0)

/* Save and restore the read position of the buffer.  Assume local
   variables buffer and saved_cur.

   When we move to multibyte character sets, add to these something
   that saves and restores the state of the multibyte conversion
   library.  This probably involves saving and restoring a "cookie".
   In the case of glibc it is an 8-byte structure, so is not a high
   overhead operation.  In any case, it's out of the fast path.  */
#define SAVE_STATE() do { saved_cur = buffer->cur; } while (0)
#define RESTORE_STATE() do { buffer->cur = saved_cur; } while (0)
200 /* Skips any escaped newlines introduced by NEXT, which is either a
201 '?' or a '\\'. Returns the next character, which will also have
202 been placed in buffer->read_ahead. This routine performs
203 preprocessing stages 1 and 2 of the ISO C standard. */
204 static cppchar_t
205 skip_escaped_newlines (buffer, next)
206 cpp_buffer *buffer;
207 cppchar_t next;
209 /* Only do this if we apply stages 1 and 2. */
210 if (!buffer->from_stage3)
212 cppchar_t next1;
213 const unsigned char *saved_cur;
214 int space;
218 if (buffer->cur == buffer->rlimit)
219 break;
221 SAVE_STATE ();
222 if (next == '?')
224 next1 = *buffer->cur++;
225 if (next1 != '?' || buffer->cur == buffer->rlimit)
227 RESTORE_STATE ();
228 break;
231 next1 = *buffer->cur++;
232 if (!_cpp_trigraph_map[next1]
233 || !trigraph_ok (buffer->pfile, next1))
235 RESTORE_STATE ();
236 break;
239 /* We have a full trigraph here. */
240 next = _cpp_trigraph_map[next1];
241 if (next != '\\' || buffer->cur == buffer->rlimit)
242 break;
243 SAVE_STATE ();
246 /* We have a backslash, and room for at least one more character. */
247 space = 0;
250 next1 = *buffer->cur++;
251 if (!is_nvspace (next1))
252 break;
253 space = 1;
255 while (buffer->cur < buffer->rlimit);
257 if (!is_vspace (next1))
259 RESTORE_STATE ();
260 break;
263 if (space && !buffer->pfile->state.lexing_comment)
264 cpp_warning (buffer->pfile,
265 "backslash and newline separated by space");
267 next = handle_newline (buffer, next1);
268 if (next == EOF)
269 cpp_pedwarn (buffer->pfile, "backslash-newline at end of file");
271 while (next == '\\' || next == '?');
274 buffer->read_ahead = next;
275 return next;
278 /* Obtain the next character, after trigraph conversion and skipping
279 an arbitrary string of escaped newlines. The common case of no
280 trigraphs or escaped newlines falls through quickly. */
281 static cppchar_t
282 get_effective_char (buffer)
283 cpp_buffer *buffer;
285 cppchar_t next = EOF;
287 if (buffer->cur < buffer->rlimit)
289 next = *buffer->cur++;
291 /* '?' can introduce trigraphs (and therefore backslash); '\\'
292 can introduce escaped newlines, which we want to skip, or
293 UCNs, which, depending upon lexer state, we will handle in
294 the future. */
295 if (next == '?' || next == '\\')
296 next = skip_escaped_newlines (buffer, next);
299 buffer->read_ahead = next;
300 return next;
303 /* Skip a C-style block comment. We find the end of the comment by
304 seeing if an asterisk is before every '/' we encounter. Returns
305 non-zero if comment terminated by EOF, zero otherwise. */
306 static int
307 skip_block_comment (pfile)
308 cpp_reader *pfile;
310 cpp_buffer *buffer = pfile->buffer;
311 cppchar_t c = EOF, prevc = EOF;
313 pfile->state.lexing_comment = 1;
314 while (buffer->cur != buffer->rlimit)
316 prevc = c, c = *buffer->cur++;
318 next_char:
319 /* FIXME: For speed, create a new character class of characters
320 of interest inside block comments. */
321 if (c == '?' || c == '\\')
322 c = skip_escaped_newlines (buffer, c);
324 /* People like decorating comments with '*', so check for '/'
325 instead for efficiency. */
326 if (c == '/')
328 if (prevc == '*')
329 break;
331 /* Warn about potential nested comments, but not if the '/'
332 comes immediately before the true comment delimeter.
333 Don't bother to get it right across escaped newlines. */
334 if (CPP_OPTION (pfile, warn_comments)
335 && buffer->cur != buffer->rlimit)
337 prevc = c, c = *buffer->cur++;
338 if (c == '*' && buffer->cur != buffer->rlimit)
340 prevc = c, c = *buffer->cur++;
341 if (c != '/')
342 cpp_warning_with_line (pfile, CPP_BUF_LINE (buffer),
343 CPP_BUF_COL (buffer),
344 "\"/*\" within comment");
346 goto next_char;
349 else if (is_vspace (c))
351 prevc = c, c = handle_newline (buffer, c);
352 goto next_char;
354 else if (c == '\t')
355 adjust_column (pfile);
358 pfile->state.lexing_comment = 0;
359 buffer->read_ahead = EOF;
360 return c != '/' || prevc != '*';
363 /* Skip a C++ line comment. Handles escaped newlines. Returns
364 non-zero if a multiline comment. The following new line, if any,
365 is left in buffer->read_ahead. */
366 static int
367 skip_line_comment (pfile)
368 cpp_reader *pfile;
370 cpp_buffer *buffer = pfile->buffer;
371 unsigned int orig_lineno = buffer->lineno;
372 cppchar_t c;
374 pfile->state.lexing_comment = 1;
377 c = EOF;
378 if (buffer->cur == buffer->rlimit)
379 break;
381 c = *buffer->cur++;
382 if (c == '?' || c == '\\')
383 c = skip_escaped_newlines (buffer, c);
385 while (!is_vspace (c));
387 pfile->state.lexing_comment = 0;
388 buffer->read_ahead = c; /* Leave any newline for caller. */
389 return orig_lineno != buffer->lineno;
392 /* pfile->buffer->cur is one beyond the \t character. Update
393 col_adjust so we track the column correctly. */
394 static void
395 adjust_column (pfile)
396 cpp_reader *pfile;
398 cpp_buffer *buffer = pfile->buffer;
399 unsigned int col = CPP_BUF_COL (buffer) - 1; /* Zero-based column. */
401 /* Round it up to multiple of the tabstop, but subtract 1 since the
402 tab itself occupies a character position. */
403 buffer->col_adjust += (CPP_OPTION (pfile, tabstop)
404 - col % CPP_OPTION (pfile, tabstop)) - 1;
407 /* Skips whitespace, saving the next non-whitespace character.
408 Adjusts pfile->col_adjust to account for tabs. Without this,
409 tokens might be assigned an incorrect column. */
410 static void
411 skip_whitespace (pfile, c)
412 cpp_reader *pfile;
413 cppchar_t c;
415 cpp_buffer *buffer = pfile->buffer;
416 unsigned int warned = 0;
420 /* Horizontal space always OK. */
421 if (c == ' ')
423 else if (c == '\t')
424 adjust_column (pfile);
425 /* Just \f \v or \0 left. */
426 else if (c == '\0')
428 if (!warned)
430 cpp_warning (pfile, "null character(s) ignored");
431 warned = 1;
434 else if (pfile->state.in_directive && CPP_PEDANTIC (pfile))
435 cpp_pedwarn_with_line (pfile, CPP_BUF_LINE (buffer),
436 CPP_BUF_COL (buffer),
437 "%s in preprocessing directive",
438 c == '\f' ? "form feed" : "vertical tab");
440 c = EOF;
441 if (buffer->cur == buffer->rlimit)
442 break;
443 c = *buffer->cur++;
445 /* We only want non-vertical space, i.e. ' ' \t \f \v \0. */
446 while (is_nvspace (c));
448 /* Remember the next character. */
449 buffer->read_ahead = c;
452 /* See if the characters of a number token are valid in a name (no
453 '.', '+' or '-'). */
454 static int
455 name_p (pfile, string)
456 cpp_reader *pfile;
457 const cpp_string *string;
459 unsigned int i;
461 for (i = 0; i < string->len; i++)
462 if (!is_idchar (string->text[i]))
463 return 0;
465 return 1;
468 /* Parse an identifier, skipping embedded backslash-newlines.
469 Calculate the hash value of the token while parsing, for improved
470 performance. The hashing algorithm *must* match cpp_lookup(). */
472 static cpp_hashnode *
473 parse_identifier (pfile, c)
474 cpp_reader *pfile;
475 cppchar_t c;
477 cpp_hashnode *result;
478 cpp_buffer *buffer = pfile->buffer;
479 unsigned int saw_dollar = 0, len;
480 struct obstack *stack = &pfile->hash_table->stack;
486 obstack_1grow (stack, c);
488 if (c == '$')
489 saw_dollar++;
491 c = EOF;
492 if (buffer->cur == buffer->rlimit)
493 break;
495 c = *buffer->cur++;
497 while (is_idchar (c));
499 /* Potential escaped newline? */
500 if (c != '?' && c != '\\')
501 break;
502 c = skip_escaped_newlines (buffer, c);
504 while (is_idchar (c));
506 /* Remember the next character. */
507 buffer->read_ahead = c;
509 /* $ is not a identifier character in the standard, but is commonly
510 accepted as an extension. Don't warn about it in skipped
511 conditional blocks. */
512 if (saw_dollar && CPP_PEDANTIC (pfile) && ! pfile->skipping)
513 cpp_pedwarn (pfile, "'$' character(s) in identifier");
515 /* Identifiers are null-terminated. */
516 len = obstack_object_size (stack);
517 obstack_1grow (stack, '\0');
519 /* This routine commits the memory if necessary. */
520 result = (cpp_hashnode *)
521 ht_lookup (pfile->hash_table, obstack_finish (stack), len, HT_ALLOCED);
523 /* Some identifiers require diagnostics when lexed. */
524 if (result->flags & NODE_DIAGNOSTIC && !pfile->skipping)
526 /* It is allowed to poison the same identifier twice. */
527 if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
528 cpp_error (pfile, "attempt to use poisoned \"%s\"",
529 NODE_NAME (result));
531 /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
532 replacement list of a variadic macro. */
533 if (result == pfile->spec_nodes.n__VA_ARGS__
534 && !pfile->state.va_args_ok)
535 cpp_pedwarn (pfile, "__VA_ARGS__ can only appear in the expansion of a C99 variadic macro");
538 return result;
541 /* Parse a number, skipping embedded backslash-newlines. */
542 static void
543 parse_number (pfile, number, c, leading_period)
544 cpp_reader *pfile;
545 cpp_string *number;
546 cppchar_t c;
547 int leading_period;
549 cpp_buffer *buffer = pfile->buffer;
550 cpp_pool *pool = &pfile->ident_pool;
551 unsigned char *dest, *limit;
553 dest = POOL_FRONT (pool);
554 limit = POOL_LIMIT (pool);
556 /* Place a leading period. */
557 if (leading_period)
559 if (dest >= limit)
560 limit = _cpp_next_chunk (pool, 0, &dest);
561 *dest++ = '.';
568 /* Need room for terminating null. */
569 if (dest + 1 >= limit)
570 limit = _cpp_next_chunk (pool, 0, &dest);
571 *dest++ = c;
573 c = EOF;
574 if (buffer->cur == buffer->rlimit)
575 break;
577 c = *buffer->cur++;
579 while (is_numchar (c) || c == '.' || VALID_SIGN (c, dest[-1]));
581 /* Potential escaped newline? */
582 if (c != '?' && c != '\\')
583 break;
584 c = skip_escaped_newlines (buffer, c);
586 while (is_numchar (c) || c == '.' || VALID_SIGN (c, dest[-1]));
588 /* Remember the next character. */
589 buffer->read_ahead = c;
591 /* Null-terminate the number. */
592 *dest = '\0';
594 number->text = POOL_FRONT (pool);
595 number->len = dest - number->text;
596 POOL_COMMIT (pool, number->len + 1);
599 /* Subroutine of parse_string. Emits error for unterminated strings. */
600 static void
601 unterminated (pfile, term)
602 cpp_reader *pfile;
603 int term;
605 cpp_error (pfile, "missing terminating %c character", term);
607 if (term == '\"' && pfile->mlstring_pos.line
608 && pfile->mlstring_pos.line != pfile->lexer_pos.line)
610 cpp_error_with_line (pfile, pfile->mlstring_pos.line,
611 pfile->mlstring_pos.col,
612 "possible start of unterminated string literal");
613 pfile->mlstring_pos.line = 0;
617 /* Subroutine of parse_string. */
618 static int
619 unescaped_terminator_p (pfile, dest)
620 cpp_reader *pfile;
621 const unsigned char *dest;
623 const unsigned char *start, *temp;
625 /* In #include-style directives, terminators are not escapeable. */
626 if (pfile->state.angled_headers)
627 return 1;
629 start = POOL_FRONT (&pfile->ident_pool);
631 /* An odd number of consecutive backslashes represents an escaped
632 terminator. */
633 for (temp = dest; temp > start && temp[-1] == '\\'; temp--)
636 return ((dest - temp) & 1) == 0;
639 /* Parses a string, character constant, or angle-bracketed header file
640 name. Handles embedded trigraphs and escaped newlines. The stored
641 string is guaranteed NUL-terminated, but it is not guaranteed that
642 this is the first NUL since embedded NULs are preserved.
644 Multi-line strings are allowed, but they are deprecated. */
645 static void
646 parse_string (pfile, token, terminator)
647 cpp_reader *pfile;
648 cpp_token *token;
649 cppchar_t terminator;
651 cpp_buffer *buffer = pfile->buffer;
652 cpp_pool *pool = &pfile->ident_pool;
653 unsigned char *dest, *limit;
654 cppchar_t c;
655 unsigned int nulls = 0;
657 dest = POOL_FRONT (pool);
658 limit = POOL_LIMIT (pool);
660 for (;;)
662 if (buffer->cur == buffer->rlimit)
663 c = EOF;
664 else
665 c = *buffer->cur++;
667 have_char:
668 /* We need space for the terminating NUL. */
669 if (dest >= limit)
670 limit = _cpp_next_chunk (pool, 0, &dest);
672 if (c == EOF)
674 unterminated (pfile, terminator);
675 break;
678 /* Handle trigraphs, escaped newlines etc. */
679 if (c == '?' || c == '\\')
680 c = skip_escaped_newlines (buffer, c);
682 if (c == terminator && unescaped_terminator_p (pfile, dest))
684 c = EOF;
685 break;
687 else if (is_vspace (c))
689 /* In assembly language, silently terminate string and
690 character literals at end of line. This is a kludge
691 around not knowing where comments are. */
692 if (CPP_OPTION (pfile, lang) == CLK_ASM && terminator != '>')
693 break;
695 /* Character constants and header names may not extend over
696 multiple lines. In Standard C, neither may strings.
697 Unfortunately, we accept multiline strings as an
698 extension, except in #include family directives. */
699 if (terminator != '"' || pfile->state.angled_headers)
701 unterminated (pfile, terminator);
702 break;
705 cpp_pedwarn (pfile, "multi-line string literals are deprecated");
706 if (pfile->mlstring_pos.line == 0)
707 pfile->mlstring_pos = pfile->lexer_pos;
709 c = handle_newline (buffer, c);
710 *dest++ = '\n';
711 goto have_char;
713 else if (c == '\0')
715 if (nulls++ == 0)
716 cpp_warning (pfile, "null character(s) preserved in literal");
719 *dest++ = c;
722 /* Remember the next character. */
723 buffer->read_ahead = c;
724 *dest = '\0';
726 token->val.str.text = POOL_FRONT (pool);
727 token->val.str.len = dest - token->val.str.text;
728 POOL_COMMIT (pool, token->val.str.len + 1);
731 /* The stored comment includes the comment start and any terminator. */
732 static void
733 save_comment (pfile, token, from)
734 cpp_reader *pfile;
735 cpp_token *token;
736 const unsigned char *from;
738 unsigned char *buffer;
739 unsigned int len;
741 len = pfile->buffer->cur - from + 1; /* + 1 for the initial '/'. */
742 /* C++ comments probably (not definitely) have moved past a new
743 line, which we don't want to save in the comment. */
744 if (pfile->buffer->read_ahead != EOF)
745 len--;
746 buffer = _cpp_pool_alloc (&pfile->ident_pool, len);
748 token->type = CPP_COMMENT;
749 token->val.str.len = len;
750 token->val.str.text = buffer;
752 buffer[0] = '/';
753 memcpy (buffer + 1, from, len - 1);
756 /* Subroutine of lex_token to handle '%'. A little tricky, since we
757 want to avoid stepping back when lexing %:%X. */
758 static void
759 lex_percent (buffer, result)
760 cpp_buffer *buffer;
761 cpp_token *result;
763 cppchar_t c;
765 result->type = CPP_MOD;
766 /* Parsing %:%X could leave an extra character. */
767 if (buffer->extra_char == EOF)
768 c = get_effective_char (buffer);
769 else
771 c = buffer->read_ahead = buffer->extra_char;
772 buffer->extra_char = EOF;
775 if (c == '=')
776 ACCEPT_CHAR (CPP_MOD_EQ);
777 else if (CPP_OPTION (buffer->pfile, digraphs))
779 if (c == ':')
781 result->flags |= DIGRAPH;
782 ACCEPT_CHAR (CPP_HASH);
783 if (get_effective_char (buffer) == '%')
785 buffer->extra_char = get_effective_char (buffer);
786 if (buffer->extra_char == ':')
788 buffer->extra_char = EOF;
789 ACCEPT_CHAR (CPP_PASTE);
791 else
792 /* We'll catch the extra_char when we're called back. */
793 buffer->read_ahead = '%';
796 else if (c == '>')
798 result->flags |= DIGRAPH;
799 ACCEPT_CHAR (CPP_CLOSE_BRACE);
804 /* Subroutine of lex_token to handle '.'. This is tricky, since we
805 want to avoid stepping back when lexing '...' or '.123'. In the
806 latter case we should also set a flag for parse_number. */
807 static void
808 lex_dot (pfile, result)
809 cpp_reader *pfile;
810 cpp_token *result;
812 cpp_buffer *buffer = pfile->buffer;
813 cppchar_t c;
815 /* Parsing ..X could leave an extra character. */
816 if (buffer->extra_char == EOF)
817 c = get_effective_char (buffer);
818 else
820 c = buffer->read_ahead = buffer->extra_char;
821 buffer->extra_char = EOF;
824 /* All known character sets have 0...9 contiguous. */
825 if (c >= '0' && c <= '9')
827 result->type = CPP_NUMBER;
828 parse_number (pfile, &result->val.str, c, 1);
830 else
832 result->type = CPP_DOT;
833 if (c == '.')
835 buffer->extra_char = get_effective_char (buffer);
836 if (buffer->extra_char == '.')
838 buffer->extra_char = EOF;
839 ACCEPT_CHAR (CPP_ELLIPSIS);
841 else
842 /* We'll catch the extra_char when we're called back. */
843 buffer->read_ahead = '.';
845 else if (c == '*' && CPP_OPTION (pfile, cplusplus))
846 ACCEPT_CHAR (CPP_DOT_STAR);
850 void
851 _cpp_lex_token (pfile, result)
852 cpp_reader *pfile;
853 cpp_token *result;
855 cppchar_t c;
856 cpp_buffer *buffer;
857 const unsigned char *comment_start;
858 unsigned char bol;
860 skip:
861 bol = pfile->state.next_bol;
862 done_directive:
863 buffer = pfile->buffer;
864 pfile->state.next_bol = 0;
865 result->flags = buffer->saved_flags;
866 buffer->saved_flags = 0;
867 next_char:
868 pfile->lexer_pos.line = buffer->lineno;
869 next_char2:
870 pfile->lexer_pos.col = CPP_BUF_COLUMN (buffer, buffer->cur);
872 c = buffer->read_ahead;
873 if (c == EOF && buffer->cur < buffer->rlimit)
875 c = *buffer->cur++;
876 pfile->lexer_pos.col++;
879 do_switch:
880 buffer->read_ahead = EOF;
881 switch (c)
883 case EOF:
884 /* Non-empty files should end in a newline. Checking "bol" too
885 prevents multiple warnings when hitting the EOF more than
886 once, like in a directive. Don't warn for command line and
887 _Pragma buffers. */
888 if (pfile->lexer_pos.col != 0 && !bol && !buffer->from_stage3)
889 cpp_pedwarn (pfile, "no newline at end of file");
890 pfile->state.next_bol = 1;
891 pfile->skipping = 0; /* In case missing #endif. */
892 result->type = CPP_EOF;
893 /* Don't do MI optimisation. */
894 return;
896 case ' ': case '\t': case '\f': case '\v': case '\0':
897 skip_whitespace (pfile, c);
898 result->flags |= PREV_WHITE;
899 goto next_char2;
901 case '\n': case '\r':
902 if (!pfile->state.in_directive)
904 handle_newline (buffer, c);
905 bol = 1;
906 pfile->lexer_pos.output_line = buffer->lineno;
907 /* This is a new line, so clear any white space flag.
908 Newlines in arguments are white space (6.10.3.10);
909 parse_arg takes care of that. */
910 result->flags &= ~(PREV_WHITE | AVOID_LPASTE);
911 goto next_char;
914 /* Don't let directives spill over to the next line. */
915 buffer->read_ahead = c;
916 pfile->state.next_bol = 1;
917 result->type = CPP_EOF;
918 /* Don't break; pfile->skipping might be true. */
919 return;
921 case '?':
922 case '\\':
923 /* These could start an escaped newline, or '?' a trigraph. Let
924 skip_escaped_newlines do all the work. */
926 unsigned int lineno = buffer->lineno;
928 c = skip_escaped_newlines (buffer, c);
929 if (lineno != buffer->lineno)
930 /* We had at least one escaped newline of some sort, and the
931 next character is in buffer->read_ahead. Update the
932 token's line and column. */
933 goto next_char;
935 /* We are either the original '?' or '\\', or a trigraph. */
936 result->type = CPP_QUERY;
937 buffer->read_ahead = EOF;
938 if (c == '\\')
939 goto random_char;
940 else if (c != '?')
941 goto do_switch;
943 break;
945 case '0': case '1': case '2': case '3': case '4':
946 case '5': case '6': case '7': case '8': case '9':
947 result->type = CPP_NUMBER;
948 parse_number (pfile, &result->val.str, c, 0);
949 break;
951 case '$':
952 if (!CPP_OPTION (pfile, dollars_in_ident))
953 goto random_char;
954 /* Fall through... */
956 case '_':
957 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
958 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
959 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
960 case 's': case 't': case 'u': case 'v': case 'w': case 'x':
961 case 'y': case 'z':
962 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
963 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
964 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
965 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
966 case 'Y': case 'Z':
967 result->type = CPP_NAME;
968 result->val.node = parse_identifier (pfile, c);
970 /* 'L' may introduce wide characters or strings. */
971 if (result->val.node == pfile->spec_nodes.n_L)
973 c = buffer->read_ahead; /* For make_string. */
974 if (c == '\'' || c == '"')
976 ACCEPT_CHAR (c == '"' ? CPP_WSTRING: CPP_WCHAR);
977 goto make_string;
980 /* Convert named operators to their proper types. */
981 else if (result->val.node->flags & NODE_OPERATOR)
983 result->flags |= NAMED_OP;
984 result->type = result->val.node->value.operator;
986 break;
988 case '\'':
989 case '"':
990 result->type = c == '"' ? CPP_STRING: CPP_CHAR;
991 make_string:
992 parse_string (pfile, result, c);
993 break;
995 case '/':
996 /* A potential block or line comment. */
997 comment_start = buffer->cur;
998 result->type = CPP_DIV;
999 c = get_effective_char (buffer);
1000 if (c == '=')
1001 ACCEPT_CHAR (CPP_DIV_EQ);
1002 if (c != '/' && c != '*')
1003 break;
1004 if (buffer->from_stage3)
1005 break;
1007 if (c == '*')
1009 if (skip_block_comment (pfile))
1010 cpp_error_with_line (pfile, pfile->lexer_pos.line,
1011 pfile->lexer_pos.col,
1012 "unterminated comment");
1014 else
1016 if (!CPP_OPTION (pfile, cplusplus_comments)
1017 && !CPP_IN_SYSTEM_HEADER (pfile))
1018 break;
1020 /* Warn about comments only if pedantically GNUC89, and not
1021 in system headers. */
1022 if (CPP_OPTION (pfile, lang) == CLK_GNUC89 && CPP_PEDANTIC (pfile)
1023 && ! buffer->warned_cplusplus_comments)
1025 cpp_pedwarn (pfile,
1026 "C++ style comments are not allowed in ISO C89");
1027 cpp_pedwarn (pfile,
1028 "(this will be reported only once per input file)");
1029 buffer->warned_cplusplus_comments = 1;
1032 /* Skip_line_comment updates buffer->read_ahead. */
1033 if (skip_line_comment (pfile) && CPP_OPTION (pfile, warn_comments))
1034 cpp_warning_with_line (pfile, pfile->lexer_pos.line,
1035 pfile->lexer_pos.col,
1036 "multi-line comment");
1039 /* Skipping the comment has updated buffer->read_ahead. */
1040 if (!pfile->state.save_comments)
1042 result->flags |= PREV_WHITE;
1043 goto next_char;
1046 /* Save the comment as a token in its own right. */
1047 save_comment (pfile, result, comment_start);
1048 /* Don't do MI optimisation. */
1049 return;
1051 case '<':
1052 if (pfile->state.angled_headers)
1054 result->type = CPP_HEADER_NAME;
1055 c = '>'; /* terminator. */
1056 goto make_string;
1059 result->type = CPP_LESS;
1060 c = get_effective_char (buffer);
1061 if (c == '=')
1062 ACCEPT_CHAR (CPP_LESS_EQ);
1063 else if (c == '<')
1065 ACCEPT_CHAR (CPP_LSHIFT);
1066 if (get_effective_char (buffer) == '=')
1067 ACCEPT_CHAR (CPP_LSHIFT_EQ);
1069 else if (c == '?' && CPP_OPTION (pfile, cplusplus))
1071 ACCEPT_CHAR (CPP_MIN);
1072 if (get_effective_char (buffer) == '=')
1073 ACCEPT_CHAR (CPP_MIN_EQ);
1075 else if (c == ':' && CPP_OPTION (pfile, digraphs))
1077 ACCEPT_CHAR (CPP_OPEN_SQUARE);
1078 result->flags |= DIGRAPH;
1080 else if (c == '%' && CPP_OPTION (pfile, digraphs))
1082 ACCEPT_CHAR (CPP_OPEN_BRACE);
1083 result->flags |= DIGRAPH;
1085 break;
1087 case '>':
1088 result->type = CPP_GREATER;
1089 c = get_effective_char (buffer);
1090 if (c == '=')
1091 ACCEPT_CHAR (CPP_GREATER_EQ);
1092 else if (c == '>')
1094 ACCEPT_CHAR (CPP_RSHIFT);
1095 if (get_effective_char (buffer) == '=')
1096 ACCEPT_CHAR (CPP_RSHIFT_EQ);
1098 else if (c == '?' && CPP_OPTION (pfile, cplusplus))
1100 ACCEPT_CHAR (CPP_MAX);
1101 if (get_effective_char (buffer) == '=')
1102 ACCEPT_CHAR (CPP_MAX_EQ);
1104 break;
1106 case '%':
1107 lex_percent (buffer, result);
1108 if (result->type == CPP_HASH)
1109 goto do_hash;
1110 break;
1112 case '.':
1113 lex_dot (pfile, result);
1114 break;
1116 case '+':
1117 result->type = CPP_PLUS;
1118 c = get_effective_char (buffer);
1119 if (c == '=')
1120 ACCEPT_CHAR (CPP_PLUS_EQ);
1121 else if (c == '+')
1122 ACCEPT_CHAR (CPP_PLUS_PLUS);
1123 break;
1125 case '-':
1126 result->type = CPP_MINUS;
1127 c = get_effective_char (buffer);
1128 if (c == '>')
1130 ACCEPT_CHAR (CPP_DEREF);
1131 if (CPP_OPTION (pfile, cplusplus)
1132 && get_effective_char (buffer) == '*')
1133 ACCEPT_CHAR (CPP_DEREF_STAR);
1135 else if (c == '=')
1136 ACCEPT_CHAR (CPP_MINUS_EQ);
1137 else if (c == '-')
1138 ACCEPT_CHAR (CPP_MINUS_MINUS);
1139 break;
1141 case '*':
1142 result->type = CPP_MULT;
1143 if (get_effective_char (buffer) == '=')
1144 ACCEPT_CHAR (CPP_MULT_EQ);
1145 break;
1147 case '=':
1148 result->type = CPP_EQ;
1149 if (get_effective_char (buffer) == '=')
1150 ACCEPT_CHAR (CPP_EQ_EQ);
1151 break;
1153 case '!':
1154 result->type = CPP_NOT;
1155 if (get_effective_char (buffer) == '=')
1156 ACCEPT_CHAR (CPP_NOT_EQ);
1157 break;
1159 case '&':
1160 result->type = CPP_AND;
1161 c = get_effective_char (buffer);
1162 if (c == '=')
1163 ACCEPT_CHAR (CPP_AND_EQ);
1164 else if (c == '&')
1165 ACCEPT_CHAR (CPP_AND_AND);
1166 break;
1168 case '#':
1169 c = buffer->extra_char; /* Can be set by error condition below. */
1170 if (c != EOF)
1172 buffer->read_ahead = c;
1173 buffer->extra_char = EOF;
1175 else
1176 c = get_effective_char (buffer);
1178 if (c == '#')
1180 ACCEPT_CHAR (CPP_PASTE);
1181 break;
1184 result->type = CPP_HASH;
1185 do_hash:
1186 if (!bol)
1187 break;
1188 /* 6.10.3 paragraph 11: If there are sequences of preprocessing
1189 tokens within the list of arguments that would otherwise act
1190 as preprocessing directives, the behavior is undefined.
1192 This implementation will report a hard error, terminate the
1193 macro invocation, and proceed to process the directive. */
1194 if (pfile->state.parsing_args)
1196 if (pfile->state.parsing_args == 2)
1197 cpp_error (pfile,
1198 "directives may not be used inside a macro argument");
1200 /* Put a '#' in lookahead, return CPP_EOF for parse_arg. */
1201 buffer->extra_char = buffer->read_ahead;
1202 buffer->read_ahead = '#';
1203 pfile->state.next_bol = 1;
1204 result->type = CPP_EOF;
1206 /* Get whitespace right - newline_in_args sets it. */
1207 if (pfile->lexer_pos.col == 1)
1208 result->flags &= ~(PREV_WHITE | AVOID_LPASTE);
1210 else
1212 /* This is the hash introducing a directive. */
1213 if (_cpp_handle_directive (pfile, result->flags & PREV_WHITE))
1214 goto done_directive; /* bol still 1. */
1215 /* This is in fact an assembler #. */
1217 break;
1219 case '|':
1220 result->type = CPP_OR;
1221 c = get_effective_char (buffer);
1222 if (c == '=')
1223 ACCEPT_CHAR (CPP_OR_EQ);
1224 else if (c == '|')
1225 ACCEPT_CHAR (CPP_OR_OR);
1226 break;
1228 case '^':
1229 result->type = CPP_XOR;
1230 if (get_effective_char (buffer) == '=')
1231 ACCEPT_CHAR (CPP_XOR_EQ);
1232 break;
1234 case ':':
1235 result->type = CPP_COLON;
1236 c = get_effective_char (buffer);
1237 if (c == ':' && CPP_OPTION (pfile, cplusplus))
1238 ACCEPT_CHAR (CPP_SCOPE);
1239 else if (c == '>' && CPP_OPTION (pfile, digraphs))
1241 result->flags |= DIGRAPH;
1242 ACCEPT_CHAR (CPP_CLOSE_SQUARE);
1244 break;
1246 case '~': result->type = CPP_COMPL; break;
1247 case ',': result->type = CPP_COMMA; break;
1248 case '(': result->type = CPP_OPEN_PAREN; break;
1249 case ')': result->type = CPP_CLOSE_PAREN; break;
1250 case '[': result->type = CPP_OPEN_SQUARE; break;
1251 case ']': result->type = CPP_CLOSE_SQUARE; break;
1252 case '{': result->type = CPP_OPEN_BRACE; break;
1253 case '}': result->type = CPP_CLOSE_BRACE; break;
1254 case ';': result->type = CPP_SEMICOLON; break;
1256 /* @ is a punctuator in Objective C. */
1257 case '@': result->type = CPP_ATSIGN; break;
1259 random_char:
1260 default:
1261 result->type = CPP_OTHER;
1262 result->val.c = c;
1263 break;
1266 if (pfile->skipping)
1267 goto skip;
1269 /* If not in a directive, this token invalidates controlling macros. */
1270 if (!pfile->state.in_directive)
1271 pfile->mi_state = MI_FAILED;
1274 /* An upper bound on the number of bytes needed to spell a token,
1275 including preceding whitespace. */
1276 unsigned int
1277 cpp_token_len (token)
1278 const cpp_token *token;
1280 unsigned int len;
1282 switch (TOKEN_SPELL (token))
1284 default: len = 0; break;
1285 case SPELL_STRING: len = token->val.str.len; break;
1286 case SPELL_IDENT: len = NODE_LEN (token->val.node); break;
1288 /* 1 for whitespace, 4 for comment delimeters. */
1289 return len + 5;
1292 /* Write the spelling of a token TOKEN to BUFFER. The buffer must
1293 already contain the enough space to hold the token's spelling.
1294 Returns a pointer to the character after the last character
1295 written. */
1296 unsigned char *
1297 cpp_spell_token (pfile, token, buffer)
1298 cpp_reader *pfile; /* Would be nice to be rid of this... */
1299 const cpp_token *token;
1300 unsigned char *buffer;
1302 switch (TOKEN_SPELL (token))
1304 case SPELL_OPERATOR:
1306 const unsigned char *spelling;
1307 unsigned char c;
1309 if (token->flags & DIGRAPH)
1310 spelling
1311 = digraph_spellings[(int) token->type - (int) CPP_FIRST_DIGRAPH];
1312 else if (token->flags & NAMED_OP)
1313 goto spell_ident;
1314 else
1315 spelling = TOKEN_NAME (token);
1317 while ((c = *spelling++) != '\0')
1318 *buffer++ = c;
1320 break;
1322 case SPELL_IDENT:
1323 spell_ident:
1324 memcpy (buffer, NODE_NAME (token->val.node), NODE_LEN (token->val.node));
1325 buffer += NODE_LEN (token->val.node);
1326 break;
1328 case SPELL_STRING:
1330 int left, right, tag;
1331 switch (token->type)
1333 case CPP_STRING: left = '"'; right = '"'; tag = '\0'; break;
1334 case CPP_WSTRING: left = '"'; right = '"'; tag = 'L'; break;
1335 case CPP_CHAR: left = '\''; right = '\''; tag = '\0'; break;
1336 case CPP_WCHAR: left = '\''; right = '\''; tag = 'L'; break;
1337 case CPP_HEADER_NAME: left = '<'; right = '>'; tag = '\0'; break;
1338 default: left = '\0'; right = '\0'; tag = '\0'; break;
1340 if (tag) *buffer++ = tag;
1341 if (left) *buffer++ = left;
1342 memcpy (buffer, token->val.str.text, token->val.str.len);
1343 buffer += token->val.str.len;
1344 if (right) *buffer++ = right;
1346 break;
1348 case SPELL_CHAR:
1349 *buffer++ = token->val.c;
1350 break;
1352 case SPELL_NONE:
1353 cpp_ice (pfile, "Unspellable token %s", TOKEN_NAME (token));
1354 break;
1357 return buffer;
1360 /* Returns a token as a null-terminated string. The string is
1361 temporary, and automatically freed later. Useful for diagnostics. */
1362 unsigned char *
1363 cpp_token_as_text (pfile, token)
1364 cpp_reader *pfile;
1365 const cpp_token *token;
1367 unsigned int len = cpp_token_len (token);
1368 unsigned char *start = _cpp_pool_alloc (&pfile->ident_pool, len), *end;
1370 end = cpp_spell_token (pfile, token, start);
1371 end[0] = '\0';
1373 return start;
1376 /* Used by C front ends. Should really move to using cpp_token_as_text. */
1377 const char *
1378 cpp_type2name (type)
1379 enum cpp_ttype type;
1381 return (const char *) token_spellings[type].name;
1384 /* Writes the spelling of token to FP. Separate from cpp_spell_token
1385 for efficiency - to avoid double-buffering. Also, outputs a space
1386 if PREV_WHITE is flagged. */
1387 void
1388 cpp_output_token (token, fp)
1389 const cpp_token *token;
1390 FILE *fp;
1392 if (token->flags & PREV_WHITE)
1393 putc (' ', fp);
1395 switch (TOKEN_SPELL (token))
1397 case SPELL_OPERATOR:
1399 const unsigned char *spelling;
1401 if (token->flags & DIGRAPH)
1402 spelling
1403 = digraph_spellings[(int) token->type - (int) CPP_FIRST_DIGRAPH];
1404 else if (token->flags & NAMED_OP)
1405 goto spell_ident;
1406 else
1407 spelling = TOKEN_NAME (token);
1409 ufputs (spelling, fp);
1411 break;
1413 spell_ident:
1414 case SPELL_IDENT:
1415 ufputs (NODE_NAME (token->val.node), fp);
1416 break;
1418 case SPELL_STRING:
1420 int left, right, tag;
1421 switch (token->type)
1423 case CPP_STRING: left = '"'; right = '"'; tag = '\0'; break;
1424 case CPP_WSTRING: left = '"'; right = '"'; tag = 'L'; break;
1425 case CPP_CHAR: left = '\''; right = '\''; tag = '\0'; break;
1426 case CPP_WCHAR: left = '\''; right = '\''; tag = 'L'; break;
1427 case CPP_HEADER_NAME: left = '<'; right = '>'; tag = '\0'; break;
1428 default: left = '\0'; right = '\0'; tag = '\0'; break;
1430 if (tag) putc (tag, fp);
1431 if (left) putc (left, fp);
1432 fwrite (token->val.str.text, 1, token->val.str.len, fp);
1433 if (right) putc (right, fp);
1435 break;
1437 case SPELL_CHAR:
1438 putc (token->val.c, fp);
1439 break;
1441 case SPELL_NONE:
1442 /* An error, most probably. */
1443 break;
1447 /* Compare two tokens. */
1449 _cpp_equiv_tokens (a, b)
1450 const cpp_token *a, *b;
1452 if (a->type == b->type && a->flags == b->flags)
1453 switch (TOKEN_SPELL (a))
1455 default: /* Keep compiler happy. */
1456 case SPELL_OPERATOR:
1457 return 1;
1458 case SPELL_CHAR:
1459 return a->val.c == b->val.c; /* Character. */
1460 case SPELL_NONE:
1461 return (a->type != CPP_MACRO_ARG || a->val.arg_no == b->val.arg_no);
1462 case SPELL_IDENT:
1463 return a->val.node == b->val.node;
1464 case SPELL_STRING:
1465 return (a->val.str.len == b->val.str.len
1466 && !memcmp (a->val.str.text, b->val.str.text,
1467 a->val.str.len));
1470 return 0;
1473 /* Determine whether two tokens can be pasted together, and if so,
1474 what the resulting token is. Returns CPP_EOF if the tokens cannot
1475 be pasted, or the appropriate type for the merged token if they
1476 can. */
1477 enum cpp_ttype
1478 cpp_can_paste (pfile, token1, token2, digraph)
1479 cpp_reader * pfile;
1480 const cpp_token *token1, *token2;
1481 int* digraph;
1483 enum cpp_ttype a = token1->type, b = token2->type;
1484 int cxx = CPP_OPTION (pfile, cplusplus);
1486 /* Treat named operators as if they were ordinary NAMEs. */
1487 if (token1->flags & NAMED_OP)
1488 a = CPP_NAME;
1489 if (token2->flags & NAMED_OP)
1490 b = CPP_NAME;
1492 if ((int) a <= (int) CPP_LAST_EQ && b == CPP_EQ)
1493 return (enum cpp_ttype) ((int) a + ((int) CPP_EQ_EQ - (int) CPP_EQ));
1495 switch (a)
1497 case CPP_GREATER:
1498 if (b == a) return CPP_RSHIFT;
1499 if (b == CPP_QUERY && cxx) return CPP_MAX;
1500 if (b == CPP_GREATER_EQ) return CPP_RSHIFT_EQ;
1501 break;
1502 case CPP_LESS:
1503 if (b == a) return CPP_LSHIFT;
1504 if (b == CPP_QUERY && cxx) return CPP_MIN;
1505 if (b == CPP_LESS_EQ) return CPP_LSHIFT_EQ;
1506 if (CPP_OPTION (pfile, digraphs))
1508 if (b == CPP_COLON)
1509 {*digraph = 1; return CPP_OPEN_SQUARE;} /* <: digraph */
1510 if (b == CPP_MOD)
1511 {*digraph = 1; return CPP_OPEN_BRACE;} /* <% digraph */
1513 break;
1515 case CPP_PLUS: if (b == a) return CPP_PLUS_PLUS; break;
1516 case CPP_AND: if (b == a) return CPP_AND_AND; break;
1517 case CPP_OR: if (b == a) return CPP_OR_OR; break;
1519 case CPP_MINUS:
1520 if (b == a) return CPP_MINUS_MINUS;
1521 if (b == CPP_GREATER) return CPP_DEREF;
1522 break;
1523 case CPP_COLON:
1524 if (b == a && cxx) return CPP_SCOPE;
1525 if (b == CPP_GREATER && CPP_OPTION (pfile, digraphs))
1526 {*digraph = 1; return CPP_CLOSE_SQUARE;} /* :> digraph */
1527 break;
1529 case CPP_MOD:
1530 if (CPP_OPTION (pfile, digraphs))
1532 if (b == CPP_GREATER)
1533 {*digraph = 1; return CPP_CLOSE_BRACE;} /* %> digraph */
1534 if (b == CPP_COLON)
1535 {*digraph = 1; return CPP_HASH;} /* %: digraph */
1537 break;
1538 case CPP_DEREF:
1539 if (b == CPP_MULT && cxx) return CPP_DEREF_STAR;
1540 break;
1541 case CPP_DOT:
1542 if (b == CPP_MULT && cxx) return CPP_DOT_STAR;
1543 if (b == CPP_NUMBER) return CPP_NUMBER;
1544 break;
1546 case CPP_HASH:
1547 if (b == a && (token1->flags & DIGRAPH) == (token2->flags & DIGRAPH))
1548 /* %:%: digraph */
1549 {*digraph = (token1->flags & DIGRAPH); return CPP_PASTE;}
1550 break;
1552 case CPP_NAME:
1553 if (b == CPP_NAME) return CPP_NAME;
1554 if (b == CPP_NUMBER
1555 && name_p (pfile, &token2->val.str)) return CPP_NAME;
1556 if (b == CPP_CHAR
1557 && token1->val.node == pfile->spec_nodes.n_L) return CPP_WCHAR;
1558 if (b == CPP_STRING
1559 && token1->val.node == pfile->spec_nodes.n_L) return CPP_WSTRING;
1560 break;
1562 case CPP_NUMBER:
1563 if (b == CPP_NUMBER) return CPP_NUMBER;
1564 if (b == CPP_NAME) return CPP_NUMBER;
1565 if (b == CPP_DOT) return CPP_NUMBER;
1566 /* Numbers cannot have length zero, so this is safe. */
1567 if ((b == CPP_PLUS || b == CPP_MINUS)
1568 && VALID_SIGN ('+', token1->val.str.text[token1->val.str.len - 1]))
1569 return CPP_NUMBER;
1570 break;
1572 default:
1573 break;
1576 return CPP_EOF;
1579 /* Returns nonzero if a space should be inserted to avoid an
1580 accidental token paste for output. For simplicity, it is
1581 conservative, and occasionally advises a space where one is not
1582 needed, e.g. "." and ".2". */
1585 cpp_avoid_paste (pfile, token1, token2)
1586 cpp_reader *pfile;
1587 const cpp_token *token1, *token2;
1589 enum cpp_ttype a = token1->type, b = token2->type;
1590 cppchar_t c;
1592 if (token1->flags & NAMED_OP)
1593 a = CPP_NAME;
1594 if (token2->flags & NAMED_OP)
1595 b = CPP_NAME;
1597 c = EOF;
1598 if (token2->flags & DIGRAPH)
1599 c = digraph_spellings[(int) b - (int) CPP_FIRST_DIGRAPH][0];
1600 else if (token_spellings[b].category == SPELL_OPERATOR)
1601 c = token_spellings[b].name[0];
1603 /* Quickly get everything that can paste with an '='. */
1604 if ((int) a <= (int) CPP_LAST_EQ && c == '=')
1605 return 1;
1607 switch (a)
1609 case CPP_GREATER: return c == '>' || c == '?';
1610 case CPP_LESS: return c == '<' || c == '?' || c == '%' || c == ':';
1611 case CPP_PLUS: return c == '+';
1612 case CPP_MINUS: return c == '-' || c == '>';
1613 case CPP_DIV: return c == '/' || c == '*'; /* Comments. */
1614 case CPP_MOD: return c == ':' || c == '>';
1615 case CPP_AND: return c == '&';
1616 case CPP_OR: return c == '|';
1617 case CPP_COLON: return c == ':' || c == '>';
1618 case CPP_DEREF: return c == '*';
1619 case CPP_DOT: return c == '.' || c == '%' || b == CPP_NUMBER;
1620 case CPP_HASH: return c == '#' || c == '%'; /* Digraph form. */
1621 case CPP_NAME: return ((b == CPP_NUMBER
1622 && name_p (pfile, &token2->val.str))
1623 || b == CPP_NAME
1624 || b == CPP_CHAR || b == CPP_STRING); /* L */
1625 case CPP_NUMBER: return (b == CPP_NUMBER || b == CPP_NAME
1626 || c == '.' || c == '+' || c == '-');
1627 case CPP_OTHER: return (CPP_OPTION (pfile, objc)
1628 && token1->val.c == '@'
1629 && (b == CPP_NAME || b == CPP_STRING));
1630 default: break;
1633 return 0;
1636 /* Output all the remaining tokens on the current line, and a newline
1637 character, to FP. Leading whitespace is removed. */
1638 void
1639 cpp_output_line (pfile, fp)
1640 cpp_reader *pfile;
1641 FILE *fp;
1643 cpp_token token;
1645 cpp_get_token (pfile, &token);
1646 token.flags &= ~PREV_WHITE;
1647 while (token.type != CPP_EOF)
1649 cpp_output_token (&token, fp);
1650 cpp_get_token (pfile, &token);
1653 putc ('\n', fp);
/* Returns the value (0-15) of the hexadecimal digit C, which may be
   in either letter case.  Aborts if C is not a hexadecimal digit, so
   callers must check first.  */
static unsigned int
hex_digit_value (c)
     unsigned int c;
{
  if (c >= 'a' && c <= 'f')
    return c - 'a' + 10;
  if (c >= 'A' && c <= 'F')
    return c - 'A' + 10;
  if (c >= '0' && c <= '9')
    return c - '0';
  abort ();
}
1670 /* Parse a '\uNNNN' or '\UNNNNNNNN' sequence. Returns 1 to indicate
1671 failure if cpplib is not parsing C++ or C99. Such failure is
1672 silent, and no variables are updated. Otherwise returns 0, and
1673 warns if -Wtraditional.
1675 [lex.charset]: The character designated by the universal character
1676 name \UNNNNNNNN is that character whose character short name in
1677 ISO/IEC 10646 is NNNNNNNN; the character designated by the
1678 universal character name \uNNNN is that character whose character
1679 short name in ISO/IEC 10646 is 0000NNNN. If the hexadecimal value
1680 for a universal character name is less than 0x20 or in the range
1681 0x7F-0x9F (inclusive), or if the universal character name
1682 designates a character in the basic source character set, then the
1683 program is ill-formed.
1685 We assume that wchar_t is Unicode, so we don't need to do any
1686 mapping. Is this ever wrong?
1688 PC points to the 'u' or 'U', PSTR is points to the byte after PC,
1689 LIMIT is the end of the string or charconst. PSTR is updated to
1690 point after the UCS on return, and the UCS is written into PC. */
1692 static int
1693 maybe_read_ucs (pfile, pstr, limit, pc)
1694 cpp_reader *pfile;
1695 const unsigned char **pstr;
1696 const unsigned char *limit;
1697 unsigned int *pc;
1699 const unsigned char *p = *pstr;
1700 unsigned int code = 0;
1701 unsigned int c = *pc, length;
1703 /* Only attempt to interpret a UCS for C++ and C99. */
1704 if (! (CPP_OPTION (pfile, cplusplus) || CPP_OPTION (pfile, c99)))
1705 return 1;
1707 if (CPP_WTRADITIONAL (pfile))
1708 cpp_warning (pfile, "the meaning of '\\%c' varies with -traditional", c);
1710 length = (c == 'u' ? 4: 8);
1712 if ((size_t) (limit - p) < length)
1714 cpp_error (pfile, "incomplete universal-character-name");
1715 /* Skip to the end to avoid more diagnostics. */
1716 p = limit;
1718 else
1720 for (; length; length--, p++)
1722 c = *p;
1723 if (ISXDIGIT (c))
1724 code = (code << 4) + hex_digit_value (c);
1725 else
1727 cpp_error (pfile,
1728 "non-hex digit '%c' in universal-character-name", c);
1729 /* We shouldn't skip in case there are multibyte chars. */
1730 break;
1735 #ifdef TARGET_EBCDIC
1736 cpp_error (pfile, "universal-character-name on EBCDIC target");
1737 code = 0x3f; /* EBCDIC invalid character */
1738 #else
1739 /* True extended characters are OK. */
1740 if (code >= 0xa0
1741 && !(code & 0x80000000)
1742 && !(code >= 0xD800 && code <= 0xDFFF))
1744 /* The standard permits $, @ and ` to be specified as UCNs. We use
1745 hex escapes so that this also works with EBCDIC hosts. */
1746 else if (code == 0x24 || code == 0x40 || code == 0x60)
1748 /* Don't give another error if one occurred above. */
1749 else if (length == 0)
1750 cpp_error (pfile, "universal-character-name out of range");
1751 #endif
1753 *pstr = p;
1754 *pc = code;
1755 return 0;
1758 /* Interpret an escape sequence, and return its value. PSTR points to
1759 the input pointer, which is just after the backslash. LIMIT is how
1760 much text we have. MASK is a bitmask for the precision for the
1761 destination type (char or wchar_t). TRADITIONAL, if true, does not
1762 interpret escapes that did not exist in traditional C.
1764 Handles all relevant diagnostics. */
1766 unsigned int
1767 cpp_parse_escape (pfile, pstr, limit, mask, traditional)
1768 cpp_reader *pfile;
1769 const unsigned char **pstr;
1770 const unsigned char *limit;
1771 unsigned HOST_WIDE_INT mask;
1772 int traditional;
1774 int unknown = 0;
1775 const unsigned char *str = *pstr;
1776 unsigned int c = *str++;
1778 switch (c)
1780 case '\\': case '\'': case '"': case '?': break;
1781 case 'b': c = TARGET_BS; break;
1782 case 'f': c = TARGET_FF; break;
1783 case 'n': c = TARGET_NEWLINE; break;
1784 case 'r': c = TARGET_CR; break;
1785 case 't': c = TARGET_TAB; break;
1786 case 'v': c = TARGET_VT; break;
1788 case '(': case '{': case '[': case '%':
1789 /* '\(', etc, are used at beginning of line to avoid confusing Emacs.
1790 '\%' is used to prevent SCCS from getting confused. */
1791 unknown = CPP_PEDANTIC (pfile);
1792 break;
1794 case 'a':
1795 if (CPP_WTRADITIONAL (pfile))
1796 cpp_warning (pfile, "the meaning of '\\a' varies with -traditional");
1797 if (!traditional)
1798 c = TARGET_BELL;
1799 break;
1801 case 'e': case 'E':
1802 if (CPP_PEDANTIC (pfile))
1803 cpp_pedwarn (pfile, "non-ISO-standard escape sequence, '\\%c'", c);
1804 c = TARGET_ESC;
1805 break;
1807 case 'u': case 'U':
1808 unknown = maybe_read_ucs (pfile, &str, limit, &c);
1809 break;
1811 case 'x':
1812 if (CPP_WTRADITIONAL (pfile))
1813 cpp_warning (pfile, "the meaning of '\\x' varies with -traditional");
1815 if (!traditional)
1817 unsigned int i = 0, overflow = 0;
1818 int digits_found = 0;
1820 while (str < limit)
1822 c = *str;
1823 if (! ISXDIGIT (c))
1824 break;
1825 str++;
1826 overflow |= i ^ (i << 4 >> 4);
1827 i = (i << 4) + hex_digit_value (c);
1828 digits_found = 1;
1831 if (!digits_found)
1832 cpp_error (pfile, "\\x used with no following hex digits");
1834 if (overflow | (i != (i & mask)))
1836 cpp_pedwarn (pfile, "hex escape sequence out of range");
1837 i &= mask;
1839 c = i;
1841 break;
1843 case '0': case '1': case '2': case '3':
1844 case '4': case '5': case '6': case '7':
1846 unsigned int i = c - '0';
1847 int count = 0;
1849 while (str < limit && ++count < 3)
1851 c = *str;
1852 if (c < '0' || c > '7')
1853 break;
1854 str++;
1855 i = (i << 3) + c - '0';
1858 if (i != (i & mask))
1860 cpp_pedwarn (pfile, "octal escape sequence out of range");
1861 i &= mask;
1863 c = i;
1865 break;
1867 default:
1868 unknown = 1;
1869 break;
1872 if (unknown)
1874 if (ISGRAPH (c))
1875 cpp_pedwarn (pfile, "unknown escape sequence '\\%c'", c);
1876 else
1877 cpp_pedwarn (pfile, "unknown escape sequence: '\\%03o'", c);
1880 if (c > mask)
1881 cpp_pedwarn (pfile, "escape sequence out of range for character");
1883 *pstr = str;
1884 return c;
1887 #ifndef MAX_CHAR_TYPE_SIZE
1888 #define MAX_CHAR_TYPE_SIZE CHAR_TYPE_SIZE
1889 #endif
1891 #ifndef MAX_WCHAR_TYPE_SIZE
1892 #define MAX_WCHAR_TYPE_SIZE WCHAR_TYPE_SIZE
1893 #endif
1895 /* Interpret a (possibly wide) character constant in TOKEN.
1896 WARN_MULTI warns about multi-character charconsts, if not
1897 TRADITIONAL. TRADITIONAL also indicates not to interpret escapes
1898 that did not exist in traditional C. PCHARS_SEEN points to a
1899 variable that is filled in with the number of characters seen. */
1900 HOST_WIDE_INT
1901 cpp_interpret_charconst (pfile, token, warn_multi, traditional, pchars_seen)
1902 cpp_reader *pfile;
1903 const cpp_token *token;
1904 int warn_multi;
1905 int traditional;
1906 unsigned int *pchars_seen;
1908 const unsigned char *str = token->val.str.text;
1909 const unsigned char *limit = str + token->val.str.len;
1910 unsigned int chars_seen = 0;
1911 unsigned int width, max_chars, c;
1912 unsigned HOST_WIDE_INT mask;
1913 HOST_WIDE_INT result = 0;
1915 #ifdef MULTIBYTE_CHARS
1916 (void) local_mbtowc (NULL, NULL, 0);
1917 #endif
1919 /* Width in bits. */
1920 if (token->type == CPP_CHAR)
1921 width = MAX_CHAR_TYPE_SIZE;
1922 else
1923 width = MAX_WCHAR_TYPE_SIZE;
1925 if (width < HOST_BITS_PER_WIDE_INT)
1926 mask = ((unsigned HOST_WIDE_INT) 1 << width) - 1;
1927 else
1928 mask = ~0;
1929 max_chars = HOST_BITS_PER_WIDE_INT / width;
1931 while (str < limit)
1933 #ifdef MULTIBYTE_CHARS
1934 wchar_t wc;
1935 int char_len;
1937 char_len = local_mbtowc (&wc, str, limit - str);
1938 if (char_len == -1)
1940 cpp_warning (pfile, "ignoring invalid multibyte character");
1941 c = *str++;
1943 else
1945 str += char_len;
1946 c = wc;
1948 #else
1949 c = *str++;
1950 #endif
1952 if (c == '\\')
1953 c = cpp_parse_escape (pfile, &str, limit, mask, traditional);
1955 #ifdef MAP_CHARACTER
1956 if (ISPRINT (c))
1957 c = MAP_CHARACTER (c);
1958 #endif
1960 /* Merge character into result; ignore excess chars. */
1961 if (++chars_seen <= max_chars)
1963 if (width < HOST_BITS_PER_WIDE_INT)
1964 result = (result << width) | (c & mask);
1965 else
1966 result = c;
1970 if (chars_seen == 0)
1971 cpp_error (pfile, "empty character constant");
1972 else if (chars_seen > max_chars)
1974 chars_seen = max_chars;
1975 cpp_warning (pfile, "character constant too long");
1977 else if (chars_seen > 1 && !traditional && warn_multi)
1978 cpp_warning (pfile, "multi-character character constant");
1980 /* If char type is signed, sign-extend the constant. The
1981 __CHAR_UNSIGNED__ macro is set by the driver if appropriate. */
1982 if (token->type == CPP_CHAR && chars_seen)
1984 unsigned int nbits = chars_seen * width;
1985 unsigned int mask = (unsigned int) ~0 >> (HOST_BITS_PER_INT - nbits);
1987 if (pfile->spec_nodes.n__CHAR_UNSIGNED__->type == NT_MACRO
1988 || ((result >> (nbits - 1)) & 1) == 0)
1989 result &= mask;
1990 else
1991 result |= ~mask;
1994 *pchars_seen = chars_seen;
1995 return result;
/* Memory pools.  */

/* Never instantiated: the offset of member U, which follows a single
   char, is used as the default alignment for pool allocations.  */
struct dummy
{
  char c;
  union
  {
    double d;
    int *p;
  } u;
};

#define DEFAULT_ALIGNMENT (offsetof (struct dummy, u))
2012 static int
2013 chunk_suitable (pool, chunk, size)
2014 cpp_pool *pool;
2015 cpp_chunk *chunk;
2016 unsigned int size;
2018 /* Being at least twice SIZE means we can use memcpy in
2019 _cpp_next_chunk rather than memmove. Besides, it's a good idea
2020 anyway. */
2021 return (chunk && pool->locked != chunk
2022 && (unsigned int) (chunk->limit - chunk->base) >= size * 2);
2025 /* Returns the end of the new pool. PTR points to a char in the old
2026 pool, and is updated to point to the same char in the new pool. */
2027 unsigned char *
2028 _cpp_next_chunk (pool, len, ptr)
2029 cpp_pool *pool;
2030 unsigned int len;
2031 unsigned char **ptr;
2033 cpp_chunk *chunk = pool->cur->next;
2035 /* LEN is the minimum size we want in the new pool. */
2036 len += POOL_ROOM (pool);
2037 if (! chunk_suitable (pool, chunk, len))
2039 chunk = new_chunk (POOL_SIZE (pool) * 2 + len);
2041 chunk->next = pool->cur->next;
2042 pool->cur->next = chunk;
2045 /* Update the pointer before changing chunk's front. */
2046 if (ptr)
2047 *ptr += chunk->base - POOL_FRONT (pool);
2049 memcpy (chunk->base, POOL_FRONT (pool), POOL_ROOM (pool));
2050 chunk->front = chunk->base;
2052 pool->cur = chunk;
2053 return POOL_LIMIT (pool);
2056 static cpp_chunk *
2057 new_chunk (size)
2058 unsigned int size;
2060 unsigned char *base;
2061 cpp_chunk *result;
2063 size = POOL_ALIGN (size, DEFAULT_ALIGNMENT);
2064 base = (unsigned char *) xmalloc (size + sizeof (cpp_chunk));
2065 /* Put the chunk descriptor at the end. Then chunk overruns will
2066 cause obvious chaos. */
2067 result = (cpp_chunk *) (base + size);
2068 result->base = base;
2069 result->front = base;
2070 result->limit = base + size;
2071 result->next = 0;
2073 return result;
2076 void
2077 _cpp_init_pool (pool, size, align, temp)
2078 cpp_pool *pool;
2079 unsigned int size, align, temp;
2081 if (align == 0)
2082 align = DEFAULT_ALIGNMENT;
2083 if (align & (align - 1))
2084 abort ();
2085 pool->align = align;
2086 pool->cur = new_chunk (size);
2087 pool->locked = 0;
2088 pool->locks = 0;
2089 if (temp)
2090 pool->cur->next = pool->cur;
2093 void
2094 _cpp_lock_pool (pool)
2095 cpp_pool *pool;
2097 if (pool->locks++ == 0)
2098 pool->locked = pool->cur;
2101 void
2102 _cpp_unlock_pool (pool)
2103 cpp_pool *pool;
2105 if (--pool->locks == 0)
2106 pool->locked = 0;
2109 void
2110 _cpp_free_pool (pool)
2111 cpp_pool *pool;
2113 cpp_chunk *chunk = pool->cur, *next;
2117 next = chunk->next;
2118 free (chunk->base);
2119 chunk = next;
2121 while (chunk && chunk != pool->cur);
2124 /* Reserve LEN bytes from a memory pool. */
2125 unsigned char *
2126 _cpp_pool_reserve (pool, len)
2127 cpp_pool *pool;
2128 unsigned int len;
2130 len = POOL_ALIGN (len, pool->align);
2131 if (len > (unsigned int) POOL_ROOM (pool))
2132 _cpp_next_chunk (pool, len, 0);
2134 return POOL_FRONT (pool);
2137 /* Allocate LEN bytes from a memory pool. */
2138 unsigned char *
2139 _cpp_pool_alloc (pool, len)
2140 cpp_pool *pool;
2141 unsigned int len;
2143 unsigned char *result = _cpp_pool_reserve (pool, len);
2145 POOL_COMMIT (pool, len);
2146 return result;