* invoke.texi: Add more options to summary list.
[official-gcc.git] / gcc / cpplex.c
blob290d33977ad0409dcfac8854db443ee0cbacc61c
1 /* CPP Library - lexical analysis.
2 Copyright (C) 2000 Free Software Foundation, Inc.
3 Contributed by Per Bothner, 1994-95.
4 Based on CCCP program by Paul Rubin, June 1986
5 Adapted to ANSI C, Richard Stallman, Jan 1987
6 Broken out to separate file, Zack Weinberg, Mar 2000
7 Single-pass line tokenization by Neil Booth, April 2000
9 This program is free software; you can redistribute it and/or modify it
10 under the terms of the GNU General Public License as published by the
11 Free Software Foundation; either version 2, or (at your option) any
12 later version.
14 This program is distributed in the hope that it will be useful,
15 but WITHOUT ANY WARRANTY; without even the implied warranty of
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 GNU General Public License for more details.
19 You should have received a copy of the GNU General Public License
20 along with this program; if not, write to the Free Software
21 Foundation, 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
23 /* This lexer works with a single pass of the file. Recently I
24 re-wrote it to minimize the places where we step backwards in the
25 input stream, to make future changes to support multi-byte
26 character sets fairly straight-forward.
28 There is now only one routine where we do step backwards:
29 skip_escaped_newlines. This routine could probably also be changed
30 so that it doesn't need to step back. One possibility is to use a
31 trick similar to that used in lex_period and lex_percent. Two
32 extra characters might be needed, but skip_escaped_newlines itself
33 would probably be the only place that needs to be aware of that,
34 and changes to the remaining routines would probably only be needed
35 if they process a backslash. */
37 #include "config.h"
38 #include "system.h"
39 #include "cpplib.h"
40 #include "cpphash.h"
42 /* MULTIBYTE_CHARS support only works for native compilers.
43 ??? Ideally what we want is to model widechar support after
44 the current floating point support. */
45 #ifdef CROSS_COMPILE
46 #undef MULTIBYTE_CHARS
47 #endif
49 #ifdef MULTIBYTE_CHARS
50 #include "mbchar.h"
51 #include <locale.h>
52 #endif
/* Spelling category of a token type, as recorded in the
   token_spellings table.  OP() table entries are given SPELL_OPERATOR
   together with a fixed spelling string; TK() entries name their own
   category.

   Tokens with SPELL_STRING store their spelling in the token list,
   and its length in token->val.str.len.  */
enum spell_type
{
  SPELL_OPERATOR = 0,
  SPELL_CHAR,
  SPELL_IDENT,
  SPELL_STRING,
  SPELL_NONE
};
65 struct token_spelling
67 enum spell_type category;
68 const unsigned char *name;
71 const unsigned char *digraph_spellings [] = {U"%:", U"%:%:", U"<:",
72 U":>", U"<%", U"%>"};
74 #define OP(e, s) { SPELL_OPERATOR, U s },
75 #define TK(e, s) { s, U STRINGX (e) },
76 const struct token_spelling token_spellings [N_TTYPES] = {TTYPE_TABLE };
77 #undef OP
78 #undef TK
80 #define TOKEN_SPELL(token) (token_spellings[(token)->type].category)
81 #define TOKEN_NAME(token) (token_spellings[(token)->type].name)
83 static cppchar_t handle_newline PARAMS ((cpp_buffer *, cppchar_t));
84 static cppchar_t skip_escaped_newlines PARAMS ((cpp_buffer *, cppchar_t));
85 static cppchar_t get_effective_char PARAMS ((cpp_buffer *));
87 static int skip_block_comment PARAMS ((cpp_reader *));
88 static int skip_line_comment PARAMS ((cpp_reader *));
89 static void adjust_column PARAMS ((cpp_reader *));
90 static void skip_whitespace PARAMS ((cpp_reader *, cppchar_t));
91 static cpp_hashnode *parse_identifier PARAMS ((cpp_reader *, cppchar_t));
92 static void parse_number PARAMS ((cpp_reader *, cpp_string *, cppchar_t, int));
93 static int unescaped_terminator_p PARAMS ((cpp_reader *, const U_CHAR *));
94 static void parse_string PARAMS ((cpp_reader *, cpp_token *, cppchar_t));
95 static void unterminated PARAMS ((cpp_reader *, int));
96 static int trigraph_ok PARAMS ((cpp_reader *, cppchar_t));
97 static void save_comment PARAMS ((cpp_reader *, cpp_token *, const U_CHAR *));
98 static void lex_percent PARAMS ((cpp_buffer *, cpp_token *));
99 static void lex_dot PARAMS ((cpp_reader *, cpp_token *));
100 static int name_p PARAMS ((cpp_reader *, const cpp_string *));
101 static unsigned int parse_escape PARAMS ((cpp_reader *, const unsigned char **,
102 const unsigned char *, HOST_WIDE_INT,
103 int));
104 static unsigned int read_ucs PARAMS ((cpp_reader *, const unsigned char **,
105 const unsigned char *, unsigned int));
107 static cpp_chunk *new_chunk PARAMS ((unsigned int));
108 static int chunk_suitable PARAMS ((cpp_pool *, cpp_chunk *, unsigned int));
109 static unsigned int hex_digit_value PARAMS ((unsigned int));
111 /* Utility routine:
113 Compares, the token TOKEN to the NUL-terminated string STRING.
114 TOKEN must be a CPP_NAME. Returns 1 for equal, 0 for unequal. */
117 cpp_ideq (token, string)
118 const cpp_token *token;
119 const char *string;
121 if (token->type != CPP_NAME)
122 return 0;
124 return !ustrcmp (NODE_NAME (token->val.node), (const U_CHAR *) string);
127 /* Call when meeting a newline. Returns the character after the newline
128 (or carriage-return newline combination), or EOF. */
129 static cppchar_t
130 handle_newline (buffer, newline_char)
131 cpp_buffer *buffer;
132 cppchar_t newline_char;
134 cppchar_t next = EOF;
136 buffer->col_adjust = 0;
137 buffer->lineno++;
138 buffer->line_base = buffer->cur;
140 /* Handle CR-LF and LF-CR combinations, get the next character. */
141 if (buffer->cur < buffer->rlimit)
143 next = *buffer->cur++;
144 if (next + newline_char == '\r' + '\n')
146 buffer->line_base = buffer->cur;
147 if (buffer->cur < buffer->rlimit)
148 next = *buffer->cur++;
149 else
150 next = EOF;
154 buffer->read_ahead = next;
155 return next;
158 /* Subroutine of skip_escaped_newlines; called when a trigraph is
159 encountered. It warns if necessary, and returns true if the
160 trigraph should be honoured. FROM_CHAR is the third character of a
161 trigraph, and presumed to be the previous character for position
162 reporting. */
163 static int
164 trigraph_ok (pfile, from_char)
165 cpp_reader *pfile;
166 cppchar_t from_char;
168 int accept = CPP_OPTION (pfile, trigraphs);
170 /* Don't warn about trigraphs in comments. */
171 if (CPP_OPTION (pfile, warn_trigraphs) && !pfile->state.lexing_comment)
173 cpp_buffer *buffer = pfile->buffer;
174 if (accept)
175 cpp_warning_with_line (pfile, buffer->lineno, CPP_BUF_COL (buffer) - 2,
176 "trigraph ??%c converted to %c",
177 (int) from_char,
178 (int) _cpp_trigraph_map[from_char]);
179 else if (buffer->cur != buffer->last_Wtrigraphs)
181 buffer->last_Wtrigraphs = buffer->cur;
182 cpp_warning_with_line (pfile, buffer->lineno,
183 CPP_BUF_COL (buffer) - 2,
184 "trigraph ??%c ignored", (int) from_char);
188 return accept;
/* Set the type of the token being built to T and consume the
   character held in the read-ahead slot.  Assumes local variables
   buffer and result.  */
#define ACCEPT_CHAR(t) \
  do { result->type = t; buffer->read_ahead = EOF; } while (0)

/* Save and restore the read position of the buffer.  Both assume
   local variables buffer and saved_cur.

   When we move to multibyte character sets, add to these something
   that saves and restores the state of the multibyte conversion
   library.  This probably involves saving and restoring a "cookie".
   In the case of glibc it is an 8-byte structure, so is not a high
   overhead operation.  In any case, it's out of the fast path.  */
#define SAVE_STATE() do { saved_cur = buffer->cur; } while (0)
#define RESTORE_STATE() do { buffer->cur = saved_cur; } while (0)
203 /* Skips any escaped newlines introduced by NEXT, which is either a
204 '?' or a '\\'. Returns the next character, which will also have
205 been placed in buffer->read_ahead. This routine performs
206 preprocessing stages 1 and 2 of the ISO C standard. */
207 static cppchar_t
208 skip_escaped_newlines (buffer, next)
209 cpp_buffer *buffer;
210 cppchar_t next;
212 /* Only do this if we apply stages 1 and 2. */
213 if (!buffer->from_stage3)
215 cppchar_t next1;
216 const unsigned char *saved_cur;
217 int space;
221 if (buffer->cur == buffer->rlimit)
222 break;
224 SAVE_STATE ();
225 if (next == '?')
227 next1 = *buffer->cur++;
228 if (next1 != '?' || buffer->cur == buffer->rlimit)
230 RESTORE_STATE ();
231 break;
234 next1 = *buffer->cur++;
235 if (!_cpp_trigraph_map[next1]
236 || !trigraph_ok (buffer->pfile, next1))
238 RESTORE_STATE ();
239 break;
242 /* We have a full trigraph here. */
243 next = _cpp_trigraph_map[next1];
244 if (next != '\\' || buffer->cur == buffer->rlimit)
245 break;
246 SAVE_STATE ();
249 /* We have a backslash, and room for at least one more character. */
250 space = 0;
253 next1 = *buffer->cur++;
254 if (!is_nvspace (next1))
255 break;
256 space = 1;
258 while (buffer->cur < buffer->rlimit);
260 if (!is_vspace (next1))
262 RESTORE_STATE ();
263 break;
266 if (space && !buffer->pfile->state.lexing_comment)
267 cpp_warning (buffer->pfile,
268 "backslash and newline separated by space");
270 next = handle_newline (buffer, next1);
271 if (next == EOF)
272 cpp_pedwarn (buffer->pfile, "backslash-newline at end of file");
274 while (next == '\\' || next == '?');
277 buffer->read_ahead = next;
278 return next;
281 /* Obtain the next character, after trigraph conversion and skipping
282 an arbitrary string of escaped newlines. The common case of no
283 trigraphs or escaped newlines falls through quickly. */
284 static cppchar_t
285 get_effective_char (buffer)
286 cpp_buffer *buffer;
288 cppchar_t next = EOF;
290 if (buffer->cur < buffer->rlimit)
292 next = *buffer->cur++;
294 /* '?' can introduce trigraphs (and therefore backslash); '\\'
295 can introduce escaped newlines, which we want to skip, or
296 UCNs, which, depending upon lexer state, we will handle in
297 the future. */
298 if (next == '?' || next == '\\')
299 next = skip_escaped_newlines (buffer, next);
302 buffer->read_ahead = next;
303 return next;
306 /* Skip a C-style block comment. We find the end of the comment by
307 seeing if an asterisk is before every '/' we encounter. Returns
308 non-zero if comment terminated by EOF, zero otherwise. */
309 static int
310 skip_block_comment (pfile)
311 cpp_reader *pfile;
313 cpp_buffer *buffer = pfile->buffer;
314 cppchar_t c = EOF, prevc = EOF;
316 pfile->state.lexing_comment = 1;
317 while (buffer->cur != buffer->rlimit)
319 prevc = c, c = *buffer->cur++;
321 next_char:
322 /* FIXME: For speed, create a new character class of characters
323 of interest inside block comments. */
324 if (c == '?' || c == '\\')
325 c = skip_escaped_newlines (buffer, c);
327 /* People like decorating comments with '*', so check for '/'
328 instead for efficiency. */
329 if (c == '/')
331 if (prevc == '*')
332 break;
334 /* Warn about potential nested comments, but not if the '/'
335 comes immediately before the true comment delimeter.
336 Don't bother to get it right across escaped newlines. */
337 if (CPP_OPTION (pfile, warn_comments)
338 && buffer->cur != buffer->rlimit)
340 prevc = c, c = *buffer->cur++;
341 if (c == '*' && buffer->cur != buffer->rlimit)
343 prevc = c, c = *buffer->cur++;
344 if (c != '/')
345 cpp_warning_with_line (pfile, CPP_BUF_LINE (buffer),
346 CPP_BUF_COL (buffer),
347 "\"/*\" within comment");
349 goto next_char;
352 else if (is_vspace (c))
354 prevc = c, c = handle_newline (buffer, c);
355 goto next_char;
357 else if (c == '\t')
358 adjust_column (pfile);
361 pfile->state.lexing_comment = 0;
362 buffer->read_ahead = EOF;
363 return c != '/' || prevc != '*';
366 /* Skip a C++ line comment. Handles escaped newlines. Returns
367 non-zero if a multiline comment. The following new line, if any,
368 is left in buffer->read_ahead. */
369 static int
370 skip_line_comment (pfile)
371 cpp_reader *pfile;
373 cpp_buffer *buffer = pfile->buffer;
374 unsigned int orig_lineno = buffer->lineno;
375 cppchar_t c;
377 pfile->state.lexing_comment = 1;
380 c = EOF;
381 if (buffer->cur == buffer->rlimit)
382 break;
384 c = *buffer->cur++;
385 if (c == '?' || c == '\\')
386 c = skip_escaped_newlines (buffer, c);
388 while (!is_vspace (c));
390 pfile->state.lexing_comment = 0;
391 buffer->read_ahead = c; /* Leave any newline for caller. */
392 return orig_lineno != buffer->lineno;
395 /* pfile->buffer->cur is one beyond the \t character. Update
396 col_adjust so we track the column correctly. */
397 static void
398 adjust_column (pfile)
399 cpp_reader *pfile;
401 cpp_buffer *buffer = pfile->buffer;
402 unsigned int col = CPP_BUF_COL (buffer) - 1; /* Zero-based column. */
404 /* Round it up to multiple of the tabstop, but subtract 1 since the
405 tab itself occupies a character position. */
406 buffer->col_adjust += (CPP_OPTION (pfile, tabstop)
407 - col % CPP_OPTION (pfile, tabstop)) - 1;
410 /* Skips whitespace, saving the next non-whitespace character.
411 Adjusts pfile->col_adjust to account for tabs. Without this,
412 tokens might be assigned an incorrect column. */
413 static void
414 skip_whitespace (pfile, c)
415 cpp_reader *pfile;
416 cppchar_t c;
418 cpp_buffer *buffer = pfile->buffer;
419 unsigned int warned = 0;
423 /* Horizontal space always OK. */
424 if (c == ' ')
426 else if (c == '\t')
427 adjust_column (pfile);
428 /* Just \f \v or \0 left. */
429 else if (c == '\0')
431 if (!warned)
433 cpp_warning (pfile, "null character(s) ignored");
434 warned = 1;
437 else if (pfile->state.in_directive && CPP_PEDANTIC (pfile))
438 cpp_pedwarn_with_line (pfile, CPP_BUF_LINE (buffer),
439 CPP_BUF_COL (buffer),
440 "%s in preprocessing directive",
441 c == '\f' ? "form feed" : "vertical tab");
443 c = EOF;
444 if (buffer->cur == buffer->rlimit)
445 break;
446 c = *buffer->cur++;
448 /* We only want non-vertical space, i.e. ' ' \t \f \v \0. */
449 while (is_nvspace (c));
451 /* Remember the next character. */
452 buffer->read_ahead = c;
455 /* See if the characters of a number token are valid in a name (no
456 '.', '+' or '-'). */
457 static int
458 name_p (pfile, string)
459 cpp_reader *pfile;
460 const cpp_string *string;
462 unsigned int i;
464 for (i = 0; i < string->len; i++)
465 if (!is_idchar (string->text[i]))
466 return 0;
468 return 1;
471 /* Parse an identifier, skipping embedded backslash-newlines.
472 Calculate the hash value of the token while parsing, for improved
473 performance. The hashing algorithm *must* match cpp_lookup(). */
475 static cpp_hashnode *
476 parse_identifier (pfile, c)
477 cpp_reader *pfile;
478 cppchar_t c;
480 cpp_hashnode *result;
481 cpp_buffer *buffer = pfile->buffer;
482 unsigned int saw_dollar = 0, len;
483 struct obstack *stack = &pfile->hash_table->stack;
489 obstack_1grow (stack, c);
491 if (c == '$')
492 saw_dollar++;
494 c = EOF;
495 if (buffer->cur == buffer->rlimit)
496 break;
498 c = *buffer->cur++;
500 while (is_idchar (c));
502 /* Potential escaped newline? */
503 if (c != '?' && c != '\\')
504 break;
505 c = skip_escaped_newlines (buffer, c);
507 while (is_idchar (c));
509 /* Remember the next character. */
510 buffer->read_ahead = c;
512 /* $ is not a identifier character in the standard, but is commonly
513 accepted as an extension. Don't warn about it in skipped
514 conditional blocks. */
515 if (saw_dollar && CPP_PEDANTIC (pfile) && ! pfile->skipping)
516 cpp_pedwarn (pfile, "'$' character(s) in identifier");
518 /* Identifiers are null-terminated. */
519 len = obstack_object_size (stack);
520 obstack_1grow (stack, '\0');
522 /* This routine commits the memory if necessary. */
523 result = (cpp_hashnode *)
524 ht_lookup (pfile->hash_table, obstack_finish (stack), len, HT_ALLOCED);
526 /* Some identifiers require diagnostics when lexed. */
527 if (result->flags & NODE_DIAGNOSTIC && !pfile->skipping)
529 /* It is allowed to poison the same identifier twice. */
530 if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
531 cpp_error (pfile, "attempt to use poisoned \"%s\"",
532 NODE_NAME (result));
534 /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
535 replacement list of a variadic macro. */
536 if (result == pfile->spec_nodes.n__VA_ARGS__
537 && !pfile->state.va_args_ok)
538 cpp_pedwarn (pfile, "__VA_ARGS__ can only appear in the expansion of a C99 variadic macro");
541 return result;
544 /* Parse a number, skipping embedded backslash-newlines. */
545 static void
546 parse_number (pfile, number, c, leading_period)
547 cpp_reader *pfile;
548 cpp_string *number;
549 cppchar_t c;
550 int leading_period;
552 cpp_buffer *buffer = pfile->buffer;
553 cpp_pool *pool = &pfile->ident_pool;
554 unsigned char *dest, *limit;
556 dest = POOL_FRONT (pool);
557 limit = POOL_LIMIT (pool);
559 /* Place a leading period. */
560 if (leading_period)
562 if (dest >= limit)
563 limit = _cpp_next_chunk (pool, 0, &dest);
564 *dest++ = '.';
571 /* Need room for terminating null. */
572 if (dest + 1 >= limit)
573 limit = _cpp_next_chunk (pool, 0, &dest);
574 *dest++ = c;
576 c = EOF;
577 if (buffer->cur == buffer->rlimit)
578 break;
580 c = *buffer->cur++;
582 while (is_numchar (c) || c == '.' || VALID_SIGN (c, dest[-1]));
584 /* Potential escaped newline? */
585 if (c != '?' && c != '\\')
586 break;
587 c = skip_escaped_newlines (buffer, c);
589 while (is_numchar (c) || c == '.' || VALID_SIGN (c, dest[-1]));
591 /* Remember the next character. */
592 buffer->read_ahead = c;
594 /* Null-terminate the number. */
595 *dest = '\0';
597 number->text = POOL_FRONT (pool);
598 number->len = dest - number->text;
599 POOL_COMMIT (pool, number->len + 1);
602 /* Subroutine of parse_string. Emits error for unterminated strings. */
603 static void
604 unterminated (pfile, term)
605 cpp_reader *pfile;
606 int term;
608 cpp_error (pfile, "missing terminating %c character", term);
610 if (term == '\"' && pfile->mlstring_pos.line
611 && pfile->mlstring_pos.line != pfile->lexer_pos.line)
613 cpp_error_with_line (pfile, pfile->mlstring_pos.line,
614 pfile->mlstring_pos.col,
615 "possible start of unterminated string literal");
616 pfile->mlstring_pos.line = 0;
620 /* Subroutine of parse_string. */
621 static int
622 unescaped_terminator_p (pfile, dest)
623 cpp_reader *pfile;
624 const unsigned char *dest;
626 const unsigned char *start, *temp;
628 /* In #include-style directives, terminators are not escapeable. */
629 if (pfile->state.angled_headers)
630 return 1;
632 start = POOL_FRONT (&pfile->ident_pool);
634 /* An odd number of consecutive backslashes represents an escaped
635 terminator. */
636 for (temp = dest; temp > start && temp[-1] == '\\'; temp--)
639 return ((dest - temp) & 1) == 0;
642 /* Parses a string, character constant, or angle-bracketed header file
643 name. Handles embedded trigraphs and escaped newlines. The stored
644 string is guaranteed NUL-terminated, but it is not guaranteed that
645 this is the first NUL since embedded NULs are preserved.
647 Multi-line strings are allowed, but they are deprecated. */
648 static void
649 parse_string (pfile, token, terminator)
650 cpp_reader *pfile;
651 cpp_token *token;
652 cppchar_t terminator;
654 cpp_buffer *buffer = pfile->buffer;
655 cpp_pool *pool = &pfile->ident_pool;
656 unsigned char *dest, *limit;
657 cppchar_t c;
658 unsigned int nulls = 0;
660 dest = POOL_FRONT (pool);
661 limit = POOL_LIMIT (pool);
663 for (;;)
665 if (buffer->cur == buffer->rlimit)
666 c = EOF;
667 else
668 c = *buffer->cur++;
670 have_char:
671 /* We need space for the terminating NUL. */
672 if (dest >= limit)
673 limit = _cpp_next_chunk (pool, 0, &dest);
675 if (c == EOF)
677 unterminated (pfile, terminator);
678 break;
681 /* Handle trigraphs, escaped newlines etc. */
682 if (c == '?' || c == '\\')
683 c = skip_escaped_newlines (buffer, c);
685 if (c == terminator && unescaped_terminator_p (pfile, dest))
687 c = EOF;
688 break;
690 else if (is_vspace (c))
692 /* In assembly language, silently terminate string and
693 character literals at end of line. This is a kludge
694 around not knowing where comments are. */
695 if (CPP_OPTION (pfile, lang) == CLK_ASM && terminator != '>')
696 break;
698 /* Character constants and header names may not extend over
699 multiple lines. In Standard C, neither may strings.
700 Unfortunately, we accept multiline strings as an
701 extension, except in #include family directives. */
702 if (terminator != '"' || pfile->state.angled_headers)
704 unterminated (pfile, terminator);
705 break;
708 cpp_pedwarn (pfile, "multi-line string literals are deprecated");
709 if (pfile->mlstring_pos.line == 0)
710 pfile->mlstring_pos = pfile->lexer_pos;
712 c = handle_newline (buffer, c);
713 *dest++ = '\n';
714 goto have_char;
716 else if (c == '\0')
718 if (nulls++ == 0)
719 cpp_warning (pfile, "null character(s) preserved in literal");
722 *dest++ = c;
725 /* Remember the next character. */
726 buffer->read_ahead = c;
727 *dest = '\0';
729 token->val.str.text = POOL_FRONT (pool);
730 token->val.str.len = dest - token->val.str.text;
731 POOL_COMMIT (pool, token->val.str.len + 1);
734 /* The stored comment includes the comment start and any terminator. */
735 static void
736 save_comment (pfile, token, from)
737 cpp_reader *pfile;
738 cpp_token *token;
739 const unsigned char *from;
741 unsigned char *buffer;
742 unsigned int len;
744 len = pfile->buffer->cur - from + 1; /* + 1 for the initial '/'. */
745 /* C++ comments probably (not definitely) have moved past a new
746 line, which we don't want to save in the comment. */
747 if (pfile->buffer->read_ahead != EOF)
748 len--;
749 buffer = _cpp_pool_alloc (&pfile->ident_pool, len);
751 token->type = CPP_COMMENT;
752 token->val.str.len = len;
753 token->val.str.text = buffer;
755 buffer[0] = '/';
756 memcpy (buffer + 1, from, len - 1);
759 /* Subroutine of lex_token to handle '%'. A little tricky, since we
760 want to avoid stepping back when lexing %:%X. */
761 static void
762 lex_percent (buffer, result)
763 cpp_buffer *buffer;
764 cpp_token *result;
766 cppchar_t c;
768 result->type = CPP_MOD;
769 /* Parsing %:%X could leave an extra character. */
770 if (buffer->extra_char == EOF)
771 c = get_effective_char (buffer);
772 else
774 c = buffer->read_ahead = buffer->extra_char;
775 buffer->extra_char = EOF;
778 if (c == '=')
779 ACCEPT_CHAR (CPP_MOD_EQ);
780 else if (CPP_OPTION (buffer->pfile, digraphs))
782 if (c == ':')
784 result->flags |= DIGRAPH;
785 ACCEPT_CHAR (CPP_HASH);
786 if (get_effective_char (buffer) == '%')
788 buffer->extra_char = get_effective_char (buffer);
789 if (buffer->extra_char == ':')
791 buffer->extra_char = EOF;
792 ACCEPT_CHAR (CPP_PASTE);
794 else
795 /* We'll catch the extra_char when we're called back. */
796 buffer->read_ahead = '%';
799 else if (c == '>')
801 result->flags |= DIGRAPH;
802 ACCEPT_CHAR (CPP_CLOSE_BRACE);
807 /* Subroutine of lex_token to handle '.'. This is tricky, since we
808 want to avoid stepping back when lexing '...' or '.123'. In the
809 latter case we should also set a flag for parse_number. */
810 static void
811 lex_dot (pfile, result)
812 cpp_reader *pfile;
813 cpp_token *result;
815 cpp_buffer *buffer = pfile->buffer;
816 cppchar_t c;
818 /* Parsing ..X could leave an extra character. */
819 if (buffer->extra_char == EOF)
820 c = get_effective_char (buffer);
821 else
823 c = buffer->read_ahead = buffer->extra_char;
824 buffer->extra_char = EOF;
827 /* All known character sets have 0...9 contiguous. */
828 if (c >= '0' && c <= '9')
830 result->type = CPP_NUMBER;
831 parse_number (pfile, &result->val.str, c, 1);
833 else
835 result->type = CPP_DOT;
836 if (c == '.')
838 buffer->extra_char = get_effective_char (buffer);
839 if (buffer->extra_char == '.')
841 buffer->extra_char = EOF;
842 ACCEPT_CHAR (CPP_ELLIPSIS);
844 else
845 /* We'll catch the extra_char when we're called back. */
846 buffer->read_ahead = '.';
848 else if (c == '*' && CPP_OPTION (pfile, cplusplus))
849 ACCEPT_CHAR (CPP_DOT_STAR);
853 void
854 _cpp_lex_token (pfile, result)
855 cpp_reader *pfile;
856 cpp_token *result;
858 cppchar_t c;
859 cpp_buffer *buffer;
860 const unsigned char *comment_start;
861 unsigned char bol;
863 skip:
864 bol = pfile->state.next_bol;
865 done_directive:
866 buffer = pfile->buffer;
867 pfile->state.next_bol = 0;
868 result->flags = buffer->saved_flags;
869 buffer->saved_flags = 0;
870 next_char:
871 pfile->lexer_pos.line = buffer->lineno;
872 next_char2:
873 pfile->lexer_pos.col = CPP_BUF_COLUMN (buffer, buffer->cur);
875 c = buffer->read_ahead;
876 if (c == EOF && buffer->cur < buffer->rlimit)
878 c = *buffer->cur++;
879 pfile->lexer_pos.col++;
882 do_switch:
883 buffer->read_ahead = EOF;
884 switch (c)
886 case EOF:
887 /* Non-empty files should end in a newline. Checking "bol" too
888 prevents multiple warnings when hitting the EOF more than
889 once, like in a directive. Don't warn for command line and
890 _Pragma buffers. */
891 if (pfile->lexer_pos.col != 0 && !bol && !buffer->from_stage3)
892 cpp_pedwarn (pfile, "no newline at end of file");
893 pfile->state.next_bol = 1;
894 pfile->skipping = 0; /* In case missing #endif. */
895 result->type = CPP_EOF;
896 /* Don't do MI optimisation. */
897 return;
899 case ' ': case '\t': case '\f': case '\v': case '\0':
900 skip_whitespace (pfile, c);
901 result->flags |= PREV_WHITE;
902 goto next_char2;
904 case '\n': case '\r':
905 if (!pfile->state.in_directive)
907 handle_newline (buffer, c);
908 bol = 1;
909 pfile->lexer_pos.output_line = buffer->lineno;
910 /* This is a new line, so clear any white space flag.
911 Newlines in arguments are white space (6.10.3.10);
912 parse_arg takes care of that. */
913 result->flags &= ~(PREV_WHITE | AVOID_LPASTE);
914 goto next_char;
917 /* Don't let directives spill over to the next line. */
918 buffer->read_ahead = c;
919 pfile->state.next_bol = 1;
920 result->type = CPP_EOF;
921 /* Don't break; pfile->skipping might be true. */
922 return;
924 case '?':
925 case '\\':
926 /* These could start an escaped newline, or '?' a trigraph. Let
927 skip_escaped_newlines do all the work. */
929 unsigned int lineno = buffer->lineno;
931 c = skip_escaped_newlines (buffer, c);
932 if (lineno != buffer->lineno)
933 /* We had at least one escaped newline of some sort, and the
934 next character is in buffer->read_ahead. Update the
935 token's line and column. */
936 goto next_char;
938 /* We are either the original '?' or '\\', or a trigraph. */
939 result->type = CPP_QUERY;
940 buffer->read_ahead = EOF;
941 if (c == '\\')
942 goto random_char;
943 else if (c != '?')
944 goto do_switch;
946 break;
948 case '0': case '1': case '2': case '3': case '4':
949 case '5': case '6': case '7': case '8': case '9':
950 result->type = CPP_NUMBER;
951 parse_number (pfile, &result->val.str, c, 0);
952 break;
954 case '$':
955 if (!CPP_OPTION (pfile, dollars_in_ident))
956 goto random_char;
957 /* Fall through... */
959 case '_':
960 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
961 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
962 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
963 case 's': case 't': case 'u': case 'v': case 'w': case 'x':
964 case 'y': case 'z':
965 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
966 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
967 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
968 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
969 case 'Y': case 'Z':
970 result->type = CPP_NAME;
971 result->val.node = parse_identifier (pfile, c);
973 /* 'L' may introduce wide characters or strings. */
974 if (result->val.node == pfile->spec_nodes.n_L)
976 c = buffer->read_ahead; /* For make_string. */
977 if (c == '\'' || c == '"')
979 ACCEPT_CHAR (c == '"' ? CPP_WSTRING: CPP_WCHAR);
980 goto make_string;
983 /* Convert named operators to their proper types. */
984 else if (result->val.node->flags & NODE_OPERATOR)
986 result->flags |= NAMED_OP;
987 result->type = result->val.node->value.operator;
989 break;
991 case '\'':
992 case '"':
993 result->type = c == '"' ? CPP_STRING: CPP_CHAR;
994 make_string:
995 parse_string (pfile, result, c);
996 break;
998 case '/':
999 /* A potential block or line comment. */
1000 comment_start = buffer->cur;
1001 result->type = CPP_DIV;
1002 c = get_effective_char (buffer);
1003 if (c == '=')
1004 ACCEPT_CHAR (CPP_DIV_EQ);
1005 if (c != '/' && c != '*')
1006 break;
1007 if (buffer->from_stage3)
1008 break;
1010 if (c == '*')
1012 if (skip_block_comment (pfile))
1013 cpp_error_with_line (pfile, pfile->lexer_pos.line,
1014 pfile->lexer_pos.col,
1015 "unterminated comment");
1017 else
1019 if (!CPP_OPTION (pfile, cplusplus_comments)
1020 && !CPP_IN_SYSTEM_HEADER (pfile))
1021 break;
1023 /* Warn about comments only if pedantically GNUC89, and not
1024 in system headers. */
1025 if (CPP_OPTION (pfile, lang) == CLK_GNUC89 && CPP_PEDANTIC (pfile)
1026 && ! buffer->warned_cplusplus_comments)
1028 cpp_pedwarn (pfile,
1029 "C++ style comments are not allowed in ISO C89");
1030 cpp_pedwarn (pfile,
1031 "(this will be reported only once per input file)");
1032 buffer->warned_cplusplus_comments = 1;
1035 /* Skip_line_comment updates buffer->read_ahead. */
1036 if (skip_line_comment (pfile) && CPP_OPTION (pfile, warn_comments))
1037 cpp_warning_with_line (pfile, pfile->lexer_pos.line,
1038 pfile->lexer_pos.col,
1039 "multi-line comment");
1042 /* Skipping the comment has updated buffer->read_ahead. */
1043 if (!pfile->state.save_comments)
1045 result->flags |= PREV_WHITE;
1046 goto next_char;
1049 /* Save the comment as a token in its own right. */
1050 save_comment (pfile, result, comment_start);
1051 /* Don't do MI optimisation. */
1052 return;
1054 case '<':
1055 if (pfile->state.angled_headers)
1057 result->type = CPP_HEADER_NAME;
1058 c = '>'; /* terminator. */
1059 goto make_string;
1062 result->type = CPP_LESS;
1063 c = get_effective_char (buffer);
1064 if (c == '=')
1065 ACCEPT_CHAR (CPP_LESS_EQ);
1066 else if (c == '<')
1068 ACCEPT_CHAR (CPP_LSHIFT);
1069 if (get_effective_char (buffer) == '=')
1070 ACCEPT_CHAR (CPP_LSHIFT_EQ);
1072 else if (c == '?' && CPP_OPTION (pfile, cplusplus))
1074 ACCEPT_CHAR (CPP_MIN);
1075 if (get_effective_char (buffer) == '=')
1076 ACCEPT_CHAR (CPP_MIN_EQ);
1078 else if (c == ':' && CPP_OPTION (pfile, digraphs))
1080 ACCEPT_CHAR (CPP_OPEN_SQUARE);
1081 result->flags |= DIGRAPH;
1083 else if (c == '%' && CPP_OPTION (pfile, digraphs))
1085 ACCEPT_CHAR (CPP_OPEN_BRACE);
1086 result->flags |= DIGRAPH;
1088 break;
1090 case '>':
1091 result->type = CPP_GREATER;
1092 c = get_effective_char (buffer);
1093 if (c == '=')
1094 ACCEPT_CHAR (CPP_GREATER_EQ);
1095 else if (c == '>')
1097 ACCEPT_CHAR (CPP_RSHIFT);
1098 if (get_effective_char (buffer) == '=')
1099 ACCEPT_CHAR (CPP_RSHIFT_EQ);
1101 else if (c == '?' && CPP_OPTION (pfile, cplusplus))
1103 ACCEPT_CHAR (CPP_MAX);
1104 if (get_effective_char (buffer) == '=')
1105 ACCEPT_CHAR (CPP_MAX_EQ);
1107 break;
1109 case '%':
1110 lex_percent (buffer, result);
1111 if (result->type == CPP_HASH)
1112 goto do_hash;
1113 break;
1115 case '.':
1116 lex_dot (pfile, result);
1117 break;
1119 case '+':
1120 result->type = CPP_PLUS;
1121 c = get_effective_char (buffer);
1122 if (c == '=')
1123 ACCEPT_CHAR (CPP_PLUS_EQ);
1124 else if (c == '+')
1125 ACCEPT_CHAR (CPP_PLUS_PLUS);
1126 break;
1128 case '-':
1129 result->type = CPP_MINUS;
1130 c = get_effective_char (buffer);
1131 if (c == '>')
1133 ACCEPT_CHAR (CPP_DEREF);
1134 if (CPP_OPTION (pfile, cplusplus)
1135 && get_effective_char (buffer) == '*')
1136 ACCEPT_CHAR (CPP_DEREF_STAR);
1138 else if (c == '=')
1139 ACCEPT_CHAR (CPP_MINUS_EQ);
1140 else if (c == '-')
1141 ACCEPT_CHAR (CPP_MINUS_MINUS);
1142 break;
1144 case '*':
1145 result->type = CPP_MULT;
1146 if (get_effective_char (buffer) == '=')
1147 ACCEPT_CHAR (CPP_MULT_EQ);
1148 break;
1150 case '=':
1151 result->type = CPP_EQ;
1152 if (get_effective_char (buffer) == '=')
1153 ACCEPT_CHAR (CPP_EQ_EQ);
1154 break;
1156 case '!':
1157 result->type = CPP_NOT;
1158 if (get_effective_char (buffer) == '=')
1159 ACCEPT_CHAR (CPP_NOT_EQ);
1160 break;
1162 case '&':
1163 result->type = CPP_AND;
1164 c = get_effective_char (buffer);
1165 if (c == '=')
1166 ACCEPT_CHAR (CPP_AND_EQ);
1167 else if (c == '&')
1168 ACCEPT_CHAR (CPP_AND_AND);
1169 break;
1171 case '#':
1172 c = buffer->extra_char; /* Can be set by error condition below. */
1173 if (c != EOF)
1175 buffer->read_ahead = c;
1176 buffer->extra_char = EOF;
1178 else
1179 c = get_effective_char (buffer);
1181 if (c == '#')
1183 ACCEPT_CHAR (CPP_PASTE);
1184 break;
1187 result->type = CPP_HASH;
1188 do_hash:
1189 if (!bol)
1190 break;
1191 /* 6.10.3 paragraph 11: If there are sequences of preprocessing
1192 tokens within the list of arguments that would otherwise act
1193 as preprocessing directives, the behavior is undefined.
1195 This implementation will report a hard error, terminate the
1196 macro invocation, and proceed to process the directive. */
1197 if (pfile->state.parsing_args)
1199 if (pfile->state.parsing_args == 2)
1200 cpp_error (pfile,
1201 "directives may not be used inside a macro argument");
1203 /* Put a '#' in lookahead, return CPP_EOF for parse_arg. */
1204 buffer->extra_char = buffer->read_ahead;
1205 buffer->read_ahead = '#';
1206 pfile->state.next_bol = 1;
1207 result->type = CPP_EOF;
1209 /* Get whitespace right - newline_in_args sets it. */
1210 if (pfile->lexer_pos.col == 1)
1211 result->flags &= ~(PREV_WHITE | AVOID_LPASTE);
1213 else
1215 /* This is the hash introducing a directive. */
1216 if (_cpp_handle_directive (pfile, result->flags & PREV_WHITE))
1217 goto done_directive; /* bol still 1. */
1218 /* This is in fact an assembler #. */
1220 break;
1222 case '|':
1223 result->type = CPP_OR;
1224 c = get_effective_char (buffer);
1225 if (c == '=')
1226 ACCEPT_CHAR (CPP_OR_EQ);
1227 else if (c == '|')
1228 ACCEPT_CHAR (CPP_OR_OR);
1229 break;
1231 case '^':
1232 result->type = CPP_XOR;
1233 if (get_effective_char (buffer) == '=')
1234 ACCEPT_CHAR (CPP_XOR_EQ);
1235 break;
1237 case ':':
1238 result->type = CPP_COLON;
1239 c = get_effective_char (buffer);
1240 if (c == ':' && CPP_OPTION (pfile, cplusplus))
1241 ACCEPT_CHAR (CPP_SCOPE);
1242 else if (c == '>' && CPP_OPTION (pfile, digraphs))
1244 result->flags |= DIGRAPH;
1245 ACCEPT_CHAR (CPP_CLOSE_SQUARE);
1247 break;
1249 case '~': result->type = CPP_COMPL; break;
1250 case ',': result->type = CPP_COMMA; break;
1251 case '(': result->type = CPP_OPEN_PAREN; break;
1252 case ')': result->type = CPP_CLOSE_PAREN; break;
1253 case '[': result->type = CPP_OPEN_SQUARE; break;
1254 case ']': result->type = CPP_CLOSE_SQUARE; break;
1255 case '{': result->type = CPP_OPEN_BRACE; break;
1256 case '}': result->type = CPP_CLOSE_BRACE; break;
1257 case ';': result->type = CPP_SEMICOLON; break;
1259 /* @ is a punctuator in Objective C. */
1260 case '@': result->type = CPP_ATSIGN; break;
1262 random_char:
1263 default:
1264 result->type = CPP_OTHER;
1265 result->val.c = c;
1266 break;
1269 if (pfile->skipping)
1270 goto skip;
1272 /* If not in a directive, this token invalidates controlling macros. */
1273 if (!pfile->state.in_directive)
1274 pfile->mi_state = MI_FAILED;
1277 /* An upper bound on the number of bytes needed to spell a token,
1278 including preceding whitespace. */
1279 unsigned int
1280 cpp_token_len (token)
1281 const cpp_token *token;
1283 unsigned int len;
1285 switch (TOKEN_SPELL (token))
1287 default: len = 0; break;
1288 case SPELL_STRING: len = token->val.str.len; break;
1289 case SPELL_IDENT: len = NODE_LEN (token->val.node); break;
1291 /* 1 for whitespace, 4 for comment delimeters. */
1292 return len + 5;
1295 /* Write the spelling of a token TOKEN to BUFFER. The buffer must
1296 already contain the enough space to hold the token's spelling.
1297 Returns a pointer to the character after the last character
1298 written. */
1299 unsigned char *
1300 cpp_spell_token (pfile, token, buffer)
1301 cpp_reader *pfile; /* Would be nice to be rid of this... */
1302 const cpp_token *token;
1303 unsigned char *buffer;
1305 switch (TOKEN_SPELL (token))
1307 case SPELL_OPERATOR:
1309 const unsigned char *spelling;
1310 unsigned char c;
1312 if (token->flags & DIGRAPH)
1313 spelling
1314 = digraph_spellings[(int) token->type - (int) CPP_FIRST_DIGRAPH];
1315 else if (token->flags & NAMED_OP)
1316 goto spell_ident;
1317 else
1318 spelling = TOKEN_NAME (token);
1320 while ((c = *spelling++) != '\0')
1321 *buffer++ = c;
1323 break;
1325 case SPELL_IDENT:
1326 spell_ident:
1327 memcpy (buffer, NODE_NAME (token->val.node), NODE_LEN (token->val.node));
1328 buffer += NODE_LEN (token->val.node);
1329 break;
1331 case SPELL_STRING:
1333 int left, right, tag;
1334 switch (token->type)
1336 case CPP_STRING: left = '"'; right = '"'; tag = '\0'; break;
1337 case CPP_WSTRING: left = '"'; right = '"'; tag = 'L'; break;
1338 case CPP_CHAR: left = '\''; right = '\''; tag = '\0'; break;
1339 case CPP_WCHAR: left = '\''; right = '\''; tag = 'L'; break;
1340 case CPP_HEADER_NAME: left = '<'; right = '>'; tag = '\0'; break;
1341 default: left = '\0'; right = '\0'; tag = '\0'; break;
1343 if (tag) *buffer++ = tag;
1344 if (left) *buffer++ = left;
1345 memcpy (buffer, token->val.str.text, token->val.str.len);
1346 buffer += token->val.str.len;
1347 if (right) *buffer++ = right;
1349 break;
1351 case SPELL_CHAR:
1352 *buffer++ = token->val.c;
1353 break;
1355 case SPELL_NONE:
1356 cpp_ice (pfile, "Unspellable token %s", TOKEN_NAME (token));
1357 break;
1360 return buffer;
1363 /* Returns a token as a null-terminated string. The string is
1364 temporary, and automatically freed later. Useful for diagnostics. */
1365 unsigned char *
1366 cpp_token_as_text (pfile, token)
1367 cpp_reader *pfile;
1368 const cpp_token *token;
1370 unsigned int len = cpp_token_len (token);
1371 unsigned char *start = _cpp_pool_alloc (&pfile->ident_pool, len), *end;
1373 end = cpp_spell_token (pfile, token, start);
1374 end[0] = '\0';
1376 return start;
1379 /* Used by C front ends. Should really move to using cpp_token_as_text. */
1380 const char *
1381 cpp_type2name (type)
1382 enum cpp_ttype type;
1384 return (const char *) token_spellings[type].name;
1387 /* Writes the spelling of token to FP. Separate from cpp_spell_token
1388 for efficiency - to avoid double-buffering. Also, outputs a space
1389 if PREV_WHITE is flagged. */
1390 void
1391 cpp_output_token (token, fp)
1392 const cpp_token *token;
1393 FILE *fp;
1395 if (token->flags & PREV_WHITE)
1396 putc (' ', fp);
1398 switch (TOKEN_SPELL (token))
1400 case SPELL_OPERATOR:
1402 const unsigned char *spelling;
1404 if (token->flags & DIGRAPH)
1405 spelling
1406 = digraph_spellings[(int) token->type - (int) CPP_FIRST_DIGRAPH];
1407 else if (token->flags & NAMED_OP)
1408 goto spell_ident;
1409 else
1410 spelling = TOKEN_NAME (token);
1412 ufputs (spelling, fp);
1414 break;
1416 spell_ident:
1417 case SPELL_IDENT:
1418 ufputs (NODE_NAME (token->val.node), fp);
1419 break;
1421 case SPELL_STRING:
1423 int left, right, tag;
1424 switch (token->type)
1426 case CPP_STRING: left = '"'; right = '"'; tag = '\0'; break;
1427 case CPP_WSTRING: left = '"'; right = '"'; tag = 'L'; break;
1428 case CPP_CHAR: left = '\''; right = '\''; tag = '\0'; break;
1429 case CPP_WCHAR: left = '\''; right = '\''; tag = 'L'; break;
1430 case CPP_HEADER_NAME: left = '<'; right = '>'; tag = '\0'; break;
1431 default: left = '\0'; right = '\0'; tag = '\0'; break;
1433 if (tag) putc (tag, fp);
1434 if (left) putc (left, fp);
1435 fwrite (token->val.str.text, 1, token->val.str.len, fp);
1436 if (right) putc (right, fp);
1438 break;
1440 case SPELL_CHAR:
1441 putc (token->val.c, fp);
1442 break;
1444 case SPELL_NONE:
1445 /* An error, most probably. */
1446 break;
1450 /* Compare two tokens. */
1452 _cpp_equiv_tokens (a, b)
1453 const cpp_token *a, *b;
1455 if (a->type == b->type && a->flags == b->flags)
1456 switch (TOKEN_SPELL (a))
1458 default: /* Keep compiler happy. */
1459 case SPELL_OPERATOR:
1460 return 1;
1461 case SPELL_CHAR:
1462 return a->val.c == b->val.c; /* Character. */
1463 case SPELL_NONE:
1464 return (a->type != CPP_MACRO_ARG || a->val.arg_no == b->val.arg_no);
1465 case SPELL_IDENT:
1466 return a->val.node == b->val.node;
1467 case SPELL_STRING:
1468 return (a->val.str.len == b->val.str.len
1469 && !memcmp (a->val.str.text, b->val.str.text,
1470 a->val.str.len));
1473 return 0;
1476 /* Determine whether two tokens can be pasted together, and if so,
1477 what the resulting token is. Returns CPP_EOF if the tokens cannot
1478 be pasted, or the appropriate type for the merged token if they
1479 can. */
1480 enum cpp_ttype
1481 cpp_can_paste (pfile, token1, token2, digraph)
1482 cpp_reader * pfile;
1483 const cpp_token *token1, *token2;
1484 int* digraph;
1486 enum cpp_ttype a = token1->type, b = token2->type;
1487 int cxx = CPP_OPTION (pfile, cplusplus);
1489 /* Treat named operators as if they were ordinary NAMEs. */
1490 if (token1->flags & NAMED_OP)
1491 a = CPP_NAME;
1492 if (token2->flags & NAMED_OP)
1493 b = CPP_NAME;
1495 if ((int) a <= (int) CPP_LAST_EQ && b == CPP_EQ)
1496 return (enum cpp_ttype) ((int) a + ((int) CPP_EQ_EQ - (int) CPP_EQ));
1498 switch (a)
1500 case CPP_GREATER:
1501 if (b == a) return CPP_RSHIFT;
1502 if (b == CPP_QUERY && cxx) return CPP_MAX;
1503 if (b == CPP_GREATER_EQ) return CPP_RSHIFT_EQ;
1504 break;
1505 case CPP_LESS:
1506 if (b == a) return CPP_LSHIFT;
1507 if (b == CPP_QUERY && cxx) return CPP_MIN;
1508 if (b == CPP_LESS_EQ) return CPP_LSHIFT_EQ;
1509 if (CPP_OPTION (pfile, digraphs))
1511 if (b == CPP_COLON)
1512 {*digraph = 1; return CPP_OPEN_SQUARE;} /* <: digraph */
1513 if (b == CPP_MOD)
1514 {*digraph = 1; return CPP_OPEN_BRACE;} /* <% digraph */
1516 break;
1518 case CPP_PLUS: if (b == a) return CPP_PLUS_PLUS; break;
1519 case CPP_AND: if (b == a) return CPP_AND_AND; break;
1520 case CPP_OR: if (b == a) return CPP_OR_OR; break;
1522 case CPP_MINUS:
1523 if (b == a) return CPP_MINUS_MINUS;
1524 if (b == CPP_GREATER) return CPP_DEREF;
1525 break;
1526 case CPP_COLON:
1527 if (b == a && cxx) return CPP_SCOPE;
1528 if (b == CPP_GREATER && CPP_OPTION (pfile, digraphs))
1529 {*digraph = 1; return CPP_CLOSE_SQUARE;} /* :> digraph */
1530 break;
1532 case CPP_MOD:
1533 if (CPP_OPTION (pfile, digraphs))
1535 if (b == CPP_GREATER)
1536 {*digraph = 1; return CPP_CLOSE_BRACE;} /* %> digraph */
1537 if (b == CPP_COLON)
1538 {*digraph = 1; return CPP_HASH;} /* %: digraph */
1540 break;
1541 case CPP_DEREF:
1542 if (b == CPP_MULT && cxx) return CPP_DEREF_STAR;
1543 break;
1544 case CPP_DOT:
1545 if (b == CPP_MULT && cxx) return CPP_DOT_STAR;
1546 if (b == CPP_NUMBER) return CPP_NUMBER;
1547 break;
1549 case CPP_HASH:
1550 if (b == a && (token1->flags & DIGRAPH) == (token2->flags & DIGRAPH))
1551 /* %:%: digraph */
1552 {*digraph = (token1->flags & DIGRAPH); return CPP_PASTE;}
1553 break;
1555 case CPP_NAME:
1556 if (b == CPP_NAME) return CPP_NAME;
1557 if (b == CPP_NUMBER
1558 && name_p (pfile, &token2->val.str)) return CPP_NAME;
1559 if (b == CPP_CHAR
1560 && token1->val.node == pfile->spec_nodes.n_L) return CPP_WCHAR;
1561 if (b == CPP_STRING
1562 && token1->val.node == pfile->spec_nodes.n_L) return CPP_WSTRING;
1563 break;
1565 case CPP_NUMBER:
1566 if (b == CPP_NUMBER) return CPP_NUMBER;
1567 if (b == CPP_NAME) return CPP_NUMBER;
1568 if (b == CPP_DOT) return CPP_NUMBER;
1569 /* Numbers cannot have length zero, so this is safe. */
1570 if ((b == CPP_PLUS || b == CPP_MINUS)
1571 && VALID_SIGN ('+', token1->val.str.text[token1->val.str.len - 1]))
1572 return CPP_NUMBER;
1573 break;
1575 default:
1576 break;
1579 return CPP_EOF;
1582 /* Returns nonzero if a space should be inserted to avoid an
1583 accidental token paste for output. For simplicity, it is
1584 conservative, and occasionally advises a space where one is not
1585 needed, e.g. "." and ".2". */
1588 cpp_avoid_paste (pfile, token1, token2)
1589 cpp_reader *pfile;
1590 const cpp_token *token1, *token2;
1592 enum cpp_ttype a = token1->type, b = token2->type;
1593 cppchar_t c;
1595 if (token1->flags & NAMED_OP)
1596 a = CPP_NAME;
1597 if (token2->flags & NAMED_OP)
1598 b = CPP_NAME;
1600 c = EOF;
1601 if (token2->flags & DIGRAPH)
1602 c = digraph_spellings[(int) b - (int) CPP_FIRST_DIGRAPH][0];
1603 else if (token_spellings[b].category == SPELL_OPERATOR)
1604 c = token_spellings[b].name[0];
1606 /* Quickly get everything that can paste with an '='. */
1607 if ((int) a <= (int) CPP_LAST_EQ && c == '=')
1608 return 1;
1610 switch (a)
1612 case CPP_GREATER: return c == '>' || c == '?';
1613 case CPP_LESS: return c == '<' || c == '?' || c == '%' || c == ':';
1614 case CPP_PLUS: return c == '+';
1615 case CPP_MINUS: return c == '-' || c == '>';
1616 case CPP_DIV: return c == '/' || c == '*'; /* Comments. */
1617 case CPP_MOD: return c == ':' || c == '>';
1618 case CPP_AND: return c == '&';
1619 case CPP_OR: return c == '|';
1620 case CPP_COLON: return c == ':' || c == '>';
1621 case CPP_DEREF: return c == '*';
1622 case CPP_DOT: return c == '.' || c == '%' || b == CPP_NUMBER;
1623 case CPP_HASH: return c == '#' || c == '%'; /* Digraph form. */
1624 case CPP_NAME: return ((b == CPP_NUMBER
1625 && name_p (pfile, &token2->val.str))
1626 || b == CPP_NAME
1627 || b == CPP_CHAR || b == CPP_STRING); /* L */
1628 case CPP_NUMBER: return (b == CPP_NUMBER || b == CPP_NAME
1629 || c == '.' || c == '+' || c == '-');
1630 case CPP_OTHER: return (CPP_OPTION (pfile, objc)
1631 && token1->val.c == '@'
1632 && (b == CPP_NAME || b == CPP_STRING));
1633 default: break;
1636 return 0;
1639 /* Output all the remaining tokens on the current line, and a newline
1640 character, to FP. Leading whitespace is removed. */
1641 void
1642 cpp_output_line (pfile, fp)
1643 cpp_reader *pfile;
1644 FILE *fp;
1646 cpp_token token;
1648 cpp_get_token (pfile, &token);
1649 token.flags &= ~PREV_WHITE;
1650 while (token.type != CPP_EOF)
1652 cpp_output_token (&token, fp);
1653 cpp_get_token (pfile, &token);
1656 putc ('\n', fp);
/* Returns the value (0-15) of the hexadecimal digit C, which may be
   in either case.  Aborts if C is not a hexadecimal digit.  */
static unsigned int
hex_digit_value (c)
     unsigned int c;
{
  if ('0' <= c && c <= '9')
    return c - '0';
  if ('A' <= c && c <= 'F')
    return 10 + (c - 'A');
  if ('a' <= c && c <= 'f')
    return 10 + (c - 'a');

  abort ();
}
1673 /* Parse a '\uNNNN' or '\UNNNNNNNN' sequence (C++ and C99).
1675 [lex.charset]: The character designated by the universal character
1676 name \UNNNNNNNN is that character whose character short name in
1677 ISO/IEC 10646 is NNNNNNNN; the character designated by the
1678 universal character name \uNNNN is that character whose character
1679 short name in ISO/IEC 10646 is 0000NNNN. If the hexadecimal value
1680 for a universal character name is less than 0x20 or in the range
1681 0x7F-0x9F (inclusive), or if the universal character name
1682 designates a character in the basic source character set, then the
1683 program is ill-formed.
1685 We assume that wchar_t is Unicode, so we don't need to do any
1686 mapping. Is this ever wrong? */
1688 static unsigned int
1689 read_ucs (pfile, pstr, limit, length)
1690 cpp_reader *pfile;
1691 const unsigned char **pstr;
1692 const unsigned char *limit;
1693 unsigned int length;
1695 const unsigned char *p = *pstr;
1696 unsigned int c, code = 0;
1698 for (; length; --length)
1700 if (p >= limit)
1702 cpp_error (pfile, "incomplete universal-character-name");
1703 break;
1706 c = *p;
1707 if (ISXDIGIT (c))
1709 code = (code << 4) + hex_digit_value (c);
1710 p++;
1712 else
1714 cpp_error (pfile,
1715 "non-hex digit '%c' in universal-character-name", c);
1716 break;
1721 #ifdef TARGET_EBCDIC
1722 cpp_error (pfile, "universal-character-name on EBCDIC target");
1723 code = 0x3f; /* EBCDIC invalid character */
1724 #else
1725 if (code > 0x9f && !(code & 0x80000000))
1726 ; /* True extended character, OK. */
1727 else if (code >= 0x20 && code < 0x7f)
1729 /* ASCII printable character. The C character set consists of all of
1730 these except $, @ and `. We use hex escapes so that this also
1731 works with EBCDIC hosts. */
1732 if (code != 0x24 && code != 0x40 && code != 0x60)
1733 cpp_error (pfile, "universal-character-name used for '%c'", code);
1735 else
1736 cpp_error (pfile, "invalid universal-character-name");
1737 #endif
1739 *pstr = p;
1740 return code;
1743 /* Interpret an escape sequence, and return its value. PSTR points to
1744 the input pointer, which is just after the backslash. LIMIT is how
1745 much text we have. MASK is the precision for the target type (char
1746 or wchar_t). TRADITIONAL, if true, does not interpret escapes that
1747 did not exist in traditional C. */
1749 static unsigned int
1750 parse_escape (pfile, pstr, limit, mask, traditional)
1751 cpp_reader *pfile;
1752 const unsigned char **pstr;
1753 const unsigned char *limit;
1754 HOST_WIDE_INT mask;
1755 int traditional;
1757 int unknown = 0;
1758 const unsigned char *str = *pstr;
1759 unsigned int c = *str++;
1761 switch (c)
1763 case '\\': case '\'': case '"': case '?': break;
1764 case 'b': c = TARGET_BS; break;
1765 case 'f': c = TARGET_FF; break;
1766 case 'n': c = TARGET_NEWLINE; break;
1767 case 'r': c = TARGET_CR; break;
1768 case 't': c = TARGET_TAB; break;
1769 case 'v': c = TARGET_VT; break;
1771 case '(': case '{': case '[': case '%':
1772 /* '\(', etc, are used at beginning of line to avoid confusing Emacs.
1773 '\%' is used to prevent SCCS from getting confused. */
1774 unknown = CPP_PEDANTIC (pfile);
1775 break;
1777 case 'a':
1778 if (CPP_WTRADITIONAL (pfile))
1779 cpp_warning (pfile, "the meaning of '\\a' varies with -traditional");
1780 if (!traditional)
1781 c = TARGET_BELL;
1782 break;
1784 case 'e': case 'E':
1785 if (CPP_PEDANTIC (pfile))
1786 cpp_pedwarn (pfile, "non-ISO-standard escape sequence, '\\%c'", c);
1787 c = TARGET_ESC;
1788 break;
1790 /* Warnings and support checks handled by read_ucs(). */
1791 case 'u': case 'U':
1792 if (CPP_OPTION (pfile, cplusplus) || CPP_OPTION (pfile, c99))
1794 if (CPP_WTRADITIONAL (pfile))
1795 cpp_warning (pfile,
1796 "the meaning of '\\%c' varies with -traditional", c);
1797 c = read_ucs (pfile, &str, limit, c == 'u' ? 4 : 8);
1799 else
1800 unknown = 1;
1801 break;
1803 case 'x':
1804 if (CPP_WTRADITIONAL (pfile))
1805 cpp_warning (pfile, "the meaning of '\\x' varies with -traditional");
1807 if (!traditional)
1809 unsigned int i = 0, overflow = 0;
1810 int digits_found = 0;
1812 while (str < limit)
1814 c = *str;
1815 if (! ISXDIGIT (c))
1816 break;
1817 str++;
1818 overflow |= i ^ (i << 4 >> 4);
1819 i = (i << 4) + hex_digit_value (c);
1820 digits_found = 1;
1823 if (!digits_found)
1824 cpp_error (pfile, "\\x used with no following hex digits");
1826 if (overflow | (i != (i & mask)))
1828 cpp_pedwarn (pfile, "hex escape sequence out of range");
1829 i &= mask;
1831 c = i;
1833 break;
1835 case '0': case '1': case '2': case '3':
1836 case '4': case '5': case '6': case '7':
1838 unsigned int i = c - '0';
1839 int count = 0;
1841 while (str < limit && ++count < 3)
1843 c = *str;
1844 if (c < '0' || c > '7')
1845 break;
1846 str++;
1847 i = (i << 3) + c - '0';
1850 if (i != (i & mask))
1852 cpp_pedwarn (pfile, "octal escape sequence out of range");
1853 i &= mask;
1855 c = i;
1857 break;
1859 default:
1860 unknown = 1;
1861 break;
1864 if (unknown)
1866 if (ISGRAPH (c))
1867 cpp_pedwarn (pfile, "unknown escape sequence '\\%c'", c);
1868 else
1869 cpp_pedwarn (pfile, "unknown escape sequence: '\\%03o'", c);
1872 *pstr = str;
1873 return c;
1876 #ifndef MAX_CHAR_TYPE_SIZE
1877 #define MAX_CHAR_TYPE_SIZE CHAR_TYPE_SIZE
1878 #endif
1880 #ifndef MAX_WCHAR_TYPE_SIZE
1881 #define MAX_WCHAR_TYPE_SIZE WCHAR_TYPE_SIZE
1882 #endif
1884 /* Interpret a (possibly wide) character constant in TOKEN.
1885 WARN_MULTI warns about multi-character charconsts, if not
1886 TRADITIONAL. TRADITIONAL also indicates not to interpret escapes
1887 that did not exist in traditional C. PCHARS_SEEN points to a
1888 variable that is filled in with the number of characters seen. */
1889 HOST_WIDE_INT
1890 cpp_interpret_charconst (pfile, token, warn_multi, traditional, pchars_seen)
1891 cpp_reader *pfile;
1892 const cpp_token *token;
1893 int warn_multi;
1894 int traditional;
1895 unsigned int *pchars_seen;
1897 const unsigned char *str = token->val.str.text;
1898 const unsigned char *limit = str + token->val.str.len;
1899 unsigned int chars_seen = 0;
1900 unsigned int width, max_chars, c;
1901 unsigned HOST_WIDE_INT mask;
1902 HOST_WIDE_INT result = 0;
1904 #ifdef MULTIBYTE_CHARS
1905 (void) local_mbtowc (NULL, NULL, 0);
1906 #endif
1908 /* Width in bits. */
1909 if (token->type == CPP_CHAR)
1910 width = MAX_CHAR_TYPE_SIZE;
1911 else
1912 width = MAX_WCHAR_TYPE_SIZE;
1914 if (width < HOST_BITS_PER_WIDE_INT)
1915 mask = ((unsigned HOST_WIDE_INT) 1 << width) - 1;
1916 else
1917 mask = ~0;
1918 max_chars = HOST_BITS_PER_WIDE_INT / width;
1920 while (str < limit)
1922 #ifdef MULTIBYTE_CHARS
1923 wchar_t wc;
1924 int char_len;
1926 char_len = local_mbtowc (&wc, str, limit - str);
1927 if (char_len == -1)
1929 cpp_warning (pfile, "ignoring invalid multibyte character");
1930 c = *str++;
1932 else
1934 str += char_len;
1935 c = wc;
1937 #else
1938 c = *str++;
1939 #endif
1941 if (c == '\\')
1943 c = parse_escape (pfile, &str, limit, mask, traditional);
1944 if (width < HOST_BITS_PER_WIDE_INT && c > mask)
1945 cpp_pedwarn (pfile, "escape sequence out of range for character");
1948 #ifdef MAP_CHARACTER
1949 if (ISPRINT (c))
1950 c = MAP_CHARACTER (c);
1951 #endif
1953 /* Merge character into result; ignore excess chars. */
1954 if (++chars_seen <= max_chars)
1956 if (width < HOST_BITS_PER_WIDE_INT)
1957 result = (result << width) | (c & mask);
1958 else
1959 result = c;
1963 if (chars_seen == 0)
1964 cpp_error (pfile, "empty character constant");
1965 else if (chars_seen > max_chars)
1967 chars_seen = max_chars;
1968 cpp_error (pfile, "character constant too long");
1970 else if (chars_seen > 1 && !traditional && warn_multi)
1971 cpp_warning (pfile, "multi-character character constant");
1973 /* If char type is signed, sign-extend the constant. The
1974 __CHAR_UNSIGNED__ macro is set by the driver if appropriate. */
1975 if (token->type == CPP_CHAR && chars_seen)
1977 unsigned int nbits = chars_seen * width;
1978 unsigned int mask = (unsigned int) ~0 >> (HOST_BITS_PER_INT - nbits);
1980 if (pfile->spec_nodes.n__CHAR_UNSIGNED__->type == NT_MACRO
1981 || ((result >> (nbits - 1)) & 1) == 0)
1982 result &= mask;
1983 else
1984 result |= ~mask;
1987 *pchars_seen = chars_seen;
1988 return result;
/* Memory pools.  */

/* Layout probe used only to compute DEFAULT_ALIGNMENT: the offset of
   U after a single char is an alignment that suits both a double and
   a pointer.  */
struct dummy
{
  char c;
  union
  {
    double d;
    int *p;
  } u;
};

#define DEFAULT_ALIGNMENT (offsetof (struct dummy, u))
2005 static int
2006 chunk_suitable (pool, chunk, size)
2007 cpp_pool *pool;
2008 cpp_chunk *chunk;
2009 unsigned int size;
2011 /* Being at least twice SIZE means we can use memcpy in
2012 _cpp_next_chunk rather than memmove. Besides, it's a good idea
2013 anyway. */
2014 return (chunk && pool->locked != chunk
2015 && (unsigned int) (chunk->limit - chunk->base) >= size * 2);
2018 /* Returns the end of the new pool. PTR points to a char in the old
2019 pool, and is updated to point to the same char in the new pool. */
2020 unsigned char *
2021 _cpp_next_chunk (pool, len, ptr)
2022 cpp_pool *pool;
2023 unsigned int len;
2024 unsigned char **ptr;
2026 cpp_chunk *chunk = pool->cur->next;
2028 /* LEN is the minimum size we want in the new pool. */
2029 len += POOL_ROOM (pool);
2030 if (! chunk_suitable (pool, chunk, len))
2032 chunk = new_chunk (POOL_SIZE (pool) * 2 + len);
2034 chunk->next = pool->cur->next;
2035 pool->cur->next = chunk;
2038 /* Update the pointer before changing chunk's front. */
2039 if (ptr)
2040 *ptr += chunk->base - POOL_FRONT (pool);
2042 memcpy (chunk->base, POOL_FRONT (pool), POOL_ROOM (pool));
2043 chunk->front = chunk->base;
2045 pool->cur = chunk;
2046 return POOL_LIMIT (pool);
2049 static cpp_chunk *
2050 new_chunk (size)
2051 unsigned int size;
2053 unsigned char *base;
2054 cpp_chunk *result;
2056 size = POOL_ALIGN (size, DEFAULT_ALIGNMENT);
2057 base = (unsigned char *) xmalloc (size + sizeof (cpp_chunk));
2058 /* Put the chunk descriptor at the end. Then chunk overruns will
2059 cause obvious chaos. */
2060 result = (cpp_chunk *) (base + size);
2061 result->base = base;
2062 result->front = base;
2063 result->limit = base + size;
2064 result->next = 0;
2066 return result;
2069 void
2070 _cpp_init_pool (pool, size, align, temp)
2071 cpp_pool *pool;
2072 unsigned int size, align, temp;
2074 if (align == 0)
2075 align = DEFAULT_ALIGNMENT;
2076 if (align & (align - 1))
2077 abort ();
2078 pool->align = align;
2079 pool->cur = new_chunk (size);
2080 pool->locked = 0;
2081 pool->locks = 0;
2082 if (temp)
2083 pool->cur->next = pool->cur;
2086 void
2087 _cpp_lock_pool (pool)
2088 cpp_pool *pool;
2090 if (pool->locks++ == 0)
2091 pool->locked = pool->cur;
2094 void
2095 _cpp_unlock_pool (pool)
2096 cpp_pool *pool;
2098 if (--pool->locks == 0)
2099 pool->locked = 0;
2102 void
2103 _cpp_free_pool (pool)
2104 cpp_pool *pool;
2106 cpp_chunk *chunk = pool->cur, *next;
2110 next = chunk->next;
2111 free (chunk->base);
2112 chunk = next;
2114 while (chunk && chunk != pool->cur);
2117 /* Reserve LEN bytes from a memory pool. */
2118 unsigned char *
2119 _cpp_pool_reserve (pool, len)
2120 cpp_pool *pool;
2121 unsigned int len;
2123 len = POOL_ALIGN (len, pool->align);
2124 if (len > (unsigned int) POOL_ROOM (pool))
2125 _cpp_next_chunk (pool, len, 0);
2127 return POOL_FRONT (pool);
2130 /* Allocate LEN bytes from a memory pool. */
2131 unsigned char *
2132 _cpp_pool_alloc (pool, len)
2133 cpp_pool *pool;
2134 unsigned int len;
2136 unsigned char *result = _cpp_pool_reserve (pool, len);
2138 POOL_COMMIT (pool, len);
2139 return result;