2000-05-02 Jeff Sturm <jsturm@one-point.com>
[official-gcc.git] / gcc / cpplex.c
blobf25913f20981828ba93a8e93c4c0e3588dc9a5c6
1 /* CPP Library - lexical analysis.
2 Copyright (C) 2000 Free Software Foundation, Inc.
3 Contributed by Per Bothner, 1994-95.
4 Based on CCCP program by Paul Rubin, June 1986
5 Adapted to ANSI C, Richard Stallman, Jan 1987
6 Broken out to separate file, Zack Weinberg, Mar 2000
7 Single-pass line tokenization by Neil Booth, April 2000
9 This program is free software; you can redistribute it and/or modify it
10 under the terms of the GNU General Public License as published by the
11 Free Software Foundation; either version 2, or (at your option) any
12 later version.
14 This program is distributed in the hope that it will be useful,
15 but WITHOUT ANY WARRANTY; without even the implied warranty of
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 GNU General Public License for more details.
19 You should have received a copy of the GNU General Public License
20 along with this program; if not, write to the Free Software
21 Foundation, 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
23 /* This lexer works with a single pass of the file. Recently I
24 re-wrote it to minimize the places where we step backwards in the
25 input stream, to make future changes to support multi-byte
26 character sets fairly straight-forward.
28 There is now only one routine where we do step backwards:
29 skip_escaped_newlines. This routine could probably also be changed
30 so that it doesn't need to step back. One possibility is to use a
31 trick similar to that used in lex_period and lex_percent. Two
32 extra characters might be needed, but skip_escaped_newlines itself
33 would probably be the only place that needs to be aware of that,
34 and changes to the remaining routines would probably only be needed
35 if they process a backslash. */
37 #include "config.h"
38 #include "system.h"
39 #include "cpplib.h"
40 #include "cpphash.h"
42 /* MULTIBYTE_CHARS support only works for native compilers.
43 ??? Ideally what we want is to model widechar support after
44 the current floating point support. */
45 #ifdef CROSS_COMPILE
46 #undef MULTIBYTE_CHARS
47 #endif
49 #ifdef MULTIBYTE_CHARS
50 #include "mbchar.h"
51 #include <locale.h>
52 #endif
/* How a token's spelling is obtained.  Tokens with SPELL_STRING store
   their spelling in the token list, together with its length.
   NOTE(review): the lexer routines in this file record that length
   in token->val.str.len; the older field name "val.name.len" looks
   stale -- confirm against cpplib.h.  */
enum spell_type
{
  SPELL_OPERATOR = 0,
  SPELL_CHAR,
  SPELL_IDENT,
  SPELL_STRING,
  SPELL_NONE
};
65 struct token_spelling
67 enum spell_type category;
68 const unsigned char *name;
/* Spellings of the digraphs.  The strings are cast explicitly rather
   than written as U"..." because, from C11 onwards, a U immediately
   followed by a string literal is lexed as a char32_t string literal
   and not as the U cast macro applied to a narrow string.  */
const unsigned char *digraph_spellings [] =
{
  (const unsigned char *) "%:",
  (const unsigned char *) "%:%:",
  (const unsigned char *) "<:",
  (const unsigned char *) ":>",
  (const unsigned char *) "<%",
  (const unsigned char *) "%>"
};
74 #define OP(e, s) { SPELL_OPERATOR, U s },
75 #define TK(e, s) { s, U STRINGX (e) },
76 const struct token_spelling token_spellings [N_TTYPES] = {TTYPE_TABLE };
77 #undef OP
78 #undef TK
80 #define TOKEN_SPELL(token) (token_spellings[(token)->type].category)
81 #define TOKEN_NAME(token) (token_spellings[(token)->type].name)
83 static cppchar_t handle_newline PARAMS ((cpp_buffer *, cppchar_t));
84 static cppchar_t skip_escaped_newlines PARAMS ((cpp_buffer *, cppchar_t));
85 static cppchar_t get_effective_char PARAMS ((cpp_buffer *));
87 static int skip_block_comment PARAMS ((cpp_reader *));
88 static int skip_line_comment PARAMS ((cpp_reader *));
89 static void adjust_column PARAMS ((cpp_reader *));
90 static void skip_whitespace PARAMS ((cpp_reader *, cppchar_t));
91 static cpp_hashnode *parse_identifier PARAMS ((cpp_reader *, cppchar_t));
92 static void parse_number PARAMS ((cpp_reader *, cpp_string *, cppchar_t, int));
93 static int unescaped_terminator_p PARAMS ((cpp_reader *, const U_CHAR *));
94 static void parse_string PARAMS ((cpp_reader *, cpp_token *, cppchar_t));
95 static void unterminated PARAMS ((cpp_reader *, int));
96 static int trigraph_ok PARAMS ((cpp_reader *, cppchar_t));
97 static void save_comment PARAMS ((cpp_reader *, cpp_token *, const U_CHAR *));
98 static void lex_percent PARAMS ((cpp_buffer *, cpp_token *));
99 static void lex_dot PARAMS ((cpp_reader *, cpp_token *));
100 static int name_p PARAMS ((cpp_reader *, const cpp_string *));
101 static unsigned int parse_escape PARAMS ((cpp_reader *, const unsigned char **,
102 const unsigned char *, HOST_WIDE_INT,
103 int));
104 static unsigned int read_ucs PARAMS ((cpp_reader *, const unsigned char **,
105 const unsigned char *, unsigned int));
107 static cpp_chunk *new_chunk PARAMS ((unsigned int));
108 static int chunk_suitable PARAMS ((cpp_pool *, cpp_chunk *, unsigned int));
109 static unsigned int hex_digit_value PARAMS ((unsigned int));
111 /* Utility routine:
113 Compares, the token TOKEN to the NUL-terminated string STRING.
114 TOKEN must be a CPP_NAME. Returns 1 for equal, 0 for unequal. */
117 cpp_ideq (token, string)
118 const cpp_token *token;
119 const char *string;
121 if (token->type != CPP_NAME)
122 return 0;
124 return !ustrcmp (token->val.node->name, (const U_CHAR *) string);
127 /* Call when meeting a newline. Returns the character after the newline
128 (or carriage-return newline combination), or EOF. */
129 static cppchar_t
130 handle_newline (buffer, newline_char)
131 cpp_buffer *buffer;
132 cppchar_t newline_char;
134 cppchar_t next = EOF;
136 buffer->col_adjust = 0;
137 buffer->lineno++;
138 buffer->line_base = buffer->cur;
140 /* Handle CR-LF and LF-CR combinations, get the next character. */
141 if (buffer->cur < buffer->rlimit)
143 next = *buffer->cur++;
144 if (next + newline_char == '\r' + '\n')
146 buffer->line_base = buffer->cur;
147 if (buffer->cur < buffer->rlimit)
148 next = *buffer->cur++;
149 else
150 next = EOF;
154 buffer->read_ahead = next;
155 return next;
158 /* Subroutine of skip_escaped_newlines; called when a trigraph is
159 encountered. It warns if necessary, and returns true if the
160 trigraph should be honoured. FROM_CHAR is the third character of a
161 trigraph, and presumed to be the previous character for position
162 reporting. */
163 static int
164 trigraph_ok (pfile, from_char)
165 cpp_reader *pfile;
166 cppchar_t from_char;
168 int accept = CPP_OPTION (pfile, trigraphs);
170 /* Don't warn about trigraphs in comments. */
171 if (CPP_OPTION (pfile, warn_trigraphs) && !pfile->state.lexing_comment)
173 cpp_buffer *buffer = pfile->buffer;
174 if (accept)
175 cpp_warning_with_line (pfile, buffer->lineno, CPP_BUF_COL (buffer) - 2,
176 "trigraph ??%c converted to %c",
177 (int) from_char,
178 (int) _cpp_trigraph_map[from_char]);
179 else if (buffer->cur != buffer->last_Wtrigraphs)
181 buffer->last_Wtrigraphs = buffer->cur;
182 cpp_warning_with_line (pfile, buffer->lineno,
183 CPP_BUF_COL (buffer) - 2,
184 "trigraph ??%c ignored", (int) from_char);
188 return accept;
/* Give the token being built type T and consume the lookahead
   character.  Assumes local variables buffer and result.  */
#define ACCEPT_CHAR(t)          \
  do                            \
    {                           \
      result->type = (t);       \
      buffer->read_ahead = EOF; \
    }                           \
  while (0)

/* Remember, and later return to, the current read position.  Both
   assume local variables buffer and saved_cur.

   When we move to multibyte character sets, add to these something
   that saves and restores the state of the multibyte conversion
   library.  This probably involves saving and restoring a "cookie".
   In the case of glibc it is an 8-byte structure, so is not a high
   overhead operation.  In any case, it's out of the fast path.  */
#define SAVE_STATE()    do { saved_cur = buffer->cur; } while (0)
#define RESTORE_STATE() do { buffer->cur = saved_cur; } while (0)
203 /* Skips any escaped newlines introduced by NEXT, which is either a
204 '?' or a '\\'. Returns the next character, which will also have
205 been placed in buffer->read_ahead. This routine performs
206 preprocessing stages 1 and 2 of the ISO C standard. */
207 static cppchar_t
208 skip_escaped_newlines (buffer, next)
209 cpp_buffer *buffer;
210 cppchar_t next;
212 /* Only do this if we apply stages 1 and 2. */
213 if (!buffer->from_stage3)
215 cppchar_t next1;
216 const unsigned char *saved_cur;
217 int space;
221 if (buffer->cur == buffer->rlimit)
222 break;
224 SAVE_STATE ();
225 if (next == '?')
227 next1 = *buffer->cur++;
228 if (next1 != '?' || buffer->cur == buffer->rlimit)
230 RESTORE_STATE ();
231 break;
234 next1 = *buffer->cur++;
235 if (!_cpp_trigraph_map[next1]
236 || !trigraph_ok (buffer->pfile, next1))
238 RESTORE_STATE ();
239 break;
242 /* We have a full trigraph here. */
243 next = _cpp_trigraph_map[next1];
244 if (next != '\\' || buffer->cur == buffer->rlimit)
245 break;
246 SAVE_STATE ();
249 /* We have a backslash, and room for at least one more character. */
250 space = 0;
253 next1 = *buffer->cur++;
254 if (!is_nvspace (next1))
255 break;
256 space = 1;
258 while (buffer->cur < buffer->rlimit);
260 if (!is_vspace (next1))
262 RESTORE_STATE ();
263 break;
266 if (space && !buffer->pfile->state.lexing_comment)
267 cpp_warning (buffer->pfile,
268 "backslash and newline separated by space");
270 next = handle_newline (buffer, next1);
271 if (next == EOF)
272 cpp_pedwarn (buffer->pfile, "backslash-newline at end of file");
274 while (next == '\\' || next == '?');
277 buffer->read_ahead = next;
278 return next;
281 /* Obtain the next character, after trigraph conversion and skipping
282 an arbitrary string of escaped newlines. The common case of no
283 trigraphs or escaped newlines falls through quickly. */
284 static cppchar_t
285 get_effective_char (buffer)
286 cpp_buffer *buffer;
288 cppchar_t next = EOF;
290 if (buffer->cur < buffer->rlimit)
292 next = *buffer->cur++;
294 /* '?' can introduce trigraphs (and therefore backslash); '\\'
295 can introduce escaped newlines, which we want to skip, or
296 UCNs, which, depending upon lexer state, we will handle in
297 the future. */
298 if (next == '?' || next == '\\')
299 next = skip_escaped_newlines (buffer, next);
302 buffer->read_ahead = next;
303 return next;
306 /* Skip a C-style block comment. We find the end of the comment by
307 seeing if an asterisk is before every '/' we encounter. Returns
308 non-zero if comment terminated by EOF, zero otherwise. */
309 static int
310 skip_block_comment (pfile)
311 cpp_reader *pfile;
313 cpp_buffer *buffer = pfile->buffer;
314 cppchar_t c = EOF, prevc = EOF;
316 pfile->state.lexing_comment = 1;
317 while (buffer->cur != buffer->rlimit)
319 prevc = c, c = *buffer->cur++;
321 next_char:
322 /* FIXME: For speed, create a new character class of characters
323 of interest inside block comments. */
324 if (c == '?' || c == '\\')
325 c = skip_escaped_newlines (buffer, c);
327 /* People like decorating comments with '*', so check for '/'
328 instead for efficiency. */
329 if (c == '/')
331 if (prevc == '*')
332 break;
334 /* Warn about potential nested comments, but not if the '/'
335 comes immediately before the true comment delimeter.
336 Don't bother to get it right across escaped newlines. */
337 if (CPP_OPTION (pfile, warn_comments)
338 && buffer->cur != buffer->rlimit)
340 prevc = c, c = *buffer->cur++;
341 if (c == '*' && buffer->cur != buffer->rlimit)
343 prevc = c, c = *buffer->cur++;
344 if (c != '/')
345 cpp_warning_with_line (pfile, CPP_BUF_LINE (buffer),
346 CPP_BUF_COL (buffer),
347 "\"/*\" within comment");
349 goto next_char;
352 else if (is_vspace (c))
354 prevc = c, c = handle_newline (buffer, c);
355 goto next_char;
357 else if (c == '\t')
358 adjust_column (pfile);
361 pfile->state.lexing_comment = 0;
362 buffer->read_ahead = EOF;
363 return c != '/' || prevc != '*';
366 /* Skip a C++ line comment. Handles escaped newlines. Returns
367 non-zero if a multiline comment. The following new line, if any,
368 is left in buffer->read_ahead. */
369 static int
370 skip_line_comment (pfile)
371 cpp_reader *pfile;
373 cpp_buffer *buffer = pfile->buffer;
374 unsigned int orig_lineno = buffer->lineno;
375 cppchar_t c;
377 pfile->state.lexing_comment = 1;
380 c = EOF;
381 if (buffer->cur == buffer->rlimit)
382 break;
384 c = *buffer->cur++;
385 if (c == '?' || c == '\\')
386 c = skip_escaped_newlines (buffer, c);
388 while (!is_vspace (c));
390 pfile->state.lexing_comment = 0;
391 buffer->read_ahead = c; /* Leave any newline for caller. */
392 return orig_lineno != buffer->lineno;
395 /* pfile->buffer->cur is one beyond the \t character. Update
396 col_adjust so we track the column correctly. */
397 static void
398 adjust_column (pfile)
399 cpp_reader *pfile;
401 cpp_buffer *buffer = pfile->buffer;
402 unsigned int col = CPP_BUF_COL (buffer) - 1; /* Zero-based column. */
404 /* Round it up to multiple of the tabstop, but subtract 1 since the
405 tab itself occupies a character position. */
406 buffer->col_adjust += (CPP_OPTION (pfile, tabstop)
407 - col % CPP_OPTION (pfile, tabstop)) - 1;
410 /* Skips whitespace, saving the next non-whitespace character.
411 Adjusts pfile->col_adjust to account for tabs. Without this,
412 tokens might be assigned an incorrect column. */
413 static void
414 skip_whitespace (pfile, c)
415 cpp_reader *pfile;
416 cppchar_t c;
418 cpp_buffer *buffer = pfile->buffer;
419 unsigned int warned = 0;
423 /* Horizontal space always OK. */
424 if (c == ' ')
426 else if (c == '\t')
427 adjust_column (pfile);
428 /* Just \f \v or \0 left. */
429 else if (c == '\0')
431 if (!warned)
433 cpp_warning (pfile, "null character(s) ignored");
434 warned = 1;
437 else if (pfile->state.in_directive && CPP_PEDANTIC (pfile))
438 cpp_pedwarn_with_line (pfile, CPP_BUF_LINE (buffer),
439 CPP_BUF_COL (buffer),
440 "%s in preprocessing directive",
441 c == '\f' ? "form feed" : "vertical tab");
443 c = EOF;
444 if (buffer->cur == buffer->rlimit)
445 break;
446 c = *buffer->cur++;
448 /* We only want non-vertical space, i.e. ' ' \t \f \v \0. */
449 while (is_nvspace (c));
451 /* Remember the next character. */
452 buffer->read_ahead = c;
455 /* See if the characters of a number token are valid in a name (no
456 '.', '+' or '-'). */
457 static int
458 name_p (pfile, string)
459 cpp_reader *pfile;
460 const cpp_string *string;
462 unsigned int i;
464 for (i = 0; i < string->len; i++)
465 if (!is_idchar (string->text[i]))
466 return 0;
468 return 1;
471 /* Parse an identifier, skipping embedded backslash-newlines.
472 Calculate the hash value of the token while parsing, for improved
473 performance. The hashing algorithm *must* match cpp_lookup(). */
475 static cpp_hashnode *
476 parse_identifier (pfile, c)
477 cpp_reader *pfile;
478 cppchar_t c;
480 cpp_hashnode *result;
481 cpp_buffer *buffer = pfile->buffer;
482 unsigned char *dest, *limit;
483 unsigned int r = 0, saw_dollar = 0;
485 dest = POOL_FRONT (&pfile->ident_pool);
486 limit = POOL_LIMIT (&pfile->ident_pool);
492 /* Need room for terminating null. */
493 if (dest + 1 >= limit)
494 limit = _cpp_next_chunk (&pfile->ident_pool, 0, &dest);
496 *dest++ = c;
497 r = HASHSTEP (r, c);
499 if (c == '$')
500 saw_dollar++;
502 c = EOF;
503 if (buffer->cur == buffer->rlimit)
504 break;
506 c = *buffer->cur++;
508 while (is_idchar (c));
510 /* Potential escaped newline? */
511 if (c != '?' && c != '\\')
512 break;
513 c = skip_escaped_newlines (buffer, c);
515 while (is_idchar (c));
517 /* Remember the next character. */
518 buffer->read_ahead = c;
520 /* $ is not a identifier character in the standard, but is commonly
521 accepted as an extension. Don't warn about it in skipped
522 conditional blocks. */
523 if (saw_dollar && CPP_PEDANTIC (pfile) && ! pfile->skipping)
524 cpp_pedwarn (pfile, "'$' character(s) in identifier");
526 /* Identifiers are null-terminated. */
527 *dest = '\0';
529 /* This routine commits the memory if necessary. */
530 result = _cpp_lookup_with_hash (pfile,
531 dest - POOL_FRONT (&pfile->ident_pool), r);
533 /* Some identifiers require diagnostics when lexed. */
534 if (result->flags & NODE_DIAGNOSTIC && !pfile->skipping)
536 /* It is allowed to poison the same identifier twice. */
537 if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
538 cpp_error (pfile, "attempt to use poisoned \"%s\"", result->name);
540 /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
541 replacement list of a variadic macro. */
542 if (result == pfile->spec_nodes.n__VA_ARGS__
543 && !pfile->state.va_args_ok)
544 cpp_pedwarn (pfile, "__VA_ARGS__ can only appear in the expansion of a C99 variadic macro");
547 return result;
550 /* Parse a number, skipping embedded backslash-newlines. */
551 static void
552 parse_number (pfile, number, c, leading_period)
553 cpp_reader *pfile;
554 cpp_string *number;
555 cppchar_t c;
556 int leading_period;
558 cpp_buffer *buffer = pfile->buffer;
559 cpp_pool *pool = &pfile->ident_pool;
560 unsigned char *dest, *limit;
562 dest = POOL_FRONT (pool);
563 limit = POOL_LIMIT (pool);
565 /* Place a leading period. */
566 if (leading_period)
568 if (dest >= limit)
569 limit = _cpp_next_chunk (pool, 0, &dest);
570 *dest++ = '.';
577 /* Need room for terminating null. */
578 if (dest + 1 >= limit)
579 limit = _cpp_next_chunk (pool, 0, &dest);
580 *dest++ = c;
582 c = EOF;
583 if (buffer->cur == buffer->rlimit)
584 break;
586 c = *buffer->cur++;
588 while (is_numchar (c) || c == '.' || VALID_SIGN (c, dest[-1]));
590 /* Potential escaped newline? */
591 if (c != '?' && c != '\\')
592 break;
593 c = skip_escaped_newlines (buffer, c);
595 while (is_numchar (c) || c == '.' || VALID_SIGN (c, dest[-1]));
597 /* Remember the next character. */
598 buffer->read_ahead = c;
600 /* Null-terminate the number. */
601 *dest = '\0';
603 number->text = POOL_FRONT (pool);
604 number->len = dest - number->text;
605 POOL_COMMIT (pool, number->len + 1);
608 /* Subroutine of parse_string. Emits error for unterminated strings. */
609 static void
610 unterminated (pfile, term)
611 cpp_reader *pfile;
612 int term;
614 cpp_error (pfile, "missing terminating %c character", term);
616 if (term == '\"' && pfile->mlstring_pos.line
617 && pfile->mlstring_pos.line != pfile->lexer_pos.line)
619 cpp_error_with_line (pfile, pfile->mlstring_pos.line,
620 pfile->mlstring_pos.col,
621 "possible start of unterminated string literal");
622 pfile->mlstring_pos.line = 0;
626 /* Subroutine of parse_string. */
627 static int
628 unescaped_terminator_p (pfile, dest)
629 cpp_reader *pfile;
630 const unsigned char *dest;
632 const unsigned char *start, *temp;
634 /* In #include-style directives, terminators are not escapeable. */
635 if (pfile->state.angled_headers)
636 return 1;
638 start = POOL_FRONT (&pfile->ident_pool);
640 /* An odd number of consecutive backslashes represents an escaped
641 terminator. */
642 for (temp = dest; temp > start && temp[-1] == '\\'; temp--)
645 return ((dest - temp) & 1) == 0;
648 /* Parses a string, character constant, or angle-bracketed header file
649 name. Handles embedded trigraphs and escaped newlines. The stored
650 string is guaranteed NUL-terminated, but it is not guaranteed that
651 this is the first NUL since embedded NULs are preserved.
653 Multi-line strings are allowed, but they are deprecated. */
654 static void
655 parse_string (pfile, token, terminator)
656 cpp_reader *pfile;
657 cpp_token *token;
658 cppchar_t terminator;
660 cpp_buffer *buffer = pfile->buffer;
661 cpp_pool *pool = &pfile->ident_pool;
662 unsigned char *dest, *limit;
663 cppchar_t c;
664 unsigned int nulls = 0;
666 dest = POOL_FRONT (pool);
667 limit = POOL_LIMIT (pool);
669 for (;;)
671 if (buffer->cur == buffer->rlimit)
672 c = EOF;
673 else
674 c = *buffer->cur++;
676 have_char:
677 /* We need space for the terminating NUL. */
678 if (dest >= limit)
679 limit = _cpp_next_chunk (pool, 0, &dest);
681 if (c == EOF)
683 unterminated (pfile, terminator);
684 break;
687 /* Handle trigraphs, escaped newlines etc. */
688 if (c == '?' || c == '\\')
689 c = skip_escaped_newlines (buffer, c);
691 if (c == terminator && unescaped_terminator_p (pfile, dest))
693 c = EOF;
694 break;
696 else if (is_vspace (c))
698 /* In assembly language, silently terminate string and
699 character literals at end of line. This is a kludge
700 around not knowing where comments are. */
701 if (CPP_OPTION (pfile, lang) == CLK_ASM && terminator != '>')
702 break;
704 /* Character constants and header names may not extend over
705 multiple lines. In Standard C, neither may strings.
706 Unfortunately, we accept multiline strings as an
707 extension, except in #include family directives. */
708 if (terminator != '"' || pfile->state.angled_headers)
710 unterminated (pfile, terminator);
711 break;
714 cpp_pedwarn (pfile, "multi-line string literals are deprecated");
715 if (pfile->mlstring_pos.line == 0)
716 pfile->mlstring_pos = pfile->lexer_pos;
718 c = handle_newline (buffer, c);
719 *dest++ = '\n';
720 goto have_char;
722 else if (c == '\0')
724 if (nulls++ == 0)
725 cpp_warning (pfile, "null character(s) preserved in literal");
728 *dest++ = c;
731 /* Remember the next character. */
732 buffer->read_ahead = c;
733 *dest = '\0';
735 token->val.str.text = POOL_FRONT (pool);
736 token->val.str.len = dest - token->val.str.text;
737 POOL_COMMIT (pool, token->val.str.len + 1);
740 /* The stored comment includes the comment start and any terminator. */
741 static void
742 save_comment (pfile, token, from)
743 cpp_reader *pfile;
744 cpp_token *token;
745 const unsigned char *from;
747 unsigned char *buffer;
748 unsigned int len;
750 len = pfile->buffer->cur - from + 1; /* + 1 for the initial '/'. */
751 /* C++ comments probably (not definitely) have moved past a new
752 line, which we don't want to save in the comment. */
753 if (pfile->buffer->read_ahead != EOF)
754 len--;
755 buffer = _cpp_pool_alloc (&pfile->ident_pool, len);
757 token->type = CPP_COMMENT;
758 token->val.str.len = len;
759 token->val.str.text = buffer;
761 buffer[0] = '/';
762 memcpy (buffer + 1, from, len - 1);
765 /* Subroutine of lex_token to handle '%'. A little tricky, since we
766 want to avoid stepping back when lexing %:%X. */
767 static void
768 lex_percent (buffer, result)
769 cpp_buffer *buffer;
770 cpp_token *result;
772 cppchar_t c;
774 result->type = CPP_MOD;
775 /* Parsing %:%X could leave an extra character. */
776 if (buffer->extra_char == EOF)
777 c = get_effective_char (buffer);
778 else
780 c = buffer->read_ahead = buffer->extra_char;
781 buffer->extra_char = EOF;
784 if (c == '=')
785 ACCEPT_CHAR (CPP_MOD_EQ);
786 else if (CPP_OPTION (buffer->pfile, digraphs))
788 if (c == ':')
790 result->flags |= DIGRAPH;
791 ACCEPT_CHAR (CPP_HASH);
792 if (get_effective_char (buffer) == '%')
794 buffer->extra_char = get_effective_char (buffer);
795 if (buffer->extra_char == ':')
797 buffer->extra_char = EOF;
798 ACCEPT_CHAR (CPP_PASTE);
800 else
801 /* We'll catch the extra_char when we're called back. */
802 buffer->read_ahead = '%';
805 else if (c == '>')
807 result->flags |= DIGRAPH;
808 ACCEPT_CHAR (CPP_CLOSE_BRACE);
813 /* Subroutine of lex_token to handle '.'. This is tricky, since we
814 want to avoid stepping back when lexing '...' or '.123'. In the
815 latter case we should also set a flag for parse_number. */
816 static void
817 lex_dot (pfile, result)
818 cpp_reader *pfile;
819 cpp_token *result;
821 cpp_buffer *buffer = pfile->buffer;
822 cppchar_t c;
824 /* Parsing ..X could leave an extra character. */
825 if (buffer->extra_char == EOF)
826 c = get_effective_char (buffer);
827 else
829 c = buffer->read_ahead = buffer->extra_char;
830 buffer->extra_char = EOF;
833 /* All known character sets have 0...9 contiguous. */
834 if (c >= '0' && c <= '9')
836 result->type = CPP_NUMBER;
837 parse_number (pfile, &result->val.str, c, 1);
839 else
841 result->type = CPP_DOT;
842 if (c == '.')
844 buffer->extra_char = get_effective_char (buffer);
845 if (buffer->extra_char == '.')
847 buffer->extra_char = EOF;
848 ACCEPT_CHAR (CPP_ELLIPSIS);
850 else
851 /* We'll catch the extra_char when we're called back. */
852 buffer->read_ahead = '.';
854 else if (c == '*' && CPP_OPTION (pfile, cplusplus))
855 ACCEPT_CHAR (CPP_DOT_STAR);
859 void
860 _cpp_lex_token (pfile, result)
861 cpp_reader *pfile;
862 cpp_token *result;
864 cppchar_t c;
865 cpp_buffer *buffer;
866 const unsigned char *comment_start;
867 unsigned char bol;
869 skip:
870 bol = pfile->state.next_bol;
871 done_directive:
872 buffer = pfile->buffer;
873 pfile->state.next_bol = 0;
874 result->flags = buffer->saved_flags;
875 buffer->saved_flags = 0;
876 next_char:
877 pfile->lexer_pos.line = buffer->lineno;
878 next_char2:
879 pfile->lexer_pos.col = CPP_BUF_COLUMN (buffer, buffer->cur);
881 c = buffer->read_ahead;
882 if (c == EOF && buffer->cur < buffer->rlimit)
884 c = *buffer->cur++;
885 pfile->lexer_pos.col++;
888 do_switch:
889 buffer->read_ahead = EOF;
890 switch (c)
892 case EOF:
893 /* Non-empty files should end in a newline. Checking "bol" too
894 prevents multiple warnings when hitting the EOF more than
895 once, like in a directive. Don't warn for command line and
896 _Pragma buffers. */
897 if (pfile->lexer_pos.col != 0 && !bol && !buffer->from_stage3)
898 cpp_pedwarn (pfile, "no newline at end of file");
899 pfile->state.next_bol = 1;
900 pfile->skipping = 0; /* In case missing #endif. */
901 result->type = CPP_EOF;
902 /* Don't do MI optimisation. */
903 return;
905 case ' ': case '\t': case '\f': case '\v': case '\0':
906 skip_whitespace (pfile, c);
907 result->flags |= PREV_WHITE;
908 goto next_char2;
910 case '\n': case '\r':
911 if (!pfile->state.in_directive)
913 handle_newline (buffer, c);
914 bol = 1;
915 pfile->lexer_pos.output_line = buffer->lineno;
916 /* This is a new line, so clear any white space flag.
917 Newlines in arguments are white space (6.10.3.10);
918 parse_arg takes care of that. */
919 result->flags &= ~(PREV_WHITE | AVOID_LPASTE);
920 goto next_char;
923 /* Don't let directives spill over to the next line. */
924 buffer->read_ahead = c;
925 pfile->state.next_bol = 1;
926 result->type = CPP_EOF;
927 /* Don't break; pfile->skipping might be true. */
928 return;
930 case '?':
931 case '\\':
932 /* These could start an escaped newline, or '?' a trigraph. Let
933 skip_escaped_newlines do all the work. */
935 unsigned int lineno = buffer->lineno;
937 c = skip_escaped_newlines (buffer, c);
938 if (lineno != buffer->lineno)
939 /* We had at least one escaped newline of some sort, and the
940 next character is in buffer->read_ahead. Update the
941 token's line and column. */
942 goto next_char;
944 /* We are either the original '?' or '\\', or a trigraph. */
945 result->type = CPP_QUERY;
946 buffer->read_ahead = EOF;
947 if (c == '\\')
948 goto random_char;
949 else if (c != '?')
950 goto do_switch;
952 break;
954 case '0': case '1': case '2': case '3': case '4':
955 case '5': case '6': case '7': case '8': case '9':
956 result->type = CPP_NUMBER;
957 parse_number (pfile, &result->val.str, c, 0);
958 break;
960 case '$':
961 if (!CPP_OPTION (pfile, dollars_in_ident))
962 goto random_char;
963 /* Fall through... */
965 case '_':
966 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
967 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
968 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
969 case 's': case 't': case 'u': case 'v': case 'w': case 'x':
970 case 'y': case 'z':
971 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
972 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
973 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
974 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
975 case 'Y': case 'Z':
976 result->type = CPP_NAME;
977 result->val.node = parse_identifier (pfile, c);
979 /* 'L' may introduce wide characters or strings. */
980 if (result->val.node == pfile->spec_nodes.n_L)
982 c = buffer->read_ahead; /* For make_string. */
983 if (c == '\'' || c == '"')
985 ACCEPT_CHAR (c == '"' ? CPP_WSTRING: CPP_WCHAR);
986 goto make_string;
989 /* Convert named operators to their proper types. */
990 else if (result->val.node->flags & NODE_OPERATOR)
992 result->flags |= NAMED_OP;
993 result->type = result->val.node->value.operator;
995 break;
997 case '\'':
998 case '"':
999 result->type = c == '"' ? CPP_STRING: CPP_CHAR;
1000 make_string:
1001 parse_string (pfile, result, c);
1002 break;
1004 case '/':
1005 /* A potential block or line comment. */
1006 comment_start = buffer->cur;
1007 result->type = CPP_DIV;
1008 c = get_effective_char (buffer);
1009 if (c == '=')
1010 ACCEPT_CHAR (CPP_DIV_EQ);
1011 if (c != '/' && c != '*')
1012 break;
1014 if (c == '*')
1016 if (skip_block_comment (pfile))
1017 cpp_error_with_line (pfile, pfile->lexer_pos.line,
1018 pfile->lexer_pos.col,
1019 "unterminated comment");
1021 else
1023 if (!CPP_OPTION (pfile, cplusplus_comments)
1024 && !CPP_IN_SYSTEM_HEADER (pfile))
1025 break;
1027 /* Warn about comments only if pedantically GNUC89, and not
1028 in system headers. */
1029 if (CPP_OPTION (pfile, lang) == CLK_GNUC89 && CPP_PEDANTIC (pfile)
1030 && ! buffer->warned_cplusplus_comments)
1032 cpp_pedwarn (pfile,
1033 "C++ style comments are not allowed in ISO C89");
1034 cpp_pedwarn (pfile,
1035 "(this will be reported only once per input file)");
1036 buffer->warned_cplusplus_comments = 1;
1039 /* Skip_line_comment updates buffer->read_ahead. */
1040 if (skip_line_comment (pfile) && CPP_OPTION (pfile, warn_comments))
1041 cpp_warning_with_line (pfile, pfile->lexer_pos.line,
1042 pfile->lexer_pos.col,
1043 "multi-line comment");
1046 /* Skipping the comment has updated buffer->read_ahead. */
1047 if (!pfile->state.save_comments)
1049 result->flags |= PREV_WHITE;
1050 goto next_char;
1053 /* Save the comment as a token in its own right. */
1054 save_comment (pfile, result, comment_start);
1055 /* Don't do MI optimisation. */
1056 return;
1058 case '<':
1059 if (pfile->state.angled_headers)
1061 result->type = CPP_HEADER_NAME;
1062 c = '>'; /* terminator. */
1063 goto make_string;
1066 result->type = CPP_LESS;
1067 c = get_effective_char (buffer);
1068 if (c == '=')
1069 ACCEPT_CHAR (CPP_LESS_EQ);
1070 else if (c == '<')
1072 ACCEPT_CHAR (CPP_LSHIFT);
1073 if (get_effective_char (buffer) == '=')
1074 ACCEPT_CHAR (CPP_LSHIFT_EQ);
1076 else if (c == '?' && CPP_OPTION (pfile, cplusplus))
1078 ACCEPT_CHAR (CPP_MIN);
1079 if (get_effective_char (buffer) == '=')
1080 ACCEPT_CHAR (CPP_MIN_EQ);
1082 else if (c == ':' && CPP_OPTION (pfile, digraphs))
1084 ACCEPT_CHAR (CPP_OPEN_SQUARE);
1085 result->flags |= DIGRAPH;
1087 else if (c == '%' && CPP_OPTION (pfile, digraphs))
1089 ACCEPT_CHAR (CPP_OPEN_BRACE);
1090 result->flags |= DIGRAPH;
1092 break;
1094 case '>':
1095 result->type = CPP_GREATER;
1096 c = get_effective_char (buffer);
1097 if (c == '=')
1098 ACCEPT_CHAR (CPP_GREATER_EQ);
1099 else if (c == '>')
1101 ACCEPT_CHAR (CPP_RSHIFT);
1102 if (get_effective_char (buffer) == '=')
1103 ACCEPT_CHAR (CPP_RSHIFT_EQ);
1105 else if (c == '?' && CPP_OPTION (pfile, cplusplus))
1107 ACCEPT_CHAR (CPP_MAX);
1108 if (get_effective_char (buffer) == '=')
1109 ACCEPT_CHAR (CPP_MAX_EQ);
1111 break;
1113 case '%':
1114 lex_percent (buffer, result);
1115 if (result->type == CPP_HASH)
1116 goto do_hash;
1117 break;
1119 case '.':
1120 lex_dot (pfile, result);
1121 break;
1123 case '+':
1124 result->type = CPP_PLUS;
1125 c = get_effective_char (buffer);
1126 if (c == '=')
1127 ACCEPT_CHAR (CPP_PLUS_EQ);
1128 else if (c == '+')
1129 ACCEPT_CHAR (CPP_PLUS_PLUS);
1130 break;
1132 case '-':
1133 result->type = CPP_MINUS;
1134 c = get_effective_char (buffer);
1135 if (c == '>')
1137 ACCEPT_CHAR (CPP_DEREF);
1138 if (CPP_OPTION (pfile, cplusplus)
1139 && get_effective_char (buffer) == '*')
1140 ACCEPT_CHAR (CPP_DEREF_STAR);
1142 else if (c == '=')
1143 ACCEPT_CHAR (CPP_MINUS_EQ);
1144 else if (c == '-')
1145 ACCEPT_CHAR (CPP_MINUS_MINUS);
1146 break;
1148 case '*':
1149 result->type = CPP_MULT;
1150 if (get_effective_char (buffer) == '=')
1151 ACCEPT_CHAR (CPP_MULT_EQ);
1152 break;
1154 case '=':
1155 result->type = CPP_EQ;
1156 if (get_effective_char (buffer) == '=')
1157 ACCEPT_CHAR (CPP_EQ_EQ);
1158 break;
1160 case '!':
1161 result->type = CPP_NOT;
1162 if (get_effective_char (buffer) == '=')
1163 ACCEPT_CHAR (CPP_NOT_EQ);
1164 break;
1166 case '&':
1167 result->type = CPP_AND;
1168 c = get_effective_char (buffer);
1169 if (c == '=')
1170 ACCEPT_CHAR (CPP_AND_EQ);
1171 else if (c == '&')
1172 ACCEPT_CHAR (CPP_AND_AND);
1173 break;
1175 case '#':
1176 c = buffer->extra_char; /* Can be set by error condition below. */
1177 if (c != EOF)
1179 buffer->read_ahead = c;
1180 buffer->extra_char = EOF;
1182 else
1183 c = get_effective_char (buffer);
1185 if (c == '#')
1187 ACCEPT_CHAR (CPP_PASTE);
1188 break;
1191 result->type = CPP_HASH;
1192 do_hash:
1193 if (!bol)
1194 break;
1195 /* 6.10.3 paragraph 11: If there are sequences of preprocessing
1196 tokens within the list of arguments that would otherwise act
1197 as preprocessing directives, the behavior is undefined.
1199 This implementation will report a hard error, terminate the
1200 macro invocation, and proceed to process the directive. */
1201 if (pfile->state.parsing_args)
1203 if (pfile->state.parsing_args == 2)
1204 cpp_error (pfile,
1205 "directives may not be used inside a macro argument");
1207 /* Put a '#' in lookahead, return CPP_EOF for parse_arg. */
1208 buffer->extra_char = buffer->read_ahead;
1209 buffer->read_ahead = '#';
1210 pfile->state.next_bol = 1;
1211 result->type = CPP_EOF;
1213 /* Get whitespace right - newline_in_args sets it. */
1214 if (pfile->lexer_pos.col == 1)
1215 result->flags &= ~(PREV_WHITE | AVOID_LPASTE);
1217 else
1219 /* This is the hash introducing a directive. */
1220 if (_cpp_handle_directive (pfile, result->flags & PREV_WHITE))
1221 goto done_directive; /* bol still 1. */
1222 /* This is in fact an assembler #. */
1224 break;
1226 case '|':
1227 result->type = CPP_OR;
1228 c = get_effective_char (buffer);
1229 if (c == '=')
1230 ACCEPT_CHAR (CPP_OR_EQ);
1231 else if (c == '|')
1232 ACCEPT_CHAR (CPP_OR_OR);
1233 break;
1235 case '^':
1236 result->type = CPP_XOR;
1237 if (get_effective_char (buffer) == '=')
1238 ACCEPT_CHAR (CPP_XOR_EQ);
1239 break;
1241 case ':':
1242 result->type = CPP_COLON;
1243 c = get_effective_char (buffer);
1244 if (c == ':' && CPP_OPTION (pfile, cplusplus))
1245 ACCEPT_CHAR (CPP_SCOPE);
1246 else if (c == '>' && CPP_OPTION (pfile, digraphs))
1248 result->flags |= DIGRAPH;
1249 ACCEPT_CHAR (CPP_CLOSE_SQUARE);
1251 break;
1253 case '~': result->type = CPP_COMPL; break;
1254 case ',': result->type = CPP_COMMA; break;
1255 case '(': result->type = CPP_OPEN_PAREN; break;
1256 case ')': result->type = CPP_CLOSE_PAREN; break;
1257 case '[': result->type = CPP_OPEN_SQUARE; break;
1258 case ']': result->type = CPP_CLOSE_SQUARE; break;
1259 case '{': result->type = CPP_OPEN_BRACE; break;
1260 case '}': result->type = CPP_CLOSE_BRACE; break;
1261 case ';': result->type = CPP_SEMICOLON; break;
1263 /* @ is a punctuator in Objective C. */
1264 case '@': result->type = CPP_ATSIGN; break;
1266 random_char:
1267 default:
1268 result->type = CPP_OTHER;
1269 result->val.c = c;
1270 break;
1273 if (pfile->skipping)
1274 goto skip;
1276 /* If not in a directive, this token invalidates controlling macros. */
1277 if (!pfile->state.in_directive)
1278 pfile->mi_state = MI_FAILED;
1281 /* An upper bound on the number of bytes needed to spell a token,
1282 including preceding whitespace. */
1283 unsigned int
1284 cpp_token_len (token)
1285 const cpp_token *token;
1287 unsigned int len;
1289 switch (TOKEN_SPELL (token))
1291 default: len = 0; break;
1292 case SPELL_STRING: len = token->val.str.len; break;
1293 case SPELL_IDENT: len = token->val.node->length; break;
1295 /* 1 for whitespace, 4 for comment delimeters. */
1296 return len + 5;
1299 /* Write the spelling of a token TOKEN to BUFFER. The buffer must
1300 already contain the enough space to hold the token's spelling.
1301 Returns a pointer to the character after the last character
1302 written. */
1303 unsigned char *
1304 cpp_spell_token (pfile, token, buffer)
1305 cpp_reader *pfile; /* Would be nice to be rid of this... */
1306 const cpp_token *token;
1307 unsigned char *buffer;
1309 switch (TOKEN_SPELL (token))
1311 case SPELL_OPERATOR:
1313 const unsigned char *spelling;
1314 unsigned char c;
1316 if (token->flags & DIGRAPH)
1317 spelling
1318 = digraph_spellings[(int) token->type - (int) CPP_FIRST_DIGRAPH];
1319 else if (token->flags & NAMED_OP)
1320 goto spell_ident;
1321 else
1322 spelling = TOKEN_NAME (token);
1324 while ((c = *spelling++) != '\0')
1325 *buffer++ = c;
1327 break;
1329 case SPELL_IDENT:
1330 spell_ident:
1331 memcpy (buffer, token->val.node->name, token->val.node->length);
1332 buffer += token->val.node->length;
1333 break;
1335 case SPELL_STRING:
1337 int left, right, tag;
1338 switch (token->type)
1340 case CPP_STRING: left = '"'; right = '"'; tag = '\0'; break;
1341 case CPP_WSTRING: left = '"'; right = '"'; tag = 'L'; break;
1342 case CPP_CHAR: left = '\''; right = '\''; tag = '\0'; break;
1343 case CPP_WCHAR: left = '\''; right = '\''; tag = 'L'; break;
1344 case CPP_HEADER_NAME: left = '<'; right = '>'; tag = '\0'; break;
1345 default: left = '\0'; right = '\0'; tag = '\0'; break;
1347 if (tag) *buffer++ = tag;
1348 if (left) *buffer++ = left;
1349 memcpy (buffer, token->val.str.text, token->val.str.len);
1350 buffer += token->val.str.len;
1351 if (right) *buffer++ = right;
1353 break;
1355 case SPELL_CHAR:
1356 *buffer++ = token->val.c;
1357 break;
1359 case SPELL_NONE:
1360 cpp_ice (pfile, "Unspellable token %s", TOKEN_NAME (token));
1361 break;
1364 return buffer;
1367 /* Returns a token as a null-terminated string. The string is
1368 temporary, and automatically freed later. Useful for diagnostics. */
1369 unsigned char *
1370 cpp_token_as_text (pfile, token)
1371 cpp_reader *pfile;
1372 const cpp_token *token;
1374 unsigned int len = cpp_token_len (token);
1375 unsigned char *start = _cpp_pool_alloc (&pfile->ident_pool, len), *end;
1377 end = cpp_spell_token (pfile, token, start);
1378 end[0] = '\0';
1380 return start;
1383 /* Used by C front ends. Should really move to using cpp_token_as_text. */
1384 const char *
1385 cpp_type2name (type)
1386 enum cpp_ttype type;
1388 return (const char *) token_spellings[type].name;
1391 /* Writes the spelling of token to FP. Separate from cpp_spell_token
1392 for efficiency - to avoid double-buffering. Also, outputs a space
1393 if PREV_WHITE is flagged. */
1394 void
1395 cpp_output_token (token, fp)
1396 const cpp_token *token;
1397 FILE *fp;
1399 if (token->flags & PREV_WHITE)
1400 putc (' ', fp);
1402 switch (TOKEN_SPELL (token))
1404 case SPELL_OPERATOR:
1406 const unsigned char *spelling;
1408 if (token->flags & DIGRAPH)
1409 spelling
1410 = digraph_spellings[(int) token->type - (int) CPP_FIRST_DIGRAPH];
1411 else if (token->flags & NAMED_OP)
1412 goto spell_ident;
1413 else
1414 spelling = TOKEN_NAME (token);
1416 ufputs (spelling, fp);
1418 break;
1420 spell_ident:
1421 case SPELL_IDENT:
1422 ufputs (token->val.node->name, fp);
1423 break;
1425 case SPELL_STRING:
1427 int left, right, tag;
1428 switch (token->type)
1430 case CPP_STRING: left = '"'; right = '"'; tag = '\0'; break;
1431 case CPP_WSTRING: left = '"'; right = '"'; tag = 'L'; break;
1432 case CPP_CHAR: left = '\''; right = '\''; tag = '\0'; break;
1433 case CPP_WCHAR: left = '\''; right = '\''; tag = 'L'; break;
1434 case CPP_HEADER_NAME: left = '<'; right = '>'; tag = '\0'; break;
1435 default: left = '\0'; right = '\0'; tag = '\0'; break;
1437 if (tag) putc (tag, fp);
1438 if (left) putc (left, fp);
1439 fwrite (token->val.str.text, 1, token->val.str.len, fp);
1440 if (right) putc (right, fp);
1442 break;
1444 case SPELL_CHAR:
1445 putc (token->val.c, fp);
1446 break;
1448 case SPELL_NONE:
1449 /* An error, most probably. */
1450 break;
1454 /* Compare two tokens. */
1456 _cpp_equiv_tokens (a, b)
1457 const cpp_token *a, *b;
1459 if (a->type == b->type && a->flags == b->flags)
1460 switch (TOKEN_SPELL (a))
1462 default: /* Keep compiler happy. */
1463 case SPELL_OPERATOR:
1464 return 1;
1465 case SPELL_CHAR:
1466 return a->val.c == b->val.c; /* Character. */
1467 case SPELL_NONE:
1468 return (a->type != CPP_MACRO_ARG || a->val.arg_no == b->val.arg_no);
1469 case SPELL_IDENT:
1470 return a->val.node == b->val.node;
1471 case SPELL_STRING:
1472 return (a->val.str.len == b->val.str.len
1473 && !memcmp (a->val.str.text, b->val.str.text,
1474 a->val.str.len));
1477 return 0;
1480 /* Determine whether two tokens can be pasted together, and if so,
1481 what the resulting token is. Returns CPP_EOF if the tokens cannot
1482 be pasted, or the appropriate type for the merged token if they
1483 can. */
1484 enum cpp_ttype
1485 cpp_can_paste (pfile, token1, token2, digraph)
1486 cpp_reader * pfile;
1487 const cpp_token *token1, *token2;
1488 int* digraph;
1490 enum cpp_ttype a = token1->type, b = token2->type;
1491 int cxx = CPP_OPTION (pfile, cplusplus);
1493 /* Treat named operators as if they were ordinary NAMEs. */
1494 if (token1->flags & NAMED_OP)
1495 a = CPP_NAME;
1496 if (token2->flags & NAMED_OP)
1497 b = CPP_NAME;
1499 if ((int) a <= (int) CPP_LAST_EQ && b == CPP_EQ)
1500 return (enum cpp_ttype) ((int) a + ((int) CPP_EQ_EQ - (int) CPP_EQ));
1502 switch (a)
1504 case CPP_GREATER:
1505 if (b == a) return CPP_RSHIFT;
1506 if (b == CPP_QUERY && cxx) return CPP_MAX;
1507 if (b == CPP_GREATER_EQ) return CPP_RSHIFT_EQ;
1508 break;
1509 case CPP_LESS:
1510 if (b == a) return CPP_LSHIFT;
1511 if (b == CPP_QUERY && cxx) return CPP_MIN;
1512 if (b == CPP_LESS_EQ) return CPP_LSHIFT_EQ;
1513 if (CPP_OPTION (pfile, digraphs))
1515 if (b == CPP_COLON)
1516 {*digraph = 1; return CPP_OPEN_SQUARE;} /* <: digraph */
1517 if (b == CPP_MOD)
1518 {*digraph = 1; return CPP_OPEN_BRACE;} /* <% digraph */
1520 break;
1522 case CPP_PLUS: if (b == a) return CPP_PLUS_PLUS; break;
1523 case CPP_AND: if (b == a) return CPP_AND_AND; break;
1524 case CPP_OR: if (b == a) return CPP_OR_OR; break;
1526 case CPP_MINUS:
1527 if (b == a) return CPP_MINUS_MINUS;
1528 if (b == CPP_GREATER) return CPP_DEREF;
1529 break;
1530 case CPP_COLON:
1531 if (b == a && cxx) return CPP_SCOPE;
1532 if (b == CPP_GREATER && CPP_OPTION (pfile, digraphs))
1533 {*digraph = 1; return CPP_CLOSE_SQUARE;} /* :> digraph */
1534 break;
1536 case CPP_MOD:
1537 if (CPP_OPTION (pfile, digraphs))
1539 if (b == CPP_GREATER)
1540 {*digraph = 1; return CPP_CLOSE_BRACE;} /* %> digraph */
1541 if (b == CPP_COLON)
1542 {*digraph = 1; return CPP_HASH;} /* %: digraph */
1544 break;
1545 case CPP_DEREF:
1546 if (b == CPP_MULT && cxx) return CPP_DEREF_STAR;
1547 break;
1548 case CPP_DOT:
1549 if (b == CPP_MULT && cxx) return CPP_DOT_STAR;
1550 if (b == CPP_NUMBER) return CPP_NUMBER;
1551 break;
1553 case CPP_HASH:
1554 if (b == a && (token1->flags & DIGRAPH) == (token2->flags & DIGRAPH))
1555 /* %:%: digraph */
1556 {*digraph = (token1->flags & DIGRAPH); return CPP_PASTE;}
1557 break;
1559 case CPP_NAME:
1560 if (b == CPP_NAME) return CPP_NAME;
1561 if (b == CPP_NUMBER
1562 && name_p (pfile, &token2->val.str)) return CPP_NAME;
1563 if (b == CPP_CHAR
1564 && token1->val.node == pfile->spec_nodes.n_L) return CPP_WCHAR;
1565 if (b == CPP_STRING
1566 && token1->val.node == pfile->spec_nodes.n_L) return CPP_WSTRING;
1567 break;
1569 case CPP_NUMBER:
1570 if (b == CPP_NUMBER) return CPP_NUMBER;
1571 if (b == CPP_NAME) return CPP_NUMBER;
1572 if (b == CPP_DOT) return CPP_NUMBER;
1573 /* Numbers cannot have length zero, so this is safe. */
1574 if ((b == CPP_PLUS || b == CPP_MINUS)
1575 && VALID_SIGN ('+', token1->val.str.text[token1->val.str.len - 1]))
1576 return CPP_NUMBER;
1577 break;
1579 default:
1580 break;
1583 return CPP_EOF;
1586 /* Returns nonzero if a space should be inserted to avoid an
1587 accidental token paste for output. For simplicity, it is
1588 conservative, and occasionally advises a space where one is not
1589 needed, e.g. "." and ".2". */
1592 cpp_avoid_paste (pfile, token1, token2)
1593 cpp_reader *pfile;
1594 const cpp_token *token1, *token2;
1596 enum cpp_ttype a = token1->type, b = token2->type;
1597 cppchar_t c;
1599 if (token1->flags & NAMED_OP)
1600 a = CPP_NAME;
1601 if (token2->flags & NAMED_OP)
1602 b = CPP_NAME;
1604 c = EOF;
1605 if (token2->flags & DIGRAPH)
1606 c = digraph_spellings[(int) b - (int) CPP_FIRST_DIGRAPH][0];
1607 else if (token_spellings[b].category == SPELL_OPERATOR)
1608 c = token_spellings[b].name[0];
1610 /* Quickly get everything that can paste with an '='. */
1611 if ((int) a <= (int) CPP_LAST_EQ && c == '=')
1612 return 1;
1614 switch (a)
1616 case CPP_GREATER: return c == '>' || c == '?';
1617 case CPP_LESS: return c == '<' || c == '?' || c == '%' || c == ':';
1618 case CPP_PLUS: return c == '+';
1619 case CPP_MINUS: return c == '-' || c == '>';
1620 case CPP_DIV: return c == '/' || c == '*'; /* Comments. */
1621 case CPP_MOD: return c == ':' || c == '>';
1622 case CPP_AND: return c == '&';
1623 case CPP_OR: return c == '|';
1624 case CPP_COLON: return c == ':' || c == '>';
1625 case CPP_DEREF: return c == '*';
1626 case CPP_DOT: return c == '.' || c == '%' || b == CPP_NUMBER;
1627 case CPP_HASH: return c == '#' || c == '%'; /* Digraph form. */
1628 case CPP_NAME: return ((b == CPP_NUMBER
1629 && name_p (pfile, &token2->val.str))
1630 || b == CPP_NAME
1631 || b == CPP_CHAR || b == CPP_STRING); /* L */
1632 case CPP_NUMBER: return (b == CPP_NUMBER || b == CPP_NAME
1633 || c == '.' || c == '+' || c == '-');
1634 case CPP_OTHER: return (CPP_OPTION (pfile, objc)
1635 && token1->val.c == '@'
1636 && (b == CPP_NAME || b == CPP_STRING));
1637 default: break;
1640 return 0;
1643 /* Output all the remaining tokens on the current line, and a newline
1644 character, to FP. Leading whitespace is removed. */
1645 void
1646 cpp_output_line (pfile, fp)
1647 cpp_reader *pfile;
1648 FILE *fp;
1650 cpp_token token;
1652 cpp_get_token (pfile, &token);
1653 token.flags &= ~PREV_WHITE;
1654 while (token.type != CPP_EOF)
1656 cpp_output_token (&token, fp);
1657 cpp_get_token (pfile, &token);
1660 putc ('\n', fp);
/* Returns the value of a hexadecimal digit.  C must be one of
   '0'-'9', 'a'-'f' or 'A'-'F'; any other value aborts.  */
static unsigned int
hex_digit_value (c)
     unsigned int c;
{
  if (c >= 'a' && c <= 'f')
    return c - 'a' + 10;
  if (c >= 'A' && c <= 'F')
    return c - 'A' + 10;
  if (c >= '0' && c <= '9')
    return c - '0';
  abort ();
}
1677 /* Parse a '\uNNNN' or '\UNNNNNNNN' sequence (C++ and C99).
1679 [lex.charset]: The character designated by the universal character
1680 name \UNNNNNNNN is that character whose character short name in
1681 ISO/IEC 10646 is NNNNNNNN; the character designated by the
1682 universal character name \uNNNN is that character whose character
1683 short name in ISO/IEC 10646 is 0000NNNN. If the hexadecimal value
1684 for a universal character name is less than 0x20 or in the range
1685 0x7F-0x9F (inclusive), or if the universal character name
1686 designates a character in the basic source character set, then the
1687 program is ill-formed.
1689 We assume that wchar_t is Unicode, so we don't need to do any
1690 mapping. Is this ever wrong? */
1692 static unsigned int
1693 read_ucs (pfile, pstr, limit, length)
1694 cpp_reader *pfile;
1695 const unsigned char **pstr;
1696 const unsigned char *limit;
1697 unsigned int length;
1699 const unsigned char *p = *pstr;
1700 unsigned int c, code = 0;
1702 for (; length; --length)
1704 if (p >= limit)
1706 cpp_error (pfile, "incomplete universal-character-name");
1707 break;
1710 c = *p;
1711 if (ISXDIGIT (c))
1713 code = (code << 4) + hex_digit_value (c);
1714 p++;
1716 else
1718 cpp_error (pfile,
1719 "non-hex digit '%c' in universal-character-name", c);
1720 break;
1725 #ifdef TARGET_EBCDIC
1726 cpp_error (pfile, "universal-character-name on EBCDIC target");
1727 code = 0x3f; /* EBCDIC invalid character */
1728 #else
1729 if (code > 0x9f && !(code & 0x80000000))
1730 ; /* True extended character, OK. */
1731 else if (code >= 0x20 && code < 0x7f)
1733 /* ASCII printable character. The C character set consists of all of
1734 these except $, @ and `. We use hex escapes so that this also
1735 works with EBCDIC hosts. */
1736 if (code != 0x24 && code != 0x40 && code != 0x60)
1737 cpp_error (pfile, "universal-character-name used for '%c'", code);
1739 else
1740 cpp_error (pfile, "invalid universal-character-name");
1741 #endif
1743 *pstr = p;
1744 return code;
1747 /* Interpret an escape sequence, and return its value. PSTR points to
1748 the input pointer, which is just after the backslash. LIMIT is how
1749 much text we have. MASK is the precision for the target type (char
1750 or wchar_t). TRADITIONAL, if true, does not interpret escapes that
1751 did not exist in traditional C. */
1753 static unsigned int
1754 parse_escape (pfile, pstr, limit, mask, traditional)
1755 cpp_reader *pfile;
1756 const unsigned char **pstr;
1757 const unsigned char *limit;
1758 HOST_WIDE_INT mask;
1759 int traditional;
1761 int unknown = 0;
1762 const unsigned char *str = *pstr;
1763 unsigned int c = *str++;
1765 switch (c)
1767 case '\\': case '\'': case '"': case '?': break;
1768 case 'b': c = TARGET_BS; break;
1769 case 'f': c = TARGET_FF; break;
1770 case 'n': c = TARGET_NEWLINE; break;
1771 case 'r': c = TARGET_CR; break;
1772 case 't': c = TARGET_TAB; break;
1773 case 'v': c = TARGET_VT; break;
1775 case '(': case '{': case '[': case '%':
1776 /* '\(', etc, are used at beginning of line to avoid confusing Emacs.
1777 '\%' is used to prevent SCCS from getting confused. */
1778 unknown = CPP_PEDANTIC (pfile);
1779 break;
1781 case 'a':
1782 if (CPP_WTRADITIONAL (pfile))
1783 cpp_warning (pfile, "the meaning of '\\a' varies with -traditional");
1784 if (!traditional)
1785 c = TARGET_BELL;
1786 break;
1788 case 'e': case 'E':
1789 if (CPP_PEDANTIC (pfile))
1790 cpp_pedwarn (pfile, "non-ISO-standard escape sequence, '\\%c'", c);
1791 c = TARGET_ESC;
1792 break;
1794 /* Warnings and support checks handled by read_ucs(). */
1795 case 'u': case 'U':
1796 if (CPP_OPTION (pfile, cplusplus) || CPP_OPTION (pfile, c99))
1798 if (CPP_WTRADITIONAL (pfile))
1799 cpp_warning (pfile,
1800 "the meaning of '\\%c' varies with -traditional", c);
1801 c = read_ucs (pfile, &str, limit, c == 'u' ? 4 : 8);
1803 else
1804 unknown = 1;
1805 break;
1807 case 'x':
1808 if (CPP_WTRADITIONAL (pfile))
1809 cpp_warning (pfile, "the meaning of '\\x' varies with -traditional");
1811 if (!traditional)
1813 unsigned int i = 0, overflow = 0;
1814 int digits_found = 0;
1816 while (str < limit)
1818 c = *str;
1819 if (! ISXDIGIT (c))
1820 break;
1821 str++;
1822 overflow |= i ^ (i << 4 >> 4);
1823 i = (i << 4) + hex_digit_value (c);
1824 digits_found = 1;
1827 if (!digits_found)
1828 cpp_error (pfile, "\\x used with no following hex digits");
1830 if (overflow | (i != (i & mask)))
1832 cpp_pedwarn (pfile, "hex escape sequence out of range");
1833 i &= mask;
1835 c = i;
1837 break;
1839 case '0': case '1': case '2': case '3':
1840 case '4': case '5': case '6': case '7':
1842 unsigned int i = c - '0';
1843 int count = 0;
1845 while (str < limit && ++count < 3)
1847 c = *str;
1848 if (c < '0' || c > '7')
1849 break;
1850 str++;
1851 i = (i << 3) + c - '0';
1854 if (i != (i & mask))
1856 cpp_pedwarn (pfile, "octal escape sequence out of range");
1857 i &= mask;
1859 c = i;
1861 break;
1863 default:
1864 unknown = 1;
1865 break;
1868 if (unknown)
1870 if (ISGRAPH (c))
1871 cpp_pedwarn (pfile, "unknown escape sequence '\\%c'", c);
1872 else
1873 cpp_pedwarn (pfile, "unknown escape sequence: '\\%03o'", c);
1876 *pstr = str;
1877 return c;
1880 #ifndef MAX_CHAR_TYPE_SIZE
1881 #define MAX_CHAR_TYPE_SIZE CHAR_TYPE_SIZE
1882 #endif
1884 #ifndef MAX_WCHAR_TYPE_SIZE
1885 #define MAX_WCHAR_TYPE_SIZE WCHAR_TYPE_SIZE
1886 #endif
1888 /* Interpret a (possibly wide) character constant in TOKEN.
1889 WARN_MULTI warns about multi-character charconsts, if not
1890 TRADITIONAL. TRADITIONAL also indicates not to interpret escapes
1891 that did not exist in traditional C. PCHARS_SEEN points to a
1892 variable that is filled in with the number of characters seen. */
1893 HOST_WIDE_INT
1894 cpp_interpret_charconst (pfile, token, warn_multi, traditional, pchars_seen)
1895 cpp_reader *pfile;
1896 const cpp_token *token;
1897 int warn_multi;
1898 int traditional;
1899 unsigned int *pchars_seen;
1901 const unsigned char *str = token->val.str.text;
1902 const unsigned char *limit = str + token->val.str.len;
1903 unsigned int chars_seen = 0;
1904 unsigned int width, max_chars, c;
1905 HOST_WIDE_INT result = 0, mask;
1907 #ifdef MULTIBYTE_CHARS
1908 (void) local_mbtowc (NULL, NULL, 0);
1909 #endif
1911 /* Width in bits. */
1912 if (token->type == CPP_CHAR)
1913 width = MAX_CHAR_TYPE_SIZE;
1914 else
1915 width = MAX_WCHAR_TYPE_SIZE;
1917 if (width < HOST_BITS_PER_WIDE_INT)
1918 mask = ((unsigned HOST_WIDE_INT) 1 << width) - 1;
1919 else
1920 mask = ~0;
1921 max_chars = HOST_BITS_PER_WIDE_INT / width;
1923 while (str < limit)
1925 #ifdef MULTIBYTE_CHARS
1926 wchar_t wc;
1927 int char_len;
1929 char_len = local_mbtowc (&wc, str, limit - str);
1930 if (char_len == -1)
1932 cpp_warning (pfile, "ignoring invalid multibyte character");
1933 c = *str++;
1935 else
1937 str += char_len;
1938 c = wc;
1940 #else
1941 c = *str++;
1942 #endif
1944 if (c == '\\')
1946 c = parse_escape (pfile, &str, limit, mask, traditional);
1947 if (width < HOST_BITS_PER_WIDE_INT && c > mask)
1948 cpp_pedwarn (pfile, "escape sequence out of range for character");
1951 #ifdef MAP_CHARACTER
1952 if (ISPRINT (c))
1953 c = MAP_CHARACTER (c);
1954 #endif
1956 /* Merge character into result; ignore excess chars. */
1957 if (++chars_seen <= max_chars)
1959 if (width < HOST_BITS_PER_WIDE_INT)
1960 result = (result << width) | (c & mask);
1961 else
1962 result = c;
1966 if (chars_seen == 0)
1967 cpp_error (pfile, "empty character constant");
1968 else if (chars_seen > max_chars)
1970 chars_seen = max_chars;
1971 cpp_error (pfile, "character constant too long");
1973 else if (chars_seen > 1 && !traditional && warn_multi)
1974 cpp_warning (pfile, "multi-character character constant");
1976 /* If char type is signed, sign-extend the constant. The
1977 __CHAR_UNSIGNED__ macro is set by the driver if appropriate. */
1978 if (token->type == CPP_CHAR && chars_seen)
1980 unsigned int nbits = chars_seen * width;
1981 unsigned int mask = (unsigned int) ~0 >> (HOST_BITS_PER_INT - nbits);
1983 if (pfile->spec_nodes.n__CHAR_UNSIGNED__->type == NT_MACRO
1984 || ((result >> (nbits - 1)) & 1) == 0)
1985 result &= mask;
1986 else
1987 result |= ~mask;
1990 *pchars_seen = chars_seen;
1991 return result;
/* Memory pools.  */

/* The offset of U within this structure is the padding the compiler
   inserts after a single char so that a double or a pointer is
   properly aligned; it serves as the default pool alignment.  */
struct dummy
{
  char c;
  union
  {
    double d;
    int *p;
  } u;
};

#define DEFAULT_ALIGNMENT (offsetof (struct dummy, u))
2008 static int
2009 chunk_suitable (pool, chunk, size)
2010 cpp_pool *pool;
2011 cpp_chunk *chunk;
2012 unsigned int size;
2014 /* Being at least twice SIZE means we can use memcpy in
2015 _cpp_next_chunk rather than memmove. Besides, it's a good idea
2016 anyway. */
2017 return (chunk && pool->locked != chunk
2018 && (unsigned int) (chunk->limit - chunk->base) >= size * 2);
2021 /* Returns the end of the new pool. PTR points to a char in the old
2022 pool, and is updated to point to the same char in the new pool. */
2023 unsigned char *
2024 _cpp_next_chunk (pool, len, ptr)
2025 cpp_pool *pool;
2026 unsigned int len;
2027 unsigned char **ptr;
2029 cpp_chunk *chunk = pool->cur->next;
2031 /* LEN is the minimum size we want in the new pool. */
2032 len += POOL_ROOM (pool);
2033 if (! chunk_suitable (pool, chunk, len))
2035 chunk = new_chunk (POOL_SIZE (pool) * 2 + len);
2037 chunk->next = pool->cur->next;
2038 pool->cur->next = chunk;
2041 /* Update the pointer before changing chunk's front. */
2042 if (ptr)
2043 *ptr += chunk->base - POOL_FRONT (pool);
2045 memcpy (chunk->base, POOL_FRONT (pool), POOL_ROOM (pool));
2046 chunk->front = chunk->base;
2048 pool->cur = chunk;
2049 return POOL_LIMIT (pool);
2052 static cpp_chunk *
2053 new_chunk (size)
2054 unsigned int size;
2056 unsigned char *base;
2057 cpp_chunk *result;
2059 size = POOL_ALIGN (size, DEFAULT_ALIGNMENT);
2060 base = (unsigned char *) xmalloc (size + sizeof (cpp_chunk));
2061 /* Put the chunk descriptor at the end. Then chunk overruns will
2062 cause obvious chaos. */
2063 result = (cpp_chunk *) (base + size);
2064 result->base = base;
2065 result->front = base;
2066 result->limit = base + size;
2067 result->next = 0;
2069 return result;
2072 void
2073 _cpp_init_pool (pool, size, align, temp)
2074 cpp_pool *pool;
2075 unsigned int size, align, temp;
2077 if (align == 0)
2078 align = DEFAULT_ALIGNMENT;
2079 if (align & (align - 1))
2080 abort ();
2081 pool->align = align;
2082 pool->cur = new_chunk (size);
2083 pool->locked = 0;
2084 pool->locks = 0;
2085 if (temp)
2086 pool->cur->next = pool->cur;
2089 void
2090 _cpp_lock_pool (pool)
2091 cpp_pool *pool;
2093 if (pool->locks++ == 0)
2094 pool->locked = pool->cur;
2097 void
2098 _cpp_unlock_pool (pool)
2099 cpp_pool *pool;
2101 if (--pool->locks == 0)
2102 pool->locked = 0;
2105 void
2106 _cpp_free_pool (pool)
2107 cpp_pool *pool;
2109 cpp_chunk *chunk = pool->cur, *next;
2113 next = chunk->next;
2114 free (chunk->base);
2115 chunk = next;
2117 while (chunk && chunk != pool->cur);
2120 /* Reserve LEN bytes from a memory pool. */
2121 unsigned char *
2122 _cpp_pool_reserve (pool, len)
2123 cpp_pool *pool;
2124 unsigned int len;
2126 len = POOL_ALIGN (len, pool->align);
2127 if (len > (unsigned int) POOL_ROOM (pool))
2128 _cpp_next_chunk (pool, len, 0);
2130 return POOL_FRONT (pool);
2133 /* Allocate LEN bytes from a memory pool. */
2134 unsigned char *
2135 _cpp_pool_alloc (pool, len)
2136 cpp_pool *pool;
2137 unsigned int len;
2139 unsigned char *result = _cpp_pool_reserve (pool, len);
2141 POOL_COMMIT (pool, len);
2142 return result;