2001-06-22 Jan van Male <jan.vanmale@fenk.wau.nl>
[official-gcc.git] / gcc / cpplex.c
blob7424827228fcc62ba697e4c0d52644c5c7e6398b
1 /* CPP Library - lexical analysis.
2 Copyright (C) 2000 Free Software Foundation, Inc.
3 Contributed by Per Bothner, 1994-95.
4 Based on CCCP program by Paul Rubin, June 1986
5 Adapted to ANSI C, Richard Stallman, Jan 1987
6 Broken out to separate file, Zack Weinberg, Mar 2000
7 Single-pass line tokenization by Neil Booth, April 2000
9 This program is free software; you can redistribute it and/or modify it
10 under the terms of the GNU General Public License as published by the
11 Free Software Foundation; either version 2, or (at your option) any
12 later version.
14 This program is distributed in the hope that it will be useful,
15 but WITHOUT ANY WARRANTY; without even the implied warranty of
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 GNU General Public License for more details.
19 You should have received a copy of the GNU General Public License
20 along with this program; if not, write to the Free Software
21 Foundation, 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
23 /* This lexer works with a single pass of the file. Recently I
24 re-wrote it to minimize the places where we step backwards in the
25 input stream, to make future changes to support multi-byte
26 character sets fairly straight-forward.
28 There is now only one routine where we do step backwards:
29 skip_escaped_newlines. This routine could probably also be changed
30 so that it doesn't need to step back. One possibility is to use a
31 trick similar to that used in lex_period and lex_percent. Two
32 extra characters might be needed, but skip_escaped_newlines itself
33 would probably be the only place that needs to be aware of that,
34 and changes to the remaining routines would probably only be needed
35 if they process a backslash. */
37 #include "config.h"
38 #include "system.h"
39 #include "cpplib.h"
40 #include "cpphash.h"
42 /* MULTIBYTE_CHARS support only works for native compilers.
43 ??? Ideally what we want is to model widechar support after
44 the current floating point support. */
45 #ifdef CROSS_COMPILE
46 #undef MULTIBYTE_CHARS
47 #endif
49 #ifdef MULTIBYTE_CHARS
50 #include "mbchar.h"
51 #include <locale.h>
52 #endif
/* Spelling category of a token type, as recorded in token_spellings.
   Tokens with SPELL_STRING store their spelling in the token list,
   and its length in token->val.str.len.  */
enum spell_type
{
  SPELL_OPERATOR = 0,
  SPELL_CHAR,
  SPELL_IDENT,
  SPELL_STRING,
  SPELL_NONE
};
65 struct token_spelling
67 enum spell_type category;
68 const unsigned char *name;
71 const unsigned char *digraph_spellings [] = {U"%:", U"%:%:", U"<:",
72 U":>", U"<%", U"%>"};
74 #define OP(e, s) { SPELL_OPERATOR, U s },
75 #define TK(e, s) { s, U STRINGX (e) },
76 const struct token_spelling token_spellings [N_TTYPES] = {TTYPE_TABLE };
77 #undef OP
78 #undef TK
80 #define TOKEN_SPELL(token) (token_spellings[(token)->type].category)
81 #define TOKEN_NAME(token) (token_spellings[(token)->type].name)
83 static cppchar_t handle_newline PARAMS ((cpp_buffer *, cppchar_t));
84 static cppchar_t skip_escaped_newlines PARAMS ((cpp_buffer *, cppchar_t));
85 static cppchar_t get_effective_char PARAMS ((cpp_buffer *));
87 static int skip_block_comment PARAMS ((cpp_reader *));
88 static int skip_line_comment PARAMS ((cpp_reader *));
89 static void adjust_column PARAMS ((cpp_reader *));
90 static void skip_whitespace PARAMS ((cpp_reader *, cppchar_t));
91 static cpp_hashnode *parse_identifier PARAMS ((cpp_reader *, cppchar_t));
92 static void parse_number PARAMS ((cpp_reader *, cpp_string *, cppchar_t, int));
93 static int unescaped_terminator_p PARAMS ((cpp_reader *, const U_CHAR *));
94 static void parse_string PARAMS ((cpp_reader *, cpp_token *, cppchar_t));
95 static void unterminated PARAMS ((cpp_reader *, int));
96 static int trigraph_ok PARAMS ((cpp_reader *, cppchar_t));
97 static void save_comment PARAMS ((cpp_reader *, cpp_token *, const U_CHAR *));
98 static void lex_percent PARAMS ((cpp_buffer *, cpp_token *));
99 static void lex_dot PARAMS ((cpp_reader *, cpp_token *));
100 static int name_p PARAMS ((cpp_reader *, const cpp_string *));
101 static int maybe_read_ucs PARAMS ((cpp_reader *, const unsigned char **,
102 const unsigned char *, unsigned int *));
104 static cpp_chunk *new_chunk PARAMS ((unsigned int));
105 static int chunk_suitable PARAMS ((cpp_pool *, cpp_chunk *, unsigned int));
106 static unsigned int hex_digit_value PARAMS ((unsigned int));
108 /* Utility routine:
110 Compares, the token TOKEN to the NUL-terminated string STRING.
111 TOKEN must be a CPP_NAME. Returns 1 for equal, 0 for unequal. */
114 cpp_ideq (token, string)
115 const cpp_token *token;
116 const char *string;
118 if (token->type != CPP_NAME)
119 return 0;
121 return !ustrcmp (NODE_NAME (token->val.node), (const U_CHAR *) string);
124 /* Call when meeting a newline. Returns the character after the newline
125 (or carriage-return newline combination), or EOF. */
126 static cppchar_t
127 handle_newline (buffer, newline_char)
128 cpp_buffer *buffer;
129 cppchar_t newline_char;
131 cppchar_t next = EOF;
133 buffer->col_adjust = 0;
134 buffer->lineno++;
135 buffer->line_base = buffer->cur;
137 /* Handle CR-LF and LF-CR combinations, get the next character. */
138 if (buffer->cur < buffer->rlimit)
140 next = *buffer->cur++;
141 if (next + newline_char == '\r' + '\n')
143 buffer->line_base = buffer->cur;
144 if (buffer->cur < buffer->rlimit)
145 next = *buffer->cur++;
146 else
147 next = EOF;
151 buffer->read_ahead = next;
152 return next;
155 /* Subroutine of skip_escaped_newlines; called when a trigraph is
156 encountered. It warns if necessary, and returns true if the
157 trigraph should be honoured. FROM_CHAR is the third character of a
158 trigraph, and presumed to be the previous character for position
159 reporting. */
160 static int
161 trigraph_ok (pfile, from_char)
162 cpp_reader *pfile;
163 cppchar_t from_char;
165 int accept = CPP_OPTION (pfile, trigraphs);
167 /* Don't warn about trigraphs in comments. */
168 if (CPP_OPTION (pfile, warn_trigraphs) && !pfile->state.lexing_comment)
170 cpp_buffer *buffer = pfile->buffer;
171 if (accept)
172 cpp_warning_with_line (pfile, buffer->lineno, CPP_BUF_COL (buffer) - 2,
173 "trigraph ??%c converted to %c",
174 (int) from_char,
175 (int) _cpp_trigraph_map[from_char]);
176 else if (buffer->cur != buffer->last_Wtrigraphs)
178 buffer->last_Wtrigraphs = buffer->cur;
179 cpp_warning_with_line (pfile, buffer->lineno,
180 CPP_BUF_COL (buffer) - 2,
181 "trigraph ??%c ignored", (int) from_char);
185 return accept;
/* Set the type of the token being built to T and discard the
   read-ahead character.  Assumes local variables buffer and result.  */
#define ACCEPT_CHAR(t) \
  do { result->type = t; buffer->read_ahead = EOF; } while (0)

/* Save and restore the buffer's read position; both assume local
   variables buffer and saved_cur.

   When we move to multibyte character sets, add to these something
   that saves and restores the state of the multibyte conversion
   library.  This probably involves saving and restoring a "cookie".
   In the case of glibc it is an 8-byte structure, so is not a high
   overhead operation.  In any case, it's out of the fast path.  */
#define SAVE_STATE() do { saved_cur = buffer->cur; } while (0)
#define RESTORE_STATE() do { buffer->cur = saved_cur; } while (0)
200 /* Skips any escaped newlines introduced by NEXT, which is either a
201 '?' or a '\\'. Returns the next character, which will also have
202 been placed in buffer->read_ahead. This routine performs
203 preprocessing stages 1 and 2 of the ISO C standard. */
204 static cppchar_t
205 skip_escaped_newlines (buffer, next)
206 cpp_buffer *buffer;
207 cppchar_t next;
209 /* Only do this if we apply stages 1 and 2. */
210 if (!buffer->from_stage3)
212 cppchar_t next1;
213 const unsigned char *saved_cur;
214 int space;
218 if (buffer->cur == buffer->rlimit)
219 break;
221 SAVE_STATE ();
222 if (next == '?')
224 next1 = *buffer->cur++;
225 if (next1 != '?' || buffer->cur == buffer->rlimit)
227 RESTORE_STATE ();
228 break;
231 next1 = *buffer->cur++;
232 if (!_cpp_trigraph_map[next1]
233 || !trigraph_ok (buffer->pfile, next1))
235 RESTORE_STATE ();
236 break;
239 /* We have a full trigraph here. */
240 next = _cpp_trigraph_map[next1];
241 if (next != '\\' || buffer->cur == buffer->rlimit)
242 break;
243 SAVE_STATE ();
246 /* We have a backslash, and room for at least one more character. */
247 space = 0;
250 next1 = *buffer->cur++;
251 if (!is_nvspace (next1))
252 break;
253 space = 1;
255 while (buffer->cur < buffer->rlimit);
257 if (!is_vspace (next1))
259 RESTORE_STATE ();
260 break;
263 if (space && !buffer->pfile->state.lexing_comment)
264 cpp_warning (buffer->pfile,
265 "backslash and newline separated by space");
267 next = handle_newline (buffer, next1);
268 if (next == EOF)
269 cpp_pedwarn (buffer->pfile, "backslash-newline at end of file");
271 while (next == '\\' || next == '?');
274 buffer->read_ahead = next;
275 return next;
278 /* Obtain the next character, after trigraph conversion and skipping
279 an arbitrary string of escaped newlines. The common case of no
280 trigraphs or escaped newlines falls through quickly. */
281 static cppchar_t
282 get_effective_char (buffer)
283 cpp_buffer *buffer;
285 cppchar_t next = EOF;
287 if (buffer->cur < buffer->rlimit)
289 next = *buffer->cur++;
291 /* '?' can introduce trigraphs (and therefore backslash); '\\'
292 can introduce escaped newlines, which we want to skip, or
293 UCNs, which, depending upon lexer state, we will handle in
294 the future. */
295 if (next == '?' || next == '\\')
296 next = skip_escaped_newlines (buffer, next);
299 buffer->read_ahead = next;
300 return next;
303 /* Skip a C-style block comment. We find the end of the comment by
304 seeing if an asterisk is before every '/' we encounter. Returns
305 non-zero if comment terminated by EOF, zero otherwise. */
306 static int
307 skip_block_comment (pfile)
308 cpp_reader *pfile;
310 cpp_buffer *buffer = pfile->buffer;
311 cppchar_t c = EOF, prevc = EOF;
313 pfile->state.lexing_comment = 1;
314 while (buffer->cur != buffer->rlimit)
316 prevc = c, c = *buffer->cur++;
318 next_char:
319 /* FIXME: For speed, create a new character class of characters
320 of interest inside block comments. */
321 if (c == '?' || c == '\\')
322 c = skip_escaped_newlines (buffer, c);
324 /* People like decorating comments with '*', so check for '/'
325 instead for efficiency. */
326 if (c == '/')
328 if (prevc == '*')
329 break;
331 /* Warn about potential nested comments, but not if the '/'
332 comes immediately before the true comment delimeter.
333 Don't bother to get it right across escaped newlines. */
334 if (CPP_OPTION (pfile, warn_comments)
335 && buffer->cur != buffer->rlimit)
337 prevc = c, c = *buffer->cur++;
338 if (c == '*' && buffer->cur != buffer->rlimit)
340 prevc = c, c = *buffer->cur++;
341 if (c != '/')
342 cpp_warning_with_line (pfile, CPP_BUF_LINE (buffer),
343 CPP_BUF_COL (buffer),
344 "\"/*\" within comment");
346 goto next_char;
349 else if (is_vspace (c))
351 prevc = c, c = handle_newline (buffer, c);
352 goto next_char;
354 else if (c == '\t')
355 adjust_column (pfile);
358 pfile->state.lexing_comment = 0;
359 buffer->read_ahead = EOF;
360 return c != '/' || prevc != '*';
363 /* Skip a C++ line comment. Handles escaped newlines. Returns
364 non-zero if a multiline comment. The following new line, if any,
365 is left in buffer->read_ahead. */
366 static int
367 skip_line_comment (pfile)
368 cpp_reader *pfile;
370 cpp_buffer *buffer = pfile->buffer;
371 unsigned int orig_lineno = buffer->lineno;
372 cppchar_t c;
374 pfile->state.lexing_comment = 1;
377 c = EOF;
378 if (buffer->cur == buffer->rlimit)
379 break;
381 c = *buffer->cur++;
382 if (c == '?' || c == '\\')
383 c = skip_escaped_newlines (buffer, c);
385 while (!is_vspace (c));
387 pfile->state.lexing_comment = 0;
388 buffer->read_ahead = c; /* Leave any newline for caller. */
389 return orig_lineno != buffer->lineno;
392 /* pfile->buffer->cur is one beyond the \t character. Update
393 col_adjust so we track the column correctly. */
394 static void
395 adjust_column (pfile)
396 cpp_reader *pfile;
398 cpp_buffer *buffer = pfile->buffer;
399 unsigned int col = CPP_BUF_COL (buffer) - 1; /* Zero-based column. */
401 /* Round it up to multiple of the tabstop, but subtract 1 since the
402 tab itself occupies a character position. */
403 buffer->col_adjust += (CPP_OPTION (pfile, tabstop)
404 - col % CPP_OPTION (pfile, tabstop)) - 1;
407 /* Skips whitespace, saving the next non-whitespace character.
408 Adjusts pfile->col_adjust to account for tabs. Without this,
409 tokens might be assigned an incorrect column. */
410 static void
411 skip_whitespace (pfile, c)
412 cpp_reader *pfile;
413 cppchar_t c;
415 cpp_buffer *buffer = pfile->buffer;
416 unsigned int warned = 0;
420 /* Horizontal space always OK. */
421 if (c == ' ')
423 else if (c == '\t')
424 adjust_column (pfile);
425 /* Just \f \v or \0 left. */
426 else if (c == '\0')
428 if (!warned)
430 cpp_warning (pfile, "null character(s) ignored");
431 warned = 1;
434 else if (pfile->state.in_directive && CPP_PEDANTIC (pfile))
435 cpp_pedwarn_with_line (pfile, CPP_BUF_LINE (buffer),
436 CPP_BUF_COL (buffer),
437 "%s in preprocessing directive",
438 c == '\f' ? "form feed" : "vertical tab");
440 c = EOF;
441 if (buffer->cur == buffer->rlimit)
442 break;
443 c = *buffer->cur++;
445 /* We only want non-vertical space, i.e. ' ' \t \f \v \0. */
446 while (is_nvspace (c));
448 /* Remember the next character. */
449 buffer->read_ahead = c;
452 /* See if the characters of a number token are valid in a name (no
453 '.', '+' or '-'). */
454 static int
455 name_p (pfile, string)
456 cpp_reader *pfile;
457 const cpp_string *string;
459 unsigned int i;
461 for (i = 0; i < string->len; i++)
462 if (!is_idchar (string->text[i]))
463 return 0;
465 return 1;
468 /* Parse an identifier, skipping embedded backslash-newlines.
469 Calculate the hash value of the token while parsing, for improved
470 performance. The hashing algorithm *must* match cpp_lookup(). */
472 static cpp_hashnode *
473 parse_identifier (pfile, c)
474 cpp_reader *pfile;
475 cppchar_t c;
477 cpp_hashnode *result;
478 cpp_buffer *buffer = pfile->buffer;
479 unsigned int saw_dollar = 0, len;
480 struct obstack *stack = &pfile->hash_table->stack;
486 obstack_1grow (stack, c);
488 if (c == '$')
489 saw_dollar++;
491 c = EOF;
492 if (buffer->cur == buffer->rlimit)
493 break;
495 c = *buffer->cur++;
497 while (is_idchar (c));
499 /* Potential escaped newline? */
500 if (c != '?' && c != '\\')
501 break;
502 c = skip_escaped_newlines (buffer, c);
504 while (is_idchar (c));
506 /* Remember the next character. */
507 buffer->read_ahead = c;
509 /* $ is not a identifier character in the standard, but is commonly
510 accepted as an extension. Don't warn about it in skipped
511 conditional blocks. */
512 if (saw_dollar && CPP_PEDANTIC (pfile) && ! pfile->skipping)
513 cpp_pedwarn (pfile, "'$' character(s) in identifier");
515 /* Identifiers are null-terminated. */
516 len = obstack_object_size (stack);
517 obstack_1grow (stack, '\0');
519 /* This routine commits the memory if necessary. */
520 result = (cpp_hashnode *)
521 ht_lookup (pfile->hash_table, obstack_finish (stack), len, HT_ALLOCED);
523 /* Some identifiers require diagnostics when lexed. */
524 if (result->flags & NODE_DIAGNOSTIC && !pfile->skipping)
526 /* It is allowed to poison the same identifier twice. */
527 if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
528 cpp_error (pfile, "attempt to use poisoned \"%s\"",
529 NODE_NAME (result));
531 /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
532 replacement list of a variadic macro. */
533 if (result == pfile->spec_nodes.n__VA_ARGS__
534 && !pfile->state.va_args_ok)
535 cpp_pedwarn (pfile, "__VA_ARGS__ can only appear in the expansion of a C99 variadic macro");
538 return result;
541 /* Parse a number, skipping embedded backslash-newlines. */
542 static void
543 parse_number (pfile, number, c, leading_period)
544 cpp_reader *pfile;
545 cpp_string *number;
546 cppchar_t c;
547 int leading_period;
549 cpp_buffer *buffer = pfile->buffer;
550 cpp_pool *pool = &pfile->ident_pool;
551 unsigned char *dest, *limit;
553 dest = POOL_FRONT (pool);
554 limit = POOL_LIMIT (pool);
556 /* Place a leading period. */
557 if (leading_period)
559 if (dest >= limit)
560 limit = _cpp_next_chunk (pool, 0, &dest);
561 *dest++ = '.';
568 /* Need room for terminating null. */
569 if (dest + 1 >= limit)
570 limit = _cpp_next_chunk (pool, 0, &dest);
571 *dest++ = c;
573 c = EOF;
574 if (buffer->cur == buffer->rlimit)
575 break;
577 c = *buffer->cur++;
579 while (is_numchar (c) || c == '.' || VALID_SIGN (c, dest[-1]));
581 /* Potential escaped newline? */
582 if (c != '?' && c != '\\')
583 break;
584 c = skip_escaped_newlines (buffer, c);
586 while (is_numchar (c) || c == '.' || VALID_SIGN (c, dest[-1]));
588 /* Remember the next character. */
589 buffer->read_ahead = c;
591 /* Null-terminate the number. */
592 *dest = '\0';
594 number->text = POOL_FRONT (pool);
595 number->len = dest - number->text;
596 POOL_COMMIT (pool, number->len + 1);
599 /* Subroutine of parse_string. Emits error for unterminated strings. */
600 static void
601 unterminated (pfile, term)
602 cpp_reader *pfile;
603 int term;
605 cpp_error (pfile, "missing terminating %c character", term);
607 if (term == '\"' && pfile->mlstring_pos.line
608 && pfile->mlstring_pos.line != pfile->lexer_pos.line)
610 cpp_error_with_line (pfile, pfile->mlstring_pos.line,
611 pfile->mlstring_pos.col,
612 "possible start of unterminated string literal");
613 pfile->mlstring_pos.line = 0;
617 /* Subroutine of parse_string. */
618 static int
619 unescaped_terminator_p (pfile, dest)
620 cpp_reader *pfile;
621 const unsigned char *dest;
623 const unsigned char *start, *temp;
625 /* In #include-style directives, terminators are not escapeable. */
626 if (pfile->state.angled_headers)
627 return 1;
629 start = POOL_FRONT (&pfile->ident_pool);
631 /* An odd number of consecutive backslashes represents an escaped
632 terminator. */
633 for (temp = dest; temp > start && temp[-1] == '\\'; temp--)
636 return ((dest - temp) & 1) == 0;
639 /* Parses a string, character constant, or angle-bracketed header file
640 name. Handles embedded trigraphs and escaped newlines. The stored
641 string is guaranteed NUL-terminated, but it is not guaranteed that
642 this is the first NUL since embedded NULs are preserved.
644 Multi-line strings are allowed, but they are deprecated. */
645 static void
646 parse_string (pfile, token, terminator)
647 cpp_reader *pfile;
648 cpp_token *token;
649 cppchar_t terminator;
651 cpp_buffer *buffer = pfile->buffer;
652 cpp_pool *pool = &pfile->ident_pool;
653 unsigned char *dest, *limit;
654 cppchar_t c;
655 unsigned int nulls = 0;
657 dest = POOL_FRONT (pool);
658 limit = POOL_LIMIT (pool);
660 for (;;)
662 if (buffer->cur == buffer->rlimit)
663 c = EOF;
664 else
665 c = *buffer->cur++;
667 have_char:
668 /* We need space for the terminating NUL. */
669 if (dest >= limit)
670 limit = _cpp_next_chunk (pool, 0, &dest);
672 if (c == EOF)
674 unterminated (pfile, terminator);
675 break;
678 /* Handle trigraphs, escaped newlines etc. */
679 if (c == '?' || c == '\\')
680 c = skip_escaped_newlines (buffer, c);
682 if (c == terminator && unescaped_terminator_p (pfile, dest))
684 c = EOF;
685 break;
687 else if (is_vspace (c))
689 /* In assembly language, silently terminate string and
690 character literals at end of line. This is a kludge
691 around not knowing where comments are. */
692 if (CPP_OPTION (pfile, lang) == CLK_ASM && terminator != '>')
693 break;
695 /* Character constants and header names may not extend over
696 multiple lines. In Standard C, neither may strings.
697 Unfortunately, we accept multiline strings as an
698 extension, except in #include family directives. */
699 if (terminator != '"' || pfile->state.angled_headers)
701 unterminated (pfile, terminator);
702 break;
705 cpp_pedwarn (pfile, "multi-line string literals are deprecated");
706 if (pfile->mlstring_pos.line == 0)
707 pfile->mlstring_pos = pfile->lexer_pos;
709 c = handle_newline (buffer, c);
710 *dest++ = '\n';
711 goto have_char;
713 else if (c == '\0')
715 if (nulls++ == 0)
716 cpp_warning (pfile, "null character(s) preserved in literal");
719 *dest++ = c;
722 /* Remember the next character. */
723 buffer->read_ahead = c;
724 *dest = '\0';
726 token->val.str.text = POOL_FRONT (pool);
727 token->val.str.len = dest - token->val.str.text;
728 POOL_COMMIT (pool, token->val.str.len + 1);
731 /* The stored comment includes the comment start and any terminator. */
732 static void
733 save_comment (pfile, token, from)
734 cpp_reader *pfile;
735 cpp_token *token;
736 const unsigned char *from;
738 unsigned char *buffer;
739 unsigned int len;
741 len = pfile->buffer->cur - from + 1; /* + 1 for the initial '/'. */
742 /* C++ comments probably (not definitely) have moved past a new
743 line, which we don't want to save in the comment. */
744 if (pfile->buffer->read_ahead != EOF)
745 len--;
746 buffer = _cpp_pool_alloc (&pfile->ident_pool, len);
748 token->type = CPP_COMMENT;
749 token->val.str.len = len;
750 token->val.str.text = buffer;
752 buffer[0] = '/';
753 memcpy (buffer + 1, from, len - 1);
756 /* Subroutine of lex_token to handle '%'. A little tricky, since we
757 want to avoid stepping back when lexing %:%X. */
758 static void
759 lex_percent (buffer, result)
760 cpp_buffer *buffer;
761 cpp_token *result;
763 cppchar_t c;
765 result->type = CPP_MOD;
766 /* Parsing %:%X could leave an extra character. */
767 if (buffer->extra_char == EOF)
768 c = get_effective_char (buffer);
769 else
771 c = buffer->read_ahead = buffer->extra_char;
772 buffer->extra_char = EOF;
775 if (c == '=')
776 ACCEPT_CHAR (CPP_MOD_EQ);
777 else if (CPP_OPTION (buffer->pfile, digraphs))
779 if (c == ':')
781 result->flags |= DIGRAPH;
782 ACCEPT_CHAR (CPP_HASH);
783 if (get_effective_char (buffer) == '%')
785 buffer->extra_char = get_effective_char (buffer);
786 if (buffer->extra_char == ':')
788 buffer->extra_char = EOF;
789 ACCEPT_CHAR (CPP_PASTE);
791 else
792 /* We'll catch the extra_char when we're called back. */
793 buffer->read_ahead = '%';
796 else if (c == '>')
798 result->flags |= DIGRAPH;
799 ACCEPT_CHAR (CPP_CLOSE_BRACE);
804 /* Subroutine of lex_token to handle '.'. This is tricky, since we
805 want to avoid stepping back when lexing '...' or '.123'. In the
806 latter case we should also set a flag for parse_number. */
807 static void
808 lex_dot (pfile, result)
809 cpp_reader *pfile;
810 cpp_token *result;
812 cpp_buffer *buffer = pfile->buffer;
813 cppchar_t c;
815 /* Parsing ..X could leave an extra character. */
816 if (buffer->extra_char == EOF)
817 c = get_effective_char (buffer);
818 else
820 c = buffer->read_ahead = buffer->extra_char;
821 buffer->extra_char = EOF;
824 /* All known character sets have 0...9 contiguous. */
825 if (c >= '0' && c <= '9')
827 result->type = CPP_NUMBER;
828 parse_number (pfile, &result->val.str, c, 1);
830 else
832 result->type = CPP_DOT;
833 if (c == '.')
835 buffer->extra_char = get_effective_char (buffer);
836 if (buffer->extra_char == '.')
838 buffer->extra_char = EOF;
839 ACCEPT_CHAR (CPP_ELLIPSIS);
841 else
842 /* We'll catch the extra_char when we're called back. */
843 buffer->read_ahead = '.';
845 else if (c == '*' && CPP_OPTION (pfile, cplusplus))
846 ACCEPT_CHAR (CPP_DOT_STAR);
850 void
851 _cpp_lex_token (pfile, result)
852 cpp_reader *pfile;
853 cpp_token *result;
855 cppchar_t c;
856 cpp_buffer *buffer;
857 const unsigned char *comment_start;
858 unsigned char bol;
860 skip:
861 bol = pfile->state.next_bol;
862 done_directive:
863 buffer = pfile->buffer;
864 pfile->state.next_bol = 0;
865 result->flags = buffer->saved_flags;
866 buffer->saved_flags = 0;
867 next_char:
868 pfile->lexer_pos.line = buffer->lineno;
869 next_char2:
870 pfile->lexer_pos.col = CPP_BUF_COLUMN (buffer, buffer->cur);
872 c = buffer->read_ahead;
873 if (c == EOF && buffer->cur < buffer->rlimit)
875 c = *buffer->cur++;
876 pfile->lexer_pos.col++;
879 do_switch:
880 buffer->read_ahead = EOF;
881 switch (c)
883 case EOF:
884 /* Non-empty files should end in a newline. Checking "bol" too
885 prevents multiple warnings when hitting the EOF more than
886 once, like in a directive. Don't warn for command line and
887 _Pragma buffers. */
888 if (pfile->lexer_pos.col != 0 && !bol && !buffer->from_stage3)
889 cpp_pedwarn (pfile, "no newline at end of file");
890 pfile->state.next_bol = 1;
891 pfile->skipping = 0; /* In case missing #endif. */
892 result->type = CPP_EOF;
893 /* Don't do MI optimisation. */
894 return;
896 case ' ': case '\t': case '\f': case '\v': case '\0':
897 skip_whitespace (pfile, c);
898 result->flags |= PREV_WHITE;
899 goto next_char2;
901 case '\n': case '\r':
902 if (!pfile->state.in_directive)
904 handle_newline (buffer, c);
905 bol = 1;
906 pfile->lexer_pos.output_line = buffer->lineno;
907 /* This is a new line, so clear any white space flag.
908 Newlines in arguments are white space (6.10.3.10);
909 parse_arg takes care of that. */
910 result->flags &= ~(PREV_WHITE | AVOID_LPASTE);
911 goto next_char;
914 /* Don't let directives spill over to the next line. */
915 buffer->read_ahead = c;
916 pfile->state.next_bol = 1;
917 result->type = CPP_EOF;
918 /* Don't break; pfile->skipping might be true. */
919 return;
921 case '?':
922 case '\\':
923 /* These could start an escaped newline, or '?' a trigraph. Let
924 skip_escaped_newlines do all the work. */
926 unsigned int lineno = buffer->lineno;
928 c = skip_escaped_newlines (buffer, c);
929 if (lineno != buffer->lineno)
930 /* We had at least one escaped newline of some sort, and the
931 next character is in buffer->read_ahead. Update the
932 token's line and column. */
933 goto next_char;
935 /* We are either the original '?' or '\\', or a trigraph. */
936 result->type = CPP_QUERY;
937 buffer->read_ahead = EOF;
938 if (c == '\\')
939 goto random_char;
940 else if (c != '?')
941 goto do_switch;
943 break;
945 case '0': case '1': case '2': case '3': case '4':
946 case '5': case '6': case '7': case '8': case '9':
947 result->type = CPP_NUMBER;
948 parse_number (pfile, &result->val.str, c, 0);
949 break;
951 case '$':
952 if (!CPP_OPTION (pfile, dollars_in_ident))
953 goto random_char;
954 /* Fall through... */
956 case '_':
957 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
958 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
959 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
960 case 's': case 't': case 'u': case 'v': case 'w': case 'x':
961 case 'y': case 'z':
962 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
963 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
964 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
965 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
966 case 'Y': case 'Z':
967 result->type = CPP_NAME;
968 result->val.node = parse_identifier (pfile, c);
970 /* 'L' may introduce wide characters or strings. */
971 if (result->val.node == pfile->spec_nodes.n_L)
973 c = buffer->read_ahead; /* For make_string. */
974 if (c == '\'' || c == '"')
976 ACCEPT_CHAR (c == '"' ? CPP_WSTRING: CPP_WCHAR);
977 goto make_string;
980 /* Convert named operators to their proper types. */
981 else if (result->val.node->flags & NODE_OPERATOR)
983 result->flags |= NAMED_OP;
984 result->type = result->val.node->value.operator;
986 break;
988 case '\'':
989 case '"':
990 result->type = c == '"' ? CPP_STRING: CPP_CHAR;
991 make_string:
992 parse_string (pfile, result, c);
993 break;
995 case '/':
996 /* A potential block or line comment. */
997 comment_start = buffer->cur;
998 result->type = CPP_DIV;
999 c = get_effective_char (buffer);
1000 if (c == '=')
1001 ACCEPT_CHAR (CPP_DIV_EQ);
1002 if (c != '/' && c != '*')
1003 break;
1005 if (c == '*')
1007 if (skip_block_comment (pfile))
1008 cpp_error_with_line (pfile, pfile->lexer_pos.line,
1009 pfile->lexer_pos.col,
1010 "unterminated comment");
1012 else
1014 if (!CPP_OPTION (pfile, cplusplus_comments)
1015 && !CPP_IN_SYSTEM_HEADER (pfile))
1016 break;
1018 /* Warn about comments only if pedantically GNUC89, and not
1019 in system headers. */
1020 if (CPP_OPTION (pfile, lang) == CLK_GNUC89 && CPP_PEDANTIC (pfile)
1021 && ! buffer->warned_cplusplus_comments)
1023 cpp_pedwarn (pfile,
1024 "C++ style comments are not allowed in ISO C89");
1025 cpp_pedwarn (pfile,
1026 "(this will be reported only once per input file)");
1027 buffer->warned_cplusplus_comments = 1;
1030 /* Skip_line_comment updates buffer->read_ahead. */
1031 if (skip_line_comment (pfile) && CPP_OPTION (pfile, warn_comments))
1032 cpp_warning_with_line (pfile, pfile->lexer_pos.line,
1033 pfile->lexer_pos.col,
1034 "multi-line comment");
1037 /* Skipping the comment has updated buffer->read_ahead. */
1038 if (!pfile->state.save_comments)
1040 result->flags |= PREV_WHITE;
1041 goto next_char;
1044 /* Save the comment as a token in its own right. */
1045 save_comment (pfile, result, comment_start);
1046 /* Don't do MI optimisation. */
1047 return;
1049 case '<':
1050 if (pfile->state.angled_headers)
1052 result->type = CPP_HEADER_NAME;
1053 c = '>'; /* terminator. */
1054 goto make_string;
1057 result->type = CPP_LESS;
1058 c = get_effective_char (buffer);
1059 if (c == '=')
1060 ACCEPT_CHAR (CPP_LESS_EQ);
1061 else if (c == '<')
1063 ACCEPT_CHAR (CPP_LSHIFT);
1064 if (get_effective_char (buffer) == '=')
1065 ACCEPT_CHAR (CPP_LSHIFT_EQ);
1067 else if (c == '?' && CPP_OPTION (pfile, cplusplus))
1069 ACCEPT_CHAR (CPP_MIN);
1070 if (get_effective_char (buffer) == '=')
1071 ACCEPT_CHAR (CPP_MIN_EQ);
1073 else if (c == ':' && CPP_OPTION (pfile, digraphs))
1075 ACCEPT_CHAR (CPP_OPEN_SQUARE);
1076 result->flags |= DIGRAPH;
1078 else if (c == '%' && CPP_OPTION (pfile, digraphs))
1080 ACCEPT_CHAR (CPP_OPEN_BRACE);
1081 result->flags |= DIGRAPH;
1083 break;
1085 case '>':
1086 result->type = CPP_GREATER;
1087 c = get_effective_char (buffer);
1088 if (c == '=')
1089 ACCEPT_CHAR (CPP_GREATER_EQ);
1090 else if (c == '>')
1092 ACCEPT_CHAR (CPP_RSHIFT);
1093 if (get_effective_char (buffer) == '=')
1094 ACCEPT_CHAR (CPP_RSHIFT_EQ);
1096 else if (c == '?' && CPP_OPTION (pfile, cplusplus))
1098 ACCEPT_CHAR (CPP_MAX);
1099 if (get_effective_char (buffer) == '=')
1100 ACCEPT_CHAR (CPP_MAX_EQ);
1102 break;
1104 case '%':
1105 lex_percent (buffer, result);
1106 if (result->type == CPP_HASH)
1107 goto do_hash;
1108 break;
1110 case '.':
1111 lex_dot (pfile, result);
1112 break;
1114 case '+':
1115 result->type = CPP_PLUS;
1116 c = get_effective_char (buffer);
1117 if (c == '=')
1118 ACCEPT_CHAR (CPP_PLUS_EQ);
1119 else if (c == '+')
1120 ACCEPT_CHAR (CPP_PLUS_PLUS);
1121 break;
1123 case '-':
1124 result->type = CPP_MINUS;
1125 c = get_effective_char (buffer);
1126 if (c == '>')
1128 ACCEPT_CHAR (CPP_DEREF);
1129 if (CPP_OPTION (pfile, cplusplus)
1130 && get_effective_char (buffer) == '*')
1131 ACCEPT_CHAR (CPP_DEREF_STAR);
1133 else if (c == '=')
1134 ACCEPT_CHAR (CPP_MINUS_EQ);
1135 else if (c == '-')
1136 ACCEPT_CHAR (CPP_MINUS_MINUS);
1137 break;
1139 case '*':
1140 result->type = CPP_MULT;
1141 if (get_effective_char (buffer) == '=')
1142 ACCEPT_CHAR (CPP_MULT_EQ);
1143 break;
1145 case '=':
1146 result->type = CPP_EQ;
1147 if (get_effective_char (buffer) == '=')
1148 ACCEPT_CHAR (CPP_EQ_EQ);
1149 break;
1151 case '!':
1152 result->type = CPP_NOT;
1153 if (get_effective_char (buffer) == '=')
1154 ACCEPT_CHAR (CPP_NOT_EQ);
1155 break;
1157 case '&':
1158 result->type = CPP_AND;
1159 c = get_effective_char (buffer);
1160 if (c == '=')
1161 ACCEPT_CHAR (CPP_AND_EQ);
1162 else if (c == '&')
1163 ACCEPT_CHAR (CPP_AND_AND);
1164 break;
1166 case '#':
1167 c = buffer->extra_char; /* Can be set by error condition below. */
1168 if (c != EOF)
1170 buffer->read_ahead = c;
1171 buffer->extra_char = EOF;
1173 else
1174 c = get_effective_char (buffer);
1176 if (c == '#')
1178 ACCEPT_CHAR (CPP_PASTE);
1179 break;
1182 result->type = CPP_HASH;
1183 do_hash:
1184 if (!bol)
1185 break;
1186 /* 6.10.3 paragraph 11: If there are sequences of preprocessing
1187 tokens within the list of arguments that would otherwise act
1188 as preprocessing directives, the behavior is undefined.
1190 This implementation will report a hard error, terminate the
1191 macro invocation, and proceed to process the directive. */
1192 if (pfile->state.parsing_args)
1194 if (pfile->state.parsing_args == 2)
1195 cpp_error (pfile,
1196 "directives may not be used inside a macro argument");
1198 /* Put a '#' in lookahead, return CPP_EOF for parse_arg. */
1199 buffer->extra_char = buffer->read_ahead;
1200 buffer->read_ahead = '#';
1201 pfile->state.next_bol = 1;
1202 result->type = CPP_EOF;
1204 /* Get whitespace right - newline_in_args sets it. */
1205 if (pfile->lexer_pos.col == 1)
1206 result->flags &= ~(PREV_WHITE | AVOID_LPASTE);
1208 else
1210 /* This is the hash introducing a directive. */
1211 if (_cpp_handle_directive (pfile, result->flags & PREV_WHITE))
1212 goto done_directive; /* bol still 1. */
1213 /* This is in fact an assembler #. */
1215 break;
1217 case '|':
1218 result->type = CPP_OR;
1219 c = get_effective_char (buffer);
1220 if (c == '=')
1221 ACCEPT_CHAR (CPP_OR_EQ);
1222 else if (c == '|')
1223 ACCEPT_CHAR (CPP_OR_OR);
1224 break;
1226 case '^':
1227 result->type = CPP_XOR;
1228 if (get_effective_char (buffer) == '=')
1229 ACCEPT_CHAR (CPP_XOR_EQ);
1230 break;
1232 case ':':
1233 result->type = CPP_COLON;
1234 c = get_effective_char (buffer);
1235 if (c == ':' && CPP_OPTION (pfile, cplusplus))
1236 ACCEPT_CHAR (CPP_SCOPE);
1237 else if (c == '>' && CPP_OPTION (pfile, digraphs))
1239 result->flags |= DIGRAPH;
1240 ACCEPT_CHAR (CPP_CLOSE_SQUARE);
1242 break;
1244 case '~': result->type = CPP_COMPL; break;
1245 case ',': result->type = CPP_COMMA; break;
1246 case '(': result->type = CPP_OPEN_PAREN; break;
1247 case ')': result->type = CPP_CLOSE_PAREN; break;
1248 case '[': result->type = CPP_OPEN_SQUARE; break;
1249 case ']': result->type = CPP_CLOSE_SQUARE; break;
1250 case '{': result->type = CPP_OPEN_BRACE; break;
1251 case '}': result->type = CPP_CLOSE_BRACE; break;
1252 case ';': result->type = CPP_SEMICOLON; break;
1254 /* @ is a punctuator in Objective C. */
1255 case '@': result->type = CPP_ATSIGN; break;
1257 random_char:
1258 default:
1259 result->type = CPP_OTHER;
1260 result->val.c = c;
1261 break;
1264 if (pfile->skipping)
1265 goto skip;
1267 /* If not in a directive, this token invalidates controlling macros. */
1268 if (!pfile->state.in_directive)
1269 pfile->mi_state = MI_FAILED;
1272 /* An upper bound on the number of bytes needed to spell a token,
1273 including preceding whitespace. */
1274 unsigned int
1275 cpp_token_len (token)
1276 const cpp_token *token;
1278 unsigned int len;
1280 switch (TOKEN_SPELL (token))
1282 default: len = 0; break;
1283 case SPELL_STRING: len = token->val.str.len; break;
1284 case SPELL_IDENT: len = NODE_LEN (token->val.node); break;
1286 /* 1 for whitespace, 4 for comment delimeters. */
1287 return len + 5;
1290 /* Write the spelling of a token TOKEN to BUFFER. The buffer must
1291 already contain the enough space to hold the token's spelling.
1292 Returns a pointer to the character after the last character
1293 written. */
1294 unsigned char *
1295 cpp_spell_token (pfile, token, buffer)
1296 cpp_reader *pfile; /* Would be nice to be rid of this... */
1297 const cpp_token *token;
1298 unsigned char *buffer;
1300 switch (TOKEN_SPELL (token))
1302 case SPELL_OPERATOR:
1304 const unsigned char *spelling;
1305 unsigned char c;
1307 if (token->flags & DIGRAPH)
1308 spelling
1309 = digraph_spellings[(int) token->type - (int) CPP_FIRST_DIGRAPH];
1310 else if (token->flags & NAMED_OP)
1311 goto spell_ident;
1312 else
1313 spelling = TOKEN_NAME (token);
1315 while ((c = *spelling++) != '\0')
1316 *buffer++ = c;
1318 break;
1320 case SPELL_IDENT:
1321 spell_ident:
1322 memcpy (buffer, NODE_NAME (token->val.node), NODE_LEN (token->val.node));
1323 buffer += NODE_LEN (token->val.node);
1324 break;
1326 case SPELL_STRING:
1328 int left, right, tag;
1329 switch (token->type)
1331 case CPP_STRING: left = '"'; right = '"'; tag = '\0'; break;
1332 case CPP_WSTRING: left = '"'; right = '"'; tag = 'L'; break;
1333 case CPP_CHAR: left = '\''; right = '\''; tag = '\0'; break;
1334 case CPP_WCHAR: left = '\''; right = '\''; tag = 'L'; break;
1335 case CPP_HEADER_NAME: left = '<'; right = '>'; tag = '\0'; break;
1336 default: left = '\0'; right = '\0'; tag = '\0'; break;
1338 if (tag) *buffer++ = tag;
1339 if (left) *buffer++ = left;
1340 memcpy (buffer, token->val.str.text, token->val.str.len);
1341 buffer += token->val.str.len;
1342 if (right) *buffer++ = right;
1344 break;
1346 case SPELL_CHAR:
1347 *buffer++ = token->val.c;
1348 break;
1350 case SPELL_NONE:
1351 cpp_ice (pfile, "Unspellable token %s", TOKEN_NAME (token));
1352 break;
1355 return buffer;
1358 /* Returns a token as a null-terminated string. The string is
1359 temporary, and automatically freed later. Useful for diagnostics. */
1360 unsigned char *
1361 cpp_token_as_text (pfile, token)
1362 cpp_reader *pfile;
1363 const cpp_token *token;
1365 unsigned int len = cpp_token_len (token);
1366 unsigned char *start = _cpp_pool_alloc (&pfile->ident_pool, len), *end;
1368 end = cpp_spell_token (pfile, token, start);
1369 end[0] = '\0';
1371 return start;
1374 /* Used by C front ends. Should really move to using cpp_token_as_text. */
1375 const char *
1376 cpp_type2name (type)
1377 enum cpp_ttype type;
1379 return (const char *) token_spellings[type].name;
1382 /* Writes the spelling of token to FP. Separate from cpp_spell_token
1383 for efficiency - to avoid double-buffering. Also, outputs a space
1384 if PREV_WHITE is flagged. */
1385 void
1386 cpp_output_token (token, fp)
1387 const cpp_token *token;
1388 FILE *fp;
1390 if (token->flags & PREV_WHITE)
1391 putc (' ', fp);
1393 switch (TOKEN_SPELL (token))
1395 case SPELL_OPERATOR:
1397 const unsigned char *spelling;
1399 if (token->flags & DIGRAPH)
1400 spelling
1401 = digraph_spellings[(int) token->type - (int) CPP_FIRST_DIGRAPH];
1402 else if (token->flags & NAMED_OP)
1403 goto spell_ident;
1404 else
1405 spelling = TOKEN_NAME (token);
1407 ufputs (spelling, fp);
1409 break;
1411 spell_ident:
1412 case SPELL_IDENT:
1413 ufputs (NODE_NAME (token->val.node), fp);
1414 break;
1416 case SPELL_STRING:
1418 int left, right, tag;
1419 switch (token->type)
1421 case CPP_STRING: left = '"'; right = '"'; tag = '\0'; break;
1422 case CPP_WSTRING: left = '"'; right = '"'; tag = 'L'; break;
1423 case CPP_CHAR: left = '\''; right = '\''; tag = '\0'; break;
1424 case CPP_WCHAR: left = '\''; right = '\''; tag = 'L'; break;
1425 case CPP_HEADER_NAME: left = '<'; right = '>'; tag = '\0'; break;
1426 default: left = '\0'; right = '\0'; tag = '\0'; break;
1428 if (tag) putc (tag, fp);
1429 if (left) putc (left, fp);
1430 fwrite (token->val.str.text, 1, token->val.str.len, fp);
1431 if (right) putc (right, fp);
1433 break;
1435 case SPELL_CHAR:
1436 putc (token->val.c, fp);
1437 break;
1439 case SPELL_NONE:
1440 /* An error, most probably. */
1441 break;
1445 /* Compare two tokens. */
1447 _cpp_equiv_tokens (a, b)
1448 const cpp_token *a, *b;
1450 if (a->type == b->type && a->flags == b->flags)
1451 switch (TOKEN_SPELL (a))
1453 default: /* Keep compiler happy. */
1454 case SPELL_OPERATOR:
1455 return 1;
1456 case SPELL_CHAR:
1457 return a->val.c == b->val.c; /* Character. */
1458 case SPELL_NONE:
1459 return (a->type != CPP_MACRO_ARG || a->val.arg_no == b->val.arg_no);
1460 case SPELL_IDENT:
1461 return a->val.node == b->val.node;
1462 case SPELL_STRING:
1463 return (a->val.str.len == b->val.str.len
1464 && !memcmp (a->val.str.text, b->val.str.text,
1465 a->val.str.len));
1468 return 0;
1471 /* Determine whether two tokens can be pasted together, and if so,
1472 what the resulting token is. Returns CPP_EOF if the tokens cannot
1473 be pasted, or the appropriate type for the merged token if they
1474 can. */
1475 enum cpp_ttype
1476 cpp_can_paste (pfile, token1, token2, digraph)
1477 cpp_reader * pfile;
1478 const cpp_token *token1, *token2;
1479 int* digraph;
1481 enum cpp_ttype a = token1->type, b = token2->type;
1482 int cxx = CPP_OPTION (pfile, cplusplus);
1484 /* Treat named operators as if they were ordinary NAMEs. */
1485 if (token1->flags & NAMED_OP)
1486 a = CPP_NAME;
1487 if (token2->flags & NAMED_OP)
1488 b = CPP_NAME;
1490 if ((int) a <= (int) CPP_LAST_EQ && b == CPP_EQ)
1491 return (enum cpp_ttype) ((int) a + ((int) CPP_EQ_EQ - (int) CPP_EQ));
1493 switch (a)
1495 case CPP_GREATER:
1496 if (b == a) return CPP_RSHIFT;
1497 if (b == CPP_QUERY && cxx) return CPP_MAX;
1498 if (b == CPP_GREATER_EQ) return CPP_RSHIFT_EQ;
1499 break;
1500 case CPP_LESS:
1501 if (b == a) return CPP_LSHIFT;
1502 if (b == CPP_QUERY && cxx) return CPP_MIN;
1503 if (b == CPP_LESS_EQ) return CPP_LSHIFT_EQ;
1504 if (CPP_OPTION (pfile, digraphs))
1506 if (b == CPP_COLON)
1507 {*digraph = 1; return CPP_OPEN_SQUARE;} /* <: digraph */
1508 if (b == CPP_MOD)
1509 {*digraph = 1; return CPP_OPEN_BRACE;} /* <% digraph */
1511 break;
1513 case CPP_PLUS: if (b == a) return CPP_PLUS_PLUS; break;
1514 case CPP_AND: if (b == a) return CPP_AND_AND; break;
1515 case CPP_OR: if (b == a) return CPP_OR_OR; break;
1517 case CPP_MINUS:
1518 if (b == a) return CPP_MINUS_MINUS;
1519 if (b == CPP_GREATER) return CPP_DEREF;
1520 break;
1521 case CPP_COLON:
1522 if (b == a && cxx) return CPP_SCOPE;
1523 if (b == CPP_GREATER && CPP_OPTION (pfile, digraphs))
1524 {*digraph = 1; return CPP_CLOSE_SQUARE;} /* :> digraph */
1525 break;
1527 case CPP_MOD:
1528 if (CPP_OPTION (pfile, digraphs))
1530 if (b == CPP_GREATER)
1531 {*digraph = 1; return CPP_CLOSE_BRACE;} /* %> digraph */
1532 if (b == CPP_COLON)
1533 {*digraph = 1; return CPP_HASH;} /* %: digraph */
1535 break;
1536 case CPP_DEREF:
1537 if (b == CPP_MULT && cxx) return CPP_DEREF_STAR;
1538 break;
1539 case CPP_DOT:
1540 if (b == CPP_MULT && cxx) return CPP_DOT_STAR;
1541 if (b == CPP_NUMBER) return CPP_NUMBER;
1542 break;
1544 case CPP_HASH:
1545 if (b == a && (token1->flags & DIGRAPH) == (token2->flags & DIGRAPH))
1546 /* %:%: digraph */
1547 {*digraph = (token1->flags & DIGRAPH); return CPP_PASTE;}
1548 break;
1550 case CPP_NAME:
1551 if (b == CPP_NAME) return CPP_NAME;
1552 if (b == CPP_NUMBER
1553 && name_p (pfile, &token2->val.str)) return CPP_NAME;
1554 if (b == CPP_CHAR
1555 && token1->val.node == pfile->spec_nodes.n_L) return CPP_WCHAR;
1556 if (b == CPP_STRING
1557 && token1->val.node == pfile->spec_nodes.n_L) return CPP_WSTRING;
1558 break;
1560 case CPP_NUMBER:
1561 if (b == CPP_NUMBER) return CPP_NUMBER;
1562 if (b == CPP_NAME) return CPP_NUMBER;
1563 if (b == CPP_DOT) return CPP_NUMBER;
1564 /* Numbers cannot have length zero, so this is safe. */
1565 if ((b == CPP_PLUS || b == CPP_MINUS)
1566 && VALID_SIGN ('+', token1->val.str.text[token1->val.str.len - 1]))
1567 return CPP_NUMBER;
1568 break;
1570 default:
1571 break;
1574 return CPP_EOF;
1577 /* Returns nonzero if a space should be inserted to avoid an
1578 accidental token paste for output. For simplicity, it is
1579 conservative, and occasionally advises a space where one is not
1580 needed, e.g. "." and ".2". */
1583 cpp_avoid_paste (pfile, token1, token2)
1584 cpp_reader *pfile;
1585 const cpp_token *token1, *token2;
1587 enum cpp_ttype a = token1->type, b = token2->type;
1588 cppchar_t c;
1590 if (token1->flags & NAMED_OP)
1591 a = CPP_NAME;
1592 if (token2->flags & NAMED_OP)
1593 b = CPP_NAME;
1595 c = EOF;
1596 if (token2->flags & DIGRAPH)
1597 c = digraph_spellings[(int) b - (int) CPP_FIRST_DIGRAPH][0];
1598 else if (token_spellings[b].category == SPELL_OPERATOR)
1599 c = token_spellings[b].name[0];
1601 /* Quickly get everything that can paste with an '='. */
1602 if ((int) a <= (int) CPP_LAST_EQ && c == '=')
1603 return 1;
1605 switch (a)
1607 case CPP_GREATER: return c == '>' || c == '?';
1608 case CPP_LESS: return c == '<' || c == '?' || c == '%' || c == ':';
1609 case CPP_PLUS: return c == '+';
1610 case CPP_MINUS: return c == '-' || c == '>';
1611 case CPP_DIV: return c == '/' || c == '*'; /* Comments. */
1612 case CPP_MOD: return c == ':' || c == '>';
1613 case CPP_AND: return c == '&';
1614 case CPP_OR: return c == '|';
1615 case CPP_COLON: return c == ':' || c == '>';
1616 case CPP_DEREF: return c == '*';
1617 case CPP_DOT: return c == '.' || c == '%' || b == CPP_NUMBER;
1618 case CPP_HASH: return c == '#' || c == '%'; /* Digraph form. */
1619 case CPP_NAME: return ((b == CPP_NUMBER
1620 && name_p (pfile, &token2->val.str))
1621 || b == CPP_NAME
1622 || b == CPP_CHAR || b == CPP_STRING); /* L */
1623 case CPP_NUMBER: return (b == CPP_NUMBER || b == CPP_NAME
1624 || c == '.' || c == '+' || c == '-');
1625 case CPP_OTHER: return (CPP_OPTION (pfile, objc)
1626 && token1->val.c == '@'
1627 && (b == CPP_NAME || b == CPP_STRING));
1628 default: break;
1631 return 0;
1634 /* Output all the remaining tokens on the current line, and a newline
1635 character, to FP. Leading whitespace is removed. */
1636 void
1637 cpp_output_line (pfile, fp)
1638 cpp_reader *pfile;
1639 FILE *fp;
1641 cpp_token token;
1643 cpp_get_token (pfile, &token);
1644 token.flags &= ~PREV_WHITE;
1645 while (token.type != CPP_EOF)
1647 cpp_output_token (&token, fp);
1648 cpp_get_token (pfile, &token);
1651 putc ('\n', fp);
/* Map the hexadecimal digit character C to its numeric value, 0 to
   15.  Letters are accepted in either case.  Aborts if C is not a
   hexadecimal digit.  */
static unsigned int
hex_digit_value (c)
     unsigned int c;
{
  if (c >= '0' && c <= '9')
    return c - '0';

  if (c >= 'A' && c <= 'F')
    return 10 + (c - 'A');

  if (c >= 'a' && c <= 'f')
    return 10 + (c - 'a');

  abort ();
}
1668 /* Parse a '\uNNNN' or '\UNNNNNNNN' sequence. Returns 1 to indicate
1669 failure if cpplib is not parsing C++ or C99. Such failure is
1670 silent, and no variables are updated. Otherwise returns 0, and
1671 warns if -Wtraditional.
1673 [lex.charset]: The character designated by the universal character
1674 name \UNNNNNNNN is that character whose character short name in
1675 ISO/IEC 10646 is NNNNNNNN; the character designated by the
1676 universal character name \uNNNN is that character whose character
1677 short name in ISO/IEC 10646 is 0000NNNN. If the hexadecimal value
1678 for a universal character name is less than 0x20 or in the range
1679 0x7F-0x9F (inclusive), or if the universal character name
1680 designates a character in the basic source character set, then the
1681 program is ill-formed.
1683 We assume that wchar_t is Unicode, so we don't need to do any
1684 mapping. Is this ever wrong?
1686 PC points to the 'u' or 'U', PSTR is points to the byte after PC,
1687 LIMIT is the end of the string or charconst. PSTR is updated to
1688 point after the UCS on return, and the UCS is written into PC. */
1690 static int
1691 maybe_read_ucs (pfile, pstr, limit, pc)
1692 cpp_reader *pfile;
1693 const unsigned char **pstr;
1694 const unsigned char *limit;
1695 unsigned int *pc;
1697 const unsigned char *p = *pstr;
1698 unsigned int code = 0;
1699 unsigned int c = *pc, length;
1701 /* Only attempt to interpret a UCS for C++ and C99. */
1702 if (! (CPP_OPTION (pfile, cplusplus) || CPP_OPTION (pfile, c99)))
1703 return 1;
1705 if (CPP_WTRADITIONAL (pfile))
1706 cpp_warning (pfile, "the meaning of '\\%c' varies with -traditional", c);
1708 length = (c == 'u' ? 4: 8);
1710 if ((size_t) (limit - p) < length)
1712 cpp_error (pfile, "incomplete universal-character-name");
1713 /* Skip to the end to avoid more diagnostics. */
1714 p = limit;
1716 else
1718 for (; length; length--, p++)
1720 c = *p;
1721 if (ISXDIGIT (c))
1722 code = (code << 4) + hex_digit_value (c);
1723 else
1725 cpp_error (pfile,
1726 "non-hex digit '%c' in universal-character-name", c);
1727 /* We shouldn't skip in case there are multibyte chars. */
1728 break;
1733 #ifdef TARGET_EBCDIC
1734 cpp_error (pfile, "universal-character-name on EBCDIC target");
1735 code = 0x3f; /* EBCDIC invalid character */
1736 #else
1737 /* True extended characters are OK. */
1738 if (code >= 0xa0
1739 && !(code & 0x80000000)
1740 && !(code >= 0xD800 && code <= 0xDFFF))
1742 /* The standard permits $, @ and ` to be specified as UCNs. We use
1743 hex escapes so that this also works with EBCDIC hosts. */
1744 else if (code == 0x24 || code == 0x40 || code == 0x60)
1746 /* Don't give another error if one occurred above. */
1747 else if (length == 0)
1748 cpp_error (pfile, "universal-character-name out of range");
1749 #endif
1751 *pstr = p;
1752 *pc = code;
1753 return 0;
1756 /* Interpret an escape sequence, and return its value. PSTR points to
1757 the input pointer, which is just after the backslash. LIMIT is how
1758 much text we have. MASK is a bitmask for the precision for the
1759 destination type (char or wchar_t). TRADITIONAL, if true, does not
1760 interpret escapes that did not exist in traditional C.
1762 Handles all relevant diagnostics. */
1764 unsigned int
1765 cpp_parse_escape (pfile, pstr, limit, mask, traditional)
1766 cpp_reader *pfile;
1767 const unsigned char **pstr;
1768 const unsigned char *limit;
1769 unsigned HOST_WIDE_INT mask;
1770 int traditional;
1772 int unknown = 0;
1773 const unsigned char *str = *pstr;
1774 unsigned int c = *str++;
1776 switch (c)
1778 case '\\': case '\'': case '"': case '?': break;
1779 case 'b': c = TARGET_BS; break;
1780 case 'f': c = TARGET_FF; break;
1781 case 'n': c = TARGET_NEWLINE; break;
1782 case 'r': c = TARGET_CR; break;
1783 case 't': c = TARGET_TAB; break;
1784 case 'v': c = TARGET_VT; break;
1786 case '(': case '{': case '[': case '%':
1787 /* '\(', etc, are used at beginning of line to avoid confusing Emacs.
1788 '\%' is used to prevent SCCS from getting confused. */
1789 unknown = CPP_PEDANTIC (pfile);
1790 break;
1792 case 'a':
1793 if (CPP_WTRADITIONAL (pfile))
1794 cpp_warning (pfile, "the meaning of '\\a' varies with -traditional");
1795 if (!traditional)
1796 c = TARGET_BELL;
1797 break;
1799 case 'e': case 'E':
1800 if (CPP_PEDANTIC (pfile))
1801 cpp_pedwarn (pfile, "non-ISO-standard escape sequence, '\\%c'", c);
1802 c = TARGET_ESC;
1803 break;
1805 case 'u': case 'U':
1806 unknown = maybe_read_ucs (pfile, &str, limit, &c);
1807 break;
1809 case 'x':
1810 if (CPP_WTRADITIONAL (pfile))
1811 cpp_warning (pfile, "the meaning of '\\x' varies with -traditional");
1813 if (!traditional)
1815 unsigned int i = 0, overflow = 0;
1816 int digits_found = 0;
1818 while (str < limit)
1820 c = *str;
1821 if (! ISXDIGIT (c))
1822 break;
1823 str++;
1824 overflow |= i ^ (i << 4 >> 4);
1825 i = (i << 4) + hex_digit_value (c);
1826 digits_found = 1;
1829 if (!digits_found)
1830 cpp_error (pfile, "\\x used with no following hex digits");
1832 if (overflow | (i != (i & mask)))
1834 cpp_pedwarn (pfile, "hex escape sequence out of range");
1835 i &= mask;
1837 c = i;
1839 break;
1841 case '0': case '1': case '2': case '3':
1842 case '4': case '5': case '6': case '7':
1844 unsigned int i = c - '0';
1845 int count = 0;
1847 while (str < limit && ++count < 3)
1849 c = *str;
1850 if (c < '0' || c > '7')
1851 break;
1852 str++;
1853 i = (i << 3) + c - '0';
1856 if (i != (i & mask))
1858 cpp_pedwarn (pfile, "octal escape sequence out of range");
1859 i &= mask;
1861 c = i;
1863 break;
1865 default:
1866 unknown = 1;
1867 break;
1870 if (unknown)
1872 if (ISGRAPH (c))
1873 cpp_pedwarn (pfile, "unknown escape sequence '\\%c'", c);
1874 else
1875 cpp_pedwarn (pfile, "unknown escape sequence: '\\%03o'", c);
1878 if (c > mask)
1879 cpp_pedwarn (pfile, "escape sequence out of range for character");
1881 *pstr = str;
1882 return c;
1885 #ifndef MAX_CHAR_TYPE_SIZE
1886 #define MAX_CHAR_TYPE_SIZE CHAR_TYPE_SIZE
1887 #endif
1889 #ifndef MAX_WCHAR_TYPE_SIZE
1890 #define MAX_WCHAR_TYPE_SIZE WCHAR_TYPE_SIZE
1891 #endif
1893 /* Interpret a (possibly wide) character constant in TOKEN.
1894 WARN_MULTI warns about multi-character charconsts, if not
1895 TRADITIONAL. TRADITIONAL also indicates not to interpret escapes
1896 that did not exist in traditional C. PCHARS_SEEN points to a
1897 variable that is filled in with the number of characters seen. */
1898 HOST_WIDE_INT
1899 cpp_interpret_charconst (pfile, token, warn_multi, traditional, pchars_seen)
1900 cpp_reader *pfile;
1901 const cpp_token *token;
1902 int warn_multi;
1903 int traditional;
1904 unsigned int *pchars_seen;
1906 const unsigned char *str = token->val.str.text;
1907 const unsigned char *limit = str + token->val.str.len;
1908 unsigned int chars_seen = 0;
1909 unsigned int width, max_chars, c;
1910 unsigned HOST_WIDE_INT mask;
1911 HOST_WIDE_INT result = 0;
1913 #ifdef MULTIBYTE_CHARS
1914 (void) local_mbtowc (NULL, NULL, 0);
1915 #endif
1917 /* Width in bits. */
1918 if (token->type == CPP_CHAR)
1919 width = MAX_CHAR_TYPE_SIZE;
1920 else
1921 width = MAX_WCHAR_TYPE_SIZE;
1923 if (width < HOST_BITS_PER_WIDE_INT)
1924 mask = ((unsigned HOST_WIDE_INT) 1 << width) - 1;
1925 else
1926 mask = ~0;
1927 max_chars = HOST_BITS_PER_WIDE_INT / width;
1929 while (str < limit)
1931 #ifdef MULTIBYTE_CHARS
1932 wchar_t wc;
1933 int char_len;
1935 char_len = local_mbtowc (&wc, str, limit - str);
1936 if (char_len == -1)
1938 cpp_warning (pfile, "ignoring invalid multibyte character");
1939 c = *str++;
1941 else
1943 str += char_len;
1944 c = wc;
1946 #else
1947 c = *str++;
1948 #endif
1950 if (c == '\\')
1951 c = cpp_parse_escape (pfile, &str, limit, mask, traditional);
1953 #ifdef MAP_CHARACTER
1954 if (ISPRINT (c))
1955 c = MAP_CHARACTER (c);
1956 #endif
1958 /* Merge character into result; ignore excess chars. */
1959 if (++chars_seen <= max_chars)
1961 if (width < HOST_BITS_PER_WIDE_INT)
1962 result = (result << width) | (c & mask);
1963 else
1964 result = c;
1968 if (chars_seen == 0)
1969 cpp_error (pfile, "empty character constant");
1970 else if (chars_seen > max_chars)
1972 chars_seen = max_chars;
1973 cpp_warning (pfile, "character constant too long");
1975 else if (chars_seen > 1 && !traditional && warn_multi)
1976 cpp_warning (pfile, "multi-character character constant");
1978 /* If char type is signed, sign-extend the constant. The
1979 __CHAR_UNSIGNED__ macro is set by the driver if appropriate. */
1980 if (token->type == CPP_CHAR && chars_seen)
1982 unsigned int nbits = chars_seen * width;
1983 unsigned int mask = (unsigned int) ~0 >> (HOST_BITS_PER_INT - nbits);
1985 if (pfile->spec_nodes.n__CHAR_UNSIGNED__->type == NT_MACRO
1986 || ((result >> (nbits - 1)) & 1) == 0)
1987 result &= mask;
1988 else
1989 result |= ~mask;
1992 *pchars_seen = chars_seen;
1993 return result;
/* Memory pools.  */

/* Layout probe: a single char followed by a union of a double and a
   pointer.  The offset of the union within the structure is used as
   the default alignment for pool allocations.  */
struct dummy
{
  char c;
  union
  {
    double d;
    int *p;
  } u;
};

#define DEFAULT_ALIGNMENT (offsetof (struct dummy, u))
2010 static int
2011 chunk_suitable (pool, chunk, size)
2012 cpp_pool *pool;
2013 cpp_chunk *chunk;
2014 unsigned int size;
2016 /* Being at least twice SIZE means we can use memcpy in
2017 _cpp_next_chunk rather than memmove. Besides, it's a good idea
2018 anyway. */
2019 return (chunk && pool->locked != chunk
2020 && (unsigned int) (chunk->limit - chunk->base) >= size * 2);
2023 /* Returns the end of the new pool. PTR points to a char in the old
2024 pool, and is updated to point to the same char in the new pool. */
2025 unsigned char *
2026 _cpp_next_chunk (pool, len, ptr)
2027 cpp_pool *pool;
2028 unsigned int len;
2029 unsigned char **ptr;
2031 cpp_chunk *chunk = pool->cur->next;
2033 /* LEN is the minimum size we want in the new pool. */
2034 len += POOL_ROOM (pool);
2035 if (! chunk_suitable (pool, chunk, len))
2037 chunk = new_chunk (POOL_SIZE (pool) * 2 + len);
2039 chunk->next = pool->cur->next;
2040 pool->cur->next = chunk;
2043 /* Update the pointer before changing chunk's front. */
2044 if (ptr)
2045 *ptr += chunk->base - POOL_FRONT (pool);
2047 memcpy (chunk->base, POOL_FRONT (pool), POOL_ROOM (pool));
2048 chunk->front = chunk->base;
2050 pool->cur = chunk;
2051 return POOL_LIMIT (pool);
2054 static cpp_chunk *
2055 new_chunk (size)
2056 unsigned int size;
2058 unsigned char *base;
2059 cpp_chunk *result;
2061 size = POOL_ALIGN (size, DEFAULT_ALIGNMENT);
2062 base = (unsigned char *) xmalloc (size + sizeof (cpp_chunk));
2063 /* Put the chunk descriptor at the end. Then chunk overruns will
2064 cause obvious chaos. */
2065 result = (cpp_chunk *) (base + size);
2066 result->base = base;
2067 result->front = base;
2068 result->limit = base + size;
2069 result->next = 0;
2071 return result;
2074 void
2075 _cpp_init_pool (pool, size, align, temp)
2076 cpp_pool *pool;
2077 unsigned int size, align, temp;
2079 if (align == 0)
2080 align = DEFAULT_ALIGNMENT;
2081 if (align & (align - 1))
2082 abort ();
2083 pool->align = align;
2084 pool->cur = new_chunk (size);
2085 pool->locked = 0;
2086 pool->locks = 0;
2087 if (temp)
2088 pool->cur->next = pool->cur;
2091 void
2092 _cpp_lock_pool (pool)
2093 cpp_pool *pool;
2095 if (pool->locks++ == 0)
2096 pool->locked = pool->cur;
2099 void
2100 _cpp_unlock_pool (pool)
2101 cpp_pool *pool;
2103 if (--pool->locks == 0)
2104 pool->locked = 0;
2107 void
2108 _cpp_free_pool (pool)
2109 cpp_pool *pool;
2111 cpp_chunk *chunk = pool->cur, *next;
2115 next = chunk->next;
2116 free (chunk->base);
2117 chunk = next;
2119 while (chunk && chunk != pool->cur);
2122 /* Reserve LEN bytes from a memory pool. */
2123 unsigned char *
2124 _cpp_pool_reserve (pool, len)
2125 cpp_pool *pool;
2126 unsigned int len;
2128 len = POOL_ALIGN (len, pool->align);
2129 if (len > (unsigned int) POOL_ROOM (pool))
2130 _cpp_next_chunk (pool, len, 0);
2132 return POOL_FRONT (pool);
2135 /* Allocate LEN bytes from a memory pool. */
2136 unsigned char *
2137 _cpp_pool_alloc (pool, len)
2138 cpp_pool *pool;
2139 unsigned int len;
2141 unsigned char *result = _cpp_pool_reserve (pool, len);
2143 POOL_COMMIT (pool, len);
2144 return result;