* verify.c (verify_jvm_instructions): Fix typo.
[official-gcc.git] / gcc / cpplex.c
blob6d640e090afa805e84e21a9295f45c796b45886b
1 /* CPP Library - lexical analysis.
2 Copyright (C) 2000 Free Software Foundation, Inc.
3 Contributed by Per Bothner, 1994-95.
4 Based on CCCP program by Paul Rubin, June 1986
5 Adapted to ANSI C, Richard Stallman, Jan 1987
6 Broken out to separate file, Zack Weinberg, Mar 2000
7 Single-pass line tokenization by Neil Booth, April 2000
9 This program is free software; you can redistribute it and/or modify it
10 under the terms of the GNU General Public License as published by the
11 Free Software Foundation; either version 2, or (at your option) any
12 later version.
14 This program is distributed in the hope that it will be useful,
15 but WITHOUT ANY WARRANTY; without even the implied warranty of
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 GNU General Public License for more details.
19 You should have received a copy of the GNU General Public License
20 along with this program; if not, write to the Free Software
21 Foundation, 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
/* This lexer works with a single pass of the file.  Recently I
   re-wrote it to minimize the places where we step backwards in the
   input stream, to make future changes to support multi-byte
   character sets fairly straight-forward.

   There is now only one routine where we do step backwards:
   skip_escaped_newlines.  This routine could probably also be changed
   so that it doesn't need to step back.  One possibility is to use a
   trick similar to that used in lex_period and lex_percent.  Two
   extra characters might be needed, but skip_escaped_newlines itself
   would probably be the only place that needs to be aware of that,
   and changes to the remaining routines would probably only be needed
   if they process a backslash.  */
37 #include "config.h"
38 #include "system.h"
39 #include "cpplib.h"
40 #include "cpphash.h"
42 /* MULTIBYTE_CHARS support only works for native compilers.
43 ??? Ideally what we want is to model widechar support after
44 the current floating point support. */
45 #ifdef CROSS_COMPILE
46 #undef MULTIBYTE_CHARS
47 #endif
49 #ifdef MULTIBYTE_CHARS
50 #include "mbchar.h"
51 #include <locale.h>
52 #endif
/* How a token's spelling is obtained.  Tokens with SPELL_STRING store
   their spelling in the token list, and its length in the
   token->val.name.len.  */
enum spell_type
{
  SPELL_OPERATOR = 0,
  SPELL_CHAR,
  SPELL_IDENT,
  SPELL_STRING,
  SPELL_NONE
};
65 struct token_spelling
67 enum spell_type category;
68 const unsigned char *name;
71 const unsigned char *digraph_spellings [] = {U"%:", U"%:%:", U"<:",
72 U":>", U"<%", U"%>"};
74 #define OP(e, s) { SPELL_OPERATOR, U s },
75 #define TK(e, s) { s, U STRINGX (e) },
76 const struct token_spelling token_spellings [N_TTYPES] = {TTYPE_TABLE };
77 #undef OP
78 #undef TK
80 #define TOKEN_SPELL(token) (token_spellings[(token)->type].category)
81 #define TOKEN_NAME(token) (token_spellings[(token)->type].name)
83 static cppchar_t handle_newline PARAMS ((cpp_reader *, cppchar_t));
84 static cppchar_t skip_escaped_newlines PARAMS ((cpp_reader *, cppchar_t));
85 static cppchar_t get_effective_char PARAMS ((cpp_reader *));
87 static int skip_block_comment PARAMS ((cpp_reader *));
88 static int skip_line_comment PARAMS ((cpp_reader *));
89 static void adjust_column PARAMS ((cpp_reader *));
90 static void skip_whitespace PARAMS ((cpp_reader *, cppchar_t));
91 static cpp_hashnode *parse_identifier PARAMS ((cpp_reader *));
92 static cpp_hashnode *parse_identifier_slow PARAMS ((cpp_reader *,
93 const U_CHAR *));
94 static void parse_number PARAMS ((cpp_reader *, cpp_string *, cppchar_t, int));
95 static int unescaped_terminator_p PARAMS ((cpp_reader *, const U_CHAR *));
96 static void parse_string PARAMS ((cpp_reader *, cpp_token *, cppchar_t));
97 static void unterminated PARAMS ((cpp_reader *, int));
98 static int trigraph_ok PARAMS ((cpp_reader *, cppchar_t));
99 static void save_comment PARAMS ((cpp_reader *, cpp_token *, const U_CHAR *));
100 static void lex_percent PARAMS ((cpp_reader *, cpp_token *));
101 static void lex_dot PARAMS ((cpp_reader *, cpp_token *));
102 static int name_p PARAMS ((cpp_reader *, const cpp_string *));
103 static int maybe_read_ucs PARAMS ((cpp_reader *, const unsigned char **,
104 const unsigned char *, unsigned int *));
105 static cpp_token *lex_token PARAMS ((cpp_reader *, cpp_token *));
106 static tokenrun *next_tokenrun PARAMS ((tokenrun *));
108 static cpp_chunk *new_chunk PARAMS ((unsigned int));
109 static int chunk_suitable PARAMS ((cpp_pool *, cpp_chunk *, unsigned int));
110 static unsigned int hex_digit_value PARAMS ((unsigned int));
112 /* Utility routine:
114 Compares, the token TOKEN to the NUL-terminated string STRING.
115 TOKEN must be a CPP_NAME. Returns 1 for equal, 0 for unequal. */
118 cpp_ideq (token, string)
119 const cpp_token *token;
120 const char *string;
122 if (token->type != CPP_NAME)
123 return 0;
125 return !ustrcmp (NODE_NAME (token->val.node), (const U_CHAR *) string);
128 /* Call when meeting a newline. Returns the character after the newline
129 (or carriage-return newline combination), or EOF. */
130 static cppchar_t
131 handle_newline (pfile, newline_char)
132 cpp_reader *pfile;
133 cppchar_t newline_char;
135 cpp_buffer *buffer;
136 cppchar_t next = EOF;
138 pfile->line++;
139 buffer = pfile->buffer;
140 buffer->col_adjust = 0;
141 buffer->line_base = buffer->cur;
143 /* Handle CR-LF and LF-CR combinations, get the next character. */
144 if (buffer->cur < buffer->rlimit)
146 next = *buffer->cur++;
147 if (next + newline_char == '\r' + '\n')
149 buffer->line_base = buffer->cur;
150 if (buffer->cur < buffer->rlimit)
151 next = *buffer->cur++;
152 else
153 next = EOF;
157 buffer->read_ahead = next;
158 return next;
161 /* Subroutine of skip_escaped_newlines; called when a trigraph is
162 encountered. It warns if necessary, and returns true if the
163 trigraph should be honoured. FROM_CHAR is the third character of a
164 trigraph, and presumed to be the previous character for position
165 reporting. */
166 static int
167 trigraph_ok (pfile, from_char)
168 cpp_reader *pfile;
169 cppchar_t from_char;
171 int accept = CPP_OPTION (pfile, trigraphs);
173 /* Don't warn about trigraphs in comments. */
174 if (CPP_OPTION (pfile, warn_trigraphs) && !pfile->state.lexing_comment)
176 cpp_buffer *buffer = pfile->buffer;
178 if (accept)
179 cpp_warning_with_line (pfile, pfile->line, CPP_BUF_COL (buffer) - 2,
180 "trigraph ??%c converted to %c",
181 (int) from_char,
182 (int) _cpp_trigraph_map[from_char]);
183 else if (buffer->cur != buffer->last_Wtrigraphs)
185 buffer->last_Wtrigraphs = buffer->cur;
186 cpp_warning_with_line (pfile, pfile->line,
187 CPP_BUF_COL (buffer) - 2,
188 "trigraph ??%c ignored", (int) from_char);
192 return accept;
/* Set the type of the token being lexed to T and consume the
   read-ahead character.  Assumes local variables buffer and
   result.  */
#define ACCEPT_CHAR(t) \
  do { result->type = t; buffer->read_ahead = EOF; } while (0)

/* When we move to multibyte character sets, add to these something
   that saves and restores the state of the multibyte conversion
   library.  This probably involves saving and restoring a "cookie".
   In the case of glibc it is an 8-byte structure, so is not a high
   overhead operation.  In any case, it's out of the fast path.
   Both macros assume local variables buffer and saved_cur.  */
#define SAVE_STATE() do { saved_cur = buffer->cur; } while (0)
#define RESTORE_STATE() do { buffer->cur = saved_cur; } while (0)
207 /* Skips any escaped newlines introduced by NEXT, which is either a
208 '?' or a '\\'. Returns the next character, which will also have
209 been placed in buffer->read_ahead. This routine performs
210 preprocessing stages 1 and 2 of the ISO C standard. */
211 static cppchar_t
212 skip_escaped_newlines (pfile, next)
213 cpp_reader *pfile;
214 cppchar_t next;
216 cpp_buffer *buffer = pfile->buffer;
218 /* Only do this if we apply stages 1 and 2. */
219 if (!buffer->from_stage3)
221 cppchar_t next1;
222 const unsigned char *saved_cur;
223 int space;
227 if (buffer->cur == buffer->rlimit)
228 break;
230 SAVE_STATE ();
231 if (next == '?')
233 next1 = *buffer->cur++;
234 if (next1 != '?' || buffer->cur == buffer->rlimit)
236 RESTORE_STATE ();
237 break;
240 next1 = *buffer->cur++;
241 if (!_cpp_trigraph_map[next1]
242 || !trigraph_ok (pfile, next1))
244 RESTORE_STATE ();
245 break;
248 /* We have a full trigraph here. */
249 next = _cpp_trigraph_map[next1];
250 if (next != '\\' || buffer->cur == buffer->rlimit)
251 break;
252 SAVE_STATE ();
255 /* We have a backslash, and room for at least one more character. */
256 space = 0;
259 next1 = *buffer->cur++;
260 if (!is_nvspace (next1))
261 break;
262 space = 1;
264 while (buffer->cur < buffer->rlimit);
266 if (!is_vspace (next1))
268 RESTORE_STATE ();
269 break;
272 if (space && !pfile->state.lexing_comment)
273 cpp_warning (pfile, "backslash and newline separated by space");
275 next = handle_newline (pfile, next1);
276 if (next == EOF)
277 cpp_pedwarn (pfile, "backslash-newline at end of file");
279 while (next == '\\' || next == '?');
282 buffer->read_ahead = next;
283 return next;
286 /* Obtain the next character, after trigraph conversion and skipping
287 an arbitrary string of escaped newlines. The common case of no
288 trigraphs or escaped newlines falls through quickly. */
289 static cppchar_t
290 get_effective_char (pfile)
291 cpp_reader *pfile;
293 cpp_buffer *buffer = pfile->buffer;
294 cppchar_t next = EOF;
296 if (buffer->cur < buffer->rlimit)
298 next = *buffer->cur++;
300 /* '?' can introduce trigraphs (and therefore backslash); '\\'
301 can introduce escaped newlines, which we want to skip, or
302 UCNs, which, depending upon lexer state, we will handle in
303 the future. */
304 if (next == '?' || next == '\\')
305 next = skip_escaped_newlines (pfile, next);
308 buffer->read_ahead = next;
309 return next;
312 /* Skip a C-style block comment. We find the end of the comment by
313 seeing if an asterisk is before every '/' we encounter. Returns
314 non-zero if comment terminated by EOF, zero otherwise. */
315 static int
316 skip_block_comment (pfile)
317 cpp_reader *pfile;
319 cpp_buffer *buffer = pfile->buffer;
320 cppchar_t c = EOF, prevc = EOF;
322 pfile->state.lexing_comment = 1;
323 while (buffer->cur != buffer->rlimit)
325 prevc = c, c = *buffer->cur++;
327 next_char:
328 /* FIXME: For speed, create a new character class of characters
329 of interest inside block comments. */
330 if (c == '?' || c == '\\')
331 c = skip_escaped_newlines (pfile, c);
333 /* People like decorating comments with '*', so check for '/'
334 instead for efficiency. */
335 if (c == '/')
337 if (prevc == '*')
338 break;
340 /* Warn about potential nested comments, but not if the '/'
341 comes immediately before the true comment delimeter.
342 Don't bother to get it right across escaped newlines. */
343 if (CPP_OPTION (pfile, warn_comments)
344 && buffer->cur != buffer->rlimit)
346 prevc = c, c = *buffer->cur++;
347 if (c == '*' && buffer->cur != buffer->rlimit)
349 prevc = c, c = *buffer->cur++;
350 if (c != '/')
351 cpp_warning_with_line (pfile, pfile->line,
352 CPP_BUF_COL (buffer) - 2,
353 "\"/*\" within comment");
355 goto next_char;
358 else if (is_vspace (c))
360 prevc = c, c = handle_newline (pfile, c);
361 goto next_char;
363 else if (c == '\t')
364 adjust_column (pfile);
367 pfile->state.lexing_comment = 0;
368 buffer->read_ahead = EOF;
369 return c != '/' || prevc != '*';
372 /* Skip a C++ line comment. Handles escaped newlines. Returns
373 non-zero if a multiline comment. The following new line, if any,
374 is left in buffer->read_ahead. */
375 static int
376 skip_line_comment (pfile)
377 cpp_reader *pfile;
379 cpp_buffer *buffer = pfile->buffer;
380 unsigned int orig_line = pfile->line;
381 cppchar_t c;
383 pfile->state.lexing_comment = 1;
386 c = EOF;
387 if (buffer->cur == buffer->rlimit)
388 break;
390 c = *buffer->cur++;
391 if (c == '?' || c == '\\')
392 c = skip_escaped_newlines (pfile, c);
394 while (!is_vspace (c));
396 pfile->state.lexing_comment = 0;
397 buffer->read_ahead = c; /* Leave any newline for caller. */
398 return orig_line != pfile->line;
401 /* pfile->buffer->cur is one beyond the \t character. Update
402 col_adjust so we track the column correctly. */
403 static void
404 adjust_column (pfile)
405 cpp_reader *pfile;
407 cpp_buffer *buffer = pfile->buffer;
408 unsigned int col = CPP_BUF_COL (buffer) - 1; /* Zero-based column. */
410 /* Round it up to multiple of the tabstop, but subtract 1 since the
411 tab itself occupies a character position. */
412 buffer->col_adjust += (CPP_OPTION (pfile, tabstop)
413 - col % CPP_OPTION (pfile, tabstop)) - 1;
416 /* Skips whitespace, saving the next non-whitespace character.
417 Adjusts pfile->col_adjust to account for tabs. Without this,
418 tokens might be assigned an incorrect column. */
419 static void
420 skip_whitespace (pfile, c)
421 cpp_reader *pfile;
422 cppchar_t c;
424 cpp_buffer *buffer = pfile->buffer;
425 unsigned int warned = 0;
429 /* Horizontal space always OK. */
430 if (c == ' ')
432 else if (c == '\t')
433 adjust_column (pfile);
434 /* Just \f \v or \0 left. */
435 else if (c == '\0')
437 if (!warned)
439 cpp_warning (pfile, "null character(s) ignored");
440 warned = 1;
443 else if (pfile->state.in_directive && CPP_PEDANTIC (pfile))
444 cpp_pedwarn_with_line (pfile, pfile->line,
445 CPP_BUF_COL (buffer),
446 "%s in preprocessing directive",
447 c == '\f' ? "form feed" : "vertical tab");
449 c = EOF;
450 if (buffer->cur == buffer->rlimit)
451 break;
452 c = *buffer->cur++;
454 /* We only want non-vertical space, i.e. ' ' \t \f \v \0. */
455 while (is_nvspace (c));
457 /* Remember the next character. */
458 buffer->read_ahead = c;
461 /* See if the characters of a number token are valid in a name (no
462 '.', '+' or '-'). */
463 static int
464 name_p (pfile, string)
465 cpp_reader *pfile;
466 const cpp_string *string;
468 unsigned int i;
470 for (i = 0; i < string->len; i++)
471 if (!is_idchar (string->text[i]))
472 return 0;
474 return 1;
477 /* Parse an identifier, skipping embedded backslash-newlines. This is
478 a critical inner loop. The common case is an identifier which has
479 not been split by backslash-newline, does not contain a dollar
480 sign, and has already been scanned (roughly 10:1 ratio of
481 seen:unseen identifiers in normal code; the distribution is
482 Poisson-like). Second most common case is a new identifier, not
483 split and no dollar sign. The other possibilities are rare and
484 have been relegated to parse_identifier_slow. */
486 static cpp_hashnode *
487 parse_identifier (pfile)
488 cpp_reader *pfile;
490 cpp_hashnode *result;
491 const U_CHAR *cur, *rlimit;
493 /* Fast-path loop. Skim over a normal identifier.
494 N.B. ISIDNUM does not include $. */
495 cur = pfile->buffer->cur - 1;
496 rlimit = pfile->buffer->rlimit;
498 cur++;
499 while (cur < rlimit && ISIDNUM (*cur));
501 /* Check for slow-path cases. */
502 if (cur < rlimit && (*cur == '?' || *cur == '\\' || *cur == '$'))
503 result = parse_identifier_slow (pfile, cur);
504 else
506 const U_CHAR *base = pfile->buffer->cur - 1;
507 result = (cpp_hashnode *)
508 ht_lookup (pfile->hash_table, base, cur - base, HT_ALLOC);
509 pfile->buffer->cur = cur;
512 /* Rarely, identifiers require diagnostics when lexed.
513 XXX Has to be forced out of the fast path. */
514 if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
515 && !pfile->state.skipping, 0))
517 /* It is allowed to poison the same identifier twice. */
518 if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
519 cpp_error (pfile, "attempt to use poisoned \"%s\"",
520 NODE_NAME (result));
522 /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
523 replacement list of a variadic macro. */
524 if (result == pfile->spec_nodes.n__VA_ARGS__
525 && !pfile->state.va_args_ok)
526 cpp_pedwarn (pfile,
527 "__VA_ARGS__ can only appear in the expansion of a C99 variadic macro");
530 return result;
533 /* Slow path. This handles identifiers which have been split, and
534 identifiers which contain dollar signs. The part of the identifier
535 from PFILE->buffer->cur-1 to CUR has already been scanned. */
536 static cpp_hashnode *
537 parse_identifier_slow (pfile, cur)
538 cpp_reader *pfile;
539 const U_CHAR *cur;
541 cpp_buffer *buffer = pfile->buffer;
542 const U_CHAR *base = buffer->cur - 1;
543 struct obstack *stack = &pfile->hash_table->stack;
544 unsigned int c, saw_dollar = 0, len;
546 /* Copy the part of the token which is known to be okay. */
547 obstack_grow (stack, base, cur - base);
549 /* Now process the part which isn't. We are looking at one of
550 '$', '\\', or '?' on entry to this loop. */
551 c = *cur++;
552 buffer->cur = cur;
555 while (is_idchar (c))
557 obstack_1grow (stack, c);
559 if (c == '$')
560 saw_dollar++;
562 c = EOF;
563 if (buffer->cur == buffer->rlimit)
564 break;
566 c = *buffer->cur++;
569 /* Potential escaped newline? */
570 if (c != '?' && c != '\\')
571 break;
572 c = skip_escaped_newlines (pfile, c);
574 while (is_idchar (c));
576 /* Remember the next character. */
577 buffer->read_ahead = c;
579 /* $ is not a identifier character in the standard, but is commonly
580 accepted as an extension. Don't warn about it in skipped
581 conditional blocks. */
582 if (saw_dollar && CPP_PEDANTIC (pfile) && ! pfile->state.skipping)
583 cpp_pedwarn (pfile, "'$' character(s) in identifier");
585 /* Identifiers are null-terminated. */
586 len = obstack_object_size (stack);
587 obstack_1grow (stack, '\0');
589 return (cpp_hashnode *)
590 ht_lookup (pfile->hash_table, obstack_finish (stack), len, HT_ALLOCED);
593 /* Parse a number, skipping embedded backslash-newlines. */
594 static void
595 parse_number (pfile, number, c, leading_period)
596 cpp_reader *pfile;
597 cpp_string *number;
598 cppchar_t c;
599 int leading_period;
601 cpp_buffer *buffer = pfile->buffer;
602 cpp_pool *pool = &pfile->ident_pool;
603 unsigned char *dest, *limit;
605 dest = POOL_FRONT (pool);
606 limit = POOL_LIMIT (pool);
608 /* Place a leading period. */
609 if (leading_period)
611 if (dest >= limit)
612 limit = _cpp_next_chunk (pool, 0, &dest);
613 *dest++ = '.';
620 /* Need room for terminating null. */
621 if (dest + 1 >= limit)
622 limit = _cpp_next_chunk (pool, 0, &dest);
623 *dest++ = c;
625 c = EOF;
626 if (buffer->cur == buffer->rlimit)
627 break;
629 c = *buffer->cur++;
631 while (is_numchar (c) || c == '.' || VALID_SIGN (c, dest[-1]));
633 /* Potential escaped newline? */
634 if (c != '?' && c != '\\')
635 break;
636 c = skip_escaped_newlines (pfile, c);
638 while (is_numchar (c) || c == '.' || VALID_SIGN (c, dest[-1]));
640 /* Remember the next character. */
641 buffer->read_ahead = c;
643 /* Null-terminate the number. */
644 *dest = '\0';
646 number->text = POOL_FRONT (pool);
647 number->len = dest - number->text;
648 POOL_COMMIT (pool, number->len + 1);
651 /* Subroutine of parse_string. Emits error for unterminated strings. */
652 static void
653 unterminated (pfile, term)
654 cpp_reader *pfile;
655 int term;
657 cpp_error (pfile, "missing terminating %c character", term);
659 if (term == '\"' && pfile->mlstring_pos.line
660 && pfile->mlstring_pos.line != pfile->lexer_pos.line)
662 cpp_error_with_line (pfile, pfile->mlstring_pos.line,
663 pfile->mlstring_pos.col,
664 "possible start of unterminated string literal");
665 pfile->mlstring_pos.line = 0;
669 /* Subroutine of parse_string. */
670 static int
671 unescaped_terminator_p (pfile, dest)
672 cpp_reader *pfile;
673 const unsigned char *dest;
675 const unsigned char *start, *temp;
677 /* In #include-style directives, terminators are not escapeable. */
678 if (pfile->state.angled_headers)
679 return 1;
681 start = POOL_FRONT (&pfile->ident_pool);
683 /* An odd number of consecutive backslashes represents an escaped
684 terminator. */
685 for (temp = dest; temp > start && temp[-1] == '\\'; temp--)
688 return ((dest - temp) & 1) == 0;
691 /* Parses a string, character constant, or angle-bracketed header file
692 name. Handles embedded trigraphs and escaped newlines. The stored
693 string is guaranteed NUL-terminated, but it is not guaranteed that
694 this is the first NUL since embedded NULs are preserved.
696 Multi-line strings are allowed, but they are deprecated. */
697 static void
698 parse_string (pfile, token, terminator)
699 cpp_reader *pfile;
700 cpp_token *token;
701 cppchar_t terminator;
703 cpp_buffer *buffer = pfile->buffer;
704 cpp_pool *pool = &pfile->ident_pool;
705 unsigned char *dest, *limit;
706 cppchar_t c;
707 bool warned_nulls = false, warned_multi = false;
709 dest = POOL_FRONT (pool);
710 limit = POOL_LIMIT (pool);
712 for (;;)
714 if (buffer->cur == buffer->rlimit)
715 c = EOF;
716 else
717 c = *buffer->cur++;
719 have_char:
720 /* We need space for the terminating NUL. */
721 if (dest >= limit)
722 limit = _cpp_next_chunk (pool, 0, &dest);
724 if (c == EOF)
726 unterminated (pfile, terminator);
727 break;
730 /* Handle trigraphs, escaped newlines etc. */
731 if (c == '?' || c == '\\')
732 c = skip_escaped_newlines (pfile, c);
734 if (c == terminator && unescaped_terminator_p (pfile, dest))
736 c = EOF;
737 break;
739 else if (is_vspace (c))
741 /* In assembly language, silently terminate string and
742 character literals at end of line. This is a kludge
743 around not knowing where comments are. */
744 if (CPP_OPTION (pfile, lang) == CLK_ASM && terminator != '>')
745 break;
747 /* Character constants and header names may not extend over
748 multiple lines. In Standard C, neither may strings.
749 Unfortunately, we accept multiline strings as an
750 extension, except in #include family directives. */
751 if (terminator != '"' || pfile->state.angled_headers)
753 unterminated (pfile, terminator);
754 break;
757 if (!warned_multi)
759 warned_multi = true;
760 cpp_pedwarn (pfile, "multi-line string literals are deprecated");
763 if (pfile->mlstring_pos.line == 0)
764 pfile->mlstring_pos = pfile->lexer_pos;
766 c = handle_newline (pfile, c);
767 *dest++ = '\n';
768 goto have_char;
770 else if (c == '\0' && !warned_nulls)
772 warned_nulls = true;
773 cpp_warning (pfile, "null character(s) preserved in literal");
776 *dest++ = c;
779 /* Remember the next character. */
780 buffer->read_ahead = c;
781 *dest = '\0';
783 token->val.str.text = POOL_FRONT (pool);
784 token->val.str.len = dest - token->val.str.text;
785 POOL_COMMIT (pool, token->val.str.len + 1);
788 /* The stored comment includes the comment start and any terminator. */
789 static void
790 save_comment (pfile, token, from)
791 cpp_reader *pfile;
792 cpp_token *token;
793 const unsigned char *from;
795 unsigned char *buffer;
796 unsigned int len;
798 len = pfile->buffer->cur - from + 1; /* + 1 for the initial '/'. */
799 /* C++ comments probably (not definitely) have moved past a new
800 line, which we don't want to save in the comment. */
801 if (pfile->buffer->read_ahead != EOF)
802 len--;
803 buffer = _cpp_pool_alloc (&pfile->ident_pool, len);
805 token->type = CPP_COMMENT;
806 token->val.str.len = len;
807 token->val.str.text = buffer;
809 buffer[0] = '/';
810 memcpy (buffer + 1, from, len - 1);
813 /* Subroutine of lex_token to handle '%'. A little tricky, since we
814 want to avoid stepping back when lexing %:%X. */
815 static void
816 lex_percent (pfile, result)
817 cpp_reader *pfile;
818 cpp_token *result;
820 cpp_buffer *buffer= pfile->buffer;
821 cppchar_t c;
823 result->type = CPP_MOD;
824 /* Parsing %:%X could leave an extra character. */
825 if (buffer->extra_char == EOF)
826 c = get_effective_char (pfile);
827 else
829 c = buffer->read_ahead = buffer->extra_char;
830 buffer->extra_char = EOF;
833 if (c == '=')
834 ACCEPT_CHAR (CPP_MOD_EQ);
835 else if (CPP_OPTION (pfile, digraphs))
837 if (c == ':')
839 result->flags |= DIGRAPH;
840 ACCEPT_CHAR (CPP_HASH);
841 if (get_effective_char (pfile) == '%')
843 buffer->extra_char = get_effective_char (pfile);
844 if (buffer->extra_char == ':')
846 buffer->extra_char = EOF;
847 ACCEPT_CHAR (CPP_PASTE);
849 else
850 /* We'll catch the extra_char when we're called back. */
851 buffer->read_ahead = '%';
854 else if (c == '>')
856 result->flags |= DIGRAPH;
857 ACCEPT_CHAR (CPP_CLOSE_BRACE);
862 /* Subroutine of lex_token to handle '.'. This is tricky, since we
863 want to avoid stepping back when lexing '...' or '.123'. In the
864 latter case we should also set a flag for parse_number. */
865 static void
866 lex_dot (pfile, result)
867 cpp_reader *pfile;
868 cpp_token *result;
870 cpp_buffer *buffer = pfile->buffer;
871 cppchar_t c;
873 /* Parsing ..X could leave an extra character. */
874 if (buffer->extra_char == EOF)
875 c = get_effective_char (pfile);
876 else
878 c = buffer->read_ahead = buffer->extra_char;
879 buffer->extra_char = EOF;
882 /* All known character sets have 0...9 contiguous. */
883 if (c >= '0' && c <= '9')
885 result->type = CPP_NUMBER;
886 parse_number (pfile, &result->val.str, c, 1);
888 else
890 result->type = CPP_DOT;
891 if (c == '.')
893 buffer->extra_char = get_effective_char (pfile);
894 if (buffer->extra_char == '.')
896 buffer->extra_char = EOF;
897 ACCEPT_CHAR (CPP_ELLIPSIS);
899 else
900 /* We'll catch the extra_char when we're called back. */
901 buffer->read_ahead = '.';
903 else if (c == '*' && CPP_OPTION (pfile, cplusplus))
904 ACCEPT_CHAR (CPP_DOT_STAR);
908 /* Allocate COUNT tokens for RUN. */
909 void
910 _cpp_init_tokenrun (run, count)
911 tokenrun *run;
912 unsigned int count;
914 run->base = xnewvec (cpp_token, count);
915 run->limit = run->base + count;
916 run->next = NULL;
919 /* Returns the next tokenrun, or creates one if there is none. */
920 static tokenrun *
921 next_tokenrun (run)
922 tokenrun *run;
924 if (run->next == NULL)
926 run->next = xnew (tokenrun);
927 run->next->prev = run;
928 _cpp_init_tokenrun (run->next, 250);
931 return run->next;
934 /* Lex a token into RESULT (external interface). */
935 void
936 _cpp_lex_token (pfile, dest)
937 cpp_reader *pfile;
938 cpp_token *dest;
940 cpp_token *result;
942 for (;;)
944 if (pfile->cur_token == pfile->cur_run->limit)
946 pfile->cur_run = next_tokenrun (pfile->cur_run);
947 pfile->cur_token = pfile->cur_run->base;
949 result = pfile->cur_token++;
951 if (pfile->lookaheads)
952 pfile->lookaheads--;
953 else
954 result = lex_token (pfile, result);
956 if (result->flags & BOL)
958 pfile->lexer_pos.output_line = result->line;
959 /* Is this a directive. If _cpp_handle_directive returns
960 false, it is an assembler #. */
961 if (result->type == CPP_HASH
962 && !pfile->state.parsing_args
963 && _cpp_handle_directive (pfile, result->flags & PREV_WHITE))
964 continue;
967 /* We don't skip tokens in directives. */
968 if (pfile->state.in_directive)
969 break;
971 /* Outside a directive, invalidate controlling macros. At file
972 EOF, lex_token takes care of popping the buffer, so we never
973 get here and MI optimisation works. */
974 pfile->mi_valid = false;
976 if (!pfile->state.skipping || result->type == CPP_EOF)
977 break;
980 *dest = *result;
983 /* Lex a token into RESULT. When meeting a newline, returns CPP_EOF
984 if parsing a directive, otherwise returns to the start of the token
985 buffer if permissible. Returns the location of the lexed token. */
986 static cpp_token *
987 lex_token (pfile, result)
988 cpp_reader *pfile;
989 cpp_token *result;
991 cppchar_t c;
992 cpp_buffer *buffer;
993 const unsigned char *comment_start;
995 fresh_line:
996 buffer = pfile->buffer;
997 result->flags = buffer->saved_flags;
998 buffer->saved_flags = 0;
999 update_tokens_line:
1000 pfile->lexer_pos.line = pfile->line;
1001 result->line = pfile->line;
1003 skipped_white:
1004 c = buffer->read_ahead;
1005 if (c == EOF && buffer->cur < buffer->rlimit)
1006 c = *buffer->cur++;
1007 result->col = CPP_BUF_COLUMN (buffer, buffer->cur);
1008 pfile->lexer_pos.col = result->col;
1009 buffer->read_ahead = EOF;
1011 trigraph:
1012 switch (c)
1014 case EOF:
1015 buffer->saved_flags = BOL;
1016 if (!pfile->state.parsing_args && !pfile->state.in_directive)
1018 if (buffer->cur != buffer->line_base)
1020 /* Non-empty files should end in a newline. Don't warn
1021 for command line and _Pragma buffers. */
1022 if (!buffer->from_stage3)
1023 cpp_pedwarn (pfile, "no newline at end of file");
1024 handle_newline (pfile, '\n');
1027 /* Don't pop the last buffer. */
1028 if (buffer->prev)
1030 unsigned char stop = buffer->return_at_eof;
1032 _cpp_pop_buffer (pfile);
1033 if (!stop)
1034 goto fresh_line;
1037 result->type = CPP_EOF;
1038 break;
1040 case ' ': case '\t': case '\f': case '\v': case '\0':
1041 skip_whitespace (pfile, c);
1042 result->flags |= PREV_WHITE;
1043 goto skipped_white;
1045 case '\n': case '\r':
1046 handle_newline (pfile, c);
1047 buffer->saved_flags = BOL;
1048 if (! pfile->state.in_directive)
1050 if (!pfile->keep_tokens)
1052 pfile->cur_run = &pfile->base_run;
1053 result = pfile->base_run.base;
1054 pfile->cur_token = result + 1;
1056 goto fresh_line;
1058 result->type = CPP_EOF;
1059 break;
1061 case '?':
1062 case '\\':
1063 /* These could start an escaped newline, or '?' a trigraph. Let
1064 skip_escaped_newlines do all the work. */
1066 unsigned int line = pfile->line;
1068 c = skip_escaped_newlines (pfile, c);
1069 if (line != pfile->line)
1070 /* We had at least one escaped newline of some sort, and the
1071 next character is in buffer->read_ahead. Update the
1072 token's line and column. */
1073 goto update_tokens_line;
1075 /* We are either the original '?' or '\\', or a trigraph. */
1076 result->type = CPP_QUERY;
1077 buffer->read_ahead = EOF;
1078 if (c == '\\')
1079 goto random_char;
1080 else if (c != '?')
1081 goto trigraph;
1083 break;
1085 case '0': case '1': case '2': case '3': case '4':
1086 case '5': case '6': case '7': case '8': case '9':
1087 result->type = CPP_NUMBER;
1088 parse_number (pfile, &result->val.str, c, 0);
1089 break;
1091 case '$':
1092 if (!CPP_OPTION (pfile, dollars_in_ident))
1093 goto random_char;
1094 /* Fall through... */
1096 case '_':
1097 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
1098 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
1099 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
1100 case 's': case 't': case 'u': case 'v': case 'w': case 'x':
1101 case 'y': case 'z':
1102 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
1103 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
1104 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
1105 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
1106 case 'Y': case 'Z':
1107 result->type = CPP_NAME;
1108 result->val.node = parse_identifier (pfile);
1110 /* 'L' may introduce wide characters or strings. */
1111 if (result->val.node == pfile->spec_nodes.n_L)
1113 c = buffer->read_ahead;
1114 if (c == EOF && buffer->cur < buffer->rlimit)
1115 c = *buffer->cur;
1116 if (c == '\'' || c == '"')
1118 buffer->cur++;
1119 ACCEPT_CHAR (c == '"' ? CPP_WSTRING: CPP_WCHAR);
1120 goto make_string;
1123 /* Convert named operators to their proper types. */
1124 else if (result->val.node->flags & NODE_OPERATOR)
1126 result->flags |= NAMED_OP;
1127 result->type = result->val.node->value.operator;
1129 break;
1131 case '\'':
1132 case '"':
1133 result->type = c == '"' ? CPP_STRING: CPP_CHAR;
1134 make_string:
1135 parse_string (pfile, result, c);
1136 break;
1138 case '/':
1139 /* A potential block or line comment. */
1140 comment_start = buffer->cur;
1141 result->type = CPP_DIV;
1142 c = get_effective_char (pfile);
1143 if (c == '=')
1144 ACCEPT_CHAR (CPP_DIV_EQ);
1145 if (c != '/' && c != '*')
1146 break;
1148 if (c == '*')
1150 if (skip_block_comment (pfile))
1151 cpp_error (pfile, "unterminated comment");
1153 else
1155 if (!CPP_OPTION (pfile, cplusplus_comments)
1156 && !CPP_IN_SYSTEM_HEADER (pfile))
1157 break;
1159 /* Warn about comments only if pedantically GNUC89, and not
1160 in system headers. */
1161 if (CPP_OPTION (pfile, lang) == CLK_GNUC89 && CPP_PEDANTIC (pfile)
1162 && ! buffer->warned_cplusplus_comments)
1164 cpp_pedwarn (pfile,
1165 "C++ style comments are not allowed in ISO C89");
1166 cpp_pedwarn (pfile,
1167 "(this will be reported only once per input file)");
1168 buffer->warned_cplusplus_comments = 1;
1171 /* Skip_line_comment updates buffer->read_ahead. */
1172 if (skip_line_comment (pfile) && CPP_OPTION (pfile, warn_comments))
1173 cpp_warning_with_line (pfile, pfile->lexer_pos.line,
1174 pfile->lexer_pos.col,
1175 "multi-line comment");
1178 /* Skipping the comment has updated buffer->read_ahead. */
1179 if (!pfile->state.save_comments)
1181 result->flags |= PREV_WHITE;
1182 goto update_tokens_line;
1185 /* Save the comment as a token in its own right. */
1186 save_comment (pfile, result, comment_start);
1187 /* Don't do MI optimisation. */
1188 break;
1190 case '<':
1191 if (pfile->state.angled_headers)
1193 result->type = CPP_HEADER_NAME;
1194 c = '>'; /* terminator. */
1195 goto make_string;
1198 result->type = CPP_LESS;
1199 c = get_effective_char (pfile);
1200 if (c == '=')
1201 ACCEPT_CHAR (CPP_LESS_EQ);
1202 else if (c == '<')
1204 ACCEPT_CHAR (CPP_LSHIFT);
1205 if (get_effective_char (pfile) == '=')
1206 ACCEPT_CHAR (CPP_LSHIFT_EQ);
1208 else if (c == '?' && CPP_OPTION (pfile, cplusplus))
1210 ACCEPT_CHAR (CPP_MIN);
1211 if (get_effective_char (pfile) == '=')
1212 ACCEPT_CHAR (CPP_MIN_EQ);
1214 else if (c == ':' && CPP_OPTION (pfile, digraphs))
1216 ACCEPT_CHAR (CPP_OPEN_SQUARE);
1217 result->flags |= DIGRAPH;
1219 else if (c == '%' && CPP_OPTION (pfile, digraphs))
1221 ACCEPT_CHAR (CPP_OPEN_BRACE);
1222 result->flags |= DIGRAPH;
1224 break;
1226 case '>':
1227 result->type = CPP_GREATER;
1228 c = get_effective_char (pfile);
1229 if (c == '=')
1230 ACCEPT_CHAR (CPP_GREATER_EQ);
1231 else if (c == '>')
1233 ACCEPT_CHAR (CPP_RSHIFT);
1234 if (get_effective_char (pfile) == '=')
1235 ACCEPT_CHAR (CPP_RSHIFT_EQ);
1237 else if (c == '?' && CPP_OPTION (pfile, cplusplus))
1239 ACCEPT_CHAR (CPP_MAX);
1240 if (get_effective_char (pfile) == '=')
1241 ACCEPT_CHAR (CPP_MAX_EQ);
1243 break;
1245 case '%':
1246 lex_percent (pfile, result);
1247 break;
1249 case '.':
1250 lex_dot (pfile, result);
1251 break;
1253 case '+':
1254 result->type = CPP_PLUS;
1255 c = get_effective_char (pfile);
1256 if (c == '=')
1257 ACCEPT_CHAR (CPP_PLUS_EQ);
1258 else if (c == '+')
1259 ACCEPT_CHAR (CPP_PLUS_PLUS);
1260 break;
1262 case '-':
1263 result->type = CPP_MINUS;
1264 c = get_effective_char (pfile);
1265 if (c == '>')
1267 ACCEPT_CHAR (CPP_DEREF);
1268 if (CPP_OPTION (pfile, cplusplus)
1269 && get_effective_char (pfile) == '*')
1270 ACCEPT_CHAR (CPP_DEREF_STAR);
1272 else if (c == '=')
1273 ACCEPT_CHAR (CPP_MINUS_EQ);
1274 else if (c == '-')
1275 ACCEPT_CHAR (CPP_MINUS_MINUS);
1276 break;
1278 case '*':
1279 result->type = CPP_MULT;
1280 if (get_effective_char (pfile) == '=')
1281 ACCEPT_CHAR (CPP_MULT_EQ);
1282 break;
1284 case '=':
1285 result->type = CPP_EQ;
1286 if (get_effective_char (pfile) == '=')
1287 ACCEPT_CHAR (CPP_EQ_EQ);
1288 break;
1290 case '!':
1291 result->type = CPP_NOT;
1292 if (get_effective_char (pfile) == '=')
1293 ACCEPT_CHAR (CPP_NOT_EQ);
1294 break;
1296 case '&':
1297 result->type = CPP_AND;
1298 c = get_effective_char (pfile);
1299 if (c == '=')
1300 ACCEPT_CHAR (CPP_AND_EQ);
1301 else if (c == '&')
1302 ACCEPT_CHAR (CPP_AND_AND);
1303 break;
1305 case '#':
1306 result->type = CPP_HASH;
1307 if (get_effective_char (pfile) == '#')
1308 ACCEPT_CHAR (CPP_PASTE);
1309 break;
1311 case '|':
1312 result->type = CPP_OR;
1313 c = get_effective_char (pfile);
1314 if (c == '=')
1315 ACCEPT_CHAR (CPP_OR_EQ);
1316 else if (c == '|')
1317 ACCEPT_CHAR (CPP_OR_OR);
1318 break;
1320 case '^':
1321 result->type = CPP_XOR;
1322 if (get_effective_char (pfile) == '=')
1323 ACCEPT_CHAR (CPP_XOR_EQ);
1324 break;
1326 case ':':
1327 result->type = CPP_COLON;
1328 c = get_effective_char (pfile);
1329 if (c == ':' && CPP_OPTION (pfile, cplusplus))
1330 ACCEPT_CHAR (CPP_SCOPE);
1331 else if (c == '>' && CPP_OPTION (pfile, digraphs))
1333 result->flags |= DIGRAPH;
1334 ACCEPT_CHAR (CPP_CLOSE_SQUARE);
1336 break;
1338 case '~': result->type = CPP_COMPL; break;
1339 case ',': result->type = CPP_COMMA; break;
1340 case '(': result->type = CPP_OPEN_PAREN; break;
1341 case ')': result->type = CPP_CLOSE_PAREN; break;
1342 case '[': result->type = CPP_OPEN_SQUARE; break;
1343 case ']': result->type = CPP_CLOSE_SQUARE; break;
1344 case '{': result->type = CPP_OPEN_BRACE; break;
1345 case '}': result->type = CPP_CLOSE_BRACE; break;
1346 case ';': result->type = CPP_SEMICOLON; break;
1348 /* @ is a punctuator in Objective C. */
1349 case '@': result->type = CPP_ATSIGN; break;
1351 random_char:
1352 default:
1353 result->type = CPP_OTHER;
1354 result->val.c = c;
1355 break;
1358 return result;
1361 /* An upper bound on the number of bytes needed to spell a token,
1362 including preceding whitespace. */
1363 unsigned int
1364 cpp_token_len (token)
1365 const cpp_token *token;
1367 unsigned int len;
1369 switch (TOKEN_SPELL (token))
1371 default: len = 0; break;
1372 case SPELL_STRING: len = token->val.str.len; break;
1373 case SPELL_IDENT: len = NODE_LEN (token->val.node); break;
1375 /* 1 for whitespace, 4 for comment delimeters. */
1376 return len + 5;
1379 /* Write the spelling of a token TOKEN to BUFFER. The buffer must
1380 already contain the enough space to hold the token's spelling.
1381 Returns a pointer to the character after the last character
1382 written. */
1383 unsigned char *
1384 cpp_spell_token (pfile, token, buffer)
1385 cpp_reader *pfile; /* Would be nice to be rid of this... */
1386 const cpp_token *token;
1387 unsigned char *buffer;
1389 switch (TOKEN_SPELL (token))
1391 case SPELL_OPERATOR:
1393 const unsigned char *spelling;
1394 unsigned char c;
1396 if (token->flags & DIGRAPH)
1397 spelling
1398 = digraph_spellings[(int) token->type - (int) CPP_FIRST_DIGRAPH];
1399 else if (token->flags & NAMED_OP)
1400 goto spell_ident;
1401 else
1402 spelling = TOKEN_NAME (token);
1404 while ((c = *spelling++) != '\0')
1405 *buffer++ = c;
1407 break;
1409 case SPELL_IDENT:
1410 spell_ident:
1411 memcpy (buffer, NODE_NAME (token->val.node), NODE_LEN (token->val.node));
1412 buffer += NODE_LEN (token->val.node);
1413 break;
1415 case SPELL_STRING:
1417 int left, right, tag;
1418 switch (token->type)
1420 case CPP_STRING: left = '"'; right = '"'; tag = '\0'; break;
1421 case CPP_WSTRING: left = '"'; right = '"'; tag = 'L'; break;
1422 case CPP_CHAR: left = '\''; right = '\''; tag = '\0'; break;
1423 case CPP_WCHAR: left = '\''; right = '\''; tag = 'L'; break;
1424 case CPP_HEADER_NAME: left = '<'; right = '>'; tag = '\0'; break;
1425 default: left = '\0'; right = '\0'; tag = '\0'; break;
1427 if (tag) *buffer++ = tag;
1428 if (left) *buffer++ = left;
1429 memcpy (buffer, token->val.str.text, token->val.str.len);
1430 buffer += token->val.str.len;
1431 if (right) *buffer++ = right;
1433 break;
1435 case SPELL_CHAR:
1436 *buffer++ = token->val.c;
1437 break;
1439 case SPELL_NONE:
1440 cpp_ice (pfile, "Unspellable token %s", TOKEN_NAME (token));
1441 break;
1444 return buffer;
1447 /* Returns a token as a null-terminated string. The string is
1448 temporary, and automatically freed later. Useful for diagnostics. */
1449 unsigned char *
1450 cpp_token_as_text (pfile, token)
1451 cpp_reader *pfile;
1452 const cpp_token *token;
1454 unsigned int len = cpp_token_len (token);
1455 unsigned char *start = _cpp_pool_alloc (&pfile->ident_pool, len), *end;
1457 end = cpp_spell_token (pfile, token, start);
1458 end[0] = '\0';
1460 return start;
1463 /* Used by C front ends. Should really move to using cpp_token_as_text. */
1464 const char *
1465 cpp_type2name (type)
1466 enum cpp_ttype type;
1468 return (const char *) token_spellings[type].name;
1471 /* Writes the spelling of token to FP. Separate from cpp_spell_token
1472 for efficiency - to avoid double-buffering. Also, outputs a space
1473 if PREV_WHITE is flagged. */
1474 void
1475 cpp_output_token (token, fp)
1476 const cpp_token *token;
1477 FILE *fp;
1479 if (token->flags & PREV_WHITE)
1480 putc (' ', fp);
1482 switch (TOKEN_SPELL (token))
1484 case SPELL_OPERATOR:
1486 const unsigned char *spelling;
1488 if (token->flags & DIGRAPH)
1489 spelling
1490 = digraph_spellings[(int) token->type - (int) CPP_FIRST_DIGRAPH];
1491 else if (token->flags & NAMED_OP)
1492 goto spell_ident;
1493 else
1494 spelling = TOKEN_NAME (token);
1496 ufputs (spelling, fp);
1498 break;
1500 spell_ident:
1501 case SPELL_IDENT:
1502 ufputs (NODE_NAME (token->val.node), fp);
1503 break;
1505 case SPELL_STRING:
1507 int left, right, tag;
1508 switch (token->type)
1510 case CPP_STRING: left = '"'; right = '"'; tag = '\0'; break;
1511 case CPP_WSTRING: left = '"'; right = '"'; tag = 'L'; break;
1512 case CPP_CHAR: left = '\''; right = '\''; tag = '\0'; break;
1513 case CPP_WCHAR: left = '\''; right = '\''; tag = 'L'; break;
1514 case CPP_HEADER_NAME: left = '<'; right = '>'; tag = '\0'; break;
1515 default: left = '\0'; right = '\0'; tag = '\0'; break;
1517 if (tag) putc (tag, fp);
1518 if (left) putc (left, fp);
1519 fwrite (token->val.str.text, 1, token->val.str.len, fp);
1520 if (right) putc (right, fp);
1522 break;
1524 case SPELL_CHAR:
1525 putc (token->val.c, fp);
1526 break;
1528 case SPELL_NONE:
1529 /* An error, most probably. */
1530 break;
1534 /* Compare two tokens. */
1536 _cpp_equiv_tokens (a, b)
1537 const cpp_token *a, *b;
1539 if (a->type == b->type && a->flags == b->flags)
1540 switch (TOKEN_SPELL (a))
1542 default: /* Keep compiler happy. */
1543 case SPELL_OPERATOR:
1544 return 1;
1545 case SPELL_CHAR:
1546 return a->val.c == b->val.c; /* Character. */
1547 case SPELL_NONE:
1548 return (a->type != CPP_MACRO_ARG || a->val.arg_no == b->val.arg_no);
1549 case SPELL_IDENT:
1550 return a->val.node == b->val.node;
1551 case SPELL_STRING:
1552 return (a->val.str.len == b->val.str.len
1553 && !memcmp (a->val.str.text, b->val.str.text,
1554 a->val.str.len));
1557 return 0;
1560 /* Determine whether two tokens can be pasted together, and if so,
1561 what the resulting token is. Returns CPP_EOF if the tokens cannot
1562 be pasted, or the appropriate type for the merged token if they
1563 can. */
1564 enum cpp_ttype
1565 cpp_can_paste (pfile, token1, token2, digraph)
1566 cpp_reader * pfile;
1567 const cpp_token *token1, *token2;
1568 int* digraph;
1570 enum cpp_ttype a = token1->type, b = token2->type;
1571 int cxx = CPP_OPTION (pfile, cplusplus);
1573 /* Treat named operators as if they were ordinary NAMEs. */
1574 if (token1->flags & NAMED_OP)
1575 a = CPP_NAME;
1576 if (token2->flags & NAMED_OP)
1577 b = CPP_NAME;
1579 if ((int) a <= (int) CPP_LAST_EQ && b == CPP_EQ)
1580 return (enum cpp_ttype) ((int) a + ((int) CPP_EQ_EQ - (int) CPP_EQ));
1582 switch (a)
1584 case CPP_GREATER:
1585 if (b == a) return CPP_RSHIFT;
1586 if (b == CPP_QUERY && cxx) return CPP_MAX;
1587 if (b == CPP_GREATER_EQ) return CPP_RSHIFT_EQ;
1588 break;
1589 case CPP_LESS:
1590 if (b == a) return CPP_LSHIFT;
1591 if (b == CPP_QUERY && cxx) return CPP_MIN;
1592 if (b == CPP_LESS_EQ) return CPP_LSHIFT_EQ;
1593 if (CPP_OPTION (pfile, digraphs))
1595 if (b == CPP_COLON)
1596 {*digraph = 1; return CPP_OPEN_SQUARE;} /* <: digraph */
1597 if (b == CPP_MOD)
1598 {*digraph = 1; return CPP_OPEN_BRACE;} /* <% digraph */
1600 break;
1602 case CPP_PLUS: if (b == a) return CPP_PLUS_PLUS; break;
1603 case CPP_AND: if (b == a) return CPP_AND_AND; break;
1604 case CPP_OR: if (b == a) return CPP_OR_OR; break;
1606 case CPP_MINUS:
1607 if (b == a) return CPP_MINUS_MINUS;
1608 if (b == CPP_GREATER) return CPP_DEREF;
1609 break;
1610 case CPP_COLON:
1611 if (b == a && cxx) return CPP_SCOPE;
1612 if (b == CPP_GREATER && CPP_OPTION (pfile, digraphs))
1613 {*digraph = 1; return CPP_CLOSE_SQUARE;} /* :> digraph */
1614 break;
1616 case CPP_MOD:
1617 if (CPP_OPTION (pfile, digraphs))
1619 if (b == CPP_GREATER)
1620 {*digraph = 1; return CPP_CLOSE_BRACE;} /* %> digraph */
1621 if (b == CPP_COLON)
1622 {*digraph = 1; return CPP_HASH;} /* %: digraph */
1624 break;
1625 case CPP_DEREF:
1626 if (b == CPP_MULT && cxx) return CPP_DEREF_STAR;
1627 break;
1628 case CPP_DOT:
1629 if (b == CPP_MULT && cxx) return CPP_DOT_STAR;
1630 if (b == CPP_NUMBER) return CPP_NUMBER;
1631 break;
1633 case CPP_HASH:
1634 if (b == a && (token1->flags & DIGRAPH) == (token2->flags & DIGRAPH))
1635 /* %:%: digraph */
1636 {*digraph = (token1->flags & DIGRAPH); return CPP_PASTE;}
1637 break;
1639 case CPP_NAME:
1640 if (b == CPP_NAME) return CPP_NAME;
1641 if (b == CPP_NUMBER
1642 && name_p (pfile, &token2->val.str)) return CPP_NAME;
1643 if (b == CPP_CHAR
1644 && token1->val.node == pfile->spec_nodes.n_L) return CPP_WCHAR;
1645 if (b == CPP_STRING
1646 && token1->val.node == pfile->spec_nodes.n_L) return CPP_WSTRING;
1647 break;
1649 case CPP_NUMBER:
1650 if (b == CPP_NUMBER) return CPP_NUMBER;
1651 if (b == CPP_NAME) return CPP_NUMBER;
1652 if (b == CPP_DOT) return CPP_NUMBER;
1653 /* Numbers cannot have length zero, so this is safe. */
1654 if ((b == CPP_PLUS || b == CPP_MINUS)
1655 && VALID_SIGN ('+', token1->val.str.text[token1->val.str.len - 1]))
1656 return CPP_NUMBER;
1657 break;
1659 default:
1660 break;
1663 return CPP_EOF;
1666 /* Returns nonzero if a space should be inserted to avoid an
1667 accidental token paste for output. For simplicity, it is
1668 conservative, and occasionally advises a space where one is not
1669 needed, e.g. "." and ".2". */
1672 cpp_avoid_paste (pfile, token1, token2)
1673 cpp_reader *pfile;
1674 const cpp_token *token1, *token2;
1676 enum cpp_ttype a = token1->type, b = token2->type;
1677 cppchar_t c;
1679 if (token1->flags & NAMED_OP)
1680 a = CPP_NAME;
1681 if (token2->flags & NAMED_OP)
1682 b = CPP_NAME;
1684 c = EOF;
1685 if (token2->flags & DIGRAPH)
1686 c = digraph_spellings[(int) b - (int) CPP_FIRST_DIGRAPH][0];
1687 else if (token_spellings[b].category == SPELL_OPERATOR)
1688 c = token_spellings[b].name[0];
1690 /* Quickly get everything that can paste with an '='. */
1691 if ((int) a <= (int) CPP_LAST_EQ && c == '=')
1692 return 1;
1694 switch (a)
1696 case CPP_GREATER: return c == '>' || c == '?';
1697 case CPP_LESS: return c == '<' || c == '?' || c == '%' || c == ':';
1698 case CPP_PLUS: return c == '+';
1699 case CPP_MINUS: return c == '-' || c == '>';
1700 case CPP_DIV: return c == '/' || c == '*'; /* Comments. */
1701 case CPP_MOD: return c == ':' || c == '>';
1702 case CPP_AND: return c == '&';
1703 case CPP_OR: return c == '|';
1704 case CPP_COLON: return c == ':' || c == '>';
1705 case CPP_DEREF: return c == '*';
1706 case CPP_DOT: return c == '.' || c == '%' || b == CPP_NUMBER;
1707 case CPP_HASH: return c == '#' || c == '%'; /* Digraph form. */
1708 case CPP_NAME: return ((b == CPP_NUMBER
1709 && name_p (pfile, &token2->val.str))
1710 || b == CPP_NAME
1711 || b == CPP_CHAR || b == CPP_STRING); /* L */
1712 case CPP_NUMBER: return (b == CPP_NUMBER || b == CPP_NAME
1713 || c == '.' || c == '+' || c == '-');
1714 case CPP_OTHER: return (CPP_OPTION (pfile, objc)
1715 && token1->val.c == '@'
1716 && (b == CPP_NAME || b == CPP_STRING));
1717 default: break;
1720 return 0;
1723 /* Output all the remaining tokens on the current line, and a newline
1724 character, to FP. Leading whitespace is removed. */
1725 void
1726 cpp_output_line (pfile, fp)
1727 cpp_reader *pfile;
1728 FILE *fp;
1730 cpp_token token;
1732 cpp_get_token (pfile, &token);
1733 token.flags &= ~PREV_WHITE;
1734 while (token.type != CPP_EOF)
1736 cpp_output_token (&token, fp);
1737 cpp_get_token (pfile, &token);
1740 putc ('\n', fp);
/* Return the numeric value (0-15) of the hexadecimal digit C.
   Aborts if C is not a hexadecimal digit.  */
static unsigned int
hex_digit_value (c)
     unsigned int c;
{
  if (c >= '0' && c <= '9')
    return c - '0';

  if (c >= 'A' && c <= 'F')
    return 10 + (c - 'A');

  if (c >= 'a' && c <= 'f')
    return 10 + (c - 'a');

  /* Callers must only pass hexadecimal digits.  */
  abort ();
}
1757 /* Parse a '\uNNNN' or '\UNNNNNNNN' sequence. Returns 1 to indicate
1758 failure if cpplib is not parsing C++ or C99. Such failure is
1759 silent, and no variables are updated. Otherwise returns 0, and
1760 warns if -Wtraditional.
1762 [lex.charset]: The character designated by the universal character
1763 name \UNNNNNNNN is that character whose character short name in
1764 ISO/IEC 10646 is NNNNNNNN; the character designated by the
1765 universal character name \uNNNN is that character whose character
1766 short name in ISO/IEC 10646 is 0000NNNN. If the hexadecimal value
1767 for a universal character name is less than 0x20 or in the range
1768 0x7F-0x9F (inclusive), or if the universal character name
1769 designates a character in the basic source character set, then the
1770 program is ill-formed.
1772 We assume that wchar_t is Unicode, so we don't need to do any
1773 mapping. Is this ever wrong?
1775 PC points to the 'u' or 'U', PSTR is points to the byte after PC,
1776 LIMIT is the end of the string or charconst. PSTR is updated to
1777 point after the UCS on return, and the UCS is written into PC. */
1779 static int
1780 maybe_read_ucs (pfile, pstr, limit, pc)
1781 cpp_reader *pfile;
1782 const unsigned char **pstr;
1783 const unsigned char *limit;
1784 unsigned int *pc;
1786 const unsigned char *p = *pstr;
1787 unsigned int code = 0;
1788 unsigned int c = *pc, length;
1790 /* Only attempt to interpret a UCS for C++ and C99. */
1791 if (! (CPP_OPTION (pfile, cplusplus) || CPP_OPTION (pfile, c99)))
1792 return 1;
1794 if (CPP_WTRADITIONAL (pfile))
1795 cpp_warning (pfile, "the meaning of '\\%c' varies with -traditional", c);
1797 length = (c == 'u' ? 4: 8);
1799 if ((size_t) (limit - p) < length)
1801 cpp_error (pfile, "incomplete universal-character-name");
1802 /* Skip to the end to avoid more diagnostics. */
1803 p = limit;
1805 else
1807 for (; length; length--, p++)
1809 c = *p;
1810 if (ISXDIGIT (c))
1811 code = (code << 4) + hex_digit_value (c);
1812 else
1814 cpp_error (pfile,
1815 "non-hex digit '%c' in universal-character-name", c);
1816 /* We shouldn't skip in case there are multibyte chars. */
1817 break;
1822 #ifdef TARGET_EBCDIC
1823 cpp_error (pfile, "universal-character-name on EBCDIC target");
1824 code = 0x3f; /* EBCDIC invalid character */
1825 #else
1826 /* True extended characters are OK. */
1827 if (code >= 0xa0
1828 && !(code & 0x80000000)
1829 && !(code >= 0xD800 && code <= 0xDFFF))
1831 /* The standard permits $, @ and ` to be specified as UCNs. We use
1832 hex escapes so that this also works with EBCDIC hosts. */
1833 else if (code == 0x24 || code == 0x40 || code == 0x60)
1835 /* Don't give another error if one occurred above. */
1836 else if (length == 0)
1837 cpp_error (pfile, "universal-character-name out of range");
1838 #endif
1840 *pstr = p;
1841 *pc = code;
1842 return 0;
1845 /* Interpret an escape sequence, and return its value. PSTR points to
1846 the input pointer, which is just after the backslash. LIMIT is how
1847 much text we have. MASK is a bitmask for the precision for the
1848 destination type (char or wchar_t). TRADITIONAL, if true, does not
1849 interpret escapes that did not exist in traditional C.
1851 Handles all relevant diagnostics. */
1853 unsigned int
1854 cpp_parse_escape (pfile, pstr, limit, mask, traditional)
1855 cpp_reader *pfile;
1856 const unsigned char **pstr;
1857 const unsigned char *limit;
1858 unsigned HOST_WIDE_INT mask;
1859 int traditional;
1861 int unknown = 0;
1862 const unsigned char *str = *pstr;
1863 unsigned int c = *str++;
1865 switch (c)
1867 case '\\': case '\'': case '"': case '?': break;
1868 case 'b': c = TARGET_BS; break;
1869 case 'f': c = TARGET_FF; break;
1870 case 'n': c = TARGET_NEWLINE; break;
1871 case 'r': c = TARGET_CR; break;
1872 case 't': c = TARGET_TAB; break;
1873 case 'v': c = TARGET_VT; break;
1875 case '(': case '{': case '[': case '%':
1876 /* '\(', etc, are used at beginning of line to avoid confusing Emacs.
1877 '\%' is used to prevent SCCS from getting confused. */
1878 unknown = CPP_PEDANTIC (pfile);
1879 break;
1881 case 'a':
1882 if (CPP_WTRADITIONAL (pfile))
1883 cpp_warning (pfile, "the meaning of '\\a' varies with -traditional");
1884 if (!traditional)
1885 c = TARGET_BELL;
1886 break;
1888 case 'e': case 'E':
1889 if (CPP_PEDANTIC (pfile))
1890 cpp_pedwarn (pfile, "non-ISO-standard escape sequence, '\\%c'", c);
1891 c = TARGET_ESC;
1892 break;
1894 case 'u': case 'U':
1895 unknown = maybe_read_ucs (pfile, &str, limit, &c);
1896 break;
1898 case 'x':
1899 if (CPP_WTRADITIONAL (pfile))
1900 cpp_warning (pfile, "the meaning of '\\x' varies with -traditional");
1902 if (!traditional)
1904 unsigned int i = 0, overflow = 0;
1905 int digits_found = 0;
1907 while (str < limit)
1909 c = *str;
1910 if (! ISXDIGIT (c))
1911 break;
1912 str++;
1913 overflow |= i ^ (i << 4 >> 4);
1914 i = (i << 4) + hex_digit_value (c);
1915 digits_found = 1;
1918 if (!digits_found)
1919 cpp_error (pfile, "\\x used with no following hex digits");
1921 if (overflow | (i != (i & mask)))
1923 cpp_pedwarn (pfile, "hex escape sequence out of range");
1924 i &= mask;
1926 c = i;
1928 break;
1930 case '0': case '1': case '2': case '3':
1931 case '4': case '5': case '6': case '7':
1933 unsigned int i = c - '0';
1934 int count = 0;
1936 while (str < limit && ++count < 3)
1938 c = *str;
1939 if (c < '0' || c > '7')
1940 break;
1941 str++;
1942 i = (i << 3) + c - '0';
1945 if (i != (i & mask))
1947 cpp_pedwarn (pfile, "octal escape sequence out of range");
1948 i &= mask;
1950 c = i;
1952 break;
1954 default:
1955 unknown = 1;
1956 break;
1959 if (unknown)
1961 if (ISGRAPH (c))
1962 cpp_pedwarn (pfile, "unknown escape sequence '\\%c'", c);
1963 else
1964 cpp_pedwarn (pfile, "unknown escape sequence: '\\%03o'", c);
1967 if (c > mask)
1968 cpp_pedwarn (pfile, "escape sequence out of range for character");
1970 *pstr = str;
1971 return c;
1974 #ifndef MAX_CHAR_TYPE_SIZE
1975 #define MAX_CHAR_TYPE_SIZE CHAR_TYPE_SIZE
1976 #endif
1978 #ifndef MAX_WCHAR_TYPE_SIZE
1979 #define MAX_WCHAR_TYPE_SIZE WCHAR_TYPE_SIZE
1980 #endif
1982 /* Interpret a (possibly wide) character constant in TOKEN.
1983 WARN_MULTI warns about multi-character charconsts, if not
1984 TRADITIONAL. TRADITIONAL also indicates not to interpret escapes
1985 that did not exist in traditional C. PCHARS_SEEN points to a
1986 variable that is filled in with the number of characters seen. */
1987 HOST_WIDE_INT
1988 cpp_interpret_charconst (pfile, token, warn_multi, traditional, pchars_seen)
1989 cpp_reader *pfile;
1990 const cpp_token *token;
1991 int warn_multi;
1992 int traditional;
1993 unsigned int *pchars_seen;
1995 const unsigned char *str = token->val.str.text;
1996 const unsigned char *limit = str + token->val.str.len;
1997 unsigned int chars_seen = 0;
1998 unsigned int width, max_chars, c;
1999 unsigned HOST_WIDE_INT mask;
2000 HOST_WIDE_INT result = 0;
2002 #ifdef MULTIBYTE_CHARS
2003 (void) local_mbtowc (NULL, NULL, 0);
2004 #endif
2006 /* Width in bits. */
2007 if (token->type == CPP_CHAR)
2008 width = MAX_CHAR_TYPE_SIZE;
2009 else
2010 width = MAX_WCHAR_TYPE_SIZE;
2012 if (width < HOST_BITS_PER_WIDE_INT)
2013 mask = ((unsigned HOST_WIDE_INT) 1 << width) - 1;
2014 else
2015 mask = ~0;
2016 max_chars = HOST_BITS_PER_WIDE_INT / width;
2018 while (str < limit)
2020 #ifdef MULTIBYTE_CHARS
2021 wchar_t wc;
2022 int char_len;
2024 char_len = local_mbtowc (&wc, str, limit - str);
2025 if (char_len == -1)
2027 cpp_warning (pfile, "ignoring invalid multibyte character");
2028 c = *str++;
2030 else
2032 str += char_len;
2033 c = wc;
2035 #else
2036 c = *str++;
2037 #endif
2039 if (c == '\\')
2040 c = cpp_parse_escape (pfile, &str, limit, mask, traditional);
2042 #ifdef MAP_CHARACTER
2043 if (ISPRINT (c))
2044 c = MAP_CHARACTER (c);
2045 #endif
2047 /* Merge character into result; ignore excess chars. */
2048 if (++chars_seen <= max_chars)
2050 if (width < HOST_BITS_PER_WIDE_INT)
2051 result = (result << width) | (c & mask);
2052 else
2053 result = c;
2057 if (chars_seen == 0)
2058 cpp_error (pfile, "empty character constant");
2059 else if (chars_seen > max_chars)
2061 chars_seen = max_chars;
2062 cpp_warning (pfile, "character constant too long");
2064 else if (chars_seen > 1 && !traditional && warn_multi)
2065 cpp_warning (pfile, "multi-character character constant");
2067 /* If char type is signed, sign-extend the constant. The
2068 __CHAR_UNSIGNED__ macro is set by the driver if appropriate. */
2069 if (token->type == CPP_CHAR && chars_seen)
2071 unsigned int nbits = chars_seen * width;
2072 unsigned int mask = (unsigned int) ~0 >> (HOST_BITS_PER_INT - nbits);
2074 if (pfile->spec_nodes.n__CHAR_UNSIGNED__->type == NT_MACRO
2075 || ((result >> (nbits - 1)) & 1) == 0)
2076 result &= mask;
2077 else
2078 result |= ~mask;
2081 *pchars_seen = chars_seen;
2082 return result;
/* Memory pools.  */

/* Layout probe: the offset of U after a single char gives the
   alignment required by the union's members, and that offset is what
   DEFAULT_ALIGNMENT evaluates to.  */
struct dummy
{
  char c;
  union
  {
    int *p;
    double d;
  } u;
};

#define DEFAULT_ALIGNMENT (offsetof (struct dummy, u))
2099 static int
2100 chunk_suitable (pool, chunk, size)
2101 cpp_pool *pool;
2102 cpp_chunk *chunk;
2103 unsigned int size;
2105 /* Being at least twice SIZE means we can use memcpy in
2106 _cpp_next_chunk rather than memmove. Besides, it's a good idea
2107 anyway. */
2108 return (chunk && pool->locked != chunk
2109 && (unsigned int) (chunk->limit - chunk->base) >= size * 2);
2112 /* Returns the end of the new pool. PTR points to a char in the old
2113 pool, and is updated to point to the same char in the new pool. */
2114 unsigned char *
2115 _cpp_next_chunk (pool, len, ptr)
2116 cpp_pool *pool;
2117 unsigned int len;
2118 unsigned char **ptr;
2120 cpp_chunk *chunk = pool->cur->next;
2122 /* LEN is the minimum size we want in the new pool. */
2123 len += POOL_ROOM (pool);
2124 if (! chunk_suitable (pool, chunk, len))
2126 chunk = new_chunk (POOL_SIZE (pool) * 2 + len);
2128 chunk->next = pool->cur->next;
2129 pool->cur->next = chunk;
2132 /* Update the pointer before changing chunk's front. */
2133 if (ptr)
2134 *ptr += chunk->base - POOL_FRONT (pool);
2136 memcpy (chunk->base, POOL_FRONT (pool), POOL_ROOM (pool));
2137 chunk->front = chunk->base;
2139 pool->cur = chunk;
2140 return POOL_LIMIT (pool);
2143 static cpp_chunk *
2144 new_chunk (size)
2145 unsigned int size;
2147 unsigned char *base;
2148 cpp_chunk *result;
2150 size = POOL_ALIGN (size, DEFAULT_ALIGNMENT);
2151 base = (unsigned char *) xmalloc (size + sizeof (cpp_chunk));
2152 /* Put the chunk descriptor at the end. Then chunk overruns will
2153 cause obvious chaos. */
2154 result = (cpp_chunk *) (base + size);
2155 result->base = base;
2156 result->front = base;
2157 result->limit = base + size;
2158 result->next = 0;
2160 return result;
2163 void
2164 _cpp_init_pool (pool, size, align, temp)
2165 cpp_pool *pool;
2166 unsigned int size, align, temp;
2168 if (align == 0)
2169 align = DEFAULT_ALIGNMENT;
2170 if (align & (align - 1))
2171 abort ();
2172 pool->align = align;
2173 pool->first = new_chunk (size);
2174 pool->cur = pool->first;
2175 pool->locked = 0;
2176 pool->locks = 0;
2177 if (temp)
2178 pool->cur->next = pool->cur;
2181 void
2182 _cpp_lock_pool (pool)
2183 cpp_pool *pool;
2185 if (pool->locks++ == 0)
2186 pool->locked = pool->cur;
2189 void
2190 _cpp_unlock_pool (pool)
2191 cpp_pool *pool;
2193 if (--pool->locks == 0)
2194 pool->locked = 0;
2197 void
2198 _cpp_free_pool (pool)
2199 cpp_pool *pool;
2201 cpp_chunk *chunk = pool->first, *next;
2205 next = chunk->next;
2206 free (chunk->base);
2207 chunk = next;
2209 while (chunk && chunk != pool->first);
2212 /* Reserve LEN bytes from a memory pool. */
2213 unsigned char *
2214 _cpp_pool_reserve (pool, len)
2215 cpp_pool *pool;
2216 unsigned int len;
2218 len = POOL_ALIGN (len, pool->align);
2219 if (len > (unsigned int) POOL_ROOM (pool))
2220 _cpp_next_chunk (pool, len, 0);
2222 return POOL_FRONT (pool);
2225 /* Allocate LEN bytes from a memory pool. */
2226 unsigned char *
2227 _cpp_pool_alloc (pool, len)
2228 cpp_pool *pool;
2229 unsigned int len;
2231 unsigned char *result = _cpp_pool_reserve (pool, len);
2233 POOL_COMMIT (pool, len);
2234 return result;