* Makefile.in (rtlanal.o): Depend on $(TM_P_H).
[official-gcc.git] / gcc / cpplex.c
blobbeeb40c9b802c8c0a1ae44504b80039d49c736a1
1 /* CPP Library - lexical analysis.
2 Copyright (C) 2000 Free Software Foundation, Inc.
3 Contributed by Per Bothner, 1994-95.
4 Based on CCCP program by Paul Rubin, June 1986
5 Adapted to ANSI C, Richard Stallman, Jan 1987
6 Broken out to separate file, Zack Weinberg, Mar 2000
7 Single-pass line tokenization by Neil Booth, April 2000
9 This program is free software; you can redistribute it and/or modify it
10 under the terms of the GNU General Public License as published by the
11 Free Software Foundation; either version 2, or (at your option) any
12 later version.
14 This program is distributed in the hope that it will be useful,
15 but WITHOUT ANY WARRANTY; without even the implied warranty of
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 GNU General Public License for more details.
19 You should have received a copy of the GNU General Public License
20 along with this program; if not, write to the Free Software
21 Foundation, 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
23 /* This lexer works with a single pass of the file. Recently I
24 re-wrote it to minimize the places where we step backwards in the
25 input stream, to make future changes to support multi-byte
26 character sets fairly straight-forward.
28 There is now only one routine where we do step backwards:
29 skip_escaped_newlines. This routine could probably also be changed
30 so that it doesn't need to step back. One possibility is to use a
31 trick similar to that used in lex_dot and lex_percent. Two
32 extra characters might be needed, but skip_escaped_newlines itself
33 would probably be the only place that needs to be aware of that,
34 and changes to the remaining routines would probably only be needed
35 if they process a backslash. */
37 #include "config.h"
38 #include "system.h"
39 #include "cpplib.h"
40 #include "cpphash.h"
42 /* MULTIBYTE_CHARS support only works for native compilers.
43 ??? Ideally what we want is to model widechar support after
44 the current floating point support. */
45 #ifdef CROSS_COMPILE
46 #undef MULTIBYTE_CHARS
47 #endif
49 #ifdef MULTIBYTE_CHARS
50 #include "mbchar.h"
51 #include <locale.h>
52 #endif
54 /* Tokens with SPELL_STRING store their spelling in the token list,
55 and it's length in the token->val.name.len. */
56 enum spell_type
58 SPELL_OPERATOR = 0,
59 SPELL_CHAR,
60 SPELL_IDENT,
61 SPELL_NUMBER,
62 SPELL_STRING,
63 SPELL_NONE
66 struct token_spelling
68 enum spell_type category;
69 const unsigned char *name;
72 static const unsigned char *const digraph_spellings[] =
73 { U"%:", U"%:%:", U"<:", U":>", U"<%", U"%>" };
75 #define OP(e, s) { SPELL_OPERATOR, U s },
76 #define TK(e, s) { s, U STRINGX (e) },
77 static const struct token_spelling token_spellings[N_TTYPES] = { TTYPE_TABLE };
78 #undef OP
79 #undef TK
81 #define TOKEN_SPELL(token) (token_spellings[(token)->type].category)
82 #define TOKEN_NAME(token) (token_spellings[(token)->type].name)
84 static cppchar_t handle_newline PARAMS ((cpp_reader *, cppchar_t));
85 static cppchar_t skip_escaped_newlines PARAMS ((cpp_reader *, cppchar_t));
86 static cppchar_t get_effective_char PARAMS ((cpp_reader *));
88 static int skip_block_comment PARAMS ((cpp_reader *));
89 static int skip_line_comment PARAMS ((cpp_reader *));
90 static void adjust_column PARAMS ((cpp_reader *));
91 static void skip_whitespace PARAMS ((cpp_reader *, cppchar_t));
92 static cpp_hashnode *parse_identifier PARAMS ((cpp_reader *));
93 static cpp_hashnode *parse_identifier_slow PARAMS ((cpp_reader *,
94 const U_CHAR *));
95 static void parse_number PARAMS ((cpp_reader *, cpp_string *, cppchar_t, int));
96 static int unescaped_terminator_p PARAMS ((cpp_reader *, const U_CHAR *));
97 static void parse_string PARAMS ((cpp_reader *, cpp_token *, cppchar_t));
98 static void unterminated PARAMS ((cpp_reader *, int));
99 static int trigraph_ok PARAMS ((cpp_reader *, cppchar_t));
100 static void save_comment PARAMS ((cpp_reader *, cpp_token *, const U_CHAR *));
101 static void lex_percent PARAMS ((cpp_reader *, cpp_token *));
102 static void lex_dot PARAMS ((cpp_reader *, cpp_token *));
103 static int name_p PARAMS ((cpp_reader *, const cpp_string *));
104 static int maybe_read_ucs PARAMS ((cpp_reader *, const unsigned char **,
105 const unsigned char *, unsigned int *));
106 static tokenrun *next_tokenrun PARAMS ((tokenrun *));
108 static unsigned int hex_digit_value PARAMS ((unsigned int));
109 static _cpp_buff *new_buff PARAMS ((size_t));
111 /* Utility routine:
113 Compares, the token TOKEN to the NUL-terminated string STRING.
114 TOKEN must be a CPP_NAME. Returns 1 for equal, 0 for unequal. */
117 cpp_ideq (token, string)
118 const cpp_token *token;
119 const char *string;
121 if (token->type != CPP_NAME)
122 return 0;
124 return !ustrcmp (NODE_NAME (token->val.node), (const U_CHAR *) string);
127 /* Call when meeting a newline. Returns the character after the newline
128 (or carriage-return newline combination), or EOF. */
129 static cppchar_t
130 handle_newline (pfile, newline_char)
131 cpp_reader *pfile;
132 cppchar_t newline_char;
134 cpp_buffer *buffer;
135 cppchar_t next = EOF;
137 pfile->line++;
138 buffer = pfile->buffer;
139 buffer->col_adjust = 0;
140 buffer->line_base = buffer->cur;
142 /* Handle CR-LF and LF-CR combinations, get the next character. */
143 if (buffer->cur < buffer->rlimit)
145 next = *buffer->cur++;
146 if (next + newline_char == '\r' + '\n')
148 buffer->line_base = buffer->cur;
149 if (buffer->cur < buffer->rlimit)
150 next = *buffer->cur++;
151 else
152 next = EOF;
156 buffer->read_ahead = next;
157 return next;
160 /* Subroutine of skip_escaped_newlines; called when a trigraph is
161 encountered. It warns if necessary, and returns true if the
162 trigraph should be honoured. FROM_CHAR is the third character of a
163 trigraph, and presumed to be the previous character for position
164 reporting. */
165 static int
166 trigraph_ok (pfile, from_char)
167 cpp_reader *pfile;
168 cppchar_t from_char;
170 int accept = CPP_OPTION (pfile, trigraphs);
172 /* Don't warn about trigraphs in comments. */
173 if (CPP_OPTION (pfile, warn_trigraphs) && !pfile->state.lexing_comment)
175 cpp_buffer *buffer = pfile->buffer;
177 if (accept)
178 cpp_warning_with_line (pfile, pfile->line, CPP_BUF_COL (buffer) - 2,
179 "trigraph ??%c converted to %c",
180 (int) from_char,
181 (int) _cpp_trigraph_map[from_char]);
182 else if (buffer->cur != buffer->last_Wtrigraphs)
184 buffer->last_Wtrigraphs = buffer->cur;
185 cpp_warning_with_line (pfile, pfile->line,
186 CPP_BUF_COL (buffer) - 2,
187 "trigraph ??%c ignored", (int) from_char);
191 return accept;
/* Set the type of the token being lexed to T and discard the
   read-ahead character.  Assumes local variables buffer and result.  */
#define ACCEPT_CHAR(t) \
  do \
    { \
      result->type = (t); \
      buffer->read_ahead = EOF; \
    } \
  while (0)

/* Save and restore the read position.  Both assume local variables
   buffer and saved_cur.

   When we move to multibyte character sets, add to these something
   that saves and restores the state of the multibyte conversion
   library.  This probably involves saving and restoring a "cookie".
   In the case of glibc it is an 8-byte structure, so is not a high
   overhead operation.  In any case, it's out of the fast path.  */
#define SAVE_STATE() \
  do \
    { \
      saved_cur = buffer->cur; \
    } \
  while (0)
#define RESTORE_STATE() \
  do \
    { \
      buffer->cur = saved_cur; \
    } \
  while (0)
206 /* Skips any escaped newlines introduced by NEXT, which is either a
207 '?' or a '\\'. Returns the next character, which will also have
208 been placed in buffer->read_ahead. This routine performs
209 preprocessing stages 1 and 2 of the ISO C standard. */
210 static cppchar_t
211 skip_escaped_newlines (pfile, next)
212 cpp_reader *pfile;
213 cppchar_t next;
215 cpp_buffer *buffer = pfile->buffer;
217 /* Only do this if we apply stages 1 and 2. */
218 if (!buffer->from_stage3)
220 cppchar_t next1;
221 const unsigned char *saved_cur;
222 int space;
226 if (buffer->cur == buffer->rlimit)
227 break;
229 SAVE_STATE ();
230 if (next == '?')
232 next1 = *buffer->cur++;
233 if (next1 != '?' || buffer->cur == buffer->rlimit)
235 RESTORE_STATE ();
236 break;
239 next1 = *buffer->cur++;
240 if (!_cpp_trigraph_map[next1]
241 || !trigraph_ok (pfile, next1))
243 RESTORE_STATE ();
244 break;
247 /* We have a full trigraph here. */
248 next = _cpp_trigraph_map[next1];
249 if (next != '\\' || buffer->cur == buffer->rlimit)
250 break;
251 SAVE_STATE ();
254 /* We have a backslash, and room for at least one more character. */
255 space = 0;
258 next1 = *buffer->cur++;
259 if (!is_nvspace (next1))
260 break;
261 space = 1;
263 while (buffer->cur < buffer->rlimit);
265 if (!is_vspace (next1))
267 RESTORE_STATE ();
268 break;
271 if (space && !pfile->state.lexing_comment)
272 cpp_warning (pfile, "backslash and newline separated by space");
274 next = handle_newline (pfile, next1);
275 if (next == EOF)
276 cpp_pedwarn (pfile, "backslash-newline at end of file");
278 while (next == '\\' || next == '?');
281 buffer->read_ahead = next;
282 return next;
285 /* Obtain the next character, after trigraph conversion and skipping
286 an arbitrary string of escaped newlines. The common case of no
287 trigraphs or escaped newlines falls through quickly. */
288 static cppchar_t
289 get_effective_char (pfile)
290 cpp_reader *pfile;
292 cpp_buffer *buffer = pfile->buffer;
293 cppchar_t next = EOF;
295 if (buffer->cur < buffer->rlimit)
297 next = *buffer->cur++;
299 /* '?' can introduce trigraphs (and therefore backslash); '\\'
300 can introduce escaped newlines, which we want to skip, or
301 UCNs, which, depending upon lexer state, we will handle in
302 the future. */
303 if (next == '?' || next == '\\')
304 next = skip_escaped_newlines (pfile, next);
307 buffer->read_ahead = next;
308 return next;
311 /* Skip a C-style block comment. We find the end of the comment by
312 seeing if an asterisk is before every '/' we encounter. Returns
313 non-zero if comment terminated by EOF, zero otherwise. */
314 static int
315 skip_block_comment (pfile)
316 cpp_reader *pfile;
318 cpp_buffer *buffer = pfile->buffer;
319 cppchar_t c = EOF, prevc = EOF;
321 pfile->state.lexing_comment = 1;
322 while (buffer->cur != buffer->rlimit)
324 prevc = c, c = *buffer->cur++;
326 next_char:
327 /* FIXME: For speed, create a new character class of characters
328 of interest inside block comments. */
329 if (c == '?' || c == '\\')
330 c = skip_escaped_newlines (pfile, c);
332 /* People like decorating comments with '*', so check for '/'
333 instead for efficiency. */
334 if (c == '/')
336 if (prevc == '*')
337 break;
339 /* Warn about potential nested comments, but not if the '/'
340 comes immediately before the true comment delimeter.
341 Don't bother to get it right across escaped newlines. */
342 if (CPP_OPTION (pfile, warn_comments)
343 && buffer->cur != buffer->rlimit)
345 prevc = c, c = *buffer->cur++;
346 if (c == '*' && buffer->cur != buffer->rlimit)
348 prevc = c, c = *buffer->cur++;
349 if (c != '/')
350 cpp_warning_with_line (pfile, pfile->line,
351 CPP_BUF_COL (buffer) - 2,
352 "\"/*\" within comment");
354 goto next_char;
357 else if (is_vspace (c))
359 prevc = c, c = handle_newline (pfile, c);
360 goto next_char;
362 else if (c == '\t')
363 adjust_column (pfile);
366 pfile->state.lexing_comment = 0;
367 buffer->read_ahead = EOF;
368 return c != '/' || prevc != '*';
371 /* Skip a C++ line comment. Handles escaped newlines. Returns
372 non-zero if a multiline comment. The following new line, if any,
373 is left in buffer->read_ahead. */
374 static int
375 skip_line_comment (pfile)
376 cpp_reader *pfile;
378 cpp_buffer *buffer = pfile->buffer;
379 unsigned int orig_line = pfile->line;
380 cppchar_t c;
382 pfile->state.lexing_comment = 1;
385 c = EOF;
386 if (buffer->cur == buffer->rlimit)
387 break;
389 c = *buffer->cur++;
390 if (c == '?' || c == '\\')
391 c = skip_escaped_newlines (pfile, c);
393 while (!is_vspace (c));
395 pfile->state.lexing_comment = 0;
396 buffer->read_ahead = c; /* Leave any newline for caller. */
397 return orig_line != pfile->line;
400 /* pfile->buffer->cur is one beyond the \t character. Update
401 col_adjust so we track the column correctly. */
402 static void
403 adjust_column (pfile)
404 cpp_reader *pfile;
406 cpp_buffer *buffer = pfile->buffer;
407 unsigned int col = CPP_BUF_COL (buffer) - 1; /* Zero-based column. */
409 /* Round it up to multiple of the tabstop, but subtract 1 since the
410 tab itself occupies a character position. */
411 buffer->col_adjust += (CPP_OPTION (pfile, tabstop)
412 - col % CPP_OPTION (pfile, tabstop)) - 1;
415 /* Skips whitespace, saving the next non-whitespace character.
416 Adjusts pfile->col_adjust to account for tabs. Without this,
417 tokens might be assigned an incorrect column. */
418 static void
419 skip_whitespace (pfile, c)
420 cpp_reader *pfile;
421 cppchar_t c;
423 cpp_buffer *buffer = pfile->buffer;
424 unsigned int warned = 0;
428 /* Horizontal space always OK. */
429 if (c == ' ')
431 else if (c == '\t')
432 adjust_column (pfile);
433 /* Just \f \v or \0 left. */
434 else if (c == '\0')
436 if (!warned)
438 cpp_warning (pfile, "null character(s) ignored");
439 warned = 1;
442 else if (pfile->state.in_directive && CPP_PEDANTIC (pfile))
443 cpp_pedwarn_with_line (pfile, pfile->line,
444 CPP_BUF_COL (buffer),
445 "%s in preprocessing directive",
446 c == '\f' ? "form feed" : "vertical tab");
448 c = EOF;
449 if (buffer->cur == buffer->rlimit)
450 break;
451 c = *buffer->cur++;
453 /* We only want non-vertical space, i.e. ' ' \t \f \v \0. */
454 while (is_nvspace (c));
456 /* Remember the next character. */
457 buffer->read_ahead = c;
460 /* See if the characters of a number token are valid in a name (no
461 '.', '+' or '-'). */
462 static int
463 name_p (pfile, string)
464 cpp_reader *pfile;
465 const cpp_string *string;
467 unsigned int i;
469 for (i = 0; i < string->len; i++)
470 if (!is_idchar (string->text[i]))
471 return 0;
473 return 1;
476 /* Parse an identifier, skipping embedded backslash-newlines. This is
477 a critical inner loop. The common case is an identifier which has
478 not been split by backslash-newline, does not contain a dollar
479 sign, and has already been scanned (roughly 10:1 ratio of
480 seen:unseen identifiers in normal code; the distribution is
481 Poisson-like). Second most common case is a new identifier, not
482 split and no dollar sign. The other possibilities are rare and
483 have been relegated to parse_identifier_slow. */
485 static cpp_hashnode *
486 parse_identifier (pfile)
487 cpp_reader *pfile;
489 cpp_hashnode *result;
490 const U_CHAR *cur, *rlimit;
492 /* Fast-path loop. Skim over a normal identifier.
493 N.B. ISIDNUM does not include $. */
494 cur = pfile->buffer->cur - 1;
495 rlimit = pfile->buffer->rlimit;
497 cur++;
498 while (cur < rlimit && ISIDNUM (*cur));
500 /* Check for slow-path cases. */
501 if (cur < rlimit && (*cur == '?' || *cur == '\\' || *cur == '$'))
502 result = parse_identifier_slow (pfile, cur);
503 else
505 const U_CHAR *base = pfile->buffer->cur - 1;
506 result = (cpp_hashnode *)
507 ht_lookup (pfile->hash_table, base, cur - base, HT_ALLOC);
508 pfile->buffer->cur = cur;
511 /* Rarely, identifiers require diagnostics when lexed.
512 XXX Has to be forced out of the fast path. */
513 if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
514 && !pfile->state.skipping, 0))
516 /* It is allowed to poison the same identifier twice. */
517 if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
518 cpp_error (pfile, "attempt to use poisoned \"%s\"",
519 NODE_NAME (result));
521 /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
522 replacement list of a variadic macro. */
523 if (result == pfile->spec_nodes.n__VA_ARGS__
524 && !pfile->state.va_args_ok)
525 cpp_pedwarn (pfile,
526 "__VA_ARGS__ can only appear in the expansion of a C99 variadic macro");
529 return result;
532 /* Slow path. This handles identifiers which have been split, and
533 identifiers which contain dollar signs. The part of the identifier
534 from PFILE->buffer->cur-1 to CUR has already been scanned. */
535 static cpp_hashnode *
536 parse_identifier_slow (pfile, cur)
537 cpp_reader *pfile;
538 const U_CHAR *cur;
540 cpp_buffer *buffer = pfile->buffer;
541 const U_CHAR *base = buffer->cur - 1;
542 struct obstack *stack = &pfile->hash_table->stack;
543 unsigned int c, saw_dollar = 0, len;
545 /* Copy the part of the token which is known to be okay. */
546 obstack_grow (stack, base, cur - base);
548 /* Now process the part which isn't. We are looking at one of
549 '$', '\\', or '?' on entry to this loop. */
550 c = *cur++;
551 buffer->cur = cur;
554 while (is_idchar (c))
556 obstack_1grow (stack, c);
558 if (c == '$')
559 saw_dollar++;
561 c = EOF;
562 if (buffer->cur == buffer->rlimit)
563 break;
565 c = *buffer->cur++;
568 /* Potential escaped newline? */
569 if (c != '?' && c != '\\')
570 break;
571 c = skip_escaped_newlines (pfile, c);
573 while (is_idchar (c));
575 /* Remember the next character. */
576 buffer->read_ahead = c;
578 /* $ is not an identifier character in the standard, but is commonly
579 accepted as an extension. Don't warn about it in skipped
580 conditional blocks. */
581 if (saw_dollar && CPP_PEDANTIC (pfile) && ! pfile->state.skipping)
582 cpp_pedwarn (pfile, "'$' character(s) in identifier");
584 /* Identifiers are null-terminated. */
585 len = obstack_object_size (stack);
586 obstack_1grow (stack, '\0');
588 return (cpp_hashnode *)
589 ht_lookup (pfile->hash_table, obstack_finish (stack), len, HT_ALLOCED);
592 /* Parse a number, skipping embedded backslash-newlines. */
593 static void
594 parse_number (pfile, number, c, leading_period)
595 cpp_reader *pfile;
596 cpp_string *number;
597 cppchar_t c;
598 int leading_period;
600 cpp_buffer *buffer = pfile->buffer;
601 unsigned char *dest, *limit;
603 dest = BUFF_FRONT (pfile->u_buff);
604 limit = BUFF_LIMIT (pfile->u_buff);
606 /* Place a leading period. */
607 if (leading_period)
609 if (dest == limit)
611 _cpp_extend_buff (pfile, &pfile->u_buff, 1);
612 dest = BUFF_FRONT (pfile->u_buff);
613 limit = BUFF_LIMIT (pfile->u_buff);
615 *dest++ = '.';
622 /* Need room for terminating null. */
623 if ((size_t) (limit - dest) < 2)
625 size_t len_so_far = dest - BUFF_FRONT (pfile->u_buff);
626 _cpp_extend_buff (pfile, &pfile->u_buff, 2);
627 dest = BUFF_FRONT (pfile->u_buff) + len_so_far;
628 limit = BUFF_LIMIT (pfile->u_buff);
630 *dest++ = c;
632 c = EOF;
633 if (buffer->cur == buffer->rlimit)
634 break;
636 c = *buffer->cur++;
638 while (is_numchar (c) || c == '.' || VALID_SIGN (c, dest[-1]));
640 /* Potential escaped newline? */
641 if (c != '?' && c != '\\')
642 break;
643 c = skip_escaped_newlines (pfile, c);
645 while (is_numchar (c) || c == '.' || VALID_SIGN (c, dest[-1]));
647 /* Remember the next character. */
648 buffer->read_ahead = c;
650 /* Null-terminate the number. */
651 *dest = '\0';
653 number->text = BUFF_FRONT (pfile->u_buff);
654 number->len = dest - number->text;
655 BUFF_FRONT (pfile->u_buff) = dest + 1;
658 /* Subroutine of parse_string. Emits error for unterminated strings. */
659 static void
660 unterminated (pfile, term)
661 cpp_reader *pfile;
662 int term;
664 cpp_error (pfile, "missing terminating %c character", term);
666 if (term == '\"' && pfile->mls_line && pfile->mls_line != pfile->line)
668 cpp_error_with_line (pfile, pfile->mls_line, pfile->mls_col,
669 "possible start of unterminated string literal");
670 pfile->mls_line = 0;
674 /* Subroutine of parse_string. */
675 static int
676 unescaped_terminator_p (pfile, dest)
677 cpp_reader *pfile;
678 const unsigned char *dest;
680 const unsigned char *start, *temp;
682 /* In #include-style directives, terminators are not escapeable. */
683 if (pfile->state.angled_headers)
684 return 1;
686 start = BUFF_FRONT (pfile->u_buff);
688 /* An odd number of consecutive backslashes represents an escaped
689 terminator. */
690 for (temp = dest; temp > start && temp[-1] == '\\'; temp--)
693 return ((dest - temp) & 1) == 0;
696 /* Parses a string, character constant, or angle-bracketed header file
697 name. Handles embedded trigraphs and escaped newlines. The stored
698 string is guaranteed NUL-terminated, but it is not guaranteed that
699 this is the first NUL since embedded NULs are preserved.
701 Multi-line strings are allowed, but they are deprecated. */
702 static void
703 parse_string (pfile, token, terminator)
704 cpp_reader *pfile;
705 cpp_token *token;
706 cppchar_t terminator;
708 cpp_buffer *buffer = pfile->buffer;
709 unsigned char *dest, *limit;
710 cppchar_t c;
711 bool warned_nulls = false, warned_multi = false;
713 dest = BUFF_FRONT (pfile->u_buff);
714 limit = BUFF_LIMIT (pfile->u_buff);
716 for (;;)
718 if (buffer->cur == buffer->rlimit)
719 c = EOF;
720 else
721 c = *buffer->cur++;
723 have_char:
724 /* We need space for the terminating NUL. */
725 if ((size_t) (limit - dest) < 1)
727 size_t len_so_far = dest - BUFF_FRONT (pfile->u_buff);
728 _cpp_extend_buff (pfile, &pfile->u_buff, 2);
729 dest = BUFF_FRONT (pfile->u_buff) + len_so_far;
730 limit = BUFF_LIMIT (pfile->u_buff);
733 if (c == EOF)
735 unterminated (pfile, terminator);
736 break;
739 /* Handle trigraphs, escaped newlines etc. */
740 if (c == '?' || c == '\\')
741 c = skip_escaped_newlines (pfile, c);
743 if (c == terminator && unescaped_terminator_p (pfile, dest))
745 c = EOF;
746 break;
748 else if (is_vspace (c))
750 /* In assembly language, silently terminate string and
751 character literals at end of line. This is a kludge
752 around not knowing where comments are. */
753 if (CPP_OPTION (pfile, lang) == CLK_ASM && terminator != '>')
754 break;
756 /* Character constants and header names may not extend over
757 multiple lines. In Standard C, neither may strings.
758 Unfortunately, we accept multiline strings as an
759 extension, except in #include family directives. */
760 if (terminator != '"' || pfile->state.angled_headers)
762 unterminated (pfile, terminator);
763 break;
766 if (!warned_multi)
768 warned_multi = true;
769 cpp_pedwarn (pfile, "multi-line string literals are deprecated");
772 if (pfile->mls_line == 0)
774 pfile->mls_line = token->line;
775 pfile->mls_col = token->col;
778 c = handle_newline (pfile, c);
779 *dest++ = '\n';
780 goto have_char;
782 else if (c == '\0' && !warned_nulls)
784 warned_nulls = true;
785 cpp_warning (pfile, "null character(s) preserved in literal");
788 *dest++ = c;
791 /* Remember the next character. */
792 buffer->read_ahead = c;
793 *dest = '\0';
795 token->val.str.text = BUFF_FRONT (pfile->u_buff);
796 token->val.str.len = dest - BUFF_FRONT (pfile->u_buff);
797 BUFF_FRONT (pfile->u_buff) = dest + 1;
800 /* The stored comment includes the comment start and any terminator. */
801 static void
802 save_comment (pfile, token, from)
803 cpp_reader *pfile;
804 cpp_token *token;
805 const unsigned char *from;
807 unsigned char *buffer;
808 unsigned int len;
810 len = pfile->buffer->cur - from + 1; /* + 1 for the initial '/'. */
811 /* C++ comments probably (not definitely) have moved past a new
812 line, which we don't want to save in the comment. */
813 if (pfile->buffer->read_ahead != EOF)
814 len--;
815 buffer = _cpp_unaligned_alloc (pfile, len);
817 token->type = CPP_COMMENT;
818 token->val.str.len = len;
819 token->val.str.text = buffer;
821 buffer[0] = '/';
822 memcpy (buffer + 1, from, len - 1);
825 /* Subroutine of _cpp_lex_direct to handle '%'. A little tricky, since we
826 want to avoid stepping back when lexing %:%X. */
827 static void
828 lex_percent (pfile, result)
829 cpp_reader *pfile;
830 cpp_token *result;
832 cpp_buffer *buffer= pfile->buffer;
833 cppchar_t c;
835 result->type = CPP_MOD;
836 /* Parsing %:%X could leave an extra character. */
837 if (buffer->extra_char == EOF)
838 c = get_effective_char (pfile);
839 else
841 c = buffer->read_ahead = buffer->extra_char;
842 buffer->extra_char = EOF;
845 if (c == '=')
846 ACCEPT_CHAR (CPP_MOD_EQ);
847 else if (CPP_OPTION (pfile, digraphs))
849 if (c == ':')
851 result->flags |= DIGRAPH;
852 ACCEPT_CHAR (CPP_HASH);
853 if (get_effective_char (pfile) == '%')
855 buffer->extra_char = get_effective_char (pfile);
856 if (buffer->extra_char == ':')
858 buffer->extra_char = EOF;
859 ACCEPT_CHAR (CPP_PASTE);
861 else
862 /* We'll catch the extra_char when we're called back. */
863 buffer->read_ahead = '%';
866 else if (c == '>')
868 result->flags |= DIGRAPH;
869 ACCEPT_CHAR (CPP_CLOSE_BRACE);
874 /* Subroutine of _cpp_lex_direct to handle '.'. This is tricky, since we
875 want to avoid stepping back when lexing '...' or '.123'. In the
876 latter case we should also set a flag for parse_number. */
877 static void
878 lex_dot (pfile, result)
879 cpp_reader *pfile;
880 cpp_token *result;
882 cpp_buffer *buffer = pfile->buffer;
883 cppchar_t c;
885 /* Parsing ..X could leave an extra character. */
886 if (buffer->extra_char == EOF)
887 c = get_effective_char (pfile);
888 else
890 c = buffer->read_ahead = buffer->extra_char;
891 buffer->extra_char = EOF;
894 /* All known character sets have 0...9 contiguous. */
895 if (c >= '0' && c <= '9')
897 result->type = CPP_NUMBER;
898 parse_number (pfile, &result->val.str, c, 1);
900 else
902 result->type = CPP_DOT;
903 if (c == '.')
905 buffer->extra_char = get_effective_char (pfile);
906 if (buffer->extra_char == '.')
908 buffer->extra_char = EOF;
909 ACCEPT_CHAR (CPP_ELLIPSIS);
911 else
912 /* We'll catch the extra_char when we're called back. */
913 buffer->read_ahead = '.';
915 else if (c == '*' && CPP_OPTION (pfile, cplusplus))
916 ACCEPT_CHAR (CPP_DOT_STAR);
920 /* Allocate COUNT tokens for RUN. */
921 void
922 _cpp_init_tokenrun (run, count)
923 tokenrun *run;
924 unsigned int count;
926 run->base = xnewvec (cpp_token, count);
927 run->limit = run->base + count;
928 run->next = NULL;
931 /* Returns the next tokenrun, or creates one if there is none. */
932 static tokenrun *
933 next_tokenrun (run)
934 tokenrun *run;
936 if (run->next == NULL)
938 run->next = xnew (tokenrun);
939 run->next->prev = run;
940 _cpp_init_tokenrun (run->next, 250);
943 return run->next;
946 /* Allocate a single token that is invalidated at the same time as the
947 rest of the tokens on the line. Has its line and col set to the
948 same as the last lexed token, so that diagnostics appear in the
949 right place. */
950 cpp_token *
951 _cpp_temp_token (pfile)
952 cpp_reader *pfile;
954 cpp_token *old, *result;
956 old = pfile->cur_token - 1;
957 if (pfile->cur_token == pfile->cur_run->limit)
959 pfile->cur_run = next_tokenrun (pfile->cur_run);
960 pfile->cur_token = pfile->cur_run->base;
963 result = pfile->cur_token++;
964 result->line = old->line;
965 result->col = old->col;
966 return result;
969 /* Lex a token into RESULT (external interface). Takes care of issues
970 like directive handling, token lookahead, multiple include
971 opimisation and skipping. */
972 const cpp_token *
973 _cpp_lex_token (pfile)
974 cpp_reader *pfile;
976 cpp_token *result;
978 for (;;)
980 if (pfile->cur_token == pfile->cur_run->limit)
982 pfile->cur_run = next_tokenrun (pfile->cur_run);
983 pfile->cur_token = pfile->cur_run->base;
986 if (pfile->lookaheads)
988 pfile->lookaheads--;
989 result = pfile->cur_token++;
991 else
992 result = _cpp_lex_direct (pfile);
994 if (result->flags & BOL)
996 /* Is this a directive. If _cpp_handle_directive returns
997 false, it is an assembler #. */
998 if (result->type == CPP_HASH
999 && !pfile->state.parsing_args
1000 && _cpp_handle_directive (pfile, result->flags & PREV_WHITE))
1001 continue;
1002 if (pfile->cb.line_change && !pfile->state.skipping)
1003 (*pfile->cb.line_change)(pfile, result, pfile->state.parsing_args);
1006 /* We don't skip tokens in directives. */
1007 if (pfile->state.in_directive)
1008 break;
1010 /* Outside a directive, invalidate controlling macros. At file
1011 EOF, _cpp_lex_direct takes care of popping the buffer, so we never
1012 get here and MI optimisation works. */
1013 pfile->mi_valid = false;
1015 if (!pfile->state.skipping || result->type == CPP_EOF)
1016 break;
1019 return result;
1022 /* Lex a token into pfile->cur_token, which is also incremented, to
1023 get diagnostics pointing to the correct location.
1025 Does not handle issues such as token lookahead, multiple-include
1026 optimisation, directives, skipping etc. This function is only
1027 suitable for use by _cpp_lex_token, and in special cases like
1028 lex_expansion_token which doesn't care for any of these issues.
1030 When meeting a newline, returns CPP_EOF if parsing a directive,
1031 otherwise returns to the start of the token buffer if permissible.
1032 Returns the location of the lexed token. */
1033 cpp_token *
1034 _cpp_lex_direct (pfile)
1035 cpp_reader *pfile;
1037 cppchar_t c;
1038 cpp_buffer *buffer;
1039 const unsigned char *comment_start;
1040 cpp_token *result = pfile->cur_token++;
1042 fresh_line:
1043 buffer = pfile->buffer;
1044 result->flags = buffer->saved_flags;
1045 buffer->saved_flags = 0;
1046 update_tokens_line:
1047 result->line = pfile->line;
1049 skipped_white:
1050 c = buffer->read_ahead;
1051 if (c == EOF && buffer->cur < buffer->rlimit)
1052 c = *buffer->cur++;
1053 result->col = CPP_BUF_COLUMN (buffer, buffer->cur);
1054 buffer->read_ahead = EOF;
1056 trigraph:
1057 switch (c)
1059 case EOF:
1060 buffer->saved_flags = BOL;
1061 if (!pfile->state.parsing_args && !pfile->state.in_directive)
1063 if (buffer->cur != buffer->line_base)
1065 /* Non-empty files should end in a newline. Don't warn
1066 for command line and _Pragma buffers. */
1067 if (!buffer->from_stage3)
1068 cpp_pedwarn (pfile, "no newline at end of file");
1069 handle_newline (pfile, '\n');
1072 /* Don't pop the last buffer. */
1073 if (buffer->prev)
1075 unsigned char stop = buffer->return_at_eof;
1077 _cpp_pop_buffer (pfile);
1078 if (!stop)
1079 goto fresh_line;
1082 result->type = CPP_EOF;
1083 break;
1085 case ' ': case '\t': case '\f': case '\v': case '\0':
1086 skip_whitespace (pfile, c);
1087 result->flags |= PREV_WHITE;
1088 goto skipped_white;
1090 case '\n': case '\r':
1091 handle_newline (pfile, c);
1092 buffer->saved_flags = BOL;
1093 if (! pfile->state.in_directive)
1095 if (pfile->state.parsing_args == 2)
1096 buffer->saved_flags |= PREV_WHITE;
1097 if (!pfile->keep_tokens)
1099 pfile->cur_run = &pfile->base_run;
1100 result = pfile->base_run.base;
1101 pfile->cur_token = result + 1;
1103 goto fresh_line;
1105 result->type = CPP_EOF;
1106 break;
1108 case '?':
1109 case '\\':
1110 /* These could start an escaped newline, or '?' a trigraph. Let
1111 skip_escaped_newlines do all the work. */
1113 unsigned int line = pfile->line;
1115 c = skip_escaped_newlines (pfile, c);
1116 if (line != pfile->line)
1117 /* We had at least one escaped newline of some sort, and the
1118 next character is in buffer->read_ahead. Update the
1119 token's line and column. */
1120 goto update_tokens_line;
1122 /* We are either the original '?' or '\\', or a trigraph. */
1123 result->type = CPP_QUERY;
1124 buffer->read_ahead = EOF;
1125 if (c == '\\')
1126 goto random_char;
1127 else if (c != '?')
1128 goto trigraph;
1130 break;
1132 case '0': case '1': case '2': case '3': case '4':
1133 case '5': case '6': case '7': case '8': case '9':
1134 result->type = CPP_NUMBER;
1135 parse_number (pfile, &result->val.str, c, 0);
1136 break;
1138 case '$':
1139 if (!CPP_OPTION (pfile, dollars_in_ident))
1140 goto random_char;
1141 /* Fall through... */
1143 case '_':
1144 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
1145 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
1146 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
1147 case 's': case 't': case 'u': case 'v': case 'w': case 'x':
1148 case 'y': case 'z':
1149 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
1150 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
1151 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
1152 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
1153 case 'Y': case 'Z':
1154 result->type = CPP_NAME;
1155 result->val.node = parse_identifier (pfile);
1157 /* 'L' may introduce wide characters or strings. */
1158 if (result->val.node == pfile->spec_nodes.n_L)
1160 c = buffer->read_ahead;
1161 if (c == EOF && buffer->cur < buffer->rlimit)
1162 c = *buffer->cur;
1163 if (c == '\'' || c == '"')
1165 buffer->cur++;
1166 ACCEPT_CHAR (c == '"' ? CPP_WSTRING: CPP_WCHAR);
1167 goto make_string;
1170 /* Convert named operators to their proper types. */
1171 else if (result->val.node->flags & NODE_OPERATOR)
1173 result->flags |= NAMED_OP;
1174 result->type = result->val.node->value.operator;
1176 break;
1178 case '\'':
1179 case '"':
1180 result->type = c == '"' ? CPP_STRING: CPP_CHAR;
1181 make_string:
1182 parse_string (pfile, result, c);
1183 break;
1185 case '/':
1186 /* A potential block or line comment. */
1187 comment_start = buffer->cur;
1188 result->type = CPP_DIV;
1189 c = get_effective_char (pfile);
1190 if (c == '=')
1191 ACCEPT_CHAR (CPP_DIV_EQ);
1192 if (c != '/' && c != '*')
1193 break;
1195 if (c == '*')
1197 if (skip_block_comment (pfile))
1198 cpp_error (pfile, "unterminated comment");
1200 else
1202 if (!CPP_OPTION (pfile, cplusplus_comments)
1203 && !CPP_IN_SYSTEM_HEADER (pfile))
1204 break;
1206 /* Warn about comments only if pedantically GNUC89, and not
1207 in system headers. */
1208 if (CPP_OPTION (pfile, lang) == CLK_GNUC89 && CPP_PEDANTIC (pfile)
1209 && ! buffer->warned_cplusplus_comments)
1211 cpp_pedwarn (pfile,
1212 "C++ style comments are not allowed in ISO C89");
1213 cpp_pedwarn (pfile,
1214 "(this will be reported only once per input file)");
1215 buffer->warned_cplusplus_comments = 1;
1218 /* Skip_line_comment updates buffer->read_ahead. */
1219 if (skip_line_comment (pfile) && CPP_OPTION (pfile, warn_comments))
1220 cpp_warning (pfile, "multi-line comment");
1223 /* Skipping the comment has updated buffer->read_ahead. */
1224 if (!pfile->state.save_comments)
1226 result->flags |= PREV_WHITE;
1227 goto update_tokens_line;
1230 /* Save the comment as a token in its own right. */
1231 save_comment (pfile, result, comment_start);
1232 break;
1234 case '<':
1235 if (pfile->state.angled_headers)
1237 result->type = CPP_HEADER_NAME;
1238 c = '>'; /* terminator. */
1239 goto make_string;
1242 result->type = CPP_LESS;
1243 c = get_effective_char (pfile);
1244 if (c == '=')
1245 ACCEPT_CHAR (CPP_LESS_EQ);
1246 else if (c == '<')
1248 ACCEPT_CHAR (CPP_LSHIFT);
1249 if (get_effective_char (pfile) == '=')
1250 ACCEPT_CHAR (CPP_LSHIFT_EQ);
1252 else if (c == '?' && CPP_OPTION (pfile, cplusplus))
1254 ACCEPT_CHAR (CPP_MIN);
1255 if (get_effective_char (pfile) == '=')
1256 ACCEPT_CHAR (CPP_MIN_EQ);
1258 else if (c == ':' && CPP_OPTION (pfile, digraphs))
1260 ACCEPT_CHAR (CPP_OPEN_SQUARE);
1261 result->flags |= DIGRAPH;
1263 else if (c == '%' && CPP_OPTION (pfile, digraphs))
1265 ACCEPT_CHAR (CPP_OPEN_BRACE);
1266 result->flags |= DIGRAPH;
1268 break;
1270 case '>':
1271 result->type = CPP_GREATER;
1272 c = get_effective_char (pfile);
1273 if (c == '=')
1274 ACCEPT_CHAR (CPP_GREATER_EQ);
1275 else if (c == '>')
1277 ACCEPT_CHAR (CPP_RSHIFT);
1278 if (get_effective_char (pfile) == '=')
1279 ACCEPT_CHAR (CPP_RSHIFT_EQ);
1281 else if (c == '?' && CPP_OPTION (pfile, cplusplus))
1283 ACCEPT_CHAR (CPP_MAX);
1284 if (get_effective_char (pfile) == '=')
1285 ACCEPT_CHAR (CPP_MAX_EQ);
1287 break;
1289 case '%':
1290 lex_percent (pfile, result);
1291 break;
1293 case '.':
1294 lex_dot (pfile, result);
1295 break;
1297 case '+':
1298 result->type = CPP_PLUS;
1299 c = get_effective_char (pfile);
1300 if (c == '=')
1301 ACCEPT_CHAR (CPP_PLUS_EQ);
1302 else if (c == '+')
1303 ACCEPT_CHAR (CPP_PLUS_PLUS);
1304 break;
1306 case '-':
1307 result->type = CPP_MINUS;
1308 c = get_effective_char (pfile);
1309 if (c == '>')
1311 ACCEPT_CHAR (CPP_DEREF);
1312 if (CPP_OPTION (pfile, cplusplus)
1313 && get_effective_char (pfile) == '*')
1314 ACCEPT_CHAR (CPP_DEREF_STAR);
1316 else if (c == '=')
1317 ACCEPT_CHAR (CPP_MINUS_EQ);
1318 else if (c == '-')
1319 ACCEPT_CHAR (CPP_MINUS_MINUS);
1320 break;
1322 case '*':
1323 result->type = CPP_MULT;
1324 if (get_effective_char (pfile) == '=')
1325 ACCEPT_CHAR (CPP_MULT_EQ);
1326 break;
1328 case '=':
1329 result->type = CPP_EQ;
1330 if (get_effective_char (pfile) == '=')
1331 ACCEPT_CHAR (CPP_EQ_EQ);
1332 break;
1334 case '!':
1335 result->type = CPP_NOT;
1336 if (get_effective_char (pfile) == '=')
1337 ACCEPT_CHAR (CPP_NOT_EQ);
1338 break;
1340 case '&':
1341 result->type = CPP_AND;
1342 c = get_effective_char (pfile);
1343 if (c == '=')
1344 ACCEPT_CHAR (CPP_AND_EQ);
1345 else if (c == '&')
1346 ACCEPT_CHAR (CPP_AND_AND);
1347 break;
1349 case '#':
1350 result->type = CPP_HASH;
1351 if (get_effective_char (pfile) == '#')
1352 ACCEPT_CHAR (CPP_PASTE);
1353 break;
1355 case '|':
1356 result->type = CPP_OR;
1357 c = get_effective_char (pfile);
1358 if (c == '=')
1359 ACCEPT_CHAR (CPP_OR_EQ);
1360 else if (c == '|')
1361 ACCEPT_CHAR (CPP_OR_OR);
1362 break;
1364 case '^':
1365 result->type = CPP_XOR;
1366 if (get_effective_char (pfile) == '=')
1367 ACCEPT_CHAR (CPP_XOR_EQ);
1368 break;
1370 case ':':
1371 result->type = CPP_COLON;
1372 c = get_effective_char (pfile);
1373 if (c == ':' && CPP_OPTION (pfile, cplusplus))
1374 ACCEPT_CHAR (CPP_SCOPE);
1375 else if (c == '>' && CPP_OPTION (pfile, digraphs))
1377 result->flags |= DIGRAPH;
1378 ACCEPT_CHAR (CPP_CLOSE_SQUARE);
1380 break;
1382 case '~': result->type = CPP_COMPL; break;
1383 case ',': result->type = CPP_COMMA; break;
1384 case '(': result->type = CPP_OPEN_PAREN; break;
1385 case ')': result->type = CPP_CLOSE_PAREN; break;
1386 case '[': result->type = CPP_OPEN_SQUARE; break;
1387 case ']': result->type = CPP_CLOSE_SQUARE; break;
1388 case '{': result->type = CPP_OPEN_BRACE; break;
1389 case '}': result->type = CPP_CLOSE_BRACE; break;
1390 case ';': result->type = CPP_SEMICOLON; break;
1392 /* @ is a punctuator in Objective C. */
1393 case '@': result->type = CPP_ATSIGN; break;
1395 random_char:
1396 default:
1397 result->type = CPP_OTHER;
1398 result->val.c = c;
1399 break;
1402 return result;
1405 /* An upper bound on the number of bytes needed to spell a token,
1406 including preceding whitespace. */
1407 unsigned int
1408 cpp_token_len (token)
1409 const cpp_token *token;
1411 unsigned int len;
1413 switch (TOKEN_SPELL (token))
1415 default: len = 0; break;
1416 case SPELL_NUMBER:
1417 case SPELL_STRING: len = token->val.str.len; break;
1418 case SPELL_IDENT: len = NODE_LEN (token->val.node); break;
1420 /* 1 for whitespace, 4 for comment delimiters. */
1421 return len + 5;
1424 /* Write the spelling of a token TOKEN to BUFFER. The buffer must
1425 already contain the enough space to hold the token's spelling.
1426 Returns a pointer to the character after the last character
1427 written. */
1428 unsigned char *
1429 cpp_spell_token (pfile, token, buffer)
1430 cpp_reader *pfile; /* Would be nice to be rid of this... */
1431 const cpp_token *token;
1432 unsigned char *buffer;
1434 switch (TOKEN_SPELL (token))
1436 case SPELL_OPERATOR:
1438 const unsigned char *spelling;
1439 unsigned char c;
1441 if (token->flags & DIGRAPH)
1442 spelling
1443 = digraph_spellings[(int) token->type - (int) CPP_FIRST_DIGRAPH];
1444 else if (token->flags & NAMED_OP)
1445 goto spell_ident;
1446 else
1447 spelling = TOKEN_NAME (token);
1449 while ((c = *spelling++) != '\0')
1450 *buffer++ = c;
1452 break;
1454 case SPELL_CHAR:
1455 *buffer++ = token->val.c;
1456 break;
1458 spell_ident:
1459 case SPELL_IDENT:
1460 memcpy (buffer, NODE_NAME (token->val.node), NODE_LEN (token->val.node));
1461 buffer += NODE_LEN (token->val.node);
1462 break;
1464 case SPELL_NUMBER:
1465 memcpy (buffer, token->val.str.text, token->val.str.len);
1466 buffer += token->val.str.len;
1467 break;
1469 case SPELL_STRING:
1471 int left, right, tag;
1472 switch (token->type)
1474 case CPP_STRING: left = '"'; right = '"'; tag = '\0'; break;
1475 case CPP_WSTRING: left = '"'; right = '"'; tag = 'L'; break;
1476 case CPP_CHAR: left = '\''; right = '\''; tag = '\0'; break;
1477 case CPP_WCHAR: left = '\''; right = '\''; tag = 'L'; break;
1478 case CPP_HEADER_NAME: left = '<'; right = '>'; tag = '\0'; break;
1479 default:
1480 cpp_ice (pfile, "unknown string token %s\n", TOKEN_NAME (token));
1481 return buffer;
1483 if (tag) *buffer++ = tag;
1484 *buffer++ = left;
1485 memcpy (buffer, token->val.str.text, token->val.str.len);
1486 buffer += token->val.str.len;
1487 *buffer++ = right;
1489 break;
1491 case SPELL_NONE:
1492 cpp_ice (pfile, "Unspellable token %s", TOKEN_NAME (token));
1493 break;
1496 return buffer;
1499 /* Returns a token as a null-terminated string. The string is
1500 temporary, and automatically freed later. Useful for diagnostics. */
1501 unsigned char *
1502 cpp_token_as_text (pfile, token)
1503 cpp_reader *pfile;
1504 const cpp_token *token;
1506 unsigned int len = cpp_token_len (token);
1507 unsigned char *start = _cpp_unaligned_alloc (pfile, len), *end;
1509 end = cpp_spell_token (pfile, token, start);
1510 end[0] = '\0';
1512 return start;
1515 /* Used by C front ends. Should really move to using cpp_token_as_text. */
1516 const char *
1517 cpp_type2name (type)
1518 enum cpp_ttype type;
1520 return (const char *) token_spellings[type].name;
1523 /* Writes the spelling of token to FP, without any preceding space.
1524 Separated from cpp_spell_token for efficiency - to avoid stdio
1525 double-buffering. */
1526 void
1527 cpp_output_token (token, fp)
1528 const cpp_token *token;
1529 FILE *fp;
1531 switch (TOKEN_SPELL (token))
1533 case SPELL_OPERATOR:
1535 const unsigned char *spelling;
1536 int c;
1538 if (token->flags & DIGRAPH)
1539 spelling
1540 = digraph_spellings[(int) token->type - (int) CPP_FIRST_DIGRAPH];
1541 else if (token->flags & NAMED_OP)
1542 goto spell_ident;
1543 else
1544 spelling = TOKEN_NAME (token);
1546 c = *spelling;
1548 putc (c, fp);
1549 while ((c = *++spelling) != '\0');
1551 break;
1553 case SPELL_CHAR:
1554 putc (token->val.c, fp);
1555 break;
1557 spell_ident:
1558 case SPELL_IDENT:
1559 fwrite (NODE_NAME (token->val.node), 1, NODE_LEN (token->val.node), fp);
1560 break;
1562 case SPELL_NUMBER:
1563 fwrite (token->val.str.text, 1, token->val.str.len, fp);
1564 break;
1566 case SPELL_STRING:
1568 int left, right, tag;
1569 switch (token->type)
1571 case CPP_STRING: left = '"'; right = '"'; tag = '\0'; break;
1572 case CPP_WSTRING: left = '"'; right = '"'; tag = 'L'; break;
1573 case CPP_CHAR: left = '\''; right = '\''; tag = '\0'; break;
1574 case CPP_WCHAR: left = '\''; right = '\''; tag = 'L'; break;
1575 case CPP_HEADER_NAME: left = '<'; right = '>'; tag = '\0'; break;
1576 default:
1577 fprintf (stderr, "impossible STRING token %s\n", TOKEN_NAME (token));
1578 return;
1580 if (tag) putc (tag, fp);
1581 putc (left, fp);
1582 fwrite (token->val.str.text, 1, token->val.str.len, fp);
1583 putc (right, fp);
1585 break;
1587 case SPELL_NONE:
1588 /* An error, most probably. */
1589 break;
1593 /* Compare two tokens. */
1595 _cpp_equiv_tokens (a, b)
1596 const cpp_token *a, *b;
1598 if (a->type == b->type && a->flags == b->flags)
1599 switch (TOKEN_SPELL (a))
1601 default: /* Keep compiler happy. */
1602 case SPELL_OPERATOR:
1603 return 1;
1604 case SPELL_CHAR:
1605 return a->val.c == b->val.c; /* Character. */
1606 case SPELL_NONE:
1607 return (a->type != CPP_MACRO_ARG || a->val.arg_no == b->val.arg_no);
1608 case SPELL_IDENT:
1609 return a->val.node == b->val.node;
1610 case SPELL_NUMBER:
1611 case SPELL_STRING:
1612 return (a->val.str.len == b->val.str.len
1613 && !memcmp (a->val.str.text, b->val.str.text,
1614 a->val.str.len));
1617 return 0;
1620 /* Returns nonzero if a space should be inserted to avoid an
1621 accidental token paste for output. For simplicity, it is
1622 conservative, and occasionally advises a space where one is not
1623 needed, e.g. "." and ".2". */
1626 cpp_avoid_paste (pfile, token1, token2)
1627 cpp_reader *pfile;
1628 const cpp_token *token1, *token2;
1630 enum cpp_ttype a = token1->type, b = token2->type;
1631 cppchar_t c;
1633 if (token1->flags & NAMED_OP)
1634 a = CPP_NAME;
1635 if (token2->flags & NAMED_OP)
1636 b = CPP_NAME;
1638 c = EOF;
1639 if (token2->flags & DIGRAPH)
1640 c = digraph_spellings[(int) b - (int) CPP_FIRST_DIGRAPH][0];
1641 else if (token_spellings[b].category == SPELL_OPERATOR)
1642 c = token_spellings[b].name[0];
1644 /* Quickly get everything that can paste with an '='. */
1645 if ((int) a <= (int) CPP_LAST_EQ && c == '=')
1646 return 1;
1648 switch (a)
1650 case CPP_GREATER: return c == '>' || c == '?';
1651 case CPP_LESS: return c == '<' || c == '?' || c == '%' || c == ':';
1652 case CPP_PLUS: return c == '+';
1653 case CPP_MINUS: return c == '-' || c == '>';
1654 case CPP_DIV: return c == '/' || c == '*'; /* Comments. */
1655 case CPP_MOD: return c == ':' || c == '>';
1656 case CPP_AND: return c == '&';
1657 case CPP_OR: return c == '|';
1658 case CPP_COLON: return c == ':' || c == '>';
1659 case CPP_DEREF: return c == '*';
1660 case CPP_DOT: return c == '.' || c == '%' || b == CPP_NUMBER;
1661 case CPP_HASH: return c == '#' || c == '%'; /* Digraph form. */
1662 case CPP_NAME: return ((b == CPP_NUMBER
1663 && name_p (pfile, &token2->val.str))
1664 || b == CPP_NAME
1665 || b == CPP_CHAR || b == CPP_STRING); /* L */
1666 case CPP_NUMBER: return (b == CPP_NUMBER || b == CPP_NAME
1667 || c == '.' || c == '+' || c == '-');
1668 case CPP_OTHER: return (CPP_OPTION (pfile, objc)
1669 && token1->val.c == '@'
1670 && (b == CPP_NAME || b == CPP_STRING));
1671 default: break;
1674 return 0;
1677 /* Output all the remaining tokens on the current line, and a newline
1678 character, to FP. Leading whitespace is removed. If there are
1679 macros, special token padding is not performed. */
1680 void
1681 cpp_output_line (pfile, fp)
1682 cpp_reader *pfile;
1683 FILE *fp;
1685 const cpp_token *token;
1687 token = cpp_get_token (pfile);
1688 while (token->type != CPP_EOF)
1690 cpp_output_token (token, fp);
1691 token = cpp_get_token (pfile);
1692 if (token->flags & PREV_WHITE)
1693 putc (' ', fp);
1696 putc ('\n', fp);
/* Return the numeric value (0-15) of the hexadecimal digit C, which
   may be in either case.  Aborts if C is not a hexadecimal digit.  */
static unsigned int
hex_digit_value (c)
     unsigned int c;
{
  /* C is unsigned, so a value below the base of a range wraps to a
     large number and fails the upper-bound comparison.  */
  if (c - '0' <= 9)
    return c - '0';
  if (c - 'a' <= 'f' - 'a')
    return 10 + (c - 'a');
  if (c - 'A' <= 'F' - 'A')
    return 10 + (c - 'A');

  abort ();
}
1713 /* Parse a '\uNNNN' or '\UNNNNNNNN' sequence. Returns 1 to indicate
1714 failure if cpplib is not parsing C++ or C99. Such failure is
1715 silent, and no variables are updated. Otherwise returns 0, and
1716 warns if -Wtraditional.
1718 [lex.charset]: The character designated by the universal character
1719 name \UNNNNNNNN is that character whose character short name in
1720 ISO/IEC 10646 is NNNNNNNN; the character designated by the
1721 universal character name \uNNNN is that character whose character
1722 short name in ISO/IEC 10646 is 0000NNNN. If the hexadecimal value
1723 for a universal character name is less than 0x20 or in the range
1724 0x7F-0x9F (inclusive), or if the universal character name
1725 designates a character in the basic source character set, then the
1726 program is ill-formed.
1728 We assume that wchar_t is Unicode, so we don't need to do any
1729 mapping. Is this ever wrong?
1731 PC points to the 'u' or 'U', PSTR is points to the byte after PC,
1732 LIMIT is the end of the string or charconst. PSTR is updated to
1733 point after the UCS on return, and the UCS is written into PC. */
1735 static int
1736 maybe_read_ucs (pfile, pstr, limit, pc)
1737 cpp_reader *pfile;
1738 const unsigned char **pstr;
1739 const unsigned char *limit;
1740 unsigned int *pc;
1742 const unsigned char *p = *pstr;
1743 unsigned int code = 0;
1744 unsigned int c = *pc, length;
1746 /* Only attempt to interpret a UCS for C++ and C99. */
1747 if (! (CPP_OPTION (pfile, cplusplus) || CPP_OPTION (pfile, c99)))
1748 return 1;
1750 if (CPP_WTRADITIONAL (pfile))
1751 cpp_warning (pfile, "the meaning of '\\%c' varies with -traditional", c);
1753 length = (c == 'u' ? 4: 8);
1755 if ((size_t) (limit - p) < length)
1757 cpp_error (pfile, "incomplete universal-character-name");
1758 /* Skip to the end to avoid more diagnostics. */
1759 p = limit;
1761 else
1763 for (; length; length--, p++)
1765 c = *p;
1766 if (ISXDIGIT (c))
1767 code = (code << 4) + hex_digit_value (c);
1768 else
1770 cpp_error (pfile,
1771 "non-hex digit '%c' in universal-character-name", c);
1772 /* We shouldn't skip in case there are multibyte chars. */
1773 break;
1778 #ifdef TARGET_EBCDIC
1779 cpp_error (pfile, "universal-character-name on EBCDIC target");
1780 code = 0x3f; /* EBCDIC invalid character */
1781 #else
1782 /* True extended characters are OK. */
1783 if (code >= 0xa0
1784 && !(code & 0x80000000)
1785 && !(code >= 0xD800 && code <= 0xDFFF))
1787 /* The standard permits $, @ and ` to be specified as UCNs. We use
1788 hex escapes so that this also works with EBCDIC hosts. */
1789 else if (code == 0x24 || code == 0x40 || code == 0x60)
1791 /* Don't give another error if one occurred above. */
1792 else if (length == 0)
1793 cpp_error (pfile, "universal-character-name out of range");
1794 #endif
1796 *pstr = p;
1797 *pc = code;
1798 return 0;
1801 /* Interpret an escape sequence, and return its value. PSTR points to
1802 the input pointer, which is just after the backslash. LIMIT is how
1803 much text we have. MASK is a bitmask for the precision for the
1804 destination type (char or wchar_t). TRADITIONAL, if true, does not
1805 interpret escapes that did not exist in traditional C.
1807 Handles all relevant diagnostics. */
1809 unsigned int
1810 cpp_parse_escape (pfile, pstr, limit, mask, traditional)
1811 cpp_reader *pfile;
1812 const unsigned char **pstr;
1813 const unsigned char *limit;
1814 unsigned HOST_WIDE_INT mask;
1815 int traditional;
1817 int unknown = 0;
1818 const unsigned char *str = *pstr;
1819 unsigned int c = *str++;
1821 switch (c)
1823 case '\\': case '\'': case '"': case '?': break;
1824 case 'b': c = TARGET_BS; break;
1825 case 'f': c = TARGET_FF; break;
1826 case 'n': c = TARGET_NEWLINE; break;
1827 case 'r': c = TARGET_CR; break;
1828 case 't': c = TARGET_TAB; break;
1829 case 'v': c = TARGET_VT; break;
1831 case '(': case '{': case '[': case '%':
1832 /* '\(', etc, are used at beginning of line to avoid confusing Emacs.
1833 '\%' is used to prevent SCCS from getting confused. */
1834 unknown = CPP_PEDANTIC (pfile);
1835 break;
1837 case 'a':
1838 if (CPP_WTRADITIONAL (pfile))
1839 cpp_warning (pfile, "the meaning of '\\a' varies with -traditional");
1840 if (!traditional)
1841 c = TARGET_BELL;
1842 break;
1844 case 'e': case 'E':
1845 if (CPP_PEDANTIC (pfile))
1846 cpp_pedwarn (pfile, "non-ISO-standard escape sequence, '\\%c'", c);
1847 c = TARGET_ESC;
1848 break;
1850 case 'u': case 'U':
1851 unknown = maybe_read_ucs (pfile, &str, limit, &c);
1852 break;
1854 case 'x':
1855 if (CPP_WTRADITIONAL (pfile))
1856 cpp_warning (pfile, "the meaning of '\\x' varies with -traditional");
1858 if (!traditional)
1860 unsigned int i = 0, overflow = 0;
1861 int digits_found = 0;
1863 while (str < limit)
1865 c = *str;
1866 if (! ISXDIGIT (c))
1867 break;
1868 str++;
1869 overflow |= i ^ (i << 4 >> 4);
1870 i = (i << 4) + hex_digit_value (c);
1871 digits_found = 1;
1874 if (!digits_found)
1875 cpp_error (pfile, "\\x used with no following hex digits");
1877 if (overflow | (i != (i & mask)))
1879 cpp_pedwarn (pfile, "hex escape sequence out of range");
1880 i &= mask;
1882 c = i;
1884 break;
1886 case '0': case '1': case '2': case '3':
1887 case '4': case '5': case '6': case '7':
1889 unsigned int i = c - '0';
1890 int count = 0;
1892 while (str < limit && ++count < 3)
1894 c = *str;
1895 if (c < '0' || c > '7')
1896 break;
1897 str++;
1898 i = (i << 3) + c - '0';
1901 if (i != (i & mask))
1903 cpp_pedwarn (pfile, "octal escape sequence out of range");
1904 i &= mask;
1906 c = i;
1908 break;
1910 default:
1911 unknown = 1;
1912 break;
1915 if (unknown)
1917 if (ISGRAPH (c))
1918 cpp_pedwarn (pfile, "unknown escape sequence '\\%c'", c);
1919 else
1920 cpp_pedwarn (pfile, "unknown escape sequence: '\\%03o'", c);
1923 if (c > mask)
1924 cpp_pedwarn (pfile, "escape sequence out of range for character");
1926 *pstr = str;
1927 return c;
1930 #ifndef MAX_CHAR_TYPE_SIZE
1931 #define MAX_CHAR_TYPE_SIZE CHAR_TYPE_SIZE
1932 #endif
1934 #ifndef MAX_WCHAR_TYPE_SIZE
1935 #define MAX_WCHAR_TYPE_SIZE WCHAR_TYPE_SIZE
1936 #endif
1938 /* Interpret a (possibly wide) character constant in TOKEN.
1939 WARN_MULTI warns about multi-character charconsts, if not
1940 TRADITIONAL. TRADITIONAL also indicates not to interpret escapes
1941 that did not exist in traditional C. PCHARS_SEEN points to a
1942 variable that is filled in with the number of characters seen. */
1943 HOST_WIDE_INT
1944 cpp_interpret_charconst (pfile, token, warn_multi, traditional, pchars_seen)
1945 cpp_reader *pfile;
1946 const cpp_token *token;
1947 int warn_multi;
1948 int traditional;
1949 unsigned int *pchars_seen;
1951 const unsigned char *str = token->val.str.text;
1952 const unsigned char *limit = str + token->val.str.len;
1953 unsigned int chars_seen = 0;
1954 unsigned int width, max_chars, c;
1955 unsigned HOST_WIDE_INT mask;
1956 HOST_WIDE_INT result = 0;
1958 #ifdef MULTIBYTE_CHARS
1959 (void) local_mbtowc (NULL, NULL, 0);
1960 #endif
1962 /* Width in bits. */
1963 if (token->type == CPP_CHAR)
1964 width = MAX_CHAR_TYPE_SIZE;
1965 else
1966 width = MAX_WCHAR_TYPE_SIZE;
1968 if (width < HOST_BITS_PER_WIDE_INT)
1969 mask = ((unsigned HOST_WIDE_INT) 1 << width) - 1;
1970 else
1971 mask = ~0;
1972 max_chars = HOST_BITS_PER_WIDE_INT / width;
1974 while (str < limit)
1976 #ifdef MULTIBYTE_CHARS
1977 wchar_t wc;
1978 int char_len;
1980 char_len = local_mbtowc (&wc, str, limit - str);
1981 if (char_len == -1)
1983 cpp_warning (pfile, "ignoring invalid multibyte character");
1984 c = *str++;
1986 else
1988 str += char_len;
1989 c = wc;
1991 #else
1992 c = *str++;
1993 #endif
1995 if (c == '\\')
1996 c = cpp_parse_escape (pfile, &str, limit, mask, traditional);
1998 #ifdef MAP_CHARACTER
1999 if (ISPRINT (c))
2000 c = MAP_CHARACTER (c);
2001 #endif
2003 /* Merge character into result; ignore excess chars. */
2004 if (++chars_seen <= max_chars)
2006 if (width < HOST_BITS_PER_WIDE_INT)
2007 result = (result << width) | (c & mask);
2008 else
2009 result = c;
2013 if (chars_seen == 0)
2014 cpp_error (pfile, "empty character constant");
2015 else if (chars_seen > max_chars)
2017 chars_seen = max_chars;
2018 cpp_warning (pfile, "character constant too long");
2020 else if (chars_seen > 1 && !traditional && warn_multi)
2021 cpp_warning (pfile, "multi-character character constant");
2023 /* If char type is signed, sign-extend the constant. The
2024 __CHAR_UNSIGNED__ macro is set by the driver if appropriate. */
2025 if (token->type == CPP_CHAR && chars_seen)
2027 unsigned int nbits = chars_seen * width;
2028 unsigned int mask = (unsigned int) ~0 >> (HOST_BITS_PER_INT - nbits);
2030 if (pfile->spec_nodes.n__CHAR_UNSIGNED__->type == NT_MACRO
2031 || ((result >> (nbits - 1)) & 1) == 0)
2032 result &= mask;
2033 else
2034 result |= ~mask;
2037 *pchars_seen = chars_seen;
2038 return result;
/* Memory buffers.  Changing these three constants can have a dramatic
   effect on performance.  The values here are reasonable defaults,
   but might be tuned.  If you adjust them, be sure to test across a
   range of uses of cpplib, including heavy nested function-like macro
   expansion.  Also check the change in peak memory usage (NJAMD is a
   good tool for this).

   Every macro argument is parenthesized in the expansion so that an
   argument containing a low-precedence operator is still treated as a
   single operand.  */
#define MIN_BUFF_SIZE 8000
#define BUFF_SIZE_UPPER_BOUND(MIN_SIZE) (8000 + (MIN_SIZE) * 3 / 2)
#define EXTENDED_BUFF_SIZE(BUFF, MIN_EXTRA) \
  ((MIN_EXTRA) + ((BUFF)->limit - (BUFF)->cur) * 2)

/* Used only to measure an alignment: the offset of U is the padding
   the compiler inserts after a single char so that a double or a
   pointer can follow it.  */
struct dummy
{
  char c;
  union
  {
    double d;
    int *p;
  } u;
};

#define DEFAULT_ALIGNMENT (offsetof (struct dummy, u))
/* Round SIZE up to a multiple of ALIGN; the bit-masking requires
   ALIGN to be a power of two.  */
#define CPP_ALIGN(size, align) (((size) + ((align) - 1)) & ~((align) - 1))
2065 /* Create a new allocation buffer. Place the control block at the end
2066 of the buffer, so that buffer overflows will cause immediate chaos. */
2067 static _cpp_buff *
2068 new_buff (len)
2069 size_t len;
2071 _cpp_buff *result;
2072 unsigned char *base;
2074 if (len < MIN_BUFF_SIZE)
2075 len = MIN_BUFF_SIZE;
2076 len = CPP_ALIGN (len, DEFAULT_ALIGNMENT);
2078 base = xmalloc (len + sizeof (_cpp_buff));
2079 result = (_cpp_buff *) (base + len);
2080 result->base = base;
2081 result->cur = base;
2082 result->limit = base + len;
2083 result->next = NULL;
2084 return result;
2087 /* Place a chain of unwanted allocation buffers on the free list. */
2088 void
2089 _cpp_release_buff (pfile, buff)
2090 cpp_reader *pfile;
2091 _cpp_buff *buff;
2093 _cpp_buff *end = buff;
2095 while (end->next)
2096 end = end->next;
2097 end->next = pfile->free_buffs;
2098 pfile->free_buffs = buff;
2101 /* Return a free buffer of size at least MIN_SIZE. */
2102 _cpp_buff *
2103 _cpp_get_buff (pfile, min_size)
2104 cpp_reader *pfile;
2105 size_t min_size;
2107 _cpp_buff *result, **p;
2109 for (p = &pfile->free_buffs;; p = &(*p)->next)
2111 size_t size;
2113 if (*p == NULL)
2114 return new_buff (min_size);
2115 result = *p;
2116 size = result->limit - result->base;
2117 /* Return a buffer that's big enough, but don't waste one that's
2118 way too big. */
2119 if (size >= min_size && size <= BUFF_SIZE_UPPER_BOUND (min_size))
2120 break;
2123 *p = result->next;
2124 result->next = NULL;
2125 result->cur = result->base;
2126 return result;
2129 /* Creates a new buffer with enough space to hold the uncommitted
2130 remaining bytes of BUFF, and at least MIN_EXTRA more bytes. Copies
2131 the excess bytes to the new buffer. Chains the new buffer after
2132 BUFF, and returns the new buffer. */
2133 _cpp_buff *
2134 _cpp_append_extend_buff (pfile, buff, min_extra)
2135 cpp_reader *pfile;
2136 _cpp_buff *buff;
2137 size_t min_extra;
2139 size_t size = EXTENDED_BUFF_SIZE (buff, min_extra);
2140 _cpp_buff *new_buff = _cpp_get_buff (pfile, size);
2142 buff->next = new_buff;
2143 memcpy (new_buff->base, buff->cur, BUFF_ROOM (buff));
2144 return new_buff;
2147 /* Creates a new buffer with enough space to hold the uncommitted
2148 remaining bytes of the buffer pointed to by BUFF, and at least
2149 MIN_EXTRA more bytes. Copies the excess bytes to the new buffer.
2150 Chains the new buffer before the buffer pointed to by BUFF, and
2151 updates the pointer to point to the new buffer. */
2152 void
2153 _cpp_extend_buff (pfile, pbuff, min_extra)
2154 cpp_reader *pfile;
2155 _cpp_buff **pbuff;
2156 size_t min_extra;
2158 _cpp_buff *new_buff, *old_buff = *pbuff;
2159 size_t size = EXTENDED_BUFF_SIZE (old_buff, min_extra);
2161 new_buff = _cpp_get_buff (pfile, size);
2162 memcpy (new_buff->base, old_buff->cur, BUFF_ROOM (old_buff));
2163 new_buff->next = old_buff;
2164 *pbuff = new_buff;
2167 /* Free a chain of buffers starting at BUFF. */
2168 void
2169 _cpp_free_buff (buff)
2170 _cpp_buff *buff;
2172 _cpp_buff *next;
2174 for (; buff; buff = next)
2176 next = buff->next;
2177 free (buff->base);
2181 /* Allocate permanent, unaligned storage of length LEN. */
2182 unsigned char *
2183 _cpp_unaligned_alloc (pfile, len)
2184 cpp_reader *pfile;
2185 size_t len;
2187 _cpp_buff *buff = pfile->u_buff;
2188 unsigned char *result = buff->cur;
2190 if (len > (size_t) (buff->limit - result))
2192 buff = _cpp_get_buff (pfile, len);
2193 buff->next = pfile->u_buff;
2194 pfile->u_buff = buff;
2195 result = buff->cur;
2198 buff->cur = result + len;
2199 return result;
2202 /* Allocate permanent, unaligned storage of length LEN. */
2203 unsigned char *
2204 _cpp_aligned_alloc (pfile, len)
2205 cpp_reader *pfile;
2206 size_t len;
2208 _cpp_buff *buff = pfile->a_buff;
2209 unsigned char *result = buff->cur;
2211 if (len > (size_t) (buff->limit - result))
2213 buff = _cpp_get_buff (pfile, len);
2214 buff->next = pfile->a_buff;
2215 pfile->a_buff = buff;
2216 result = buff->cur;
2219 buff->cur = result + len;
2220 return result;