* config/xtensa/xtensa.h (ASM_OUTPUT_POOL_PROLOGUE): Emit a
[official-gcc.git] / gcc / cpplex.c
blobc1dae50f2082c102aefa3a76338ead5366383cc1
1 /* CPP Library - lexical analysis.
2 Copyright (C) 2000, 2001, 2002 Free Software Foundation, Inc.
3 Contributed by Per Bothner, 1994-95.
4 Based on CCCP program by Paul Rubin, June 1986
5 Adapted to ANSI C, Richard Stallman, Jan 1987
6 Broken out to separate file, Zack Weinberg, Mar 2000
7 Single-pass line tokenization by Neil Booth, April 2000
9 This program is free software; you can redistribute it and/or modify it
10 under the terms of the GNU General Public License as published by the
11 Free Software Foundation; either version 2, or (at your option) any
12 later version.
14 This program is distributed in the hope that it will be useful,
15 but WITHOUT ANY WARRANTY; without even the implied warranty of
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 GNU General Public License for more details.
19 You should have received a copy of the GNU General Public License
20 along with this program; if not, write to the Free Software
21 Foundation, 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
23 #include "config.h"
24 #include "system.h"
25 #include "cpplib.h"
26 #include "cpphash.h"
28 /* MULTIBYTE_CHARS support only works for native compilers.
29 ??? Ideally what we want is to model widechar support after
30 the current floating point support. */
31 #ifdef CROSS_COMPILE
32 #undef MULTIBYTE_CHARS
33 #endif
35 #ifdef MULTIBYTE_CHARS
36 #include "mbchar.h"
37 #include <locale.h>
38 #endif
/* How a token's spelling is obtained.  Tokens with SPELL_STRING
   store their spelling in the token list, and its length in the
   token->val.name.len.  */
enum spell_type
{
  SPELL_OPERATOR = 0,
  SPELL_CHAR,
  SPELL_IDENT,
  SPELL_NUMBER,
  SPELL_STRING,
  SPELL_NONE
};
52 struct token_spelling
54 enum spell_type category;
55 const unsigned char *name;
58 static const unsigned char *const digraph_spellings[] =
59 { U"%:", U"%:%:", U"<:", U":>", U"<%", U"%>" };
61 #define OP(e, s) { SPELL_OPERATOR, U s },
62 #define TK(e, s) { s, U STRINGX (e) },
63 static const struct token_spelling token_spellings[N_TTYPES] = { TTYPE_TABLE };
64 #undef OP
65 #undef TK
67 #define TOKEN_SPELL(token) (token_spellings[(token)->type].category)
68 #define TOKEN_NAME(token) (token_spellings[(token)->type].name)
69 #define BACKUP() do {buffer->cur = buffer->backup_to;} while (0)
71 static void handle_newline PARAMS ((cpp_reader *));
72 static cppchar_t skip_escaped_newlines PARAMS ((cpp_reader *));
73 static cppchar_t get_effective_char PARAMS ((cpp_reader *));
75 static int skip_block_comment PARAMS ((cpp_reader *));
76 static int skip_line_comment PARAMS ((cpp_reader *));
77 static void adjust_column PARAMS ((cpp_reader *));
78 static int skip_whitespace PARAMS ((cpp_reader *, cppchar_t));
79 static cpp_hashnode *parse_identifier PARAMS ((cpp_reader *));
80 static cpp_hashnode *parse_identifier_slow PARAMS ((cpp_reader *,
81 const U_CHAR *));
82 static void parse_number PARAMS ((cpp_reader *, cpp_string *, cppchar_t, int));
83 static int unescaped_terminator_p PARAMS ((cpp_reader *, const U_CHAR *));
84 static void parse_string PARAMS ((cpp_reader *, cpp_token *, cppchar_t));
85 static void unterminated PARAMS ((cpp_reader *, int));
86 static bool trigraph_p PARAMS ((cpp_reader *));
87 static void save_comment PARAMS ((cpp_reader *, cpp_token *, const U_CHAR *));
88 static int name_p PARAMS ((cpp_reader *, const cpp_string *));
89 static int maybe_read_ucs PARAMS ((cpp_reader *, const unsigned char **,
90 const unsigned char *, unsigned int *));
91 static tokenrun *next_tokenrun PARAMS ((tokenrun *));
93 static unsigned int hex_digit_value PARAMS ((unsigned int));
94 static _cpp_buff *new_buff PARAMS ((size_t));
96 /* Utility routine:
98 Compares, the token TOKEN to the NUL-terminated string STRING.
99 TOKEN must be a CPP_NAME. Returns 1 for equal, 0 for unequal. */
101 cpp_ideq (token, string)
102 const cpp_token *token;
103 const char *string;
105 if (token->type != CPP_NAME)
106 return 0;
108 return !ustrcmp (NODE_NAME (token->val.node), (const U_CHAR *) string);
111 /* Call when meeting a newline, assumed to be in buffer->cur[-1].
112 Returns with buffer->cur pointing to the character immediately
113 following the newline (combination). */
114 static void
115 handle_newline (pfile)
116 cpp_reader *pfile;
118 cpp_buffer *buffer = pfile->buffer;
120 /* Handle CR-LF and LF-CR. Most other implementations (e.g. java)
121 only accept CR-LF; maybe we should fall back to that behaviour? */
122 if (buffer->cur[-1] + buffer->cur[0] == '\r' + '\n')
123 buffer->cur++;
125 buffer->line_base = buffer->cur;
126 buffer->col_adjust = 0;
127 pfile->line++;
130 /* Subroutine of skip_escaped_newlines; called when a 3-character
131 sequence beginning with "??" is encountered. buffer->cur points to
132 the second '?'.
134 Warn if necessary, and returns true if the sequence forms a
135 trigraph and the trigraph should be honoured. */
136 static bool
137 trigraph_p (pfile)
138 cpp_reader *pfile;
140 cpp_buffer *buffer = pfile->buffer;
141 cppchar_t from_char = buffer->cur[1];
142 bool accept;
144 if (!_cpp_trigraph_map[from_char])
145 return false;
147 accept = CPP_OPTION (pfile, trigraphs);
149 /* Don't warn about trigraphs in comments. */
150 if (CPP_OPTION (pfile, warn_trigraphs) && !pfile->state.lexing_comment)
152 if (accept)
153 cpp_warning_with_line (pfile, pfile->line, CPP_BUF_COL (buffer) - 1,
154 "trigraph ??%c converted to %c",
155 (int) from_char,
156 (int) _cpp_trigraph_map[from_char]);
157 else if (buffer->cur != buffer->last_Wtrigraphs)
159 buffer->last_Wtrigraphs = buffer->cur;
160 cpp_warning_with_line (pfile, pfile->line,
161 CPP_BUF_COL (buffer) - 1,
162 "trigraph ??%c ignored", (int) from_char);
166 return accept;
169 /* Skips any escaped newlines introduced by '?' or a '\\', assumed to
170 lie in buffer->cur[-1]. Returns the next byte, which will be in
171 buffer->cur[-1]. This routine performs preprocessing stages 1 and
172 2 of the ISO C standard. */
173 static cppchar_t
174 skip_escaped_newlines (pfile)
175 cpp_reader *pfile;
177 cpp_buffer *buffer = pfile->buffer;
178 cppchar_t next = buffer->cur[-1];
180 /* Only do this if we apply stages 1 and 2. */
181 if (!buffer->from_stage3)
183 const unsigned char *saved_cur;
184 cppchar_t next1;
188 if (next == '?')
190 if (buffer->cur[0] != '?' || !trigraph_p (pfile))
191 break;
193 /* Translate the trigraph. */
194 next = _cpp_trigraph_map[buffer->cur[1]];
195 buffer->cur += 2;
196 if (next != '\\')
197 break;
200 if (buffer->cur == buffer->rlimit)
201 break;
203 /* We have a backslash, and room for at least one more
204 character. Skip horizontal whitespace. */
205 saved_cur = buffer->cur;
207 next1 = *buffer->cur++;
208 while (is_nvspace (next1) && buffer->cur < buffer->rlimit);
210 if (!is_vspace (next1))
212 buffer->cur = saved_cur;
213 break;
216 if (saved_cur != buffer->cur - 1
217 && !pfile->state.lexing_comment)
218 cpp_warning (pfile, "backslash and newline separated by space");
220 handle_newline (pfile);
221 buffer->backup_to = buffer->cur;
222 if (buffer->cur == buffer->rlimit)
224 cpp_pedwarn (pfile, "backslash-newline at end of file");
225 next = EOF;
227 else
228 next = *buffer->cur++;
230 while (next == '\\' || next == '?');
233 return next;
236 /* Obtain the next character, after trigraph conversion and skipping
237 an arbitrarily long string of escaped newlines. The common case of
238 no trigraphs or escaped newlines falls through quickly. On return,
239 buffer->backup_to points to where to return to if the character is
240 not to be processed. */
241 static cppchar_t
242 get_effective_char (pfile)
243 cpp_reader *pfile;
245 cppchar_t next;
246 cpp_buffer *buffer = pfile->buffer;
248 buffer->backup_to = buffer->cur;
249 next = *buffer->cur++;
250 if (__builtin_expect (next == '?' || next == '\\', 0))
251 next = skip_escaped_newlines (pfile);
253 return next;
256 /* Skip a C-style block comment. We find the end of the comment by
257 seeing if an asterisk is before every '/' we encounter. Returns
258 non-zero if comment terminated by EOF, zero otherwise. */
259 static int
260 skip_block_comment (pfile)
261 cpp_reader *pfile;
263 cpp_buffer *buffer = pfile->buffer;
264 cppchar_t c = EOF, prevc = EOF;
266 pfile->state.lexing_comment = 1;
267 while (buffer->cur != buffer->rlimit)
269 prevc = c, c = *buffer->cur++;
271 /* FIXME: For speed, create a new character class of characters
272 of interest inside block comments. */
273 if (c == '?' || c == '\\')
274 c = skip_escaped_newlines (pfile);
276 /* People like decorating comments with '*', so check for '/'
277 instead for efficiency. */
278 if (c == '/')
280 if (prevc == '*')
281 break;
283 /* Warn about potential nested comments, but not if the '/'
284 comes immediately before the true comment delimiter.
285 Don't bother to get it right across escaped newlines. */
286 if (CPP_OPTION (pfile, warn_comments)
287 && buffer->cur[0] == '*' && buffer->cur[1] != '/')
288 cpp_warning_with_line (pfile,
289 pfile->line, CPP_BUF_COL (buffer),
290 "\"/*\" within comment");
292 else if (is_vspace (c))
293 handle_newline (pfile);
294 else if (c == '\t')
295 adjust_column (pfile);
298 pfile->state.lexing_comment = 0;
299 return c != '/' || prevc != '*';
302 /* Skip a C++ line comment, leaving buffer->cur pointing to the
303 terminating newline. Handles escaped newlines. Returns non-zero
304 if a multiline comment. */
305 static int
306 skip_line_comment (pfile)
307 cpp_reader *pfile;
309 cpp_buffer *buffer = pfile->buffer;
310 unsigned int orig_line = pfile->line;
311 cppchar_t c;
313 pfile->state.lexing_comment = 1;
316 if (buffer->cur == buffer->rlimit)
317 goto at_eof;
319 c = *buffer->cur++;
320 if (c == '?' || c == '\\')
321 c = skip_escaped_newlines (pfile);
323 while (!is_vspace (c));
325 /* Step back over the newline, except at EOF. */
326 buffer->cur--;
327 at_eof:
329 pfile->state.lexing_comment = 0;
330 return orig_line != pfile->line;
333 /* pfile->buffer->cur is one beyond the \t character. Update
334 col_adjust so we track the column correctly. */
335 static void
336 adjust_column (pfile)
337 cpp_reader *pfile;
339 cpp_buffer *buffer = pfile->buffer;
340 unsigned int col = CPP_BUF_COL (buffer) - 1; /* Zero-based column. */
342 /* Round it up to multiple of the tabstop, but subtract 1 since the
343 tab itself occupies a character position. */
344 buffer->col_adjust += (CPP_OPTION (pfile, tabstop)
345 - col % CPP_OPTION (pfile, tabstop)) - 1;
348 /* Skips whitespace, saving the next non-whitespace character.
349 Adjusts pfile->col_adjust to account for tabs. Without this,
350 tokens might be assigned an incorrect column. */
351 static int
352 skip_whitespace (pfile, c)
353 cpp_reader *pfile;
354 cppchar_t c;
356 cpp_buffer *buffer = pfile->buffer;
357 unsigned int warned = 0;
361 /* Horizontal space always OK. */
362 if (c == ' ')
364 else if (c == '\t')
365 adjust_column (pfile);
366 /* Just \f \v or \0 left. */
367 else if (c == '\0')
369 if (buffer->cur - 1 == buffer->rlimit)
370 return 0;
371 if (!warned)
373 cpp_warning (pfile, "null character(s) ignored");
374 warned = 1;
377 else if (pfile->state.in_directive && CPP_PEDANTIC (pfile))
378 cpp_pedwarn_with_line (pfile, pfile->line,
379 CPP_BUF_COL (buffer),
380 "%s in preprocessing directive",
381 c == '\f' ? "form feed" : "vertical tab");
383 c = *buffer->cur++;
385 /* We only want non-vertical space, i.e. ' ' \t \f \v \0. */
386 while (is_nvspace (c));
388 buffer->cur--;
389 return 1;
392 /* See if the characters of a number token are valid in a name (no
393 '.', '+' or '-'). */
394 static int
395 name_p (pfile, string)
396 cpp_reader *pfile;
397 const cpp_string *string;
399 unsigned int i;
401 for (i = 0; i < string->len; i++)
402 if (!is_idchar (string->text[i]))
403 return 0;
405 return 1;
408 /* Parse an identifier, skipping embedded backslash-newlines. This is
409 a critical inner loop. The common case is an identifier which has
410 not been split by backslash-newline, does not contain a dollar
411 sign, and has already been scanned (roughly 10:1 ratio of
412 seen:unseen identifiers in normal code; the distribution is
413 Poisson-like). Second most common case is a new identifier, not
414 split and no dollar sign. The other possibilities are rare and
415 have been relegated to parse_identifier_slow. */
416 static cpp_hashnode *
417 parse_identifier (pfile)
418 cpp_reader *pfile;
420 cpp_hashnode *result;
421 const U_CHAR *cur;
423 /* Fast-path loop. Skim over a normal identifier.
424 N.B. ISIDNUM does not include $. */
425 cur = pfile->buffer->cur;
426 while (ISIDNUM (*cur))
427 cur++;
429 /* Check for slow-path cases. */
430 if (*cur == '?' || *cur == '\\' || *cur == '$')
431 result = parse_identifier_slow (pfile, cur);
432 else
434 const U_CHAR *base = pfile->buffer->cur - 1;
435 result = (cpp_hashnode *)
436 ht_lookup (pfile->hash_table, base, cur - base, HT_ALLOC);
437 pfile->buffer->cur = cur;
440 /* Rarely, identifiers require diagnostics when lexed.
441 XXX Has to be forced out of the fast path. */
442 if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
443 && !pfile->state.skipping, 0))
445 /* It is allowed to poison the same identifier twice. */
446 if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
447 cpp_error (pfile, "attempt to use poisoned \"%s\"",
448 NODE_NAME (result));
450 /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
451 replacement list of a variadic macro. */
452 if (result == pfile->spec_nodes.n__VA_ARGS__
453 && !pfile->state.va_args_ok)
454 cpp_pedwarn (pfile,
455 "__VA_ARGS__ can only appear in the expansion of a C99 variadic macro");
458 return result;
461 /* Slow path. This handles identifiers which have been split, and
462 identifiers which contain dollar signs. The part of the identifier
463 from PFILE->buffer->cur-1 to CUR has already been scanned. */
464 static cpp_hashnode *
465 parse_identifier_slow (pfile, cur)
466 cpp_reader *pfile;
467 const U_CHAR *cur;
469 cpp_buffer *buffer = pfile->buffer;
470 const U_CHAR *base = buffer->cur - 1;
471 struct obstack *stack = &pfile->hash_table->stack;
472 unsigned int c, saw_dollar = 0, len;
474 /* Copy the part of the token which is known to be okay. */
475 obstack_grow (stack, base, cur - base);
477 /* Now process the part which isn't. We are looking at one of
478 '$', '\\', or '?' on entry to this loop. */
479 c = *cur++;
480 buffer->cur = cur;
483 while (is_idchar (c))
485 obstack_1grow (stack, c);
487 if (c == '$')
488 saw_dollar++;
490 c = *buffer->cur++;
493 /* Potential escaped newline? */
494 buffer->backup_to = buffer->cur - 1;
495 if (c != '?' && c != '\\')
496 break;
497 c = skip_escaped_newlines (pfile);
499 while (is_idchar (c));
501 /* Step back over the unwanted char. */
502 BACKUP ();
504 /* $ is not an identifier character in the standard, but is commonly
505 accepted as an extension. Don't warn about it in skipped
506 conditional blocks. */
507 if (saw_dollar && CPP_PEDANTIC (pfile) && ! pfile->state.skipping)
508 cpp_pedwarn (pfile, "'$' character(s) in identifier");
510 /* Identifiers are null-terminated. */
511 len = obstack_object_size (stack);
512 obstack_1grow (stack, '\0');
514 return (cpp_hashnode *)
515 ht_lookup (pfile->hash_table, obstack_finish (stack), len, HT_ALLOCED);
518 /* Parse a number, beginning with character C, skipping embedded
519 backslash-newlines. LEADING_PERIOD is non-zero if there was a "."
520 before C. Place the result in NUMBER. */
521 static void
522 parse_number (pfile, number, c, leading_period)
523 cpp_reader *pfile;
524 cpp_string *number;
525 cppchar_t c;
526 int leading_period;
528 cpp_buffer *buffer = pfile->buffer;
529 unsigned char *dest, *limit;
531 dest = BUFF_FRONT (pfile->u_buff);
532 limit = BUFF_LIMIT (pfile->u_buff);
534 /* Place a leading period. */
535 if (leading_period)
537 if (dest == limit)
539 _cpp_extend_buff (pfile, &pfile->u_buff, 1);
540 dest = BUFF_FRONT (pfile->u_buff);
541 limit = BUFF_LIMIT (pfile->u_buff);
543 *dest++ = '.';
550 /* Need room for terminating null. */
551 if ((size_t) (limit - dest) < 2)
553 size_t len_so_far = dest - BUFF_FRONT (pfile->u_buff);
554 _cpp_extend_buff (pfile, &pfile->u_buff, 2);
555 dest = BUFF_FRONT (pfile->u_buff) + len_so_far;
556 limit = BUFF_LIMIT (pfile->u_buff);
558 *dest++ = c;
560 c = *buffer->cur++;
562 while (is_numchar (c) || c == '.' || VALID_SIGN (c, dest[-1]));
564 /* Potential escaped newline? */
565 buffer->backup_to = buffer->cur - 1;
566 if (c != '?' && c != '\\')
567 break;
568 c = skip_escaped_newlines (pfile);
570 while (is_numchar (c) || c == '.' || VALID_SIGN (c, dest[-1]));
572 /* Step back over the unwanted char. */
573 BACKUP ();
575 /* Null-terminate the number. */
576 *dest = '\0';
578 number->text = BUFF_FRONT (pfile->u_buff);
579 number->len = dest - number->text;
580 BUFF_FRONT (pfile->u_buff) = dest + 1;
583 /* Subroutine of parse_string. Emits error for unterminated strings. */
584 static void
585 unterminated (pfile, term)
586 cpp_reader *pfile;
587 int term;
589 cpp_error (pfile, "missing terminating %c character", term);
591 if (term == '\"' && pfile->mls_line && pfile->mls_line != pfile->line)
593 cpp_error_with_line (pfile, pfile->mls_line, pfile->mls_col,
594 "possible start of unterminated string literal");
595 pfile->mls_line = 0;
599 /* Subroutine of parse_string. */
600 static int
601 unescaped_terminator_p (pfile, dest)
602 cpp_reader *pfile;
603 const unsigned char *dest;
605 const unsigned char *start, *temp;
607 /* In #include-style directives, terminators are not escapeable. */
608 if (pfile->state.angled_headers)
609 return 1;
611 start = BUFF_FRONT (pfile->u_buff);
613 /* An odd number of consecutive backslashes represents an escaped
614 terminator. */
615 for (temp = dest; temp > start && temp[-1] == '\\'; temp--)
618 return ((dest - temp) & 1) == 0;
621 /* Parses a string, character constant, or angle-bracketed header file
622 name. Handles embedded trigraphs and escaped newlines. The stored
623 string is guaranteed NUL-terminated, but it is not guaranteed that
624 this is the first NUL since embedded NULs are preserved.
625 Multi-line strings are allowed, but they are deprecated.
627 When this function returns, buffer->cur points to the next
628 character to be processed. */
629 static void
630 parse_string (pfile, token, terminator)
631 cpp_reader *pfile;
632 cpp_token *token;
633 cppchar_t terminator;
635 cpp_buffer *buffer = pfile->buffer;
636 unsigned char *dest, *limit;
637 cppchar_t c;
638 bool warned_nulls = false, warned_multi = false;
640 dest = BUFF_FRONT (pfile->u_buff);
641 limit = BUFF_LIMIT (pfile->u_buff);
643 for (;;)
645 /* We need room for another char, possibly the terminating NUL. */
646 if ((size_t) (limit - dest) < 1)
648 size_t len_so_far = dest - BUFF_FRONT (pfile->u_buff);
649 _cpp_extend_buff (pfile, &pfile->u_buff, 2);
650 dest = BUFF_FRONT (pfile->u_buff) + len_so_far;
651 limit = BUFF_LIMIT (pfile->u_buff);
654 /* Handle trigraphs, escaped newlines etc. */
655 c = *buffer->cur++;
656 if (c == '?' || c == '\\')
657 c = skip_escaped_newlines (pfile);
659 if (c == terminator)
661 if (unescaped_terminator_p (pfile, dest))
662 break;
664 else if (is_vspace (c))
666 /* In assembly language, silently terminate string and
667 character literals at end of line. This is a kludge
668 around not knowing where comments are. */
669 if (CPP_OPTION (pfile, lang) == CLK_ASM && terminator != '>')
671 buffer->cur--;
672 break;
675 /* Character constants and header names may not extend over
676 multiple lines. In Standard C, neither may strings.
677 Unfortunately, we accept multiline strings as an
678 extension, except in #include family directives. */
679 if (terminator != '"' || pfile->state.angled_headers)
681 unterminated (pfile, terminator);
682 buffer->cur--;
683 break;
686 if (!warned_multi)
688 warned_multi = true;
689 cpp_pedwarn (pfile, "multi-line string literals are deprecated");
692 if (pfile->mls_line == 0)
694 pfile->mls_line = token->line;
695 pfile->mls_col = token->col;
698 handle_newline (pfile);
699 c = '\n';
701 else if (c == '\0')
703 if (buffer->cur - 1 == buffer->rlimit)
705 unterminated (pfile, terminator);
706 buffer->cur--;
707 break;
709 if (!warned_nulls)
711 warned_nulls = true;
712 cpp_warning (pfile, "null character(s) preserved in literal");
716 *dest++ = c;
719 *dest = '\0';
721 token->val.str.text = BUFF_FRONT (pfile->u_buff);
722 token->val.str.len = dest - BUFF_FRONT (pfile->u_buff);
723 BUFF_FRONT (pfile->u_buff) = dest + 1;
726 /* The stored comment includes the comment start and any terminator. */
727 static void
728 save_comment (pfile, token, from)
729 cpp_reader *pfile;
730 cpp_token *token;
731 const unsigned char *from;
733 unsigned char *buffer;
734 unsigned int len;
736 len = pfile->buffer->cur - from + 1; /* + 1 for the initial '/'. */
738 /* C++ comments probably (not definitely) have moved past a new
739 line, which we don't want to save in the comment. */
740 if (is_vspace (pfile->buffer->cur[-1]))
741 len--;
742 buffer = _cpp_unaligned_alloc (pfile, len);
744 token->type = CPP_COMMENT;
745 token->val.str.len = len;
746 token->val.str.text = buffer;
748 buffer[0] = '/';
749 memcpy (buffer + 1, from, len - 1);
752 /* Allocate COUNT tokens for RUN. */
753 void
754 _cpp_init_tokenrun (run, count)
755 tokenrun *run;
756 unsigned int count;
758 run->base = xnewvec (cpp_token, count);
759 run->limit = run->base + count;
760 run->next = NULL;
763 /* Returns the next tokenrun, or creates one if there is none. */
764 static tokenrun *
765 next_tokenrun (run)
766 tokenrun *run;
768 if (run->next == NULL)
770 run->next = xnew (tokenrun);
771 run->next->prev = run;
772 _cpp_init_tokenrun (run->next, 250);
775 return run->next;
778 /* Allocate a single token that is invalidated at the same time as the
779 rest of the tokens on the line. Has its line and col set to the
780 same as the last lexed token, so that diagnostics appear in the
781 right place. */
782 cpp_token *
783 _cpp_temp_token (pfile)
784 cpp_reader *pfile;
786 cpp_token *old, *result;
788 old = pfile->cur_token - 1;
789 if (pfile->cur_token == pfile->cur_run->limit)
791 pfile->cur_run = next_tokenrun (pfile->cur_run);
792 pfile->cur_token = pfile->cur_run->base;
795 result = pfile->cur_token++;
796 result->line = old->line;
797 result->col = old->col;
798 return result;
801 /* Lex a token into RESULT (external interface). Takes care of issues
802 like directive handling, token lookahead, multiple include
803 optimization and skipping. */
804 const cpp_token *
805 _cpp_lex_token (pfile)
806 cpp_reader *pfile;
808 cpp_token *result;
810 for (;;)
812 if (pfile->cur_token == pfile->cur_run->limit)
814 pfile->cur_run = next_tokenrun (pfile->cur_run);
815 pfile->cur_token = pfile->cur_run->base;
818 if (pfile->lookaheads)
820 pfile->lookaheads--;
821 result = pfile->cur_token++;
823 else
824 result = _cpp_lex_direct (pfile);
826 if (result->flags & BOL)
828 /* Is this a directive. If _cpp_handle_directive returns
829 false, it is an assembler #. */
830 if (result->type == CPP_HASH
831 /* 6.10.3 p 11: Directives in a list of macro arguments
832 gives undefined behavior. This implementation
833 handles the directive as normal. */
834 && pfile->state.parsing_args != 1
835 && _cpp_handle_directive (pfile, result->flags & PREV_WHITE))
836 continue;
837 if (pfile->cb.line_change && !pfile->state.skipping)
838 (*pfile->cb.line_change)(pfile, result, pfile->state.parsing_args);
841 /* We don't skip tokens in directives. */
842 if (pfile->state.in_directive)
843 break;
845 /* Outside a directive, invalidate controlling macros. At file
846 EOF, _cpp_lex_direct takes care of popping the buffer, so we never
847 get here and MI optimisation works. */
848 pfile->mi_valid = false;
850 if (!pfile->state.skipping || result->type == CPP_EOF)
851 break;
854 return result;
/* If the next effective character is CHAR, consume it and set the
   token type to THEN_TYPE; otherwise back up and use ELSE_TYPE.
   Requires locals named `pfile', `buffer' and `result' in scope.  */
#define IF_NEXT_IS(CHAR, THEN_TYPE, ELSE_TYPE)	\
  do {						\
    if (get_effective_char (pfile) == (CHAR))	\
      result->type = THEN_TYPE;			\
    else					\
      {						\
	BACKUP ();				\
	result->type = ELSE_TYPE;		\
      }						\
  } while (0)
868 /* Lex a token into pfile->cur_token, which is also incremented, to
869 get diagnostics pointing to the correct location.
871 Does not handle issues such as token lookahead, multiple-include
872 optimisation, directives, skipping etc. This function is only
873 suitable for use by _cpp_lex_token, and in special cases like
874 lex_expansion_token which doesn't care for any of these issues.
876 When meeting a newline, returns CPP_EOF if parsing a directive,
877 otherwise returns to the start of the token buffer if permissible.
878 Returns the location of the lexed token. */
879 cpp_token *
880 _cpp_lex_direct (pfile)
881 cpp_reader *pfile;
883 cppchar_t c;
884 cpp_buffer *buffer;
885 const unsigned char *comment_start;
886 cpp_token *result = pfile->cur_token++;
888 fresh_line:
889 buffer = pfile->buffer;
890 result->flags = buffer->saved_flags;
891 buffer->saved_flags = 0;
892 update_tokens_line:
893 result->line = pfile->line;
895 skipped_white:
896 c = *buffer->cur++;
897 result->col = CPP_BUF_COLUMN (buffer, buffer->cur);
899 trigraph:
900 switch (c)
902 case ' ': case '\t': case '\f': case '\v': case '\0':
903 result->flags |= PREV_WHITE;
904 if (skip_whitespace (pfile, c))
905 goto skipped_white;
907 /* EOF. */
908 buffer->cur--;
909 buffer->saved_flags = BOL;
910 if (!pfile->state.parsing_args && !pfile->state.in_directive)
912 if (buffer->cur != buffer->line_base)
914 /* Non-empty files should end in a newline. Don't warn
915 for command line and _Pragma buffers. */
916 if (!buffer->from_stage3)
917 cpp_pedwarn (pfile, "no newline at end of file");
918 handle_newline (pfile);
921 /* Don't pop the last buffer. */
922 if (buffer->prev)
924 unsigned char stop = buffer->return_at_eof;
926 _cpp_pop_buffer (pfile);
927 if (!stop)
928 goto fresh_line;
931 result->type = CPP_EOF;
932 break;
934 case '\n': case '\r':
935 handle_newline (pfile);
936 buffer->saved_flags = BOL;
937 if (! pfile->state.in_directive)
939 if (pfile->state.parsing_args == 2)
940 buffer->saved_flags |= PREV_WHITE;
941 if (!pfile->keep_tokens)
943 pfile->cur_run = &pfile->base_run;
944 result = pfile->base_run.base;
945 pfile->cur_token = result + 1;
947 goto fresh_line;
949 result->type = CPP_EOF;
950 break;
952 case '?':
953 case '\\':
954 /* These could start an escaped newline, or '?' a trigraph. Let
955 skip_escaped_newlines do all the work. */
957 unsigned int line = pfile->line;
959 c = skip_escaped_newlines (pfile);
960 if (line != pfile->line)
962 buffer->cur--;
963 /* We had at least one escaped newline of some sort.
964 Update the token's line and column. */
965 goto update_tokens_line;
969 /* We are either the original '?' or '\\', or a trigraph. */
970 if (c == '?')
971 result->type = CPP_QUERY;
972 else if (c == '\\')
973 goto random_char;
974 else
975 goto trigraph;
976 break;
978 case '0': case '1': case '2': case '3': case '4':
979 case '5': case '6': case '7': case '8': case '9':
980 result->type = CPP_NUMBER;
981 parse_number (pfile, &result->val.str, c, 0);
982 break;
984 case 'L':
985 /* 'L' may introduce wide characters or strings. */
987 const unsigned char *pos = buffer->cur;
989 c = get_effective_char (pfile);
990 if (c == '\'' || c == '"')
992 result->type = (c == '"' ? CPP_WSTRING: CPP_WCHAR);
993 parse_string (pfile, result, c);
994 break;
996 buffer->cur = pos;
998 /* Fall through. */
1000 start_ident:
1001 case '_':
1002 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
1003 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
1004 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
1005 case 's': case 't': case 'u': case 'v': case 'w': case 'x':
1006 case 'y': case 'z':
1007 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
1008 case 'G': case 'H': case 'I': case 'J': case 'K':
1009 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
1010 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
1011 case 'Y': case 'Z':
1012 result->type = CPP_NAME;
1013 result->val.node = parse_identifier (pfile);
1015 /* Convert named operators to their proper types. */
1016 if (result->val.node->flags & NODE_OPERATOR)
1018 result->flags |= NAMED_OP;
1019 result->type = result->val.node->value.operator;
1021 break;
1023 case '\'':
1024 case '"':
1025 result->type = c == '"' ? CPP_STRING: CPP_CHAR;
1026 parse_string (pfile, result, c);
1027 break;
1029 case '/':
1030 /* A potential block or line comment. */
1031 comment_start = buffer->cur;
1032 c = get_effective_char (pfile);
1034 if (c == '*')
1036 if (skip_block_comment (pfile))
1037 cpp_error (pfile, "unterminated comment");
1039 else if (c == '/' && (CPP_OPTION (pfile, cplusplus_comments)
1040 || CPP_IN_SYSTEM_HEADER (pfile)))
1042 /* Warn about comments only if pedantically GNUC89, and not
1043 in system headers. */
1044 if (CPP_OPTION (pfile, lang) == CLK_GNUC89 && CPP_PEDANTIC (pfile)
1045 && ! buffer->warned_cplusplus_comments)
1047 cpp_pedwarn (pfile,
1048 "C++ style comments are not allowed in ISO C89");
1049 cpp_pedwarn (pfile,
1050 "(this will be reported only once per input file)");
1051 buffer->warned_cplusplus_comments = 1;
1054 if (skip_line_comment (pfile) && CPP_OPTION (pfile, warn_comments))
1055 cpp_warning (pfile, "multi-line comment");
1057 else if (c == '=')
1059 result->type = CPP_DIV_EQ;
1060 break;
1062 else
1064 BACKUP ();
1065 result->type = CPP_DIV;
1066 break;
1069 if (!pfile->state.save_comments)
1071 result->flags |= PREV_WHITE;
1072 goto update_tokens_line;
1075 /* Save the comment as a token in its own right. */
1076 save_comment (pfile, result, comment_start);
1077 break;
1079 case '<':
1080 if (pfile->state.angled_headers)
1082 result->type = CPP_HEADER_NAME;
1083 parse_string (pfile, result, '>');
1084 break;
1087 c = get_effective_char (pfile);
1088 if (c == '=')
1089 result->type = CPP_LESS_EQ;
1090 else if (c == '<')
1091 IF_NEXT_IS ('=', CPP_LSHIFT_EQ, CPP_LSHIFT);
1092 else if (c == '?' && CPP_OPTION (pfile, cplusplus))
1093 IF_NEXT_IS ('=', CPP_MIN_EQ, CPP_MIN);
1094 else if (c == ':' && CPP_OPTION (pfile, digraphs))
1096 result->type = CPP_OPEN_SQUARE;
1097 result->flags |= DIGRAPH;
1099 else if (c == '%' && CPP_OPTION (pfile, digraphs))
1101 result->type = CPP_OPEN_BRACE;
1102 result->flags |= DIGRAPH;
1104 else
1106 BACKUP ();
1107 result->type = CPP_LESS;
1109 break;
1111 case '>':
1112 c = get_effective_char (pfile);
1113 if (c == '=')
1114 result->type = CPP_GREATER_EQ;
1115 else if (c == '>')
1116 IF_NEXT_IS ('=', CPP_RSHIFT_EQ, CPP_RSHIFT);
1117 else if (c == '?' && CPP_OPTION (pfile, cplusplus))
1118 IF_NEXT_IS ('=', CPP_MAX_EQ, CPP_MAX);
1119 else
1121 BACKUP ();
1122 result->type = CPP_GREATER;
1124 break;
1126 case '%':
1127 c = get_effective_char (pfile);
1128 if (c == '=')
1129 result->type = CPP_MOD_EQ;
1130 else if (CPP_OPTION (pfile, digraphs) && c == ':')
1132 result->flags |= DIGRAPH;
1133 result->type = CPP_HASH;
1134 if (get_effective_char (pfile) == '%')
1136 const unsigned char *pos = buffer->cur;
1138 if (get_effective_char (pfile) == ':')
1139 result->type = CPP_PASTE;
1140 else
1141 buffer->cur = pos - 1;
1143 else
1144 BACKUP ();
1146 else if (CPP_OPTION (pfile, digraphs) && c == '>')
1148 result->flags |= DIGRAPH;
1149 result->type = CPP_CLOSE_BRACE;
1151 else
1153 BACKUP ();
1154 result->type = CPP_MOD;
1156 break;
1158 case '.':
1159 result->type = CPP_DOT;
1160 c = get_effective_char (pfile);
1161 if (c == '.')
1163 const unsigned char *pos = buffer->cur;
1165 if (get_effective_char (pfile) == '.')
1166 result->type = CPP_ELLIPSIS;
1167 else
1168 buffer->cur = pos - 1;
1170 /* All known character sets have 0...9 contiguous. */
1171 else if (ISDIGIT (c))
1173 result->type = CPP_NUMBER;
1174 parse_number (pfile, &result->val.str, c, 1);
1176 else if (c == '*' && CPP_OPTION (pfile, cplusplus))
1177 result->type = CPP_DOT_STAR;
1178 else
1179 BACKUP ();
1180 break;
1182 case '+':
1183 c = get_effective_char (pfile);
1184 if (c == '+')
1185 result->type = CPP_PLUS_PLUS;
1186 else if (c == '=')
1187 result->type = CPP_PLUS_EQ;
1188 else
1190 BACKUP ();
1191 result->type = CPP_PLUS;
1193 break;
1195 case '-':
1196 c = get_effective_char (pfile);
1197 if (c == '>')
1199 result->type = CPP_DEREF;
1200 if (CPP_OPTION (pfile, cplusplus))
1202 if (get_effective_char (pfile) == '*')
1203 result->type = CPP_DEREF_STAR;
1204 else
1205 BACKUP ();
1208 else if (c == '-')
1209 result->type = CPP_MINUS_MINUS;
1210 else if (c == '=')
1211 result->type = CPP_MINUS_EQ;
1212 else
1214 BACKUP ();
1215 result->type = CPP_MINUS;
1217 break;
1219 case '&':
1220 c = get_effective_char (pfile);
1221 if (c == '&')
1222 result->type = CPP_AND_AND;
1223 else if (c == '=')
1224 result->type = CPP_AND_EQ;
1225 else
1227 BACKUP ();
1228 result->type = CPP_AND;
1230 break;
1232 case '|':
1233 c = get_effective_char (pfile);
1234 if (c == '|')
1235 result->type = CPP_OR_OR;
1236 else if (c == '=')
1237 result->type = CPP_OR_EQ;
1238 else
1240 BACKUP ();
1241 result->type = CPP_OR;
1243 break;
1245 case ':':
1246 c = get_effective_char (pfile);
1247 if (c == ':' && CPP_OPTION (pfile, cplusplus))
1248 result->type = CPP_SCOPE;
1249 else if (c == '>' && CPP_OPTION (pfile, digraphs))
1251 result->flags |= DIGRAPH;
1252 result->type = CPP_CLOSE_SQUARE;
1254 else
1256 BACKUP ();
1257 result->type = CPP_COLON;
1259 break;
1261 case '*': IF_NEXT_IS ('=', CPP_MULT_EQ, CPP_MULT); break;
1262 case '=': IF_NEXT_IS ('=', CPP_EQ_EQ, CPP_EQ); break;
1263 case '!': IF_NEXT_IS ('=', CPP_NOT_EQ, CPP_NOT); break;
1264 case '^': IF_NEXT_IS ('=', CPP_XOR_EQ, CPP_XOR); break;
1265 case '#': IF_NEXT_IS ('#', CPP_PASTE, CPP_HASH); break;
1267 case '~': result->type = CPP_COMPL; break;
1268 case ',': result->type = CPP_COMMA; break;
1269 case '(': result->type = CPP_OPEN_PAREN; break;
1270 case ')': result->type = CPP_CLOSE_PAREN; break;
1271 case '[': result->type = CPP_OPEN_SQUARE; break;
1272 case ']': result->type = CPP_CLOSE_SQUARE; break;
1273 case '{': result->type = CPP_OPEN_BRACE; break;
1274 case '}': result->type = CPP_CLOSE_BRACE; break;
1275 case ';': result->type = CPP_SEMICOLON; break;
1277 /* @ is a punctuator in Objective C. */
1278 case '@': result->type = CPP_ATSIGN; break;
1280 case '$':
1281 if (CPP_OPTION (pfile, dollars_in_ident))
1282 goto start_ident;
1283 /* Fall through... */
1285 random_char:
1286 default:
1287 result->type = CPP_OTHER;
1288 result->val.c = c;
1289 break;
1292 return result;
1295 /* An upper bound on the number of bytes needed to spell TOKEN,
1296 including preceding whitespace. */
1297 unsigned int
1298 cpp_token_len (token)
1299 const cpp_token *token;
1301 unsigned int len;
1303 switch (TOKEN_SPELL (token))
1305 default: len = 0; break;
1306 case SPELL_NUMBER:
1307 case SPELL_STRING: len = token->val.str.len; break;
1308 case SPELL_IDENT: len = NODE_LEN (token->val.node); break;
1310 /* 1 for whitespace, 4 for comment delimiters. */
1311 return len + 5;
1314 /* Write the spelling of a token TOKEN to BUFFER. The buffer must
1315 already contain the enough space to hold the token's spelling.
1316 Returns a pointer to the character after the last character
1317 written. */
1318 unsigned char *
1319 cpp_spell_token (pfile, token, buffer)
1320 cpp_reader *pfile; /* Would be nice to be rid of this... */
1321 const cpp_token *token;
1322 unsigned char *buffer;
1324 switch (TOKEN_SPELL (token))
1326 case SPELL_OPERATOR:
1328 const unsigned char *spelling;
1329 unsigned char c;
1331 if (token->flags & DIGRAPH)
1332 spelling
1333 = digraph_spellings[(int) token->type - (int) CPP_FIRST_DIGRAPH];
1334 else if (token->flags & NAMED_OP)
1335 goto spell_ident;
1336 else
1337 spelling = TOKEN_NAME (token);
1339 while ((c = *spelling++) != '\0')
1340 *buffer++ = c;
1342 break;
1344 case SPELL_CHAR:
1345 *buffer++ = token->val.c;
1346 break;
1348 spell_ident:
1349 case SPELL_IDENT:
1350 memcpy (buffer, NODE_NAME (token->val.node), NODE_LEN (token->val.node));
1351 buffer += NODE_LEN (token->val.node);
1352 break;
1354 case SPELL_NUMBER:
1355 memcpy (buffer, token->val.str.text, token->val.str.len);
1356 buffer += token->val.str.len;
1357 break;
1359 case SPELL_STRING:
1361 int left, right, tag;
1362 switch (token->type)
1364 case CPP_STRING: left = '"'; right = '"'; tag = '\0'; break;
1365 case CPP_WSTRING: left = '"'; right = '"'; tag = 'L'; break;
1366 case CPP_CHAR: left = '\''; right = '\''; tag = '\0'; break;
1367 case CPP_WCHAR: left = '\''; right = '\''; tag = 'L'; break;
1368 case CPP_HEADER_NAME: left = '<'; right = '>'; tag = '\0'; break;
1369 default:
1370 cpp_ice (pfile, "unknown string token %s\n", TOKEN_NAME (token));
1371 return buffer;
1373 if (tag) *buffer++ = tag;
1374 *buffer++ = left;
1375 memcpy (buffer, token->val.str.text, token->val.str.len);
1376 buffer += token->val.str.len;
1377 *buffer++ = right;
1379 break;
1381 case SPELL_NONE:
1382 cpp_ice (pfile, "unspellable token %s", TOKEN_NAME (token));
1383 break;
1386 return buffer;
1389 /* Returns TOKEN spelt as a null-terminated string. The string is
1390 freed when the reader is destroyed. Useful for diagnostics. */
1391 unsigned char *
1392 cpp_token_as_text (pfile, token)
1393 cpp_reader *pfile;
1394 const cpp_token *token;
1396 unsigned int len = cpp_token_len (token);
1397 unsigned char *start = _cpp_unaligned_alloc (pfile, len), *end;
1399 end = cpp_spell_token (pfile, token, start);
1400 end[0] = '\0';
1402 return start;
1405 /* Used by C front ends, which really should move to using
1406 cpp_token_as_text. */
1407 const char *
1408 cpp_type2name (type)
1409 enum cpp_ttype type;
1411 return (const char *) token_spellings[type].name;
1414 /* Writes the spelling of token to FP, without any preceding space.
1415 Separated from cpp_spell_token for efficiency - to avoid stdio
1416 double-buffering. */
1417 void
1418 cpp_output_token (token, fp)
1419 const cpp_token *token;
1420 FILE *fp;
1422 switch (TOKEN_SPELL (token))
1424 case SPELL_OPERATOR:
1426 const unsigned char *spelling;
1427 int c;
1429 if (token->flags & DIGRAPH)
1430 spelling
1431 = digraph_spellings[(int) token->type - (int) CPP_FIRST_DIGRAPH];
1432 else if (token->flags & NAMED_OP)
1433 goto spell_ident;
1434 else
1435 spelling = TOKEN_NAME (token);
1437 c = *spelling;
1439 putc (c, fp);
1440 while ((c = *++spelling) != '\0');
1442 break;
1444 case SPELL_CHAR:
1445 putc (token->val.c, fp);
1446 break;
1448 spell_ident:
1449 case SPELL_IDENT:
1450 fwrite (NODE_NAME (token->val.node), 1, NODE_LEN (token->val.node), fp);
1451 break;
1453 case SPELL_NUMBER:
1454 fwrite (token->val.str.text, 1, token->val.str.len, fp);
1455 break;
1457 case SPELL_STRING:
1459 int left, right, tag;
1460 switch (token->type)
1462 case CPP_STRING: left = '"'; right = '"'; tag = '\0'; break;
1463 case CPP_WSTRING: left = '"'; right = '"'; tag = 'L'; break;
1464 case CPP_CHAR: left = '\''; right = '\''; tag = '\0'; break;
1465 case CPP_WCHAR: left = '\''; right = '\''; tag = 'L'; break;
1466 case CPP_HEADER_NAME: left = '<'; right = '>'; tag = '\0'; break;
1467 default:
1468 fprintf (stderr, "impossible STRING token %s\n", TOKEN_NAME (token));
1469 return;
1471 if (tag) putc (tag, fp);
1472 putc (left, fp);
1473 fwrite (token->val.str.text, 1, token->val.str.len, fp);
1474 putc (right, fp);
1476 break;
1478 case SPELL_NONE:
1479 /* An error, most probably. */
1480 break;
1484 /* Compare two tokens. */
1486 _cpp_equiv_tokens (a, b)
1487 const cpp_token *a, *b;
1489 if (a->type == b->type && a->flags == b->flags)
1490 switch (TOKEN_SPELL (a))
1492 default: /* Keep compiler happy. */
1493 case SPELL_OPERATOR:
1494 return 1;
1495 case SPELL_CHAR:
1496 return a->val.c == b->val.c; /* Character. */
1497 case SPELL_NONE:
1498 return (a->type != CPP_MACRO_ARG || a->val.arg_no == b->val.arg_no);
1499 case SPELL_IDENT:
1500 return a->val.node == b->val.node;
1501 case SPELL_NUMBER:
1502 case SPELL_STRING:
1503 return (a->val.str.len == b->val.str.len
1504 && !memcmp (a->val.str.text, b->val.str.text,
1505 a->val.str.len));
1508 return 0;
1511 /* Returns nonzero if a space should be inserted to avoid an
1512 accidental token paste for output. For simplicity, it is
1513 conservative, and occasionally advises a space where one is not
1514 needed, e.g. "." and ".2". */
1516 cpp_avoid_paste (pfile, token1, token2)
1517 cpp_reader *pfile;
1518 const cpp_token *token1, *token2;
1520 enum cpp_ttype a = token1->type, b = token2->type;
1521 cppchar_t c;
1523 if (token1->flags & NAMED_OP)
1524 a = CPP_NAME;
1525 if (token2->flags & NAMED_OP)
1526 b = CPP_NAME;
1528 c = EOF;
1529 if (token2->flags & DIGRAPH)
1530 c = digraph_spellings[(int) b - (int) CPP_FIRST_DIGRAPH][0];
1531 else if (token_spellings[b].category == SPELL_OPERATOR)
1532 c = token_spellings[b].name[0];
1534 /* Quickly get everything that can paste with an '='. */
1535 if ((int) a <= (int) CPP_LAST_EQ && c == '=')
1536 return 1;
1538 switch (a)
1540 case CPP_GREATER: return c == '>' || c == '?';
1541 case CPP_LESS: return c == '<' || c == '?' || c == '%' || c == ':';
1542 case CPP_PLUS: return c == '+';
1543 case CPP_MINUS: return c == '-' || c == '>';
1544 case CPP_DIV: return c == '/' || c == '*'; /* Comments. */
1545 case CPP_MOD: return c == ':' || c == '>';
1546 case CPP_AND: return c == '&';
1547 case CPP_OR: return c == '|';
1548 case CPP_COLON: return c == ':' || c == '>';
1549 case CPP_DEREF: return c == '*';
1550 case CPP_DOT: return c == '.' || c == '%' || b == CPP_NUMBER;
1551 case CPP_HASH: return c == '#' || c == '%'; /* Digraph form. */
1552 case CPP_NAME: return ((b == CPP_NUMBER
1553 && name_p (pfile, &token2->val.str))
1554 || b == CPP_NAME
1555 || b == CPP_CHAR || b == CPP_STRING); /* L */
1556 case CPP_NUMBER: return (b == CPP_NUMBER || b == CPP_NAME
1557 || c == '.' || c == '+' || c == '-');
1558 case CPP_OTHER: return (CPP_OPTION (pfile, objc)
1559 && token1->val.c == '@'
1560 && (b == CPP_NAME || b == CPP_STRING));
1561 default: break;
1564 return 0;
1567 /* Output all the remaining tokens on the current line, and a newline
1568 character, to FP. Leading whitespace is removed. If there are
1569 macros, special token padding is not performed. */
1570 void
1571 cpp_output_line (pfile, fp)
1572 cpp_reader *pfile;
1573 FILE *fp;
1575 const cpp_token *token;
1577 token = cpp_get_token (pfile);
1578 while (token->type != CPP_EOF)
1580 cpp_output_token (token, fp);
1581 token = cpp_get_token (pfile);
1582 if (token->flags & PREV_WHITE)
1583 putc (' ', fp);
1586 putc ('\n', fp);
/* Returns the value of the hexadecimal digit C.  Aborts if C is not
   a hexadecimal digit; callers are expected to have checked.  */
static unsigned int
hex_digit_value (c)
     unsigned int c;
{
  if (hex_p (c))
    return hex_value (c);
  else
    abort ();
}
1600 /* Parse a '\uNNNN' or '\UNNNNNNNN' sequence. Returns 1 to indicate
1601 failure if cpplib is not parsing C++ or C99. Such failure is
1602 silent, and no variables are updated. Otherwise returns 0, and
1603 warns if -Wtraditional.
1605 [lex.charset]: The character designated by the universal character
1606 name \UNNNNNNNN is that character whose character short name in
1607 ISO/IEC 10646 is NNNNNNNN; the character designated by the
1608 universal character name \uNNNN is that character whose character
1609 short name in ISO/IEC 10646 is 0000NNNN. If the hexadecimal value
1610 for a universal character name is less than 0x20 or in the range
1611 0x7F-0x9F (inclusive), or if the universal character name
1612 designates a character in the basic source character set, then the
1613 program is ill-formed.
1615 We assume that wchar_t is Unicode, so we don't need to do any
1616 mapping. Is this ever wrong?
1618 PC points to the 'u' or 'U', PSTR is points to the byte after PC,
1619 LIMIT is the end of the string or charconst. PSTR is updated to
1620 point after the UCS on return, and the UCS is written into PC. */
1622 static int
1623 maybe_read_ucs (pfile, pstr, limit, pc)
1624 cpp_reader *pfile;
1625 const unsigned char **pstr;
1626 const unsigned char *limit;
1627 unsigned int *pc;
1629 const unsigned char *p = *pstr;
1630 unsigned int code = 0;
1631 unsigned int c = *pc, length;
1633 /* Only attempt to interpret a UCS for C++ and C99. */
1634 if (! (CPP_OPTION (pfile, cplusplus) || CPP_OPTION (pfile, c99)))
1635 return 1;
1637 if (CPP_WTRADITIONAL (pfile))
1638 cpp_warning (pfile, "the meaning of '\\%c' is different in traditional C", c);
1640 length = (c == 'u' ? 4: 8);
1642 if ((size_t) (limit - p) < length)
1644 cpp_error (pfile, "incomplete universal-character-name");
1645 /* Skip to the end to avoid more diagnostics. */
1646 p = limit;
1648 else
1650 for (; length; length--, p++)
1652 c = *p;
1653 if (ISXDIGIT (c))
1654 code = (code << 4) + hex_digit_value (c);
1655 else
1657 cpp_error (pfile,
1658 "non-hex digit '%c' in universal-character-name", c);
1659 /* We shouldn't skip in case there are multibyte chars. */
1660 break;
1665 #ifdef TARGET_EBCDIC
1666 cpp_error (pfile, "universal-character-name on EBCDIC target");
1667 code = 0x3f; /* EBCDIC invalid character */
1668 #else
1669 /* True extended characters are OK. */
1670 if (code >= 0xa0
1671 && !(code & 0x80000000)
1672 && !(code >= 0xD800 && code <= 0xDFFF))
1674 /* The standard permits $, @ and ` to be specified as UCNs. We use
1675 hex escapes so that this also works with EBCDIC hosts. */
1676 else if (code == 0x24 || code == 0x40 || code == 0x60)
1678 /* Don't give another error if one occurred above. */
1679 else if (length == 0)
1680 cpp_error (pfile, "universal-character-name out of range");
1681 #endif
1683 *pstr = p;
1684 *pc = code;
1685 return 0;
1688 /* Interpret an escape sequence, and return its value. PSTR points to
1689 the input pointer, which is just after the backslash. LIMIT is how
1690 much text we have. MASK is a bitmask for the precision for the
1691 destination type (char or wchar_t).
1693 Handles all relevant diagnostics. */
1694 unsigned int
1695 cpp_parse_escape (pfile, pstr, limit, mask)
1696 cpp_reader *pfile;
1697 const unsigned char **pstr;
1698 const unsigned char *limit;
1699 unsigned HOST_WIDE_INT mask;
1701 int unknown = 0;
1702 const unsigned char *str = *pstr;
1703 unsigned int c = *str++;
1705 switch (c)
1707 case '\\': case '\'': case '"': case '?': break;
1708 case 'b': c = TARGET_BS; break;
1709 case 'f': c = TARGET_FF; break;
1710 case 'n': c = TARGET_NEWLINE; break;
1711 case 'r': c = TARGET_CR; break;
1712 case 't': c = TARGET_TAB; break;
1713 case 'v': c = TARGET_VT; break;
1715 case '(': case '{': case '[': case '%':
1716 /* '\(', etc, are used at beginning of line to avoid confusing Emacs.
1717 '\%' is used to prevent SCCS from getting confused. */
1718 unknown = CPP_PEDANTIC (pfile);
1719 break;
1721 case 'a':
1722 if (CPP_WTRADITIONAL (pfile))
1723 cpp_warning (pfile, "the meaning of '\\a' is different in traditional C");
1724 c = TARGET_BELL;
1725 break;
1727 case 'e': case 'E':
1728 if (CPP_PEDANTIC (pfile))
1729 cpp_pedwarn (pfile, "non-ISO-standard escape sequence, '\\%c'", c);
1730 c = TARGET_ESC;
1731 break;
1733 case 'u': case 'U':
1734 unknown = maybe_read_ucs (pfile, &str, limit, &c);
1735 break;
1737 case 'x':
1738 if (CPP_WTRADITIONAL (pfile))
1739 cpp_warning (pfile, "the meaning of '\\x' is different in traditional C");
1742 unsigned int i = 0, overflow = 0;
1743 int digits_found = 0;
1745 while (str < limit)
1747 c = *str;
1748 if (! ISXDIGIT (c))
1749 break;
1750 str++;
1751 overflow |= i ^ (i << 4 >> 4);
1752 i = (i << 4) + hex_digit_value (c);
1753 digits_found = 1;
1756 if (!digits_found)
1757 cpp_error (pfile, "\\x used with no following hex digits");
1759 if (overflow | (i != (i & mask)))
1761 cpp_pedwarn (pfile, "hex escape sequence out of range");
1762 i &= mask;
1764 c = i;
1766 break;
1768 case '0': case '1': case '2': case '3':
1769 case '4': case '5': case '6': case '7':
1771 unsigned int i = c - '0';
1772 int count = 0;
1774 while (str < limit && ++count < 3)
1776 c = *str;
1777 if (c < '0' || c > '7')
1778 break;
1779 str++;
1780 i = (i << 3) + c - '0';
1783 if (i != (i & mask))
1785 cpp_pedwarn (pfile, "octal escape sequence out of range");
1786 i &= mask;
1788 c = i;
1790 break;
1792 default:
1793 unknown = 1;
1794 break;
1797 if (unknown)
1799 if (ISGRAPH (c))
1800 cpp_pedwarn (pfile, "unknown escape sequence '\\%c'", c);
1801 else
1802 cpp_pedwarn (pfile, "unknown escape sequence: '\\%03o'", c);
1805 if (c > mask)
1806 cpp_pedwarn (pfile, "escape sequence out of range for character");
1808 *pstr = str;
1809 return c;
/* Fall back to the plain character type sizes when no separate
   maximum has been defined.  */
#ifndef MAX_CHAR_TYPE_SIZE
#define MAX_CHAR_TYPE_SIZE CHAR_TYPE_SIZE
#endif

#ifndef MAX_WCHAR_TYPE_SIZE
#define MAX_WCHAR_TYPE_SIZE WCHAR_TYPE_SIZE
#endif
1820 /* Interpret a (possibly wide) character constant in TOKEN.
1821 WARN_MULTI warns about multi-character charconsts. PCHARS_SEEN points
1822 to a variable that is filled in with the number of characters seen. */
1823 HOST_WIDE_INT
1824 cpp_interpret_charconst (pfile, token, warn_multi, pchars_seen)
1825 cpp_reader *pfile;
1826 const cpp_token *token;
1827 int warn_multi;
1828 unsigned int *pchars_seen;
1830 const unsigned char *str = token->val.str.text;
1831 const unsigned char *limit = str + token->val.str.len;
1832 unsigned int chars_seen = 0;
1833 unsigned int width, max_chars, c;
1834 unsigned HOST_WIDE_INT mask;
1835 HOST_WIDE_INT result = 0;
1836 bool unsigned_p;
1838 #ifdef MULTIBYTE_CHARS
1839 (void) local_mbtowc (NULL, NULL, 0);
1840 #endif
1842 /* Width in bits. */
1843 if (token->type == CPP_CHAR)
1845 width = MAX_CHAR_TYPE_SIZE;
1846 unsigned_p = CPP_OPTION (pfile, signed_char) == 0;
1848 else
1850 width = MAX_WCHAR_TYPE_SIZE;
1851 unsigned_p = WCHAR_UNSIGNED;
1854 if (width < HOST_BITS_PER_WIDE_INT)
1855 mask = ((unsigned HOST_WIDE_INT) 1 << width) - 1;
1856 else
1857 mask = ~0;
1858 max_chars = HOST_BITS_PER_WIDE_INT / width;
1860 while (str < limit)
1862 #ifdef MULTIBYTE_CHARS
1863 wchar_t wc;
1864 int char_len;
1866 char_len = local_mbtowc (&wc, str, limit - str);
1867 if (char_len == -1)
1869 cpp_warning (pfile, "ignoring invalid multibyte character");
1870 c = *str++;
1872 else
1874 str += char_len;
1875 c = wc;
1877 #else
1878 c = *str++;
1879 #endif
1881 if (c == '\\')
1882 c = cpp_parse_escape (pfile, &str, limit, mask);
1884 #ifdef MAP_CHARACTER
1885 if (ISPRINT (c))
1886 c = MAP_CHARACTER (c);
1887 #endif
1889 /* Merge character into result; ignore excess chars. */
1890 if (++chars_seen <= max_chars)
1892 if (width < HOST_BITS_PER_WIDE_INT)
1893 result = (result << width) | (c & mask);
1894 else
1895 result = c;
1899 if (chars_seen == 0)
1900 cpp_error (pfile, "empty character constant");
1901 else if (chars_seen > max_chars)
1903 chars_seen = max_chars;
1904 cpp_warning (pfile, "character constant too long");
1906 else if (chars_seen > 1 && warn_multi)
1907 cpp_warning (pfile, "multi-character character constant");
1909 /* If relevant type is signed, sign-extend the constant. */
1910 if (chars_seen)
1912 unsigned int nbits = chars_seen * width;
1914 mask = (unsigned HOST_WIDE_INT) ~0 >> (HOST_BITS_PER_WIDE_INT - nbits);
1915 if (unsigned_p || ((result >> (nbits - 1)) & 1) == 0)
1916 result &= mask;
1917 else
1918 result |= ~mask;
1921 *pchars_seen = chars_seen;
1922 return result;
/* Memory buffers.  Changing these three constants can have a dramatic
   effect on performance.  The values here are reasonable defaults,
   but might be tuned.  If you adjust them, be sure to test across a
   range of uses of cpplib, including heavy nested function-like macro
   expansion.  Also check the change in peak memory usage (NJAMD is a
   good tool for this).

   MIN_BUFF_SIZE is the smallest buffer ever allocated.
   BUFF_SIZE_UPPER_BOUND (MIN_SIZE) is the largest free buffer that
   will be handed out for a request of MIN_SIZE bytes.
   EXTENDED_BUFF_SIZE (BUFF, MIN_EXTRA) is the size requested when
   extending BUFF: twice its remaining room plus MIN_EXTRA.  */
#define MIN_BUFF_SIZE 8000
#define BUFF_SIZE_UPPER_BOUND(MIN_SIZE) (MIN_BUFF_SIZE + (MIN_SIZE) * 3 / 2)
#define EXTENDED_BUFF_SIZE(BUFF, MIN_EXTRA) \
        ((MIN_EXTRA) + ((BUFF)->limit - (BUFF)->cur) * 2)

#if MIN_BUFF_SIZE > BUFF_SIZE_UPPER_BOUND (0)
#error BUFF_SIZE_UPPER_BOUND must be at least as large as MIN_BUFF_SIZE!
#endif
/* Used only to measure the alignment of the union member: the offset
   of U within the struct is the padding inserted after a single
   char.  */
struct dummy
{
  char c;
  union
  {
    double d;
    int *p;
  } u;
};

#define DEFAULT_ALIGNMENT (offsetof (struct dummy, u))
/* Round SIZE up to a multiple of ALIGN.  The mask arithmetic is only
   correct when ALIGN is a power of two.  */
#define CPP_ALIGN(size, align) (((size) + ((align) - 1)) & ~((align) - 1))
1953 /* Create a new allocation buffer. Place the control block at the end
1954 of the buffer, so that buffer overflows will cause immediate chaos. */
1955 static _cpp_buff *
1956 new_buff (len)
1957 size_t len;
1959 _cpp_buff *result;
1960 unsigned char *base;
1962 if (len < MIN_BUFF_SIZE)
1963 len = MIN_BUFF_SIZE;
1964 len = CPP_ALIGN (len, DEFAULT_ALIGNMENT);
1966 base = xmalloc (len + sizeof (_cpp_buff));
1967 result = (_cpp_buff *) (base + len);
1968 result->base = base;
1969 result->cur = base;
1970 result->limit = base + len;
1971 result->next = NULL;
1972 return result;
1975 /* Place a chain of unwanted allocation buffers on the free list. */
1976 void
1977 _cpp_release_buff (pfile, buff)
1978 cpp_reader *pfile;
1979 _cpp_buff *buff;
1981 _cpp_buff *end = buff;
1983 while (end->next)
1984 end = end->next;
1985 end->next = pfile->free_buffs;
1986 pfile->free_buffs = buff;
1989 /* Return a free buffer of size at least MIN_SIZE. */
1990 _cpp_buff *
1991 _cpp_get_buff (pfile, min_size)
1992 cpp_reader *pfile;
1993 size_t min_size;
1995 _cpp_buff *result, **p;
1997 for (p = &pfile->free_buffs;; p = &(*p)->next)
1999 size_t size;
2001 if (*p == NULL)
2002 return new_buff (min_size);
2003 result = *p;
2004 size = result->limit - result->base;
2005 /* Return a buffer that's big enough, but don't waste one that's
2006 way too big. */
2007 if (size >= min_size && size <= BUFF_SIZE_UPPER_BOUND (min_size))
2008 break;
2011 *p = result->next;
2012 result->next = NULL;
2013 result->cur = result->base;
2014 return result;
2017 /* Creates a new buffer with enough space to hold the uncommitted
2018 remaining bytes of BUFF, and at least MIN_EXTRA more bytes. Copies
2019 the excess bytes to the new buffer. Chains the new buffer after
2020 BUFF, and returns the new buffer. */
2021 _cpp_buff *
2022 _cpp_append_extend_buff (pfile, buff, min_extra)
2023 cpp_reader *pfile;
2024 _cpp_buff *buff;
2025 size_t min_extra;
2027 size_t size = EXTENDED_BUFF_SIZE (buff, min_extra);
2028 _cpp_buff *new_buff = _cpp_get_buff (pfile, size);
2030 buff->next = new_buff;
2031 memcpy (new_buff->base, buff->cur, BUFF_ROOM (buff));
2032 return new_buff;
2035 /* Creates a new buffer with enough space to hold the uncommitted
2036 remaining bytes of the buffer pointed to by BUFF, and at least
2037 MIN_EXTRA more bytes. Copies the excess bytes to the new buffer.
2038 Chains the new buffer before the buffer pointed to by BUFF, and
2039 updates the pointer to point to the new buffer. */
2040 void
2041 _cpp_extend_buff (pfile, pbuff, min_extra)
2042 cpp_reader *pfile;
2043 _cpp_buff **pbuff;
2044 size_t min_extra;
2046 _cpp_buff *new_buff, *old_buff = *pbuff;
2047 size_t size = EXTENDED_BUFF_SIZE (old_buff, min_extra);
2049 new_buff = _cpp_get_buff (pfile, size);
2050 memcpy (new_buff->base, old_buff->cur, BUFF_ROOM (old_buff));
2051 new_buff->next = old_buff;
2052 *pbuff = new_buff;
2055 /* Free a chain of buffers starting at BUFF. */
2056 void
2057 _cpp_free_buff (buff)
2058 _cpp_buff *buff;
2060 _cpp_buff *next;
2062 for (; buff; buff = next)
2064 next = buff->next;
2065 free (buff->base);
2069 /* Allocate permanent, unaligned storage of length LEN. */
2070 unsigned char *
2071 _cpp_unaligned_alloc (pfile, len)
2072 cpp_reader *pfile;
2073 size_t len;
2075 _cpp_buff *buff = pfile->u_buff;
2076 unsigned char *result = buff->cur;
2078 if (len > (size_t) (buff->limit - result))
2080 buff = _cpp_get_buff (pfile, len);
2081 buff->next = pfile->u_buff;
2082 pfile->u_buff = buff;
2083 result = buff->cur;
2086 buff->cur = result + len;
2087 return result;
2090 /* Allocate permanent, unaligned storage of length LEN from a_buff.
2091 That buffer is used for growing allocations when saving macro
2092 replacement lists in a #define, and when parsing an answer to an
2093 assertion in #assert, #unassert or #if (and therefore possibly
2094 whilst expanding macros). It therefore must not be used by any
2095 code that they might call: specifically the lexer and the guts of
2096 the macro expander.
2098 All existing other uses clearly fit this restriction: storing
2099 registered pragmas during initialization. */
2100 unsigned char *
2101 _cpp_aligned_alloc (pfile, len)
2102 cpp_reader *pfile;
2103 size_t len;
2105 _cpp_buff *buff = pfile->a_buff;
2106 unsigned char *result = buff->cur;
2108 if (len > (size_t) (buff->limit - result))
2110 buff = _cpp_get_buff (pfile, len);
2111 buff->next = pfile->a_buff;
2112 pfile->a_buff = buff;
2113 result = buff->cur;
2116 buff->cur = result + len;
2117 return result;