2001-11-01 Eric Christopher <echristo@redhat.com>
[official-gcc.git] / gcc / cpplex.c
bloba15fccac22e8e32ab0300a74129c01ff7024f953
1 /* CPP Library - lexical analysis.
2 Copyright (C) 2000 Free Software Foundation, Inc.
3 Contributed by Per Bothner, 1994-95.
4 Based on CCCP program by Paul Rubin, June 1986
5 Adapted to ANSI C, Richard Stallman, Jan 1987
6 Broken out to separate file, Zack Weinberg, Mar 2000
7 Single-pass line tokenization by Neil Booth, April 2000
9 This program is free software; you can redistribute it and/or modify it
10 under the terms of the GNU General Public License as published by the
11 Free Software Foundation; either version 2, or (at your option) any
12 later version.
14 This program is distributed in the hope that it will be useful,
15 but WITHOUT ANY WARRANTY; without even the implied warranty of
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 GNU General Public License for more details.
19 You should have received a copy of the GNU General Public License
20 along with this program; if not, write to the Free Software
21 Foundation, 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
23 #include "config.h"
24 #include "system.h"
25 #include "cpplib.h"
26 #include "cpphash.h"
28 /* MULTIBYTE_CHARS support only works for native compilers.
29 ??? Ideally what we want is to model widechar support after
30 the current floating point support. */
31 #ifdef CROSS_COMPILE
32 #undef MULTIBYTE_CHARS
33 #endif
35 #ifdef MULTIBYTE_CHARS
36 #include "mbchar.h"
37 #include <locale.h>
38 #endif
/* How a token's spelling is recovered.  Tokens with SPELL_STRING
   store their spelling in the token list, and its length in
   token->val.name.len.  */
enum spell_type
{
  SPELL_OPERATOR = 0,   /* Used by the OP entries of TTYPE_TABLE.  */
  SPELL_CHAR,
  SPELL_IDENT,
  SPELL_NUMBER,
  SPELL_STRING,         /* Spelling kept with the token (see above).  */
  SPELL_NONE
};
52 struct token_spelling
54 enum spell_type category;
55 const unsigned char *name;
/* Spellings of the digraph tokens, one string per digraph.  */
static const unsigned char *const digraph_spellings[] =
{
  U"%:",
  U"%:%:",
  U"<:",
  U":>",
  U"<%",
  U"%>"
};
61 #define OP(e, s) { SPELL_OPERATOR, U s },
62 #define TK(e, s) { s, U STRINGX (e) },
63 static const struct token_spelling token_spellings[N_TTYPES] = { TTYPE_TABLE };
64 #undef OP
65 #undef TK
/* Look up the spelling category / fixed name of TOKEN's type.  */
#define TOKEN_SPELL(token) (token_spellings[(token)->type].category)
#define TOKEN_NAME(token)  (token_spellings[(token)->type].name)

/* Rewind the read pointer to the recorded backup position.  Relies on
   a local variable named `buffer' being in scope at the point of use.  */
#define BACKUP() do {buffer->cur = buffer->backup_to;} while (0)
71 static void handle_newline PARAMS ((cpp_reader *));
72 static cppchar_t skip_escaped_newlines PARAMS ((cpp_reader *));
73 static cppchar_t get_effective_char PARAMS ((cpp_reader *));
75 static int skip_block_comment PARAMS ((cpp_reader *));
76 static int skip_line_comment PARAMS ((cpp_reader *));
77 static void adjust_column PARAMS ((cpp_reader *));
78 static void skip_whitespace PARAMS ((cpp_reader *, cppchar_t));
79 static cpp_hashnode *parse_identifier PARAMS ((cpp_reader *));
80 static cpp_hashnode *parse_identifier_slow PARAMS ((cpp_reader *,
81 const U_CHAR *));
82 static void parse_number PARAMS ((cpp_reader *, cpp_string *, cppchar_t, int));
83 static int unescaped_terminator_p PARAMS ((cpp_reader *, const U_CHAR *));
84 static void parse_string PARAMS ((cpp_reader *, cpp_token *, cppchar_t));
85 static void unterminated PARAMS ((cpp_reader *, int));
86 static bool trigraph_p PARAMS ((cpp_reader *));
87 static void save_comment PARAMS ((cpp_reader *, cpp_token *, const U_CHAR *));
88 static int name_p PARAMS ((cpp_reader *, const cpp_string *));
89 static int maybe_read_ucs PARAMS ((cpp_reader *, const unsigned char **,
90 const unsigned char *, unsigned int *));
91 static tokenrun *next_tokenrun PARAMS ((tokenrun *));
93 static unsigned int hex_digit_value PARAMS ((unsigned int));
94 static _cpp_buff *new_buff PARAMS ((size_t));
96 /* Utility routine:
98 Compares, the token TOKEN to the NUL-terminated string STRING.
99 TOKEN must be a CPP_NAME. Returns 1 for equal, 0 for unequal. */
102 cpp_ideq (token, string)
103 const cpp_token *token;
104 const char *string;
106 if (token->type != CPP_NAME)
107 return 0;
109 return !ustrcmp (NODE_NAME (token->val.node), (const U_CHAR *) string);
112 /* Call when meeting a newline, assumed to be in buffer->cur[-1].
113 Returns with buffer->cur pointing to the character immediately
114 following the newline (combination). */
115 static void
116 handle_newline (pfile)
117 cpp_reader *pfile;
119 cpp_buffer *buffer = pfile->buffer;
121 /* Handle CR-LF and LF-CR. Most other implementations (e.g. java)
122 only accept CR-LF; maybe we should fall back to that behaviour?
124 NOTE: the EOF case in _cpp_lex_direct currently requires the
125 buffer->cur != buffer->rlimit test here for 0-length files. */
126 if (buffer->cur != buffer->rlimit
127 && buffer->cur[-1] + buffer->cur[0] == '\r' + '\n')
128 buffer->cur++;
130 buffer->line_base = buffer->cur;
131 buffer->col_adjust = 0;
132 pfile->line++;
135 /* Subroutine of skip_escaped_newlines; called when a 3-character
136 sequence beginning with "??" is encountered. buffer->cur points to
137 the second '?'.
139 Warn if necessary, and returns true if the sequence forms a
140 trigraph and the trigraph should be honoured. */
141 static bool
142 trigraph_p (pfile)
143 cpp_reader *pfile;
145 cpp_buffer *buffer = pfile->buffer;
146 cppchar_t from_char = buffer->cur[1];
147 bool accept;
149 if (!_cpp_trigraph_map[from_char])
150 return false;
152 accept = CPP_OPTION (pfile, trigraphs);
154 /* Don't warn about trigraphs in comments. */
155 if (CPP_OPTION (pfile, warn_trigraphs) && !pfile->state.lexing_comment)
157 if (accept)
158 cpp_warning_with_line (pfile, pfile->line, CPP_BUF_COL (buffer) - 1,
159 "trigraph ??%c converted to %c",
160 (int) from_char,
161 (int) _cpp_trigraph_map[from_char]);
162 else if (buffer->cur != buffer->last_Wtrigraphs)
164 buffer->last_Wtrigraphs = buffer->cur;
165 cpp_warning_with_line (pfile, pfile->line,
166 CPP_BUF_COL (buffer) - 1,
167 "trigraph ??%c ignored", (int) from_char);
171 return accept;
174 /* Skips any escaped newlines introduced by '?' or a '\\', assumed to
175 lie in buffer->cur[-1]. Returns the next byte, which will be in
176 buffer->cur[-1]. This routine performs preprocessing stages 1 and
177 2 of the ISO C standard. */
178 static cppchar_t
179 skip_escaped_newlines (pfile)
180 cpp_reader *pfile;
182 cpp_buffer *buffer = pfile->buffer;
183 cppchar_t next = buffer->cur[-1];
185 /* Only do this if we apply stages 1 and 2. */
186 if (!buffer->from_stage3)
188 const unsigned char *saved_cur;
189 cppchar_t next1;
193 if (buffer->cur == buffer->rlimit)
194 break;
196 if (next == '?')
198 if (buffer->cur[0] != '?' || buffer->cur + 1 == buffer->rlimit)
199 break;
201 if (!trigraph_p (pfile))
202 break;
204 /* Translate the trigraph. */
205 next = _cpp_trigraph_map[buffer->cur[1]];
206 buffer->cur += 2;
207 if (next != '\\' || buffer->cur == buffer->rlimit)
208 break;
211 /* We have a backslash, and room for at least one more
212 character. Skip horizontal whitespace. */
213 saved_cur = buffer->cur;
215 next1 = *buffer->cur++;
216 while (is_nvspace (next1) && buffer->cur < buffer->rlimit);
218 if (!is_vspace (next1))
220 buffer->cur = saved_cur;
221 break;
224 if (saved_cur != buffer->cur - 1
225 && !pfile->state.lexing_comment)
226 cpp_warning (pfile, "backslash and newline separated by space");
228 handle_newline (pfile);
229 buffer->backup_to = buffer->cur;
230 if (buffer->cur == buffer->rlimit)
232 cpp_pedwarn (pfile, "backslash-newline at end of file");
233 next = EOF;
235 else
236 next = *buffer->cur++;
238 while (next == '\\' || next == '?');
241 return next;
244 /* Obtain the next character, after trigraph conversion and skipping
245 an arbitrarily long string of escaped newlines. The common case of
246 no trigraphs or escaped newlines falls through quickly. On return,
247 buffer->backup_to points to where to return to if the character is
248 not to be processed. */
249 static cppchar_t
250 get_effective_char (pfile)
251 cpp_reader *pfile;
253 cppchar_t next = EOF;
254 cpp_buffer *buffer = pfile->buffer;
256 buffer->backup_to = buffer->cur;
257 if (buffer->cur < buffer->rlimit)
259 next = *buffer->cur++;
260 if (__builtin_expect (next == '?' || next == '\\', 0))
261 next = skip_escaped_newlines (pfile);
264 return next;
267 /* Skip a C-style block comment. We find the end of the comment by
268 seeing if an asterisk is before every '/' we encounter. Returns
269 non-zero if comment terminated by EOF, zero otherwise. */
270 static int
271 skip_block_comment (pfile)
272 cpp_reader *pfile;
274 cpp_buffer *buffer = pfile->buffer;
275 cppchar_t c = EOF, prevc = EOF;
277 pfile->state.lexing_comment = 1;
278 while (buffer->cur != buffer->rlimit)
280 prevc = c, c = *buffer->cur++;
282 /* FIXME: For speed, create a new character class of characters
283 of interest inside block comments. */
284 if (c == '?' || c == '\\')
285 c = skip_escaped_newlines (pfile);
287 /* People like decorating comments with '*', so check for '/'
288 instead for efficiency. */
289 if (c == '/')
291 if (prevc == '*')
292 break;
294 /* Warn about potential nested comments, but not if the '/'
295 comes immediately before the true comment delimeter.
296 Don't bother to get it right across escaped newlines. */
297 if (CPP_OPTION (pfile, warn_comments)
298 && buffer->cur + 1 < buffer->rlimit
299 && buffer->cur[0] == '*' && buffer->cur[1] != '/')
300 cpp_warning_with_line (pfile,
301 pfile->line, CPP_BUF_COL (buffer),
302 "\"/*\" within comment");
304 else if (is_vspace (c))
305 handle_newline (pfile);
306 else if (c == '\t')
307 adjust_column (pfile);
310 pfile->state.lexing_comment = 0;
311 return c != '/' || prevc != '*';
314 /* Skip a C++ line comment, leaving buffer->cur pointing to the
315 terminating newline. Handles escaped newlines. Returns non-zero
316 if a multiline comment. */
317 static int
318 skip_line_comment (pfile)
319 cpp_reader *pfile;
321 cpp_buffer *buffer = pfile->buffer;
322 unsigned int orig_line = pfile->line;
323 cppchar_t c;
325 pfile->state.lexing_comment = 1;
328 if (buffer->cur == buffer->rlimit)
329 goto at_eof;
331 c = *buffer->cur++;
332 if (c == '?' || c == '\\')
333 c = skip_escaped_newlines (pfile);
335 while (!is_vspace (c));
337 /* Step back over the newline, except at EOF. */
338 buffer->cur--;
339 at_eof:
341 pfile->state.lexing_comment = 0;
342 return orig_line != pfile->line;
345 /* pfile->buffer->cur is one beyond the \t character. Update
346 col_adjust so we track the column correctly. */
347 static void
348 adjust_column (pfile)
349 cpp_reader *pfile;
351 cpp_buffer *buffer = pfile->buffer;
352 unsigned int col = CPP_BUF_COL (buffer) - 1; /* Zero-based column. */
354 /* Round it up to multiple of the tabstop, but subtract 1 since the
355 tab itself occupies a character position. */
356 buffer->col_adjust += (CPP_OPTION (pfile, tabstop)
357 - col % CPP_OPTION (pfile, tabstop)) - 1;
360 /* Skips whitespace, saving the next non-whitespace character.
361 Adjusts pfile->col_adjust to account for tabs. Without this,
362 tokens might be assigned an incorrect column. */
363 static void
364 skip_whitespace (pfile, c)
365 cpp_reader *pfile;
366 cppchar_t c;
368 cpp_buffer *buffer = pfile->buffer;
369 unsigned int warned = 0;
373 /* Horizontal space always OK. */
374 if (c == ' ')
376 else if (c == '\t')
377 adjust_column (pfile);
378 /* Just \f \v or \0 left. */
379 else if (c == '\0')
381 if (!warned)
383 cpp_warning (pfile, "null character(s) ignored");
384 warned = 1;
387 else if (pfile->state.in_directive && CPP_PEDANTIC (pfile))
388 cpp_pedwarn_with_line (pfile, pfile->line,
389 CPP_BUF_COL (buffer),
390 "%s in preprocessing directive",
391 c == '\f' ? "form feed" : "vertical tab");
393 if (buffer->cur == buffer->rlimit)
394 return;
395 c = *buffer->cur++;
397 /* We only want non-vertical space, i.e. ' ' \t \f \v \0. */
398 while (is_nvspace (c));
400 buffer->cur--;
403 /* See if the characters of a number token are valid in a name (no
404 '.', '+' or '-'). */
405 static int
406 name_p (pfile, string)
407 cpp_reader *pfile;
408 const cpp_string *string;
410 unsigned int i;
412 for (i = 0; i < string->len; i++)
413 if (!is_idchar (string->text[i]))
414 return 0;
416 return 1;
419 /* Parse an identifier, skipping embedded backslash-newlines. This is
420 a critical inner loop. The common case is an identifier which has
421 not been split by backslash-newline, does not contain a dollar
422 sign, and has already been scanned (roughly 10:1 ratio of
423 seen:unseen identifiers in normal code; the distribution is
424 Poisson-like). Second most common case is a new identifier, not
425 split and no dollar sign. The other possibilities are rare and
426 have been relegated to parse_identifier_slow. */
428 static cpp_hashnode *
429 parse_identifier (pfile)
430 cpp_reader *pfile;
432 cpp_hashnode *result;
433 const U_CHAR *cur, *rlimit;
435 /* Fast-path loop. Skim over a normal identifier.
436 N.B. ISIDNUM does not include $. */
437 cur = pfile->buffer->cur - 1;
438 rlimit = pfile->buffer->rlimit;
440 cur++;
441 while (cur < rlimit && ISIDNUM (*cur));
443 /* Check for slow-path cases. */
444 if (cur < rlimit && (*cur == '?' || *cur == '\\' || *cur == '$'))
445 result = parse_identifier_slow (pfile, cur);
446 else
448 const U_CHAR *base = pfile->buffer->cur - 1;
449 result = (cpp_hashnode *)
450 ht_lookup (pfile->hash_table, base, cur - base, HT_ALLOC);
451 pfile->buffer->cur = cur;
454 /* Rarely, identifiers require diagnostics when lexed.
455 XXX Has to be forced out of the fast path. */
456 if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
457 && !pfile->state.skipping, 0))
459 /* It is allowed to poison the same identifier twice. */
460 if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
461 cpp_error (pfile, "attempt to use poisoned \"%s\"",
462 NODE_NAME (result));
464 /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
465 replacement list of a variadic macro. */
466 if (result == pfile->spec_nodes.n__VA_ARGS__
467 && !pfile->state.va_args_ok)
468 cpp_pedwarn (pfile,
469 "__VA_ARGS__ can only appear in the expansion of a C99 variadic macro");
472 return result;
475 /* Slow path. This handles identifiers which have been split, and
476 identifiers which contain dollar signs. The part of the identifier
477 from PFILE->buffer->cur-1 to CUR has already been scanned. */
478 static cpp_hashnode *
479 parse_identifier_slow (pfile, cur)
480 cpp_reader *pfile;
481 const U_CHAR *cur;
483 cpp_buffer *buffer = pfile->buffer;
484 const U_CHAR *base = buffer->cur - 1;
485 struct obstack *stack = &pfile->hash_table->stack;
486 unsigned int c, saw_dollar = 0, len;
488 /* Copy the part of the token which is known to be okay. */
489 obstack_grow (stack, base, cur - base);
491 /* Now process the part which isn't. We are looking at one of
492 '$', '\\', or '?' on entry to this loop. */
493 c = *cur++;
494 buffer->cur = cur;
497 while (is_idchar (c))
499 obstack_1grow (stack, c);
501 if (c == '$')
502 saw_dollar++;
504 if (buffer->cur == buffer->rlimit)
505 goto at_eof;
507 c = *buffer->cur++;
510 /* Potential escaped newline? */
511 buffer->backup_to = buffer->cur - 1;
512 if (c != '?' && c != '\\')
513 break;
514 c = skip_escaped_newlines (pfile);
516 while (is_idchar (c));
518 /* Step back over the unwanted char, except at EOF. */
519 BACKUP ();
520 at_eof:
522 /* $ is not an identifier character in the standard, but is commonly
523 accepted as an extension. Don't warn about it in skipped
524 conditional blocks. */
525 if (saw_dollar && CPP_PEDANTIC (pfile) && ! pfile->state.skipping)
526 cpp_pedwarn (pfile, "'$' character(s) in identifier");
528 /* Identifiers are null-terminated. */
529 len = obstack_object_size (stack);
530 obstack_1grow (stack, '\0');
532 return (cpp_hashnode *)
533 ht_lookup (pfile->hash_table, obstack_finish (stack), len, HT_ALLOCED);
536 /* Parse a number, skipping embedded backslash-newlines. */
537 static void
538 parse_number (pfile, number, c, leading_period)
539 cpp_reader *pfile;
540 cpp_string *number;
541 cppchar_t c;
542 int leading_period;
544 cpp_buffer *buffer = pfile->buffer;
545 unsigned char *dest, *limit;
547 dest = BUFF_FRONT (pfile->u_buff);
548 limit = BUFF_LIMIT (pfile->u_buff);
550 /* Place a leading period. */
551 if (leading_period)
553 if (dest == limit)
555 _cpp_extend_buff (pfile, &pfile->u_buff, 1);
556 dest = BUFF_FRONT (pfile->u_buff);
557 limit = BUFF_LIMIT (pfile->u_buff);
559 *dest++ = '.';
566 /* Need room for terminating null. */
567 if ((size_t) (limit - dest) < 2)
569 size_t len_so_far = dest - BUFF_FRONT (pfile->u_buff);
570 _cpp_extend_buff (pfile, &pfile->u_buff, 2);
571 dest = BUFF_FRONT (pfile->u_buff) + len_so_far;
572 limit = BUFF_LIMIT (pfile->u_buff);
574 *dest++ = c;
576 if (buffer->cur == buffer->rlimit)
577 goto at_eof;
579 c = *buffer->cur++;
581 while (is_numchar (c) || c == '.' || VALID_SIGN (c, dest[-1]));
583 /* Potential escaped newline? */
584 buffer->backup_to = buffer->cur - 1;
585 if (c != '?' && c != '\\')
586 break;
587 c = skip_escaped_newlines (pfile);
589 while (is_numchar (c) || c == '.' || VALID_SIGN (c, dest[-1]));
591 /* Step back over the unwanted char, except at EOF. */
592 BACKUP ();
593 at_eof:
595 /* Null-terminate the number. */
596 *dest = '\0';
598 number->text = BUFF_FRONT (pfile->u_buff);
599 number->len = dest - number->text;
600 BUFF_FRONT (pfile->u_buff) = dest + 1;
603 /* Subroutine of parse_string. Emits error for unterminated strings. */
604 static void
605 unterminated (pfile, term)
606 cpp_reader *pfile;
607 int term;
609 cpp_error (pfile, "missing terminating %c character", term);
611 if (term == '\"' && pfile->mls_line && pfile->mls_line != pfile->line)
613 cpp_error_with_line (pfile, pfile->mls_line, pfile->mls_col,
614 "possible start of unterminated string literal");
615 pfile->mls_line = 0;
619 /* Subroutine of parse_string. */
620 static int
621 unescaped_terminator_p (pfile, dest)
622 cpp_reader *pfile;
623 const unsigned char *dest;
625 const unsigned char *start, *temp;
627 /* In #include-style directives, terminators are not escapeable. */
628 if (pfile->state.angled_headers)
629 return 1;
631 start = BUFF_FRONT (pfile->u_buff);
633 /* An odd number of consecutive backslashes represents an escaped
634 terminator. */
635 for (temp = dest; temp > start && temp[-1] == '\\'; temp--)
638 return ((dest - temp) & 1) == 0;
641 /* Parses a string, character constant, or angle-bracketed header file
642 name. Handles embedded trigraphs and escaped newlines. The stored
643 string is guaranteed NUL-terminated, but it is not guaranteed that
644 this is the first NUL since embedded NULs are preserved.
645 Multi-line strings are allowed, but they are deprecated.
647 When this function returns, buffer->cur points to the next
648 character to be processed. */
649 static void
650 parse_string (pfile, token, terminator)
651 cpp_reader *pfile;
652 cpp_token *token;
653 cppchar_t terminator;
655 cpp_buffer *buffer = pfile->buffer;
656 unsigned char *dest, *limit;
657 cppchar_t c;
658 bool warned_nulls = false, warned_multi = false;
660 dest = BUFF_FRONT (pfile->u_buff);
661 limit = BUFF_LIMIT (pfile->u_buff);
663 for (;;)
665 /* We need room for another char, possibly the terminating NUL. */
666 if ((size_t) (limit - dest) < 1)
668 size_t len_so_far = dest - BUFF_FRONT (pfile->u_buff);
669 _cpp_extend_buff (pfile, &pfile->u_buff, 2);
670 dest = BUFF_FRONT (pfile->u_buff) + len_so_far;
671 limit = BUFF_LIMIT (pfile->u_buff);
674 if (buffer->cur == buffer->rlimit)
676 unterminated (pfile, terminator);
677 break;
680 /* Handle trigraphs, escaped newlines etc. */
681 c = *buffer->cur++;
682 if (c == '?' || c == '\\')
683 c = skip_escaped_newlines (pfile);
685 if (c == terminator)
687 if (unescaped_terminator_p (pfile, dest))
688 break;
690 else if (is_vspace (c))
692 /* In assembly language, silently terminate string and
693 character literals at end of line. This is a kludge
694 around not knowing where comments are. */
695 if (CPP_OPTION (pfile, lang) == CLK_ASM && terminator != '>')
697 buffer->cur--;
698 break;
701 /* Character constants and header names may not extend over
702 multiple lines. In Standard C, neither may strings.
703 Unfortunately, we accept multiline strings as an
704 extension, except in #include family directives. */
705 if (terminator != '"' || pfile->state.angled_headers)
707 unterminated (pfile, terminator);
708 buffer->cur--;
709 break;
712 if (!warned_multi)
714 warned_multi = true;
715 cpp_pedwarn (pfile, "multi-line string literals are deprecated");
718 if (pfile->mls_line == 0)
720 pfile->mls_line = token->line;
721 pfile->mls_col = token->col;
724 handle_newline (pfile);
725 c = '\n';
727 else if (c == '\0' && !warned_nulls)
729 warned_nulls = true;
730 cpp_warning (pfile, "null character(s) preserved in literal");
733 *dest++ = c;
736 *dest = '\0';
738 token->val.str.text = BUFF_FRONT (pfile->u_buff);
739 token->val.str.len = dest - BUFF_FRONT (pfile->u_buff);
740 BUFF_FRONT (pfile->u_buff) = dest + 1;
743 /* The stored comment includes the comment start and any terminator. */
744 static void
745 save_comment (pfile, token, from)
746 cpp_reader *pfile;
747 cpp_token *token;
748 const unsigned char *from;
750 unsigned char *buffer;
751 unsigned int len;
753 len = pfile->buffer->cur - from + 1; /* + 1 for the initial '/'. */
755 /* C++ comments probably (not definitely) have moved past a new
756 line, which we don't want to save in the comment. */
757 if (is_vspace (pfile->buffer->cur[-1]))
758 len--;
759 buffer = _cpp_unaligned_alloc (pfile, len);
761 token->type = CPP_COMMENT;
762 token->val.str.len = len;
763 token->val.str.text = buffer;
765 buffer[0] = '/';
766 memcpy (buffer + 1, from, len - 1);
769 /* Allocate COUNT tokens for RUN. */
770 void
771 _cpp_init_tokenrun (run, count)
772 tokenrun *run;
773 unsigned int count;
775 run->base = xnewvec (cpp_token, count);
776 run->limit = run->base + count;
777 run->next = NULL;
780 /* Returns the next tokenrun, or creates one if there is none. */
781 static tokenrun *
782 next_tokenrun (run)
783 tokenrun *run;
785 if (run->next == NULL)
787 run->next = xnew (tokenrun);
788 run->next->prev = run;
789 _cpp_init_tokenrun (run->next, 250);
792 return run->next;
795 /* Allocate a single token that is invalidated at the same time as the
796 rest of the tokens on the line. Has its line and col set to the
797 same as the last lexed token, so that diagnostics appear in the
798 right place. */
799 cpp_token *
800 _cpp_temp_token (pfile)
801 cpp_reader *pfile;
803 cpp_token *old, *result;
805 old = pfile->cur_token - 1;
806 if (pfile->cur_token == pfile->cur_run->limit)
808 pfile->cur_run = next_tokenrun (pfile->cur_run);
809 pfile->cur_token = pfile->cur_run->base;
812 result = pfile->cur_token++;
813 result->line = old->line;
814 result->col = old->col;
815 return result;
818 /* Lex a token into RESULT (external interface). Takes care of issues
819 like directive handling, token lookahead, multiple include
820 opimisation and skipping. */
821 const cpp_token *
822 _cpp_lex_token (pfile)
823 cpp_reader *pfile;
825 cpp_token *result;
827 for (;;)
829 if (pfile->cur_token == pfile->cur_run->limit)
831 pfile->cur_run = next_tokenrun (pfile->cur_run);
832 pfile->cur_token = pfile->cur_run->base;
835 if (pfile->lookaheads)
837 pfile->lookaheads--;
838 result = pfile->cur_token++;
840 else
841 result = _cpp_lex_direct (pfile);
843 if (result->flags & BOL)
845 /* Is this a directive. If _cpp_handle_directive returns
846 false, it is an assembler #. */
847 if (result->type == CPP_HASH
848 && !pfile->state.parsing_args
849 && _cpp_handle_directive (pfile, result->flags & PREV_WHITE))
850 continue;
851 if (pfile->cb.line_change && !pfile->state.skipping)
852 (*pfile->cb.line_change)(pfile, result, pfile->state.parsing_args);
855 /* We don't skip tokens in directives. */
856 if (pfile->state.in_directive)
857 break;
859 /* Outside a directive, invalidate controlling macros. At file
860 EOF, _cpp_lex_direct takes care of popping the buffer, so we never
861 get here and MI optimisation works. */
862 pfile->mi_valid = false;
864 if (!pfile->state.skipping || result->type == CPP_EOF)
865 break;
868 return result;
/* Helper for two-character operators.  If the next effective
   character is CHAR, consume it and set the token type to THEN_TYPE;
   otherwise back up over it and use ELSE_TYPE.  Expects `pfile',
   `result' and (through BACKUP) `buffer' to be in scope.  */
#define IF_NEXT_IS(CHAR, THEN_TYPE, ELSE_TYPE)          \
  do {                                                  \
    if (get_effective_char (pfile) == (CHAR))           \
      result->type = (THEN_TYPE);                       \
    else                                                \
      {                                                 \
        BACKUP ();                                      \
        result->type = (ELSE_TYPE);                     \
      }                                                 \
  } while (0)
882 /* Lex a token into pfile->cur_token, which is also incremented, to
883 get diagnostics pointing to the correct location.
885 Does not handle issues such as token lookahead, multiple-include
886 optimisation, directives, skipping etc. This function is only
887 suitable for use by _cpp_lex_token, and in special cases like
888 lex_expansion_token which doesn't care for any of these issues.
890 When meeting a newline, returns CPP_EOF if parsing a directive,
891 otherwise returns to the start of the token buffer if permissible.
892 Returns the location of the lexed token. */
893 cpp_token *
894 _cpp_lex_direct (pfile)
895 cpp_reader *pfile;
897 cppchar_t c;
898 cpp_buffer *buffer;
899 const unsigned char *comment_start;
900 cpp_token *result = pfile->cur_token++;
902 fresh_line:
903 buffer = pfile->buffer;
904 result->flags = buffer->saved_flags;
905 buffer->saved_flags = 0;
906 update_tokens_line:
907 result->line = pfile->line;
909 skipped_white:
910 if (buffer->cur == buffer->rlimit)
911 goto at_eof;
912 c = *buffer->cur++;
913 result->col = CPP_BUF_COLUMN (buffer, buffer->cur);
915 trigraph:
916 switch (c)
918 at_eof:
919 buffer->saved_flags = BOL;
920 if (!pfile->state.parsing_args && !pfile->state.in_directive)
922 if (buffer->cur != buffer->line_base)
924 /* Non-empty files should end in a newline. Don't warn
925 for command line and _Pragma buffers. */
926 if (!buffer->from_stage3)
927 cpp_pedwarn (pfile, "no newline at end of file");
928 handle_newline (pfile);
931 /* Don't pop the last buffer. */
932 if (buffer->prev)
934 unsigned char stop = buffer->return_at_eof;
936 _cpp_pop_buffer (pfile);
937 if (!stop)
938 goto fresh_line;
941 result->type = CPP_EOF;
942 break;
944 case ' ': case '\t': case '\f': case '\v': case '\0':
945 skip_whitespace (pfile, c);
946 result->flags |= PREV_WHITE;
947 goto skipped_white;
949 case '\n': case '\r':
950 handle_newline (pfile);
951 buffer->saved_flags = BOL;
952 if (! pfile->state.in_directive)
954 if (pfile->state.parsing_args == 2)
955 buffer->saved_flags |= PREV_WHITE;
956 if (!pfile->keep_tokens)
958 pfile->cur_run = &pfile->base_run;
959 result = pfile->base_run.base;
960 pfile->cur_token = result + 1;
962 goto fresh_line;
964 result->type = CPP_EOF;
965 break;
967 case '?':
968 case '\\':
969 /* These could start an escaped newline, or '?' a trigraph. Let
970 skip_escaped_newlines do all the work. */
972 unsigned int line = pfile->line;
974 c = skip_escaped_newlines (pfile);
975 if (line != pfile->line)
977 buffer->cur--;
978 /* We had at least one escaped newline of some sort.
979 Update the token's line and column. */
980 goto update_tokens_line;
984 /* We are either the original '?' or '\\', or a trigraph. */
985 if (c == '?')
986 result->type = CPP_QUERY;
987 else if (c == '\\')
988 goto random_char;
989 else
990 goto trigraph;
991 break;
993 case '0': case '1': case '2': case '3': case '4':
994 case '5': case '6': case '7': case '8': case '9':
995 result->type = CPP_NUMBER;
996 parse_number (pfile, &result->val.str, c, 0);
997 break;
999 case '$':
1000 if (!CPP_OPTION (pfile, dollars_in_ident))
1001 goto random_char;
1002 /* Fall through... */
1004 case '_':
1005 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
1006 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
1007 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
1008 case 's': case 't': case 'u': case 'v': case 'w': case 'x':
1009 case 'y': case 'z':
1010 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
1011 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
1012 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
1013 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
1014 case 'Y': case 'Z':
1015 result->type = CPP_NAME;
1016 result->val.node = parse_identifier (pfile);
1018 /* 'L' may introduce wide characters or strings. */
1019 if (result->val.node == pfile->spec_nodes.n_L
1020 && buffer->cur < buffer->rlimit)
1022 c = *buffer->cur;
1023 if (c == '\'' || c == '"')
1025 buffer->cur++;
1026 result->type = (c == '"' ? CPP_WSTRING: CPP_WCHAR);
1027 parse_string (pfile, result, c);
1030 /* Convert named operators to their proper types. */
1031 else if (result->val.node->flags & NODE_OPERATOR)
1033 result->flags |= NAMED_OP;
1034 result->type = result->val.node->value.operator;
1036 break;
1038 case '\'':
1039 case '"':
1040 result->type = c == '"' ? CPP_STRING: CPP_CHAR;
1041 parse_string (pfile, result, c);
1042 break;
1044 case '/':
1045 /* A potential block or line comment. */
1046 comment_start = buffer->cur;
1047 c = get_effective_char (pfile);
1049 if (c == '*')
1051 if (skip_block_comment (pfile))
1052 cpp_error (pfile, "unterminated comment");
1054 else if (c == '/' && (CPP_OPTION (pfile, cplusplus_comments)
1055 || CPP_IN_SYSTEM_HEADER (pfile)))
1057 /* Warn about comments only if pedantically GNUC89, and not
1058 in system headers. */
1059 if (CPP_OPTION (pfile, lang) == CLK_GNUC89 && CPP_PEDANTIC (pfile)
1060 && ! buffer->warned_cplusplus_comments)
1062 cpp_pedwarn (pfile,
1063 "C++ style comments are not allowed in ISO C89");
1064 cpp_pedwarn (pfile,
1065 "(this will be reported only once per input file)");
1066 buffer->warned_cplusplus_comments = 1;
1069 if (skip_line_comment (pfile) && CPP_OPTION (pfile, warn_comments))
1070 cpp_warning (pfile, "multi-line comment");
1072 else if (c == '=')
1074 result->type = CPP_DIV_EQ;
1075 break;
1077 else
1079 BACKUP ();
1080 result->type = CPP_DIV;
1081 break;
1084 if (!pfile->state.save_comments)
1086 result->flags |= PREV_WHITE;
1087 goto update_tokens_line;
1090 /* Save the comment as a token in its own right. */
1091 save_comment (pfile, result, comment_start);
1092 break;
1094 case '<':
1095 if (pfile->state.angled_headers)
1097 result->type = CPP_HEADER_NAME;
1098 parse_string (pfile, result, '>');
1099 break;
1102 c = get_effective_char (pfile);
1103 if (c == '=')
1104 result->type = CPP_LESS_EQ;
1105 else if (c == '<')
1106 IF_NEXT_IS ('=', CPP_LSHIFT_EQ, CPP_LSHIFT);
1107 else if (c == '?' && CPP_OPTION (pfile, cplusplus))
1108 IF_NEXT_IS ('=', CPP_MIN_EQ, CPP_MIN);
1109 else if (c == ':' && CPP_OPTION (pfile, digraphs))
1111 result->type = CPP_OPEN_SQUARE;
1112 result->flags |= DIGRAPH;
1114 else if (c == '%' && CPP_OPTION (pfile, digraphs))
1116 result->type = CPP_OPEN_BRACE;
1117 result->flags |= DIGRAPH;
1119 else
1121 BACKUP ();
1122 result->type = CPP_LESS;
1124 break;
1126 case '>':
1127 c = get_effective_char (pfile);
1128 if (c == '=')
1129 result->type = CPP_GREATER_EQ;
1130 else if (c == '>')
1131 IF_NEXT_IS ('=', CPP_RSHIFT_EQ, CPP_RSHIFT);
1132 else if (c == '?' && CPP_OPTION (pfile, cplusplus))
1133 IF_NEXT_IS ('=', CPP_MAX_EQ, CPP_MAX);
1134 else
1136 BACKUP ();
1137 result->type = CPP_GREATER;
1139 break;
1141 case '%':
1142 c = get_effective_char (pfile);
1143 if (c == '=')
1144 result->type = CPP_MOD_EQ;
1145 else if (CPP_OPTION (pfile, digraphs) && c == ':')
1147 result->flags |= DIGRAPH;
1148 result->type = CPP_HASH;
1149 if (get_effective_char (pfile) == '%')
1151 const unsigned char *pos = buffer->cur;
1153 if (get_effective_char (pfile) == ':')
1154 result->type = CPP_PASTE;
1155 else
1156 buffer->cur = pos - 1;
1158 else
1159 BACKUP ();
1161 else if (CPP_OPTION (pfile, digraphs) && c == '>')
1163 result->flags |= DIGRAPH;
1164 result->type = CPP_CLOSE_BRACE;
1166 else
1168 BACKUP ();
1169 result->type = CPP_MOD;
1171 break;
1173 case '.':
1174 result->type = CPP_DOT;
1175 c = get_effective_char (pfile);
1176 if (c == '.')
1178 const unsigned char *pos = buffer->cur;
1180 if (get_effective_char (pfile) == '.')
1181 result->type = CPP_ELLIPSIS;
1182 else
1183 buffer->cur = pos - 1;
1185 /* All known character sets have 0...9 contiguous. */
1186 else if (ISDIGIT (c))
1188 result->type = CPP_NUMBER;
1189 parse_number (pfile, &result->val.str, c, 1);
1191 else if (c == '*' && CPP_OPTION (pfile, cplusplus))
1192 result->type = CPP_DOT_STAR;
1193 else
1194 BACKUP ();
1195 break;
1197 case '+':
1198 c = get_effective_char (pfile);
1199 if (c == '+')
1200 result->type = CPP_PLUS_PLUS;
1201 else if (c == '=')
1202 result->type = CPP_PLUS_EQ;
1203 else
1205 BACKUP ();
1206 result->type = CPP_PLUS;
1208 break;
1210 case '-':
1211 c = get_effective_char (pfile);
1212 if (c == '>')
1214 result->type = CPP_DEREF;
1215 if (CPP_OPTION (pfile, cplusplus))
1217 if (get_effective_char (pfile) == '*')
1218 result->type = CPP_DEREF_STAR;
1219 else
1220 BACKUP ();
1223 else if (c == '-')
1224 result->type = CPP_MINUS_MINUS;
1225 else if (c == '=')
1226 result->type = CPP_MINUS_EQ;
1227 else
1229 BACKUP ();
1230 result->type = CPP_MINUS;
1232 break;
1234 case '&':
1235 c = get_effective_char (pfile);
1236 if (c == '&')
1237 result->type = CPP_AND_AND;
1238 else if (c == '=')
1239 result->type = CPP_AND_EQ;
1240 else
1242 BACKUP ();
1243 result->type = CPP_AND;
1245 break;
1247 case '|':
1248 c = get_effective_char (pfile);
1249 if (c == '|')
1250 result->type = CPP_OR_OR;
1251 else if (c == '=')
1252 result->type = CPP_OR_EQ;
1253 else
1255 BACKUP ();
1256 result->type = CPP_OR;
1258 break;
1260 case ':':
1261 c = get_effective_char (pfile);
1262 if (c == ':' && CPP_OPTION (pfile, cplusplus))
1263 result->type = CPP_SCOPE;
1264 else if (c == '>' && CPP_OPTION (pfile, digraphs))
1266 result->flags |= DIGRAPH;
1267 result->type = CPP_CLOSE_SQUARE;
1269 else
1271 BACKUP ();
1272 result->type = CPP_COLON;
1274 break;
1276 case '*': IF_NEXT_IS ('=', CPP_MULT_EQ, CPP_MULT); break;
1277 case '=': IF_NEXT_IS ('=', CPP_EQ_EQ, CPP_EQ); break;
1278 case '!': IF_NEXT_IS ('=', CPP_NOT_EQ, CPP_NOT); break;
1279 case '^': IF_NEXT_IS ('=', CPP_XOR_EQ, CPP_XOR); break;
1280 case '#': IF_NEXT_IS ('#', CPP_PASTE, CPP_HASH); break;
1282 case '~': result->type = CPP_COMPL; break;
1283 case ',': result->type = CPP_COMMA; break;
1284 case '(': result->type = CPP_OPEN_PAREN; break;
1285 case ')': result->type = CPP_CLOSE_PAREN; break;
1286 case '[': result->type = CPP_OPEN_SQUARE; break;
1287 case ']': result->type = CPP_CLOSE_SQUARE; break;
1288 case '{': result->type = CPP_OPEN_BRACE; break;
1289 case '}': result->type = CPP_CLOSE_BRACE; break;
1290 case ';': result->type = CPP_SEMICOLON; break;
1292 /* @ is a punctuator in Objective C. */
1293 case '@': result->type = CPP_ATSIGN; break;
1295 random_char:
1296 default:
1297 result->type = CPP_OTHER;
1298 result->val.c = c;
1299 break;
1302 return result;
1305 /* An upper bound on the number of bytes needed to spell a token,
1306 including preceding whitespace. */
1307 unsigned int
1308 cpp_token_len (token)
1309 const cpp_token *token;
1311 unsigned int len;
1313 switch (TOKEN_SPELL (token))
1315 default: len = 0; break;
1316 case SPELL_NUMBER:
1317 case SPELL_STRING: len = token->val.str.len; break;
1318 case SPELL_IDENT: len = NODE_LEN (token->val.node); break;
1320 /* 1 for whitespace, 4 for comment delimiters. */
1321 return len + 5;
1324 /* Write the spelling of a token TOKEN to BUFFER. The buffer must
1325 already contain the enough space to hold the token's spelling.
1326 Returns a pointer to the character after the last character
1327 written. */
1328 unsigned char *
1329 cpp_spell_token (pfile, token, buffer)
1330 cpp_reader *pfile; /* Would be nice to be rid of this... */
1331 const cpp_token *token;
1332 unsigned char *buffer;
1334 switch (TOKEN_SPELL (token))
1336 case SPELL_OPERATOR:
1338 const unsigned char *spelling;
1339 unsigned char c;
1341 if (token->flags & DIGRAPH)
1342 spelling
1343 = digraph_spellings[(int) token->type - (int) CPP_FIRST_DIGRAPH];
1344 else if (token->flags & NAMED_OP)
1345 goto spell_ident;
1346 else
1347 spelling = TOKEN_NAME (token);
1349 while ((c = *spelling++) != '\0')
1350 *buffer++ = c;
1352 break;
1354 case SPELL_CHAR:
1355 *buffer++ = token->val.c;
1356 break;
1358 spell_ident:
1359 case SPELL_IDENT:
1360 memcpy (buffer, NODE_NAME (token->val.node), NODE_LEN (token->val.node));
1361 buffer += NODE_LEN (token->val.node);
1362 break;
1364 case SPELL_NUMBER:
1365 memcpy (buffer, token->val.str.text, token->val.str.len);
1366 buffer += token->val.str.len;
1367 break;
1369 case SPELL_STRING:
1371 int left, right, tag;
1372 switch (token->type)
1374 case CPP_STRING: left = '"'; right = '"'; tag = '\0'; break;
1375 case CPP_WSTRING: left = '"'; right = '"'; tag = 'L'; break;
1376 case CPP_CHAR: left = '\''; right = '\''; tag = '\0'; break;
1377 case CPP_WCHAR: left = '\''; right = '\''; tag = 'L'; break;
1378 case CPP_HEADER_NAME: left = '<'; right = '>'; tag = '\0'; break;
1379 default:
1380 cpp_ice (pfile, "unknown string token %s\n", TOKEN_NAME (token));
1381 return buffer;
1383 if (tag) *buffer++ = tag;
1384 *buffer++ = left;
1385 memcpy (buffer, token->val.str.text, token->val.str.len);
1386 buffer += token->val.str.len;
1387 *buffer++ = right;
1389 break;
1391 case SPELL_NONE:
1392 cpp_ice (pfile, "Unspellable token %s", TOKEN_NAME (token));
1393 break;
1396 return buffer;
1399 /* Returns a token as a null-terminated string. The string is
1400 temporary, and automatically freed later. Useful for diagnostics. */
1401 unsigned char *
1402 cpp_token_as_text (pfile, token)
1403 cpp_reader *pfile;
1404 const cpp_token *token;
1406 unsigned int len = cpp_token_len (token);
1407 unsigned char *start = _cpp_unaligned_alloc (pfile, len), *end;
1409 end = cpp_spell_token (pfile, token, start);
1410 end[0] = '\0';
1412 return start;
1415 /* Used by C front ends. Should really move to using cpp_token_as_text. */
1416 const char *
1417 cpp_type2name (type)
1418 enum cpp_ttype type;
1420 return (const char *) token_spellings[type].name;
1423 /* Writes the spelling of token to FP, without any preceding space.
1424 Separated from cpp_spell_token for efficiency - to avoid stdio
1425 double-buffering. */
1426 void
1427 cpp_output_token (token, fp)
1428 const cpp_token *token;
1429 FILE *fp;
1431 switch (TOKEN_SPELL (token))
1433 case SPELL_OPERATOR:
1435 const unsigned char *spelling;
1436 int c;
1438 if (token->flags & DIGRAPH)
1439 spelling
1440 = digraph_spellings[(int) token->type - (int) CPP_FIRST_DIGRAPH];
1441 else if (token->flags & NAMED_OP)
1442 goto spell_ident;
1443 else
1444 spelling = TOKEN_NAME (token);
1446 c = *spelling;
1448 putc (c, fp);
1449 while ((c = *++spelling) != '\0');
1451 break;
1453 case SPELL_CHAR:
1454 putc (token->val.c, fp);
1455 break;
1457 spell_ident:
1458 case SPELL_IDENT:
1459 fwrite (NODE_NAME (token->val.node), 1, NODE_LEN (token->val.node), fp);
1460 break;
1462 case SPELL_NUMBER:
1463 fwrite (token->val.str.text, 1, token->val.str.len, fp);
1464 break;
1466 case SPELL_STRING:
1468 int left, right, tag;
1469 switch (token->type)
1471 case CPP_STRING: left = '"'; right = '"'; tag = '\0'; break;
1472 case CPP_WSTRING: left = '"'; right = '"'; tag = 'L'; break;
1473 case CPP_CHAR: left = '\''; right = '\''; tag = '\0'; break;
1474 case CPP_WCHAR: left = '\''; right = '\''; tag = 'L'; break;
1475 case CPP_HEADER_NAME: left = '<'; right = '>'; tag = '\0'; break;
1476 default:
1477 fprintf (stderr, "impossible STRING token %s\n", TOKEN_NAME (token));
1478 return;
1480 if (tag) putc (tag, fp);
1481 putc (left, fp);
1482 fwrite (token->val.str.text, 1, token->val.str.len, fp);
1483 putc (right, fp);
1485 break;
1487 case SPELL_NONE:
1488 /* An error, most probably. */
1489 break;
1493 /* Compare two tokens. */
1495 _cpp_equiv_tokens (a, b)
1496 const cpp_token *a, *b;
1498 if (a->type == b->type && a->flags == b->flags)
1499 switch (TOKEN_SPELL (a))
1501 default: /* Keep compiler happy. */
1502 case SPELL_OPERATOR:
1503 return 1;
1504 case SPELL_CHAR:
1505 return a->val.c == b->val.c; /* Character. */
1506 case SPELL_NONE:
1507 return (a->type != CPP_MACRO_ARG || a->val.arg_no == b->val.arg_no);
1508 case SPELL_IDENT:
1509 return a->val.node == b->val.node;
1510 case SPELL_NUMBER:
1511 case SPELL_STRING:
1512 return (a->val.str.len == b->val.str.len
1513 && !memcmp (a->val.str.text, b->val.str.text,
1514 a->val.str.len));
1517 return 0;
1520 /* Returns nonzero if a space should be inserted to avoid an
1521 accidental token paste for output. For simplicity, it is
1522 conservative, and occasionally advises a space where one is not
1523 needed, e.g. "." and ".2". */
1526 cpp_avoid_paste (pfile, token1, token2)
1527 cpp_reader *pfile;
1528 const cpp_token *token1, *token2;
1530 enum cpp_ttype a = token1->type, b = token2->type;
1531 cppchar_t c;
1533 if (token1->flags & NAMED_OP)
1534 a = CPP_NAME;
1535 if (token2->flags & NAMED_OP)
1536 b = CPP_NAME;
1538 c = EOF;
1539 if (token2->flags & DIGRAPH)
1540 c = digraph_spellings[(int) b - (int) CPP_FIRST_DIGRAPH][0];
1541 else if (token_spellings[b].category == SPELL_OPERATOR)
1542 c = token_spellings[b].name[0];
1544 /* Quickly get everything that can paste with an '='. */
1545 if ((int) a <= (int) CPP_LAST_EQ && c == '=')
1546 return 1;
1548 switch (a)
1550 case CPP_GREATER: return c == '>' || c == '?';
1551 case CPP_LESS: return c == '<' || c == '?' || c == '%' || c == ':';
1552 case CPP_PLUS: return c == '+';
1553 case CPP_MINUS: return c == '-' || c == '>';
1554 case CPP_DIV: return c == '/' || c == '*'; /* Comments. */
1555 case CPP_MOD: return c == ':' || c == '>';
1556 case CPP_AND: return c == '&';
1557 case CPP_OR: return c == '|';
1558 case CPP_COLON: return c == ':' || c == '>';
1559 case CPP_DEREF: return c == '*';
1560 case CPP_DOT: return c == '.' || c == '%' || b == CPP_NUMBER;
1561 case CPP_HASH: return c == '#' || c == '%'; /* Digraph form. */
1562 case CPP_NAME: return ((b == CPP_NUMBER
1563 && name_p (pfile, &token2->val.str))
1564 || b == CPP_NAME
1565 || b == CPP_CHAR || b == CPP_STRING); /* L */
1566 case CPP_NUMBER: return (b == CPP_NUMBER || b == CPP_NAME
1567 || c == '.' || c == '+' || c == '-');
1568 case CPP_OTHER: return (CPP_OPTION (pfile, objc)
1569 && token1->val.c == '@'
1570 && (b == CPP_NAME || b == CPP_STRING));
1571 default: break;
1574 return 0;
1577 /* Output all the remaining tokens on the current line, and a newline
1578 character, to FP. Leading whitespace is removed. If there are
1579 macros, special token padding is not performed. */
1580 void
1581 cpp_output_line (pfile, fp)
1582 cpp_reader *pfile;
1583 FILE *fp;
1585 const cpp_token *token;
1587 token = cpp_get_token (pfile);
1588 while (token->type != CPP_EOF)
1590 cpp_output_token (token, fp);
1591 token = cpp_get_token (pfile);
1592 if (token->flags & PREV_WHITE)
1593 putc (' ', fp);
1596 putc ('\n', fp);
/* Returns the numeric value (0-15) of the hexadecimal digit C, in
   either letter case.  Aborts if C is not a hexadecimal digit.  */
static unsigned int
hex_digit_value (c)
     unsigned int c;
{
  if (c >= '0' && c <= '9')
    return c - '0';

  if (c >= 'a' && c <= 'f')
    return 10 + (c - 'a');

  if (c >= 'A' && c <= 'F')
    return 10 + (c - 'A');

  abort ();
}
1613 /* Parse a '\uNNNN' or '\UNNNNNNNN' sequence. Returns 1 to indicate
1614 failure if cpplib is not parsing C++ or C99. Such failure is
1615 silent, and no variables are updated. Otherwise returns 0, and
1616 warns if -Wtraditional.
1618 [lex.charset]: The character designated by the universal character
1619 name \UNNNNNNNN is that character whose character short name in
1620 ISO/IEC 10646 is NNNNNNNN; the character designated by the
1621 universal character name \uNNNN is that character whose character
1622 short name in ISO/IEC 10646 is 0000NNNN. If the hexadecimal value
1623 for a universal character name is less than 0x20 or in the range
1624 0x7F-0x9F (inclusive), or if the universal character name
1625 designates a character in the basic source character set, then the
1626 program is ill-formed.
1628 We assume that wchar_t is Unicode, so we don't need to do any
1629 mapping. Is this ever wrong?
1631 PC points to the 'u' or 'U', PSTR is points to the byte after PC,
1632 LIMIT is the end of the string or charconst. PSTR is updated to
1633 point after the UCS on return, and the UCS is written into PC. */
1635 static int
1636 maybe_read_ucs (pfile, pstr, limit, pc)
1637 cpp_reader *pfile;
1638 const unsigned char **pstr;
1639 const unsigned char *limit;
1640 unsigned int *pc;
1642 const unsigned char *p = *pstr;
1643 unsigned int code = 0;
1644 unsigned int c = *pc, length;
1646 /* Only attempt to interpret a UCS for C++ and C99. */
1647 if (! (CPP_OPTION (pfile, cplusplus) || CPP_OPTION (pfile, c99)))
1648 return 1;
1650 if (CPP_WTRADITIONAL (pfile))
1651 cpp_warning (pfile, "the meaning of '\\%c' varies with -traditional", c);
1653 length = (c == 'u' ? 4: 8);
1655 if ((size_t) (limit - p) < length)
1657 cpp_error (pfile, "incomplete universal-character-name");
1658 /* Skip to the end to avoid more diagnostics. */
1659 p = limit;
1661 else
1663 for (; length; length--, p++)
1665 c = *p;
1666 if (ISXDIGIT (c))
1667 code = (code << 4) + hex_digit_value (c);
1668 else
1670 cpp_error (pfile,
1671 "non-hex digit '%c' in universal-character-name", c);
1672 /* We shouldn't skip in case there are multibyte chars. */
1673 break;
1678 #ifdef TARGET_EBCDIC
1679 cpp_error (pfile, "universal-character-name on EBCDIC target");
1680 code = 0x3f; /* EBCDIC invalid character */
1681 #else
1682 /* True extended characters are OK. */
1683 if (code >= 0xa0
1684 && !(code & 0x80000000)
1685 && !(code >= 0xD800 && code <= 0xDFFF))
1687 /* The standard permits $, @ and ` to be specified as UCNs. We use
1688 hex escapes so that this also works with EBCDIC hosts. */
1689 else if (code == 0x24 || code == 0x40 || code == 0x60)
1691 /* Don't give another error if one occurred above. */
1692 else if (length == 0)
1693 cpp_error (pfile, "universal-character-name out of range");
1694 #endif
1696 *pstr = p;
1697 *pc = code;
1698 return 0;
1701 /* Interpret an escape sequence, and return its value. PSTR points to
1702 the input pointer, which is just after the backslash. LIMIT is how
1703 much text we have. MASK is a bitmask for the precision for the
1704 destination type (char or wchar_t). TRADITIONAL, if true, does not
1705 interpret escapes that did not exist in traditional C.
1707 Handles all relevant diagnostics. */
1709 unsigned int
1710 cpp_parse_escape (pfile, pstr, limit, mask, traditional)
1711 cpp_reader *pfile;
1712 const unsigned char **pstr;
1713 const unsigned char *limit;
1714 unsigned HOST_WIDE_INT mask;
1715 int traditional;
1717 int unknown = 0;
1718 const unsigned char *str = *pstr;
1719 unsigned int c = *str++;
1721 switch (c)
1723 case '\\': case '\'': case '"': case '?': break;
1724 case 'b': c = TARGET_BS; break;
1725 case 'f': c = TARGET_FF; break;
1726 case 'n': c = TARGET_NEWLINE; break;
1727 case 'r': c = TARGET_CR; break;
1728 case 't': c = TARGET_TAB; break;
1729 case 'v': c = TARGET_VT; break;
1731 case '(': case '{': case '[': case '%':
1732 /* '\(', etc, are used at beginning of line to avoid confusing Emacs.
1733 '\%' is used to prevent SCCS from getting confused. */
1734 unknown = CPP_PEDANTIC (pfile);
1735 break;
1737 case 'a':
1738 if (CPP_WTRADITIONAL (pfile))
1739 cpp_warning (pfile, "the meaning of '\\a' varies with -traditional");
1740 if (!traditional)
1741 c = TARGET_BELL;
1742 break;
1744 case 'e': case 'E':
1745 if (CPP_PEDANTIC (pfile))
1746 cpp_pedwarn (pfile, "non-ISO-standard escape sequence, '\\%c'", c);
1747 c = TARGET_ESC;
1748 break;
1750 case 'u': case 'U':
1751 unknown = maybe_read_ucs (pfile, &str, limit, &c);
1752 break;
1754 case 'x':
1755 if (CPP_WTRADITIONAL (pfile))
1756 cpp_warning (pfile, "the meaning of '\\x' varies with -traditional");
1758 if (!traditional)
1760 unsigned int i = 0, overflow = 0;
1761 int digits_found = 0;
1763 while (str < limit)
1765 c = *str;
1766 if (! ISXDIGIT (c))
1767 break;
1768 str++;
1769 overflow |= i ^ (i << 4 >> 4);
1770 i = (i << 4) + hex_digit_value (c);
1771 digits_found = 1;
1774 if (!digits_found)
1775 cpp_error (pfile, "\\x used with no following hex digits");
1777 if (overflow | (i != (i & mask)))
1779 cpp_pedwarn (pfile, "hex escape sequence out of range");
1780 i &= mask;
1782 c = i;
1784 break;
1786 case '0': case '1': case '2': case '3':
1787 case '4': case '5': case '6': case '7':
1789 unsigned int i = c - '0';
1790 int count = 0;
1792 while (str < limit && ++count < 3)
1794 c = *str;
1795 if (c < '0' || c > '7')
1796 break;
1797 str++;
1798 i = (i << 3) + c - '0';
1801 if (i != (i & mask))
1803 cpp_pedwarn (pfile, "octal escape sequence out of range");
1804 i &= mask;
1806 c = i;
1808 break;
1810 default:
1811 unknown = 1;
1812 break;
1815 if (unknown)
1817 if (ISGRAPH (c))
1818 cpp_pedwarn (pfile, "unknown escape sequence '\\%c'", c);
1819 else
1820 cpp_pedwarn (pfile, "unknown escape sequence: '\\%03o'", c);
1823 if (c > mask)
1824 cpp_pedwarn (pfile, "escape sequence out of range for character");
1826 *pstr = str;
1827 return c;
1830 #ifndef MAX_CHAR_TYPE_SIZE
1831 #define MAX_CHAR_TYPE_SIZE CHAR_TYPE_SIZE
1832 #endif
1834 #ifndef MAX_WCHAR_TYPE_SIZE
1835 #define MAX_WCHAR_TYPE_SIZE WCHAR_TYPE_SIZE
1836 #endif
1838 /* Interpret a (possibly wide) character constant in TOKEN.
1839 WARN_MULTI warns about multi-character charconsts, if not
1840 TRADITIONAL. TRADITIONAL also indicates not to interpret escapes
1841 that did not exist in traditional C. PCHARS_SEEN points to a
1842 variable that is filled in with the number of characters seen. */
1843 HOST_WIDE_INT
1844 cpp_interpret_charconst (pfile, token, warn_multi, traditional, pchars_seen)
1845 cpp_reader *pfile;
1846 const cpp_token *token;
1847 int warn_multi;
1848 int traditional;
1849 unsigned int *pchars_seen;
1851 const unsigned char *str = token->val.str.text;
1852 const unsigned char *limit = str + token->val.str.len;
1853 unsigned int chars_seen = 0;
1854 unsigned int width, max_chars, c;
1855 unsigned HOST_WIDE_INT mask;
1856 HOST_WIDE_INT result = 0;
1858 #ifdef MULTIBYTE_CHARS
1859 (void) local_mbtowc (NULL, NULL, 0);
1860 #endif
1862 /* Width in bits. */
1863 if (token->type == CPP_CHAR)
1864 width = MAX_CHAR_TYPE_SIZE;
1865 else
1866 width = MAX_WCHAR_TYPE_SIZE;
1868 if (width < HOST_BITS_PER_WIDE_INT)
1869 mask = ((unsigned HOST_WIDE_INT) 1 << width) - 1;
1870 else
1871 mask = ~0;
1872 max_chars = HOST_BITS_PER_WIDE_INT / width;
1874 while (str < limit)
1876 #ifdef MULTIBYTE_CHARS
1877 wchar_t wc;
1878 int char_len;
1880 char_len = local_mbtowc (&wc, str, limit - str);
1881 if (char_len == -1)
1883 cpp_warning (pfile, "ignoring invalid multibyte character");
1884 c = *str++;
1886 else
1888 str += char_len;
1889 c = wc;
1891 #else
1892 c = *str++;
1893 #endif
1895 if (c == '\\')
1896 c = cpp_parse_escape (pfile, &str, limit, mask, traditional);
1898 #ifdef MAP_CHARACTER
1899 if (ISPRINT (c))
1900 c = MAP_CHARACTER (c);
1901 #endif
1903 /* Merge character into result; ignore excess chars. */
1904 if (++chars_seen <= max_chars)
1906 if (width < HOST_BITS_PER_WIDE_INT)
1907 result = (result << width) | (c & mask);
1908 else
1909 result = c;
1913 if (chars_seen == 0)
1914 cpp_error (pfile, "empty character constant");
1915 else if (chars_seen > max_chars)
1917 chars_seen = max_chars;
1918 cpp_warning (pfile, "character constant too long");
1920 else if (chars_seen > 1 && !traditional && warn_multi)
1921 cpp_warning (pfile, "multi-character character constant");
1923 /* If char type is signed, sign-extend the constant. The
1924 __CHAR_UNSIGNED__ macro is set by the driver if appropriate. */
1925 if (token->type == CPP_CHAR && chars_seen)
1927 unsigned int nbits = chars_seen * width;
1928 unsigned int mask = (unsigned int) ~0 >> (HOST_BITS_PER_INT - nbits);
1930 if (pfile->spec_nodes.n__CHAR_UNSIGNED__->type == NT_MACRO
1931 || ((result >> (nbits - 1)) & 1) == 0)
1932 result &= mask;
1933 else
1934 result |= ~mask;
1937 *pchars_seen = chars_seen;
1938 return result;
/* Memory buffers.  Changing these three definitions can have a
   dramatic effect on performance.  The values here are reasonable
   defaults, but might be tuned.  If you adjust them, be sure to test
   across a range of uses of cpplib, including heavy nested
   function-like macro expansion.  Also check the change in peak
   memory usage (NJAMD is a good tool for this).

   MIN_BUFF_SIZE is the smallest buffer new_buff will create.
   BUFF_SIZE_UPPER_BOUND is the largest free buffer _cpp_get_buff will
   hand out for a request of MIN_SIZE bytes.  EXTENDED_BUFF_SIZE is
   the size requested when extending BUFF: MIN_EXTRA plus twice the
   bytes between BUFF's cur and limit.  Every macro argument is
   parenthesized so that expression arguments expand correctly.  */
#define MIN_BUFF_SIZE 8000
#define BUFF_SIZE_UPPER_BOUND(MIN_SIZE) (MIN_BUFF_SIZE + (MIN_SIZE) * 3 / 2)
#define EXTENDED_BUFF_SIZE(BUFF, MIN_EXTRA) \
	((MIN_EXTRA) + ((BUFF)->limit - (BUFF)->cur) * 2)

#if MIN_BUFF_SIZE > BUFF_SIZE_UPPER_BOUND (0)
#error BUFF_SIZE_UPPER_BOUND must be at least as large as MIN_BUFF_SIZE!
#endif

/* Used only to measure alignment: the offset of U within this
   structure is the alignment the compiler gives a union of a double
   and a pointer.  */
struct dummy
{
  char c;
  union
  {
    double d;
    int *p;
  } u;
};

#define DEFAULT_ALIGNMENT (offsetof (struct dummy, u))
/* Round SIZE up to a multiple of ALIGN, which must be a power of 2.  */
#define CPP_ALIGN(size, align) (((size) + ((align) - 1)) & ~((align) - 1))
1969 /* Create a new allocation buffer. Place the control block at the end
1970 of the buffer, so that buffer overflows will cause immediate chaos. */
1971 static _cpp_buff *
1972 new_buff (len)
1973 size_t len;
1975 _cpp_buff *result;
1976 unsigned char *base;
1978 if (len < MIN_BUFF_SIZE)
1979 len = MIN_BUFF_SIZE;
1980 len = CPP_ALIGN (len, DEFAULT_ALIGNMENT);
1982 base = xmalloc (len + sizeof (_cpp_buff));
1983 result = (_cpp_buff *) (base + len);
1984 result->base = base;
1985 result->cur = base;
1986 result->limit = base + len;
1987 result->next = NULL;
1988 return result;
1991 /* Place a chain of unwanted allocation buffers on the free list. */
1992 void
1993 _cpp_release_buff (pfile, buff)
1994 cpp_reader *pfile;
1995 _cpp_buff *buff;
1997 _cpp_buff *end = buff;
1999 while (end->next)
2000 end = end->next;
2001 end->next = pfile->free_buffs;
2002 pfile->free_buffs = buff;
2005 /* Return a free buffer of size at least MIN_SIZE. */
2006 _cpp_buff *
2007 _cpp_get_buff (pfile, min_size)
2008 cpp_reader *pfile;
2009 size_t min_size;
2011 _cpp_buff *result, **p;
2013 for (p = &pfile->free_buffs;; p = &(*p)->next)
2015 size_t size;
2017 if (*p == NULL)
2018 return new_buff (min_size);
2019 result = *p;
2020 size = result->limit - result->base;
2021 /* Return a buffer that's big enough, but don't waste one that's
2022 way too big. */
2023 if (size >= min_size && size <= BUFF_SIZE_UPPER_BOUND (min_size))
2024 break;
2027 *p = result->next;
2028 result->next = NULL;
2029 result->cur = result->base;
2030 return result;
2033 /* Creates a new buffer with enough space to hold the uncommitted
2034 remaining bytes of BUFF, and at least MIN_EXTRA more bytes. Copies
2035 the excess bytes to the new buffer. Chains the new buffer after
2036 BUFF, and returns the new buffer. */
2037 _cpp_buff *
2038 _cpp_append_extend_buff (pfile, buff, min_extra)
2039 cpp_reader *pfile;
2040 _cpp_buff *buff;
2041 size_t min_extra;
2043 size_t size = EXTENDED_BUFF_SIZE (buff, min_extra);
2044 _cpp_buff *new_buff = _cpp_get_buff (pfile, size);
2046 buff->next = new_buff;
2047 memcpy (new_buff->base, buff->cur, BUFF_ROOM (buff));
2048 return new_buff;
2051 /* Creates a new buffer with enough space to hold the uncommitted
2052 remaining bytes of the buffer pointed to by BUFF, and at least
2053 MIN_EXTRA more bytes. Copies the excess bytes to the new buffer.
2054 Chains the new buffer before the buffer pointed to by BUFF, and
2055 updates the pointer to point to the new buffer. */
2056 void
2057 _cpp_extend_buff (pfile, pbuff, min_extra)
2058 cpp_reader *pfile;
2059 _cpp_buff **pbuff;
2060 size_t min_extra;
2062 _cpp_buff *new_buff, *old_buff = *pbuff;
2063 size_t size = EXTENDED_BUFF_SIZE (old_buff, min_extra);
2065 new_buff = _cpp_get_buff (pfile, size);
2066 memcpy (new_buff->base, old_buff->cur, BUFF_ROOM (old_buff));
2067 new_buff->next = old_buff;
2068 *pbuff = new_buff;
2071 /* Free a chain of buffers starting at BUFF. */
2072 void
2073 _cpp_free_buff (buff)
2074 _cpp_buff *buff;
2076 _cpp_buff *next;
2078 for (; buff; buff = next)
2080 next = buff->next;
2081 free (buff->base);
2085 /* Allocate permanent, unaligned storage of length LEN. */
2086 unsigned char *
2087 _cpp_unaligned_alloc (pfile, len)
2088 cpp_reader *pfile;
2089 size_t len;
2091 _cpp_buff *buff = pfile->u_buff;
2092 unsigned char *result = buff->cur;
2094 if (len > (size_t) (buff->limit - result))
2096 buff = _cpp_get_buff (pfile, len);
2097 buff->next = pfile->u_buff;
2098 pfile->u_buff = buff;
2099 result = buff->cur;
2102 buff->cur = result + len;
2103 return result;
2106 /* Allocate permanent, unaligned storage of length LEN from a_buff.
2107 That buffer is used for growing allocations when saving macro
2108 replacement lists in a #define, and when parsing an answer to an
2109 assertion in #assert, #unassert or #if (and therefore possibly
2110 whilst expanding macros). It therefore must not be used by any
2111 code that they might call: specifically the lexer and the guts of
2112 the macro expander.
2114 All existing other uses clearly fit this restriction: storing
2115 registered pragmas during initialization. */
2116 unsigned char *
2117 _cpp_aligned_alloc (pfile, len)
2118 cpp_reader *pfile;
2119 size_t len;
2121 _cpp_buff *buff = pfile->a_buff;
2122 unsigned char *result = buff->cur;
2124 if (len > (size_t) (buff->limit - result))
2126 buff = _cpp_get_buff (pfile, len);
2127 buff->next = pfile->a_buff;
2128 pfile->a_buff = buff;
2129 result = buff->cur;
2132 buff->cur = result + len;
2133 return result;