2003-04-11 Eric Christopher <echristo@redhat.com>
[official-gcc.git] / gcc / cpplex.c
blob93e04b8b5293f8b7dba5620887a72b7e22906745
1 /* CPP Library - lexical analysis.
2 Copyright (C) 2000, 2001, 2002 Free Software Foundation, Inc.
3 Contributed by Per Bothner, 1994-95.
4 Based on CCCP program by Paul Rubin, June 1986
5 Adapted to ANSI C, Richard Stallman, Jan 1987
6 Broken out to separate file, Zack Weinberg, Mar 2000
7 Single-pass line tokenization by Neil Booth, April 2000
9 This program is free software; you can redistribute it and/or modify it
10 under the terms of the GNU General Public License as published by the
11 Free Software Foundation; either version 2, or (at your option) any
12 later version.
14 This program is distributed in the hope that it will be useful,
15 but WITHOUT ANY WARRANTY; without even the implied warranty of
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 GNU General Public License for more details.
19 You should have received a copy of the GNU General Public License
20 along with this program; if not, write to the Free Software
21 Foundation, 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
23 #include "config.h"
24 #include "system.h"
25 #include "coretypes.h"
26 #include "tm.h"
27 #include "cpplib.h"
28 #include "cpphash.h"
30 #ifdef MULTIBYTE_CHARS
31 #include "mbchar.h"
32 #include <locale.h>
33 #endif
/* Spelling categories used by the token_spellings table.  Tokens
   with SPELL_STRING store their spelling in the token list, and its
   length in the token->val.name.len.  */
enum spell_type
{
  SPELL_OPERATOR = 0,
  SPELL_CHAR,
  SPELL_IDENT,
  SPELL_NUMBER,
  SPELL_STRING,
  SPELL_NONE
};
47 struct token_spelling
49 enum spell_type category;
50 const unsigned char *name;
53 static const unsigned char *const digraph_spellings[] =
54 { U"%:", U"%:%:", U"<:", U":>", U"<%", U"%>" };
56 #define OP(e, s) { SPELL_OPERATOR, U s },
57 #define TK(e, s) { s, U STRINGX (e) },
58 static const struct token_spelling token_spellings[N_TTYPES] = { TTYPE_TABLE };
59 #undef OP
60 #undef TK
/* Spelling category and name of TOKEN, looked up by its type.  */
#define TOKEN_SPELL(token) (token_spellings[(token)->type].category)
#define TOKEN_NAME(token) (token_spellings[(token)->type].name)

/* Rewind the read pointer to the position saved in backup_to.  The
   expansion refers to a variable named `buffer', which must be in
   scope wherever the macro is used.  */
#define BACKUP() do {buffer->cur = buffer->backup_to;} while (0)
66 static void handle_newline PARAMS ((cpp_reader *));
67 static cppchar_t skip_escaped_newlines PARAMS ((cpp_reader *));
68 static cppchar_t get_effective_char PARAMS ((cpp_reader *));
70 static int skip_block_comment PARAMS ((cpp_reader *));
71 static int skip_line_comment PARAMS ((cpp_reader *));
72 static void adjust_column PARAMS ((cpp_reader *));
73 static int skip_whitespace PARAMS ((cpp_reader *, cppchar_t));
74 static cpp_hashnode *parse_identifier PARAMS ((cpp_reader *));
75 static uchar *parse_slow PARAMS ((cpp_reader *, const uchar *, int,
76 unsigned int *));
77 static void parse_number PARAMS ((cpp_reader *, cpp_string *, int));
78 static int unescaped_terminator_p PARAMS ((cpp_reader *, const uchar *));
79 static void parse_string PARAMS ((cpp_reader *, cpp_token *, cppchar_t));
80 static bool trigraph_p PARAMS ((cpp_reader *));
81 static void save_comment PARAMS ((cpp_reader *, cpp_token *, const uchar *,
82 cppchar_t));
83 static bool continue_after_nul PARAMS ((cpp_reader *));
84 static int name_p PARAMS ((cpp_reader *, const cpp_string *));
85 static int maybe_read_ucs PARAMS ((cpp_reader *, const unsigned char **,
86 const unsigned char *, cppchar_t *));
87 static tokenrun *next_tokenrun PARAMS ((tokenrun *));
89 static unsigned int hex_digit_value PARAMS ((unsigned int));
90 static _cpp_buff *new_buff PARAMS ((size_t));
/* Switch LC_CTYPE to the native locale for multibyte conversions and
   load literal_codeset from the LANG environment variable.  Does
   nothing unless MULTIBYTE_CHARS is defined.  */
void
_cpp_init_mbchar ()
{
#ifdef MULTIBYTE_CHARS
  setlocale (LC_CTYPE, "");
  GET_ENVIRONMENT (literal_codeset, "LANG");
#endif
}
102 /* Utility routine:
104 Compares, the token TOKEN to the NUL-terminated string STRING.
105 TOKEN must be a CPP_NAME. Returns 1 for equal, 0 for unequal. */
107 cpp_ideq (token, string)
108 const cpp_token *token;
109 const char *string;
111 if (token->type != CPP_NAME)
112 return 0;
114 return !ustrcmp (NODE_NAME (token->val.node), (const uchar *) string);
117 /* Call when meeting a newline, assumed to be in buffer->cur[-1].
118 Returns with buffer->cur pointing to the character immediately
119 following the newline (combination). */
120 static void
121 handle_newline (pfile)
122 cpp_reader *pfile;
124 cpp_buffer *buffer = pfile->buffer;
126 /* Handle CR-LF and LF-CR. Most other implementations (e.g. java)
127 only accept CR-LF; maybe we should fall back to that behavior? */
128 if (buffer->cur[-1] + buffer->cur[0] == '\r' + '\n')
129 buffer->cur++;
131 buffer->line_base = buffer->cur;
132 buffer->col_adjust = 0;
133 pfile->line++;
136 /* Subroutine of skip_escaped_newlines; called when a 3-character
137 sequence beginning with "??" is encountered. buffer->cur points to
138 the second '?'.
140 Warn if necessary, and returns true if the sequence forms a
141 trigraph and the trigraph should be honored. */
142 static bool
143 trigraph_p (pfile)
144 cpp_reader *pfile;
146 cpp_buffer *buffer = pfile->buffer;
147 cppchar_t from_char = buffer->cur[1];
148 bool accept;
150 if (!_cpp_trigraph_map[from_char])
151 return false;
153 accept = CPP_OPTION (pfile, trigraphs);
155 /* Don't warn about trigraphs in comments. */
156 if (CPP_OPTION (pfile, warn_trigraphs) && !pfile->state.lexing_comment)
158 if (accept)
159 cpp_error_with_line (pfile, DL_WARNING,
160 pfile->line, CPP_BUF_COL (buffer) - 1,
161 "trigraph ??%c converted to %c",
162 (int) from_char,
163 (int) _cpp_trigraph_map[from_char]);
164 else if (buffer->cur != buffer->last_Wtrigraphs)
166 buffer->last_Wtrigraphs = buffer->cur;
167 cpp_error_with_line (pfile, DL_WARNING,
168 pfile->line, CPP_BUF_COL (buffer) - 1,
169 "trigraph ??%c ignored", (int) from_char);
173 return accept;
176 /* Skips any escaped newlines introduced by '?' or a '\\', assumed to
177 lie in buffer->cur[-1]. Returns the next byte, which will be in
178 buffer->cur[-1]. This routine performs preprocessing stages 1 and
179 2 of the ISO C standard. */
180 static cppchar_t
181 skip_escaped_newlines (pfile)
182 cpp_reader *pfile;
184 cpp_buffer *buffer = pfile->buffer;
185 cppchar_t next = buffer->cur[-1];
187 /* Only do this if we apply stages 1 and 2. */
188 if (!buffer->from_stage3)
190 const unsigned char *saved_cur;
191 cppchar_t next1;
195 if (next == '?')
197 if (buffer->cur[0] != '?' || !trigraph_p (pfile))
198 break;
200 /* Translate the trigraph. */
201 next = _cpp_trigraph_map[buffer->cur[1]];
202 buffer->cur += 2;
203 if (next != '\\')
204 break;
207 if (buffer->cur == buffer->rlimit)
208 break;
210 /* We have a backslash, and room for at least one more
211 character. Skip horizontal whitespace. */
212 saved_cur = buffer->cur;
214 next1 = *buffer->cur++;
215 while (is_nvspace (next1) && buffer->cur < buffer->rlimit);
217 if (!is_vspace (next1))
219 buffer->cur = saved_cur;
220 break;
223 if (saved_cur != buffer->cur - 1
224 && !pfile->state.lexing_comment)
225 cpp_error (pfile, DL_WARNING,
226 "backslash and newline separated by space");
228 handle_newline (pfile);
229 buffer->backup_to = buffer->cur;
230 if (buffer->cur == buffer->rlimit)
232 cpp_error (pfile, DL_PEDWARN,
233 "backslash-newline at end of file");
234 next = EOF;
236 else
237 next = *buffer->cur++;
239 while (next == '\\' || next == '?');
242 return next;
245 /* Obtain the next character, after trigraph conversion and skipping
246 an arbitrarily long string of escaped newlines. The common case of
247 no trigraphs or escaped newlines falls through quickly. On return,
248 buffer->backup_to points to where to return to if the character is
249 not to be processed. */
250 static cppchar_t
251 get_effective_char (pfile)
252 cpp_reader *pfile;
254 cppchar_t next;
255 cpp_buffer *buffer = pfile->buffer;
257 buffer->backup_to = buffer->cur;
258 next = *buffer->cur++;
259 if (__builtin_expect (next == '?' || next == '\\', 0))
260 next = skip_escaped_newlines (pfile);
262 return next;
265 /* Skip a C-style block comment. We find the end of the comment by
266 seeing if an asterisk is before every '/' we encounter. Returns
267 nonzero if comment terminated by EOF, zero otherwise. */
268 static int
269 skip_block_comment (pfile)
270 cpp_reader *pfile;
272 cpp_buffer *buffer = pfile->buffer;
273 cppchar_t c = EOF, prevc = EOF;
275 pfile->state.lexing_comment = 1;
276 while (buffer->cur != buffer->rlimit)
278 prevc = c, c = *buffer->cur++;
280 /* FIXME: For speed, create a new character class of characters
281 of interest inside block comments. */
282 if (c == '?' || c == '\\')
283 c = skip_escaped_newlines (pfile);
285 /* People like decorating comments with '*', so check for '/'
286 instead for efficiency. */
287 if (c == '/')
289 if (prevc == '*')
290 break;
292 /* Warn about potential nested comments, but not if the '/'
293 comes immediately before the true comment delimiter.
294 Don't bother to get it right across escaped newlines. */
295 if (CPP_OPTION (pfile, warn_comments)
296 && buffer->cur[0] == '*' && buffer->cur[1] != '/')
297 cpp_error_with_line (pfile, DL_WARNING,
298 pfile->line, CPP_BUF_COL (buffer),
299 "\"/*\" within comment");
301 else if (is_vspace (c))
302 handle_newline (pfile);
303 else if (c == '\t')
304 adjust_column (pfile);
307 pfile->state.lexing_comment = 0;
308 return c != '/' || prevc != '*';
311 /* Skip a C++ line comment, leaving buffer->cur pointing to the
312 terminating newline. Handles escaped newlines. Returns nonzero
313 if a multiline comment. */
314 static int
315 skip_line_comment (pfile)
316 cpp_reader *pfile;
318 cpp_buffer *buffer = pfile->buffer;
319 unsigned int orig_line = pfile->line;
320 cppchar_t c;
321 #ifdef MULTIBYTE_CHARS
322 wchar_t wc;
323 int char_len;
324 #endif
326 pfile->state.lexing_comment = 1;
327 #ifdef MULTIBYTE_CHARS
328 /* Reset multibyte conversion state. */
329 (void) local_mbtowc (NULL, NULL, 0);
330 #endif
333 if (buffer->cur == buffer->rlimit)
334 goto at_eof;
336 #ifdef MULTIBYTE_CHARS
337 char_len = local_mbtowc (&wc, (const char *) buffer->cur,
338 buffer->rlimit - buffer->cur);
339 if (char_len == -1)
341 cpp_error (pfile, DL_WARNING,
342 "ignoring invalid multibyte character");
343 char_len = 1;
344 c = *buffer->cur++;
346 else
348 buffer->cur += char_len;
349 c = wc;
351 #else
352 c = *buffer->cur++;
353 #endif
354 if (c == '?' || c == '\\')
355 c = skip_escaped_newlines (pfile);
357 while (!is_vspace (c));
359 /* Step back over the newline, except at EOF. */
360 buffer->cur--;
361 at_eof:
363 pfile->state.lexing_comment = 0;
364 return orig_line != pfile->line;
367 /* pfile->buffer->cur is one beyond the \t character. Update
368 col_adjust so we track the column correctly. */
369 static void
370 adjust_column (pfile)
371 cpp_reader *pfile;
373 cpp_buffer *buffer = pfile->buffer;
374 unsigned int col = CPP_BUF_COL (buffer) - 1; /* Zero-based column. */
376 /* Round it up to multiple of the tabstop, but subtract 1 since the
377 tab itself occupies a character position. */
378 buffer->col_adjust += (CPP_OPTION (pfile, tabstop)
379 - col % CPP_OPTION (pfile, tabstop)) - 1;
382 /* Skips whitespace, saving the next non-whitespace character.
383 Adjusts pfile->col_adjust to account for tabs. Without this,
384 tokens might be assigned an incorrect column. */
385 static int
386 skip_whitespace (pfile, c)
387 cpp_reader *pfile;
388 cppchar_t c;
390 cpp_buffer *buffer = pfile->buffer;
391 unsigned int warned = 0;
395 /* Horizontal space always OK. */
396 if (c == ' ')
398 else if (c == '\t')
399 adjust_column (pfile);
400 /* Just \f \v or \0 left. */
401 else if (c == '\0')
403 if (buffer->cur - 1 == buffer->rlimit)
404 return 0;
405 if (!warned)
407 cpp_error (pfile, DL_WARNING, "null character(s) ignored");
408 warned = 1;
411 else if (pfile->state.in_directive && CPP_PEDANTIC (pfile))
412 cpp_error_with_line (pfile, DL_PEDWARN, pfile->line,
413 CPP_BUF_COL (buffer),
414 "%s in preprocessing directive",
415 c == '\f' ? "form feed" : "vertical tab");
417 c = *buffer->cur++;
419 /* We only want non-vertical space, i.e. ' ' \t \f \v \0. */
420 while (is_nvspace (c));
422 buffer->cur--;
423 return 1;
426 /* See if the characters of a number token are valid in a name (no
427 '.', '+' or '-'). */
428 static int
429 name_p (pfile, string)
430 cpp_reader *pfile;
431 const cpp_string *string;
433 unsigned int i;
435 for (i = 0; i < string->len; i++)
436 if (!is_idchar (string->text[i]))
437 return 0;
439 return 1;
442 /* Parse an identifier, skipping embedded backslash-newlines. This is
443 a critical inner loop. The common case is an identifier which has
444 not been split by backslash-newline, does not contain a dollar
445 sign, and has already been scanned (roughly 10:1 ratio of
446 seen:unseen identifiers in normal code; the distribution is
447 Poisson-like). Second most common case is a new identifier, not
448 split and no dollar sign. The other possibilities are rare and
449 have been relegated to parse_slow. */
450 static cpp_hashnode *
451 parse_identifier (pfile)
452 cpp_reader *pfile;
454 cpp_hashnode *result;
455 const uchar *cur, *base;
457 /* Fast-path loop. Skim over a normal identifier.
458 N.B. ISIDNUM does not include $. */
459 cur = pfile->buffer->cur;
460 while (ISIDNUM (*cur))
461 cur++;
463 /* Check for slow-path cases. */
464 if (*cur == '?' || *cur == '\\' || *cur == '$')
466 unsigned int len;
468 base = parse_slow (pfile, cur, 0, &len);
469 result = (cpp_hashnode *)
470 ht_lookup (pfile->hash_table, base, len, HT_ALLOCED);
472 else
474 base = pfile->buffer->cur - 1;
475 pfile->buffer->cur = cur;
476 result = (cpp_hashnode *)
477 ht_lookup (pfile->hash_table, base, cur - base, HT_ALLOC);
480 /* Rarely, identifiers require diagnostics when lexed.
481 XXX Has to be forced out of the fast path. */
482 if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
483 && !pfile->state.skipping, 0))
485 /* It is allowed to poison the same identifier twice. */
486 if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
487 cpp_error (pfile, DL_ERROR, "attempt to use poisoned \"%s\"",
488 NODE_NAME (result));
490 /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
491 replacement list of a variadic macro. */
492 if (result == pfile->spec_nodes.n__VA_ARGS__
493 && !pfile->state.va_args_ok)
494 cpp_error (pfile, DL_PEDWARN,
495 "__VA_ARGS__ can only appear in the expansion of a C99 variadic macro");
498 return result;
501 /* Slow path. This handles numbers and identifiers which have been
502 split, or contain dollar signs. The part of the token from
503 PFILE->buffer->cur-1 to CUR has already been scanned. NUMBER_P is
504 1 if it's a number, and 2 if it has a leading period. Returns a
505 pointer to the token's NUL-terminated spelling in permanent
506 storage, and sets PLEN to its length. */
507 static uchar *
508 parse_slow (pfile, cur, number_p, plen)
509 cpp_reader *pfile;
510 const uchar *cur;
511 int number_p;
512 unsigned int *plen;
514 cpp_buffer *buffer = pfile->buffer;
515 const uchar *base = buffer->cur - 1;
516 struct obstack *stack = &pfile->hash_table->stack;
517 unsigned int c, prevc, saw_dollar = 0;
519 /* Place any leading period. */
520 if (number_p == 2)
521 obstack_1grow (stack, '.');
523 /* Copy the part of the token which is known to be okay. */
524 obstack_grow (stack, base, cur - base);
526 /* Now process the part which isn't. We are looking at one of
527 '$', '\\', or '?' on entry to this loop. */
528 prevc = cur[-1];
529 c = *cur++;
530 buffer->cur = cur;
531 for (;;)
533 /* Potential escaped newline? */
534 buffer->backup_to = buffer->cur - 1;
535 if (c == '?' || c == '\\')
536 c = skip_escaped_newlines (pfile);
538 if (!is_idchar (c))
540 if (!number_p)
541 break;
542 if (c != '.' && !VALID_SIGN (c, prevc))
543 break;
546 /* Handle normal identifier characters in this loop. */
549 prevc = c;
550 obstack_1grow (stack, c);
552 if (c == '$')
553 saw_dollar++;
555 c = *buffer->cur++;
557 while (is_idchar (c));
560 /* Step back over the unwanted char. */
561 BACKUP ();
563 /* $ is not an identifier character in the standard, but is commonly
564 accepted as an extension. Don't warn about it in skipped
565 conditional blocks. */
566 if (saw_dollar && CPP_PEDANTIC (pfile) && ! pfile->state.skipping)
567 cpp_error (pfile, DL_PEDWARN, "'$' character(s) in identifier or number");
569 /* Identifiers and numbers are null-terminated. */
570 *plen = obstack_object_size (stack);
571 obstack_1grow (stack, '\0');
572 return obstack_finish (stack);
575 /* Parse a number, beginning with character C, skipping embedded
576 backslash-newlines. LEADING_PERIOD is nonzero if there was a "."
577 before C. Place the result in NUMBER. */
578 static void
579 parse_number (pfile, number, leading_period)
580 cpp_reader *pfile;
581 cpp_string *number;
582 int leading_period;
584 const uchar *cur;
586 /* Fast-path loop. Skim over a normal number.
587 N.B. ISIDNUM does not include $. */
588 cur = pfile->buffer->cur;
589 while (ISIDNUM (*cur) || *cur == '.' || VALID_SIGN (*cur, cur[-1]))
590 cur++;
592 /* Check for slow-path cases. */
593 if (*cur == '?' || *cur == '\\' || *cur == '$')
594 number->text = parse_slow (pfile, cur, 1 + leading_period, &number->len);
595 else
597 const uchar *base = pfile->buffer->cur - 1;
598 uchar *dest;
600 number->len = cur - base + leading_period;
601 dest = _cpp_unaligned_alloc (pfile, number->len + 1);
602 dest[number->len] = '\0';
603 number->text = dest;
605 if (leading_period)
606 *dest++ = '.';
607 memcpy (dest, base, cur - base);
608 pfile->buffer->cur = cur;
612 /* Subroutine of parse_string. */
613 static int
614 unescaped_terminator_p (pfile, dest)
615 cpp_reader *pfile;
616 const unsigned char *dest;
618 const unsigned char *start, *temp;
620 /* In #include-style directives, terminators are not escapable. */
621 if (pfile->state.angled_headers)
622 return 1;
624 start = BUFF_FRONT (pfile->u_buff);
626 /* An odd number of consecutive backslashes represents an escaped
627 terminator. */
628 for (temp = dest; temp > start && temp[-1] == '\\'; temp--)
631 return ((dest - temp) & 1) == 0;
634 /* Parses a string, character constant, or angle-bracketed header file
635 name. Handles embedded trigraphs and escaped newlines. The stored
636 string is guaranteed NUL-terminated, but it is not guaranteed that
637 this is the first NUL since embedded NULs are preserved.
639 When this function returns, buffer->cur points to the next
640 character to be processed. */
641 static void
642 parse_string (pfile, token, terminator)
643 cpp_reader *pfile;
644 cpp_token *token;
645 cppchar_t terminator;
647 cpp_buffer *buffer = pfile->buffer;
648 unsigned char *dest, *limit;
649 cppchar_t c;
650 bool warned_nulls = false;
651 #ifdef MULTIBYTE_CHARS
652 wchar_t wc;
653 int char_len;
654 #endif
656 dest = BUFF_FRONT (pfile->u_buff);
657 limit = BUFF_LIMIT (pfile->u_buff);
659 #ifdef MULTIBYTE_CHARS
660 /* Reset multibyte conversion state. */
661 (void) local_mbtowc (NULL, NULL, 0);
662 #endif
663 for (;;)
665 /* We need room for another char, possibly the terminating NUL. */
666 if ((size_t) (limit - dest) < 1)
668 size_t len_so_far = dest - BUFF_FRONT (pfile->u_buff);
669 _cpp_extend_buff (pfile, &pfile->u_buff, 2);
670 dest = BUFF_FRONT (pfile->u_buff) + len_so_far;
671 limit = BUFF_LIMIT (pfile->u_buff);
674 #ifdef MULTIBYTE_CHARS
675 char_len = local_mbtowc (&wc, (const char *) buffer->cur,
676 buffer->rlimit - buffer->cur);
677 if (char_len == -1)
679 cpp_error (pfile, DL_WARNING,
680 "ignoring invalid multibyte character");
681 char_len = 1;
682 c = *buffer->cur++;
684 else
686 buffer->cur += char_len;
687 c = wc;
689 #else
690 c = *buffer->cur++;
691 #endif
693 /* Handle trigraphs, escaped newlines etc. */
694 if (c == '?' || c == '\\')
695 c = skip_escaped_newlines (pfile);
697 if (c == terminator)
699 if (unescaped_terminator_p (pfile, dest))
700 break;
702 else if (is_vspace (c))
704 /* No string literal may extend over multiple lines. In
705 assembly language, suppress the error except for <>
706 includes. This is a kludge around not knowing where
707 comments are. */
708 unterminated:
709 if (CPP_OPTION (pfile, lang) != CLK_ASM || terminator == '>')
710 cpp_error (pfile, DL_ERROR, "missing terminating %c character",
711 (int) terminator);
712 buffer->cur--;
713 break;
715 else if (c == '\0')
717 if (buffer->cur - 1 == buffer->rlimit)
718 goto unterminated;
719 if (!warned_nulls)
721 warned_nulls = true;
722 cpp_error (pfile, DL_WARNING,
723 "null character(s) preserved in literal");
726 #ifdef MULTIBYTE_CHARS
727 if (char_len > 1)
729 for ( ; char_len > 0; --char_len)
730 *dest++ = (*buffer->cur - char_len);
732 else
733 #endif
734 *dest++ = c;
737 *dest = '\0';
739 token->val.str.text = BUFF_FRONT (pfile->u_buff);
740 token->val.str.len = dest - BUFF_FRONT (pfile->u_buff);
741 BUFF_FRONT (pfile->u_buff) = dest + 1;
744 /* The stored comment includes the comment start and any terminator. */
745 static void
746 save_comment (pfile, token, from, type)
747 cpp_reader *pfile;
748 cpp_token *token;
749 const unsigned char *from;
750 cppchar_t type;
752 unsigned char *buffer;
753 unsigned int len, clen;
755 len = pfile->buffer->cur - from + 1; /* + 1 for the initial '/'. */
757 /* C++ comments probably (not definitely) have moved past a new
758 line, which we don't want to save in the comment. */
759 if (is_vspace (pfile->buffer->cur[-1]))
760 len--;
762 /* If we are currently in a directive, then we need to store all
763 C++ comments as C comments internally, and so we need to
764 allocate a little extra space in that case.
766 Note that the only time we encounter a directive here is
767 when we are saving comments in a "#define". */
768 clen = (pfile->state.in_directive && type == '/') ? len + 2 : len;
770 buffer = _cpp_unaligned_alloc (pfile, clen);
772 token->type = CPP_COMMENT;
773 token->val.str.len = clen;
774 token->val.str.text = buffer;
776 buffer[0] = '/';
777 memcpy (buffer + 1, from, len - 1);
779 /* Finish conversion to a C comment, if necessary. */
780 if (pfile->state.in_directive && type == '/')
782 buffer[1] = '*';
783 buffer[clen - 2] = '*';
784 buffer[clen - 1] = '/';
788 /* Allocate COUNT tokens for RUN. */
789 void
790 _cpp_init_tokenrun (run, count)
791 tokenrun *run;
792 unsigned int count;
794 run->base = xnewvec (cpp_token, count);
795 run->limit = run->base + count;
796 run->next = NULL;
799 /* Returns the next tokenrun, or creates one if there is none. */
800 static tokenrun *
801 next_tokenrun (run)
802 tokenrun *run;
804 if (run->next == NULL)
806 run->next = xnew (tokenrun);
807 run->next->prev = run;
808 _cpp_init_tokenrun (run->next, 250);
811 return run->next;
814 /* Allocate a single token that is invalidated at the same time as the
815 rest of the tokens on the line. Has its line and col set to the
816 same as the last lexed token, so that diagnostics appear in the
817 right place. */
818 cpp_token *
819 _cpp_temp_token (pfile)
820 cpp_reader *pfile;
822 cpp_token *old, *result;
824 old = pfile->cur_token - 1;
825 if (pfile->cur_token == pfile->cur_run->limit)
827 pfile->cur_run = next_tokenrun (pfile->cur_run);
828 pfile->cur_token = pfile->cur_run->base;
831 result = pfile->cur_token++;
832 result->line = old->line;
833 result->col = old->col;
834 return result;
837 /* Lex a token into RESULT (external interface). Takes care of issues
838 like directive handling, token lookahead, multiple include
839 optimization and skipping. */
840 const cpp_token *
841 _cpp_lex_token (pfile)
842 cpp_reader *pfile;
844 cpp_token *result;
846 for (;;)
848 if (pfile->cur_token == pfile->cur_run->limit)
850 pfile->cur_run = next_tokenrun (pfile->cur_run);
851 pfile->cur_token = pfile->cur_run->base;
854 if (pfile->lookaheads)
856 pfile->lookaheads--;
857 result = pfile->cur_token++;
859 else
860 result = _cpp_lex_direct (pfile);
862 if (result->flags & BOL)
864 /* Is this a directive. If _cpp_handle_directive returns
865 false, it is an assembler #. */
866 if (result->type == CPP_HASH
867 /* 6.10.3 p 11: Directives in a list of macro arguments
868 gives undefined behavior. This implementation
869 handles the directive as normal. */
870 && pfile->state.parsing_args != 1
871 && _cpp_handle_directive (pfile, result->flags & PREV_WHITE))
872 continue;
873 if (pfile->cb.line_change && !pfile->state.skipping)
874 (*pfile->cb.line_change)(pfile, result, pfile->state.parsing_args);
877 /* We don't skip tokens in directives. */
878 if (pfile->state.in_directive)
879 break;
881 /* Outside a directive, invalidate controlling macros. At file
882 EOF, _cpp_lex_direct takes care of popping the buffer, so we never
883 get here and MI optimisation works. */
884 pfile->mi_valid = false;
886 if (!pfile->state.skipping || result->type == CPP_EOF)
887 break;
890 return result;
893 /* A NUL terminates the current buffer. For ISO preprocessing this is
894 EOF, but for traditional preprocessing it indicates we need a line
895 refill. Returns TRUE to continue preprocessing a new buffer, FALSE
896 to return a CPP_EOF to the caller. */
897 static bool
898 continue_after_nul (pfile)
899 cpp_reader *pfile;
901 cpp_buffer *buffer = pfile->buffer;
902 bool more = false;
904 buffer->saved_flags = BOL;
905 if (CPP_OPTION (pfile, traditional))
907 if (pfile->state.in_directive)
908 return false;
910 _cpp_remove_overlay (pfile);
911 more = _cpp_read_logical_line_trad (pfile);
912 _cpp_overlay_buffer (pfile, pfile->out.base,
913 pfile->out.cur - pfile->out.base);
914 pfile->line = pfile->out.first_line;
916 else
918 /* Stop parsing arguments with a CPP_EOF. When we finally come
919 back here, do the work of popping the buffer. */
920 if (!pfile->state.parsing_args)
922 if (buffer->cur != buffer->line_base)
924 /* Non-empty files should end in a newline. Don't warn
925 for command line and _Pragma buffers. */
926 if (!buffer->from_stage3)
927 cpp_error (pfile, DL_PEDWARN, "no newline at end of file");
928 handle_newline (pfile);
931 /* Similarly, finish an in-progress directive with CPP_EOF
932 before popping the buffer. */
933 if (!pfile->state.in_directive && buffer->prev)
935 more = !buffer->return_at_eof;
936 _cpp_pop_buffer (pfile);
941 return more;
/* If the next effective character is CHAR, consume it and set the
   type of `result' to THEN_TYPE; otherwise back up and set it to
   ELSE_TYPE.  The expansion refers to `pfile' and `result' directly,
   and to `buffer' through BACKUP, so all three must be in scope.  */
#define IF_NEXT_IS(CHAR, THEN_TYPE, ELSE_TYPE)  \
  do {                                          \
    if (get_effective_char (pfile) == CHAR)     \
      result->type = THEN_TYPE;                 \
    else                                        \
      {                                         \
        BACKUP ();                              \
        result->type = ELSE_TYPE;               \
      }                                         \
  } while (0)
955 /* Lex a token into pfile->cur_token, which is also incremented, to
956 get diagnostics pointing to the correct location.
958 Does not handle issues such as token lookahead, multiple-include
959 optimisation, directives, skipping etc. This function is only
960 suitable for use by _cpp_lex_token, and in special cases like
961 lex_expansion_token which doesn't care for any of these issues.
963 When meeting a newline, returns CPP_EOF if parsing a directive,
964 otherwise returns to the start of the token buffer if permissible.
965 Returns the location of the lexed token. */
966 cpp_token *
967 _cpp_lex_direct (pfile)
968 cpp_reader *pfile;
970 cppchar_t c;
971 cpp_buffer *buffer;
972 const unsigned char *comment_start;
973 cpp_token *result = pfile->cur_token++;
975 fresh_line:
976 buffer = pfile->buffer;
977 result->flags = buffer->saved_flags;
978 buffer->saved_flags = 0;
979 update_tokens_line:
980 result->line = pfile->line;
982 skipped_white:
983 c = *buffer->cur++;
984 result->col = CPP_BUF_COLUMN (buffer, buffer->cur);
986 trigraph:
987 switch (c)
989 case ' ': case '\t': case '\f': case '\v': case '\0':
990 result->flags |= PREV_WHITE;
991 if (skip_whitespace (pfile, c))
992 goto skipped_white;
994 /* End of buffer. */
995 buffer->cur--;
996 if (continue_after_nul (pfile))
997 goto fresh_line;
998 result->type = CPP_EOF;
999 break;
1001 case '\n': case '\r':
1002 handle_newline (pfile);
1003 buffer->saved_flags = BOL;
1004 if (! pfile->state.in_directive)
1006 if (pfile->state.parsing_args == 2)
1007 buffer->saved_flags |= PREV_WHITE;
1008 if (!pfile->keep_tokens)
1010 pfile->cur_run = &pfile->base_run;
1011 result = pfile->base_run.base;
1012 pfile->cur_token = result + 1;
1014 goto fresh_line;
1016 result->type = CPP_EOF;
1017 break;
1019 case '?':
1020 case '\\':
1021 /* These could start an escaped newline, or '?' a trigraph. Let
1022 skip_escaped_newlines do all the work. */
1024 unsigned int line = pfile->line;
1026 c = skip_escaped_newlines (pfile);
1027 if (line != pfile->line)
1029 buffer->cur--;
1030 /* We had at least one escaped newline of some sort.
1031 Update the token's line and column. */
1032 goto update_tokens_line;
1036 /* We are either the original '?' or '\\', or a trigraph. */
1037 if (c == '?')
1038 result->type = CPP_QUERY;
1039 else if (c == '\\')
1040 goto random_char;
1041 else
1042 goto trigraph;
1043 break;
1045 case '0': case '1': case '2': case '3': case '4':
1046 case '5': case '6': case '7': case '8': case '9':
1047 result->type = CPP_NUMBER;
1048 parse_number (pfile, &result->val.str, 0);
1049 break;
1051 case 'L':
1052 /* 'L' may introduce wide characters or strings. */
1054 const unsigned char *pos = buffer->cur;
1056 c = get_effective_char (pfile);
1057 if (c == '\'' || c == '"')
1059 result->type = (c == '"' ? CPP_WSTRING: CPP_WCHAR);
1060 parse_string (pfile, result, c);
1061 break;
1063 buffer->cur = pos;
1065 /* Fall through. */
1067 start_ident:
1068 case '_':
1069 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
1070 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
1071 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
1072 case 's': case 't': case 'u': case 'v': case 'w': case 'x':
1073 case 'y': case 'z':
1074 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
1075 case 'G': case 'H': case 'I': case 'J': case 'K':
1076 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
1077 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
1078 case 'Y': case 'Z':
1079 result->type = CPP_NAME;
1080 result->val.node = parse_identifier (pfile);
1082 /* Convert named operators to their proper types. */
1083 if (result->val.node->flags & NODE_OPERATOR)
1085 result->flags |= NAMED_OP;
1086 result->type = result->val.node->directive_index;
1088 break;
1090 case '\'':
1091 case '"':
1092 result->type = c == '"' ? CPP_STRING: CPP_CHAR;
1093 parse_string (pfile, result, c);
1094 break;
1096 case '/':
1097 /* A potential block or line comment. */
1098 comment_start = buffer->cur;
1099 c = get_effective_char (pfile);
1101 if (c == '*')
1103 if (skip_block_comment (pfile))
1104 cpp_error (pfile, DL_ERROR, "unterminated comment");
1106 else if (c == '/' && (CPP_OPTION (pfile, cplusplus_comments)
1107 || CPP_IN_SYSTEM_HEADER (pfile)))
1109 /* Warn about comments only if pedantically GNUC89, and not
1110 in system headers. */
1111 if (CPP_OPTION (pfile, lang) == CLK_GNUC89 && CPP_PEDANTIC (pfile)
1112 && ! buffer->warned_cplusplus_comments)
1114 cpp_error (pfile, DL_PEDWARN,
1115 "C++ style comments are not allowed in ISO C90");
1116 cpp_error (pfile, DL_PEDWARN,
1117 "(this will be reported only once per input file)");
1118 buffer->warned_cplusplus_comments = 1;
1121 if (skip_line_comment (pfile) && CPP_OPTION (pfile, warn_comments))
1122 cpp_error (pfile, DL_WARNING, "multi-line comment");
1124 else if (c == '=')
1126 result->type = CPP_DIV_EQ;
1127 break;
1129 else
1131 BACKUP ();
1132 result->type = CPP_DIV;
1133 break;
1136 if (!pfile->state.save_comments)
1138 result->flags |= PREV_WHITE;
1139 goto update_tokens_line;
1142 /* Save the comment as a token in its own right. */
1143 save_comment (pfile, result, comment_start, c);
1144 break;
1146 case '<':
1147 if (pfile->state.angled_headers)
1149 result->type = CPP_HEADER_NAME;
1150 parse_string (pfile, result, '>');
1151 break;
1154 c = get_effective_char (pfile);
1155 if (c == '=')
1156 result->type = CPP_LESS_EQ;
1157 else if (c == '<')
1158 IF_NEXT_IS ('=', CPP_LSHIFT_EQ, CPP_LSHIFT);
1159 else if (c == '?' && CPP_OPTION (pfile, cplusplus))
1160 IF_NEXT_IS ('=', CPP_MIN_EQ, CPP_MIN);
1161 else if (c == ':' && CPP_OPTION (pfile, digraphs))
1163 result->type = CPP_OPEN_SQUARE;
1164 result->flags |= DIGRAPH;
1166 else if (c == '%' && CPP_OPTION (pfile, digraphs))
1168 result->type = CPP_OPEN_BRACE;
1169 result->flags |= DIGRAPH;
1171 else
1173 BACKUP ();
1174 result->type = CPP_LESS;
1176 break;
1178 case '>':
1179 c = get_effective_char (pfile);
1180 if (c == '=')
1181 result->type = CPP_GREATER_EQ;
1182 else if (c == '>')
1183 IF_NEXT_IS ('=', CPP_RSHIFT_EQ, CPP_RSHIFT);
1184 else if (c == '?' && CPP_OPTION (pfile, cplusplus))
1185 IF_NEXT_IS ('=', CPP_MAX_EQ, CPP_MAX);
1186 else
1188 BACKUP ();
1189 result->type = CPP_GREATER;
1191 break;
1193 case '%':
1194 c = get_effective_char (pfile);
1195 if (c == '=')
1196 result->type = CPP_MOD_EQ;
1197 else if (CPP_OPTION (pfile, digraphs) && c == ':')
1199 result->flags |= DIGRAPH;
1200 result->type = CPP_HASH;
1201 if (get_effective_char (pfile) == '%')
1203 const unsigned char *pos = buffer->cur;
1205 if (get_effective_char (pfile) == ':')
1206 result->type = CPP_PASTE;
1207 else
1208 buffer->cur = pos - 1;
1210 else
1211 BACKUP ();
1213 else if (CPP_OPTION (pfile, digraphs) && c == '>')
1215 result->flags |= DIGRAPH;
1216 result->type = CPP_CLOSE_BRACE;
1218 else
1220 BACKUP ();
1221 result->type = CPP_MOD;
1223 break;
1225 case '.':
1226 result->type = CPP_DOT;
1227 c = get_effective_char (pfile);
1228 if (c == '.')
1230 const unsigned char *pos = buffer->cur;
1232 if (get_effective_char (pfile) == '.')
1233 result->type = CPP_ELLIPSIS;
1234 else
1235 buffer->cur = pos - 1;
1237 /* All known character sets have 0...9 contiguous. */
1238 else if (ISDIGIT (c))
1240 result->type = CPP_NUMBER;
1241 parse_number (pfile, &result->val.str, 1);
1243 else if (c == '*' && CPP_OPTION (pfile, cplusplus))
1244 result->type = CPP_DOT_STAR;
1245 else
1246 BACKUP ();
1247 break;
1249 case '+':
1250 c = get_effective_char (pfile);
1251 if (c == '+')
1252 result->type = CPP_PLUS_PLUS;
1253 else if (c == '=')
1254 result->type = CPP_PLUS_EQ;
1255 else
1257 BACKUP ();
1258 result->type = CPP_PLUS;
1260 break;
1262 case '-':
1263 c = get_effective_char (pfile);
1264 if (c == '>')
1266 result->type = CPP_DEREF;
1267 if (CPP_OPTION (pfile, cplusplus))
1269 if (get_effective_char (pfile) == '*')
1270 result->type = CPP_DEREF_STAR;
1271 else
1272 BACKUP ();
1275 else if (c == '-')
1276 result->type = CPP_MINUS_MINUS;
1277 else if (c == '=')
1278 result->type = CPP_MINUS_EQ;
1279 else
1281 BACKUP ();
1282 result->type = CPP_MINUS;
1284 break;
1286 case '&':
1287 c = get_effective_char (pfile);
1288 if (c == '&')
1289 result->type = CPP_AND_AND;
1290 else if (c == '=')
1291 result->type = CPP_AND_EQ;
1292 else
1294 BACKUP ();
1295 result->type = CPP_AND;
1297 break;
1299 case '|':
1300 c = get_effective_char (pfile);
1301 if (c == '|')
1302 result->type = CPP_OR_OR;
1303 else if (c == '=')
1304 result->type = CPP_OR_EQ;
1305 else
1307 BACKUP ();
1308 result->type = CPP_OR;
1310 break;
1312 case ':':
1313 c = get_effective_char (pfile);
1314 if (c == ':' && CPP_OPTION (pfile, cplusplus))
1315 result->type = CPP_SCOPE;
1316 else if (c == '>' && CPP_OPTION (pfile, digraphs))
1318 result->flags |= DIGRAPH;
1319 result->type = CPP_CLOSE_SQUARE;
1321 else
1323 BACKUP ();
1324 result->type = CPP_COLON;
1326 break;
1328 case '*': IF_NEXT_IS ('=', CPP_MULT_EQ, CPP_MULT); break;
1329 case '=': IF_NEXT_IS ('=', CPP_EQ_EQ, CPP_EQ); break;
1330 case '!': IF_NEXT_IS ('=', CPP_NOT_EQ, CPP_NOT); break;
1331 case '^': IF_NEXT_IS ('=', CPP_XOR_EQ, CPP_XOR); break;
1332 case '#': IF_NEXT_IS ('#', CPP_PASTE, CPP_HASH); break;
1334 case '~': result->type = CPP_COMPL; break;
1335 case ',': result->type = CPP_COMMA; break;
1336 case '(': result->type = CPP_OPEN_PAREN; break;
1337 case ')': result->type = CPP_CLOSE_PAREN; break;
1338 case '[': result->type = CPP_OPEN_SQUARE; break;
1339 case ']': result->type = CPP_CLOSE_SQUARE; break;
1340 case '{': result->type = CPP_OPEN_BRACE; break;
1341 case '}': result->type = CPP_CLOSE_BRACE; break;
1342 case ';': result->type = CPP_SEMICOLON; break;
1344 /* @ is a punctuator in Objective-C. */
1345 case '@': result->type = CPP_ATSIGN; break;
1347 case '$':
1348 if (CPP_OPTION (pfile, dollars_in_ident))
1349 goto start_ident;
1350 /* Fall through... */
1352 random_char:
1353 default:
1354 result->type = CPP_OTHER;
1355 result->val.c = c;
1356 break;
1359 return result;
1362 /* An upper bound on the number of bytes needed to spell TOKEN,
1363 including preceding whitespace. */
1364 unsigned int
1365 cpp_token_len (token)
1366 const cpp_token *token;
1368 unsigned int len;
1370 switch (TOKEN_SPELL (token))
1372 default: len = 0; break;
1373 case SPELL_NUMBER:
1374 case SPELL_STRING: len = token->val.str.len; break;
1375 case SPELL_IDENT: len = NODE_LEN (token->val.node); break;
1377 /* 1 for whitespace, 4 for comment delimiters. */
1378 return len + 5;
1381 /* Write the spelling of a token TOKEN to BUFFER. The buffer must
1382 already contain the enough space to hold the token's spelling.
1383 Returns a pointer to the character after the last character
1384 written. */
1385 unsigned char *
1386 cpp_spell_token (pfile, token, buffer)
1387 cpp_reader *pfile; /* Would be nice to be rid of this... */
1388 const cpp_token *token;
1389 unsigned char *buffer;
1391 switch (TOKEN_SPELL (token))
1393 case SPELL_OPERATOR:
1395 const unsigned char *spelling;
1396 unsigned char c;
1398 if (token->flags & DIGRAPH)
1399 spelling
1400 = digraph_spellings[(int) token->type - (int) CPP_FIRST_DIGRAPH];
1401 else if (token->flags & NAMED_OP)
1402 goto spell_ident;
1403 else
1404 spelling = TOKEN_NAME (token);
1406 while ((c = *spelling++) != '\0')
1407 *buffer++ = c;
1409 break;
1411 case SPELL_CHAR:
1412 *buffer++ = token->val.c;
1413 break;
1415 spell_ident:
1416 case SPELL_IDENT:
1417 memcpy (buffer, NODE_NAME (token->val.node), NODE_LEN (token->val.node));
1418 buffer += NODE_LEN (token->val.node);
1419 break;
1421 case SPELL_NUMBER:
1422 memcpy (buffer, token->val.str.text, token->val.str.len);
1423 buffer += token->val.str.len;
1424 break;
1426 case SPELL_STRING:
1428 int left, right, tag;
1429 switch (token->type)
1431 case CPP_STRING: left = '"'; right = '"'; tag = '\0'; break;
1432 case CPP_WSTRING: left = '"'; right = '"'; tag = 'L'; break;
1433 case CPP_CHAR: left = '\''; right = '\''; tag = '\0'; break;
1434 case CPP_WCHAR: left = '\''; right = '\''; tag = 'L'; break;
1435 case CPP_HEADER_NAME: left = '<'; right = '>'; tag = '\0'; break;
1436 default:
1437 cpp_error (pfile, DL_ICE, "unknown string token %s\n",
1438 TOKEN_NAME (token));
1439 return buffer;
1441 if (tag) *buffer++ = tag;
1442 *buffer++ = left;
1443 memcpy (buffer, token->val.str.text, token->val.str.len);
1444 buffer += token->val.str.len;
1445 *buffer++ = right;
1447 break;
1449 case SPELL_NONE:
1450 cpp_error (pfile, DL_ICE, "unspellable token %s", TOKEN_NAME (token));
1451 break;
1454 return buffer;
1457 /* Returns TOKEN spelt as a null-terminated string. The string is
1458 freed when the reader is destroyed. Useful for diagnostics. */
1459 unsigned char *
1460 cpp_token_as_text (pfile, token)
1461 cpp_reader *pfile;
1462 const cpp_token *token;
1464 unsigned int len = cpp_token_len (token);
1465 unsigned char *start = _cpp_unaligned_alloc (pfile, len), *end;
1467 end = cpp_spell_token (pfile, token, start);
1468 end[0] = '\0';
1470 return start;
1473 /* Used by C front ends, which really should move to using
1474 cpp_token_as_text. */
1475 const char *
1476 cpp_type2name (type)
1477 enum cpp_ttype type;
1479 return (const char *) token_spellings[type].name;
1482 /* Writes the spelling of token to FP, without any preceding space.
1483 Separated from cpp_spell_token for efficiency - to avoid stdio
1484 double-buffering. */
1485 void
1486 cpp_output_token (token, fp)
1487 const cpp_token *token;
1488 FILE *fp;
1490 switch (TOKEN_SPELL (token))
1492 case SPELL_OPERATOR:
1494 const unsigned char *spelling;
1495 int c;
1497 if (token->flags & DIGRAPH)
1498 spelling
1499 = digraph_spellings[(int) token->type - (int) CPP_FIRST_DIGRAPH];
1500 else if (token->flags & NAMED_OP)
1501 goto spell_ident;
1502 else
1503 spelling = TOKEN_NAME (token);
1505 c = *spelling;
1507 putc (c, fp);
1508 while ((c = *++spelling) != '\0');
1510 break;
1512 case SPELL_CHAR:
1513 putc (token->val.c, fp);
1514 break;
1516 spell_ident:
1517 case SPELL_IDENT:
1518 fwrite (NODE_NAME (token->val.node), 1, NODE_LEN (token->val.node), fp);
1519 break;
1521 case SPELL_NUMBER:
1522 fwrite (token->val.str.text, 1, token->val.str.len, fp);
1523 break;
1525 case SPELL_STRING:
1527 int left, right, tag;
1528 switch (token->type)
1530 case CPP_STRING: left = '"'; right = '"'; tag = '\0'; break;
1531 case CPP_WSTRING: left = '"'; right = '"'; tag = 'L'; break;
1532 case CPP_CHAR: left = '\''; right = '\''; tag = '\0'; break;
1533 case CPP_WCHAR: left = '\''; right = '\''; tag = 'L'; break;
1534 case CPP_HEADER_NAME: left = '<'; right = '>'; tag = '\0'; break;
1535 default:
1536 fprintf (stderr, "impossible STRING token %s\n", TOKEN_NAME (token));
1537 return;
1539 if (tag) putc (tag, fp);
1540 putc (left, fp);
1541 fwrite (token->val.str.text, 1, token->val.str.len, fp);
1542 putc (right, fp);
1544 break;
1546 case SPELL_NONE:
1547 /* An error, most probably. */
1548 break;
1552 /* Compare two tokens. */
1554 _cpp_equiv_tokens (a, b)
1555 const cpp_token *a, *b;
1557 if (a->type == b->type && a->flags == b->flags)
1558 switch (TOKEN_SPELL (a))
1560 default: /* Keep compiler happy. */
1561 case SPELL_OPERATOR:
1562 return 1;
1563 case SPELL_CHAR:
1564 return a->val.c == b->val.c; /* Character. */
1565 case SPELL_NONE:
1566 return (a->type != CPP_MACRO_ARG || a->val.arg_no == b->val.arg_no);
1567 case SPELL_IDENT:
1568 return a->val.node == b->val.node;
1569 case SPELL_NUMBER:
1570 case SPELL_STRING:
1571 return (a->val.str.len == b->val.str.len
1572 && !memcmp (a->val.str.text, b->val.str.text,
1573 a->val.str.len));
1576 return 0;
1579 /* Returns nonzero if a space should be inserted to avoid an
1580 accidental token paste for output. For simplicity, it is
1581 conservative, and occasionally advises a space where one is not
1582 needed, e.g. "." and ".2". */
1584 cpp_avoid_paste (pfile, token1, token2)
1585 cpp_reader *pfile;
1586 const cpp_token *token1, *token2;
1588 enum cpp_ttype a = token1->type, b = token2->type;
1589 cppchar_t c;
1591 if (token1->flags & NAMED_OP)
1592 a = CPP_NAME;
1593 if (token2->flags & NAMED_OP)
1594 b = CPP_NAME;
1596 c = EOF;
1597 if (token2->flags & DIGRAPH)
1598 c = digraph_spellings[(int) b - (int) CPP_FIRST_DIGRAPH][0];
1599 else if (token_spellings[b].category == SPELL_OPERATOR)
1600 c = token_spellings[b].name[0];
1602 /* Quickly get everything that can paste with an '='. */
1603 if ((int) a <= (int) CPP_LAST_EQ && c == '=')
1604 return 1;
1606 switch (a)
1608 case CPP_GREATER: return c == '>' || c == '?';
1609 case CPP_LESS: return c == '<' || c == '?' || c == '%' || c == ':';
1610 case CPP_PLUS: return c == '+';
1611 case CPP_MINUS: return c == '-' || c == '>';
1612 case CPP_DIV: return c == '/' || c == '*'; /* Comments. */
1613 case CPP_MOD: return c == ':' || c == '>';
1614 case CPP_AND: return c == '&';
1615 case CPP_OR: return c == '|';
1616 case CPP_COLON: return c == ':' || c == '>';
1617 case CPP_DEREF: return c == '*';
1618 case CPP_DOT: return c == '.' || c == '%' || b == CPP_NUMBER;
1619 case CPP_HASH: return c == '#' || c == '%'; /* Digraph form. */
1620 case CPP_NAME: return ((b == CPP_NUMBER
1621 && name_p (pfile, &token2->val.str))
1622 || b == CPP_NAME
1623 || b == CPP_CHAR || b == CPP_STRING); /* L */
1624 case CPP_NUMBER: return (b == CPP_NUMBER || b == CPP_NAME
1625 || c == '.' || c == '+' || c == '-');
1626 case CPP_OTHER: return (CPP_OPTION (pfile, objc)
1627 && token1->val.c == '@'
1628 && (b == CPP_NAME || b == CPP_STRING));
1629 default: break;
1632 return 0;
1635 /* Output all the remaining tokens on the current line, and a newline
1636 character, to FP. Leading whitespace is removed. If there are
1637 macros, special token padding is not performed. */
1638 void
1639 cpp_output_line (pfile, fp)
1640 cpp_reader *pfile;
1641 FILE *fp;
1643 const cpp_token *token;
1645 token = cpp_get_token (pfile);
1646 while (token->type != CPP_EOF)
1648 cpp_output_token (token, fp);
1649 token = cpp_get_token (pfile);
1650 if (token->flags & PREV_WHITE)
1651 putc (' ', fp);
1654 putc ('\n', fp);
/* Return the value of the hexadecimal digit C.  Aborts if C is not
   a hexadecimal digit.  */
static unsigned int
hex_digit_value (c)
     unsigned int c;
{
  if (!hex_p (c))
    abort ();

  return hex_value (c);
}
1668 /* Parse a '\uNNNN' or '\UNNNNNNNN' sequence. Returns 1 to indicate
1669 failure if cpplib is not parsing C++ or C99. Such failure is
1670 silent, and no variables are updated. Otherwise returns 0, and
1671 warns if -Wtraditional.
1673 [lex.charset]: The character designated by the universal character
1674 name \UNNNNNNNN is that character whose character short name in
1675 ISO/IEC 10646 is NNNNNNNN; the character designated by the
1676 universal character name \uNNNN is that character whose character
1677 short name in ISO/IEC 10646 is 0000NNNN. If the hexadecimal value
1678 for a universal character name is less than 0x20 or in the range
1679 0x7F-0x9F (inclusive), or if the universal character name
1680 designates a character in the basic source character set, then the
1681 program is ill-formed.
1683 We assume that wchar_t is Unicode, so we don't need to do any
1684 mapping. Is this ever wrong?
1686 PC points to the 'u' or 'U', PSTR is points to the byte after PC,
1687 LIMIT is the end of the string or charconst. PSTR is updated to
1688 point after the UCS on return, and the UCS is written into PC. */
1690 static int
1691 maybe_read_ucs (pfile, pstr, limit, pc)
1692 cpp_reader *pfile;
1693 const unsigned char **pstr;
1694 const unsigned char *limit;
1695 cppchar_t *pc;
1697 const unsigned char *p = *pstr;
1698 unsigned int code = 0;
1699 unsigned int c = *pc, length;
1701 /* Only attempt to interpret a UCS for C++ and C99. */
1702 if (! (CPP_OPTION (pfile, cplusplus) || CPP_OPTION (pfile, c99)))
1703 return 1;
1705 if (CPP_WTRADITIONAL (pfile))
1706 cpp_error (pfile, DL_WARNING,
1707 "the meaning of '\\%c' is different in traditional C", c);
1709 length = (c == 'u' ? 4: 8);
1711 if ((size_t) (limit - p) < length)
1713 cpp_error (pfile, DL_ERROR, "incomplete universal-character-name");
1714 /* Skip to the end to avoid more diagnostics. */
1715 p = limit;
1717 else
1719 for (; length; length--, p++)
1721 c = *p;
1722 if (ISXDIGIT (c))
1723 code = (code << 4) + hex_digit_value (c);
1724 else
1726 cpp_error (pfile, DL_ERROR,
1727 "non-hex digit '%c' in universal-character-name", c);
1728 /* We shouldn't skip in case there are multibyte chars. */
1729 break;
1734 if (CPP_OPTION (pfile, EBCDIC))
1736 cpp_error (pfile, DL_ERROR, "universal-character-name on EBCDIC target");
1737 code = 0x3f; /* EBCDIC invalid character */
1739 /* True extended characters are OK. */
1740 else if (code >= 0xa0
1741 && !(code & 0x80000000)
1742 && !(code >= 0xD800 && code <= 0xDFFF))
1744 /* The standard permits $, @ and ` to be specified as UCNs. We use
1745 hex escapes so that this also works with EBCDIC hosts. */
1746 else if (code == 0x24 || code == 0x40 || code == 0x60)
1748 /* Don't give another error if one occurred above. */
1749 else if (length == 0)
1750 cpp_error (pfile, DL_ERROR, "universal-character-name out of range");
1752 *pstr = p;
1753 *pc = code;
1754 return 0;
1757 /* Returns the value of an escape sequence, truncated to the correct
1758 target precision. PSTR points to the input pointer, which is just
1759 after the backslash. LIMIT is how much text we have. WIDE is true
1760 if the escape sequence is part of a wide character constant or
1761 string literal. Handles all relevant diagnostics. */
1762 cppchar_t
1763 cpp_parse_escape (pfile, pstr, limit, wide)
1764 cpp_reader *pfile;
1765 const unsigned char **pstr;
1766 const unsigned char *limit;
1767 int wide;
1769 /* Values of \a \b \e \f \n \r \t \v respectively. */
1770 static const uchar ascii[] = { 7, 8, 27, 12, 10, 13, 9, 11 };
1771 static const uchar ebcdic[] = { 47, 22, 39, 12, 21, 13, 5, 11 };
1773 int unknown = 0;
1774 const unsigned char *str = *pstr, *charconsts;
1775 cppchar_t c, mask;
1776 unsigned int width;
1778 if (CPP_OPTION (pfile, EBCDIC))
1779 charconsts = ebcdic;
1780 else
1781 charconsts = ascii;
1783 if (wide)
1784 width = CPP_OPTION (pfile, wchar_precision);
1785 else
1786 width = CPP_OPTION (pfile, char_precision);
1787 if (width < BITS_PER_CPPCHAR_T)
1788 mask = ((cppchar_t) 1 << width) - 1;
1789 else
1790 mask = ~0;
1792 c = *str++;
1793 switch (c)
1795 case '\\': case '\'': case '"': case '?': break;
1796 case 'b': c = charconsts[1]; break;
1797 case 'f': c = charconsts[3]; break;
1798 case 'n': c = charconsts[4]; break;
1799 case 'r': c = charconsts[5]; break;
1800 case 't': c = charconsts[6]; break;
1801 case 'v': c = charconsts[7]; break;
1803 case '(': case '{': case '[': case '%':
1804 /* '\(', etc, are used at beginning of line to avoid confusing Emacs.
1805 '\%' is used to prevent SCCS from getting confused. */
1806 unknown = CPP_PEDANTIC (pfile);
1807 break;
1809 case 'a':
1810 if (CPP_WTRADITIONAL (pfile))
1811 cpp_error (pfile, DL_WARNING,
1812 "the meaning of '\\a' is different in traditional C");
1813 c = charconsts[0];
1814 break;
1816 case 'e': case 'E':
1817 if (CPP_PEDANTIC (pfile))
1818 cpp_error (pfile, DL_PEDWARN,
1819 "non-ISO-standard escape sequence, '\\%c'", (int) c);
1820 c = charconsts[2];
1821 break;
1823 case 'u': case 'U':
1824 unknown = maybe_read_ucs (pfile, &str, limit, &c);
1825 break;
1827 case 'x':
1828 if (CPP_WTRADITIONAL (pfile))
1829 cpp_error (pfile, DL_WARNING,
1830 "the meaning of '\\x' is different in traditional C");
1833 cppchar_t i = 0, overflow = 0;
1834 int digits_found = 0;
1836 while (str < limit)
1838 c = *str;
1839 if (! ISXDIGIT (c))
1840 break;
1841 str++;
1842 overflow |= i ^ (i << 4 >> 4);
1843 i = (i << 4) + hex_digit_value (c);
1844 digits_found = 1;
1847 if (!digits_found)
1848 cpp_error (pfile, DL_ERROR,
1849 "\\x used with no following hex digits");
1851 if (overflow | (i != (i & mask)))
1853 cpp_error (pfile, DL_PEDWARN,
1854 "hex escape sequence out of range");
1855 i &= mask;
1857 c = i;
1859 break;
1861 case '0': case '1': case '2': case '3':
1862 case '4': case '5': case '6': case '7':
1864 size_t count = 0;
1865 cppchar_t i = c - '0';
1867 while (str < limit && ++count < 3)
1869 c = *str;
1870 if (c < '0' || c > '7')
1871 break;
1872 str++;
1873 i = (i << 3) + c - '0';
1876 if (i != (i & mask))
1878 cpp_error (pfile, DL_PEDWARN,
1879 "octal escape sequence out of range");
1880 i &= mask;
1882 c = i;
1884 break;
1886 default:
1887 unknown = 1;
1888 break;
1891 if (unknown)
1893 if (ISGRAPH (c))
1894 cpp_error (pfile, DL_PEDWARN,
1895 "unknown escape sequence '\\%c'", (int) c);
1896 else
1897 cpp_error (pfile, DL_PEDWARN,
1898 "unknown escape sequence: '\\%03o'", (int) c);
1901 if (c > mask)
1903 cpp_error (pfile, DL_PEDWARN, "escape sequence out of range for its type");
1904 c &= mask;
1907 *pstr = str;
1908 return c;
1911 /* Interpret a (possibly wide) character constant in TOKEN.
1912 WARN_MULTI warns about multi-character charconsts. PCHARS_SEEN
1913 points to a variable that is filled in with the number of
1914 characters seen, and UNSIGNEDP to a variable that indicates whether
1915 the result has signed type. */
1916 cppchar_t
1917 cpp_interpret_charconst (pfile, token, pchars_seen, unsignedp)
1918 cpp_reader *pfile;
1919 const cpp_token *token;
1920 unsigned int *pchars_seen;
1921 int *unsignedp;
1923 const unsigned char *str = token->val.str.text;
1924 const unsigned char *limit = str + token->val.str.len;
1925 unsigned int chars_seen = 0;
1926 size_t width, max_chars;
1927 cppchar_t c, mask, result = 0;
1928 bool unsigned_p;
1930 #ifdef MULTIBYTE_CHARS
1931 (void) local_mbtowc (NULL, NULL, 0);
1932 #endif
1934 /* Width in bits. */
1935 if (token->type == CPP_CHAR)
1937 width = CPP_OPTION (pfile, char_precision);
1938 max_chars = CPP_OPTION (pfile, int_precision) / width;
1939 unsigned_p = CPP_OPTION (pfile, unsigned_char);
1941 else
1943 width = CPP_OPTION (pfile, wchar_precision);
1944 max_chars = 1;
1945 unsigned_p = CPP_OPTION (pfile, unsigned_wchar);
1948 if (width < BITS_PER_CPPCHAR_T)
1949 mask = ((cppchar_t) 1 << width) - 1;
1950 else
1951 mask = ~0;
1953 while (str < limit)
1955 #ifdef MULTIBYTE_CHARS
1956 wchar_t wc;
1957 int char_len;
1959 char_len = local_mbtowc (&wc, (const char *)str, limit - str);
1960 if (char_len == -1)
1962 cpp_error (pfile, DL_WARNING,
1963 "ignoring invalid multibyte character");
1964 c = *str++;
1966 else
1968 str += char_len;
1969 c = wc;
1971 #else
1972 c = *str++;
1973 #endif
1975 if (c == '\\')
1976 c = cpp_parse_escape (pfile, &str, limit, token->type == CPP_WCHAR);
1978 #ifdef MAP_CHARACTER
1979 if (ISPRINT (c))
1980 c = MAP_CHARACTER (c);
1981 #endif
1983 chars_seen++;
1985 /* Truncate the character, scale the result and merge the two. */
1986 c &= mask;
1987 if (width < BITS_PER_CPPCHAR_T)
1988 result = (result << width) | c;
1989 else
1990 result = c;
1993 if (chars_seen == 0)
1994 cpp_error (pfile, DL_ERROR, "empty character constant");
1995 else if (chars_seen > 1)
1997 /* Multichar charconsts are of type int and therefore signed. */
1998 unsigned_p = 0;
2000 if (chars_seen > max_chars)
2002 chars_seen = max_chars;
2003 cpp_error (pfile, DL_WARNING,
2004 "character constant too long for its type");
2006 else if (CPP_OPTION (pfile, warn_multichar))
2007 cpp_error (pfile, DL_WARNING, "multi-character character constant");
2010 /* Sign-extend or truncate the constant to cppchar_t. The value is
2011 in WIDTH bits, but for multi-char charconsts it's value is the
2012 full target type's width. */
2013 if (chars_seen > 1)
2014 width *= max_chars;
2015 if (width < BITS_PER_CPPCHAR_T)
2017 mask = ((cppchar_t) 1 << width) - 1;
2018 if (unsigned_p || !(result & (1 << (width - 1))))
2019 result &= mask;
2020 else
2021 result |= ~mask;
2024 *pchars_seen = chars_seen;
2025 *unsignedp = unsigned_p;
2026 return result;
/* Memory buffers.  Changing these three constants can have a dramatic
   effect on performance.  The values here are reasonable defaults,
   but might be tuned.  If you adjust them, be sure to test across a
   range of uses of cpplib, including heavy nested function-like macro
   expansion.  Also check the change in peak memory usage (NJAMD is a
   good tool for this).

   MIN_BUFF_SIZE is the smallest buffer new_buff will create.
   BUFF_SIZE_UPPER_BOUND (MIN_SIZE) is the largest free buffer that is
   still handed out for a request of MIN_SIZE bytes.
   EXTENDED_BUFF_SIZE (BUFF, MIN_EXTRA) is the size requested when
   extending BUFF: MIN_EXTRA plus twice the bytes between BUFF's cur
   and limit pointers.  Every macro argument is parenthesized so that
   arbitrary expressions can be passed safely.  */
#define MIN_BUFF_SIZE 8000
#define BUFF_SIZE_UPPER_BOUND(MIN_SIZE) (MIN_BUFF_SIZE + (MIN_SIZE) * 3 / 2)
#define EXTENDED_BUFF_SIZE(BUFF, MIN_EXTRA) \
        ((MIN_EXTRA) + ((BUFF)->limit - (BUFF)->cur) * 2)

#if MIN_BUFF_SIZE > BUFF_SIZE_UPPER_BOUND (0)
#error BUFF_SIZE_UPPER_BOUND must be at least as large as MIN_BUFF_SIZE!
#endif
2044 /* Create a new allocation buffer. Place the control block at the end
2045 of the buffer, so that buffer overflows will cause immediate chaos. */
2046 static _cpp_buff *
2047 new_buff (len)
2048 size_t len;
2050 _cpp_buff *result;
2051 unsigned char *base;
2053 if (len < MIN_BUFF_SIZE)
2054 len = MIN_BUFF_SIZE;
2055 len = CPP_ALIGN (len);
2057 base = xmalloc (len + sizeof (_cpp_buff));
2058 result = (_cpp_buff *) (base + len);
2059 result->base = base;
2060 result->cur = base;
2061 result->limit = base + len;
2062 result->next = NULL;
2063 return result;
2066 /* Place a chain of unwanted allocation buffers on the free list. */
2067 void
2068 _cpp_release_buff (pfile, buff)
2069 cpp_reader *pfile;
2070 _cpp_buff *buff;
2072 _cpp_buff *end = buff;
2074 while (end->next)
2075 end = end->next;
2076 end->next = pfile->free_buffs;
2077 pfile->free_buffs = buff;
2080 /* Return a free buffer of size at least MIN_SIZE. */
2081 _cpp_buff *
2082 _cpp_get_buff (pfile, min_size)
2083 cpp_reader *pfile;
2084 size_t min_size;
2086 _cpp_buff *result, **p;
2088 for (p = &pfile->free_buffs;; p = &(*p)->next)
2090 size_t size;
2092 if (*p == NULL)
2093 return new_buff (min_size);
2094 result = *p;
2095 size = result->limit - result->base;
2096 /* Return a buffer that's big enough, but don't waste one that's
2097 way too big. */
2098 if (size >= min_size && size <= BUFF_SIZE_UPPER_BOUND (min_size))
2099 break;
2102 *p = result->next;
2103 result->next = NULL;
2104 result->cur = result->base;
2105 return result;
2108 /* Creates a new buffer with enough space to hold the uncommitted
2109 remaining bytes of BUFF, and at least MIN_EXTRA more bytes. Copies
2110 the excess bytes to the new buffer. Chains the new buffer after
2111 BUFF, and returns the new buffer. */
2112 _cpp_buff *
2113 _cpp_append_extend_buff (pfile, buff, min_extra)
2114 cpp_reader *pfile;
2115 _cpp_buff *buff;
2116 size_t min_extra;
2118 size_t size = EXTENDED_BUFF_SIZE (buff, min_extra);
2119 _cpp_buff *new_buff = _cpp_get_buff (pfile, size);
2121 buff->next = new_buff;
2122 memcpy (new_buff->base, buff->cur, BUFF_ROOM (buff));
2123 return new_buff;
2126 /* Creates a new buffer with enough space to hold the uncommitted
2127 remaining bytes of the buffer pointed to by BUFF, and at least
2128 MIN_EXTRA more bytes. Copies the excess bytes to the new buffer.
2129 Chains the new buffer before the buffer pointed to by BUFF, and
2130 updates the pointer to point to the new buffer. */
2131 void
2132 _cpp_extend_buff (pfile, pbuff, min_extra)
2133 cpp_reader *pfile;
2134 _cpp_buff **pbuff;
2135 size_t min_extra;
2137 _cpp_buff *new_buff, *old_buff = *pbuff;
2138 size_t size = EXTENDED_BUFF_SIZE (old_buff, min_extra);
2140 new_buff = _cpp_get_buff (pfile, size);
2141 memcpy (new_buff->base, old_buff->cur, BUFF_ROOM (old_buff));
2142 new_buff->next = old_buff;
2143 *pbuff = new_buff;
2146 /* Free a chain of buffers starting at BUFF. */
2147 void
2148 _cpp_free_buff (buff)
2149 _cpp_buff *buff;
2151 _cpp_buff *next;
2153 for (; buff; buff = next)
2155 next = buff->next;
2156 free (buff->base);
2160 /* Allocate permanent, unaligned storage of length LEN. */
2161 unsigned char *
2162 _cpp_unaligned_alloc (pfile, len)
2163 cpp_reader *pfile;
2164 size_t len;
2166 _cpp_buff *buff = pfile->u_buff;
2167 unsigned char *result = buff->cur;
2169 if (len > (size_t) (buff->limit - result))
2171 buff = _cpp_get_buff (pfile, len);
2172 buff->next = pfile->u_buff;
2173 pfile->u_buff = buff;
2174 result = buff->cur;
2177 buff->cur = result + len;
2178 return result;
2181 /* Allocate permanent, unaligned storage of length LEN from a_buff.
2182 That buffer is used for growing allocations when saving macro
2183 replacement lists in a #define, and when parsing an answer to an
2184 assertion in #assert, #unassert or #if (and therefore possibly
2185 whilst expanding macros). It therefore must not be used by any
2186 code that they might call: specifically the lexer and the guts of
2187 the macro expander.
2189 All existing other uses clearly fit this restriction: storing
2190 registered pragmas during initialization. */
2191 unsigned char *
2192 _cpp_aligned_alloc (pfile, len)
2193 cpp_reader *pfile;
2194 size_t len;
2196 _cpp_buff *buff = pfile->a_buff;
2197 unsigned char *result = buff->cur;
2199 if (len > (size_t) (buff->limit - result))
2201 buff = _cpp_get_buff (pfile, len);
2202 buff->next = pfile->a_buff;
2203 pfile->a_buff = buff;
2204 result = buff->cur;
2207 buff->cur = result + len;
2208 return result;