PR target/9164
[official-gcc.git] / gcc / cpplex.c
blob7a8af905a4ce4c35ca5a957bfb510a68e8232f77
1 /* CPP Library - lexical analysis.
2 Copyright (C) 2000, 2001, 2002 Free Software Foundation, Inc.
3 Contributed by Per Bothner, 1994-95.
4 Based on CCCP program by Paul Rubin, June 1986
5 Adapted to ANSI C, Richard Stallman, Jan 1987
6 Broken out to separate file, Zack Weinberg, Mar 2000
7 Single-pass line tokenization by Neil Booth, April 2000
9 This program is free software; you can redistribute it and/or modify it
10 under the terms of the GNU General Public License as published by the
11 Free Software Foundation; either version 2, or (at your option) any
12 later version.
14 This program is distributed in the hope that it will be useful,
15 but WITHOUT ANY WARRANTY; without even the implied warranty of
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 GNU General Public License for more details.
19 You should have received a copy of the GNU General Public License
20 along with this program; if not, write to the Free Software
21 Foundation, 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
23 #include "config.h"
24 #include "system.h"
25 #include "coretypes.h"
26 #include "tm.h"
27 #include "cpplib.h"
28 #include "cpphash.h"
30 #ifdef MULTIBYTE_CHARS
31 #include "mbchar.h"
32 #include <locale.h>
33 #endif
/* How a token's spelling is obtained.  Tokens whose category is
   SPELL_STRING carry their spelling with the token itself, together
   with its length.  The numeric values are fixed by declaration
   order; SPELL_OPERATOR is zero.  */
enum spell_type
{
  SPELL_OPERATOR = 0,
  SPELL_CHAR     = 1,
  SPELL_IDENT    = 2,
  SPELL_NUMBER   = 3,
  SPELL_STRING   = 4,
  SPELL_NONE     = 5
};
47 struct token_spelling
49 enum spell_type category;
50 const unsigned char *name;
53 static const unsigned char *const digraph_spellings[] =
54 { U"%:", U"%:%:", U"<:", U":>", U"<%", U"%>" };
56 #define OP(e, s) { SPELL_OPERATOR, U s },
57 #define TK(e, s) { s, U STRINGX (e) },
58 static const struct token_spelling token_spellings[N_TTYPES] = { TTYPE_TABLE };
59 #undef OP
60 #undef TK
/* Spelling category and recorded name for TOKEN, looked up by its
   type in token_spellings.  */
#define TOKEN_SPELL(token) (token_spellings[(token)->type].category)
#define TOKEN_NAME(token) (token_spellings[(token)->type].name)

/* Rewind the read pointer to the last recorded backup position.
   Requires a variable named `buffer' to be in scope at the point of
   use.  */
#define BACKUP() do {buffer->cur = buffer->backup_to;} while (0)
66 static void handle_newline PARAMS ((cpp_reader *));
67 static cppchar_t skip_escaped_newlines PARAMS ((cpp_reader *));
68 static cppchar_t get_effective_char PARAMS ((cpp_reader *));
70 static int skip_block_comment PARAMS ((cpp_reader *));
71 static int skip_line_comment PARAMS ((cpp_reader *));
72 static void adjust_column PARAMS ((cpp_reader *));
73 static int skip_whitespace PARAMS ((cpp_reader *, cppchar_t));
74 static cpp_hashnode *parse_identifier PARAMS ((cpp_reader *));
75 static uchar *parse_slow PARAMS ((cpp_reader *, const uchar *, int,
76 unsigned int *));
77 static void parse_number PARAMS ((cpp_reader *, cpp_string *, int));
78 static int unescaped_terminator_p PARAMS ((cpp_reader *, const uchar *));
79 static void parse_string PARAMS ((cpp_reader *, cpp_token *, cppchar_t));
80 static bool trigraph_p PARAMS ((cpp_reader *));
81 static void save_comment PARAMS ((cpp_reader *, cpp_token *, const uchar *,
82 cppchar_t));
83 static bool continue_after_nul PARAMS ((cpp_reader *));
84 static int name_p PARAMS ((cpp_reader *, const cpp_string *));
85 static int maybe_read_ucs PARAMS ((cpp_reader *, const unsigned char **,
86 const unsigned char *, cppchar_t *));
87 static tokenrun *next_tokenrun PARAMS ((tokenrun *));
89 static unsigned int hex_digit_value PARAMS ((unsigned int));
90 static _cpp_buff *new_buff PARAMS ((size_t));
/* Prepare for multibyte character conversion.  When MULTIBYTE_CHARS
   is defined, switch LC_CTYPE to the environment's locale and load
   literal_codeset from the LANG environment variable; otherwise do
   nothing.  */
void
_cpp_init_mbchar ()
{
#ifdef MULTIBYTE_CHARS
  setlocale (LC_CTYPE, "");
  GET_ENVIRONMENT (literal_codeset, "LANG");
#endif
}
102 /* Utility routine:
104 Compares, the token TOKEN to the NUL-terminated string STRING.
105 TOKEN must be a CPP_NAME. Returns 1 for equal, 0 for unequal. */
107 cpp_ideq (token, string)
108 const cpp_token *token;
109 const char *string;
111 if (token->type != CPP_NAME)
112 return 0;
114 return !ustrcmp (NODE_NAME (token->val.node), (const uchar *) string);
117 /* Call when meeting a newline, assumed to be in buffer->cur[-1].
118 Returns with buffer->cur pointing to the character immediately
119 following the newline (combination). */
120 static void
121 handle_newline (pfile)
122 cpp_reader *pfile;
124 cpp_buffer *buffer = pfile->buffer;
126 /* Handle CR-LF and LF-CR. Most other implementations (e.g. java)
127 only accept CR-LF; maybe we should fall back to that behavior? */
128 if (buffer->cur[-1] + buffer->cur[0] == '\r' + '\n')
129 buffer->cur++;
131 buffer->line_base = buffer->cur;
132 buffer->col_adjust = 0;
133 pfile->line++;
136 /* Subroutine of skip_escaped_newlines; called when a 3-character
137 sequence beginning with "??" is encountered. buffer->cur points to
138 the second '?'.
140 Warn if necessary, and returns true if the sequence forms a
141 trigraph and the trigraph should be honored. */
142 static bool
143 trigraph_p (pfile)
144 cpp_reader *pfile;
146 cpp_buffer *buffer = pfile->buffer;
147 cppchar_t from_char = buffer->cur[1];
148 bool accept;
150 if (!_cpp_trigraph_map[from_char])
151 return false;
153 accept = CPP_OPTION (pfile, trigraphs);
155 /* Don't warn about trigraphs in comments. */
156 if (CPP_OPTION (pfile, warn_trigraphs) && !pfile->state.lexing_comment)
158 if (accept)
159 cpp_error_with_line (pfile, DL_WARNING,
160 pfile->line, CPP_BUF_COL (buffer) - 1,
161 "trigraph ??%c converted to %c",
162 (int) from_char,
163 (int) _cpp_trigraph_map[from_char]);
164 else if (buffer->cur != buffer->last_Wtrigraphs)
166 buffer->last_Wtrigraphs = buffer->cur;
167 cpp_error_with_line (pfile, DL_WARNING,
168 pfile->line, CPP_BUF_COL (buffer) - 1,
169 "trigraph ??%c ignored", (int) from_char);
173 return accept;
176 /* Skips any escaped newlines introduced by '?' or a '\\', assumed to
177 lie in buffer->cur[-1]. Returns the next byte, which will be in
178 buffer->cur[-1]. This routine performs preprocessing stages 1 and
179 2 of the ISO C standard. */
180 static cppchar_t
181 skip_escaped_newlines (pfile)
182 cpp_reader *pfile;
184 cpp_buffer *buffer = pfile->buffer;
185 cppchar_t next = buffer->cur[-1];
187 /* Only do this if we apply stages 1 and 2. */
188 if (!buffer->from_stage3)
190 const unsigned char *saved_cur;
191 cppchar_t next1;
195 if (next == '?')
197 if (buffer->cur[0] != '?' || !trigraph_p (pfile))
198 break;
200 /* Translate the trigraph. */
201 next = _cpp_trigraph_map[buffer->cur[1]];
202 buffer->cur += 2;
203 if (next != '\\')
204 break;
207 if (buffer->cur == buffer->rlimit)
208 break;
210 /* We have a backslash, and room for at least one more
211 character. Skip horizontal whitespace. */
212 saved_cur = buffer->cur;
214 next1 = *buffer->cur++;
215 while (is_nvspace (next1) && buffer->cur < buffer->rlimit);
217 if (!is_vspace (next1))
219 buffer->cur = saved_cur;
220 break;
223 if (saved_cur != buffer->cur - 1
224 && !pfile->state.lexing_comment)
225 cpp_error (pfile, DL_WARNING,
226 "backslash and newline separated by space");
228 handle_newline (pfile);
229 buffer->backup_to = buffer->cur;
230 if (buffer->cur == buffer->rlimit)
232 cpp_error (pfile, DL_PEDWARN,
233 "backslash-newline at end of file");
234 next = EOF;
236 else
237 next = *buffer->cur++;
239 while (next == '\\' || next == '?');
242 return next;
245 /* Obtain the next character, after trigraph conversion and skipping
246 an arbitrarily long string of escaped newlines. The common case of
247 no trigraphs or escaped newlines falls through quickly. On return,
248 buffer->backup_to points to where to return to if the character is
249 not to be processed. */
250 static cppchar_t
251 get_effective_char (pfile)
252 cpp_reader *pfile;
254 cppchar_t next;
255 cpp_buffer *buffer = pfile->buffer;
257 buffer->backup_to = buffer->cur;
258 next = *buffer->cur++;
259 if (__builtin_expect (next == '?' || next == '\\', 0))
260 next = skip_escaped_newlines (pfile);
262 return next;
265 /* Skip a C-style block comment. We find the end of the comment by
266 seeing if an asterisk is before every '/' we encounter. Returns
267 nonzero if comment terminated by EOF, zero otherwise. */
268 static int
269 skip_block_comment (pfile)
270 cpp_reader *pfile;
272 cpp_buffer *buffer = pfile->buffer;
273 cppchar_t c = EOF, prevc = EOF;
275 pfile->state.lexing_comment = 1;
276 while (buffer->cur != buffer->rlimit)
278 prevc = c, c = *buffer->cur++;
280 /* FIXME: For speed, create a new character class of characters
281 of interest inside block comments. */
282 if (c == '?' || c == '\\')
283 c = skip_escaped_newlines (pfile);
285 /* People like decorating comments with '*', so check for '/'
286 instead for efficiency. */
287 if (c == '/')
289 if (prevc == '*')
290 break;
292 /* Warn about potential nested comments, but not if the '/'
293 comes immediately before the true comment delimiter.
294 Don't bother to get it right across escaped newlines. */
295 if (CPP_OPTION (pfile, warn_comments)
296 && buffer->cur[0] == '*' && buffer->cur[1] != '/')
297 cpp_error_with_line (pfile, DL_WARNING,
298 pfile->line, CPP_BUF_COL (buffer),
299 "\"/*\" within comment");
301 else if (is_vspace (c))
302 handle_newline (pfile);
303 else if (c == '\t')
304 adjust_column (pfile);
307 pfile->state.lexing_comment = 0;
308 return c != '/' || prevc != '*';
311 /* Skip a C++ line comment, leaving buffer->cur pointing to the
312 terminating newline. Handles escaped newlines. Returns nonzero
313 if a multiline comment. */
314 static int
315 skip_line_comment (pfile)
316 cpp_reader *pfile;
318 cpp_buffer *buffer = pfile->buffer;
319 unsigned int orig_line = pfile->line;
320 cppchar_t c;
321 #ifdef MULTIBYTE_CHARS
322 wchar_t wc;
323 int char_len;
324 #endif
326 pfile->state.lexing_comment = 1;
327 #ifdef MULTIBYTE_CHARS
328 /* Reset multibyte conversion state. */
329 (void) local_mbtowc (NULL, NULL, 0);
330 #endif
333 if (buffer->cur == buffer->rlimit)
334 goto at_eof;
336 #ifdef MULTIBYTE_CHARS
337 char_len = local_mbtowc (&wc, (const char *) buffer->cur,
338 buffer->rlimit - buffer->cur);
339 if (char_len == -1)
341 cpp_error (pfile, DL_WARNING,
342 "ignoring invalid multibyte character");
343 char_len = 1;
344 c = *buffer->cur++;
346 else
348 buffer->cur += char_len;
349 c = wc;
351 #else
352 c = *buffer->cur++;
353 #endif
354 if (c == '?' || c == '\\')
355 c = skip_escaped_newlines (pfile);
357 while (!is_vspace (c));
359 /* Step back over the newline, except at EOF. */
360 buffer->cur--;
361 at_eof:
363 pfile->state.lexing_comment = 0;
364 return orig_line != pfile->line;
367 /* pfile->buffer->cur is one beyond the \t character. Update
368 col_adjust so we track the column correctly. */
369 static void
370 adjust_column (pfile)
371 cpp_reader *pfile;
373 cpp_buffer *buffer = pfile->buffer;
374 unsigned int col = CPP_BUF_COL (buffer) - 1; /* Zero-based column. */
376 /* Round it up to multiple of the tabstop, but subtract 1 since the
377 tab itself occupies a character position. */
378 buffer->col_adjust += (CPP_OPTION (pfile, tabstop)
379 - col % CPP_OPTION (pfile, tabstop)) - 1;
382 /* Skips whitespace, saving the next non-whitespace character.
383 Adjusts pfile->col_adjust to account for tabs. Without this,
384 tokens might be assigned an incorrect column. */
385 static int
386 skip_whitespace (pfile, c)
387 cpp_reader *pfile;
388 cppchar_t c;
390 cpp_buffer *buffer = pfile->buffer;
391 unsigned int warned = 0;
395 /* Horizontal space always OK. */
396 if (c == ' ')
398 else if (c == '\t')
399 adjust_column (pfile);
400 /* Just \f \v or \0 left. */
401 else if (c == '\0')
403 if (buffer->cur - 1 == buffer->rlimit)
404 return 0;
405 if (!warned)
407 cpp_error (pfile, DL_WARNING, "null character(s) ignored");
408 warned = 1;
411 else if (pfile->state.in_directive && CPP_PEDANTIC (pfile))
412 cpp_error_with_line (pfile, DL_PEDWARN, pfile->line,
413 CPP_BUF_COL (buffer),
414 "%s in preprocessing directive",
415 c == '\f' ? "form feed" : "vertical tab");
417 c = *buffer->cur++;
419 /* We only want non-vertical space, i.e. ' ' \t \f \v \0. */
420 while (is_nvspace (c));
422 buffer->cur--;
423 return 1;
426 /* See if the characters of a number token are valid in a name (no
427 '.', '+' or '-'). */
428 static int
429 name_p (pfile, string)
430 cpp_reader *pfile;
431 const cpp_string *string;
433 unsigned int i;
435 for (i = 0; i < string->len; i++)
436 if (!is_idchar (string->text[i]))
437 return 0;
439 return 1;
442 /* Parse an identifier, skipping embedded backslash-newlines. This is
443 a critical inner loop. The common case is an identifier which has
444 not been split by backslash-newline, does not contain a dollar
445 sign, and has already been scanned (roughly 10:1 ratio of
446 seen:unseen identifiers in normal code; the distribution is
447 Poisson-like). Second most common case is a new identifier, not
448 split and no dollar sign. The other possibilities are rare and
449 have been relegated to parse_slow. */
450 static cpp_hashnode *
451 parse_identifier (pfile)
452 cpp_reader *pfile;
454 cpp_hashnode *result;
455 const uchar *cur, *base;
457 /* Fast-path loop. Skim over a normal identifier.
458 N.B. ISIDNUM does not include $. */
459 cur = pfile->buffer->cur;
460 while (ISIDNUM (*cur))
461 cur++;
463 /* Check for slow-path cases. */
464 if (*cur == '?' || *cur == '\\' || *cur == '$')
466 unsigned int len;
468 base = parse_slow (pfile, cur, 0, &len);
469 result = (cpp_hashnode *)
470 ht_lookup (pfile->hash_table, base, len, HT_ALLOCED);
472 else
474 base = pfile->buffer->cur - 1;
475 pfile->buffer->cur = cur;
476 result = (cpp_hashnode *)
477 ht_lookup (pfile->hash_table, base, cur - base, HT_ALLOC);
480 /* Rarely, identifiers require diagnostics when lexed.
481 XXX Has to be forced out of the fast path. */
482 if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
483 && !pfile->state.skipping, 0))
485 /* It is allowed to poison the same identifier twice. */
486 if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
487 cpp_error (pfile, DL_ERROR, "attempt to use poisoned \"%s\"",
488 NODE_NAME (result));
490 /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
491 replacement list of a variadic macro. */
492 if (result == pfile->spec_nodes.n__VA_ARGS__
493 && !pfile->state.va_args_ok)
494 cpp_error (pfile, DL_PEDWARN,
495 "__VA_ARGS__ can only appear in the expansion of a C99 variadic macro");
498 return result;
501 /* Slow path. This handles numbers and identifiers which have been
502 split, or contain dollar signs. The part of the token from
503 PFILE->buffer->cur-1 to CUR has already been scanned. NUMBER_P is
504 1 if it's a number, and 2 if it has a leading period. Returns a
505 pointer to the token's NUL-terminated spelling in permanent
506 storage, and sets PLEN to its length. */
507 static uchar *
508 parse_slow (pfile, cur, number_p, plen)
509 cpp_reader *pfile;
510 const uchar *cur;
511 int number_p;
512 unsigned int *plen;
514 cpp_buffer *buffer = pfile->buffer;
515 const uchar *base = buffer->cur - 1;
516 struct obstack *stack = &pfile->hash_table->stack;
517 unsigned int c, prevc, saw_dollar = 0;
519 /* Place any leading period. */
520 if (number_p == 2)
521 obstack_1grow (stack, '.');
523 /* Copy the part of the token which is known to be okay. */
524 obstack_grow (stack, base, cur - base);
526 /* Now process the part which isn't. We are looking at one of
527 '$', '\\', or '?' on entry to this loop. */
528 prevc = cur[-1];
529 c = *cur++;
530 buffer->cur = cur;
531 for (;;)
533 /* Potential escaped newline? */
534 buffer->backup_to = buffer->cur - 1;
535 if (c == '?' || c == '\\')
536 c = skip_escaped_newlines (pfile);
538 if (!is_idchar (c))
540 if (!number_p)
541 break;
542 if (c != '.' && !VALID_SIGN (c, prevc))
543 break;
546 /* Handle normal identifier characters in this loop. */
549 prevc = c;
550 obstack_1grow (stack, c);
552 if (c == '$')
553 saw_dollar++;
555 c = *buffer->cur++;
557 while (is_idchar (c));
560 /* Step back over the unwanted char. */
561 BACKUP ();
563 /* $ is not an identifier character in the standard, but is commonly
564 accepted as an extension. Don't warn about it in skipped
565 conditional blocks. */
566 if (saw_dollar && CPP_PEDANTIC (pfile) && ! pfile->state.skipping)
567 cpp_error (pfile, DL_PEDWARN, "'$' character(s) in identifier or number");
569 /* Identifiers and numbers are null-terminated. */
570 *plen = obstack_object_size (stack);
571 obstack_1grow (stack, '\0');
572 return obstack_finish (stack);
575 /* Parse a number, beginning with character C, skipping embedded
576 backslash-newlines. LEADING_PERIOD is nonzero if there was a "."
577 before C. Place the result in NUMBER. */
578 static void
579 parse_number (pfile, number, leading_period)
580 cpp_reader *pfile;
581 cpp_string *number;
582 int leading_period;
584 const uchar *cur;
586 /* Fast-path loop. Skim over a normal number.
587 N.B. ISIDNUM does not include $. */
588 cur = pfile->buffer->cur;
589 while (ISIDNUM (*cur) || *cur == '.' || VALID_SIGN (*cur, cur[-1]))
590 cur++;
592 /* Check for slow-path cases. */
593 if (*cur == '?' || *cur == '\\' || *cur == '$')
594 number->text = parse_slow (pfile, cur, 1 + leading_period, &number->len);
595 else
597 const uchar *base = pfile->buffer->cur - 1;
598 uchar *dest;
600 number->len = cur - base + leading_period;
601 dest = _cpp_unaligned_alloc (pfile, number->len + 1);
602 dest[number->len] = '\0';
603 number->text = dest;
605 if (leading_period)
606 *dest++ = '.';
607 memcpy (dest, base, cur - base);
608 pfile->buffer->cur = cur;
612 /* Subroutine of parse_string. */
613 static int
614 unescaped_terminator_p (pfile, dest)
615 cpp_reader *pfile;
616 const unsigned char *dest;
618 const unsigned char *start, *temp;
620 /* In #include-style directives, terminators are not escapable. */
621 if (pfile->state.angled_headers)
622 return 1;
624 start = BUFF_FRONT (pfile->u_buff);
626 /* An odd number of consecutive backslashes represents an escaped
627 terminator. */
628 for (temp = dest; temp > start && temp[-1] == '\\'; temp--)
631 return ((dest - temp) & 1) == 0;
634 /* Parses a string, character constant, or angle-bracketed header file
635 name. Handles embedded trigraphs and escaped newlines. The stored
636 string is guaranteed NUL-terminated, but it is not guaranteed that
637 this is the first NUL since embedded NULs are preserved.
639 When this function returns, buffer->cur points to the next
640 character to be processed. */
641 static void
642 parse_string (pfile, token, terminator)
643 cpp_reader *pfile;
644 cpp_token *token;
645 cppchar_t terminator;
647 cpp_buffer *buffer = pfile->buffer;
648 unsigned char *dest, *limit;
649 cppchar_t c;
650 bool warned_nulls = false;
651 #ifdef MULTIBYTE_CHARS
652 wchar_t wc;
653 int char_len;
654 #endif
656 dest = BUFF_FRONT (pfile->u_buff);
657 limit = BUFF_LIMIT (pfile->u_buff);
659 #ifdef MULTIBYTE_CHARS
660 /* Reset multibyte conversion state. */
661 (void) local_mbtowc (NULL, NULL, 0);
662 #endif
663 for (;;)
665 /* We need room for another char, possibly the terminating NUL. */
666 if ((size_t) (limit - dest) < 1)
668 size_t len_so_far = dest - BUFF_FRONT (pfile->u_buff);
669 _cpp_extend_buff (pfile, &pfile->u_buff, 2);
670 dest = BUFF_FRONT (pfile->u_buff) + len_so_far;
671 limit = BUFF_LIMIT (pfile->u_buff);
674 #ifdef MULTIBYTE_CHARS
675 char_len = local_mbtowc (&wc, (const char *) buffer->cur,
676 buffer->rlimit - buffer->cur);
677 if (char_len == -1)
679 cpp_error (pfile, DL_WARNING,
680 "ignoring invalid multibyte character");
681 char_len = 1;
682 c = *buffer->cur++;
684 else
686 buffer->cur += char_len;
687 c = wc;
689 #else
690 c = *buffer->cur++;
691 #endif
693 /* Handle trigraphs, escaped newlines etc. */
694 if (c == '?' || c == '\\')
695 c = skip_escaped_newlines (pfile);
697 if (c == terminator)
699 if (unescaped_terminator_p (pfile, dest))
700 break;
702 else if (is_vspace (c))
704 /* No string literal may extend over multiple lines. In
705 assembly language, suppress the error except for <>
706 includes. This is a kludge around not knowing where
707 comments are. */
708 unterminated:
709 if (CPP_OPTION (pfile, lang) != CLK_ASM || terminator == '>')
710 cpp_error (pfile, DL_ERROR, "missing terminating %c character",
711 (int) terminator);
712 buffer->cur--;
713 break;
715 else if (c == '\0')
717 if (buffer->cur - 1 == buffer->rlimit)
718 goto unterminated;
719 if (!warned_nulls)
721 warned_nulls = true;
722 cpp_error (pfile, DL_WARNING,
723 "null character(s) preserved in literal");
726 #ifdef MULTIBYTE_CHARS
727 if (char_len > 1)
729 for ( ; char_len > 0; --char_len)
730 *dest++ = (*buffer->cur - char_len);
732 else
733 #endif
734 *dest++ = c;
737 *dest = '\0';
739 token->val.str.text = BUFF_FRONT (pfile->u_buff);
740 token->val.str.len = dest - BUFF_FRONT (pfile->u_buff);
741 BUFF_FRONT (pfile->u_buff) = dest + 1;
744 /* The stored comment includes the comment start and any terminator. */
745 static void
746 save_comment (pfile, token, from, type)
747 cpp_reader *pfile;
748 cpp_token *token;
749 const unsigned char *from;
750 cppchar_t type;
752 unsigned char *buffer;
753 unsigned int len, clen;
755 len = pfile->buffer->cur - from + 1; /* + 1 for the initial '/'. */
757 /* C++ comments probably (not definitely) have moved past a new
758 line, which we don't want to save in the comment. */
759 if (is_vspace (pfile->buffer->cur[-1]))
760 len--;
762 /* If we are currently in a directive, then we need to store all
763 C++ comments as C comments internally, and so we need to
764 allocate a little extra space in that case.
766 Note that the only time we encounter a directive here is
767 when we are saving comments in a "#define". */
768 clen = (pfile->state.in_directive && type == '/') ? len + 2 : len;
770 buffer = _cpp_unaligned_alloc (pfile, clen);
772 token->type = CPP_COMMENT;
773 token->val.str.len = clen;
774 token->val.str.text = buffer;
776 buffer[0] = '/';
777 memcpy (buffer + 1, from, len - 1);
779 /* Finish conversion to a C comment, if necessary. */
780 if (pfile->state.in_directive && type == '/')
782 buffer[1] = '*';
783 buffer[clen - 2] = '*';
784 buffer[clen - 1] = '/';
788 /* Allocate COUNT tokens for RUN. */
789 void
790 _cpp_init_tokenrun (run, count)
791 tokenrun *run;
792 unsigned int count;
794 run->base = xnewvec (cpp_token, count);
795 run->limit = run->base + count;
796 run->next = NULL;
799 /* Returns the next tokenrun, or creates one if there is none. */
800 static tokenrun *
801 next_tokenrun (run)
802 tokenrun *run;
804 if (run->next == NULL)
806 run->next = xnew (tokenrun);
807 run->next->prev = run;
808 _cpp_init_tokenrun (run->next, 250);
811 return run->next;
814 /* Allocate a single token that is invalidated at the same time as the
815 rest of the tokens on the line. Has its line and col set to the
816 same as the last lexed token, so that diagnostics appear in the
817 right place. */
818 cpp_token *
819 _cpp_temp_token (pfile)
820 cpp_reader *pfile;
822 cpp_token *old, *result;
824 old = pfile->cur_token - 1;
825 if (pfile->cur_token == pfile->cur_run->limit)
827 pfile->cur_run = next_tokenrun (pfile->cur_run);
828 pfile->cur_token = pfile->cur_run->base;
831 result = pfile->cur_token++;
832 result->line = old->line;
833 result->col = old->col;
834 return result;
837 /* Lex a token into RESULT (external interface). Takes care of issues
838 like directive handling, token lookahead, multiple include
839 optimization and skipping. */
840 const cpp_token *
841 _cpp_lex_token (pfile)
842 cpp_reader *pfile;
844 cpp_token *result;
846 for (;;)
848 if (pfile->cur_token == pfile->cur_run->limit)
850 pfile->cur_run = next_tokenrun (pfile->cur_run);
851 pfile->cur_token = pfile->cur_run->base;
854 if (pfile->lookaheads)
856 pfile->lookaheads--;
857 result = pfile->cur_token++;
859 else
860 result = _cpp_lex_direct (pfile);
862 if (result->flags & BOL)
864 /* Is this a directive. If _cpp_handle_directive returns
865 false, it is an assembler #. */
866 if (result->type == CPP_HASH
867 /* 6.10.3 p 11: Directives in a list of macro arguments
868 gives undefined behavior. This implementation
869 handles the directive as normal. */
870 && pfile->state.parsing_args != 1
871 && _cpp_handle_directive (pfile, result->flags & PREV_WHITE))
872 continue;
873 if (pfile->cb.line_change && !pfile->state.skipping)
874 (*pfile->cb.line_change)(pfile, result, pfile->state.parsing_args);
877 /* We don't skip tokens in directives. */
878 if (pfile->state.in_directive)
879 break;
881 /* Outside a directive, invalidate controlling macros. At file
882 EOF, _cpp_lex_direct takes care of popping the buffer, so we never
883 get here and MI optimisation works. */
884 pfile->mi_valid = false;
886 if (!pfile->state.skipping || result->type == CPP_EOF)
887 break;
890 return result;
893 /* A NUL terminates the current buffer. For ISO preprocessing this is
894 EOF, but for traditional preprocessing it indicates we need a line
895 refill. Returns TRUE to continue preprocessing a new buffer, FALSE
896 to return a CPP_EOF to the caller. */
897 static bool
898 continue_after_nul (pfile)
899 cpp_reader *pfile;
901 cpp_buffer *buffer = pfile->buffer;
902 bool more = false;
904 buffer->saved_flags = BOL;
905 if (CPP_OPTION (pfile, traditional))
907 if (pfile->state.in_directive)
908 return false;
910 _cpp_remove_overlay (pfile);
911 more = _cpp_read_logical_line_trad (pfile);
912 _cpp_overlay_buffer (pfile, pfile->out.base,
913 pfile->out.cur - pfile->out.base);
914 pfile->line = pfile->out.first_line;
916 else
918 /* Stop parsing arguments with a CPP_EOF. When we finally come
919 back here, do the work of popping the buffer. */
920 if (!pfile->state.parsing_args)
922 if (buffer->cur != buffer->line_base)
924 /* Non-empty files should end in a newline. Don't warn
925 for command line and _Pragma buffers. */
926 if (!buffer->from_stage3)
927 cpp_error (pfile, DL_PEDWARN, "no newline at end of file");
928 handle_newline (pfile);
931 /* Similarly, finish an in-progress directive with CPP_EOF
932 before popping the buffer. */
933 if (!pfile->state.in_directive && buffer->prev)
935 more = !buffer->return_at_eof;
936 _cpp_pop_buffer (pfile);
941 return more;
/* Read the next effective character.  If it is CHAR, consume it and
   set the token type to THEN_TYPE; otherwise rewind and set the
   type to ELSE_TYPE.  Requires `pfile', `buffer' and `result' to be
   in scope at the point of use.  */
#define IF_NEXT_IS(CHAR, THEN_TYPE, ELSE_TYPE)          \
  do                                                    \
    {                                                   \
      if (get_effective_char (pfile) == (CHAR))         \
        result->type = (THEN_TYPE);                     \
      else                                              \
        {                                               \
          BACKUP ();                                    \
          result->type = (ELSE_TYPE);                   \
        }                                               \
    }                                                   \
  while (0)
955 /* Lex a token into pfile->cur_token, which is also incremented, to
956 get diagnostics pointing to the correct location.
958 Does not handle issues such as token lookahead, multiple-include
959 optimisation, directives, skipping etc. This function is only
960 suitable for use by _cpp_lex_token, and in special cases like
961 lex_expansion_token which doesn't care for any of these issues.
963 When meeting a newline, returns CPP_EOF if parsing a directive,
964 otherwise returns to the start of the token buffer if permissible.
965 Returns the location of the lexed token. */
966 cpp_token *
967 _cpp_lex_direct (pfile)
968 cpp_reader *pfile;
970 cppchar_t c;
971 cpp_buffer *buffer;
972 const unsigned char *comment_start;
973 cpp_token *result = pfile->cur_token++;
975 fresh_line:
976 buffer = pfile->buffer;
977 result->flags = buffer->saved_flags;
978 buffer->saved_flags = 0;
979 update_tokens_line:
980 result->line = pfile->line;
982 skipped_white:
983 c = *buffer->cur++;
984 result->col = CPP_BUF_COLUMN (buffer, buffer->cur);
986 trigraph:
987 switch (c)
989 case ' ': case '\t': case '\f': case '\v': case '\0':
990 result->flags |= PREV_WHITE;
991 if (skip_whitespace (pfile, c))
992 goto skipped_white;
994 /* End of buffer. */
995 buffer->cur--;
996 if (continue_after_nul (pfile))
997 goto fresh_line;
998 result->type = CPP_EOF;
999 break;
1001 case '\n': case '\r':
1002 handle_newline (pfile);
1003 buffer->saved_flags = BOL;
1004 if (! pfile->state.in_directive)
1006 if (pfile->state.parsing_args == 2)
1007 buffer->saved_flags |= PREV_WHITE;
1008 if (!pfile->keep_tokens)
1010 pfile->cur_run = &pfile->base_run;
1011 result = pfile->base_run.base;
1012 pfile->cur_token = result + 1;
1014 goto fresh_line;
1016 result->type = CPP_EOF;
1017 break;
1019 case '?':
1020 case '\\':
1021 /* These could start an escaped newline, or '?' a trigraph. Let
1022 skip_escaped_newlines do all the work. */
1024 unsigned int line = pfile->line;
1026 c = skip_escaped_newlines (pfile);
1027 if (line != pfile->line)
1029 buffer->cur--;
1030 /* We had at least one escaped newline of some sort.
1031 Update the token's line and column. */
1032 goto update_tokens_line;
1036 /* We are either the original '?' or '\\', or a trigraph. */
1037 if (c == '?')
1038 result->type = CPP_QUERY;
1039 else if (c == '\\')
1040 goto random_char;
1041 else
1042 goto trigraph;
1043 break;
1045 case '0': case '1': case '2': case '3': case '4':
1046 case '5': case '6': case '7': case '8': case '9':
1047 result->type = CPP_NUMBER;
1048 parse_number (pfile, &result->val.str, 0);
1049 break;
1051 case 'L':
1052 /* 'L' may introduce wide characters or strings. */
1054 const unsigned char *pos = buffer->cur;
1056 c = get_effective_char (pfile);
1057 if (c == '\'' || c == '"')
1059 result->type = (c == '"' ? CPP_WSTRING: CPP_WCHAR);
1060 parse_string (pfile, result, c);
1061 break;
1063 buffer->cur = pos;
1065 /* Fall through. */
1067 start_ident:
1068 case '_':
1069 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
1070 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
1071 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
1072 case 's': case 't': case 'u': case 'v': case 'w': case 'x':
1073 case 'y': case 'z':
1074 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
1075 case 'G': case 'H': case 'I': case 'J': case 'K':
1076 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
1077 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
1078 case 'Y': case 'Z':
1079 result->type = CPP_NAME;
1080 result->val.node = parse_identifier (pfile);
1082 /* Convert named operators to their proper types. */
1083 if (result->val.node->flags & NODE_OPERATOR)
1085 result->flags |= NAMED_OP;
1086 result->type = result->val.node->directive_index;
1088 break;
1090 case '\'':
1091 case '"':
1092 result->type = c == '"' ? CPP_STRING: CPP_CHAR;
1093 parse_string (pfile, result, c);
1094 break;
1096 case '/':
1097 /* A potential block or line comment. */
1098 comment_start = buffer->cur;
1099 c = get_effective_char (pfile);
1101 if (c == '*')
1103 if (skip_block_comment (pfile))
1104 cpp_error (pfile, DL_ERROR, "unterminated comment");
1106 else if (c == '/' && (CPP_OPTION (pfile, cplusplus_comments)
1107 || CPP_IN_SYSTEM_HEADER (pfile)))
1109 /* Warn about comments only if pedantically GNUC89, and not
1110 in system headers. */
1111 if (CPP_OPTION (pfile, lang) == CLK_GNUC89 && CPP_PEDANTIC (pfile)
1112 && ! buffer->warned_cplusplus_comments)
1114 cpp_error (pfile, DL_PEDWARN,
1115 "C++ style comments are not allowed in ISO C90");
1116 cpp_error (pfile, DL_PEDWARN,
1117 "(this will be reported only once per input file)");
1118 buffer->warned_cplusplus_comments = 1;
1121 if (skip_line_comment (pfile) && CPP_OPTION (pfile, warn_comments))
1122 cpp_error (pfile, DL_WARNING, "multi-line comment");
1124 else if (c == '=')
1126 result->type = CPP_DIV_EQ;
1127 break;
1129 else
1131 BACKUP ();
1132 result->type = CPP_DIV;
1133 break;
1136 if (!pfile->state.save_comments)
1138 result->flags |= PREV_WHITE;
1139 goto update_tokens_line;
1142 /* Save the comment as a token in its own right. */
1143 save_comment (pfile, result, comment_start, c);
1144 break;
1146 case '<':
1147 if (pfile->state.angled_headers)
1149 result->type = CPP_HEADER_NAME;
1150 parse_string (pfile, result, '>');
1151 break;
1154 c = get_effective_char (pfile);
1155 if (c == '=')
1156 result->type = CPP_LESS_EQ;
1157 else if (c == '<')
1158 IF_NEXT_IS ('=', CPP_LSHIFT_EQ, CPP_LSHIFT);
1159 else if (c == '?' && CPP_OPTION (pfile, cplusplus))
1160 IF_NEXT_IS ('=', CPP_MIN_EQ, CPP_MIN);
1161 else if (c == ':' && CPP_OPTION (pfile, digraphs))
1163 result->type = CPP_OPEN_SQUARE;
1164 result->flags |= DIGRAPH;
1166 else if (c == '%' && CPP_OPTION (pfile, digraphs))
1168 result->type = CPP_OPEN_BRACE;
1169 result->flags |= DIGRAPH;
1171 else
1173 BACKUP ();
1174 result->type = CPP_LESS;
1176 break;
1178 case '>':
1179 c = get_effective_char (pfile);
1180 if (c == '=')
1181 result->type = CPP_GREATER_EQ;
1182 else if (c == '>')
1183 IF_NEXT_IS ('=', CPP_RSHIFT_EQ, CPP_RSHIFT);
1184 else if (c == '?' && CPP_OPTION (pfile, cplusplus))
1185 IF_NEXT_IS ('=', CPP_MAX_EQ, CPP_MAX);
1186 else
1188 BACKUP ();
1189 result->type = CPP_GREATER;
1191 break;
1193 case '%':
1194 c = get_effective_char (pfile);
1195 if (c == '=')
1196 result->type = CPP_MOD_EQ;
1197 else if (CPP_OPTION (pfile, digraphs) && c == ':')
1199 result->flags |= DIGRAPH;
1200 result->type = CPP_HASH;
1201 if (get_effective_char (pfile) == '%')
1203 const unsigned char *pos = buffer->cur;
1205 if (get_effective_char (pfile) == ':')
1206 result->type = CPP_PASTE;
1207 else
1208 buffer->cur = pos - 1;
1210 else
1211 BACKUP ();
1213 else if (CPP_OPTION (pfile, digraphs) && c == '>')
1215 result->flags |= DIGRAPH;
1216 result->type = CPP_CLOSE_BRACE;
1218 else
1220 BACKUP ();
1221 result->type = CPP_MOD;
1223 break;
1225 case '.':
1226 result->type = CPP_DOT;
1227 c = get_effective_char (pfile);
1228 if (c == '.')
1230 const unsigned char *pos = buffer->cur;
1232 if (get_effective_char (pfile) == '.')
1233 result->type = CPP_ELLIPSIS;
1234 else
1235 buffer->cur = pos - 1;
1237 /* All known character sets have 0...9 contiguous. */
1238 else if (ISDIGIT (c))
1240 result->type = CPP_NUMBER;
1241 parse_number (pfile, &result->val.str, 1);
1243 else if (c == '*' && CPP_OPTION (pfile, cplusplus))
1244 result->type = CPP_DOT_STAR;
1245 else
1246 BACKUP ();
1247 break;
1249 case '+':
1250 c = get_effective_char (pfile);
1251 if (c == '+')
1252 result->type = CPP_PLUS_PLUS;
1253 else if (c == '=')
1254 result->type = CPP_PLUS_EQ;
1255 else
1257 BACKUP ();
1258 result->type = CPP_PLUS;
1260 break;
1262 case '-':
1263 c = get_effective_char (pfile);
1264 if (c == '>')
1266 result->type = CPP_DEREF;
1267 if (CPP_OPTION (pfile, cplusplus))
1269 if (get_effective_char (pfile) == '*')
1270 result->type = CPP_DEREF_STAR;
1271 else
1272 BACKUP ();
1275 else if (c == '-')
1276 result->type = CPP_MINUS_MINUS;
1277 else if (c == '=')
1278 result->type = CPP_MINUS_EQ;
1279 else
1281 BACKUP ();
1282 result->type = CPP_MINUS;
1284 break;
1286 case '&':
1287 c = get_effective_char (pfile);
1288 if (c == '&')
1289 result->type = CPP_AND_AND;
1290 else if (c == '=')
1291 result->type = CPP_AND_EQ;
1292 else
1294 BACKUP ();
1295 result->type = CPP_AND;
1297 break;
1299 case '|':
1300 c = get_effective_char (pfile);
1301 if (c == '|')
1302 result->type = CPP_OR_OR;
1303 else if (c == '=')
1304 result->type = CPP_OR_EQ;
1305 else
1307 BACKUP ();
1308 result->type = CPP_OR;
1310 break;
1312 case ':':
1313 c = get_effective_char (pfile);
1314 if (c == ':' && CPP_OPTION (pfile, cplusplus))
1315 result->type = CPP_SCOPE;
1316 else if (c == '>' && CPP_OPTION (pfile, digraphs))
1318 result->flags |= DIGRAPH;
1319 result->type = CPP_CLOSE_SQUARE;
1321 else
1323 BACKUP ();
1324 result->type = CPP_COLON;
1326 break;
1328 case '*': IF_NEXT_IS ('=', CPP_MULT_EQ, CPP_MULT); break;
1329 case '=': IF_NEXT_IS ('=', CPP_EQ_EQ, CPP_EQ); break;
1330 case '!': IF_NEXT_IS ('=', CPP_NOT_EQ, CPP_NOT); break;
1331 case '^': IF_NEXT_IS ('=', CPP_XOR_EQ, CPP_XOR); break;
1332 case '#': IF_NEXT_IS ('#', CPP_PASTE, CPP_HASH); break;
1334 case '~': result->type = CPP_COMPL; break;
1335 case ',': result->type = CPP_COMMA; break;
1336 case '(': result->type = CPP_OPEN_PAREN; break;
1337 case ')': result->type = CPP_CLOSE_PAREN; break;
1338 case '[': result->type = CPP_OPEN_SQUARE; break;
1339 case ']': result->type = CPP_CLOSE_SQUARE; break;
1340 case '{': result->type = CPP_OPEN_BRACE; break;
1341 case '}': result->type = CPP_CLOSE_BRACE; break;
1342 case ';': result->type = CPP_SEMICOLON; break;
1344 /* @ is a punctuator in Objective-C. */
1345 case '@': result->type = CPP_ATSIGN; break;
1347 case '$':
1348 if (CPP_OPTION (pfile, dollars_in_ident))
1349 goto start_ident;
1350 /* Fall through... */
1352 random_char:
1353 default:
1354 result->type = CPP_OTHER;
1355 result->val.c = c;
1356 break;
1359 return result;
1362 /* An upper bound on the number of bytes needed to spell TOKEN,
1363 including preceding whitespace. */
1364 unsigned int
1365 cpp_token_len (token)
1366 const cpp_token *token;
1368 unsigned int len;
1370 switch (TOKEN_SPELL (token))
1372 default: len = 0; break;
1373 case SPELL_NUMBER:
1374 case SPELL_STRING: len = token->val.str.len; break;
1375 case SPELL_IDENT: len = NODE_LEN (token->val.node); break;
1377 /* 1 for whitespace, 4 for comment delimiters. */
1378 return len + 5;
1381 /* Write the spelling of a token TOKEN to BUFFER. The buffer must
1382 already contain the enough space to hold the token's spelling.
1383 Returns a pointer to the character after the last character
1384 written. */
1385 unsigned char *
1386 cpp_spell_token (pfile, token, buffer)
1387 cpp_reader *pfile; /* Would be nice to be rid of this... */
1388 const cpp_token *token;
1389 unsigned char *buffer;
1391 switch (TOKEN_SPELL (token))
1393 case SPELL_OPERATOR:
1395 const unsigned char *spelling;
1396 unsigned char c;
1398 if (token->flags & DIGRAPH)
1399 spelling
1400 = digraph_spellings[(int) token->type - (int) CPP_FIRST_DIGRAPH];
1401 else if (token->flags & NAMED_OP)
1402 goto spell_ident;
1403 else
1404 spelling = TOKEN_NAME (token);
1406 while ((c = *spelling++) != '\0')
1407 *buffer++ = c;
1409 break;
1411 case SPELL_CHAR:
1412 *buffer++ = token->val.c;
1413 break;
1415 spell_ident:
1416 case SPELL_IDENT:
1417 memcpy (buffer, NODE_NAME (token->val.node), NODE_LEN (token->val.node));
1418 buffer += NODE_LEN (token->val.node);
1419 break;
1421 case SPELL_NUMBER:
1422 memcpy (buffer, token->val.str.text, token->val.str.len);
1423 buffer += token->val.str.len;
1424 break;
1426 case SPELL_STRING:
1428 int left, right, tag;
1429 switch (token->type)
1431 case CPP_STRING: left = '"'; right = '"'; tag = '\0'; break;
1432 case CPP_WSTRING: left = '"'; right = '"'; tag = 'L'; break;
1433 case CPP_CHAR: left = '\''; right = '\''; tag = '\0'; break;
1434 case CPP_WCHAR: left = '\''; right = '\''; tag = 'L'; break;
1435 case CPP_HEADER_NAME: left = '<'; right = '>'; tag = '\0'; break;
1436 default:
1437 cpp_error (pfile, DL_ICE, "unknown string token %s\n",
1438 TOKEN_NAME (token));
1439 return buffer;
1441 if (tag) *buffer++ = tag;
1442 *buffer++ = left;
1443 memcpy (buffer, token->val.str.text, token->val.str.len);
1444 buffer += token->val.str.len;
1445 *buffer++ = right;
1447 break;
1449 case SPELL_NONE:
1450 cpp_error (pfile, DL_ICE, "unspellable token %s", TOKEN_NAME (token));
1451 break;
1454 return buffer;
1457 /* Returns TOKEN spelt as a null-terminated string. The string is
1458 freed when the reader is destroyed. Useful for diagnostics. */
1459 unsigned char *
1460 cpp_token_as_text (pfile, token)
1461 cpp_reader *pfile;
1462 const cpp_token *token;
1464 unsigned int len = cpp_token_len (token);
1465 unsigned char *start = _cpp_unaligned_alloc (pfile, len), *end;
1467 end = cpp_spell_token (pfile, token, start);
1468 end[0] = '\0';
1470 return start;
1473 /* Used by C front ends, which really should move to using
1474 cpp_token_as_text. */
1475 const char *
1476 cpp_type2name (type)
1477 enum cpp_ttype type;
1479 return (const char *) token_spellings[type].name;
1482 /* Writes the spelling of token to FP, without any preceding space.
1483 Separated from cpp_spell_token for efficiency - to avoid stdio
1484 double-buffering. */
1485 void
1486 cpp_output_token (token, fp)
1487 const cpp_token *token;
1488 FILE *fp;
1490 switch (TOKEN_SPELL (token))
1492 case SPELL_OPERATOR:
1494 const unsigned char *spelling;
1495 int c;
1497 if (token->flags & DIGRAPH)
1498 spelling
1499 = digraph_spellings[(int) token->type - (int) CPP_FIRST_DIGRAPH];
1500 else if (token->flags & NAMED_OP)
1501 goto spell_ident;
1502 else
1503 spelling = TOKEN_NAME (token);
1505 c = *spelling;
1507 putc (c, fp);
1508 while ((c = *++spelling) != '\0');
1510 break;
1512 case SPELL_CHAR:
1513 putc (token->val.c, fp);
1514 break;
1516 spell_ident:
1517 case SPELL_IDENT:
1518 fwrite (NODE_NAME (token->val.node), 1, NODE_LEN (token->val.node), fp);
1519 break;
1521 case SPELL_NUMBER:
1522 fwrite (token->val.str.text, 1, token->val.str.len, fp);
1523 break;
1525 case SPELL_STRING:
1527 int left, right, tag;
1528 switch (token->type)
1530 case CPP_STRING: left = '"'; right = '"'; tag = '\0'; break;
1531 case CPP_WSTRING: left = '"'; right = '"'; tag = 'L'; break;
1532 case CPP_CHAR: left = '\''; right = '\''; tag = '\0'; break;
1533 case CPP_WCHAR: left = '\''; right = '\''; tag = 'L'; break;
1534 case CPP_HEADER_NAME: left = '<'; right = '>'; tag = '\0'; break;
1535 default:
1536 fprintf (stderr, "impossible STRING token %s\n", TOKEN_NAME (token));
1537 return;
1539 if (tag) putc (tag, fp);
1540 putc (left, fp);
1541 fwrite (token->val.str.text, 1, token->val.str.len, fp);
1542 putc (right, fp);
1544 break;
1546 case SPELL_NONE:
1547 /* An error, most probably. */
1548 break;
1552 /* Compare two tokens. */
1554 _cpp_equiv_tokens (a, b)
1555 const cpp_token *a, *b;
1557 if (a->type == b->type && a->flags == b->flags)
1558 switch (TOKEN_SPELL (a))
1560 default: /* Keep compiler happy. */
1561 case SPELL_OPERATOR:
1562 return 1;
1563 case SPELL_CHAR:
1564 return a->val.c == b->val.c; /* Character. */
1565 case SPELL_NONE:
1566 return (a->type != CPP_MACRO_ARG || a->val.arg_no == b->val.arg_no);
1567 case SPELL_IDENT:
1568 return a->val.node == b->val.node;
1569 case SPELL_NUMBER:
1570 case SPELL_STRING:
1571 return (a->val.str.len == b->val.str.len
1572 && !memcmp (a->val.str.text, b->val.str.text,
1573 a->val.str.len));
1576 return 0;
1579 /* Returns nonzero if a space should be inserted to avoid an
1580 accidental token paste for output. For simplicity, it is
1581 conservative, and occasionally advises a space where one is not
1582 needed, e.g. "." and ".2". */
1584 cpp_avoid_paste (pfile, token1, token2)
1585 cpp_reader *pfile;
1586 const cpp_token *token1, *token2;
1588 enum cpp_ttype a = token1->type, b = token2->type;
1589 cppchar_t c;
1591 if (token1->flags & NAMED_OP)
1592 a = CPP_NAME;
1593 if (token2->flags & NAMED_OP)
1594 b = CPP_NAME;
1596 c = EOF;
1597 if (token2->flags & DIGRAPH)
1598 c = digraph_spellings[(int) b - (int) CPP_FIRST_DIGRAPH][0];
1599 else if (token_spellings[b].category == SPELL_OPERATOR)
1600 c = token_spellings[b].name[0];
1602 /* Quickly get everything that can paste with an '='. */
1603 if ((int) a <= (int) CPP_LAST_EQ && c == '=')
1604 return 1;
1606 switch (a)
1608 case CPP_GREATER: return c == '>' || c == '?';
1609 case CPP_LESS: return c == '<' || c == '?' || c == '%' || c == ':';
1610 case CPP_PLUS: return c == '+';
1611 case CPP_MINUS: return c == '-' || c == '>';
1612 case CPP_DIV: return c == '/' || c == '*'; /* Comments. */
1613 case CPP_MOD: return c == ':' || c == '>';
1614 case CPP_AND: return c == '&';
1615 case CPP_OR: return c == '|';
1616 case CPP_COLON: return c == ':' || c == '>';
1617 case CPP_DEREF: return c == '*';
1618 case CPP_DOT: return c == '.' || c == '%' || b == CPP_NUMBER;
1619 case CPP_HASH: return c == '#' || c == '%'; /* Digraph form. */
1620 case CPP_NAME: return ((b == CPP_NUMBER
1621 && name_p (pfile, &token2->val.str))
1622 || b == CPP_NAME
1623 || b == CPP_CHAR || b == CPP_STRING); /* L */
1624 case CPP_NUMBER: return (b == CPP_NUMBER || b == CPP_NAME
1625 || c == '.' || c == '+' || c == '-');
1626 case CPP_OTHER: return (CPP_OPTION (pfile, objc)
1627 && token1->val.c == '@'
1628 && (b == CPP_NAME || b == CPP_STRING));
1629 default: break;
1632 return 0;
1635 /* Output all the remaining tokens on the current line, and a newline
1636 character, to FP. Leading whitespace is removed. If there are
1637 macros, special token padding is not performed. */
1638 void
1639 cpp_output_line (pfile, fp)
1640 cpp_reader *pfile;
1641 FILE *fp;
1643 const cpp_token *token;
1645 token = cpp_get_token (pfile);
1646 while (token->type != CPP_EOF)
1648 cpp_output_token (token, fp);
1649 token = cpp_get_token (pfile);
1650 if (token->flags & PREV_WHITE)
1651 putc (' ', fp);
1654 putc ('\n', fp);
/* Returns the value of the hexadecimal digit C.  Aborts if C is not a
   hexadecimal digit, so callers are expected to have checked it
   first.  */
static unsigned int
hex_digit_value (c)
     unsigned int c;
{
  if (!hex_p (c))
    abort ();

  return hex_value (c);
}
1668 /* Parse a '\uNNNN' or '\UNNNNNNNN' sequence. Returns 1 to indicate
1669 failure if cpplib is not parsing C++ or C99. Such failure is
1670 silent, and no variables are updated. Otherwise returns 0, and
1671 warns if -Wtraditional.
1673 [lex.charset]: The character designated by the universal character
1674 name \UNNNNNNNN is that character whose character short name in
1675 ISO/IEC 10646 is NNNNNNNN; the character designated by the
1676 universal character name \uNNNN is that character whose character
1677 short name in ISO/IEC 10646 is 0000NNNN. If the hexadecimal value
1678 for a universal character name is less than 0x20 or in the range
1679 0x7F-0x9F (inclusive), or if the universal character name
1680 designates a character in the basic source character set, then the
1681 program is ill-formed.
1683 We assume that wchar_t is Unicode, so we don't need to do any
1684 mapping. Is this ever wrong?
1686 PC points to the 'u' or 'U', PSTR is points to the byte after PC,
1687 LIMIT is the end of the string or charconst. PSTR is updated to
1688 point after the UCS on return, and the UCS is written into PC. */
1690 static int
1691 maybe_read_ucs (pfile, pstr, limit, pc)
1692 cpp_reader *pfile;
1693 const unsigned char **pstr;
1694 const unsigned char *limit;
1695 cppchar_t *pc;
1697 const unsigned char *p = *pstr;
1698 unsigned int code = 0;
1699 unsigned int c = *pc, length;
1701 /* Only attempt to interpret a UCS for C++ and C99. */
1702 if (! (CPP_OPTION (pfile, cplusplus) || CPP_OPTION (pfile, c99)))
1703 return 1;
1705 if (CPP_WTRADITIONAL (pfile))
1706 cpp_error (pfile, DL_WARNING,
1707 "the meaning of '\\%c' is different in traditional C", c);
1709 length = (c == 'u' ? 4: 8);
1711 if ((size_t) (limit - p) < length)
1713 cpp_error (pfile, DL_ERROR, "incomplete universal-character-name");
1714 /* Skip to the end to avoid more diagnostics. */
1715 p = limit;
1717 else
1719 for (; length; length--, p++)
1721 c = *p;
1722 if (ISXDIGIT (c))
1723 code = (code << 4) + hex_digit_value (c);
1724 else
1726 cpp_error (pfile, DL_ERROR,
1727 "non-hex digit '%c' in universal-character-name", c);
1728 /* We shouldn't skip in case there are multibyte chars. */
1729 break;
1734 #ifdef TARGET_EBCDIC
1735 cpp_error (pfile, DL_ERROR, "universal-character-name on EBCDIC target");
1736 code = 0x3f; /* EBCDIC invalid character */
1737 #else
1738 /* True extended characters are OK. */
1739 if (code >= 0xa0
1740 && !(code & 0x80000000)
1741 && !(code >= 0xD800 && code <= 0xDFFF))
1743 /* The standard permits $, @ and ` to be specified as UCNs. We use
1744 hex escapes so that this also works with EBCDIC hosts. */
1745 else if (code == 0x24 || code == 0x40 || code == 0x60)
1747 /* Don't give another error if one occurred above. */
1748 else if (length == 0)
1749 cpp_error (pfile, DL_ERROR, "universal-character-name out of range");
1750 #endif
1752 *pstr = p;
1753 *pc = code;
1754 return 0;
1757 /* Returns the value of an escape sequence, truncated to the correct
1758 target precision. PSTR points to the input pointer, which is just
1759 after the backslash. LIMIT is how much text we have. WIDE is true
1760 if the escape sequence is part of a wide character constant or
1761 string literal. Handles all relevant diagnostics. */
1762 cppchar_t
1763 cpp_parse_escape (pfile, pstr, limit, wide)
1764 cpp_reader *pfile;
1765 const unsigned char **pstr;
1766 const unsigned char *limit;
1767 int wide;
1769 int unknown = 0;
1770 const unsigned char *str = *pstr;
1771 cppchar_t c, mask;
1772 unsigned int width;
1774 if (wide)
1775 width = CPP_OPTION (pfile, wchar_precision);
1776 else
1777 width = CPP_OPTION (pfile, char_precision);
1778 if (width < BITS_PER_CPPCHAR_T)
1779 mask = ((cppchar_t) 1 << width) - 1;
1780 else
1781 mask = ~0;
1783 c = *str++;
1784 switch (c)
1786 case '\\': case '\'': case '"': case '?': break;
1787 case 'b': c = TARGET_BS; break;
1788 case 'f': c = TARGET_FF; break;
1789 case 'n': c = TARGET_NEWLINE; break;
1790 case 'r': c = TARGET_CR; break;
1791 case 't': c = TARGET_TAB; break;
1792 case 'v': c = TARGET_VT; break;
1794 case '(': case '{': case '[': case '%':
1795 /* '\(', etc, are used at beginning of line to avoid confusing Emacs.
1796 '\%' is used to prevent SCCS from getting confused. */
1797 unknown = CPP_PEDANTIC (pfile);
1798 break;
1800 case 'a':
1801 if (CPP_WTRADITIONAL (pfile))
1802 cpp_error (pfile, DL_WARNING,
1803 "the meaning of '\\a' is different in traditional C");
1804 c = TARGET_BELL;
1805 break;
1807 case 'e': case 'E':
1808 if (CPP_PEDANTIC (pfile))
1809 cpp_error (pfile, DL_PEDWARN,
1810 "non-ISO-standard escape sequence, '\\%c'", (int) c);
1811 c = TARGET_ESC;
1812 break;
1814 case 'u': case 'U':
1815 unknown = maybe_read_ucs (pfile, &str, limit, &c);
1816 break;
1818 case 'x':
1819 if (CPP_WTRADITIONAL (pfile))
1820 cpp_error (pfile, DL_WARNING,
1821 "the meaning of '\\x' is different in traditional C");
1824 cppchar_t i = 0, overflow = 0;
1825 int digits_found = 0;
1827 while (str < limit)
1829 c = *str;
1830 if (! ISXDIGIT (c))
1831 break;
1832 str++;
1833 overflow |= i ^ (i << 4 >> 4);
1834 i = (i << 4) + hex_digit_value (c);
1835 digits_found = 1;
1838 if (!digits_found)
1839 cpp_error (pfile, DL_ERROR,
1840 "\\x used with no following hex digits");
1842 if (overflow | (i != (i & mask)))
1844 cpp_error (pfile, DL_PEDWARN,
1845 "hex escape sequence out of range");
1846 i &= mask;
1848 c = i;
1850 break;
1852 case '0': case '1': case '2': case '3':
1853 case '4': case '5': case '6': case '7':
1855 size_t count = 0;
1856 cppchar_t i = c - '0';
1858 while (str < limit && ++count < 3)
1860 c = *str;
1861 if (c < '0' || c > '7')
1862 break;
1863 str++;
1864 i = (i << 3) + c - '0';
1867 if (i != (i & mask))
1869 cpp_error (pfile, DL_PEDWARN,
1870 "octal escape sequence out of range");
1871 i &= mask;
1873 c = i;
1875 break;
1877 default:
1878 unknown = 1;
1879 break;
1882 if (unknown)
1884 if (ISGRAPH (c))
1885 cpp_error (pfile, DL_PEDWARN,
1886 "unknown escape sequence '\\%c'", (int) c);
1887 else
1888 cpp_error (pfile, DL_PEDWARN,
1889 "unknown escape sequence: '\\%03o'", (int) c);
1892 if (c > mask)
1894 cpp_error (pfile, DL_PEDWARN, "escape sequence out of range for its type");
1895 c &= mask;
1898 *pstr = str;
1899 return c;
1902 /* Interpret a (possibly wide) character constant in TOKEN.
1903 WARN_MULTI warns about multi-character charconsts. PCHARS_SEEN
1904 points to a variable that is filled in with the number of
1905 characters seen, and UNSIGNEDP to a variable that indicates whether
1906 the result has signed type. */
1907 cppchar_t
1908 cpp_interpret_charconst (pfile, token, pchars_seen, unsignedp)
1909 cpp_reader *pfile;
1910 const cpp_token *token;
1911 unsigned int *pchars_seen;
1912 int *unsignedp;
1914 const unsigned char *str = token->val.str.text;
1915 const unsigned char *limit = str + token->val.str.len;
1916 unsigned int chars_seen = 0;
1917 size_t width, max_chars;
1918 cppchar_t c, mask, result = 0;
1919 bool unsigned_p;
1921 #ifdef MULTIBYTE_CHARS
1922 (void) local_mbtowc (NULL, NULL, 0);
1923 #endif
1925 /* Width in bits. */
1926 if (token->type == CPP_CHAR)
1928 width = CPP_OPTION (pfile, char_precision);
1929 max_chars = CPP_OPTION (pfile, int_precision) / width;
1930 unsigned_p = CPP_OPTION (pfile, unsigned_char);
1932 else
1934 width = CPP_OPTION (pfile, wchar_precision);
1935 max_chars = 1;
1936 unsigned_p = CPP_OPTION (pfile, unsigned_wchar);
1939 if (width < BITS_PER_CPPCHAR_T)
1940 mask = ((cppchar_t) 1 << width) - 1;
1941 else
1942 mask = ~0;
1944 while (str < limit)
1946 #ifdef MULTIBYTE_CHARS
1947 wchar_t wc;
1948 int char_len;
1950 char_len = local_mbtowc (&wc, (const char *)str, limit - str);
1951 if (char_len == -1)
1953 cpp_error (pfile, DL_WARNING,
1954 "ignoring invalid multibyte character");
1955 c = *str++;
1957 else
1959 str += char_len;
1960 c = wc;
1962 #else
1963 c = *str++;
1964 #endif
1966 if (c == '\\')
1967 c = cpp_parse_escape (pfile, &str, limit, token->type == CPP_WCHAR);
1969 #ifdef MAP_CHARACTER
1970 if (ISPRINT (c))
1971 c = MAP_CHARACTER (c);
1972 #endif
1974 chars_seen++;
1976 /* Truncate the character, scale the result and merge the two. */
1977 c &= mask;
1978 if (width < BITS_PER_CPPCHAR_T)
1979 result = (result << width) | c;
1980 else
1981 result = c;
1984 if (chars_seen == 0)
1985 cpp_error (pfile, DL_ERROR, "empty character constant");
1986 else if (chars_seen > 1)
1988 /* Multichar charconsts are of type int and therefore signed. */
1989 unsigned_p = 0;
1991 if (chars_seen > max_chars)
1993 chars_seen = max_chars;
1994 cpp_error (pfile, DL_WARNING,
1995 "character constant too long for its type");
1997 else if (CPP_OPTION (pfile, warn_multichar))
1998 cpp_error (pfile, DL_WARNING, "multi-character character constant");
2001 /* Sign-extend or truncate the constant to cppchar_t. The value is
2002 in WIDTH bits, but for multi-char charconsts it's value is the
2003 full target type's width. */
2004 if (chars_seen > 1)
2005 width *= max_chars;
2006 if (width < BITS_PER_CPPCHAR_T)
2008 mask = ((cppchar_t) 1 << width) - 1;
2009 if (unsigned_p || !(result & (1 << (width - 1))))
2010 result &= mask;
2011 else
2012 result |= ~mask;
2015 *pchars_seen = chars_seen;
2016 *unsignedp = unsigned_p;
2017 return result;
/* Memory buffers.  Changing these three constants can have a dramatic
   effect on performance.  The values here are reasonable defaults,
   but might be tuned.  If you adjust them, be sure to test across a
   range of uses of cpplib, including heavy nested function-like macro
   expansion.  Also check the change in peak memory usage (NJAMD is a
   good tool for this).

   MIN_BUFF_SIZE is the smallest buffer ever created.
   BUFF_SIZE_UPPER_BOUND (MIN_SIZE) is the largest free buffer that is
   still handed out for a request of MIN_SIZE bytes.
   EXTENDED_BUFF_SIZE (BUFF, MIN_EXTRA) is the size requested when
   extending BUFF: MIN_EXTRA plus twice the distance from BUFF's CUR
   to its LIMIT.  Every macro argument is parenthesized so that any
   expression can be passed safely.  */
#define MIN_BUFF_SIZE 8000
#define BUFF_SIZE_UPPER_BOUND(MIN_SIZE) (MIN_BUFF_SIZE + (MIN_SIZE) * 3 / 2)
#define EXTENDED_BUFF_SIZE(BUFF, MIN_EXTRA) \
        ((MIN_EXTRA) + ((BUFF)->limit - (BUFF)->cur) * 2)

#if MIN_BUFF_SIZE > BUFF_SIZE_UPPER_BOUND (0)
#error BUFF_SIZE_UPPER_BOUND must be at least as large as MIN_BUFF_SIZE!
#endif
2035 /* Create a new allocation buffer. Place the control block at the end
2036 of the buffer, so that buffer overflows will cause immediate chaos. */
2037 static _cpp_buff *
2038 new_buff (len)
2039 size_t len;
2041 _cpp_buff *result;
2042 unsigned char *base;
2044 if (len < MIN_BUFF_SIZE)
2045 len = MIN_BUFF_SIZE;
2046 len = CPP_ALIGN (len);
2048 base = xmalloc (len + sizeof (_cpp_buff));
2049 result = (_cpp_buff *) (base + len);
2050 result->base = base;
2051 result->cur = base;
2052 result->limit = base + len;
2053 result->next = NULL;
2054 return result;
2057 /* Place a chain of unwanted allocation buffers on the free list. */
2058 void
2059 _cpp_release_buff (pfile, buff)
2060 cpp_reader *pfile;
2061 _cpp_buff *buff;
2063 _cpp_buff *end = buff;
2065 while (end->next)
2066 end = end->next;
2067 end->next = pfile->free_buffs;
2068 pfile->free_buffs = buff;
2071 /* Return a free buffer of size at least MIN_SIZE. */
2072 _cpp_buff *
2073 _cpp_get_buff (pfile, min_size)
2074 cpp_reader *pfile;
2075 size_t min_size;
2077 _cpp_buff *result, **p;
2079 for (p = &pfile->free_buffs;; p = &(*p)->next)
2081 size_t size;
2083 if (*p == NULL)
2084 return new_buff (min_size);
2085 result = *p;
2086 size = result->limit - result->base;
2087 /* Return a buffer that's big enough, but don't waste one that's
2088 way too big. */
2089 if (size >= min_size && size <= BUFF_SIZE_UPPER_BOUND (min_size))
2090 break;
2093 *p = result->next;
2094 result->next = NULL;
2095 result->cur = result->base;
2096 return result;
2099 /* Creates a new buffer with enough space to hold the uncommitted
2100 remaining bytes of BUFF, and at least MIN_EXTRA more bytes. Copies
2101 the excess bytes to the new buffer. Chains the new buffer after
2102 BUFF, and returns the new buffer. */
2103 _cpp_buff *
2104 _cpp_append_extend_buff (pfile, buff, min_extra)
2105 cpp_reader *pfile;
2106 _cpp_buff *buff;
2107 size_t min_extra;
2109 size_t size = EXTENDED_BUFF_SIZE (buff, min_extra);
2110 _cpp_buff *new_buff = _cpp_get_buff (pfile, size);
2112 buff->next = new_buff;
2113 memcpy (new_buff->base, buff->cur, BUFF_ROOM (buff));
2114 return new_buff;
2117 /* Creates a new buffer with enough space to hold the uncommitted
2118 remaining bytes of the buffer pointed to by BUFF, and at least
2119 MIN_EXTRA more bytes. Copies the excess bytes to the new buffer.
2120 Chains the new buffer before the buffer pointed to by BUFF, and
2121 updates the pointer to point to the new buffer. */
2122 void
2123 _cpp_extend_buff (pfile, pbuff, min_extra)
2124 cpp_reader *pfile;
2125 _cpp_buff **pbuff;
2126 size_t min_extra;
2128 _cpp_buff *new_buff, *old_buff = *pbuff;
2129 size_t size = EXTENDED_BUFF_SIZE (old_buff, min_extra);
2131 new_buff = _cpp_get_buff (pfile, size);
2132 memcpy (new_buff->base, old_buff->cur, BUFF_ROOM (old_buff));
2133 new_buff->next = old_buff;
2134 *pbuff = new_buff;
2137 /* Free a chain of buffers starting at BUFF. */
2138 void
2139 _cpp_free_buff (buff)
2140 _cpp_buff *buff;
2142 _cpp_buff *next;
2144 for (; buff; buff = next)
2146 next = buff->next;
2147 free (buff->base);
2151 /* Allocate permanent, unaligned storage of length LEN. */
2152 unsigned char *
2153 _cpp_unaligned_alloc (pfile, len)
2154 cpp_reader *pfile;
2155 size_t len;
2157 _cpp_buff *buff = pfile->u_buff;
2158 unsigned char *result = buff->cur;
2160 if (len > (size_t) (buff->limit - result))
2162 buff = _cpp_get_buff (pfile, len);
2163 buff->next = pfile->u_buff;
2164 pfile->u_buff = buff;
2165 result = buff->cur;
2168 buff->cur = result + len;
2169 return result;
2172 /* Allocate permanent, unaligned storage of length LEN from a_buff.
2173 That buffer is used for growing allocations when saving macro
2174 replacement lists in a #define, and when parsing an answer to an
2175 assertion in #assert, #unassert or #if (and therefore possibly
2176 whilst expanding macros). It therefore must not be used by any
2177 code that they might call: specifically the lexer and the guts of
2178 the macro expander.
2180 All existing other uses clearly fit this restriction: storing
2181 registered pragmas during initialization. */
2182 unsigned char *
2183 _cpp_aligned_alloc (pfile, len)
2184 cpp_reader *pfile;
2185 size_t len;
2187 _cpp_buff *buff = pfile->a_buff;
2188 unsigned char *result = buff->cur;
2190 if (len > (size_t) (buff->limit - result))
2192 buff = _cpp_get_buff (pfile, len);
2193 buff->next = pfile->a_buff;
2194 pfile->a_buff = buff;
2195 result = buff->cur;
2198 buff->cur = result + len;
2199 return result;