2002-06-05 Eric Christopher <echristo@redhat.com>
[official-gcc.git] / gcc / cpplex.c
blob2baa3e00a0260a66d18df6badaa3fda8496b6a2c
1 /* CPP Library - lexical analysis.
2 Copyright (C) 2000, 2001, 2002 Free Software Foundation, Inc.
3 Contributed by Per Bothner, 1994-95.
4 Based on CCCP program by Paul Rubin, June 1986
5 Adapted to ANSI C, Richard Stallman, Jan 1987
6 Broken out to separate file, Zack Weinberg, Mar 2000
7 Single-pass line tokenization by Neil Booth, April 2000
9 This program is free software; you can redistribute it and/or modify it
10 under the terms of the GNU General Public License as published by the
11 Free Software Foundation; either version 2, or (at your option) any
12 later version.
14 This program is distributed in the hope that it will be useful,
15 but WITHOUT ANY WARRANTY; without even the implied warranty of
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 GNU General Public License for more details.
19 You should have received a copy of the GNU General Public License
20 along with this program; if not, write to the Free Software
21 Foundation, 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
23 #include "config.h"
24 #include "system.h"
25 #include "cpplib.h"
26 #include "cpphash.h"
28 #ifdef MULTIBYTE_CHARS
29 #include "mbchar.h"
30 #include <locale.h>
31 #endif
/* Spelling categories recorded in the token_spellings table.  Tokens
   with SPELL_STRING store their spelling in the token list, and its
   length in the token->val.name.len.  */
enum spell_type
{
  SPELL_OPERATOR = 0,
  SPELL_CHAR,
  SPELL_IDENT,
  SPELL_NUMBER,
  SPELL_STRING,
  SPELL_NONE
};
45 struct token_spelling
47 enum spell_type category;
48 const unsigned char *name;
/* Spellings of the digraphs.  The order of the entries is
   significant and must not be changed.  */
static const unsigned char *const digraph_spellings[] =
{
  U"%:",
  U"%:%:",
  U"<:",
  U":>",
  U"<%",
  U"%>"
};
54 #define OP(e, s) { SPELL_OPERATOR, U s },
55 #define TK(e, s) { s, U STRINGX (e) },
56 static const struct token_spelling token_spellings[N_TTYPES] = { TTYPE_TABLE };
57 #undef OP
58 #undef TK
60 #define TOKEN_SPELL(token) (token_spellings[(token)->type].category)
61 #define TOKEN_NAME(token) (token_spellings[(token)->type].name)
62 #define BACKUP() do {buffer->cur = buffer->backup_to;} while (0)
64 static void handle_newline PARAMS ((cpp_reader *));
65 static cppchar_t skip_escaped_newlines PARAMS ((cpp_reader *));
66 static cppchar_t get_effective_char PARAMS ((cpp_reader *));
68 static int skip_block_comment PARAMS ((cpp_reader *));
69 static int skip_line_comment PARAMS ((cpp_reader *));
70 static void adjust_column PARAMS ((cpp_reader *));
71 static int skip_whitespace PARAMS ((cpp_reader *, cppchar_t));
72 static cpp_hashnode *parse_identifier PARAMS ((cpp_reader *));
73 static uchar *parse_slow PARAMS ((cpp_reader *, const uchar *, int,
74 unsigned int *));
75 static void parse_number PARAMS ((cpp_reader *, cpp_string *, int));
76 static int unescaped_terminator_p PARAMS ((cpp_reader *, const uchar *));
77 static void parse_string PARAMS ((cpp_reader *, cpp_token *, cppchar_t));
78 static bool trigraph_p PARAMS ((cpp_reader *));
79 static void save_comment PARAMS ((cpp_reader *, cpp_token *, const uchar *,
80 cppchar_t));
81 static bool continue_after_nul PARAMS ((cpp_reader *));
82 static int name_p PARAMS ((cpp_reader *, const cpp_string *));
83 static int maybe_read_ucs PARAMS ((cpp_reader *, const unsigned char **,
84 const unsigned char *, cppchar_t *));
85 static tokenrun *next_tokenrun PARAMS ((tokenrun *));
87 static unsigned int hex_digit_value PARAMS ((unsigned int));
88 static _cpp_buff *new_buff PARAMS ((size_t));
90 /* Utility routine:
92 Compares, the token TOKEN to the NUL-terminated string STRING.
93 TOKEN must be a CPP_NAME. Returns 1 for equal, 0 for unequal. */
94 int
95 cpp_ideq (token, string)
96 const cpp_token *token;
97 const char *string;
99 if (token->type != CPP_NAME)
100 return 0;
102 return !ustrcmp (NODE_NAME (token->val.node), (const uchar *) string);
105 /* Call when meeting a newline, assumed to be in buffer->cur[-1].
106 Returns with buffer->cur pointing to the character immediately
107 following the newline (combination). */
108 static void
109 handle_newline (pfile)
110 cpp_reader *pfile;
112 cpp_buffer *buffer = pfile->buffer;
114 /* Handle CR-LF and LF-CR. Most other implementations (e.g. java)
115 only accept CR-LF; maybe we should fall back to that behaviour? */
116 if (buffer->cur[-1] + buffer->cur[0] == '\r' + '\n')
117 buffer->cur++;
119 buffer->line_base = buffer->cur;
120 buffer->col_adjust = 0;
121 pfile->line++;
124 /* Subroutine of skip_escaped_newlines; called when a 3-character
125 sequence beginning with "??" is encountered. buffer->cur points to
126 the second '?'.
128 Warn if necessary, and returns true if the sequence forms a
129 trigraph and the trigraph should be honoured. */
130 static bool
131 trigraph_p (pfile)
132 cpp_reader *pfile;
134 cpp_buffer *buffer = pfile->buffer;
135 cppchar_t from_char = buffer->cur[1];
136 bool accept;
138 if (!_cpp_trigraph_map[from_char])
139 return false;
141 accept = CPP_OPTION (pfile, trigraphs);
143 /* Don't warn about trigraphs in comments. */
144 if (CPP_OPTION (pfile, warn_trigraphs) && !pfile->state.lexing_comment)
146 if (accept)
147 cpp_error_with_line (pfile, DL_WARNING,
148 pfile->line, CPP_BUF_COL (buffer) - 1,
149 "trigraph ??%c converted to %c",
150 (int) from_char,
151 (int) _cpp_trigraph_map[from_char]);
152 else if (buffer->cur != buffer->last_Wtrigraphs)
154 buffer->last_Wtrigraphs = buffer->cur;
155 cpp_error_with_line (pfile, DL_WARNING,
156 pfile->line, CPP_BUF_COL (buffer) - 1,
157 "trigraph ??%c ignored", (int) from_char);
161 return accept;
164 /* Skips any escaped newlines introduced by '?' or a '\\', assumed to
165 lie in buffer->cur[-1]. Returns the next byte, which will be in
166 buffer->cur[-1]. This routine performs preprocessing stages 1 and
167 2 of the ISO C standard. */
168 static cppchar_t
169 skip_escaped_newlines (pfile)
170 cpp_reader *pfile;
172 cpp_buffer *buffer = pfile->buffer;
173 cppchar_t next = buffer->cur[-1];
175 /* Only do this if we apply stages 1 and 2. */
176 if (!buffer->from_stage3)
178 const unsigned char *saved_cur;
179 cppchar_t next1;
183 if (next == '?')
185 if (buffer->cur[0] != '?' || !trigraph_p (pfile))
186 break;
188 /* Translate the trigraph. */
189 next = _cpp_trigraph_map[buffer->cur[1]];
190 buffer->cur += 2;
191 if (next != '\\')
192 break;
195 if (buffer->cur == buffer->rlimit)
196 break;
198 /* We have a backslash, and room for at least one more
199 character. Skip horizontal whitespace. */
200 saved_cur = buffer->cur;
202 next1 = *buffer->cur++;
203 while (is_nvspace (next1) && buffer->cur < buffer->rlimit);
205 if (!is_vspace (next1))
207 buffer->cur = saved_cur;
208 break;
211 if (saved_cur != buffer->cur - 1
212 && !pfile->state.lexing_comment)
213 cpp_error (pfile, DL_WARNING,
214 "backslash and newline separated by space");
216 handle_newline (pfile);
217 buffer->backup_to = buffer->cur;
218 if (buffer->cur == buffer->rlimit)
220 cpp_error (pfile, DL_PEDWARN,
221 "backslash-newline at end of file");
222 next = EOF;
224 else
225 next = *buffer->cur++;
227 while (next == '\\' || next == '?');
230 return next;
233 /* Obtain the next character, after trigraph conversion and skipping
234 an arbitrarily long string of escaped newlines. The common case of
235 no trigraphs or escaped newlines falls through quickly. On return,
236 buffer->backup_to points to where to return to if the character is
237 not to be processed. */
238 static cppchar_t
239 get_effective_char (pfile)
240 cpp_reader *pfile;
242 cppchar_t next;
243 cpp_buffer *buffer = pfile->buffer;
245 buffer->backup_to = buffer->cur;
246 next = *buffer->cur++;
247 if (__builtin_expect (next == '?' || next == '\\', 0))
248 next = skip_escaped_newlines (pfile);
250 return next;
253 /* Skip a C-style block comment. We find the end of the comment by
254 seeing if an asterisk is before every '/' we encounter. Returns
255 non-zero if comment terminated by EOF, zero otherwise. */
256 static int
257 skip_block_comment (pfile)
258 cpp_reader *pfile;
260 cpp_buffer *buffer = pfile->buffer;
261 cppchar_t c = EOF, prevc = EOF;
263 pfile->state.lexing_comment = 1;
264 while (buffer->cur != buffer->rlimit)
266 prevc = c, c = *buffer->cur++;
268 /* FIXME: For speed, create a new character class of characters
269 of interest inside block comments. */
270 if (c == '?' || c == '\\')
271 c = skip_escaped_newlines (pfile);
273 /* People like decorating comments with '*', so check for '/'
274 instead for efficiency. */
275 if (c == '/')
277 if (prevc == '*')
278 break;
280 /* Warn about potential nested comments, but not if the '/'
281 comes immediately before the true comment delimiter.
282 Don't bother to get it right across escaped newlines. */
283 if (CPP_OPTION (pfile, warn_comments)
284 && buffer->cur[0] == '*' && buffer->cur[1] != '/')
285 cpp_error_with_line (pfile, DL_WARNING,
286 pfile->line, CPP_BUF_COL (buffer),
287 "\"/*\" within comment");
289 else if (is_vspace (c))
290 handle_newline (pfile);
291 else if (c == '\t')
292 adjust_column (pfile);
295 pfile->state.lexing_comment = 0;
296 return c != '/' || prevc != '*';
299 /* Skip a C++ line comment, leaving buffer->cur pointing to the
300 terminating newline. Handles escaped newlines. Returns non-zero
301 if a multiline comment. */
302 static int
303 skip_line_comment (pfile)
304 cpp_reader *pfile;
306 cpp_buffer *buffer = pfile->buffer;
307 unsigned int orig_line = pfile->line;
308 cppchar_t c;
309 #ifdef MULTIBYTE_CHARS
310 wchar_t wc;
311 int char_len;
312 #endif
314 pfile->state.lexing_comment = 1;
315 #ifdef MULTIBYTE_CHARS
316 /* Reset multibyte conversion state. */
317 (void) local_mbtowc (NULL, NULL, 0);
318 #endif
321 if (buffer->cur == buffer->rlimit)
322 goto at_eof;
324 #ifdef MULTIBYTE_CHARS
325 char_len = local_mbtowc (&wc, (const char *) buffer->cur,
326 buffer->rlimit - buffer->cur);
327 if (char_len == -1)
329 cpp_error (pfile, DL_WARNING,
330 "ignoring invalid multibyte character");
331 char_len = 1;
332 c = *buffer->cur++;
334 else
336 buffer->cur += char_len;
337 c = wc;
339 #else
340 c = *buffer->cur++;
341 #endif
342 if (c == '?' || c == '\\')
343 c = skip_escaped_newlines (pfile);
345 while (!is_vspace (c));
347 /* Step back over the newline, except at EOF. */
348 buffer->cur--;
349 at_eof:
351 pfile->state.lexing_comment = 0;
352 return orig_line != pfile->line;
355 /* pfile->buffer->cur is one beyond the \t character. Update
356 col_adjust so we track the column correctly. */
357 static void
358 adjust_column (pfile)
359 cpp_reader *pfile;
361 cpp_buffer *buffer = pfile->buffer;
362 unsigned int col = CPP_BUF_COL (buffer) - 1; /* Zero-based column. */
364 /* Round it up to multiple of the tabstop, but subtract 1 since the
365 tab itself occupies a character position. */
366 buffer->col_adjust += (CPP_OPTION (pfile, tabstop)
367 - col % CPP_OPTION (pfile, tabstop)) - 1;
370 /* Skips whitespace, saving the next non-whitespace character.
371 Adjusts pfile->col_adjust to account for tabs. Without this,
372 tokens might be assigned an incorrect column. */
373 static int
374 skip_whitespace (pfile, c)
375 cpp_reader *pfile;
376 cppchar_t c;
378 cpp_buffer *buffer = pfile->buffer;
379 unsigned int warned = 0;
383 /* Horizontal space always OK. */
384 if (c == ' ')
386 else if (c == '\t')
387 adjust_column (pfile);
388 /* Just \f \v or \0 left. */
389 else if (c == '\0')
391 if (buffer->cur - 1 == buffer->rlimit)
392 return 0;
393 if (!warned)
395 cpp_error (pfile, DL_WARNING, "null character(s) ignored");
396 warned = 1;
399 else if (pfile->state.in_directive && CPP_PEDANTIC (pfile))
400 cpp_error_with_line (pfile, DL_PEDWARN, pfile->line,
401 CPP_BUF_COL (buffer),
402 "%s in preprocessing directive",
403 c == '\f' ? "form feed" : "vertical tab");
405 c = *buffer->cur++;
407 /* We only want non-vertical space, i.e. ' ' \t \f \v \0. */
408 while (is_nvspace (c));
410 buffer->cur--;
411 return 1;
414 /* See if the characters of a number token are valid in a name (no
415 '.', '+' or '-'). */
416 static int
417 name_p (pfile, string)
418 cpp_reader *pfile;
419 const cpp_string *string;
421 unsigned int i;
423 for (i = 0; i < string->len; i++)
424 if (!is_idchar (string->text[i]))
425 return 0;
427 return 1;
430 /* Parse an identifier, skipping embedded backslash-newlines. This is
431 a critical inner loop. The common case is an identifier which has
432 not been split by backslash-newline, does not contain a dollar
433 sign, and has already been scanned (roughly 10:1 ratio of
434 seen:unseen identifiers in normal code; the distribution is
435 Poisson-like). Second most common case is a new identifier, not
436 split and no dollar sign. The other possibilities are rare and
437 have been relegated to parse_slow. */
438 static cpp_hashnode *
439 parse_identifier (pfile)
440 cpp_reader *pfile;
442 cpp_hashnode *result;
443 const uchar *cur, *base;
445 /* Fast-path loop. Skim over a normal identifier.
446 N.B. ISIDNUM does not include $. */
447 cur = pfile->buffer->cur;
448 while (ISIDNUM (*cur))
449 cur++;
451 /* Check for slow-path cases. */
452 if (*cur == '?' || *cur == '\\' || *cur == '$')
454 unsigned int len;
456 base = parse_slow (pfile, cur, 0, &len);
457 result = (cpp_hashnode *)
458 ht_lookup (pfile->hash_table, base, len, HT_ALLOCED);
460 else
462 base = pfile->buffer->cur - 1;
463 pfile->buffer->cur = cur;
464 result = (cpp_hashnode *)
465 ht_lookup (pfile->hash_table, base, cur - base, HT_ALLOC);
468 /* Rarely, identifiers require diagnostics when lexed.
469 XXX Has to be forced out of the fast path. */
470 if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
471 && !pfile->state.skipping, 0))
473 /* It is allowed to poison the same identifier twice. */
474 if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
475 cpp_error (pfile, DL_ERROR, "attempt to use poisoned \"%s\"",
476 NODE_NAME (result));
478 /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
479 replacement list of a variadic macro. */
480 if (result == pfile->spec_nodes.n__VA_ARGS__
481 && !pfile->state.va_args_ok)
482 cpp_error (pfile, DL_PEDWARN,
483 "__VA_ARGS__ can only appear in the expansion of a C99 variadic macro");
486 return result;
489 /* Slow path. This handles numbers and identifiers which have been
490 split, or contain dollar signs. The part of the token from
491 PFILE->buffer->cur-1 to CUR has already been scanned. NUMBER_P is
492 1 if it's a number, and 2 if it has a leading period. Returns a
493 pointer to the token's NUL-terminated spelling in permanent
494 storage, and sets PLEN to its length. */
495 static uchar *
496 parse_slow (pfile, cur, number_p, plen)
497 cpp_reader *pfile;
498 const uchar *cur;
499 int number_p;
500 unsigned int *plen;
502 cpp_buffer *buffer = pfile->buffer;
503 const uchar *base = buffer->cur - 1;
504 struct obstack *stack = &pfile->hash_table->stack;
505 unsigned int c, prevc, saw_dollar = 0;
507 /* Place any leading period. */
508 if (number_p == 2)
509 obstack_1grow (stack, '.');
511 /* Copy the part of the token which is known to be okay. */
512 obstack_grow (stack, base, cur - base);
514 /* Now process the part which isn't. We are looking at one of
515 '$', '\\', or '?' on entry to this loop. */
516 prevc = cur[-1];
517 c = *cur++;
518 buffer->cur = cur;
519 for (;;)
521 /* Potential escaped newline? */
522 buffer->backup_to = buffer->cur - 1;
523 if (c == '?' || c == '\\')
524 c = skip_escaped_newlines (pfile);
526 if (!is_idchar (c))
528 if (!number_p)
529 break;
530 if (c != '.' && !VALID_SIGN (c, prevc))
531 break;
534 /* Handle normal identifier characters in this loop. */
537 prevc = c;
538 obstack_1grow (stack, c);
540 if (c == '$')
541 saw_dollar++;
543 c = *buffer->cur++;
545 while (is_idchar (c));
548 /* Step back over the unwanted char. */
549 BACKUP ();
551 /* $ is not an identifier character in the standard, but is commonly
552 accepted as an extension. Don't warn about it in skipped
553 conditional blocks. */
554 if (saw_dollar && CPP_PEDANTIC (pfile) && ! pfile->state.skipping)
555 cpp_error (pfile, DL_PEDWARN, "'$' character(s) in identifier or number");
557 /* Identifiers and numbers are null-terminated. */
558 *plen = obstack_object_size (stack);
559 obstack_1grow (stack, '\0');
560 return obstack_finish (stack);
563 /* Parse a number, beginning with character C, skipping embedded
564 backslash-newlines. LEADING_PERIOD is non-zero if there was a "."
565 before C. Place the result in NUMBER. */
566 static void
567 parse_number (pfile, number, leading_period)
568 cpp_reader *pfile;
569 cpp_string *number;
570 int leading_period;
572 const uchar *cur;
574 /* Fast-path loop. Skim over a normal number.
575 N.B. ISIDNUM does not include $. */
576 cur = pfile->buffer->cur;
577 while (ISIDNUM (*cur) || *cur == '.' || VALID_SIGN (*cur, cur[-1]))
578 cur++;
580 /* Check for slow-path cases. */
581 if (*cur == '?' || *cur == '\\' || *cur == '$')
582 number->text = parse_slow (pfile, cur, 1 + leading_period, &number->len);
583 else
585 const uchar *base = pfile->buffer->cur - 1;
586 uchar *dest;
588 number->len = cur - base + leading_period;
589 dest = _cpp_unaligned_alloc (pfile, number->len + 1);
590 dest[number->len] = '\0';
591 number->text = dest;
593 if (leading_period)
594 *dest++ = '.';
595 memcpy (dest, base, cur - base);
596 pfile->buffer->cur = cur;
600 /* Subroutine of parse_string. */
601 static int
602 unescaped_terminator_p (pfile, dest)
603 cpp_reader *pfile;
604 const unsigned char *dest;
606 const unsigned char *start, *temp;
608 /* In #include-style directives, terminators are not escapeable. */
609 if (pfile->state.angled_headers)
610 return 1;
612 start = BUFF_FRONT (pfile->u_buff);
614 /* An odd number of consecutive backslashes represents an escaped
615 terminator. */
616 for (temp = dest; temp > start && temp[-1] == '\\'; temp--)
619 return ((dest - temp) & 1) == 0;
622 /* Parses a string, character constant, or angle-bracketed header file
623 name. Handles embedded trigraphs and escaped newlines. The stored
624 string is guaranteed NUL-terminated, but it is not guaranteed that
625 this is the first NUL since embedded NULs are preserved.
627 When this function returns, buffer->cur points to the next
628 character to be processed. */
629 static void
630 parse_string (pfile, token, terminator)
631 cpp_reader *pfile;
632 cpp_token *token;
633 cppchar_t terminator;
635 cpp_buffer *buffer = pfile->buffer;
636 unsigned char *dest, *limit;
637 cppchar_t c;
638 bool warned_nulls = false;
639 #ifdef MULTIBYTE_CHARS
640 wchar_t wc;
641 int char_len;
642 #endif
644 dest = BUFF_FRONT (pfile->u_buff);
645 limit = BUFF_LIMIT (pfile->u_buff);
647 #ifdef MULTIBYTE_CHARS
648 /* Reset multibyte conversion state. */
649 (void) local_mbtowc (NULL, NULL, 0);
650 #endif
651 for (;;)
653 /* We need room for another char, possibly the terminating NUL. */
654 if ((size_t) (limit - dest) < 1)
656 size_t len_so_far = dest - BUFF_FRONT (pfile->u_buff);
657 _cpp_extend_buff (pfile, &pfile->u_buff, 2);
658 dest = BUFF_FRONT (pfile->u_buff) + len_so_far;
659 limit = BUFF_LIMIT (pfile->u_buff);
662 #ifdef MULTIBYTE_CHARS
663 char_len = local_mbtowc (&wc, (const char *) buffer->cur,
664 buffer->rlimit - buffer->cur);
665 if (char_len == -1)
667 cpp_error (pfile, DL_WARNING,
668 "ignoring invalid multibyte character");
669 char_len = 1;
670 c = *buffer->cur++;
672 else
674 buffer->cur += char_len;
675 c = wc;
677 #else
678 c = *buffer->cur++;
679 #endif
681 /* Handle trigraphs, escaped newlines etc. */
682 if (c == '?' || c == '\\')
683 c = skip_escaped_newlines (pfile);
685 if (c == terminator)
687 if (unescaped_terminator_p (pfile, dest))
688 break;
690 else if (is_vspace (c))
692 /* No string literal may extend over multiple lines. In
693 assembly language, suppress the error except for <>
694 includes. This is a kludge around not knowing where
695 comments are. */
696 unterminated:
697 if (CPP_OPTION (pfile, lang) != CLK_ASM || terminator == '>')
698 cpp_error (pfile, DL_ERROR, "missing terminating %c character",
699 (int) terminator);
700 buffer->cur--;
701 break;
703 else if (c == '\0')
705 if (buffer->cur - 1 == buffer->rlimit)
706 goto unterminated;
707 if (!warned_nulls)
709 warned_nulls = true;
710 cpp_error (pfile, DL_WARNING,
711 "null character(s) preserved in literal");
714 #ifdef MULTIBYTE_CHARS
715 if (char_len > 1)
717 for ( ; char_len > 0; --char_len)
718 *dest++ = (*buffer->cur - char_len);
720 else
721 #endif
722 *dest++ = c;
725 *dest = '\0';
727 token->val.str.text = BUFF_FRONT (pfile->u_buff);
728 token->val.str.len = dest - BUFF_FRONT (pfile->u_buff);
729 BUFF_FRONT (pfile->u_buff) = dest + 1;
732 /* The stored comment includes the comment start and any terminator. */
733 static void
734 save_comment (pfile, token, from, type)
735 cpp_reader *pfile;
736 cpp_token *token;
737 const unsigned char *from;
738 cppchar_t type;
740 unsigned char *buffer;
741 unsigned int len, clen;
743 len = pfile->buffer->cur - from + 1; /* + 1 for the initial '/'. */
745 /* C++ comments probably (not definitely) have moved past a new
746 line, which we don't want to save in the comment. */
747 if (is_vspace (pfile->buffer->cur[-1]))
748 len--;
750 /* If we are currently in a directive, then we need to store all
751 C++ comments as C comments internally, and so we need to
752 allocate a little extra space in that case.
754 Note that the only time we encounter a directive here is
755 when we are saving comments in a "#define". */
756 clen = (pfile->state.in_directive && type == '/') ? len + 2 : len;
758 buffer = _cpp_unaligned_alloc (pfile, clen);
760 token->type = CPP_COMMENT;
761 token->val.str.len = clen;
762 token->val.str.text = buffer;
764 buffer[0] = '/';
765 memcpy (buffer + 1, from, len - 1);
767 /* Finish conversion to a C comment, if necessary. */
768 if (pfile->state.in_directive && type == '/')
770 buffer[1] = '*';
771 buffer[clen - 2] = '*';
772 buffer[clen - 1] = '/';
776 /* Allocate COUNT tokens for RUN. */
777 void
778 _cpp_init_tokenrun (run, count)
779 tokenrun *run;
780 unsigned int count;
782 run->base = xnewvec (cpp_token, count);
783 run->limit = run->base + count;
784 run->next = NULL;
787 /* Returns the next tokenrun, or creates one if there is none. */
788 static tokenrun *
789 next_tokenrun (run)
790 tokenrun *run;
792 if (run->next == NULL)
794 run->next = xnew (tokenrun);
795 run->next->prev = run;
796 _cpp_init_tokenrun (run->next, 250);
799 return run->next;
802 /* Allocate a single token that is invalidated at the same time as the
803 rest of the tokens on the line. Has its line and col set to the
804 same as the last lexed token, so that diagnostics appear in the
805 right place. */
806 cpp_token *
807 _cpp_temp_token (pfile)
808 cpp_reader *pfile;
810 cpp_token *old, *result;
812 old = pfile->cur_token - 1;
813 if (pfile->cur_token == pfile->cur_run->limit)
815 pfile->cur_run = next_tokenrun (pfile->cur_run);
816 pfile->cur_token = pfile->cur_run->base;
819 result = pfile->cur_token++;
820 result->line = old->line;
821 result->col = old->col;
822 return result;
825 /* Lex a token into RESULT (external interface). Takes care of issues
826 like directive handling, token lookahead, multiple include
827 optimization and skipping. */
828 const cpp_token *
829 _cpp_lex_token (pfile)
830 cpp_reader *pfile;
832 cpp_token *result;
834 for (;;)
836 if (pfile->cur_token == pfile->cur_run->limit)
838 pfile->cur_run = next_tokenrun (pfile->cur_run);
839 pfile->cur_token = pfile->cur_run->base;
842 if (pfile->lookaheads)
844 pfile->lookaheads--;
845 result = pfile->cur_token++;
847 else
848 result = _cpp_lex_direct (pfile);
850 if (result->flags & BOL)
852 /* Is this a directive. If _cpp_handle_directive returns
853 false, it is an assembler #. */
854 if (result->type == CPP_HASH
855 /* 6.10.3 p 11: Directives in a list of macro arguments
856 gives undefined behavior. This implementation
857 handles the directive as normal. */
858 && pfile->state.parsing_args != 1
859 && _cpp_handle_directive (pfile, result->flags & PREV_WHITE))
860 continue;
861 if (pfile->cb.line_change && !pfile->state.skipping)
862 (*pfile->cb.line_change)(pfile, result, pfile->state.parsing_args);
865 /* We don't skip tokens in directives. */
866 if (pfile->state.in_directive)
867 break;
869 /* Outside a directive, invalidate controlling macros. At file
870 EOF, _cpp_lex_direct takes care of popping the buffer, so we never
871 get here and MI optimisation works. */
872 pfile->mi_valid = false;
874 if (!pfile->state.skipping || result->type == CPP_EOF)
875 break;
878 return result;
881 /* A NUL terminates the current buffer. For ISO preprocessing this is
882 EOF, but for traditional preprocessing it indicates we need a line
883 refill. Returns TRUE to continue preprocessing a new buffer, FALSE
884 to return a CPP_EOF to the caller. */
885 static bool
886 continue_after_nul (pfile)
887 cpp_reader *pfile;
889 cpp_buffer *buffer = pfile->buffer;
890 bool more = false;
892 buffer->saved_flags = BOL;
893 if (CPP_OPTION (pfile, traditional))
894 more = _cpp_read_logical_line_trad (pfile);
895 else
897 /* Stop parsing arguments with a CPP_EOF. When we finally come
898 back here, do the work of popping the buffer. */
899 if (!pfile->state.parsing_args)
901 if (buffer->cur != buffer->line_base)
903 /* Non-empty files should end in a newline. Don't warn
904 for command line and _Pragma buffers. */
905 if (!buffer->from_stage3)
906 cpp_error (pfile, DL_PEDWARN, "no newline at end of file");
907 handle_newline (pfile);
910 /* Similarly, finish an in-progress directive with CPP_EOF
911 before popping the buffer. */
912 if (!pfile->state.in_directive && buffer->prev)
914 more = !buffer->return_at_eof;
915 _cpp_pop_buffer (pfile);
920 return more;
/* Helper for two-character operators: read the next effective
   character; if it is CHAR give the token THEN_TYPE, otherwise
   un-read it and give the token ELSE_TYPE.  Uses PFILE, RESULT and
   (through BACKUP) BUFFER from the enclosing scope.  */
#define IF_NEXT_IS(CHAR, THEN_TYPE, ELSE_TYPE)  \
  do {                                          \
    if (get_effective_char (pfile) != CHAR)     \
      {                                         \
        BACKUP ();                              \
        result->type = ELSE_TYPE;               \
      }                                         \
    else                                        \
      result->type = THEN_TYPE;                 \
  } while (0)
934 /* Lex a token into pfile->cur_token, which is also incremented, to
935 get diagnostics pointing to the correct location.
937 Does not handle issues such as token lookahead, multiple-include
938 optimisation, directives, skipping etc. This function is only
939 suitable for use by _cpp_lex_token, and in special cases like
940 lex_expansion_token which doesn't care for any of these issues.
942 When meeting a newline, returns CPP_EOF if parsing a directive,
943 otherwise returns to the start of the token buffer if permissible.
944 Returns the location of the lexed token. */
945 cpp_token *
946 _cpp_lex_direct (pfile)
947 cpp_reader *pfile;
949 cppchar_t c;
950 cpp_buffer *buffer;
951 const unsigned char *comment_start;
952 cpp_token *result = pfile->cur_token++;
954 fresh_line:
955 buffer = pfile->buffer;
956 result->flags = buffer->saved_flags;
957 buffer->saved_flags = 0;
958 update_tokens_line:
959 result->line = pfile->line;
961 skipped_white:
962 c = *buffer->cur++;
963 result->col = CPP_BUF_COLUMN (buffer, buffer->cur);
965 trigraph:
966 switch (c)
968 case ' ': case '\t': case '\f': case '\v': case '\0':
969 result->flags |= PREV_WHITE;
970 if (skip_whitespace (pfile, c))
971 goto skipped_white;
973 /* End of buffer. */
974 buffer->cur--;
975 if (continue_after_nul (pfile))
976 goto fresh_line;
977 result->type = CPP_EOF;
978 break;
980 case '\n': case '\r':
981 handle_newline (pfile);
982 buffer->saved_flags = BOL;
983 if (! pfile->state.in_directive)
985 if (pfile->state.parsing_args == 2)
986 buffer->saved_flags |= PREV_WHITE;
987 if (!pfile->keep_tokens)
989 pfile->cur_run = &pfile->base_run;
990 result = pfile->base_run.base;
991 pfile->cur_token = result + 1;
993 goto fresh_line;
995 result->type = CPP_EOF;
996 break;
998 case '?':
999 case '\\':
1000 /* These could start an escaped newline, or '?' a trigraph. Let
1001 skip_escaped_newlines do all the work. */
1003 unsigned int line = pfile->line;
1005 c = skip_escaped_newlines (pfile);
1006 if (line != pfile->line)
1008 buffer->cur--;
1009 /* We had at least one escaped newline of some sort.
1010 Update the token's line and column. */
1011 goto update_tokens_line;
1015 /* We are either the original '?' or '\\', or a trigraph. */
1016 if (c == '?')
1017 result->type = CPP_QUERY;
1018 else if (c == '\\')
1019 goto random_char;
1020 else
1021 goto trigraph;
1022 break;
1024 case '0': case '1': case '2': case '3': case '4':
1025 case '5': case '6': case '7': case '8': case '9':
1026 result->type = CPP_NUMBER;
1027 parse_number (pfile, &result->val.str, 0);
1028 break;
1030 case 'L':
1031 /* 'L' may introduce wide characters or strings. */
1033 const unsigned char *pos = buffer->cur;
1035 c = get_effective_char (pfile);
1036 if (c == '\'' || c == '"')
1038 result->type = (c == '"' ? CPP_WSTRING: CPP_WCHAR);
1039 parse_string (pfile, result, c);
1040 break;
1042 buffer->cur = pos;
1044 /* Fall through. */
1046 start_ident:
1047 case '_':
1048 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
1049 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
1050 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
1051 case 's': case 't': case 'u': case 'v': case 'w': case 'x':
1052 case 'y': case 'z':
1053 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
1054 case 'G': case 'H': case 'I': case 'J': case 'K':
1055 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
1056 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
1057 case 'Y': case 'Z':
1058 result->type = CPP_NAME;
1059 result->val.node = parse_identifier (pfile);
1061 /* Convert named operators to their proper types. */
1062 if (result->val.node->flags & NODE_OPERATOR)
1064 result->flags |= NAMED_OP;
1065 result->type = result->val.node->value.operator;
1067 break;
1069 case '\'':
1070 case '"':
1071 result->type = c == '"' ? CPP_STRING: CPP_CHAR;
1072 parse_string (pfile, result, c);
1073 break;
1075 case '/':
1076 /* A potential block or line comment. */
1077 comment_start = buffer->cur;
1078 c = get_effective_char (pfile);
1080 if (c == '*')
1082 if (skip_block_comment (pfile))
1083 cpp_error (pfile, DL_ERROR, "unterminated comment");
1085 else if (c == '/' && (CPP_OPTION (pfile, cplusplus_comments)
1086 || CPP_IN_SYSTEM_HEADER (pfile)))
1088 /* Warn about comments only if pedantically GNUC89, and not
1089 in system headers. */
1090 if (CPP_OPTION (pfile, lang) == CLK_GNUC89 && CPP_PEDANTIC (pfile)
1091 && ! buffer->warned_cplusplus_comments)
1093 cpp_error (pfile, DL_PEDWARN,
1094 "C++ style comments are not allowed in ISO C89");
1095 cpp_error (pfile, DL_PEDWARN,
1096 "(this will be reported only once per input file)");
1097 buffer->warned_cplusplus_comments = 1;
1100 if (skip_line_comment (pfile) && CPP_OPTION (pfile, warn_comments))
1101 cpp_error (pfile, DL_WARNING, "multi-line comment");
1103 else if (c == '=')
1105 result->type = CPP_DIV_EQ;
1106 break;
1108 else
1110 BACKUP ();
1111 result->type = CPP_DIV;
1112 break;
1115 if (!pfile->state.save_comments)
1117 result->flags |= PREV_WHITE;
1118 goto update_tokens_line;
1121 /* Save the comment as a token in its own right. */
1122 save_comment (pfile, result, comment_start, c);
1123 break;
1125 case '<':
1126 if (pfile->state.angled_headers)
1128 result->type = CPP_HEADER_NAME;
1129 parse_string (pfile, result, '>');
1130 break;
1133 c = get_effective_char (pfile);
1134 if (c == '=')
1135 result->type = CPP_LESS_EQ;
1136 else if (c == '<')
1137 IF_NEXT_IS ('=', CPP_LSHIFT_EQ, CPP_LSHIFT);
1138 else if (c == '?' && CPP_OPTION (pfile, cplusplus))
1139 IF_NEXT_IS ('=', CPP_MIN_EQ, CPP_MIN);
1140 else if (c == ':' && CPP_OPTION (pfile, digraphs))
1142 result->type = CPP_OPEN_SQUARE;
1143 result->flags |= DIGRAPH;
1145 else if (c == '%' && CPP_OPTION (pfile, digraphs))
1147 result->type = CPP_OPEN_BRACE;
1148 result->flags |= DIGRAPH;
1150 else
1152 BACKUP ();
1153 result->type = CPP_LESS;
1155 break;
1157 case '>':
1158 c = get_effective_char (pfile);
1159 if (c == '=')
1160 result->type = CPP_GREATER_EQ;
1161 else if (c == '>')
1162 IF_NEXT_IS ('=', CPP_RSHIFT_EQ, CPP_RSHIFT);
1163 else if (c == '?' && CPP_OPTION (pfile, cplusplus))
1164 IF_NEXT_IS ('=', CPP_MAX_EQ, CPP_MAX);
1165 else
1167 BACKUP ();
1168 result->type = CPP_GREATER;
1170 break;
1172 case '%':
1173 c = get_effective_char (pfile);
1174 if (c == '=')
1175 result->type = CPP_MOD_EQ;
1176 else if (CPP_OPTION (pfile, digraphs) && c == ':')
1178 result->flags |= DIGRAPH;
1179 result->type = CPP_HASH;
1180 if (get_effective_char (pfile) == '%')
1182 const unsigned char *pos = buffer->cur;
1184 if (get_effective_char (pfile) == ':')
1185 result->type = CPP_PASTE;
1186 else
1187 buffer->cur = pos - 1;
1189 else
1190 BACKUP ();
1192 else if (CPP_OPTION (pfile, digraphs) && c == '>')
1194 result->flags |= DIGRAPH;
1195 result->type = CPP_CLOSE_BRACE;
1197 else
1199 BACKUP ();
1200 result->type = CPP_MOD;
1202 break;
1204 case '.':
1205 result->type = CPP_DOT;
1206 c = get_effective_char (pfile);
1207 if (c == '.')
1209 const unsigned char *pos = buffer->cur;
1211 if (get_effective_char (pfile) == '.')
1212 result->type = CPP_ELLIPSIS;
1213 else
1214 buffer->cur = pos - 1;
1216 /* All known character sets have 0...9 contiguous. */
1217 else if (ISDIGIT (c))
1219 result->type = CPP_NUMBER;
1220 parse_number (pfile, &result->val.str, 1);
1222 else if (c == '*' && CPP_OPTION (pfile, cplusplus))
1223 result->type = CPP_DOT_STAR;
1224 else
1225 BACKUP ();
1226 break;
1228 case '+':
1229 c = get_effective_char (pfile);
1230 if (c == '+')
1231 result->type = CPP_PLUS_PLUS;
1232 else if (c == '=')
1233 result->type = CPP_PLUS_EQ;
1234 else
1236 BACKUP ();
1237 result->type = CPP_PLUS;
1239 break;
1241 case '-':
1242 c = get_effective_char (pfile);
1243 if (c == '>')
1245 result->type = CPP_DEREF;
1246 if (CPP_OPTION (pfile, cplusplus))
1248 if (get_effective_char (pfile) == '*')
1249 result->type = CPP_DEREF_STAR;
1250 else
1251 BACKUP ();
1254 else if (c == '-')
1255 result->type = CPP_MINUS_MINUS;
1256 else if (c == '=')
1257 result->type = CPP_MINUS_EQ;
1258 else
1260 BACKUP ();
1261 result->type = CPP_MINUS;
1263 break;
1265 case '&':
1266 c = get_effective_char (pfile);
1267 if (c == '&')
1268 result->type = CPP_AND_AND;
1269 else if (c == '=')
1270 result->type = CPP_AND_EQ;
1271 else
1273 BACKUP ();
1274 result->type = CPP_AND;
1276 break;
1278 case '|':
1279 c = get_effective_char (pfile);
1280 if (c == '|')
1281 result->type = CPP_OR_OR;
1282 else if (c == '=')
1283 result->type = CPP_OR_EQ;
1284 else
1286 BACKUP ();
1287 result->type = CPP_OR;
1289 break;
1291 case ':':
1292 c = get_effective_char (pfile);
1293 if (c == ':' && CPP_OPTION (pfile, cplusplus))
1294 result->type = CPP_SCOPE;
1295 else if (c == '>' && CPP_OPTION (pfile, digraphs))
1297 result->flags |= DIGRAPH;
1298 result->type = CPP_CLOSE_SQUARE;
1300 else
1302 BACKUP ();
1303 result->type = CPP_COLON;
1305 break;
1307 case '*': IF_NEXT_IS ('=', CPP_MULT_EQ, CPP_MULT); break;
1308 case '=': IF_NEXT_IS ('=', CPP_EQ_EQ, CPP_EQ); break;
1309 case '!': IF_NEXT_IS ('=', CPP_NOT_EQ, CPP_NOT); break;
1310 case '^': IF_NEXT_IS ('=', CPP_XOR_EQ, CPP_XOR); break;
1311 case '#': IF_NEXT_IS ('#', CPP_PASTE, CPP_HASH); break;
1313 case '~': result->type = CPP_COMPL; break;
1314 case ',': result->type = CPP_COMMA; break;
1315 case '(': result->type = CPP_OPEN_PAREN; break;
1316 case ')': result->type = CPP_CLOSE_PAREN; break;
1317 case '[': result->type = CPP_OPEN_SQUARE; break;
1318 case ']': result->type = CPP_CLOSE_SQUARE; break;
1319 case '{': result->type = CPP_OPEN_BRACE; break;
1320 case '}': result->type = CPP_CLOSE_BRACE; break;
1321 case ';': result->type = CPP_SEMICOLON; break;
1323 /* @ is a punctuator in Objective C. */
1324 case '@': result->type = CPP_ATSIGN; break;
1326 case '$':
1327 if (CPP_OPTION (pfile, dollars_in_ident))
1328 goto start_ident;
1329 /* Fall through... */
1331 random_char:
1332 default:
1333 result->type = CPP_OTHER;
1334 result->val.c = c;
1335 break;
1338 return result;
1341 /* An upper bound on the number of bytes needed to spell TOKEN,
1342 including preceding whitespace. */
1343 unsigned int
1344 cpp_token_len (token)
1345 const cpp_token *token;
1347 unsigned int len;
1349 switch (TOKEN_SPELL (token))
1351 default: len = 0; break;
1352 case SPELL_NUMBER:
1353 case SPELL_STRING: len = token->val.str.len; break;
1354 case SPELL_IDENT: len = NODE_LEN (token->val.node); break;
1356 /* 1 for whitespace, 4 for comment delimiters. */
1357 return len + 5;
1360 /* Write the spelling of a token TOKEN to BUFFER. The buffer must
1361 already contain the enough space to hold the token's spelling.
1362 Returns a pointer to the character after the last character
1363 written. */
1364 unsigned char *
1365 cpp_spell_token (pfile, token, buffer)
1366 cpp_reader *pfile; /* Would be nice to be rid of this... */
1367 const cpp_token *token;
1368 unsigned char *buffer;
1370 switch (TOKEN_SPELL (token))
1372 case SPELL_OPERATOR:
1374 const unsigned char *spelling;
1375 unsigned char c;
1377 if (token->flags & DIGRAPH)
1378 spelling
1379 = digraph_spellings[(int) token->type - (int) CPP_FIRST_DIGRAPH];
1380 else if (token->flags & NAMED_OP)
1381 goto spell_ident;
1382 else
1383 spelling = TOKEN_NAME (token);
1385 while ((c = *spelling++) != '\0')
1386 *buffer++ = c;
1388 break;
1390 case SPELL_CHAR:
1391 *buffer++ = token->val.c;
1392 break;
1394 spell_ident:
1395 case SPELL_IDENT:
1396 memcpy (buffer, NODE_NAME (token->val.node), NODE_LEN (token->val.node));
1397 buffer += NODE_LEN (token->val.node);
1398 break;
1400 case SPELL_NUMBER:
1401 memcpy (buffer, token->val.str.text, token->val.str.len);
1402 buffer += token->val.str.len;
1403 break;
1405 case SPELL_STRING:
1407 int left, right, tag;
1408 switch (token->type)
1410 case CPP_STRING: left = '"'; right = '"'; tag = '\0'; break;
1411 case CPP_WSTRING: left = '"'; right = '"'; tag = 'L'; break;
1412 case CPP_CHAR: left = '\''; right = '\''; tag = '\0'; break;
1413 case CPP_WCHAR: left = '\''; right = '\''; tag = 'L'; break;
1414 case CPP_HEADER_NAME: left = '<'; right = '>'; tag = '\0'; break;
1415 default:
1416 cpp_error (pfile, DL_ICE, "unknown string token %s\n",
1417 TOKEN_NAME (token));
1418 return buffer;
1420 if (tag) *buffer++ = tag;
1421 *buffer++ = left;
1422 memcpy (buffer, token->val.str.text, token->val.str.len);
1423 buffer += token->val.str.len;
1424 *buffer++ = right;
1426 break;
1428 case SPELL_NONE:
1429 cpp_error (pfile, DL_ICE, "unspellable token %s", TOKEN_NAME (token));
1430 break;
1433 return buffer;
1436 /* Returns TOKEN spelt as a null-terminated string. The string is
1437 freed when the reader is destroyed. Useful for diagnostics. */
1438 unsigned char *
1439 cpp_token_as_text (pfile, token)
1440 cpp_reader *pfile;
1441 const cpp_token *token;
1443 unsigned int len = cpp_token_len (token);
1444 unsigned char *start = _cpp_unaligned_alloc (pfile, len), *end;
1446 end = cpp_spell_token (pfile, token, start);
1447 end[0] = '\0';
1449 return start;
1452 /* Used by C front ends, which really should move to using
1453 cpp_token_as_text. */
1454 const char *
1455 cpp_type2name (type)
1456 enum cpp_ttype type;
1458 return (const char *) token_spellings[type].name;
1461 /* Writes the spelling of token to FP, without any preceding space.
1462 Separated from cpp_spell_token for efficiency - to avoid stdio
1463 double-buffering. */
1464 void
1465 cpp_output_token (token, fp)
1466 const cpp_token *token;
1467 FILE *fp;
1469 switch (TOKEN_SPELL (token))
1471 case SPELL_OPERATOR:
1473 const unsigned char *spelling;
1474 int c;
1476 if (token->flags & DIGRAPH)
1477 spelling
1478 = digraph_spellings[(int) token->type - (int) CPP_FIRST_DIGRAPH];
1479 else if (token->flags & NAMED_OP)
1480 goto spell_ident;
1481 else
1482 spelling = TOKEN_NAME (token);
1484 c = *spelling;
1486 putc (c, fp);
1487 while ((c = *++spelling) != '\0');
1489 break;
1491 case SPELL_CHAR:
1492 putc (token->val.c, fp);
1493 break;
1495 spell_ident:
1496 case SPELL_IDENT:
1497 fwrite (NODE_NAME (token->val.node), 1, NODE_LEN (token->val.node), fp);
1498 break;
1500 case SPELL_NUMBER:
1501 fwrite (token->val.str.text, 1, token->val.str.len, fp);
1502 break;
1504 case SPELL_STRING:
1506 int left, right, tag;
1507 switch (token->type)
1509 case CPP_STRING: left = '"'; right = '"'; tag = '\0'; break;
1510 case CPP_WSTRING: left = '"'; right = '"'; tag = 'L'; break;
1511 case CPP_CHAR: left = '\''; right = '\''; tag = '\0'; break;
1512 case CPP_WCHAR: left = '\''; right = '\''; tag = 'L'; break;
1513 case CPP_HEADER_NAME: left = '<'; right = '>'; tag = '\0'; break;
1514 default:
1515 fprintf (stderr, "impossible STRING token %s\n", TOKEN_NAME (token));
1516 return;
1518 if (tag) putc (tag, fp);
1519 putc (left, fp);
1520 fwrite (token->val.str.text, 1, token->val.str.len, fp);
1521 putc (right, fp);
1523 break;
1525 case SPELL_NONE:
1526 /* An error, most probably. */
1527 break;
1531 /* Compare two tokens. */
1533 _cpp_equiv_tokens (a, b)
1534 const cpp_token *a, *b;
1536 if (a->type == b->type && a->flags == b->flags)
1537 switch (TOKEN_SPELL (a))
1539 default: /* Keep compiler happy. */
1540 case SPELL_OPERATOR:
1541 return 1;
1542 case SPELL_CHAR:
1543 return a->val.c == b->val.c; /* Character. */
1544 case SPELL_NONE:
1545 return (a->type != CPP_MACRO_ARG || a->val.arg_no == b->val.arg_no);
1546 case SPELL_IDENT:
1547 return a->val.node == b->val.node;
1548 case SPELL_NUMBER:
1549 case SPELL_STRING:
1550 return (a->val.str.len == b->val.str.len
1551 && !memcmp (a->val.str.text, b->val.str.text,
1552 a->val.str.len));
1555 return 0;
1558 /* Returns nonzero if a space should be inserted to avoid an
1559 accidental token paste for output. For simplicity, it is
1560 conservative, and occasionally advises a space where one is not
1561 needed, e.g. "." and ".2". */
1563 cpp_avoid_paste (pfile, token1, token2)
1564 cpp_reader *pfile;
1565 const cpp_token *token1, *token2;
1567 enum cpp_ttype a = token1->type, b = token2->type;
1568 cppchar_t c;
1570 if (token1->flags & NAMED_OP)
1571 a = CPP_NAME;
1572 if (token2->flags & NAMED_OP)
1573 b = CPP_NAME;
1575 c = EOF;
1576 if (token2->flags & DIGRAPH)
1577 c = digraph_spellings[(int) b - (int) CPP_FIRST_DIGRAPH][0];
1578 else if (token_spellings[b].category == SPELL_OPERATOR)
1579 c = token_spellings[b].name[0];
1581 /* Quickly get everything that can paste with an '='. */
1582 if ((int) a <= (int) CPP_LAST_EQ && c == '=')
1583 return 1;
1585 switch (a)
1587 case CPP_GREATER: return c == '>' || c == '?';
1588 case CPP_LESS: return c == '<' || c == '?' || c == '%' || c == ':';
1589 case CPP_PLUS: return c == '+';
1590 case CPP_MINUS: return c == '-' || c == '>';
1591 case CPP_DIV: return c == '/' || c == '*'; /* Comments. */
1592 case CPP_MOD: return c == ':' || c == '>';
1593 case CPP_AND: return c == '&';
1594 case CPP_OR: return c == '|';
1595 case CPP_COLON: return c == ':' || c == '>';
1596 case CPP_DEREF: return c == '*';
1597 case CPP_DOT: return c == '.' || c == '%' || b == CPP_NUMBER;
1598 case CPP_HASH: return c == '#' || c == '%'; /* Digraph form. */
1599 case CPP_NAME: return ((b == CPP_NUMBER
1600 && name_p (pfile, &token2->val.str))
1601 || b == CPP_NAME
1602 || b == CPP_CHAR || b == CPP_STRING); /* L */
1603 case CPP_NUMBER: return (b == CPP_NUMBER || b == CPP_NAME
1604 || c == '.' || c == '+' || c == '-');
1605 case CPP_OTHER: return (CPP_OPTION (pfile, objc)
1606 && token1->val.c == '@'
1607 && (b == CPP_NAME || b == CPP_STRING));
1608 default: break;
1611 return 0;
1614 /* Output all the remaining tokens on the current line, and a newline
1615 character, to FP. Leading whitespace is removed. If there are
1616 macros, special token padding is not performed. */
1617 void
1618 cpp_output_line (pfile, fp)
1619 cpp_reader *pfile;
1620 FILE *fp;
1622 const cpp_token *token;
1624 token = cpp_get_token (pfile);
1625 while (token->type != CPP_EOF)
1627 cpp_output_token (token, fp);
1628 token = cpp_get_token (pfile);
1629 if (token->flags & PREV_WHITE)
1630 putc (' ', fp);
1633 putc ('\n', fp);
/* Return the numeric value of the hexadecimal digit C.  Aborts if C
   is not a hexadecimal digit.  */
static unsigned int
hex_digit_value (c)
     unsigned int c;
{
  if (!hex_p (c))
    abort ();

  return hex_value (c);
}
1647 /* Parse a '\uNNNN' or '\UNNNNNNNN' sequence. Returns 1 to indicate
1648 failure if cpplib is not parsing C++ or C99. Such failure is
1649 silent, and no variables are updated. Otherwise returns 0, and
1650 warns if -Wtraditional.
1652 [lex.charset]: The character designated by the universal character
1653 name \UNNNNNNNN is that character whose character short name in
1654 ISO/IEC 10646 is NNNNNNNN; the character designated by the
1655 universal character name \uNNNN is that character whose character
1656 short name in ISO/IEC 10646 is 0000NNNN. If the hexadecimal value
1657 for a universal character name is less than 0x20 or in the range
1658 0x7F-0x9F (inclusive), or if the universal character name
1659 designates a character in the basic source character set, then the
1660 program is ill-formed.
1662 We assume that wchar_t is Unicode, so we don't need to do any
1663 mapping. Is this ever wrong?
1665 PC points to the 'u' or 'U', PSTR is points to the byte after PC,
1666 LIMIT is the end of the string or charconst. PSTR is updated to
1667 point after the UCS on return, and the UCS is written into PC. */
1669 static int
1670 maybe_read_ucs (pfile, pstr, limit, pc)
1671 cpp_reader *pfile;
1672 const unsigned char **pstr;
1673 const unsigned char *limit;
1674 cppchar_t *pc;
1676 const unsigned char *p = *pstr;
1677 unsigned int code = 0;
1678 unsigned int c = *pc, length;
1680 /* Only attempt to interpret a UCS for C++ and C99. */
1681 if (! (CPP_OPTION (pfile, cplusplus) || CPP_OPTION (pfile, c99)))
1682 return 1;
1684 if (CPP_WTRADITIONAL (pfile))
1685 cpp_error (pfile, DL_WARNING,
1686 "the meaning of '\\%c' is different in traditional C", c);
1688 length = (c == 'u' ? 4: 8);
1690 if ((size_t) (limit - p) < length)
1692 cpp_error (pfile, DL_ERROR, "incomplete universal-character-name");
1693 /* Skip to the end to avoid more diagnostics. */
1694 p = limit;
1696 else
1698 for (; length; length--, p++)
1700 c = *p;
1701 if (ISXDIGIT (c))
1702 code = (code << 4) + hex_digit_value (c);
1703 else
1705 cpp_error (pfile, DL_ERROR,
1706 "non-hex digit '%c' in universal-character-name", c);
1707 /* We shouldn't skip in case there are multibyte chars. */
1708 break;
1713 #ifdef TARGET_EBCDIC
1714 cpp_error (pfile, DL_ERROR, "universal-character-name on EBCDIC target");
1715 code = 0x3f; /* EBCDIC invalid character */
1716 #else
1717 /* True extended characters are OK. */
1718 if (code >= 0xa0
1719 && !(code & 0x80000000)
1720 && !(code >= 0xD800 && code <= 0xDFFF))
1722 /* The standard permits $, @ and ` to be specified as UCNs. We use
1723 hex escapes so that this also works with EBCDIC hosts. */
1724 else if (code == 0x24 || code == 0x40 || code == 0x60)
1726 /* Don't give another error if one occurred above. */
1727 else if (length == 0)
1728 cpp_error (pfile, DL_ERROR, "universal-character-name out of range");
1729 #endif
1731 *pstr = p;
1732 *pc = code;
1733 return 0;
1736 /* Returns the value of an escape sequence, truncated to the correct
1737 target precision. PSTR points to the input pointer, which is just
1738 after the backslash. LIMIT is how much text we have. WIDE is true
1739 if the escape sequence is part of a wide character constant or
1740 string literal. Handles all relevant diagnostics. */
1741 cppchar_t
1742 cpp_parse_escape (pfile, pstr, limit, wide)
1743 cpp_reader *pfile;
1744 const unsigned char **pstr;
1745 const unsigned char *limit;
1746 int wide;
1748 int unknown = 0;
1749 const unsigned char *str = *pstr;
1750 cppchar_t c, mask;
1751 unsigned int width;
1753 if (wide)
1754 width = CPP_OPTION (pfile, wchar_precision);
1755 else
1756 width = CPP_OPTION (pfile, char_precision);
1757 if (width < BITS_PER_CPPCHAR_T)
1758 mask = ((cppchar_t) 1 << width) - 1;
1759 else
1760 mask = ~0;
1762 c = *str++;
1763 switch (c)
1765 case '\\': case '\'': case '"': case '?': break;
1766 case 'b': c = TARGET_BS; break;
1767 case 'f': c = TARGET_FF; break;
1768 case 'n': c = TARGET_NEWLINE; break;
1769 case 'r': c = TARGET_CR; break;
1770 case 't': c = TARGET_TAB; break;
1771 case 'v': c = TARGET_VT; break;
1773 case '(': case '{': case '[': case '%':
1774 /* '\(', etc, are used at beginning of line to avoid confusing Emacs.
1775 '\%' is used to prevent SCCS from getting confused. */
1776 unknown = CPP_PEDANTIC (pfile);
1777 break;
1779 case 'a':
1780 if (CPP_WTRADITIONAL (pfile))
1781 cpp_error (pfile, DL_WARNING,
1782 "the meaning of '\\a' is different in traditional C");
1783 c = TARGET_BELL;
1784 break;
1786 case 'e': case 'E':
1787 if (CPP_PEDANTIC (pfile))
1788 cpp_error (pfile, DL_PEDWARN,
1789 "non-ISO-standard escape sequence, '\\%c'", (int) c);
1790 c = TARGET_ESC;
1791 break;
1793 case 'u': case 'U':
1794 unknown = maybe_read_ucs (pfile, &str, limit, &c);
1795 break;
1797 case 'x':
1798 if (CPP_WTRADITIONAL (pfile))
1799 cpp_error (pfile, DL_WARNING,
1800 "the meaning of '\\x' is different in traditional C");
1803 cppchar_t i = 0, overflow = 0;
1804 int digits_found = 0;
1806 while (str < limit)
1808 c = *str;
1809 if (! ISXDIGIT (c))
1810 break;
1811 str++;
1812 overflow |= i ^ (i << 4 >> 4);
1813 i = (i << 4) + hex_digit_value (c);
1814 digits_found = 1;
1817 if (!digits_found)
1818 cpp_error (pfile, DL_ERROR,
1819 "\\x used with no following hex digits");
1821 if (overflow | (i != (i & mask)))
1823 cpp_error (pfile, DL_PEDWARN,
1824 "hex escape sequence out of range");
1825 i &= mask;
1827 c = i;
1829 break;
1831 case '0': case '1': case '2': case '3':
1832 case '4': case '5': case '6': case '7':
1834 size_t count = 0;
1835 cppchar_t i = c - '0';
1837 while (str < limit && ++count < 3)
1839 c = *str;
1840 if (c < '0' || c > '7')
1841 break;
1842 str++;
1843 i = (i << 3) + c - '0';
1846 if (i != (i & mask))
1848 cpp_error (pfile, DL_PEDWARN,
1849 "octal escape sequence out of range");
1850 i &= mask;
1852 c = i;
1854 break;
1856 default:
1857 unknown = 1;
1858 break;
1861 if (unknown)
1863 if (ISGRAPH (c))
1864 cpp_error (pfile, DL_PEDWARN,
1865 "unknown escape sequence '\\%c'", (int) c);
1866 else
1867 cpp_error (pfile, DL_PEDWARN,
1868 "unknown escape sequence: '\\%03o'", (int) c);
1871 if (c > mask)
1873 cpp_error (pfile, DL_PEDWARN, "escape sequence out of range for its type");
1874 c &= mask;
1877 *pstr = str;
1878 return c;
1881 /* Interpret a (possibly wide) character constant in TOKEN.
1882 WARN_MULTI warns about multi-character charconsts. PCHARS_SEEN
1883 points to a variable that is filled in with the number of
1884 characters seen, and UNSIGNEDP to a variable that indicates whether
1885 the result has signed type. */
1886 cppchar_t
1887 cpp_interpret_charconst (pfile, token, pchars_seen, unsignedp)
1888 cpp_reader *pfile;
1889 const cpp_token *token;
1890 unsigned int *pchars_seen;
1891 int *unsignedp;
1893 const unsigned char *str = token->val.str.text;
1894 const unsigned char *limit = str + token->val.str.len;
1895 unsigned int chars_seen = 0;
1896 size_t width, max_chars;
1897 cppchar_t c, mask, result = 0;
1898 bool unsigned_p;
1900 #ifdef MULTIBYTE_CHARS
1901 (void) local_mbtowc (NULL, NULL, 0);
1902 #endif
1904 /* Width in bits. */
1905 if (token->type == CPP_CHAR)
1907 width = CPP_OPTION (pfile, char_precision);
1908 max_chars = CPP_OPTION (pfile, int_precision) / width;
1909 unsigned_p = CPP_OPTION (pfile, unsigned_char);
1911 else
1913 width = CPP_OPTION (pfile, wchar_precision);
1914 max_chars = 1;
1915 unsigned_p = CPP_OPTION (pfile, unsigned_wchar);
1918 if (width < BITS_PER_CPPCHAR_T)
1919 mask = ((cppchar_t) 1 << width) - 1;
1920 else
1921 mask = ~0;
1923 while (str < limit)
1925 #ifdef MULTIBYTE_CHARS
1926 wchar_t wc;
1927 int char_len;
1929 char_len = local_mbtowc (&wc, str, limit - str);
1930 if (char_len == -1)
1932 cpp_error (pfile, DL_WARNING,
1933 "ignoring invalid multibyte character");
1934 c = *str++;
1936 else
1938 str += char_len;
1939 c = wc;
1941 #else
1942 c = *str++;
1943 #endif
1945 if (c == '\\')
1946 c = cpp_parse_escape (pfile, &str, limit, token->type == CPP_WCHAR);
1948 #ifdef MAP_CHARACTER
1949 if (ISPRINT (c))
1950 c = MAP_CHARACTER (c);
1951 #endif
1953 chars_seen++;
1955 /* Truncate the character, scale the result and merge the two. */
1956 c &= mask;
1957 if (width < BITS_PER_CPPCHAR_T)
1958 result = (result << width) | c;
1959 else
1960 result = c;
1963 if (chars_seen == 0)
1964 cpp_error (pfile, DL_ERROR, "empty character constant");
1965 else if (chars_seen > 1)
1967 /* Multichar charconsts are of type int and therefore signed. */
1968 unsigned_p = 0;
1970 if (chars_seen > max_chars)
1972 chars_seen = max_chars;
1973 cpp_error (pfile, DL_WARNING,
1974 "character constant too long for its type");
1976 else if (CPP_OPTION (pfile, warn_multichar))
1977 cpp_error (pfile, DL_WARNING, "multi-character character constant");
1980 /* Sign-extend or truncate the constant to cppchar_t. The value is
1981 in WIDTH bits, but for multi-char charconsts it's value is the
1982 full target type's width. */
1983 if (chars_seen > 1)
1984 width *= max_chars;
1985 if (width < BITS_PER_CPPCHAR_T)
1987 mask = ((cppchar_t) 1 << width) - 1;
1988 if (unsigned_p || !(result & (1 << (width - 1))))
1989 result &= mask;
1990 else
1991 result |= ~mask;
1994 *pchars_seen = chars_seen;
1995 *unsignedp = unsigned_p;
1996 return result;
/* Memory buffers.  Changing these three constants can have a dramatic
   effect on performance.  The values here are reasonable defaults,
   but might be tuned.  If you adjust them, be sure to test across a
   range of uses of cpplib, including heavy nested function-like macro
   expansion.  Also check the change in peak memory usage (NJAMD is a
   good tool for this).

   MIN_BUFF_SIZE is the smallest buffer ever created.
   BUFF_SIZE_UPPER_BOUND (MIN_SIZE) is the largest free buffer worth
   reusing for a request of MIN_SIZE bytes.  EXTENDED_BUFF_SIZE (BUFF,
   MIN_EXTRA) is the size requested when growing BUFF: MIN_EXTRA plus
   twice the bytes between BUFF's cur and limit.  Every macro argument
   is parenthesized so that any expression may be passed.  */
#define MIN_BUFF_SIZE 8000
#define BUFF_SIZE_UPPER_BOUND(MIN_SIZE) (MIN_BUFF_SIZE + (MIN_SIZE) * 3 / 2)
#define EXTENDED_BUFF_SIZE(BUFF, MIN_EXTRA) \
        ((MIN_EXTRA) + ((BUFF)->limit - (BUFF)->cur) * 2)

#if MIN_BUFF_SIZE > BUFF_SIZE_UPPER_BOUND (0)
#error BUFF_SIZE_UPPER_BOUND must be at least as large as MIN_BUFF_SIZE!
#endif

/* Exists only so that the offset of U can be measured; that offset
   serves as the default alignment for buffer sizes.  */
struct dummy
{
  char c;
  union
  {
    double d;
    int *p;
  } u;
};

#define DEFAULT_ALIGNMENT (offsetof (struct dummy, u))

/* Round SIZE up to a multiple of ALIGN, which must be a power of two.  */
#define CPP_ALIGN(size, align) (((size) + ((align) - 1)) & ~((align) - 1))
2027 /* Create a new allocation buffer. Place the control block at the end
2028 of the buffer, so that buffer overflows will cause immediate chaos. */
2029 static _cpp_buff *
2030 new_buff (len)
2031 size_t len;
2033 _cpp_buff *result;
2034 unsigned char *base;
2036 if (len < MIN_BUFF_SIZE)
2037 len = MIN_BUFF_SIZE;
2038 len = CPP_ALIGN (len, DEFAULT_ALIGNMENT);
2040 base = xmalloc (len + sizeof (_cpp_buff));
2041 result = (_cpp_buff *) (base + len);
2042 result->base = base;
2043 result->cur = base;
2044 result->limit = base + len;
2045 result->next = NULL;
2046 return result;
2049 /* Place a chain of unwanted allocation buffers on the free list. */
2050 void
2051 _cpp_release_buff (pfile, buff)
2052 cpp_reader *pfile;
2053 _cpp_buff *buff;
2055 _cpp_buff *end = buff;
2057 while (end->next)
2058 end = end->next;
2059 end->next = pfile->free_buffs;
2060 pfile->free_buffs = buff;
2063 /* Return a free buffer of size at least MIN_SIZE. */
2064 _cpp_buff *
2065 _cpp_get_buff (pfile, min_size)
2066 cpp_reader *pfile;
2067 size_t min_size;
2069 _cpp_buff *result, **p;
2071 for (p = &pfile->free_buffs;; p = &(*p)->next)
2073 size_t size;
2075 if (*p == NULL)
2076 return new_buff (min_size);
2077 result = *p;
2078 size = result->limit - result->base;
2079 /* Return a buffer that's big enough, but don't waste one that's
2080 way too big. */
2081 if (size >= min_size && size <= BUFF_SIZE_UPPER_BOUND (min_size))
2082 break;
2085 *p = result->next;
2086 result->next = NULL;
2087 result->cur = result->base;
2088 return result;
2091 /* Creates a new buffer with enough space to hold the uncommitted
2092 remaining bytes of BUFF, and at least MIN_EXTRA more bytes. Copies
2093 the excess bytes to the new buffer. Chains the new buffer after
2094 BUFF, and returns the new buffer. */
2095 _cpp_buff *
2096 _cpp_append_extend_buff (pfile, buff, min_extra)
2097 cpp_reader *pfile;
2098 _cpp_buff *buff;
2099 size_t min_extra;
2101 size_t size = EXTENDED_BUFF_SIZE (buff, min_extra);
2102 _cpp_buff *new_buff = _cpp_get_buff (pfile, size);
2104 buff->next = new_buff;
2105 memcpy (new_buff->base, buff->cur, BUFF_ROOM (buff));
2106 return new_buff;
2109 /* Creates a new buffer with enough space to hold the uncommitted
2110 remaining bytes of the buffer pointed to by BUFF, and at least
2111 MIN_EXTRA more bytes. Copies the excess bytes to the new buffer.
2112 Chains the new buffer before the buffer pointed to by BUFF, and
2113 updates the pointer to point to the new buffer. */
2114 void
2115 _cpp_extend_buff (pfile, pbuff, min_extra)
2116 cpp_reader *pfile;
2117 _cpp_buff **pbuff;
2118 size_t min_extra;
2120 _cpp_buff *new_buff, *old_buff = *pbuff;
2121 size_t size = EXTENDED_BUFF_SIZE (old_buff, min_extra);
2123 new_buff = _cpp_get_buff (pfile, size);
2124 memcpy (new_buff->base, old_buff->cur, BUFF_ROOM (old_buff));
2125 new_buff->next = old_buff;
2126 *pbuff = new_buff;
2129 /* Free a chain of buffers starting at BUFF. */
2130 void
2131 _cpp_free_buff (buff)
2132 _cpp_buff *buff;
2134 _cpp_buff *next;
2136 for (; buff; buff = next)
2138 next = buff->next;
2139 free (buff->base);
2143 /* Allocate permanent, unaligned storage of length LEN. */
2144 unsigned char *
2145 _cpp_unaligned_alloc (pfile, len)
2146 cpp_reader *pfile;
2147 size_t len;
2149 _cpp_buff *buff = pfile->u_buff;
2150 unsigned char *result = buff->cur;
2152 if (len > (size_t) (buff->limit - result))
2154 buff = _cpp_get_buff (pfile, len);
2155 buff->next = pfile->u_buff;
2156 pfile->u_buff = buff;
2157 result = buff->cur;
2160 buff->cur = result + len;
2161 return result;
2164 /* Allocate permanent, unaligned storage of length LEN from a_buff.
2165 That buffer is used for growing allocations when saving macro
2166 replacement lists in a #define, and when parsing an answer to an
2167 assertion in #assert, #unassert or #if (and therefore possibly
2168 whilst expanding macros). It therefore must not be used by any
2169 code that they might call: specifically the lexer and the guts of
2170 the macro expander.
2172 All existing other uses clearly fit this restriction: storing
2173 registered pragmas during initialization. */
2174 unsigned char *
2175 _cpp_aligned_alloc (pfile, len)
2176 cpp_reader *pfile;
2177 size_t len;
2179 _cpp_buff *buff = pfile->a_buff;
2180 unsigned char *result = buff->cur;
2182 if (len > (size_t) (buff->limit - result))
2184 buff = _cpp_get_buff (pfile, len);
2185 buff->next = pfile->a_buff;
2186 pfile->a_buff = buff;
2187 result = buff->cur;
2190 buff->cur = result + len;
2191 return result;