2002-08-22 Paolo Carlini <pcarlini@unitus.it>
[official-gcc.git] / gcc / cpplex.c
blob7942c96ad3010e320fc59b421c32ae1682fffea5
1 /* CPP Library - lexical analysis.
2 Copyright (C) 2000, 2001, 2002 Free Software Foundation, Inc.
3 Contributed by Per Bothner, 1994-95.
4 Based on CCCP program by Paul Rubin, June 1986
5 Adapted to ANSI C, Richard Stallman, Jan 1987
6 Broken out to separate file, Zack Weinberg, Mar 2000
7 Single-pass line tokenization by Neil Booth, April 2000
9 This program is free software; you can redistribute it and/or modify it
10 under the terms of the GNU General Public License as published by the
11 Free Software Foundation; either version 2, or (at your option) any
12 later version.
14 This program is distributed in the hope that it will be useful,
15 but WITHOUT ANY WARRANTY; without even the implied warranty of
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 GNU General Public License for more details.
19 You should have received a copy of the GNU General Public License
20 along with this program; if not, write to the Free Software
21 Foundation, 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
23 #include "config.h"
24 #include "system.h"
25 #include "cpplib.h"
26 #include "cpphash.h"
28 #ifdef MULTIBYTE_CHARS
29 #include "mbchar.h"
30 #include <locale.h>
31 #endif
/* How the spelling of a token is obtained.  Tokens with SPELL_STRING
   store their spelling in the token list, and its length in the
   token->val.name.len.  SPELL_OPERATOR is the category given to every
   operator entry of the token_spellings table.  */
enum spell_type
{
  SPELL_OPERATOR = 0,
  SPELL_CHAR,
  SPELL_IDENT,
  SPELL_NUMBER,
  SPELL_STRING,
  SPELL_NONE
};
45 struct token_spelling
47 enum spell_type category;
48 const unsigned char *name;
/* Spellings of the six digraph tokens.  */
static const unsigned char *const digraph_spellings[] =
{ U"%:", U"%:%:", U"<:", U":>", U"<%", U"%>" };
54 #define OP(e, s) { SPELL_OPERATOR, U s },
55 #define TK(e, s) { s, U STRINGX (e) },
56 static const struct token_spelling token_spellings[N_TTYPES] = { TTYPE_TABLE };
57 #undef OP
58 #undef TK
/* Look up the spelling category and the name of TOKEN's type in
   token_spellings.  */
#define TOKEN_SPELL(token) (token_spellings[(token)->type].category)
#define TOKEN_NAME(token) (token_spellings[(token)->type].name)

/* Return to the position recorded in backup_to.  This relies on a
   variable named `buffer' being in scope at the point of use.  */
#define BACKUP() do {buffer->cur = buffer->backup_to;} while (0)
64 static void handle_newline PARAMS ((cpp_reader *));
65 static cppchar_t skip_escaped_newlines PARAMS ((cpp_reader *));
66 static cppchar_t get_effective_char PARAMS ((cpp_reader *));
68 static int skip_block_comment PARAMS ((cpp_reader *));
69 static int skip_line_comment PARAMS ((cpp_reader *));
70 static void adjust_column PARAMS ((cpp_reader *));
71 static int skip_whitespace PARAMS ((cpp_reader *, cppchar_t));
72 static cpp_hashnode *parse_identifier PARAMS ((cpp_reader *));
73 static uchar *parse_slow PARAMS ((cpp_reader *, const uchar *, int,
74 unsigned int *));
75 static void parse_number PARAMS ((cpp_reader *, cpp_string *, int));
76 static int unescaped_terminator_p PARAMS ((cpp_reader *, const uchar *));
77 static void parse_string PARAMS ((cpp_reader *, cpp_token *, cppchar_t));
78 static bool trigraph_p PARAMS ((cpp_reader *));
79 static void save_comment PARAMS ((cpp_reader *, cpp_token *, const uchar *,
80 cppchar_t));
81 static bool continue_after_nul PARAMS ((cpp_reader *));
82 static int name_p PARAMS ((cpp_reader *, const cpp_string *));
83 static int maybe_read_ucs PARAMS ((cpp_reader *, const unsigned char **,
84 const unsigned char *, cppchar_t *));
85 static tokenrun *next_tokenrun PARAMS ((tokenrun *));
87 static unsigned int hex_digit_value PARAMS ((unsigned int));
88 static _cpp_buff *new_buff PARAMS ((size_t));
90 /* Utility routine:
92 Compares, the token TOKEN to the NUL-terminated string STRING.
93 TOKEN must be a CPP_NAME. Returns 1 for equal, 0 for unequal. */
94 int
95 cpp_ideq (token, string)
96 const cpp_token *token;
97 const char *string;
99 if (token->type != CPP_NAME)
100 return 0;
102 return !ustrcmp (NODE_NAME (token->val.node), (const uchar *) string);
105 /* Call when meeting a newline, assumed to be in buffer->cur[-1].
106 Returns with buffer->cur pointing to the character immediately
107 following the newline (combination). */
108 static void
109 handle_newline (pfile)
110 cpp_reader *pfile;
112 cpp_buffer *buffer = pfile->buffer;
114 /* Handle CR-LF and LF-CR. Most other implementations (e.g. java)
115 only accept CR-LF; maybe we should fall back to that behaviour? */
116 if (buffer->cur[-1] + buffer->cur[0] == '\r' + '\n')
117 buffer->cur++;
119 buffer->line_base = buffer->cur;
120 buffer->col_adjust = 0;
121 pfile->line++;
124 /* Subroutine of skip_escaped_newlines; called when a 3-character
125 sequence beginning with "??" is encountered. buffer->cur points to
126 the second '?'.
128 Warn if necessary, and returns true if the sequence forms a
129 trigraph and the trigraph should be honoured. */
130 static bool
131 trigraph_p (pfile)
132 cpp_reader *pfile;
134 cpp_buffer *buffer = pfile->buffer;
135 cppchar_t from_char = buffer->cur[1];
136 bool accept;
138 if (!_cpp_trigraph_map[from_char])
139 return false;
141 accept = CPP_OPTION (pfile, trigraphs);
143 /* Don't warn about trigraphs in comments. */
144 if (CPP_OPTION (pfile, warn_trigraphs) && !pfile->state.lexing_comment)
146 if (accept)
147 cpp_error_with_line (pfile, DL_WARNING,
148 pfile->line, CPP_BUF_COL (buffer) - 1,
149 "trigraph ??%c converted to %c",
150 (int) from_char,
151 (int) _cpp_trigraph_map[from_char]);
152 else if (buffer->cur != buffer->last_Wtrigraphs)
154 buffer->last_Wtrigraphs = buffer->cur;
155 cpp_error_with_line (pfile, DL_WARNING,
156 pfile->line, CPP_BUF_COL (buffer) - 1,
157 "trigraph ??%c ignored", (int) from_char);
161 return accept;
164 /* Skips any escaped newlines introduced by '?' or a '\\', assumed to
165 lie in buffer->cur[-1]. Returns the next byte, which will be in
166 buffer->cur[-1]. This routine performs preprocessing stages 1 and
167 2 of the ISO C standard. */
168 static cppchar_t
169 skip_escaped_newlines (pfile)
170 cpp_reader *pfile;
172 cpp_buffer *buffer = pfile->buffer;
173 cppchar_t next = buffer->cur[-1];
175 /* Only do this if we apply stages 1 and 2. */
176 if (!buffer->from_stage3)
178 const unsigned char *saved_cur;
179 cppchar_t next1;
183 if (next == '?')
185 if (buffer->cur[0] != '?' || !trigraph_p (pfile))
186 break;
188 /* Translate the trigraph. */
189 next = _cpp_trigraph_map[buffer->cur[1]];
190 buffer->cur += 2;
191 if (next != '\\')
192 break;
195 if (buffer->cur == buffer->rlimit)
196 break;
198 /* We have a backslash, and room for at least one more
199 character. Skip horizontal whitespace. */
200 saved_cur = buffer->cur;
202 next1 = *buffer->cur++;
203 while (is_nvspace (next1) && buffer->cur < buffer->rlimit);
205 if (!is_vspace (next1))
207 buffer->cur = saved_cur;
208 break;
211 if (saved_cur != buffer->cur - 1
212 && !pfile->state.lexing_comment)
213 cpp_error (pfile, DL_WARNING,
214 "backslash and newline separated by space");
216 handle_newline (pfile);
217 buffer->backup_to = buffer->cur;
218 if (buffer->cur == buffer->rlimit)
220 cpp_error (pfile, DL_PEDWARN,
221 "backslash-newline at end of file");
222 next = EOF;
224 else
225 next = *buffer->cur++;
227 while (next == '\\' || next == '?');
230 return next;
233 /* Obtain the next character, after trigraph conversion and skipping
234 an arbitrarily long string of escaped newlines. The common case of
235 no trigraphs or escaped newlines falls through quickly. On return,
236 buffer->backup_to points to where to return to if the character is
237 not to be processed. */
238 static cppchar_t
239 get_effective_char (pfile)
240 cpp_reader *pfile;
242 cppchar_t next;
243 cpp_buffer *buffer = pfile->buffer;
245 buffer->backup_to = buffer->cur;
246 next = *buffer->cur++;
247 if (__builtin_expect (next == '?' || next == '\\', 0))
248 next = skip_escaped_newlines (pfile);
250 return next;
253 /* Skip a C-style block comment. We find the end of the comment by
254 seeing if an asterisk is before every '/' we encounter. Returns
255 non-zero if comment terminated by EOF, zero otherwise. */
256 static int
257 skip_block_comment (pfile)
258 cpp_reader *pfile;
260 cpp_buffer *buffer = pfile->buffer;
261 cppchar_t c = EOF, prevc = EOF;
263 pfile->state.lexing_comment = 1;
264 while (buffer->cur != buffer->rlimit)
266 prevc = c, c = *buffer->cur++;
268 /* FIXME: For speed, create a new character class of characters
269 of interest inside block comments. */
270 if (c == '?' || c == '\\')
271 c = skip_escaped_newlines (pfile);
273 /* People like decorating comments with '*', so check for '/'
274 instead for efficiency. */
275 if (c == '/')
277 if (prevc == '*')
278 break;
280 /* Warn about potential nested comments, but not if the '/'
281 comes immediately before the true comment delimiter.
282 Don't bother to get it right across escaped newlines. */
283 if (CPP_OPTION (pfile, warn_comments)
284 && buffer->cur[0] == '*' && buffer->cur[1] != '/')
285 cpp_error_with_line (pfile, DL_WARNING,
286 pfile->line, CPP_BUF_COL (buffer),
287 "\"/*\" within comment");
289 else if (is_vspace (c))
290 handle_newline (pfile);
291 else if (c == '\t')
292 adjust_column (pfile);
295 pfile->state.lexing_comment = 0;
296 return c != '/' || prevc != '*';
299 /* Skip a C++ line comment, leaving buffer->cur pointing to the
300 terminating newline. Handles escaped newlines. Returns non-zero
301 if a multiline comment. */
302 static int
303 skip_line_comment (pfile)
304 cpp_reader *pfile;
306 cpp_buffer *buffer = pfile->buffer;
307 unsigned int orig_line = pfile->line;
308 cppchar_t c;
309 #ifdef MULTIBYTE_CHARS
310 wchar_t wc;
311 int char_len;
312 #endif
314 pfile->state.lexing_comment = 1;
315 #ifdef MULTIBYTE_CHARS
316 /* Reset multibyte conversion state. */
317 (void) local_mbtowc (NULL, NULL, 0);
318 #endif
321 if (buffer->cur == buffer->rlimit)
322 goto at_eof;
324 #ifdef MULTIBYTE_CHARS
325 char_len = local_mbtowc (&wc, (const char *) buffer->cur,
326 buffer->rlimit - buffer->cur);
327 if (char_len == -1)
329 cpp_error (pfile, DL_WARNING,
330 "ignoring invalid multibyte character");
331 char_len = 1;
332 c = *buffer->cur++;
334 else
336 buffer->cur += char_len;
337 c = wc;
339 #else
340 c = *buffer->cur++;
341 #endif
342 if (c == '?' || c == '\\')
343 c = skip_escaped_newlines (pfile);
345 while (!is_vspace (c));
347 /* Step back over the newline, except at EOF. */
348 buffer->cur--;
349 at_eof:
351 pfile->state.lexing_comment = 0;
352 return orig_line != pfile->line;
355 /* pfile->buffer->cur is one beyond the \t character. Update
356 col_adjust so we track the column correctly. */
357 static void
358 adjust_column (pfile)
359 cpp_reader *pfile;
361 cpp_buffer *buffer = pfile->buffer;
362 unsigned int col = CPP_BUF_COL (buffer) - 1; /* Zero-based column. */
364 /* Round it up to multiple of the tabstop, but subtract 1 since the
365 tab itself occupies a character position. */
366 buffer->col_adjust += (CPP_OPTION (pfile, tabstop)
367 - col % CPP_OPTION (pfile, tabstop)) - 1;
370 /* Skips whitespace, saving the next non-whitespace character.
371 Adjusts pfile->col_adjust to account for tabs. Without this,
372 tokens might be assigned an incorrect column. */
373 static int
374 skip_whitespace (pfile, c)
375 cpp_reader *pfile;
376 cppchar_t c;
378 cpp_buffer *buffer = pfile->buffer;
379 unsigned int warned = 0;
383 /* Horizontal space always OK. */
384 if (c == ' ')
386 else if (c == '\t')
387 adjust_column (pfile);
388 /* Just \f \v or \0 left. */
389 else if (c == '\0')
391 if (buffer->cur - 1 == buffer->rlimit)
392 return 0;
393 if (!warned)
395 cpp_error (pfile, DL_WARNING, "null character(s) ignored");
396 warned = 1;
399 else if (pfile->state.in_directive && CPP_PEDANTIC (pfile))
400 cpp_error_with_line (pfile, DL_PEDWARN, pfile->line,
401 CPP_BUF_COL (buffer),
402 "%s in preprocessing directive",
403 c == '\f' ? "form feed" : "vertical tab");
405 c = *buffer->cur++;
407 /* We only want non-vertical space, i.e. ' ' \t \f \v \0. */
408 while (is_nvspace (c));
410 buffer->cur--;
411 return 1;
414 /* See if the characters of a number token are valid in a name (no
415 '.', '+' or '-'). */
416 static int
417 name_p (pfile, string)
418 cpp_reader *pfile;
419 const cpp_string *string;
421 unsigned int i;
423 for (i = 0; i < string->len; i++)
424 if (!is_idchar (string->text[i]))
425 return 0;
427 return 1;
430 /* Parse an identifier, skipping embedded backslash-newlines. This is
431 a critical inner loop. The common case is an identifier which has
432 not been split by backslash-newline, does not contain a dollar
433 sign, and has already been scanned (roughly 10:1 ratio of
434 seen:unseen identifiers in normal code; the distribution is
435 Poisson-like). Second most common case is a new identifier, not
436 split and no dollar sign. The other possibilities are rare and
437 have been relegated to parse_slow. */
438 static cpp_hashnode *
439 parse_identifier (pfile)
440 cpp_reader *pfile;
442 cpp_hashnode *result;
443 const uchar *cur, *base;
445 /* Fast-path loop. Skim over a normal identifier.
446 N.B. ISIDNUM does not include $. */
447 cur = pfile->buffer->cur;
448 while (ISIDNUM (*cur))
449 cur++;
451 /* Check for slow-path cases. */
452 if (*cur == '?' || *cur == '\\' || *cur == '$')
454 unsigned int len;
456 base = parse_slow (pfile, cur, 0, &len);
457 result = (cpp_hashnode *)
458 ht_lookup (pfile->hash_table, base, len, HT_ALLOCED);
460 else
462 base = pfile->buffer->cur - 1;
463 pfile->buffer->cur = cur;
464 result = (cpp_hashnode *)
465 ht_lookup (pfile->hash_table, base, cur - base, HT_ALLOC);
468 /* Rarely, identifiers require diagnostics when lexed.
469 XXX Has to be forced out of the fast path. */
470 if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
471 && !pfile->state.skipping, 0))
473 /* It is allowed to poison the same identifier twice. */
474 if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
475 cpp_error (pfile, DL_ERROR, "attempt to use poisoned \"%s\"",
476 NODE_NAME (result));
478 /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
479 replacement list of a variadic macro. */
480 if (result == pfile->spec_nodes.n__VA_ARGS__
481 && !pfile->state.va_args_ok)
482 cpp_error (pfile, DL_PEDWARN,
483 "__VA_ARGS__ can only appear in the expansion of a C99 variadic macro");
486 return result;
489 /* Slow path. This handles numbers and identifiers which have been
490 split, or contain dollar signs. The part of the token from
491 PFILE->buffer->cur-1 to CUR has already been scanned. NUMBER_P is
492 1 if it's a number, and 2 if it has a leading period. Returns a
493 pointer to the token's NUL-terminated spelling in permanent
494 storage, and sets PLEN to its length. */
495 static uchar *
496 parse_slow (pfile, cur, number_p, plen)
497 cpp_reader *pfile;
498 const uchar *cur;
499 int number_p;
500 unsigned int *plen;
502 cpp_buffer *buffer = pfile->buffer;
503 const uchar *base = buffer->cur - 1;
504 struct obstack *stack = &pfile->hash_table->stack;
505 unsigned int c, prevc, saw_dollar = 0;
507 /* Place any leading period. */
508 if (number_p == 2)
509 obstack_1grow (stack, '.');
511 /* Copy the part of the token which is known to be okay. */
512 obstack_grow (stack, base, cur - base);
514 /* Now process the part which isn't. We are looking at one of
515 '$', '\\', or '?' on entry to this loop. */
516 prevc = cur[-1];
517 c = *cur++;
518 buffer->cur = cur;
519 for (;;)
521 /* Potential escaped newline? */
522 buffer->backup_to = buffer->cur - 1;
523 if (c == '?' || c == '\\')
524 c = skip_escaped_newlines (pfile);
526 if (!is_idchar (c))
528 if (!number_p)
529 break;
530 if (c != '.' && !VALID_SIGN (c, prevc))
531 break;
534 /* Handle normal identifier characters in this loop. */
537 prevc = c;
538 obstack_1grow (stack, c);
540 if (c == '$')
541 saw_dollar++;
543 c = *buffer->cur++;
545 while (is_idchar (c));
548 /* Step back over the unwanted char. */
549 BACKUP ();
551 /* $ is not an identifier character in the standard, but is commonly
552 accepted as an extension. Don't warn about it in skipped
553 conditional blocks. */
554 if (saw_dollar && CPP_PEDANTIC (pfile) && ! pfile->state.skipping)
555 cpp_error (pfile, DL_PEDWARN, "'$' character(s) in identifier or number");
557 /* Identifiers and numbers are null-terminated. */
558 *plen = obstack_object_size (stack);
559 obstack_1grow (stack, '\0');
560 return obstack_finish (stack);
563 /* Parse a number, beginning with character C, skipping embedded
564 backslash-newlines. LEADING_PERIOD is non-zero if there was a "."
565 before C. Place the result in NUMBER. */
566 static void
567 parse_number (pfile, number, leading_period)
568 cpp_reader *pfile;
569 cpp_string *number;
570 int leading_period;
572 const uchar *cur;
574 /* Fast-path loop. Skim over a normal number.
575 N.B. ISIDNUM does not include $. */
576 cur = pfile->buffer->cur;
577 while (ISIDNUM (*cur) || *cur == '.' || VALID_SIGN (*cur, cur[-1]))
578 cur++;
580 /* Check for slow-path cases. */
581 if (*cur == '?' || *cur == '\\' || *cur == '$')
582 number->text = parse_slow (pfile, cur, 1 + leading_period, &number->len);
583 else
585 const uchar *base = pfile->buffer->cur - 1;
586 uchar *dest;
588 number->len = cur - base + leading_period;
589 dest = _cpp_unaligned_alloc (pfile, number->len + 1);
590 dest[number->len] = '\0';
591 number->text = dest;
593 if (leading_period)
594 *dest++ = '.';
595 memcpy (dest, base, cur - base);
596 pfile->buffer->cur = cur;
600 /* Subroutine of parse_string. */
601 static int
602 unescaped_terminator_p (pfile, dest)
603 cpp_reader *pfile;
604 const unsigned char *dest;
606 const unsigned char *start, *temp;
608 /* In #include-style directives, terminators are not escapeable. */
609 if (pfile->state.angled_headers)
610 return 1;
612 start = BUFF_FRONT (pfile->u_buff);
614 /* An odd number of consecutive backslashes represents an escaped
615 terminator. */
616 for (temp = dest; temp > start && temp[-1] == '\\'; temp--)
619 return ((dest - temp) & 1) == 0;
622 /* Parses a string, character constant, or angle-bracketed header file
623 name. Handles embedded trigraphs and escaped newlines. The stored
624 string is guaranteed NUL-terminated, but it is not guaranteed that
625 this is the first NUL since embedded NULs are preserved.
627 When this function returns, buffer->cur points to the next
628 character to be processed. */
629 static void
630 parse_string (pfile, token, terminator)
631 cpp_reader *pfile;
632 cpp_token *token;
633 cppchar_t terminator;
635 cpp_buffer *buffer = pfile->buffer;
636 unsigned char *dest, *limit;
637 cppchar_t c;
638 bool warned_nulls = false;
639 #ifdef MULTIBYTE_CHARS
640 wchar_t wc;
641 int char_len;
642 #endif
644 dest = BUFF_FRONT (pfile->u_buff);
645 limit = BUFF_LIMIT (pfile->u_buff);
647 #ifdef MULTIBYTE_CHARS
648 /* Reset multibyte conversion state. */
649 (void) local_mbtowc (NULL, NULL, 0);
650 #endif
651 for (;;)
653 /* We need room for another char, possibly the terminating NUL. */
654 if ((size_t) (limit - dest) < 1)
656 size_t len_so_far = dest - BUFF_FRONT (pfile->u_buff);
657 _cpp_extend_buff (pfile, &pfile->u_buff, 2);
658 dest = BUFF_FRONT (pfile->u_buff) + len_so_far;
659 limit = BUFF_LIMIT (pfile->u_buff);
662 #ifdef MULTIBYTE_CHARS
663 char_len = local_mbtowc (&wc, (const char *) buffer->cur,
664 buffer->rlimit - buffer->cur);
665 if (char_len == -1)
667 cpp_error (pfile, DL_WARNING,
668 "ignoring invalid multibyte character");
669 char_len = 1;
670 c = *buffer->cur++;
672 else
674 buffer->cur += char_len;
675 c = wc;
677 #else
678 c = *buffer->cur++;
679 #endif
681 /* Handle trigraphs, escaped newlines etc. */
682 if (c == '?' || c == '\\')
683 c = skip_escaped_newlines (pfile);
685 if (c == terminator)
687 if (unescaped_terminator_p (pfile, dest))
688 break;
690 else if (is_vspace (c))
692 /* No string literal may extend over multiple lines. In
693 assembly language, suppress the error except for <>
694 includes. This is a kludge around not knowing where
695 comments are. */
696 unterminated:
697 if (CPP_OPTION (pfile, lang) != CLK_ASM || terminator == '>')
698 cpp_error (pfile, DL_ERROR, "missing terminating %c character",
699 (int) terminator);
700 buffer->cur--;
701 break;
703 else if (c == '\0')
705 if (buffer->cur - 1 == buffer->rlimit)
706 goto unterminated;
707 if (!warned_nulls)
709 warned_nulls = true;
710 cpp_error (pfile, DL_WARNING,
711 "null character(s) preserved in literal");
714 #ifdef MULTIBYTE_CHARS
715 if (char_len > 1)
717 for ( ; char_len > 0; --char_len)
718 *dest++ = (*buffer->cur - char_len);
720 else
721 #endif
722 *dest++ = c;
725 *dest = '\0';
727 token->val.str.text = BUFF_FRONT (pfile->u_buff);
728 token->val.str.len = dest - BUFF_FRONT (pfile->u_buff);
729 BUFF_FRONT (pfile->u_buff) = dest + 1;
732 /* The stored comment includes the comment start and any terminator. */
733 static void
734 save_comment (pfile, token, from, type)
735 cpp_reader *pfile;
736 cpp_token *token;
737 const unsigned char *from;
738 cppchar_t type;
740 unsigned char *buffer;
741 unsigned int len, clen;
743 len = pfile->buffer->cur - from + 1; /* + 1 for the initial '/'. */
745 /* C++ comments probably (not definitely) have moved past a new
746 line, which we don't want to save in the comment. */
747 if (is_vspace (pfile->buffer->cur[-1]))
748 len--;
750 /* If we are currently in a directive, then we need to store all
751 C++ comments as C comments internally, and so we need to
752 allocate a little extra space in that case.
754 Note that the only time we encounter a directive here is
755 when we are saving comments in a "#define". */
756 clen = (pfile->state.in_directive && type == '/') ? len + 2 : len;
758 buffer = _cpp_unaligned_alloc (pfile, clen);
760 token->type = CPP_COMMENT;
761 token->val.str.len = clen;
762 token->val.str.text = buffer;
764 buffer[0] = '/';
765 memcpy (buffer + 1, from, len - 1);
767 /* Finish conversion to a C comment, if necessary. */
768 if (pfile->state.in_directive && type == '/')
770 buffer[1] = '*';
771 buffer[clen - 2] = '*';
772 buffer[clen - 1] = '/';
776 /* Allocate COUNT tokens for RUN. */
777 void
778 _cpp_init_tokenrun (run, count)
779 tokenrun *run;
780 unsigned int count;
782 run->base = xnewvec (cpp_token, count);
783 run->limit = run->base + count;
784 run->next = NULL;
787 /* Returns the next tokenrun, or creates one if there is none. */
788 static tokenrun *
789 next_tokenrun (run)
790 tokenrun *run;
792 if (run->next == NULL)
794 run->next = xnew (tokenrun);
795 run->next->prev = run;
796 _cpp_init_tokenrun (run->next, 250);
799 return run->next;
802 /* Allocate a single token that is invalidated at the same time as the
803 rest of the tokens on the line. Has its line and col set to the
804 same as the last lexed token, so that diagnostics appear in the
805 right place. */
806 cpp_token *
807 _cpp_temp_token (pfile)
808 cpp_reader *pfile;
810 cpp_token *old, *result;
812 old = pfile->cur_token - 1;
813 if (pfile->cur_token == pfile->cur_run->limit)
815 pfile->cur_run = next_tokenrun (pfile->cur_run);
816 pfile->cur_token = pfile->cur_run->base;
819 result = pfile->cur_token++;
820 result->line = old->line;
821 result->col = old->col;
822 return result;
825 /* Lex a token into RESULT (external interface). Takes care of issues
826 like directive handling, token lookahead, multiple include
827 optimization and skipping. */
828 const cpp_token *
829 _cpp_lex_token (pfile)
830 cpp_reader *pfile;
832 cpp_token *result;
834 for (;;)
836 if (pfile->cur_token == pfile->cur_run->limit)
838 pfile->cur_run = next_tokenrun (pfile->cur_run);
839 pfile->cur_token = pfile->cur_run->base;
842 if (pfile->lookaheads)
844 pfile->lookaheads--;
845 result = pfile->cur_token++;
847 else
848 result = _cpp_lex_direct (pfile);
850 if (result->flags & BOL)
852 /* Is this a directive. If _cpp_handle_directive returns
853 false, it is an assembler #. */
854 if (result->type == CPP_HASH
855 /* 6.10.3 p 11: Directives in a list of macro arguments
856 gives undefined behavior. This implementation
857 handles the directive as normal. */
858 && pfile->state.parsing_args != 1
859 && _cpp_handle_directive (pfile, result->flags & PREV_WHITE))
860 continue;
861 if (pfile->cb.line_change && !pfile->state.skipping)
862 (*pfile->cb.line_change)(pfile, result, pfile->state.parsing_args);
865 /* We don't skip tokens in directives. */
866 if (pfile->state.in_directive)
867 break;
869 /* Outside a directive, invalidate controlling macros. At file
870 EOF, _cpp_lex_direct takes care of popping the buffer, so we never
871 get here and MI optimisation works. */
872 pfile->mi_valid = false;
874 if (!pfile->state.skipping || result->type == CPP_EOF)
875 break;
878 return result;
881 /* A NUL terminates the current buffer. For ISO preprocessing this is
882 EOF, but for traditional preprocessing it indicates we need a line
883 refill. Returns TRUE to continue preprocessing a new buffer, FALSE
884 to return a CPP_EOF to the caller. */
885 static bool
886 continue_after_nul (pfile)
887 cpp_reader *pfile;
889 cpp_buffer *buffer = pfile->buffer;
890 bool more = false;
892 buffer->saved_flags = BOL;
893 if (CPP_OPTION (pfile, traditional))
895 if (pfile->state.in_directive)
896 return false;
898 _cpp_remove_overlay (pfile);
899 more = _cpp_read_logical_line_trad (pfile);
900 _cpp_overlay_buffer (pfile, pfile->out.base,
901 pfile->out.cur - pfile->out.base);
902 pfile->line = pfile->out.first_line;
904 else
906 /* Stop parsing arguments with a CPP_EOF. When we finally come
907 back here, do the work of popping the buffer. */
908 if (!pfile->state.parsing_args)
910 if (buffer->cur != buffer->line_base)
912 /* Non-empty files should end in a newline. Don't warn
913 for command line and _Pragma buffers. */
914 if (!buffer->from_stage3)
915 cpp_error (pfile, DL_PEDWARN, "no newline at end of file");
916 handle_newline (pfile);
919 /* Similarly, finish an in-progress directive with CPP_EOF
920 before popping the buffer. */
921 if (!pfile->state.in_directive && buffer->prev)
923 more = !buffer->return_at_eof;
924 _cpp_pop_buffer (pfile);
929 return more;
/* If the next effective character is CHAR, consume it and give the
   token type THEN_TYPE; otherwise back up over it and give the token
   type ELSE_TYPE.  Relies on `pfile', `result' and (through BACKUP)
   `buffer' being in scope at the point of use.  */
#define IF_NEXT_IS(CHAR, THEN_TYPE, ELSE_TYPE)  \
  do {                                          \
    if (get_effective_char (pfile) == (CHAR))   \
      result->type = (THEN_TYPE);               \
    else                                        \
      {                                         \
        BACKUP ();                              \
        result->type = (ELSE_TYPE);             \
      }                                         \
  } while (0)
943 /* Lex a token into pfile->cur_token, which is also incremented, to
944 get diagnostics pointing to the correct location.
946 Does not handle issues such as token lookahead, multiple-include
947 optimisation, directives, skipping etc. This function is only
948 suitable for use by _cpp_lex_token, and in special cases like
949 lex_expansion_token which doesn't care for any of these issues.
951 When meeting a newline, returns CPP_EOF if parsing a directive,
952 otherwise returns to the start of the token buffer if permissible.
953 Returns the location of the lexed token. */
954 cpp_token *
955 _cpp_lex_direct (pfile)
956 cpp_reader *pfile;
958 cppchar_t c;
959 cpp_buffer *buffer;
960 const unsigned char *comment_start;
961 cpp_token *result = pfile->cur_token++;
963 fresh_line:
964 buffer = pfile->buffer;
965 result->flags = buffer->saved_flags;
966 buffer->saved_flags = 0;
967 update_tokens_line:
968 result->line = pfile->line;
970 skipped_white:
971 c = *buffer->cur++;
972 result->col = CPP_BUF_COLUMN (buffer, buffer->cur);
974 trigraph:
975 switch (c)
977 case ' ': case '\t': case '\f': case '\v': case '\0':
978 result->flags |= PREV_WHITE;
979 if (skip_whitespace (pfile, c))
980 goto skipped_white;
982 /* End of buffer. */
983 buffer->cur--;
984 if (continue_after_nul (pfile))
985 goto fresh_line;
986 result->type = CPP_EOF;
987 break;
989 case '\n': case '\r':
990 handle_newline (pfile);
991 buffer->saved_flags = BOL;
992 if (! pfile->state.in_directive)
994 if (pfile->state.parsing_args == 2)
995 buffer->saved_flags |= PREV_WHITE;
996 if (!pfile->keep_tokens)
998 pfile->cur_run = &pfile->base_run;
999 result = pfile->base_run.base;
1000 pfile->cur_token = result + 1;
1002 goto fresh_line;
1004 result->type = CPP_EOF;
1005 break;
1007 case '?':
1008 case '\\':
1009 /* These could start an escaped newline, or '?' a trigraph. Let
1010 skip_escaped_newlines do all the work. */
1012 unsigned int line = pfile->line;
1014 c = skip_escaped_newlines (pfile);
1015 if (line != pfile->line)
1017 buffer->cur--;
1018 /* We had at least one escaped newline of some sort.
1019 Update the token's line and column. */
1020 goto update_tokens_line;
1024 /* We are either the original '?' or '\\', or a trigraph. */
1025 if (c == '?')
1026 result->type = CPP_QUERY;
1027 else if (c == '\\')
1028 goto random_char;
1029 else
1030 goto trigraph;
1031 break;
1033 case '0': case '1': case '2': case '3': case '4':
1034 case '5': case '6': case '7': case '8': case '9':
1035 result->type = CPP_NUMBER;
1036 parse_number (pfile, &result->val.str, 0);
1037 break;
1039 case 'L':
1040 /* 'L' may introduce wide characters or strings. */
1042 const unsigned char *pos = buffer->cur;
1044 c = get_effective_char (pfile);
1045 if (c == '\'' || c == '"')
1047 result->type = (c == '"' ? CPP_WSTRING: CPP_WCHAR);
1048 parse_string (pfile, result, c);
1049 break;
1051 buffer->cur = pos;
1053 /* Fall through. */
1055 start_ident:
1056 case '_':
1057 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
1058 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
1059 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
1060 case 's': case 't': case 'u': case 'v': case 'w': case 'x':
1061 case 'y': case 'z':
1062 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
1063 case 'G': case 'H': case 'I': case 'J': case 'K':
1064 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
1065 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
1066 case 'Y': case 'Z':
1067 result->type = CPP_NAME;
1068 result->val.node = parse_identifier (pfile);
1070 /* Convert named operators to their proper types. */
1071 if (result->val.node->flags & NODE_OPERATOR)
1073 result->flags |= NAMED_OP;
1074 result->type = result->val.node->value.operator;
1076 break;
1078 case '\'':
1079 case '"':
1080 result->type = c == '"' ? CPP_STRING: CPP_CHAR;
1081 parse_string (pfile, result, c);
1082 break;
1084 case '/':
1085 /* A potential block or line comment. */
1086 comment_start = buffer->cur;
1087 c = get_effective_char (pfile);
1089 if (c == '*')
1091 if (skip_block_comment (pfile))
1092 cpp_error (pfile, DL_ERROR, "unterminated comment");
1094 else if (c == '/' && (CPP_OPTION (pfile, cplusplus_comments)
1095 || CPP_IN_SYSTEM_HEADER (pfile)))
1097 /* Warn about comments only if pedantically GNUC89, and not
1098 in system headers. */
1099 if (CPP_OPTION (pfile, lang) == CLK_GNUC89 && CPP_PEDANTIC (pfile)
1100 && ! buffer->warned_cplusplus_comments)
1102 cpp_error (pfile, DL_PEDWARN,
1103 "C++ style comments are not allowed in ISO C90");
1104 cpp_error (pfile, DL_PEDWARN,
1105 "(this will be reported only once per input file)");
1106 buffer->warned_cplusplus_comments = 1;
1109 if (skip_line_comment (pfile) && CPP_OPTION (pfile, warn_comments))
1110 cpp_error (pfile, DL_WARNING, "multi-line comment");
1112 else if (c == '=')
1114 result->type = CPP_DIV_EQ;
1115 break;
1117 else
1119 BACKUP ();
1120 result->type = CPP_DIV;
1121 break;
1124 if (!pfile->state.save_comments)
1126 result->flags |= PREV_WHITE;
1127 goto update_tokens_line;
1130 /* Save the comment as a token in its own right. */
1131 save_comment (pfile, result, comment_start, c);
1132 break;
1134 case '<':
1135 if (pfile->state.angled_headers)
1137 result->type = CPP_HEADER_NAME;
1138 parse_string (pfile, result, '>');
1139 break;
1142 c = get_effective_char (pfile);
1143 if (c == '=')
1144 result->type = CPP_LESS_EQ;
1145 else if (c == '<')
1146 IF_NEXT_IS ('=', CPP_LSHIFT_EQ, CPP_LSHIFT);
1147 else if (c == '?' && CPP_OPTION (pfile, cplusplus))
1148 IF_NEXT_IS ('=', CPP_MIN_EQ, CPP_MIN);
1149 else if (c == ':' && CPP_OPTION (pfile, digraphs))
1151 result->type = CPP_OPEN_SQUARE;
1152 result->flags |= DIGRAPH;
1154 else if (c == '%' && CPP_OPTION (pfile, digraphs))
1156 result->type = CPP_OPEN_BRACE;
1157 result->flags |= DIGRAPH;
1159 else
1161 BACKUP ();
1162 result->type = CPP_LESS;
1164 break;
1166 case '>':
1167 c = get_effective_char (pfile);
1168 if (c == '=')
1169 result->type = CPP_GREATER_EQ;
1170 else if (c == '>')
1171 IF_NEXT_IS ('=', CPP_RSHIFT_EQ, CPP_RSHIFT);
1172 else if (c == '?' && CPP_OPTION (pfile, cplusplus))
1173 IF_NEXT_IS ('=', CPP_MAX_EQ, CPP_MAX);
1174 else
1176 BACKUP ();
1177 result->type = CPP_GREATER;
1179 break;
1181 case '%':
1182 c = get_effective_char (pfile);
1183 if (c == '=')
1184 result->type = CPP_MOD_EQ;
1185 else if (CPP_OPTION (pfile, digraphs) && c == ':')
1187 result->flags |= DIGRAPH;
1188 result->type = CPP_HASH;
1189 if (get_effective_char (pfile) == '%')
1191 const unsigned char *pos = buffer->cur;
1193 if (get_effective_char (pfile) == ':')
1194 result->type = CPP_PASTE;
1195 else
1196 buffer->cur = pos - 1;
1198 else
1199 BACKUP ();
1201 else if (CPP_OPTION (pfile, digraphs) && c == '>')
1203 result->flags |= DIGRAPH;
1204 result->type = CPP_CLOSE_BRACE;
1206 else
1208 BACKUP ();
1209 result->type = CPP_MOD;
1211 break;
1213 case '.':
1214 result->type = CPP_DOT;
1215 c = get_effective_char (pfile);
1216 if (c == '.')
1218 const unsigned char *pos = buffer->cur;
1220 if (get_effective_char (pfile) == '.')
1221 result->type = CPP_ELLIPSIS;
1222 else
1223 buffer->cur = pos - 1;
1225 /* All known character sets have 0...9 contiguous. */
1226 else if (ISDIGIT (c))
1228 result->type = CPP_NUMBER;
1229 parse_number (pfile, &result->val.str, 1);
1231 else if (c == '*' && CPP_OPTION (pfile, cplusplus))
1232 result->type = CPP_DOT_STAR;
1233 else
1234 BACKUP ();
1235 break;
1237 case '+':
1238 c = get_effective_char (pfile);
1239 if (c == '+')
1240 result->type = CPP_PLUS_PLUS;
1241 else if (c == '=')
1242 result->type = CPP_PLUS_EQ;
1243 else
1245 BACKUP ();
1246 result->type = CPP_PLUS;
1248 break;
1250 case '-':
1251 c = get_effective_char (pfile);
1252 if (c == '>')
1254 result->type = CPP_DEREF;
1255 if (CPP_OPTION (pfile, cplusplus))
1257 if (get_effective_char (pfile) == '*')
1258 result->type = CPP_DEREF_STAR;
1259 else
1260 BACKUP ();
1263 else if (c == '-')
1264 result->type = CPP_MINUS_MINUS;
1265 else if (c == '=')
1266 result->type = CPP_MINUS_EQ;
1267 else
1269 BACKUP ();
1270 result->type = CPP_MINUS;
1272 break;
1274 case '&':
1275 c = get_effective_char (pfile);
1276 if (c == '&')
1277 result->type = CPP_AND_AND;
1278 else if (c == '=')
1279 result->type = CPP_AND_EQ;
1280 else
1282 BACKUP ();
1283 result->type = CPP_AND;
1285 break;
1287 case '|':
1288 c = get_effective_char (pfile);
1289 if (c == '|')
1290 result->type = CPP_OR_OR;
1291 else if (c == '=')
1292 result->type = CPP_OR_EQ;
1293 else
1295 BACKUP ();
1296 result->type = CPP_OR;
1298 break;
1300 case ':':
1301 c = get_effective_char (pfile);
1302 if (c == ':' && CPP_OPTION (pfile, cplusplus))
1303 result->type = CPP_SCOPE;
1304 else if (c == '>' && CPP_OPTION (pfile, digraphs))
1306 result->flags |= DIGRAPH;
1307 result->type = CPP_CLOSE_SQUARE;
1309 else
1311 BACKUP ();
1312 result->type = CPP_COLON;
1314 break;
1316 case '*': IF_NEXT_IS ('=', CPP_MULT_EQ, CPP_MULT); break;
1317 case '=': IF_NEXT_IS ('=', CPP_EQ_EQ, CPP_EQ); break;
1318 case '!': IF_NEXT_IS ('=', CPP_NOT_EQ, CPP_NOT); break;
1319 case '^': IF_NEXT_IS ('=', CPP_XOR_EQ, CPP_XOR); break;
1320 case '#': IF_NEXT_IS ('#', CPP_PASTE, CPP_HASH); break;
1322 case '~': result->type = CPP_COMPL; break;
1323 case ',': result->type = CPP_COMMA; break;
1324 case '(': result->type = CPP_OPEN_PAREN; break;
1325 case ')': result->type = CPP_CLOSE_PAREN; break;
1326 case '[': result->type = CPP_OPEN_SQUARE; break;
1327 case ']': result->type = CPP_CLOSE_SQUARE; break;
1328 case '{': result->type = CPP_OPEN_BRACE; break;
1329 case '}': result->type = CPP_CLOSE_BRACE; break;
1330 case ';': result->type = CPP_SEMICOLON; break;
1332 /* @ is a punctuator in Objective C. */
1333 case '@': result->type = CPP_ATSIGN; break;
1335 case '$':
1336 if (CPP_OPTION (pfile, dollars_in_ident))
1337 goto start_ident;
1338 /* Fall through... */
1340 random_char:
1341 default:
1342 result->type = CPP_OTHER;
1343 result->val.c = c;
1344 break;
1347 return result;
1350 /* An upper bound on the number of bytes needed to spell TOKEN,
1351 including preceding whitespace. */
1352 unsigned int
1353 cpp_token_len (token)
1354 const cpp_token *token;
1356 unsigned int len;
1358 switch (TOKEN_SPELL (token))
1360 default: len = 0; break;
1361 case SPELL_NUMBER:
1362 case SPELL_STRING: len = token->val.str.len; break;
1363 case SPELL_IDENT: len = NODE_LEN (token->val.node); break;
1365 /* 1 for whitespace, 4 for comment delimiters. */
1366 return len + 5;
1369 /* Write the spelling of a token TOKEN to BUFFER. The buffer must
1370 already contain the enough space to hold the token's spelling.
1371 Returns a pointer to the character after the last character
1372 written. */
1373 unsigned char *
1374 cpp_spell_token (pfile, token, buffer)
1375 cpp_reader *pfile; /* Would be nice to be rid of this... */
1376 const cpp_token *token;
1377 unsigned char *buffer;
1379 switch (TOKEN_SPELL (token))
1381 case SPELL_OPERATOR:
1383 const unsigned char *spelling;
1384 unsigned char c;
1386 if (token->flags & DIGRAPH)
1387 spelling
1388 = digraph_spellings[(int) token->type - (int) CPP_FIRST_DIGRAPH];
1389 else if (token->flags & NAMED_OP)
1390 goto spell_ident;
1391 else
1392 spelling = TOKEN_NAME (token);
1394 while ((c = *spelling++) != '\0')
1395 *buffer++ = c;
1397 break;
1399 case SPELL_CHAR:
1400 *buffer++ = token->val.c;
1401 break;
1403 spell_ident:
1404 case SPELL_IDENT:
1405 memcpy (buffer, NODE_NAME (token->val.node), NODE_LEN (token->val.node));
1406 buffer += NODE_LEN (token->val.node);
1407 break;
1409 case SPELL_NUMBER:
1410 memcpy (buffer, token->val.str.text, token->val.str.len);
1411 buffer += token->val.str.len;
1412 break;
1414 case SPELL_STRING:
1416 int left, right, tag;
1417 switch (token->type)
1419 case CPP_STRING: left = '"'; right = '"'; tag = '\0'; break;
1420 case CPP_WSTRING: left = '"'; right = '"'; tag = 'L'; break;
1421 case CPP_CHAR: left = '\''; right = '\''; tag = '\0'; break;
1422 case CPP_WCHAR: left = '\''; right = '\''; tag = 'L'; break;
1423 case CPP_HEADER_NAME: left = '<'; right = '>'; tag = '\0'; break;
1424 default:
1425 cpp_error (pfile, DL_ICE, "unknown string token %s\n",
1426 TOKEN_NAME (token));
1427 return buffer;
1429 if (tag) *buffer++ = tag;
1430 *buffer++ = left;
1431 memcpy (buffer, token->val.str.text, token->val.str.len);
1432 buffer += token->val.str.len;
1433 *buffer++ = right;
1435 break;
1437 case SPELL_NONE:
1438 cpp_error (pfile, DL_ICE, "unspellable token %s", TOKEN_NAME (token));
1439 break;
1442 return buffer;
1445 /* Returns TOKEN spelt as a null-terminated string. The string is
1446 freed when the reader is destroyed. Useful for diagnostics. */
1447 unsigned char *
1448 cpp_token_as_text (pfile, token)
1449 cpp_reader *pfile;
1450 const cpp_token *token;
1452 unsigned int len = cpp_token_len (token);
1453 unsigned char *start = _cpp_unaligned_alloc (pfile, len), *end;
1455 end = cpp_spell_token (pfile, token, start);
1456 end[0] = '\0';
1458 return start;
1461 /* Used by C front ends, which really should move to using
1462 cpp_token_as_text. */
1463 const char *
1464 cpp_type2name (type)
1465 enum cpp_ttype type;
1467 return (const char *) token_spellings[type].name;
1470 /* Writes the spelling of token to FP, without any preceding space.
1471 Separated from cpp_spell_token for efficiency - to avoid stdio
1472 double-buffering. */
1473 void
1474 cpp_output_token (token, fp)
1475 const cpp_token *token;
1476 FILE *fp;
1478 switch (TOKEN_SPELL (token))
1480 case SPELL_OPERATOR:
1482 const unsigned char *spelling;
1483 int c;
1485 if (token->flags & DIGRAPH)
1486 spelling
1487 = digraph_spellings[(int) token->type - (int) CPP_FIRST_DIGRAPH];
1488 else if (token->flags & NAMED_OP)
1489 goto spell_ident;
1490 else
1491 spelling = TOKEN_NAME (token);
1493 c = *spelling;
1495 putc (c, fp);
1496 while ((c = *++spelling) != '\0');
1498 break;
1500 case SPELL_CHAR:
1501 putc (token->val.c, fp);
1502 break;
1504 spell_ident:
1505 case SPELL_IDENT:
1506 fwrite (NODE_NAME (token->val.node), 1, NODE_LEN (token->val.node), fp);
1507 break;
1509 case SPELL_NUMBER:
1510 fwrite (token->val.str.text, 1, token->val.str.len, fp);
1511 break;
1513 case SPELL_STRING:
1515 int left, right, tag;
1516 switch (token->type)
1518 case CPP_STRING: left = '"'; right = '"'; tag = '\0'; break;
1519 case CPP_WSTRING: left = '"'; right = '"'; tag = 'L'; break;
1520 case CPP_CHAR: left = '\''; right = '\''; tag = '\0'; break;
1521 case CPP_WCHAR: left = '\''; right = '\''; tag = 'L'; break;
1522 case CPP_HEADER_NAME: left = '<'; right = '>'; tag = '\0'; break;
1523 default:
1524 fprintf (stderr, "impossible STRING token %s\n", TOKEN_NAME (token));
1525 return;
1527 if (tag) putc (tag, fp);
1528 putc (left, fp);
1529 fwrite (token->val.str.text, 1, token->val.str.len, fp);
1530 putc (right, fp);
1532 break;
1534 case SPELL_NONE:
1535 /* An error, most probably. */
1536 break;
1540 /* Compare two tokens. */
1542 _cpp_equiv_tokens (a, b)
1543 const cpp_token *a, *b;
1545 if (a->type == b->type && a->flags == b->flags)
1546 switch (TOKEN_SPELL (a))
1548 default: /* Keep compiler happy. */
1549 case SPELL_OPERATOR:
1550 return 1;
1551 case SPELL_CHAR:
1552 return a->val.c == b->val.c; /* Character. */
1553 case SPELL_NONE:
1554 return (a->type != CPP_MACRO_ARG || a->val.arg_no == b->val.arg_no);
1555 case SPELL_IDENT:
1556 return a->val.node == b->val.node;
1557 case SPELL_NUMBER:
1558 case SPELL_STRING:
1559 return (a->val.str.len == b->val.str.len
1560 && !memcmp (a->val.str.text, b->val.str.text,
1561 a->val.str.len));
1564 return 0;
1567 /* Returns nonzero if a space should be inserted to avoid an
1568 accidental token paste for output. For simplicity, it is
1569 conservative, and occasionally advises a space where one is not
1570 needed, e.g. "." and ".2". */
1572 cpp_avoid_paste (pfile, token1, token2)
1573 cpp_reader *pfile;
1574 const cpp_token *token1, *token2;
1576 enum cpp_ttype a = token1->type, b = token2->type;
1577 cppchar_t c;
1579 if (token1->flags & NAMED_OP)
1580 a = CPP_NAME;
1581 if (token2->flags & NAMED_OP)
1582 b = CPP_NAME;
1584 c = EOF;
1585 if (token2->flags & DIGRAPH)
1586 c = digraph_spellings[(int) b - (int) CPP_FIRST_DIGRAPH][0];
1587 else if (token_spellings[b].category == SPELL_OPERATOR)
1588 c = token_spellings[b].name[0];
1590 /* Quickly get everything that can paste with an '='. */
1591 if ((int) a <= (int) CPP_LAST_EQ && c == '=')
1592 return 1;
1594 switch (a)
1596 case CPP_GREATER: return c == '>' || c == '?';
1597 case CPP_LESS: return c == '<' || c == '?' || c == '%' || c == ':';
1598 case CPP_PLUS: return c == '+';
1599 case CPP_MINUS: return c == '-' || c == '>';
1600 case CPP_DIV: return c == '/' || c == '*'; /* Comments. */
1601 case CPP_MOD: return c == ':' || c == '>';
1602 case CPP_AND: return c == '&';
1603 case CPP_OR: return c == '|';
1604 case CPP_COLON: return c == ':' || c == '>';
1605 case CPP_DEREF: return c == '*';
1606 case CPP_DOT: return c == '.' || c == '%' || b == CPP_NUMBER;
1607 case CPP_HASH: return c == '#' || c == '%'; /* Digraph form. */
1608 case CPP_NAME: return ((b == CPP_NUMBER
1609 && name_p (pfile, &token2->val.str))
1610 || b == CPP_NAME
1611 || b == CPP_CHAR || b == CPP_STRING); /* L */
1612 case CPP_NUMBER: return (b == CPP_NUMBER || b == CPP_NAME
1613 || c == '.' || c == '+' || c == '-');
1614 case CPP_OTHER: return (CPP_OPTION (pfile, objc)
1615 && token1->val.c == '@'
1616 && (b == CPP_NAME || b == CPP_STRING));
1617 default: break;
1620 return 0;
1623 /* Output all the remaining tokens on the current line, and a newline
1624 character, to FP. Leading whitespace is removed. If there are
1625 macros, special token padding is not performed. */
1626 void
1627 cpp_output_line (pfile, fp)
1628 cpp_reader *pfile;
1629 FILE *fp;
1631 const cpp_token *token;
1633 token = cpp_get_token (pfile);
1634 while (token->type != CPP_EOF)
1636 cpp_output_token (token, fp);
1637 token = cpp_get_token (pfile);
1638 if (token->flags & PREV_WHITE)
1639 putc (' ', fp);
1642 putc ('\n', fp);
/* Return the numeric value of the hexadecimal digit C.  Aborts if C
   is not a hexadecimal digit.  */
static unsigned int
hex_digit_value (c)
     unsigned int c;
{
  if (!hex_p (c))
    abort ();

  return hex_value (c);
}
1656 /* Parse a '\uNNNN' or '\UNNNNNNNN' sequence. Returns 1 to indicate
1657 failure if cpplib is not parsing C++ or C99. Such failure is
1658 silent, and no variables are updated. Otherwise returns 0, and
1659 warns if -Wtraditional.
1661 [lex.charset]: The character designated by the universal character
1662 name \UNNNNNNNN is that character whose character short name in
1663 ISO/IEC 10646 is NNNNNNNN; the character designated by the
1664 universal character name \uNNNN is that character whose character
1665 short name in ISO/IEC 10646 is 0000NNNN. If the hexadecimal value
1666 for a universal character name is less than 0x20 or in the range
1667 0x7F-0x9F (inclusive), or if the universal character name
1668 designates a character in the basic source character set, then the
1669 program is ill-formed.
1671 We assume that wchar_t is Unicode, so we don't need to do any
1672 mapping. Is this ever wrong?
1674 PC points to the 'u' or 'U', PSTR is points to the byte after PC,
1675 LIMIT is the end of the string or charconst. PSTR is updated to
1676 point after the UCS on return, and the UCS is written into PC. */
1678 static int
1679 maybe_read_ucs (pfile, pstr, limit, pc)
1680 cpp_reader *pfile;
1681 const unsigned char **pstr;
1682 const unsigned char *limit;
1683 cppchar_t *pc;
1685 const unsigned char *p = *pstr;
1686 unsigned int code = 0;
1687 unsigned int c = *pc, length;
1689 /* Only attempt to interpret a UCS for C++ and C99. */
1690 if (! (CPP_OPTION (pfile, cplusplus) || CPP_OPTION (pfile, c99)))
1691 return 1;
1693 if (CPP_WTRADITIONAL (pfile))
1694 cpp_error (pfile, DL_WARNING,
1695 "the meaning of '\\%c' is different in traditional C", c);
1697 length = (c == 'u' ? 4: 8);
1699 if ((size_t) (limit - p) < length)
1701 cpp_error (pfile, DL_ERROR, "incomplete universal-character-name");
1702 /* Skip to the end to avoid more diagnostics. */
1703 p = limit;
1705 else
1707 for (; length; length--, p++)
1709 c = *p;
1710 if (ISXDIGIT (c))
1711 code = (code << 4) + hex_digit_value (c);
1712 else
1714 cpp_error (pfile, DL_ERROR,
1715 "non-hex digit '%c' in universal-character-name", c);
1716 /* We shouldn't skip in case there are multibyte chars. */
1717 break;
1722 #ifdef TARGET_EBCDIC
1723 cpp_error (pfile, DL_ERROR, "universal-character-name on EBCDIC target");
1724 code = 0x3f; /* EBCDIC invalid character */
1725 #else
1726 /* True extended characters are OK. */
1727 if (code >= 0xa0
1728 && !(code & 0x80000000)
1729 && !(code >= 0xD800 && code <= 0xDFFF))
1731 /* The standard permits $, @ and ` to be specified as UCNs. We use
1732 hex escapes so that this also works with EBCDIC hosts. */
1733 else if (code == 0x24 || code == 0x40 || code == 0x60)
1735 /* Don't give another error if one occurred above. */
1736 else if (length == 0)
1737 cpp_error (pfile, DL_ERROR, "universal-character-name out of range");
1738 #endif
1740 *pstr = p;
1741 *pc = code;
1742 return 0;
1745 /* Returns the value of an escape sequence, truncated to the correct
   target precision.  PSTR points to the input pointer, which is just
   after the backslash.  LIMIT is how much text we have.  WIDE is true
   if the escape sequence is part of a wide character constant or
   string literal.  Handles all relevant diagnostics.

   On return *PSTR has been advanced past the text consumed.  The
   precision is the wchar_precision option when WIDE is nonzero and
   the char_precision option otherwise; a value that does not fit is
   diagnosed and masked down to that many bits.  */
1750 cppchar_t
1751 cpp_parse_escape (pfile, pstr, limit, wide)
1752 cpp_reader *pfile;
1753 const unsigned char **pstr;
1754 const unsigned char *limit;
1755 int wide;
/* UNKNOWN is set for escapes that get the "unknown escape sequence"
   diagnostic after the switch.  */
1757 int unknown = 0;
1758 const unsigned char *str = *pstr;
1759 cppchar_t c, mask;
1760 unsigned int width;
/* Choose the precision, then build a mask of that many low bits.  A
   precision as wide as cppchar_t gets an all-ones mask, because the
   shift below is only done for narrower widths.  */
1762 if (wide)
1763 width = CPP_OPTION (pfile, wchar_precision);
1764 else
1765 width = CPP_OPTION (pfile, char_precision);
1766 if (width < BITS_PER_CPPCHAR_T)
1767 mask = ((cppchar_t) 1 << width) - 1;
1768 else
1769 mask = ~0;
/* C is the character after the backslash; the switch either leaves it
   alone or replaces it with the value of the escape.  */
1771 c = *str++;
1772 switch (c)
/* These escapes stand for the character itself.  */
1774 case '\\': case '\'': case '"': case '?': break;
1775 case 'b': c = TARGET_BS; break;
1776 case 'f': c = TARGET_FF; break;
1777 case 'n': c = TARGET_NEWLINE; break;
1778 case 'r': c = TARGET_CR; break;
1779 case 't': c = TARGET_TAB; break;
1780 case 'v': c = TARGET_VT; break;
1782 case '(': case '{': case '[': case '%':
1783 /* '\(', etc, are used at beginning of line to avoid confusing Emacs.
   '\%' is used to prevent SCCS from getting confused.
   They are only reported as unknown when pedantic.  */
1785 unknown = CPP_PEDANTIC (pfile);
1786 break;
1788 case 'a':
1789 if (CPP_WTRADITIONAL (pfile))
1790 cpp_error (pfile, DL_WARNING,
1791 "the meaning of '\\a' is different in traditional C");
1792 c = TARGET_BELL;
1793 break;
1795 case 'e': case 'E':
1796 if (CPP_PEDANTIC (pfile))
1797 cpp_error (pfile, DL_PEDWARN,
1798 "non-ISO-standard escape sequence, '\\%c'", (int) c);
1799 c = TARGET_ESC;
1800 break;
/* maybe_read_ucs returns nonzero when it declines to interpret the
   sequence; the escape is then reported as unknown below.  */
1802 case 'u': case 'U':
1803 unknown = maybe_read_ucs (pfile, &str, limit, &c);
1804 break;
1806 case 'x':
1807 if (CPP_WTRADITIONAL (pfile))
1808 cpp_error (pfile, DL_WARNING,
1809 "the meaning of '\\x' is different in traditional C");
/* I accumulates the value; OVERFLOW becomes nonzero once any bits
   have been lost from it.  */
1812 cppchar_t i = 0, overflow = 0;
1813 int digits_found = 0;
/* Consume hex digits until LIMIT or the first non-hex character.  */
1815 while (str < limit)
1817 c = *str;
1818 if (! ISXDIGIT (c))
1819 break;
1820 str++;
/* Record any bits of I that the four-bit shift on the next line would
   discard (assumes cppchar_t is unsigned -- TODO confirm).  */
1821 overflow |= i ^ (i << 4 >> 4);
1822 i = (i << 4) + hex_digit_value (c);
1823 digits_found = 1;
1826 if (!digits_found)
1827 cpp_error (pfile, DL_ERROR,
1828 "\\x used with no following hex digits");
/* Out of range if bits were lost while accumulating, or if the value
   does not fit in the target precision.  */
1830 if (overflow | (i != (i & mask)))
1832 cpp_error (pfile, DL_PEDWARN,
1833 "hex escape sequence out of range");
1834 i &= mask;
1836 c = i;
1838 break;
1840 case '0': case '1': case '2': case '3':
1841 case '4': case '5': case '6': case '7':
/* The first octal digit is already in C, so I starts from it.  */
1843 size_t count = 0;
1844 cppchar_t i = c - '0';
/* The pre-increment test lets at most two further digits be read,
   for three octal digits in total.  */
1846 while (str < limit && ++count < 3)
1848 c = *str;
1849 if (c < '0' || c > '7')
1850 break;
1851 str++;
1852 i = (i << 3) + c - '0';
1855 if (i != (i & mask))
1857 cpp_error (pfile, DL_PEDWARN,
1858 "octal escape sequence out of range");
1859 i &= mask;
1861 c = i;
1863 break;
1865 default:
1866 unknown = 1;
1867 break;
/* Printable characters are shown literally in the diagnostic, others
   as a three-digit octal code.  */
1870 if (unknown)
1872 if (ISGRAPH (c))
1873 cpp_error (pfile, DL_PEDWARN,
1874 "unknown escape sequence '\\%c'", (int) c);
1875 else
1876 cpp_error (pfile, DL_PEDWARN,
1877 "unknown escape sequence: '\\%03o'", (int) c);
/* Final range check for values that were not already masked in the
   hex and octal cases above.  */
1880 if (c > mask)
1882 cpp_error (pfile, DL_PEDWARN, "escape sequence out of range for its type");
1883 c &= mask;
/* Tell the caller how much input was consumed.  */
1886 *pstr = str;
1887 return c;
1890 /* Interpret a (possibly wide) character constant in TOKEN.
1891 WARN_MULTI warns about multi-character charconsts. PCHARS_SEEN
1892 points to a variable that is filled in with the number of
1893 characters seen, and UNSIGNEDP to a variable that indicates whether
1894 the result has signed type. */
1895 cppchar_t
1896 cpp_interpret_charconst (pfile, token, pchars_seen, unsignedp)
1897 cpp_reader *pfile;
1898 const cpp_token *token;
1899 unsigned int *pchars_seen;
1900 int *unsignedp;
1902 const unsigned char *str = token->val.str.text;
1903 const unsigned char *limit = str + token->val.str.len;
1904 unsigned int chars_seen = 0;
1905 size_t width, max_chars;
1906 cppchar_t c, mask, result = 0;
1907 bool unsigned_p;
1909 #ifdef MULTIBYTE_CHARS
1910 (void) local_mbtowc (NULL, NULL, 0);
1911 #endif
1913 /* Width in bits. */
1914 if (token->type == CPP_CHAR)
1916 width = CPP_OPTION (pfile, char_precision);
1917 max_chars = CPP_OPTION (pfile, int_precision) / width;
1918 unsigned_p = CPP_OPTION (pfile, unsigned_char);
1920 else
1922 width = CPP_OPTION (pfile, wchar_precision);
1923 max_chars = 1;
1924 unsigned_p = CPP_OPTION (pfile, unsigned_wchar);
1927 if (width < BITS_PER_CPPCHAR_T)
1928 mask = ((cppchar_t) 1 << width) - 1;
1929 else
1930 mask = ~0;
1932 while (str < limit)
1934 #ifdef MULTIBYTE_CHARS
1935 wchar_t wc;
1936 int char_len;
1938 char_len = local_mbtowc (&wc, str, limit - str);
1939 if (char_len == -1)
1941 cpp_error (pfile, DL_WARNING,
1942 "ignoring invalid multibyte character");
1943 c = *str++;
1945 else
1947 str += char_len;
1948 c = wc;
1950 #else
1951 c = *str++;
1952 #endif
1954 if (c == '\\')
1955 c = cpp_parse_escape (pfile, &str, limit, token->type == CPP_WCHAR);
1957 #ifdef MAP_CHARACTER
1958 if (ISPRINT (c))
1959 c = MAP_CHARACTER (c);
1960 #endif
1962 chars_seen++;
1964 /* Truncate the character, scale the result and merge the two. */
1965 c &= mask;
1966 if (width < BITS_PER_CPPCHAR_T)
1967 result = (result << width) | c;
1968 else
1969 result = c;
1972 if (chars_seen == 0)
1973 cpp_error (pfile, DL_ERROR, "empty character constant");
1974 else if (chars_seen > 1)
1976 /* Multichar charconsts are of type int and therefore signed. */
1977 unsigned_p = 0;
1979 if (chars_seen > max_chars)
1981 chars_seen = max_chars;
1982 cpp_error (pfile, DL_WARNING,
1983 "character constant too long for its type");
1985 else if (CPP_OPTION (pfile, warn_multichar))
1986 cpp_error (pfile, DL_WARNING, "multi-character character constant");
1989 /* Sign-extend or truncate the constant to cppchar_t. The value is
1990 in WIDTH bits, but for multi-char charconsts it's value is the
1991 full target type's width. */
1992 if (chars_seen > 1)
1993 width *= max_chars;
1994 if (width < BITS_PER_CPPCHAR_T)
1996 mask = ((cppchar_t) 1 << width) - 1;
1997 if (unsigned_p || !(result & (1 << (width - 1))))
1998 result &= mask;
1999 else
2000 result |= ~mask;
2003 *pchars_seen = chars_seen;
2004 *unsignedp = unsigned_p;
2005 return result;
/* Memory buffers.  Changing these three constants can have a dramatic
   effect on performance.  The values here are reasonable defaults,
   but might be tuned.  If you adjust them, be sure to test across a
   range of uses of cpplib, including heavy nested function-like macro
   expansion.  Also check the change in peak memory usage (NJAMD is a
   good tool for this).

   MIN_BUFF_SIZE is the smallest buffer ever created.
   BUFF_SIZE_UPPER_BOUND (MIN_SIZE) is the largest free buffer that
   will be handed out for a request of MIN_SIZE bytes.
   EXTENDED_BUFF_SIZE (BUFF, MIN_EXTRA) is the size requested when
   extending BUFF: MIN_EXTRA plus twice the room left in BUFF.  */
#define MIN_BUFF_SIZE 8000
#define BUFF_SIZE_UPPER_BOUND(MIN_SIZE) (MIN_BUFF_SIZE + (MIN_SIZE) * 3 / 2)
/* MIN_EXTRA is parenthesized so that an argument containing a
   lower-precedence operator still expands as a single operand.  */
#define EXTENDED_BUFF_SIZE(BUFF, MIN_EXTRA) \
        ((MIN_EXTRA) + ((BUFF)->limit - (BUFF)->cur) * 2)

#if MIN_BUFF_SIZE > BUFF_SIZE_UPPER_BOUND (0)
#error BUFF_SIZE_UPPER_BOUND must be at least as large as MIN_BUFF_SIZE!
#endif
2023 /* Create a new allocation buffer. Place the control block at the end
2024 of the buffer, so that buffer overflows will cause immediate chaos. */
2025 static _cpp_buff *
2026 new_buff (len)
2027 size_t len;
2029 _cpp_buff *result;
2030 unsigned char *base;
2032 if (len < MIN_BUFF_SIZE)
2033 len = MIN_BUFF_SIZE;
2034 len = CPP_ALIGN (len);
2036 base = xmalloc (len + sizeof (_cpp_buff));
2037 result = (_cpp_buff *) (base + len);
2038 result->base = base;
2039 result->cur = base;
2040 result->limit = base + len;
2041 result->next = NULL;
2042 return result;
2045 /* Place a chain of unwanted allocation buffers on the free list. */
2046 void
2047 _cpp_release_buff (pfile, buff)
2048 cpp_reader *pfile;
2049 _cpp_buff *buff;
2051 _cpp_buff *end = buff;
2053 while (end->next)
2054 end = end->next;
2055 end->next = pfile->free_buffs;
2056 pfile->free_buffs = buff;
2059 /* Return a free buffer of size at least MIN_SIZE. */
2060 _cpp_buff *
2061 _cpp_get_buff (pfile, min_size)
2062 cpp_reader *pfile;
2063 size_t min_size;
2065 _cpp_buff *result, **p;
2067 for (p = &pfile->free_buffs;; p = &(*p)->next)
2069 size_t size;
2071 if (*p == NULL)
2072 return new_buff (min_size);
2073 result = *p;
2074 size = result->limit - result->base;
2075 /* Return a buffer that's big enough, but don't waste one that's
2076 way too big. */
2077 if (size >= min_size && size <= BUFF_SIZE_UPPER_BOUND (min_size))
2078 break;
2081 *p = result->next;
2082 result->next = NULL;
2083 result->cur = result->base;
2084 return result;
2087 /* Creates a new buffer with enough space to hold the uncommitted
2088 remaining bytes of BUFF, and at least MIN_EXTRA more bytes. Copies
2089 the excess bytes to the new buffer. Chains the new buffer after
2090 BUFF, and returns the new buffer. */
2091 _cpp_buff *
2092 _cpp_append_extend_buff (pfile, buff, min_extra)
2093 cpp_reader *pfile;
2094 _cpp_buff *buff;
2095 size_t min_extra;
2097 size_t size = EXTENDED_BUFF_SIZE (buff, min_extra);
2098 _cpp_buff *new_buff = _cpp_get_buff (pfile, size);
2100 buff->next = new_buff;
2101 memcpy (new_buff->base, buff->cur, BUFF_ROOM (buff));
2102 return new_buff;
2105 /* Creates a new buffer with enough space to hold the uncommitted
2106 remaining bytes of the buffer pointed to by BUFF, and at least
2107 MIN_EXTRA more bytes. Copies the excess bytes to the new buffer.
2108 Chains the new buffer before the buffer pointed to by BUFF, and
2109 updates the pointer to point to the new buffer. */
2110 void
2111 _cpp_extend_buff (pfile, pbuff, min_extra)
2112 cpp_reader *pfile;
2113 _cpp_buff **pbuff;
2114 size_t min_extra;
2116 _cpp_buff *new_buff, *old_buff = *pbuff;
2117 size_t size = EXTENDED_BUFF_SIZE (old_buff, min_extra);
2119 new_buff = _cpp_get_buff (pfile, size);
2120 memcpy (new_buff->base, old_buff->cur, BUFF_ROOM (old_buff));
2121 new_buff->next = old_buff;
2122 *pbuff = new_buff;
2125 /* Free a chain of buffers starting at BUFF. */
2126 void
2127 _cpp_free_buff (buff)
2128 _cpp_buff *buff;
2130 _cpp_buff *next;
2132 for (; buff; buff = next)
2134 next = buff->next;
2135 free (buff->base);
2139 /* Allocate permanent, unaligned storage of length LEN. */
2140 unsigned char *
2141 _cpp_unaligned_alloc (pfile, len)
2142 cpp_reader *pfile;
2143 size_t len;
2145 _cpp_buff *buff = pfile->u_buff;
2146 unsigned char *result = buff->cur;
2148 if (len > (size_t) (buff->limit - result))
2150 buff = _cpp_get_buff (pfile, len);
2151 buff->next = pfile->u_buff;
2152 pfile->u_buff = buff;
2153 result = buff->cur;
2156 buff->cur = result + len;
2157 return result;
2160 /* Allocate permanent, unaligned storage of length LEN from a_buff.
2161 That buffer is used for growing allocations when saving macro
2162 replacement lists in a #define, and when parsing an answer to an
2163 assertion in #assert, #unassert or #if (and therefore possibly
2164 whilst expanding macros). It therefore must not be used by any
2165 code that they might call: specifically the lexer and the guts of
2166 the macro expander.
2168 All existing other uses clearly fit this restriction: storing
2169 registered pragmas during initialization. */
2170 unsigned char *
2171 _cpp_aligned_alloc (pfile, len)
2172 cpp_reader *pfile;
2173 size_t len;
2175 _cpp_buff *buff = pfile->a_buff;
2176 unsigned char *result = buff->cur;
2178 if (len > (size_t) (buff->limit - result))
2180 buff = _cpp_get_buff (pfile, len);
2181 buff->next = pfile->a_buff;
2182 pfile->a_buff = buff;
2183 result = buff->cur;
2186 buff->cur = result + len;
2187 return result;