cpplex.c (cpp_interpret_charconst): Sign-extend each character.
[official-gcc.git] / gcc / cpplex.c
blobd3268985697b42bdfe6a94b1ea2f7ca1b81c3829
1 /* CPP Library - lexical analysis.
2 Copyright (C) 2000, 2001, 2002 Free Software Foundation, Inc.
3 Contributed by Per Bothner, 1994-95.
4 Based on CCCP program by Paul Rubin, June 1986
5 Adapted to ANSI C, Richard Stallman, Jan 1987
6 Broken out to separate file, Zack Weinberg, Mar 2000
7 Single-pass line tokenization by Neil Booth, April 2000
9 This program is free software; you can redistribute it and/or modify it
10 under the terms of the GNU General Public License as published by the
11 Free Software Foundation; either version 2, or (at your option) any
12 later version.
14 This program is distributed in the hope that it will be useful,
15 but WITHOUT ANY WARRANTY; without even the implied warranty of
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 GNU General Public License for more details.
19 You should have received a copy of the GNU General Public License
20 along with this program; if not, write to the Free Software
21 Foundation, 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
23 #include "config.h"
24 #include "system.h"
25 #include "cpplib.h"
26 #include "cpphash.h"
28 #ifdef MULTIBYTE_CHARS
29 #include "mbchar.h"
30 #include <locale.h>
31 #endif
/* Spelling categories recorded for each token type in the
   token_spellings table.  Tokens in the SPELL_STRING category store
   their spelling in the token list, and its length in
   token->val.name.len.  */
enum spell_type
{
  SPELL_OPERATOR = 0,
  SPELL_CHAR,
  SPELL_IDENT,
  SPELL_NUMBER,
  SPELL_STRING,
  SPELL_NONE
};
45 struct token_spelling
47 enum spell_type category;
48 const unsigned char *name;
/* Spellings of the digraph forms, kept as a constant table of
   constant strings.  */
static const unsigned char *const digraph_spellings[] =
{
  U"%:",
  U"%:%:",
  U"<:",
  U":>",
  U"<%",
  U"%>"
};
54 #define OP(e, s) { SPELL_OPERATOR, U s },
55 #define TK(e, s) { s, U STRINGX (e) },
56 static const struct token_spelling token_spellings[N_TTYPES] = { TTYPE_TABLE };
57 #undef OP
58 #undef TK
/* Look up the spelling category / fixed text of TOKEN's type.  */
#define TOKEN_SPELL(token) (token_spellings[(token)->type].category)
#define TOKEN_NAME(token) (token_spellings[(token)->type].name)

/* Rewind the read pointer to the recorded backup position.  Relies
   on a variable named `buffer' being in scope at the point of use.  */
#define BACKUP() do { buffer->cur = buffer->backup_to; } while (0)
64 static void handle_newline PARAMS ((cpp_reader *));
65 static cppchar_t skip_escaped_newlines PARAMS ((cpp_reader *));
66 static cppchar_t get_effective_char PARAMS ((cpp_reader *));
68 static int skip_block_comment PARAMS ((cpp_reader *));
69 static int skip_line_comment PARAMS ((cpp_reader *));
70 static void adjust_column PARAMS ((cpp_reader *));
71 static int skip_whitespace PARAMS ((cpp_reader *, cppchar_t));
72 static cpp_hashnode *parse_identifier PARAMS ((cpp_reader *));
73 static uchar *parse_slow PARAMS ((cpp_reader *, const uchar *, int,
74 unsigned int *));
75 static void parse_number PARAMS ((cpp_reader *, cpp_string *, int));
76 static int unescaped_terminator_p PARAMS ((cpp_reader *, const uchar *));
77 static void parse_string PARAMS ((cpp_reader *, cpp_token *, cppchar_t));
78 static bool trigraph_p PARAMS ((cpp_reader *));
79 static void save_comment PARAMS ((cpp_reader *, cpp_token *, const uchar *,
80 cppchar_t));
81 static int name_p PARAMS ((cpp_reader *, const cpp_string *));
82 static int maybe_read_ucs PARAMS ((cpp_reader *, const unsigned char **,
83 const unsigned char *, cppchar_t *));
84 static tokenrun *next_tokenrun PARAMS ((tokenrun *));
86 static unsigned int hex_digit_value PARAMS ((unsigned int));
87 static _cpp_buff *new_buff PARAMS ((size_t));
89 /* Utility routine:
91 Compares, the token TOKEN to the NUL-terminated string STRING.
92 TOKEN must be a CPP_NAME. Returns 1 for equal, 0 for unequal. */
93 int
94 cpp_ideq (token, string)
95 const cpp_token *token;
96 const char *string;
98 if (token->type != CPP_NAME)
99 return 0;
101 return !ustrcmp (NODE_NAME (token->val.node), (const uchar *) string);
104 /* Call when meeting a newline, assumed to be in buffer->cur[-1].
105 Returns with buffer->cur pointing to the character immediately
106 following the newline (combination). */
107 static void
108 handle_newline (pfile)
109 cpp_reader *pfile;
111 cpp_buffer *buffer = pfile->buffer;
113 /* Handle CR-LF and LF-CR. Most other implementations (e.g. java)
114 only accept CR-LF; maybe we should fall back to that behaviour? */
115 if (buffer->cur[-1] + buffer->cur[0] == '\r' + '\n')
116 buffer->cur++;
118 buffer->line_base = buffer->cur;
119 buffer->col_adjust = 0;
120 pfile->line++;
123 /* Subroutine of skip_escaped_newlines; called when a 3-character
124 sequence beginning with "??" is encountered. buffer->cur points to
125 the second '?'.
127 Warn if necessary, and returns true if the sequence forms a
128 trigraph and the trigraph should be honoured. */
129 static bool
130 trigraph_p (pfile)
131 cpp_reader *pfile;
133 cpp_buffer *buffer = pfile->buffer;
134 cppchar_t from_char = buffer->cur[1];
135 bool accept;
137 if (!_cpp_trigraph_map[from_char])
138 return false;
140 accept = CPP_OPTION (pfile, trigraphs);
142 /* Don't warn about trigraphs in comments. */
143 if (CPP_OPTION (pfile, warn_trigraphs) && !pfile->state.lexing_comment)
145 if (accept)
146 cpp_error_with_line (pfile, DL_WARNING,
147 pfile->line, CPP_BUF_COL (buffer) - 1,
148 "trigraph ??%c converted to %c",
149 (int) from_char,
150 (int) _cpp_trigraph_map[from_char]);
151 else if (buffer->cur != buffer->last_Wtrigraphs)
153 buffer->last_Wtrigraphs = buffer->cur;
154 cpp_error_with_line (pfile, DL_WARNING,
155 pfile->line, CPP_BUF_COL (buffer) - 1,
156 "trigraph ??%c ignored", (int) from_char);
160 return accept;
163 /* Skips any escaped newlines introduced by '?' or a '\\', assumed to
164 lie in buffer->cur[-1]. Returns the next byte, which will be in
165 buffer->cur[-1]. This routine performs preprocessing stages 1 and
166 2 of the ISO C standard. */
167 static cppchar_t
168 skip_escaped_newlines (pfile)
169 cpp_reader *pfile;
171 cpp_buffer *buffer = pfile->buffer;
172 cppchar_t next = buffer->cur[-1];
174 /* Only do this if we apply stages 1 and 2. */
175 if (!buffer->from_stage3)
177 const unsigned char *saved_cur;
178 cppchar_t next1;
182 if (next == '?')
184 if (buffer->cur[0] != '?' || !trigraph_p (pfile))
185 break;
187 /* Translate the trigraph. */
188 next = _cpp_trigraph_map[buffer->cur[1]];
189 buffer->cur += 2;
190 if (next != '\\')
191 break;
194 if (buffer->cur == buffer->rlimit)
195 break;
197 /* We have a backslash, and room for at least one more
198 character. Skip horizontal whitespace. */
199 saved_cur = buffer->cur;
201 next1 = *buffer->cur++;
202 while (is_nvspace (next1) && buffer->cur < buffer->rlimit);
204 if (!is_vspace (next1))
206 buffer->cur = saved_cur;
207 break;
210 if (saved_cur != buffer->cur - 1
211 && !pfile->state.lexing_comment)
212 cpp_error (pfile, DL_WARNING,
213 "backslash and newline separated by space");
215 handle_newline (pfile);
216 buffer->backup_to = buffer->cur;
217 if (buffer->cur == buffer->rlimit)
219 cpp_error (pfile, DL_PEDWARN,
220 "backslash-newline at end of file");
221 next = EOF;
223 else
224 next = *buffer->cur++;
226 while (next == '\\' || next == '?');
229 return next;
232 /* Obtain the next character, after trigraph conversion and skipping
233 an arbitrarily long string of escaped newlines. The common case of
234 no trigraphs or escaped newlines falls through quickly. On return,
235 buffer->backup_to points to where to return to if the character is
236 not to be processed. */
237 static cppchar_t
238 get_effective_char (pfile)
239 cpp_reader *pfile;
241 cppchar_t next;
242 cpp_buffer *buffer = pfile->buffer;
244 buffer->backup_to = buffer->cur;
245 next = *buffer->cur++;
246 if (__builtin_expect (next == '?' || next == '\\', 0))
247 next = skip_escaped_newlines (pfile);
249 return next;
252 /* Skip a C-style block comment. We find the end of the comment by
253 seeing if an asterisk is before every '/' we encounter. Returns
254 non-zero if comment terminated by EOF, zero otherwise. */
255 static int
256 skip_block_comment (pfile)
257 cpp_reader *pfile;
259 cpp_buffer *buffer = pfile->buffer;
260 cppchar_t c = EOF, prevc = EOF;
262 pfile->state.lexing_comment = 1;
263 while (buffer->cur != buffer->rlimit)
265 prevc = c, c = *buffer->cur++;
267 /* FIXME: For speed, create a new character class of characters
268 of interest inside block comments. */
269 if (c == '?' || c == '\\')
270 c = skip_escaped_newlines (pfile);
272 /* People like decorating comments with '*', so check for '/'
273 instead for efficiency. */
274 if (c == '/')
276 if (prevc == '*')
277 break;
279 /* Warn about potential nested comments, but not if the '/'
280 comes immediately before the true comment delimiter.
281 Don't bother to get it right across escaped newlines. */
282 if (CPP_OPTION (pfile, warn_comments)
283 && buffer->cur[0] == '*' && buffer->cur[1] != '/')
284 cpp_error_with_line (pfile, DL_WARNING,
285 pfile->line, CPP_BUF_COL (buffer),
286 "\"/*\" within comment");
288 else if (is_vspace (c))
289 handle_newline (pfile);
290 else if (c == '\t')
291 adjust_column (pfile);
294 pfile->state.lexing_comment = 0;
295 return c != '/' || prevc != '*';
298 /* Skip a C++ line comment, leaving buffer->cur pointing to the
299 terminating newline. Handles escaped newlines. Returns non-zero
300 if a multiline comment. */
301 static int
302 skip_line_comment (pfile)
303 cpp_reader *pfile;
305 cpp_buffer *buffer = pfile->buffer;
306 unsigned int orig_line = pfile->line;
307 cppchar_t c;
308 #ifdef MULTIBYTE_CHARS
309 wchar_t wc;
310 int char_len;
311 #endif
313 pfile->state.lexing_comment = 1;
314 #ifdef MULTIBYTE_CHARS
315 /* Reset multibyte conversion state. */
316 (void) local_mbtowc (NULL, NULL, 0);
317 #endif
320 if (buffer->cur == buffer->rlimit)
321 goto at_eof;
323 #ifdef MULTIBYTE_CHARS
324 char_len = local_mbtowc (&wc, (const char *) buffer->cur,
325 buffer->rlimit - buffer->cur);
326 if (char_len == -1)
328 cpp_error (pfile, DL_WARNING,
329 "ignoring invalid multibyte character");
330 char_len = 1;
331 c = *buffer->cur++;
333 else
335 buffer->cur += char_len;
336 c = wc;
338 #else
339 c = *buffer->cur++;
340 #endif
341 if (c == '?' || c == '\\')
342 c = skip_escaped_newlines (pfile);
344 while (!is_vspace (c));
346 /* Step back over the newline, except at EOF. */
347 buffer->cur--;
348 at_eof:
350 pfile->state.lexing_comment = 0;
351 return orig_line != pfile->line;
354 /* pfile->buffer->cur is one beyond the \t character. Update
355 col_adjust so we track the column correctly. */
356 static void
357 adjust_column (pfile)
358 cpp_reader *pfile;
360 cpp_buffer *buffer = pfile->buffer;
361 unsigned int col = CPP_BUF_COL (buffer) - 1; /* Zero-based column. */
363 /* Round it up to multiple of the tabstop, but subtract 1 since the
364 tab itself occupies a character position. */
365 buffer->col_adjust += (CPP_OPTION (pfile, tabstop)
366 - col % CPP_OPTION (pfile, tabstop)) - 1;
369 /* Skips whitespace, saving the next non-whitespace character.
370 Adjusts pfile->col_adjust to account for tabs. Without this,
371 tokens might be assigned an incorrect column. */
372 static int
373 skip_whitespace (pfile, c)
374 cpp_reader *pfile;
375 cppchar_t c;
377 cpp_buffer *buffer = pfile->buffer;
378 unsigned int warned = 0;
382 /* Horizontal space always OK. */
383 if (c == ' ')
385 else if (c == '\t')
386 adjust_column (pfile);
387 /* Just \f \v or \0 left. */
388 else if (c == '\0')
390 if (buffer->cur - 1 == buffer->rlimit)
391 return 0;
392 if (!warned)
394 cpp_error (pfile, DL_WARNING, "null character(s) ignored");
395 warned = 1;
398 else if (pfile->state.in_directive && CPP_PEDANTIC (pfile))
399 cpp_error_with_line (pfile, DL_PEDWARN, pfile->line,
400 CPP_BUF_COL (buffer),
401 "%s in preprocessing directive",
402 c == '\f' ? "form feed" : "vertical tab");
404 c = *buffer->cur++;
406 /* We only want non-vertical space, i.e. ' ' \t \f \v \0. */
407 while (is_nvspace (c));
409 buffer->cur--;
410 return 1;
413 /* See if the characters of a number token are valid in a name (no
414 '.', '+' or '-'). */
415 static int
416 name_p (pfile, string)
417 cpp_reader *pfile;
418 const cpp_string *string;
420 unsigned int i;
422 for (i = 0; i < string->len; i++)
423 if (!is_idchar (string->text[i]))
424 return 0;
426 return 1;
429 /* Parse an identifier, skipping embedded backslash-newlines. This is
430 a critical inner loop. The common case is an identifier which has
431 not been split by backslash-newline, does not contain a dollar
432 sign, and has already been scanned (roughly 10:1 ratio of
433 seen:unseen identifiers in normal code; the distribution is
434 Poisson-like). Second most common case is a new identifier, not
435 split and no dollar sign. The other possibilities are rare and
436 have been relegated to parse_slow. */
437 static cpp_hashnode *
438 parse_identifier (pfile)
439 cpp_reader *pfile;
441 cpp_hashnode *result;
442 const uchar *cur, *base;
444 /* Fast-path loop. Skim over a normal identifier.
445 N.B. ISIDNUM does not include $. */
446 cur = pfile->buffer->cur;
447 while (ISIDNUM (*cur))
448 cur++;
450 /* Check for slow-path cases. */
451 if (*cur == '?' || *cur == '\\' || *cur == '$')
453 unsigned int len;
455 base = parse_slow (pfile, cur, 0, &len);
456 result = (cpp_hashnode *)
457 ht_lookup (pfile->hash_table, base, len, HT_ALLOCED);
459 else
461 base = pfile->buffer->cur - 1;
462 pfile->buffer->cur = cur;
463 result = (cpp_hashnode *)
464 ht_lookup (pfile->hash_table, base, cur - base, HT_ALLOC);
467 /* Rarely, identifiers require diagnostics when lexed.
468 XXX Has to be forced out of the fast path. */
469 if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
470 && !pfile->state.skipping, 0))
472 /* It is allowed to poison the same identifier twice. */
473 if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
474 cpp_error (pfile, DL_ERROR, "attempt to use poisoned \"%s\"",
475 NODE_NAME (result));
477 /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
478 replacement list of a variadic macro. */
479 if (result == pfile->spec_nodes.n__VA_ARGS__
480 && !pfile->state.va_args_ok)
481 cpp_error (pfile, DL_PEDWARN,
482 "__VA_ARGS__ can only appear in the expansion of a C99 variadic macro");
485 return result;
488 /* Slow path. This handles numbers and identifiers which have been
489 split, or contain dollar signs. The part of the token from
490 PFILE->buffer->cur-1 to CUR has already been scanned. NUMBER_P is
491 1 if it's a number, and 2 if it has a leading period. Returns a
492 pointer to the token's NUL-terminated spelling in permanent
493 storage, and sets PLEN to its length. */
494 static uchar *
495 parse_slow (pfile, cur, number_p, plen)
496 cpp_reader *pfile;
497 const uchar *cur;
498 int number_p;
499 unsigned int *plen;
501 cpp_buffer *buffer = pfile->buffer;
502 const uchar *base = buffer->cur - 1;
503 struct obstack *stack = &pfile->hash_table->stack;
504 unsigned int c, prevc, saw_dollar = 0;
506 /* Place any leading period. */
507 if (number_p == 2)
508 obstack_1grow (stack, '.');
510 /* Copy the part of the token which is known to be okay. */
511 obstack_grow (stack, base, cur - base);
513 /* Now process the part which isn't. We are looking at one of
514 '$', '\\', or '?' on entry to this loop. */
515 prevc = cur[-1];
516 c = *cur++;
517 buffer->cur = cur;
518 for (;;)
520 /* Potential escaped newline? */
521 buffer->backup_to = buffer->cur - 1;
522 if (c == '?' || c == '\\')
523 c = skip_escaped_newlines (pfile);
525 if (!is_idchar (c))
527 if (!number_p)
528 break;
529 if (c != '.' && !VALID_SIGN (c, prevc))
530 break;
533 /* Handle normal identifier characters in this loop. */
536 prevc = c;
537 obstack_1grow (stack, c);
539 if (c == '$')
540 saw_dollar++;
542 c = *buffer->cur++;
544 while (is_idchar (c));
547 /* Step back over the unwanted char. */
548 BACKUP ();
550 /* $ is not an identifier character in the standard, but is commonly
551 accepted as an extension. Don't warn about it in skipped
552 conditional blocks. */
553 if (saw_dollar && CPP_PEDANTIC (pfile) && ! pfile->state.skipping)
554 cpp_error (pfile, DL_PEDWARN, "'$' character(s) in identifier or number");
556 /* Identifiers and numbers are null-terminated. */
557 *plen = obstack_object_size (stack);
558 obstack_1grow (stack, '\0');
559 return obstack_finish (stack);
562 /* Parse a number, beginning with character C, skipping embedded
563 backslash-newlines. LEADING_PERIOD is non-zero if there was a "."
564 before C. Place the result in NUMBER. */
565 static void
566 parse_number (pfile, number, leading_period)
567 cpp_reader *pfile;
568 cpp_string *number;
569 int leading_period;
571 const uchar *cur;
573 /* Fast-path loop. Skim over a normal number.
574 N.B. ISIDNUM does not include $. */
575 cur = pfile->buffer->cur;
576 while (ISIDNUM (*cur) || *cur == '.' || VALID_SIGN (*cur, cur[-1]))
577 cur++;
579 /* Check for slow-path cases. */
580 if (*cur == '?' || *cur == '\\' || *cur == '$')
581 number->text = parse_slow (pfile, cur, 1 + leading_period, &number->len);
582 else
584 const uchar *base = pfile->buffer->cur - 1;
585 uchar *dest;
587 number->len = cur - base + leading_period;
588 dest = _cpp_unaligned_alloc (pfile, number->len + 1);
589 dest[number->len] = '\0';
590 number->text = dest;
592 if (leading_period)
593 *dest++ = '.';
594 memcpy (dest, base, cur - base);
595 pfile->buffer->cur = cur;
599 /* Subroutine of parse_string. */
600 static int
601 unescaped_terminator_p (pfile, dest)
602 cpp_reader *pfile;
603 const unsigned char *dest;
605 const unsigned char *start, *temp;
607 /* In #include-style directives, terminators are not escapeable. */
608 if (pfile->state.angled_headers)
609 return 1;
611 start = BUFF_FRONT (pfile->u_buff);
613 /* An odd number of consecutive backslashes represents an escaped
614 terminator. */
615 for (temp = dest; temp > start && temp[-1] == '\\'; temp--)
618 return ((dest - temp) & 1) == 0;
621 /* Parses a string, character constant, or angle-bracketed header file
622 name. Handles embedded trigraphs and escaped newlines. The stored
623 string is guaranteed NUL-terminated, but it is not guaranteed that
624 this is the first NUL since embedded NULs are preserved.
626 When this function returns, buffer->cur points to the next
627 character to be processed. */
628 static void
629 parse_string (pfile, token, terminator)
630 cpp_reader *pfile;
631 cpp_token *token;
632 cppchar_t terminator;
634 cpp_buffer *buffer = pfile->buffer;
635 unsigned char *dest, *limit;
636 cppchar_t c;
637 bool warned_nulls = false;
638 #ifdef MULTIBYTE_CHARS
639 wchar_t wc;
640 int char_len;
641 #endif
643 dest = BUFF_FRONT (pfile->u_buff);
644 limit = BUFF_LIMIT (pfile->u_buff);
646 #ifdef MULTIBYTE_CHARS
647 /* Reset multibyte conversion state. */
648 (void) local_mbtowc (NULL, NULL, 0);
649 #endif
650 for (;;)
652 /* We need room for another char, possibly the terminating NUL. */
653 if ((size_t) (limit - dest) < 1)
655 size_t len_so_far = dest - BUFF_FRONT (pfile->u_buff);
656 _cpp_extend_buff (pfile, &pfile->u_buff, 2);
657 dest = BUFF_FRONT (pfile->u_buff) + len_so_far;
658 limit = BUFF_LIMIT (pfile->u_buff);
661 #ifdef MULTIBYTE_CHARS
662 char_len = local_mbtowc (&wc, (const char *) buffer->cur,
663 buffer->rlimit - buffer->cur);
664 if (char_len == -1)
666 cpp_error (pfile, DL_WARNING,
667 "ignoring invalid multibyte character");
668 char_len = 1;
669 c = *buffer->cur++;
671 else
673 buffer->cur += char_len;
674 c = wc;
676 #else
677 c = *buffer->cur++;
678 #endif
680 /* Handle trigraphs, escaped newlines etc. */
681 if (c == '?' || c == '\\')
682 c = skip_escaped_newlines (pfile);
684 if (c == terminator)
686 if (unescaped_terminator_p (pfile, dest))
687 break;
689 else if (is_vspace (c))
691 /* No string literal may extend over multiple lines. In
692 assembly language, suppress the error except for <>
693 includes. This is a kludge around not knowing where
694 comments are. */
695 unterminated:
696 if (CPP_OPTION (pfile, lang) != CLK_ASM || terminator == '>')
697 cpp_error (pfile, DL_ERROR, "missing terminating %c character",
698 (int) terminator);
699 buffer->cur--;
700 break;
702 else if (c == '\0')
704 if (buffer->cur - 1 == buffer->rlimit)
705 goto unterminated;
706 if (!warned_nulls)
708 warned_nulls = true;
709 cpp_error (pfile, DL_WARNING,
710 "null character(s) preserved in literal");
713 #ifdef MULTIBYTE_CHARS
714 if (char_len > 1)
716 for ( ; char_len > 0; --char_len)
717 *dest++ = (*buffer->cur - char_len);
719 else
720 #endif
721 *dest++ = c;
724 *dest = '\0';
726 token->val.str.text = BUFF_FRONT (pfile->u_buff);
727 token->val.str.len = dest - BUFF_FRONT (pfile->u_buff);
728 BUFF_FRONT (pfile->u_buff) = dest + 1;
731 /* The stored comment includes the comment start and any terminator. */
732 static void
733 save_comment (pfile, token, from, type)
734 cpp_reader *pfile;
735 cpp_token *token;
736 const unsigned char *from;
737 cppchar_t type;
739 unsigned char *buffer;
740 unsigned int len, clen;
742 len = pfile->buffer->cur - from + 1; /* + 1 for the initial '/'. */
744 /* C++ comments probably (not definitely) have moved past a new
745 line, which we don't want to save in the comment. */
746 if (is_vspace (pfile->buffer->cur[-1]))
747 len--;
749 /* If we are currently in a directive, then we need to store all
750 C++ comments as C comments internally, and so we need to
751 allocate a little extra space in that case.
753 Note that the only time we encounter a directive here is
754 when we are saving comments in a "#define". */
755 clen = (pfile->state.in_directive && type == '/') ? len + 2 : len;
757 buffer = _cpp_unaligned_alloc (pfile, clen);
759 token->type = CPP_COMMENT;
760 token->val.str.len = clen;
761 token->val.str.text = buffer;
763 buffer[0] = '/';
764 memcpy (buffer + 1, from, len - 1);
766 /* Finish conversion to a C comment, if necessary. */
767 if (pfile->state.in_directive && type == '/')
769 buffer[1] = '*';
770 buffer[clen - 2] = '*';
771 buffer[clen - 1] = '/';
775 /* Allocate COUNT tokens for RUN. */
776 void
777 _cpp_init_tokenrun (run, count)
778 tokenrun *run;
779 unsigned int count;
781 run->base = xnewvec (cpp_token, count);
782 run->limit = run->base + count;
783 run->next = NULL;
786 /* Returns the next tokenrun, or creates one if there is none. */
787 static tokenrun *
788 next_tokenrun (run)
789 tokenrun *run;
791 if (run->next == NULL)
793 run->next = xnew (tokenrun);
794 run->next->prev = run;
795 _cpp_init_tokenrun (run->next, 250);
798 return run->next;
801 /* Allocate a single token that is invalidated at the same time as the
802 rest of the tokens on the line. Has its line and col set to the
803 same as the last lexed token, so that diagnostics appear in the
804 right place. */
805 cpp_token *
806 _cpp_temp_token (pfile)
807 cpp_reader *pfile;
809 cpp_token *old, *result;
811 old = pfile->cur_token - 1;
812 if (pfile->cur_token == pfile->cur_run->limit)
814 pfile->cur_run = next_tokenrun (pfile->cur_run);
815 pfile->cur_token = pfile->cur_run->base;
818 result = pfile->cur_token++;
819 result->line = old->line;
820 result->col = old->col;
821 return result;
824 /* Lex a token into RESULT (external interface). Takes care of issues
825 like directive handling, token lookahead, multiple include
826 optimization and skipping. */
827 const cpp_token *
828 _cpp_lex_token (pfile)
829 cpp_reader *pfile;
831 cpp_token *result;
833 for (;;)
835 if (pfile->cur_token == pfile->cur_run->limit)
837 pfile->cur_run = next_tokenrun (pfile->cur_run);
838 pfile->cur_token = pfile->cur_run->base;
841 if (pfile->lookaheads)
843 pfile->lookaheads--;
844 result = pfile->cur_token++;
846 else
847 result = _cpp_lex_direct (pfile);
849 if (result->flags & BOL)
851 /* Is this a directive. If _cpp_handle_directive returns
852 false, it is an assembler #. */
853 if (result->type == CPP_HASH
854 /* 6.10.3 p 11: Directives in a list of macro arguments
855 gives undefined behavior. This implementation
856 handles the directive as normal. */
857 && pfile->state.parsing_args != 1
858 && _cpp_handle_directive (pfile, result->flags & PREV_WHITE))
859 continue;
860 if (pfile->cb.line_change && !pfile->state.skipping)
861 (*pfile->cb.line_change)(pfile, result, pfile->state.parsing_args);
864 /* We don't skip tokens in directives. */
865 if (pfile->state.in_directive)
866 break;
868 /* Outside a directive, invalidate controlling macros. At file
869 EOF, _cpp_lex_direct takes care of popping the buffer, so we never
870 get here and MI optimisation works. */
871 pfile->mi_valid = false;
873 if (!pfile->state.skipping || result->type == CPP_EOF)
874 break;
877 return result;
/* Peek at the next effective character.  If it is CHAR, consume it
   and give the token THEN_TYPE; otherwise back up over it and give
   the token ELSE_TYPE.  Expects `pfile', `result' and (through
   BACKUP) `buffer' to be in scope at the point of use.  */
#define IF_NEXT_IS(CHAR, THEN_TYPE, ELSE_TYPE)          \
  do                                                    \
    {                                                   \
      if (get_effective_char (pfile) != (CHAR))         \
        {                                               \
          BACKUP ();                                    \
          result->type = (ELSE_TYPE);                   \
        }                                               \
      else                                              \
        result->type = (THEN_TYPE);                     \
    }                                                   \
  while (0)
891 /* Lex a token into pfile->cur_token, which is also incremented, to
892 get diagnostics pointing to the correct location.
894 Does not handle issues such as token lookahead, multiple-include
895 optimisation, directives, skipping etc. This function is only
896 suitable for use by _cpp_lex_token, and in special cases like
897 lex_expansion_token which doesn't care for any of these issues.
899 When meeting a newline, returns CPP_EOF if parsing a directive,
900 otherwise returns to the start of the token buffer if permissible.
901 Returns the location of the lexed token. */
902 cpp_token *
903 _cpp_lex_direct (pfile)
904 cpp_reader *pfile;
906 cppchar_t c;
907 cpp_buffer *buffer;
908 const unsigned char *comment_start;
909 cpp_token *result = pfile->cur_token++;
911 fresh_line:
912 buffer = pfile->buffer;
913 result->flags = buffer->saved_flags;
914 buffer->saved_flags = 0;
915 update_tokens_line:
916 result->line = pfile->line;
918 skipped_white:
919 c = *buffer->cur++;
920 result->col = CPP_BUF_COLUMN (buffer, buffer->cur);
922 trigraph:
923 switch (c)
925 case ' ': case '\t': case '\f': case '\v': case '\0':
926 result->flags |= PREV_WHITE;
927 if (skip_whitespace (pfile, c))
928 goto skipped_white;
930 /* EOF. */
931 buffer->cur--;
932 buffer->saved_flags = BOL;
933 if (!pfile->state.parsing_args && !pfile->state.in_directive)
935 if (buffer->cur != buffer->line_base)
937 /* Non-empty files should end in a newline. Don't warn
938 for command line and _Pragma buffers. */
939 if (!buffer->from_stage3)
940 cpp_error (pfile, DL_PEDWARN, "no newline at end of file");
941 handle_newline (pfile);
944 /* Don't pop the last buffer. */
945 if (buffer->prev)
947 unsigned char stop = buffer->return_at_eof;
949 _cpp_pop_buffer (pfile);
950 if (!stop)
951 goto fresh_line;
954 result->type = CPP_EOF;
955 break;
957 case '\n': case '\r':
958 handle_newline (pfile);
959 buffer->saved_flags = BOL;
960 if (! pfile->state.in_directive)
962 if (pfile->state.parsing_args == 2)
963 buffer->saved_flags |= PREV_WHITE;
964 if (!pfile->keep_tokens)
966 pfile->cur_run = &pfile->base_run;
967 result = pfile->base_run.base;
968 pfile->cur_token = result + 1;
970 goto fresh_line;
972 result->type = CPP_EOF;
973 break;
975 case '?':
976 case '\\':
977 /* These could start an escaped newline, or '?' a trigraph. Let
978 skip_escaped_newlines do all the work. */
980 unsigned int line = pfile->line;
982 c = skip_escaped_newlines (pfile);
983 if (line != pfile->line)
985 buffer->cur--;
986 /* We had at least one escaped newline of some sort.
987 Update the token's line and column. */
988 goto update_tokens_line;
992 /* We are either the original '?' or '\\', or a trigraph. */
993 if (c == '?')
994 result->type = CPP_QUERY;
995 else if (c == '\\')
996 goto random_char;
997 else
998 goto trigraph;
999 break;
1001 case '0': case '1': case '2': case '3': case '4':
1002 case '5': case '6': case '7': case '8': case '9':
1003 result->type = CPP_NUMBER;
1004 parse_number (pfile, &result->val.str, 0);
1005 break;
1007 case 'L':
1008 /* 'L' may introduce wide characters or strings. */
1010 const unsigned char *pos = buffer->cur;
1012 c = get_effective_char (pfile);
1013 if (c == '\'' || c == '"')
1015 result->type = (c == '"' ? CPP_WSTRING: CPP_WCHAR);
1016 parse_string (pfile, result, c);
1017 break;
1019 buffer->cur = pos;
1021 /* Fall through. */
1023 start_ident:
1024 case '_':
1025 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
1026 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
1027 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
1028 case 's': case 't': case 'u': case 'v': case 'w': case 'x':
1029 case 'y': case 'z':
1030 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
1031 case 'G': case 'H': case 'I': case 'J': case 'K':
1032 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
1033 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
1034 case 'Y': case 'Z':
1035 result->type = CPP_NAME;
1036 result->val.node = parse_identifier (pfile);
1038 /* Convert named operators to their proper types. */
1039 if (result->val.node->flags & NODE_OPERATOR)
1041 result->flags |= NAMED_OP;
1042 result->type = result->val.node->value.operator;
1044 break;
1046 case '\'':
1047 case '"':
1048 result->type = c == '"' ? CPP_STRING: CPP_CHAR;
1049 parse_string (pfile, result, c);
1050 break;
1052 case '/':
1053 /* A potential block or line comment. */
1054 comment_start = buffer->cur;
1055 c = get_effective_char (pfile);
1057 if (c == '*')
1059 if (skip_block_comment (pfile))
1060 cpp_error (pfile, DL_ERROR, "unterminated comment");
1062 else if (c == '/' && (CPP_OPTION (pfile, cplusplus_comments)
1063 || CPP_IN_SYSTEM_HEADER (pfile)))
1065 /* Warn about comments only if pedantically GNUC89, and not
1066 in system headers. */
1067 if (CPP_OPTION (pfile, lang) == CLK_GNUC89 && CPP_PEDANTIC (pfile)
1068 && ! buffer->warned_cplusplus_comments)
1070 cpp_error (pfile, DL_PEDWARN,
1071 "C++ style comments are not allowed in ISO C89");
1072 cpp_error (pfile, DL_PEDWARN,
1073 "(this will be reported only once per input file)");
1074 buffer->warned_cplusplus_comments = 1;
1077 if (skip_line_comment (pfile) && CPP_OPTION (pfile, warn_comments))
1078 cpp_error (pfile, DL_WARNING, "multi-line comment");
1080 else if (c == '=')
1082 result->type = CPP_DIV_EQ;
1083 break;
1085 else
1087 BACKUP ();
1088 result->type = CPP_DIV;
1089 break;
1092 if (!pfile->state.save_comments)
1094 result->flags |= PREV_WHITE;
1095 goto update_tokens_line;
1098 /* Save the comment as a token in its own right. */
1099 save_comment (pfile, result, comment_start, c);
1100 break;
1102 case '<':
1103 if (pfile->state.angled_headers)
1105 result->type = CPP_HEADER_NAME;
1106 parse_string (pfile, result, '>');
1107 break;
1110 c = get_effective_char (pfile);
1111 if (c == '=')
1112 result->type = CPP_LESS_EQ;
1113 else if (c == '<')
1114 IF_NEXT_IS ('=', CPP_LSHIFT_EQ, CPP_LSHIFT);
1115 else if (c == '?' && CPP_OPTION (pfile, cplusplus))
1116 IF_NEXT_IS ('=', CPP_MIN_EQ, CPP_MIN);
1117 else if (c == ':' && CPP_OPTION (pfile, digraphs))
1119 result->type = CPP_OPEN_SQUARE;
1120 result->flags |= DIGRAPH;
1122 else if (c == '%' && CPP_OPTION (pfile, digraphs))
1124 result->type = CPP_OPEN_BRACE;
1125 result->flags |= DIGRAPH;
1127 else
1129 BACKUP ();
1130 result->type = CPP_LESS;
1132 break;
1134 case '>':
1135 c = get_effective_char (pfile);
1136 if (c == '=')
1137 result->type = CPP_GREATER_EQ;
1138 else if (c == '>')
1139 IF_NEXT_IS ('=', CPP_RSHIFT_EQ, CPP_RSHIFT);
1140 else if (c == '?' && CPP_OPTION (pfile, cplusplus))
1141 IF_NEXT_IS ('=', CPP_MAX_EQ, CPP_MAX);
1142 else
1144 BACKUP ();
1145 result->type = CPP_GREATER;
1147 break;
1149 case '%':
1150 c = get_effective_char (pfile);
1151 if (c == '=')
1152 result->type = CPP_MOD_EQ;
1153 else if (CPP_OPTION (pfile, digraphs) && c == ':')
1155 result->flags |= DIGRAPH;
1156 result->type = CPP_HASH;
1157 if (get_effective_char (pfile) == '%')
1159 const unsigned char *pos = buffer->cur;
1161 if (get_effective_char (pfile) == ':')
1162 result->type = CPP_PASTE;
1163 else
1164 buffer->cur = pos - 1;
1166 else
1167 BACKUP ();
1169 else if (CPP_OPTION (pfile, digraphs) && c == '>')
1171 result->flags |= DIGRAPH;
1172 result->type = CPP_CLOSE_BRACE;
1174 else
1176 BACKUP ();
1177 result->type = CPP_MOD;
1179 break;
1181 case '.':
1182 result->type = CPP_DOT;
1183 c = get_effective_char (pfile);
1184 if (c == '.')
1186 const unsigned char *pos = buffer->cur;
1188 if (get_effective_char (pfile) == '.')
1189 result->type = CPP_ELLIPSIS;
1190 else
1191 buffer->cur = pos - 1;
1193 /* All known character sets have 0...9 contiguous. */
1194 else if (ISDIGIT (c))
1196 result->type = CPP_NUMBER;
1197 parse_number (pfile, &result->val.str, 1);
1199 else if (c == '*' && CPP_OPTION (pfile, cplusplus))
1200 result->type = CPP_DOT_STAR;
1201 else
1202 BACKUP ();
1203 break;
1205 case '+':
1206 c = get_effective_char (pfile);
1207 if (c == '+')
1208 result->type = CPP_PLUS_PLUS;
1209 else if (c == '=')
1210 result->type = CPP_PLUS_EQ;
1211 else
1213 BACKUP ();
1214 result->type = CPP_PLUS;
1216 break;
1218 case '-':
1219 c = get_effective_char (pfile);
1220 if (c == '>')
1222 result->type = CPP_DEREF;
1223 if (CPP_OPTION (pfile, cplusplus))
1225 if (get_effective_char (pfile) == '*')
1226 result->type = CPP_DEREF_STAR;
1227 else
1228 BACKUP ();
1231 else if (c == '-')
1232 result->type = CPP_MINUS_MINUS;
1233 else if (c == '=')
1234 result->type = CPP_MINUS_EQ;
1235 else
1237 BACKUP ();
1238 result->type = CPP_MINUS;
1240 break;
1242 case '&':
1243 c = get_effective_char (pfile);
1244 if (c == '&')
1245 result->type = CPP_AND_AND;
1246 else if (c == '=')
1247 result->type = CPP_AND_EQ;
1248 else
1250 BACKUP ();
1251 result->type = CPP_AND;
1253 break;
1255 case '|':
1256 c = get_effective_char (pfile);
1257 if (c == '|')
1258 result->type = CPP_OR_OR;
1259 else if (c == '=')
1260 result->type = CPP_OR_EQ;
1261 else
1263 BACKUP ();
1264 result->type = CPP_OR;
1266 break;
1268 case ':':
1269 c = get_effective_char (pfile);
1270 if (c == ':' && CPP_OPTION (pfile, cplusplus))
1271 result->type = CPP_SCOPE;
1272 else if (c == '>' && CPP_OPTION (pfile, digraphs))
1274 result->flags |= DIGRAPH;
1275 result->type = CPP_CLOSE_SQUARE;
1277 else
1279 BACKUP ();
1280 result->type = CPP_COLON;
1282 break;
1284 case '*': IF_NEXT_IS ('=', CPP_MULT_EQ, CPP_MULT); break;
1285 case '=': IF_NEXT_IS ('=', CPP_EQ_EQ, CPP_EQ); break;
1286 case '!': IF_NEXT_IS ('=', CPP_NOT_EQ, CPP_NOT); break;
1287 case '^': IF_NEXT_IS ('=', CPP_XOR_EQ, CPP_XOR); break;
1288 case '#': IF_NEXT_IS ('#', CPP_PASTE, CPP_HASH); break;
1290 case '~': result->type = CPP_COMPL; break;
1291 case ',': result->type = CPP_COMMA; break;
1292 case '(': result->type = CPP_OPEN_PAREN; break;
1293 case ')': result->type = CPP_CLOSE_PAREN; break;
1294 case '[': result->type = CPP_OPEN_SQUARE; break;
1295 case ']': result->type = CPP_CLOSE_SQUARE; break;
1296 case '{': result->type = CPP_OPEN_BRACE; break;
1297 case '}': result->type = CPP_CLOSE_BRACE; break;
1298 case ';': result->type = CPP_SEMICOLON; break;
1300 /* @ is a punctuator in Objective C. */
1301 case '@': result->type = CPP_ATSIGN; break;
1303 case '$':
1304 if (CPP_OPTION (pfile, dollars_in_ident))
1305 goto start_ident;
1306 /* Fall through... */
1308 random_char:
1309 default:
1310 result->type = CPP_OTHER;
1311 result->val.c = c;
1312 break;
1315 return result;
1318 /* An upper bound on the number of bytes needed to spell TOKEN,
1319 including preceding whitespace. */
1320 unsigned int
1321 cpp_token_len (token)
1322 const cpp_token *token;
1324 unsigned int len;
1326 switch (TOKEN_SPELL (token))
1328 default: len = 0; break;
1329 case SPELL_NUMBER:
1330 case SPELL_STRING: len = token->val.str.len; break;
1331 case SPELL_IDENT: len = NODE_LEN (token->val.node); break;
1333 /* 1 for whitespace, 4 for comment delimiters. */
1334 return len + 5;
1337 /* Write the spelling of a token TOKEN to BUFFER. The buffer must
1338 already contain the enough space to hold the token's spelling.
1339 Returns a pointer to the character after the last character
1340 written. */
1341 unsigned char *
1342 cpp_spell_token (pfile, token, buffer)
1343 cpp_reader *pfile; /* Would be nice to be rid of this... */
1344 const cpp_token *token;
1345 unsigned char *buffer;
1347 switch (TOKEN_SPELL (token))
1349 case SPELL_OPERATOR:
1351 const unsigned char *spelling;
1352 unsigned char c;
1354 if (token->flags & DIGRAPH)
1355 spelling
1356 = digraph_spellings[(int) token->type - (int) CPP_FIRST_DIGRAPH];
1357 else if (token->flags & NAMED_OP)
1358 goto spell_ident;
1359 else
1360 spelling = TOKEN_NAME (token);
1362 while ((c = *spelling++) != '\0')
1363 *buffer++ = c;
1365 break;
1367 case SPELL_CHAR:
1368 *buffer++ = token->val.c;
1369 break;
1371 spell_ident:
1372 case SPELL_IDENT:
1373 memcpy (buffer, NODE_NAME (token->val.node), NODE_LEN (token->val.node));
1374 buffer += NODE_LEN (token->val.node);
1375 break;
1377 case SPELL_NUMBER:
1378 memcpy (buffer, token->val.str.text, token->val.str.len);
1379 buffer += token->val.str.len;
1380 break;
1382 case SPELL_STRING:
1384 int left, right, tag;
1385 switch (token->type)
1387 case CPP_STRING: left = '"'; right = '"'; tag = '\0'; break;
1388 case CPP_WSTRING: left = '"'; right = '"'; tag = 'L'; break;
1389 case CPP_CHAR: left = '\''; right = '\''; tag = '\0'; break;
1390 case CPP_WCHAR: left = '\''; right = '\''; tag = 'L'; break;
1391 case CPP_HEADER_NAME: left = '<'; right = '>'; tag = '\0'; break;
1392 default:
1393 cpp_error (pfile, DL_ICE, "unknown string token %s\n",
1394 TOKEN_NAME (token));
1395 return buffer;
1397 if (tag) *buffer++ = tag;
1398 *buffer++ = left;
1399 memcpy (buffer, token->val.str.text, token->val.str.len);
1400 buffer += token->val.str.len;
1401 *buffer++ = right;
1403 break;
1405 case SPELL_NONE:
1406 cpp_error (pfile, DL_ICE, "unspellable token %s", TOKEN_NAME (token));
1407 break;
1410 return buffer;
1413 /* Returns TOKEN spelt as a null-terminated string. The string is
1414 freed when the reader is destroyed. Useful for diagnostics. */
1415 unsigned char *
1416 cpp_token_as_text (pfile, token)
1417 cpp_reader *pfile;
1418 const cpp_token *token;
1420 unsigned int len = cpp_token_len (token);
1421 unsigned char *start = _cpp_unaligned_alloc (pfile, len), *end;
1423 end = cpp_spell_token (pfile, token, start);
1424 end[0] = '\0';
1426 return start;
1429 /* Used by C front ends, which really should move to using
1430 cpp_token_as_text. */
1431 const char *
1432 cpp_type2name (type)
1433 enum cpp_ttype type;
1435 return (const char *) token_spellings[type].name;
1438 /* Writes the spelling of token to FP, without any preceding space.
1439 Separated from cpp_spell_token for efficiency - to avoid stdio
1440 double-buffering. */
1441 void
1442 cpp_output_token (token, fp)
1443 const cpp_token *token;
1444 FILE *fp;
1446 switch (TOKEN_SPELL (token))
1448 case SPELL_OPERATOR:
1450 const unsigned char *spelling;
1451 int c;
1453 if (token->flags & DIGRAPH)
1454 spelling
1455 = digraph_spellings[(int) token->type - (int) CPP_FIRST_DIGRAPH];
1456 else if (token->flags & NAMED_OP)
1457 goto spell_ident;
1458 else
1459 spelling = TOKEN_NAME (token);
1461 c = *spelling;
1463 putc (c, fp);
1464 while ((c = *++spelling) != '\0');
1466 break;
1468 case SPELL_CHAR:
1469 putc (token->val.c, fp);
1470 break;
1472 spell_ident:
1473 case SPELL_IDENT:
1474 fwrite (NODE_NAME (token->val.node), 1, NODE_LEN (token->val.node), fp);
1475 break;
1477 case SPELL_NUMBER:
1478 fwrite (token->val.str.text, 1, token->val.str.len, fp);
1479 break;
1481 case SPELL_STRING:
1483 int left, right, tag;
1484 switch (token->type)
1486 case CPP_STRING: left = '"'; right = '"'; tag = '\0'; break;
1487 case CPP_WSTRING: left = '"'; right = '"'; tag = 'L'; break;
1488 case CPP_CHAR: left = '\''; right = '\''; tag = '\0'; break;
1489 case CPP_WCHAR: left = '\''; right = '\''; tag = 'L'; break;
1490 case CPP_HEADER_NAME: left = '<'; right = '>'; tag = '\0'; break;
1491 default:
1492 fprintf (stderr, "impossible STRING token %s\n", TOKEN_NAME (token));
1493 return;
1495 if (tag) putc (tag, fp);
1496 putc (left, fp);
1497 fwrite (token->val.str.text, 1, token->val.str.len, fp);
1498 putc (right, fp);
1500 break;
1502 case SPELL_NONE:
1503 /* An error, most probably. */
1504 break;
1508 /* Compare two tokens. */
1510 _cpp_equiv_tokens (a, b)
1511 const cpp_token *a, *b;
1513 if (a->type == b->type && a->flags == b->flags)
1514 switch (TOKEN_SPELL (a))
1516 default: /* Keep compiler happy. */
1517 case SPELL_OPERATOR:
1518 return 1;
1519 case SPELL_CHAR:
1520 return a->val.c == b->val.c; /* Character. */
1521 case SPELL_NONE:
1522 return (a->type != CPP_MACRO_ARG || a->val.arg_no == b->val.arg_no);
1523 case SPELL_IDENT:
1524 return a->val.node == b->val.node;
1525 case SPELL_NUMBER:
1526 case SPELL_STRING:
1527 return (a->val.str.len == b->val.str.len
1528 && !memcmp (a->val.str.text, b->val.str.text,
1529 a->val.str.len));
1532 return 0;
1535 /* Returns nonzero if a space should be inserted to avoid an
1536 accidental token paste for output. For simplicity, it is
1537 conservative, and occasionally advises a space where one is not
1538 needed, e.g. "." and ".2". */
1540 cpp_avoid_paste (pfile, token1, token2)
1541 cpp_reader *pfile;
1542 const cpp_token *token1, *token2;
1544 enum cpp_ttype a = token1->type, b = token2->type;
1545 cppchar_t c;
1547 if (token1->flags & NAMED_OP)
1548 a = CPP_NAME;
1549 if (token2->flags & NAMED_OP)
1550 b = CPP_NAME;
1552 c = EOF;
1553 if (token2->flags & DIGRAPH)
1554 c = digraph_spellings[(int) b - (int) CPP_FIRST_DIGRAPH][0];
1555 else if (token_spellings[b].category == SPELL_OPERATOR)
1556 c = token_spellings[b].name[0];
1558 /* Quickly get everything that can paste with an '='. */
1559 if ((int) a <= (int) CPP_LAST_EQ && c == '=')
1560 return 1;
1562 switch (a)
1564 case CPP_GREATER: return c == '>' || c == '?';
1565 case CPP_LESS: return c == '<' || c == '?' || c == '%' || c == ':';
1566 case CPP_PLUS: return c == '+';
1567 case CPP_MINUS: return c == '-' || c == '>';
1568 case CPP_DIV: return c == '/' || c == '*'; /* Comments. */
1569 case CPP_MOD: return c == ':' || c == '>';
1570 case CPP_AND: return c == '&';
1571 case CPP_OR: return c == '|';
1572 case CPP_COLON: return c == ':' || c == '>';
1573 case CPP_DEREF: return c == '*';
1574 case CPP_DOT: return c == '.' || c == '%' || b == CPP_NUMBER;
1575 case CPP_HASH: return c == '#' || c == '%'; /* Digraph form. */
1576 case CPP_NAME: return ((b == CPP_NUMBER
1577 && name_p (pfile, &token2->val.str))
1578 || b == CPP_NAME
1579 || b == CPP_CHAR || b == CPP_STRING); /* L */
1580 case CPP_NUMBER: return (b == CPP_NUMBER || b == CPP_NAME
1581 || c == '.' || c == '+' || c == '-');
1582 case CPP_OTHER: return (CPP_OPTION (pfile, objc)
1583 && token1->val.c == '@'
1584 && (b == CPP_NAME || b == CPP_STRING));
1585 default: break;
1588 return 0;
1591 /* Output all the remaining tokens on the current line, and a newline
1592 character, to FP. Leading whitespace is removed. If there are
1593 macros, special token padding is not performed. */
1594 void
1595 cpp_output_line (pfile, fp)
1596 cpp_reader *pfile;
1597 FILE *fp;
1599 const cpp_token *token;
1601 token = cpp_get_token (pfile);
1602 while (token->type != CPP_EOF)
1604 cpp_output_token (token, fp);
1605 token = cpp_get_token (pfile);
1606 if (token->flags & PREV_WHITE)
1607 putc (' ', fp);
1610 putc ('\n', fp);
/* Returns the numeric value of the hexadecimal digit C.  Aborts if C
   is not a hexadecimal digit.  */
static unsigned int
hex_digit_value (c)
     unsigned int c;
{
  if (!hex_p (c))
    abort ();

  return hex_value (c);
}
1624 /* Parse a '\uNNNN' or '\UNNNNNNNN' sequence. Returns 1 to indicate
1625 failure if cpplib is not parsing C++ or C99. Such failure is
1626 silent, and no variables are updated. Otherwise returns 0, and
1627 warns if -Wtraditional.
1629 [lex.charset]: The character designated by the universal character
1630 name \UNNNNNNNN is that character whose character short name in
1631 ISO/IEC 10646 is NNNNNNNN; the character designated by the
1632 universal character name \uNNNN is that character whose character
1633 short name in ISO/IEC 10646 is 0000NNNN. If the hexadecimal value
1634 for a universal character name is less than 0x20 or in the range
1635 0x7F-0x9F (inclusive), or if the universal character name
1636 designates a character in the basic source character set, then the
1637 program is ill-formed.
1639 We assume that wchar_t is Unicode, so we don't need to do any
1640 mapping. Is this ever wrong?
1642 PC points to the 'u' or 'U', PSTR is points to the byte after PC,
1643 LIMIT is the end of the string or charconst. PSTR is updated to
1644 point after the UCS on return, and the UCS is written into PC. */
1646 static int
1647 maybe_read_ucs (pfile, pstr, limit, pc)
1648 cpp_reader *pfile;
1649 const unsigned char **pstr;
1650 const unsigned char *limit;
1651 cppchar_t *pc;
1653 const unsigned char *p = *pstr;
1654 unsigned int code = 0;
1655 unsigned int c = *pc, length;
1657 /* Only attempt to interpret a UCS for C++ and C99. */
1658 if (! (CPP_OPTION (pfile, cplusplus) || CPP_OPTION (pfile, c99)))
1659 return 1;
1661 if (CPP_WTRADITIONAL (pfile))
1662 cpp_error (pfile, DL_WARNING,
1663 "the meaning of '\\%c' is different in traditional C", c);
1665 length = (c == 'u' ? 4: 8);
1667 if ((size_t) (limit - p) < length)
1669 cpp_error (pfile, DL_ERROR, "incomplete universal-character-name");
1670 /* Skip to the end to avoid more diagnostics. */
1671 p = limit;
1673 else
1675 for (; length; length--, p++)
1677 c = *p;
1678 if (ISXDIGIT (c))
1679 code = (code << 4) + hex_digit_value (c);
1680 else
1682 cpp_error (pfile, DL_ERROR,
1683 "non-hex digit '%c' in universal-character-name", c);
1684 /* We shouldn't skip in case there are multibyte chars. */
1685 break;
1690 #ifdef TARGET_EBCDIC
1691 cpp_error (pfile, DL_ERROR, "universal-character-name on EBCDIC target");
1692 code = 0x3f; /* EBCDIC invalid character */
1693 #else
1694 /* True extended characters are OK. */
1695 if (code >= 0xa0
1696 && !(code & 0x80000000)
1697 && !(code >= 0xD800 && code <= 0xDFFF))
1699 /* The standard permits $, @ and ` to be specified as UCNs. We use
1700 hex escapes so that this also works with EBCDIC hosts. */
1701 else if (code == 0x24 || code == 0x40 || code == 0x60)
1703 /* Don't give another error if one occurred above. */
1704 else if (length == 0)
1705 cpp_error (pfile, DL_ERROR, "universal-character-name out of range");
1706 #endif
1708 *pstr = p;
1709 *pc = code;
1710 return 0;
1713 /* Returns the value of an escape sequence, truncated to the correct
1714 target precision. PSTR points to the input pointer, which is just
1715 after the backslash. LIMIT is how much text we have. WIDE is true
1716 if the escape sequence is part of a wide character constant or
1717 string literal. Handles all relevant diagnostics. */
1718 cppchar_t
1719 cpp_parse_escape (pfile, pstr, limit, wide)
1720 cpp_reader *pfile;
1721 const unsigned char **pstr;
1722 const unsigned char *limit;
1723 int wide;
1725 int unknown = 0;
1726 const unsigned char *str = *pstr;
1727 cppchar_t c, mask;
1728 unsigned int width;
1730 if (wide)
1731 width = CPP_OPTION (pfile, wchar_precision);
1732 else
1733 width = CPP_OPTION (pfile, char_precision);
1734 if (width < BITS_PER_CPPCHAR_T)
1735 mask = ((cppchar_t) 1 << width) - 1;
1736 else
1737 mask = ~0;
1739 c = *str++;
1740 switch (c)
1742 case '\\': case '\'': case '"': case '?': break;
1743 case 'b': c = TARGET_BS; break;
1744 case 'f': c = TARGET_FF; break;
1745 case 'n': c = TARGET_NEWLINE; break;
1746 case 'r': c = TARGET_CR; break;
1747 case 't': c = TARGET_TAB; break;
1748 case 'v': c = TARGET_VT; break;
1750 case '(': case '{': case '[': case '%':
1751 /* '\(', etc, are used at beginning of line to avoid confusing Emacs.
1752 '\%' is used to prevent SCCS from getting confused. */
1753 unknown = CPP_PEDANTIC (pfile);
1754 break;
1756 case 'a':
1757 if (CPP_WTRADITIONAL (pfile))
1758 cpp_error (pfile, DL_WARNING,
1759 "the meaning of '\\a' is different in traditional C");
1760 c = TARGET_BELL;
1761 break;
1763 case 'e': case 'E':
1764 if (CPP_PEDANTIC (pfile))
1765 cpp_error (pfile, DL_PEDWARN,
1766 "non-ISO-standard escape sequence, '\\%c'", (int) c);
1767 c = TARGET_ESC;
1768 break;
1770 case 'u': case 'U':
1771 unknown = maybe_read_ucs (pfile, &str, limit, &c);
1772 break;
1774 case 'x':
1775 if (CPP_WTRADITIONAL (pfile))
1776 cpp_error (pfile, DL_WARNING,
1777 "the meaning of '\\x' is different in traditional C");
1780 cppchar_t i = 0, overflow = 0;
1781 int digits_found = 0;
1783 while (str < limit)
1785 c = *str;
1786 if (! ISXDIGIT (c))
1787 break;
1788 str++;
1789 overflow |= i ^ (i << 4 >> 4);
1790 i = (i << 4) + hex_digit_value (c);
1791 digits_found = 1;
1794 if (!digits_found)
1795 cpp_error (pfile, DL_ERROR,
1796 "\\x used with no following hex digits");
1798 if (overflow | (i != (i & mask)))
1800 cpp_error (pfile, DL_PEDWARN,
1801 "hex escape sequence out of range");
1802 i &= mask;
1804 c = i;
1806 break;
1808 case '0': case '1': case '2': case '3':
1809 case '4': case '5': case '6': case '7':
1811 size_t count = 0;
1812 cppchar_t i = c - '0';
1814 while (str < limit && ++count < 3)
1816 c = *str;
1817 if (c < '0' || c > '7')
1818 break;
1819 str++;
1820 i = (i << 3) + c - '0';
1823 if (i != (i & mask))
1825 cpp_error (pfile, DL_PEDWARN,
1826 "octal escape sequence out of range");
1827 i &= mask;
1829 c = i;
1831 break;
1833 default:
1834 unknown = 1;
1835 break;
1838 if (unknown)
1840 if (ISGRAPH (c))
1841 cpp_error (pfile, DL_PEDWARN,
1842 "unknown escape sequence '\\%c'", (int) c);
1843 else
1844 cpp_error (pfile, DL_PEDWARN,
1845 "unknown escape sequence: '\\%03o'", (int) c);
1848 if (c > mask)
1850 cpp_error (pfile, DL_PEDWARN, "escape sequence out of range for its type");
1851 c &= mask;
1854 *pstr = str;
1855 return c;
1858 /* Interpret a (possibly wide) character constant in TOKEN.
1859 WARN_MULTI warns about multi-character charconsts. PCHARS_SEEN
1860 points to a variable that is filled in with the number of
1861 characters seen, and UNSIGNEDP to a variable that indicates whether
1862 the result has signed type. */
1863 cppchar_t
1864 cpp_interpret_charconst (pfile, token, warn_multi, pchars_seen, unsignedp)
1865 cpp_reader *pfile;
1866 const cpp_token *token;
1867 int warn_multi;
1868 unsigned int *pchars_seen;
1869 int *unsignedp;
1871 const unsigned char *str = token->val.str.text;
1872 const unsigned char *limit = str + token->val.str.len;
1873 unsigned int chars_seen = 0;
1874 size_t width, max_chars;
1875 cppchar_t c, mask, result = 0;
1876 bool unsigned_p;
1878 #ifdef MULTIBYTE_CHARS
1879 (void) local_mbtowc (NULL, NULL, 0);
1880 #endif
1882 /* Width in bits. */
1883 if (token->type == CPP_CHAR)
1885 width = CPP_OPTION (pfile, char_precision);
1886 max_chars = CPP_OPTION (pfile, int_precision) / width;
1887 unsigned_p = CPP_OPTION (pfile, signed_char) == 0;
1889 else
1891 width = CPP_OPTION (pfile, wchar_precision);
1892 max_chars = 1;
1893 unsigned_p = WCHAR_UNSIGNED;
1896 if (width < BITS_PER_CPPCHAR_T)
1897 mask = ((cppchar_t) 1 << width) - 1;
1898 else
1899 mask = ~0;
1901 while (str < limit)
1903 #ifdef MULTIBYTE_CHARS
1904 wchar_t wc;
1905 int char_len;
1907 char_len = local_mbtowc (&wc, str, limit - str);
1908 if (char_len == -1)
1910 cpp_error (pfile, DL_WARNING,
1911 "ignoring invalid multibyte character");
1912 c = *str++;
1914 else
1916 str += char_len;
1917 c = wc;
1919 #else
1920 c = *str++;
1921 #endif
1923 if (c == '\\')
1924 c = cpp_parse_escape (pfile, &str, limit, token->type == CPP_WCHAR);
1926 #ifdef MAP_CHARACTER
1927 if (ISPRINT (c))
1928 c = MAP_CHARACTER (c);
1929 #endif
1931 chars_seen++;
1933 /* Sign-extend the character, scale result, and add the two. */
1934 if (!unsigned_p && (c & (1 << (width - 1))))
1935 c |= ~mask;
1936 if (width < BITS_PER_CPPCHAR_T)
1937 result = (result << width) + c;
1938 else
1939 result = c;
1942 if (chars_seen == 0)
1943 cpp_error (pfile, DL_ERROR, "empty character constant");
1944 else if (chars_seen > 1)
1946 /* Multichar charconsts are of type int and therefore signed. */
1947 unsigned_p = 0;
1948 if (chars_seen > max_chars)
1950 chars_seen = max_chars;
1951 cpp_error (pfile, DL_WARNING,
1952 "character constant too long for its type");
1954 else if (warn_multi)
1955 cpp_error (pfile, DL_WARNING, "multi-character character constant");
1958 *pchars_seen = chars_seen;
1959 *unsignedp = unsigned_p;
1960 return result;
/* Memory buffers.  Changing these three constants can have a dramatic
   effect on performance.  The values here are reasonable defaults,
   but might be tuned.  If you adjust them, be sure to test across a
   range of uses of cpplib, including heavy nested function-like macro
   expansion.  Also check the change in peak memory usage (NJAMD is a
   good tool for this).  */

/* Smallest payload, in bytes, that new_buff will allocate.  */
#define MIN_BUFF_SIZE 8000

/* Largest free buffer that _cpp_get_buff will hand out for a request
   of MIN_SIZE bytes; anything bigger is kept for a larger request.  */
#define BUFF_SIZE_UPPER_BOUND(MIN_SIZE) (MIN_BUFF_SIZE + (MIN_SIZE) * 3 / 2)

/* Size to request when growing BUFF: MIN_EXTRA plus twice the bytes
   between BUFF's cur and limit.  Both arguments are parenthesized so
   that any expression can be passed safely.  */
#define EXTENDED_BUFF_SIZE(BUFF, MIN_EXTRA) \
  ((MIN_EXTRA) + ((BUFF)->limit - (BUFF)->cur) * 2)

#if MIN_BUFF_SIZE > BUFF_SIZE_UPPER_BOUND (0)
#error BUFF_SIZE_UPPER_BOUND must be at least as large as MIN_BUFF_SIZE!
#endif

/* Never instantiated; the offset of U gives the alignment used for
   buffer sizes.  */
struct dummy
{
  char c;
  union
  {
    double d;
    int *p;
  } u;
};

#define DEFAULT_ALIGNMENT (offsetof (struct dummy, u))

/* Round SIZE up to a multiple of ALIGN, which must be a power of 2.  */
#define CPP_ALIGN(size, align) (((size) + ((align) - 1)) & ~((align) - 1))
1991 /* Create a new allocation buffer. Place the control block at the end
1992 of the buffer, so that buffer overflows will cause immediate chaos. */
1993 static _cpp_buff *
1994 new_buff (len)
1995 size_t len;
1997 _cpp_buff *result;
1998 unsigned char *base;
2000 if (len < MIN_BUFF_SIZE)
2001 len = MIN_BUFF_SIZE;
2002 len = CPP_ALIGN (len, DEFAULT_ALIGNMENT);
2004 base = xmalloc (len + sizeof (_cpp_buff));
2005 result = (_cpp_buff *) (base + len);
2006 result->base = base;
2007 result->cur = base;
2008 result->limit = base + len;
2009 result->next = NULL;
2010 return result;
2013 /* Place a chain of unwanted allocation buffers on the free list. */
2014 void
2015 _cpp_release_buff (pfile, buff)
2016 cpp_reader *pfile;
2017 _cpp_buff *buff;
2019 _cpp_buff *end = buff;
2021 while (end->next)
2022 end = end->next;
2023 end->next = pfile->free_buffs;
2024 pfile->free_buffs = buff;
2027 /* Return a free buffer of size at least MIN_SIZE. */
2028 _cpp_buff *
2029 _cpp_get_buff (pfile, min_size)
2030 cpp_reader *pfile;
2031 size_t min_size;
2033 _cpp_buff *result, **p;
2035 for (p = &pfile->free_buffs;; p = &(*p)->next)
2037 size_t size;
2039 if (*p == NULL)
2040 return new_buff (min_size);
2041 result = *p;
2042 size = result->limit - result->base;
2043 /* Return a buffer that's big enough, but don't waste one that's
2044 way too big. */
2045 if (size >= min_size && size <= BUFF_SIZE_UPPER_BOUND (min_size))
2046 break;
2049 *p = result->next;
2050 result->next = NULL;
2051 result->cur = result->base;
2052 return result;
2055 /* Creates a new buffer with enough space to hold the uncommitted
2056 remaining bytes of BUFF, and at least MIN_EXTRA more bytes. Copies
2057 the excess bytes to the new buffer. Chains the new buffer after
2058 BUFF, and returns the new buffer. */
2059 _cpp_buff *
2060 _cpp_append_extend_buff (pfile, buff, min_extra)
2061 cpp_reader *pfile;
2062 _cpp_buff *buff;
2063 size_t min_extra;
2065 size_t size = EXTENDED_BUFF_SIZE (buff, min_extra);
2066 _cpp_buff *new_buff = _cpp_get_buff (pfile, size);
2068 buff->next = new_buff;
2069 memcpy (new_buff->base, buff->cur, BUFF_ROOM (buff));
2070 return new_buff;
2073 /* Creates a new buffer with enough space to hold the uncommitted
2074 remaining bytes of the buffer pointed to by BUFF, and at least
2075 MIN_EXTRA more bytes. Copies the excess bytes to the new buffer.
2076 Chains the new buffer before the buffer pointed to by BUFF, and
2077 updates the pointer to point to the new buffer. */
2078 void
2079 _cpp_extend_buff (pfile, pbuff, min_extra)
2080 cpp_reader *pfile;
2081 _cpp_buff **pbuff;
2082 size_t min_extra;
2084 _cpp_buff *new_buff, *old_buff = *pbuff;
2085 size_t size = EXTENDED_BUFF_SIZE (old_buff, min_extra);
2087 new_buff = _cpp_get_buff (pfile, size);
2088 memcpy (new_buff->base, old_buff->cur, BUFF_ROOM (old_buff));
2089 new_buff->next = old_buff;
2090 *pbuff = new_buff;
2093 /* Free a chain of buffers starting at BUFF. */
2094 void
2095 _cpp_free_buff (buff)
2096 _cpp_buff *buff;
2098 _cpp_buff *next;
2100 for (; buff; buff = next)
2102 next = buff->next;
2103 free (buff->base);
2107 /* Allocate permanent, unaligned storage of length LEN. */
2108 unsigned char *
2109 _cpp_unaligned_alloc (pfile, len)
2110 cpp_reader *pfile;
2111 size_t len;
2113 _cpp_buff *buff = pfile->u_buff;
2114 unsigned char *result = buff->cur;
2116 if (len > (size_t) (buff->limit - result))
2118 buff = _cpp_get_buff (pfile, len);
2119 buff->next = pfile->u_buff;
2120 pfile->u_buff = buff;
2121 result = buff->cur;
2124 buff->cur = result + len;
2125 return result;
2128 /* Allocate permanent, unaligned storage of length LEN from a_buff.
2129 That buffer is used for growing allocations when saving macro
2130 replacement lists in a #define, and when parsing an answer to an
2131 assertion in #assert, #unassert or #if (and therefore possibly
2132 whilst expanding macros). It therefore must not be used by any
2133 code that they might call: specifically the lexer and the guts of
2134 the macro expander.
2136 All existing other uses clearly fit this restriction: storing
2137 registered pragmas during initialization. */
2138 unsigned char *
2139 _cpp_aligned_alloc (pfile, len)
2140 cpp_reader *pfile;
2141 size_t len;
2143 _cpp_buff *buff = pfile->a_buff;
2144 unsigned char *result = buff->cur;
2146 if (len > (size_t) (buff->limit - result))
2148 buff = _cpp_get_buff (pfile, len);
2149 buff->next = pfile->a_buff;
2150 pfile->a_buff = buff;
2151 result = buff->cur;
2154 buff->cur = result + len;
2155 return result;