2002-04-02 David S. Miller <davem@redhat.com>
[official-gcc.git] / gcc / cpplex.c
bloba765967facec2046decfba6542996770b4a20694
1 /* CPP Library - lexical analysis.
2 Copyright (C) 2000, 2001, 2002 Free Software Foundation, Inc.
3 Contributed by Per Bothner, 1994-95.
4 Based on CCCP program by Paul Rubin, June 1986
5 Adapted to ANSI C, Richard Stallman, Jan 1987
6 Broken out to separate file, Zack Weinberg, Mar 2000
7 Single-pass line tokenization by Neil Booth, April 2000
9 This program is free software; you can redistribute it and/or modify it
10 under the terms of the GNU General Public License as published by the
11 Free Software Foundation; either version 2, or (at your option) any
12 later version.
14 This program is distributed in the hope that it will be useful,
15 but WITHOUT ANY WARRANTY; without even the implied warranty of
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 GNU General Public License for more details.
19 You should have received a copy of the GNU General Public License
20 along with this program; if not, write to the Free Software
21 Foundation, 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
23 #include "config.h"
24 #include "system.h"
25 #include "cpplib.h"
26 #include "cpphash.h"
28 /* MULTIBYTE_CHARS support only works for native compilers.
29 ??? Ideally what we want is to model widechar support after
30 the current floating point support. */
31 #ifdef CROSS_COMPILE
32 #undef MULTIBYTE_CHARS
33 #endif
35 #ifdef MULTIBYTE_CHARS
36 #include "mbchar.h"
37 #include <locale.h>
38 #endif
/* How a token's spelling is obtained.  Tokens with SPELL_STRING store
   their spelling in the token list, and its length in
   token->val.str.len.  */
enum spell_type
{
  SPELL_OPERATOR = 0,
  SPELL_CHAR,
  SPELL_IDENT,
  SPELL_NUMBER,
  SPELL_STRING,
  SPELL_NONE
};
52 struct token_spelling
54 enum spell_type category;
55 const unsigned char *name;
58 static const unsigned char *const digraph_spellings[] =
59 { U"%:", U"%:%:", U"<:", U":>", U"<%", U"%>" };
61 #define OP(e, s) { SPELL_OPERATOR, U s },
62 #define TK(e, s) { s, U STRINGX (e) },
63 static const struct token_spelling token_spellings[N_TTYPES] = { TTYPE_TABLE };
64 #undef OP
65 #undef TK
67 #define TOKEN_SPELL(token) (token_spellings[(token)->type].category)
68 #define TOKEN_NAME(token) (token_spellings[(token)->type].name)
69 #define BACKUP() do {buffer->cur = buffer->backup_to;} while (0)
71 static void handle_newline PARAMS ((cpp_reader *));
72 static cppchar_t skip_escaped_newlines PARAMS ((cpp_reader *));
73 static cppchar_t get_effective_char PARAMS ((cpp_reader *));
75 static int skip_block_comment PARAMS ((cpp_reader *));
76 static int skip_line_comment PARAMS ((cpp_reader *));
77 static void adjust_column PARAMS ((cpp_reader *));
78 static int skip_whitespace PARAMS ((cpp_reader *, cppchar_t));
79 static cpp_hashnode *parse_identifier PARAMS ((cpp_reader *));
80 static U_CHAR *parse_slow PARAMS ((cpp_reader *, const U_CHAR *, int,
81 unsigned int *));
82 static void parse_number PARAMS ((cpp_reader *, cpp_string *, int));
83 static int unescaped_terminator_p PARAMS ((cpp_reader *, const U_CHAR *));
84 static void parse_string PARAMS ((cpp_reader *, cpp_token *, cppchar_t));
85 static bool trigraph_p PARAMS ((cpp_reader *));
86 static void save_comment PARAMS ((cpp_reader *, cpp_token *, const U_CHAR *));
87 static int name_p PARAMS ((cpp_reader *, const cpp_string *));
88 static int maybe_read_ucs PARAMS ((cpp_reader *, const unsigned char **,
89 const unsigned char *, unsigned int *));
90 static tokenrun *next_tokenrun PARAMS ((tokenrun *));
92 static unsigned int hex_digit_value PARAMS ((unsigned int));
93 static _cpp_buff *new_buff PARAMS ((size_t));
95 /* Utility routine:
97 Compares, the token TOKEN to the NUL-terminated string STRING.
98 TOKEN must be a CPP_NAME. Returns 1 for equal, 0 for unequal. */
99 int
100 cpp_ideq (token, string)
101 const cpp_token *token;
102 const char *string;
104 if (token->type != CPP_NAME)
105 return 0;
107 return !ustrcmp (NODE_NAME (token->val.node), (const U_CHAR *) string);
110 /* Call when meeting a newline, assumed to be in buffer->cur[-1].
111 Returns with buffer->cur pointing to the character immediately
112 following the newline (combination). */
113 static void
114 handle_newline (pfile)
115 cpp_reader *pfile;
117 cpp_buffer *buffer = pfile->buffer;
119 /* Handle CR-LF and LF-CR. Most other implementations (e.g. java)
120 only accept CR-LF; maybe we should fall back to that behaviour? */
121 if (buffer->cur[-1] + buffer->cur[0] == '\r' + '\n')
122 buffer->cur++;
124 buffer->line_base = buffer->cur;
125 buffer->col_adjust = 0;
126 pfile->line++;
129 /* Subroutine of skip_escaped_newlines; called when a 3-character
130 sequence beginning with "??" is encountered. buffer->cur points to
131 the second '?'.
133 Warn if necessary, and returns true if the sequence forms a
134 trigraph and the trigraph should be honoured. */
135 static bool
136 trigraph_p (pfile)
137 cpp_reader *pfile;
139 cpp_buffer *buffer = pfile->buffer;
140 cppchar_t from_char = buffer->cur[1];
141 bool accept;
143 if (!_cpp_trigraph_map[from_char])
144 return false;
146 accept = CPP_OPTION (pfile, trigraphs);
148 /* Don't warn about trigraphs in comments. */
149 if (CPP_OPTION (pfile, warn_trigraphs) && !pfile->state.lexing_comment)
151 if (accept)
152 cpp_warning_with_line (pfile, pfile->line, CPP_BUF_COL (buffer) - 1,
153 "trigraph ??%c converted to %c",
154 (int) from_char,
155 (int) _cpp_trigraph_map[from_char]);
156 else if (buffer->cur != buffer->last_Wtrigraphs)
158 buffer->last_Wtrigraphs = buffer->cur;
159 cpp_warning_with_line (pfile, pfile->line,
160 CPP_BUF_COL (buffer) - 1,
161 "trigraph ??%c ignored", (int) from_char);
165 return accept;
168 /* Skips any escaped newlines introduced by '?' or a '\\', assumed to
169 lie in buffer->cur[-1]. Returns the next byte, which will be in
170 buffer->cur[-1]. This routine performs preprocessing stages 1 and
171 2 of the ISO C standard. */
172 static cppchar_t
173 skip_escaped_newlines (pfile)
174 cpp_reader *pfile;
176 cpp_buffer *buffer = pfile->buffer;
177 cppchar_t next = buffer->cur[-1];
179 /* Only do this if we apply stages 1 and 2. */
180 if (!buffer->from_stage3)
182 const unsigned char *saved_cur;
183 cppchar_t next1;
187 if (next == '?')
189 if (buffer->cur[0] != '?' || !trigraph_p (pfile))
190 break;
192 /* Translate the trigraph. */
193 next = _cpp_trigraph_map[buffer->cur[1]];
194 buffer->cur += 2;
195 if (next != '\\')
196 break;
199 if (buffer->cur == buffer->rlimit)
200 break;
202 /* We have a backslash, and room for at least one more
203 character. Skip horizontal whitespace. */
204 saved_cur = buffer->cur;
206 next1 = *buffer->cur++;
207 while (is_nvspace (next1) && buffer->cur < buffer->rlimit);
209 if (!is_vspace (next1))
211 buffer->cur = saved_cur;
212 break;
215 if (saved_cur != buffer->cur - 1
216 && !pfile->state.lexing_comment)
217 cpp_warning (pfile, "backslash and newline separated by space");
219 handle_newline (pfile);
220 buffer->backup_to = buffer->cur;
221 if (buffer->cur == buffer->rlimit)
223 cpp_pedwarn (pfile, "backslash-newline at end of file");
224 next = EOF;
226 else
227 next = *buffer->cur++;
229 while (next == '\\' || next == '?');
232 return next;
235 /* Obtain the next character, after trigraph conversion and skipping
236 an arbitrarily long string of escaped newlines. The common case of
237 no trigraphs or escaped newlines falls through quickly. On return,
238 buffer->backup_to points to where to return to if the character is
239 not to be processed. */
240 static cppchar_t
241 get_effective_char (pfile)
242 cpp_reader *pfile;
244 cppchar_t next;
245 cpp_buffer *buffer = pfile->buffer;
247 buffer->backup_to = buffer->cur;
248 next = *buffer->cur++;
249 if (__builtin_expect (next == '?' || next == '\\', 0))
250 next = skip_escaped_newlines (pfile);
252 return next;
255 /* Skip a C-style block comment. We find the end of the comment by
256 seeing if an asterisk is before every '/' we encounter. Returns
257 non-zero if comment terminated by EOF, zero otherwise. */
258 static int
259 skip_block_comment (pfile)
260 cpp_reader *pfile;
262 cpp_buffer *buffer = pfile->buffer;
263 cppchar_t c = EOF, prevc = EOF;
265 pfile->state.lexing_comment = 1;
266 while (buffer->cur != buffer->rlimit)
268 prevc = c, c = *buffer->cur++;
270 /* FIXME: For speed, create a new character class of characters
271 of interest inside block comments. */
272 if (c == '?' || c == '\\')
273 c = skip_escaped_newlines (pfile);
275 /* People like decorating comments with '*', so check for '/'
276 instead for efficiency. */
277 if (c == '/')
279 if (prevc == '*')
280 break;
282 /* Warn about potential nested comments, but not if the '/'
283 comes immediately before the true comment delimiter.
284 Don't bother to get it right across escaped newlines. */
285 if (CPP_OPTION (pfile, warn_comments)
286 && buffer->cur[0] == '*' && buffer->cur[1] != '/')
287 cpp_warning_with_line (pfile,
288 pfile->line, CPP_BUF_COL (buffer),
289 "\"/*\" within comment");
291 else if (is_vspace (c))
292 handle_newline (pfile);
293 else if (c == '\t')
294 adjust_column (pfile);
297 pfile->state.lexing_comment = 0;
298 return c != '/' || prevc != '*';
301 /* Skip a C++ line comment, leaving buffer->cur pointing to the
302 terminating newline. Handles escaped newlines. Returns non-zero
303 if a multiline comment. */
304 static int
305 skip_line_comment (pfile)
306 cpp_reader *pfile;
308 cpp_buffer *buffer = pfile->buffer;
309 unsigned int orig_line = pfile->line;
310 cppchar_t c;
312 pfile->state.lexing_comment = 1;
315 if (buffer->cur == buffer->rlimit)
316 goto at_eof;
318 c = *buffer->cur++;
319 if (c == '?' || c == '\\')
320 c = skip_escaped_newlines (pfile);
322 while (!is_vspace (c));
324 /* Step back over the newline, except at EOF. */
325 buffer->cur--;
326 at_eof:
328 pfile->state.lexing_comment = 0;
329 return orig_line != pfile->line;
332 /* pfile->buffer->cur is one beyond the \t character. Update
333 col_adjust so we track the column correctly. */
334 static void
335 adjust_column (pfile)
336 cpp_reader *pfile;
338 cpp_buffer *buffer = pfile->buffer;
339 unsigned int col = CPP_BUF_COL (buffer) - 1; /* Zero-based column. */
341 /* Round it up to multiple of the tabstop, but subtract 1 since the
342 tab itself occupies a character position. */
343 buffer->col_adjust += (CPP_OPTION (pfile, tabstop)
344 - col % CPP_OPTION (pfile, tabstop)) - 1;
347 /* Skips whitespace, saving the next non-whitespace character.
348 Adjusts pfile->col_adjust to account for tabs. Without this,
349 tokens might be assigned an incorrect column. */
350 static int
351 skip_whitespace (pfile, c)
352 cpp_reader *pfile;
353 cppchar_t c;
355 cpp_buffer *buffer = pfile->buffer;
356 unsigned int warned = 0;
360 /* Horizontal space always OK. */
361 if (c == ' ')
363 else if (c == '\t')
364 adjust_column (pfile);
365 /* Just \f \v or \0 left. */
366 else if (c == '\0')
368 if (buffer->cur - 1 == buffer->rlimit)
369 return 0;
370 if (!warned)
372 cpp_warning (pfile, "null character(s) ignored");
373 warned = 1;
376 else if (pfile->state.in_directive && CPP_PEDANTIC (pfile))
377 cpp_pedwarn_with_line (pfile, pfile->line,
378 CPP_BUF_COL (buffer),
379 "%s in preprocessing directive",
380 c == '\f' ? "form feed" : "vertical tab");
382 c = *buffer->cur++;
384 /* We only want non-vertical space, i.e. ' ' \t \f \v \0. */
385 while (is_nvspace (c));
387 buffer->cur--;
388 return 1;
391 /* See if the characters of a number token are valid in a name (no
392 '.', '+' or '-'). */
393 static int
394 name_p (pfile, string)
395 cpp_reader *pfile;
396 const cpp_string *string;
398 unsigned int i;
400 for (i = 0; i < string->len; i++)
401 if (!is_idchar (string->text[i]))
402 return 0;
404 return 1;
407 /* Parse an identifier, skipping embedded backslash-newlines. This is
408 a critical inner loop. The common case is an identifier which has
409 not been split by backslash-newline, does not contain a dollar
410 sign, and has already been scanned (roughly 10:1 ratio of
411 seen:unseen identifiers in normal code; the distribution is
412 Poisson-like). Second most common case is a new identifier, not
413 split and no dollar sign. The other possibilities are rare and
414 have been relegated to parse_slow. */
415 static cpp_hashnode *
416 parse_identifier (pfile)
417 cpp_reader *pfile;
419 cpp_hashnode *result;
420 const U_CHAR *cur, *base;
422 /* Fast-path loop. Skim over a normal identifier.
423 N.B. ISIDNUM does not include $. */
424 cur = pfile->buffer->cur;
425 while (ISIDNUM (*cur))
426 cur++;
428 /* Check for slow-path cases. */
429 if (*cur == '?' || *cur == '\\' || *cur == '$')
431 unsigned int len;
433 base = parse_slow (pfile, cur, 0, &len);
434 result = (cpp_hashnode *)
435 ht_lookup (pfile->hash_table, base, len, HT_ALLOCED);
437 else
439 base = pfile->buffer->cur - 1;
440 pfile->buffer->cur = cur;
441 result = (cpp_hashnode *)
442 ht_lookup (pfile->hash_table, base, cur - base, HT_ALLOC);
445 /* Rarely, identifiers require diagnostics when lexed.
446 XXX Has to be forced out of the fast path. */
447 if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
448 && !pfile->state.skipping, 0))
450 /* It is allowed to poison the same identifier twice. */
451 if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
452 cpp_error (pfile, "attempt to use poisoned \"%s\"",
453 NODE_NAME (result));
455 /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
456 replacement list of a variadic macro. */
457 if (result == pfile->spec_nodes.n__VA_ARGS__
458 && !pfile->state.va_args_ok)
459 cpp_pedwarn (pfile,
460 "__VA_ARGS__ can only appear in the expansion of a C99 variadic macro");
463 return result;
466 /* Slow path. This handles numbers and identifiers which have been
467 split, or contain dollar signs. The part of the token from
468 PFILE->buffer->cur-1 to CUR has already been scanned. NUMBER_P is
469 1 if it's a number, and 2 if it has a leading period. Returns a
470 pointer to the token's NUL-terminated spelling in permanent
471 storage, and sets PLEN to its length. */
472 static U_CHAR *
473 parse_slow (pfile, cur, number_p, plen)
474 cpp_reader *pfile;
475 const U_CHAR *cur;
476 int number_p;
477 unsigned int *plen;
479 cpp_buffer *buffer = pfile->buffer;
480 const U_CHAR *base = buffer->cur - 1;
481 struct obstack *stack = &pfile->hash_table->stack;
482 unsigned int c, prevc, saw_dollar = 0;
484 /* Place any leading period. */
485 if (number_p == 2)
486 obstack_1grow (stack, '.');
488 /* Copy the part of the token which is known to be okay. */
489 obstack_grow (stack, base, cur - base);
491 /* Now process the part which isn't. We are looking at one of
492 '$', '\\', or '?' on entry to this loop. */
493 prevc = cur[-1];
494 c = *cur++;
495 buffer->cur = cur;
496 for (;;)
498 /* Potential escaped newline? */
499 buffer->backup_to = buffer->cur - 1;
500 if (c == '?' || c == '\\')
501 c = skip_escaped_newlines (pfile);
503 if (!is_idchar (c))
505 if (!number_p)
506 break;
507 if (c != '.' && !VALID_SIGN (c, prevc))
508 break;
511 /* Handle normal identifier characters in this loop. */
514 prevc = c;
515 obstack_1grow (stack, c);
517 if (c == '$')
518 saw_dollar++;
520 c = *buffer->cur++;
522 while (is_idchar (c));
525 /* Step back over the unwanted char. */
526 BACKUP ();
528 /* $ is not an identifier character in the standard, but is commonly
529 accepted as an extension. Don't warn about it in skipped
530 conditional blocks. */
531 if (saw_dollar && CPP_PEDANTIC (pfile) && ! pfile->state.skipping)
532 cpp_pedwarn (pfile, "'$' character(s) in identifier or number");
534 /* Identifiers and numbers are null-terminated. */
535 *plen = obstack_object_size (stack);
536 obstack_1grow (stack, '\0');
537 return obstack_finish (stack);
540 /* Parse a number, beginning with character C, skipping embedded
541 backslash-newlines. LEADING_PERIOD is non-zero if there was a "."
542 before C. Place the result in NUMBER. */
543 static void
544 parse_number (pfile, number, leading_period)
545 cpp_reader *pfile;
546 cpp_string *number;
547 int leading_period;
549 const U_CHAR *cur;
551 /* Fast-path loop. Skim over a normal number.
552 N.B. ISIDNUM does not include $. */
553 cur = pfile->buffer->cur;
554 while (ISIDNUM (*cur) || *cur == '.' || VALID_SIGN (*cur, cur[-1]))
555 cur++;
557 /* Check for slow-path cases. */
558 if (*cur == '?' || *cur == '\\' || *cur == '$')
559 number->text = parse_slow (pfile, cur, 1 + leading_period, &number->len);
560 else
562 const U_CHAR *base = pfile->buffer->cur - 1;
563 U_CHAR *dest;
565 number->len = cur - base + leading_period;
566 dest = _cpp_unaligned_alloc (pfile, number->len + 1);
567 dest[number->len] = '\0';
568 number->text = dest;
570 if (leading_period)
571 *dest++ = '.';
572 memcpy (dest, base, cur - base);
573 pfile->buffer->cur = cur;
577 /* Subroutine of parse_string. */
578 static int
579 unescaped_terminator_p (pfile, dest)
580 cpp_reader *pfile;
581 const unsigned char *dest;
583 const unsigned char *start, *temp;
585 /* In #include-style directives, terminators are not escapeable. */
586 if (pfile->state.angled_headers)
587 return 1;
589 start = BUFF_FRONT (pfile->u_buff);
591 /* An odd number of consecutive backslashes represents an escaped
592 terminator. */
593 for (temp = dest; temp > start && temp[-1] == '\\'; temp--)
596 return ((dest - temp) & 1) == 0;
599 /* Parses a string, character constant, or angle-bracketed header file
600 name. Handles embedded trigraphs and escaped newlines. The stored
601 string is guaranteed NUL-terminated, but it is not guaranteed that
602 this is the first NUL since embedded NULs are preserved.
604 When this function returns, buffer->cur points to the next
605 character to be processed. */
606 static void
607 parse_string (pfile, token, terminator)
608 cpp_reader *pfile;
609 cpp_token *token;
610 cppchar_t terminator;
612 cpp_buffer *buffer = pfile->buffer;
613 unsigned char *dest, *limit;
614 cppchar_t c;
615 bool warned_nulls = false;
617 dest = BUFF_FRONT (pfile->u_buff);
618 limit = BUFF_LIMIT (pfile->u_buff);
620 for (;;)
622 /* We need room for another char, possibly the terminating NUL. */
623 if ((size_t) (limit - dest) < 1)
625 size_t len_so_far = dest - BUFF_FRONT (pfile->u_buff);
626 _cpp_extend_buff (pfile, &pfile->u_buff, 2);
627 dest = BUFF_FRONT (pfile->u_buff) + len_so_far;
628 limit = BUFF_LIMIT (pfile->u_buff);
631 /* Handle trigraphs, escaped newlines etc. */
632 c = *buffer->cur++;
633 if (c == '?' || c == '\\')
634 c = skip_escaped_newlines (pfile);
636 if (c == terminator)
638 if (unescaped_terminator_p (pfile, dest))
639 break;
641 else if (is_vspace (c))
643 /* No string literal may extend over multiple lines. In
644 assembly language, suppress the error except for <>
645 includes. This is a kludge around not knowing where
646 comments are. */
647 unterminated:
648 if (CPP_OPTION (pfile, lang) != CLK_ASM || terminator == '>')
649 cpp_error (pfile, "missing terminating %c character", terminator);
650 buffer->cur--;
651 break;
653 else if (c == '\0')
655 if (buffer->cur - 1 == buffer->rlimit)
656 goto unterminated;
657 if (!warned_nulls)
659 warned_nulls = true;
660 cpp_warning (pfile, "null character(s) preserved in literal");
664 *dest++ = c;
667 *dest = '\0';
669 token->val.str.text = BUFF_FRONT (pfile->u_buff);
670 token->val.str.len = dest - BUFF_FRONT (pfile->u_buff);
671 BUFF_FRONT (pfile->u_buff) = dest + 1;
674 /* The stored comment includes the comment start and any terminator. */
675 static void
676 save_comment (pfile, token, from)
677 cpp_reader *pfile;
678 cpp_token *token;
679 const unsigned char *from;
681 unsigned char *buffer;
682 unsigned int len;
684 len = pfile->buffer->cur - from + 1; /* + 1 for the initial '/'. */
686 /* C++ comments probably (not definitely) have moved past a new
687 line, which we don't want to save in the comment. */
688 if (is_vspace (pfile->buffer->cur[-1]))
689 len--;
690 buffer = _cpp_unaligned_alloc (pfile, len);
692 token->type = CPP_COMMENT;
693 token->val.str.len = len;
694 token->val.str.text = buffer;
696 buffer[0] = '/';
697 memcpy (buffer + 1, from, len - 1);
700 /* Allocate COUNT tokens for RUN. */
701 void
702 _cpp_init_tokenrun (run, count)
703 tokenrun *run;
704 unsigned int count;
706 run->base = xnewvec (cpp_token, count);
707 run->limit = run->base + count;
708 run->next = NULL;
711 /* Returns the next tokenrun, or creates one if there is none. */
712 static tokenrun *
713 next_tokenrun (run)
714 tokenrun *run;
716 if (run->next == NULL)
718 run->next = xnew (tokenrun);
719 run->next->prev = run;
720 _cpp_init_tokenrun (run->next, 250);
723 return run->next;
726 /* Allocate a single token that is invalidated at the same time as the
727 rest of the tokens on the line. Has its line and col set to the
728 same as the last lexed token, so that diagnostics appear in the
729 right place. */
730 cpp_token *
731 _cpp_temp_token (pfile)
732 cpp_reader *pfile;
734 cpp_token *old, *result;
736 old = pfile->cur_token - 1;
737 if (pfile->cur_token == pfile->cur_run->limit)
739 pfile->cur_run = next_tokenrun (pfile->cur_run);
740 pfile->cur_token = pfile->cur_run->base;
743 result = pfile->cur_token++;
744 result->line = old->line;
745 result->col = old->col;
746 return result;
749 /* Lex a token into RESULT (external interface). Takes care of issues
750 like directive handling, token lookahead, multiple include
751 optimization and skipping. */
752 const cpp_token *
753 _cpp_lex_token (pfile)
754 cpp_reader *pfile;
756 cpp_token *result;
758 for (;;)
760 if (pfile->cur_token == pfile->cur_run->limit)
762 pfile->cur_run = next_tokenrun (pfile->cur_run);
763 pfile->cur_token = pfile->cur_run->base;
766 if (pfile->lookaheads)
768 pfile->lookaheads--;
769 result = pfile->cur_token++;
771 else
772 result = _cpp_lex_direct (pfile);
774 if (result->flags & BOL)
776 /* Is this a directive. If _cpp_handle_directive returns
777 false, it is an assembler #. */
778 if (result->type == CPP_HASH
779 /* 6.10.3 p 11: Directives in a list of macro arguments
780 gives undefined behavior. This implementation
781 handles the directive as normal. */
782 && pfile->state.parsing_args != 1
783 && _cpp_handle_directive (pfile, result->flags & PREV_WHITE))
784 continue;
785 if (pfile->cb.line_change && !pfile->state.skipping)
786 (*pfile->cb.line_change)(pfile, result, pfile->state.parsing_args);
789 /* We don't skip tokens in directives. */
790 if (pfile->state.in_directive)
791 break;
793 /* Outside a directive, invalidate controlling macros. At file
794 EOF, _cpp_lex_direct takes care of popping the buffer, so we never
795 get here and MI optimisation works. */
796 pfile->mi_valid = false;
798 if (!pfile->state.skipping || result->type == CPP_EOF)
799 break;
802 return result;
/* If the next effective character is CHAR, consume it and give the
   token type THEN_TYPE; otherwise back up and give it ELSE_TYPE.
   Requires `pfile', `result' and (through BACKUP) `buffer' to be in
   scope at the point of use.  */
#define IF_NEXT_IS(CHAR, THEN_TYPE, ELSE_TYPE)  \
  do {                                          \
    if (get_effective_char (pfile) == CHAR)     \
      result->type = THEN_TYPE;                 \
    else                                        \
      {                                         \
        BACKUP ();                              \
        result->type = ELSE_TYPE;               \
      }                                         \
  } while (0)
816 /* Lex a token into pfile->cur_token, which is also incremented, to
817 get diagnostics pointing to the correct location.
819 Does not handle issues such as token lookahead, multiple-include
820 optimisation, directives, skipping etc. This function is only
821 suitable for use by _cpp_lex_token, and in special cases like
822 lex_expansion_token which doesn't care for any of these issues.
824 When meeting a newline, returns CPP_EOF if parsing a directive,
825 otherwise returns to the start of the token buffer if permissible.
826 Returns the location of the lexed token. */
827 cpp_token *
828 _cpp_lex_direct (pfile)
829 cpp_reader *pfile;
831 cppchar_t c;
832 cpp_buffer *buffer;
833 const unsigned char *comment_start;
834 cpp_token *result = pfile->cur_token++;
836 fresh_line:
837 buffer = pfile->buffer;
838 result->flags = buffer->saved_flags;
839 buffer->saved_flags = 0;
840 update_tokens_line:
841 result->line = pfile->line;
843 skipped_white:
844 c = *buffer->cur++;
845 result->col = CPP_BUF_COLUMN (buffer, buffer->cur);
847 trigraph:
848 switch (c)
850 case ' ': case '\t': case '\f': case '\v': case '\0':
851 result->flags |= PREV_WHITE;
852 if (skip_whitespace (pfile, c))
853 goto skipped_white;
855 /* EOF. */
856 buffer->cur--;
857 buffer->saved_flags = BOL;
858 if (!pfile->state.parsing_args && !pfile->state.in_directive)
860 if (buffer->cur != buffer->line_base)
862 /* Non-empty files should end in a newline. Don't warn
863 for command line and _Pragma buffers. */
864 if (!buffer->from_stage3)
865 cpp_pedwarn (pfile, "no newline at end of file");
866 handle_newline (pfile);
869 /* Don't pop the last buffer. */
870 if (buffer->prev)
872 unsigned char stop = buffer->return_at_eof;
874 _cpp_pop_buffer (pfile);
875 if (!stop)
876 goto fresh_line;
879 result->type = CPP_EOF;
880 break;
882 case '\n': case '\r':
883 handle_newline (pfile);
884 buffer->saved_flags = BOL;
885 if (! pfile->state.in_directive)
887 if (pfile->state.parsing_args == 2)
888 buffer->saved_flags |= PREV_WHITE;
889 if (!pfile->keep_tokens)
891 pfile->cur_run = &pfile->base_run;
892 result = pfile->base_run.base;
893 pfile->cur_token = result + 1;
895 goto fresh_line;
897 result->type = CPP_EOF;
898 break;
900 case '?':
901 case '\\':
902 /* These could start an escaped newline, or '?' a trigraph. Let
903 skip_escaped_newlines do all the work. */
905 unsigned int line = pfile->line;
907 c = skip_escaped_newlines (pfile);
908 if (line != pfile->line)
910 buffer->cur--;
911 /* We had at least one escaped newline of some sort.
912 Update the token's line and column. */
913 goto update_tokens_line;
917 /* We are either the original '?' or '\\', or a trigraph. */
918 if (c == '?')
919 result->type = CPP_QUERY;
920 else if (c == '\\')
921 goto random_char;
922 else
923 goto trigraph;
924 break;
926 case '0': case '1': case '2': case '3': case '4':
927 case '5': case '6': case '7': case '8': case '9':
928 result->type = CPP_NUMBER;
929 parse_number (pfile, &result->val.str, 0);
930 break;
932 case 'L':
933 /* 'L' may introduce wide characters or strings. */
935 const unsigned char *pos = buffer->cur;
937 c = get_effective_char (pfile);
938 if (c == '\'' || c == '"')
940 result->type = (c == '"' ? CPP_WSTRING: CPP_WCHAR);
941 parse_string (pfile, result, c);
942 break;
944 buffer->cur = pos;
946 /* Fall through. */
948 start_ident:
949 case '_':
950 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
951 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
952 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
953 case 's': case 't': case 'u': case 'v': case 'w': case 'x':
954 case 'y': case 'z':
955 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
956 case 'G': case 'H': case 'I': case 'J': case 'K':
957 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
958 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
959 case 'Y': case 'Z':
960 result->type = CPP_NAME;
961 result->val.node = parse_identifier (pfile);
963 /* Convert named operators to their proper types. */
964 if (result->val.node->flags & NODE_OPERATOR)
966 result->flags |= NAMED_OP;
967 result->type = result->val.node->value.operator;
969 break;
971 case '\'':
972 case '"':
973 result->type = c == '"' ? CPP_STRING: CPP_CHAR;
974 parse_string (pfile, result, c);
975 break;
977 case '/':
978 /* A potential block or line comment. */
979 comment_start = buffer->cur;
980 c = get_effective_char (pfile);
982 if (c == '*')
984 if (skip_block_comment (pfile))
985 cpp_error (pfile, "unterminated comment");
987 else if (c == '/' && (CPP_OPTION (pfile, cplusplus_comments)
988 || CPP_IN_SYSTEM_HEADER (pfile)))
990 /* Warn about comments only if pedantically GNUC89, and not
991 in system headers. */
992 if (CPP_OPTION (pfile, lang) == CLK_GNUC89 && CPP_PEDANTIC (pfile)
993 && ! buffer->warned_cplusplus_comments)
995 cpp_pedwarn (pfile,
996 "C++ style comments are not allowed in ISO C89");
997 cpp_pedwarn (pfile,
998 "(this will be reported only once per input file)");
999 buffer->warned_cplusplus_comments = 1;
1002 if (skip_line_comment (pfile) && CPP_OPTION (pfile, warn_comments))
1003 cpp_warning (pfile, "multi-line comment");
1005 else if (c == '=')
1007 result->type = CPP_DIV_EQ;
1008 break;
1010 else
1012 BACKUP ();
1013 result->type = CPP_DIV;
1014 break;
1017 if (!pfile->state.save_comments)
1019 result->flags |= PREV_WHITE;
1020 goto update_tokens_line;
1023 /* Save the comment as a token in its own right. */
1024 save_comment (pfile, result, comment_start);
1025 break;
1027 case '<':
1028 if (pfile->state.angled_headers)
1030 result->type = CPP_HEADER_NAME;
1031 parse_string (pfile, result, '>');
1032 break;
1035 c = get_effective_char (pfile);
1036 if (c == '=')
1037 result->type = CPP_LESS_EQ;
1038 else if (c == '<')
1039 IF_NEXT_IS ('=', CPP_LSHIFT_EQ, CPP_LSHIFT);
1040 else if (c == '?' && CPP_OPTION (pfile, cplusplus))
1041 IF_NEXT_IS ('=', CPP_MIN_EQ, CPP_MIN);
1042 else if (c == ':' && CPP_OPTION (pfile, digraphs))
1044 result->type = CPP_OPEN_SQUARE;
1045 result->flags |= DIGRAPH;
1047 else if (c == '%' && CPP_OPTION (pfile, digraphs))
1049 result->type = CPP_OPEN_BRACE;
1050 result->flags |= DIGRAPH;
1052 else
1054 BACKUP ();
1055 result->type = CPP_LESS;
1057 break;
1059 case '>':
1060 c = get_effective_char (pfile);
1061 if (c == '=')
1062 result->type = CPP_GREATER_EQ;
1063 else if (c == '>')
1064 IF_NEXT_IS ('=', CPP_RSHIFT_EQ, CPP_RSHIFT);
1065 else if (c == '?' && CPP_OPTION (pfile, cplusplus))
1066 IF_NEXT_IS ('=', CPP_MAX_EQ, CPP_MAX);
1067 else
1069 BACKUP ();
1070 result->type = CPP_GREATER;
1072 break;
1074 case '%':
1075 c = get_effective_char (pfile);
1076 if (c == '=')
1077 result->type = CPP_MOD_EQ;
1078 else if (CPP_OPTION (pfile, digraphs) && c == ':')
1080 result->flags |= DIGRAPH;
1081 result->type = CPP_HASH;
1082 if (get_effective_char (pfile) == '%')
1084 const unsigned char *pos = buffer->cur;
1086 if (get_effective_char (pfile) == ':')
1087 result->type = CPP_PASTE;
1088 else
1089 buffer->cur = pos - 1;
1091 else
1092 BACKUP ();
1094 else if (CPP_OPTION (pfile, digraphs) && c == '>')
1096 result->flags |= DIGRAPH;
1097 result->type = CPP_CLOSE_BRACE;
1099 else
1101 BACKUP ();
1102 result->type = CPP_MOD;
1104 break;
1106 case '.':
1107 result->type = CPP_DOT;
1108 c = get_effective_char (pfile);
1109 if (c == '.')
1111 const unsigned char *pos = buffer->cur;
1113 if (get_effective_char (pfile) == '.')
1114 result->type = CPP_ELLIPSIS;
1115 else
1116 buffer->cur = pos - 1;
1118 /* All known character sets have 0...9 contiguous. */
1119 else if (ISDIGIT (c))
1121 result->type = CPP_NUMBER;
1122 parse_number (pfile, &result->val.str, 1);
1124 else if (c == '*' && CPP_OPTION (pfile, cplusplus))
1125 result->type = CPP_DOT_STAR;
1126 else
1127 BACKUP ();
1128 break;
1130 case '+':
1131 c = get_effective_char (pfile);
1132 if (c == '+')
1133 result->type = CPP_PLUS_PLUS;
1134 else if (c == '=')
1135 result->type = CPP_PLUS_EQ;
1136 else
1138 BACKUP ();
1139 result->type = CPP_PLUS;
1141 break;
1143 case '-':
1144 c = get_effective_char (pfile);
1145 if (c == '>')
1147 result->type = CPP_DEREF;
1148 if (CPP_OPTION (pfile, cplusplus))
1150 if (get_effective_char (pfile) == '*')
1151 result->type = CPP_DEREF_STAR;
1152 else
1153 BACKUP ();
1156 else if (c == '-')
1157 result->type = CPP_MINUS_MINUS;
1158 else if (c == '=')
1159 result->type = CPP_MINUS_EQ;
1160 else
1162 BACKUP ();
1163 result->type = CPP_MINUS;
1165 break;
1167 case '&':
1168 c = get_effective_char (pfile);
1169 if (c == '&')
1170 result->type = CPP_AND_AND;
1171 else if (c == '=')
1172 result->type = CPP_AND_EQ;
1173 else
1175 BACKUP ();
1176 result->type = CPP_AND;
1178 break;
1180 case '|':
1181 c = get_effective_char (pfile);
1182 if (c == '|')
1183 result->type = CPP_OR_OR;
1184 else if (c == '=')
1185 result->type = CPP_OR_EQ;
1186 else
1188 BACKUP ();
1189 result->type = CPP_OR;
1191 break;
1193 case ':':
1194 c = get_effective_char (pfile);
1195 if (c == ':' && CPP_OPTION (pfile, cplusplus))
1196 result->type = CPP_SCOPE;
1197 else if (c == '>' && CPP_OPTION (pfile, digraphs))
1199 result->flags |= DIGRAPH;
1200 result->type = CPP_CLOSE_SQUARE;
1202 else
1204 BACKUP ();
1205 result->type = CPP_COLON;
1207 break;
1209 case '*': IF_NEXT_IS ('=', CPP_MULT_EQ, CPP_MULT); break;
1210 case '=': IF_NEXT_IS ('=', CPP_EQ_EQ, CPP_EQ); break;
1211 case '!': IF_NEXT_IS ('=', CPP_NOT_EQ, CPP_NOT); break;
1212 case '^': IF_NEXT_IS ('=', CPP_XOR_EQ, CPP_XOR); break;
1213 case '#': IF_NEXT_IS ('#', CPP_PASTE, CPP_HASH); break;
1215 case '~': result->type = CPP_COMPL; break;
1216 case ',': result->type = CPP_COMMA; break;
1217 case '(': result->type = CPP_OPEN_PAREN; break;
1218 case ')': result->type = CPP_CLOSE_PAREN; break;
1219 case '[': result->type = CPP_OPEN_SQUARE; break;
1220 case ']': result->type = CPP_CLOSE_SQUARE; break;
1221 case '{': result->type = CPP_OPEN_BRACE; break;
1222 case '}': result->type = CPP_CLOSE_BRACE; break;
1223 case ';': result->type = CPP_SEMICOLON; break;
1225 /* @ is a punctuator in Objective C. */
1226 case '@': result->type = CPP_ATSIGN; break;
1228 case '$':
1229 if (CPP_OPTION (pfile, dollars_in_ident))
1230 goto start_ident;
1231 /* Fall through... */
1233 random_char:
1234 default:
1235 result->type = CPP_OTHER;
1236 result->val.c = c;
1237 break;
1240 return result;
1243 /* An upper bound on the number of bytes needed to spell TOKEN,
1244 including preceding whitespace. */
1245 unsigned int
1246 cpp_token_len (token)
1247 const cpp_token *token;
1249 unsigned int len;
1251 switch (TOKEN_SPELL (token))
1253 default: len = 0; break;
1254 case SPELL_NUMBER:
1255 case SPELL_STRING: len = token->val.str.len; break;
1256 case SPELL_IDENT: len = NODE_LEN (token->val.node); break;
1258 /* 1 for whitespace, 4 for comment delimiters. */
1259 return len + 5;
1262 /* Write the spelling of a token TOKEN to BUFFER. The buffer must
1263 already contain the enough space to hold the token's spelling.
1264 Returns a pointer to the character after the last character
1265 written. */
1266 unsigned char *
1267 cpp_spell_token (pfile, token, buffer)
1268 cpp_reader *pfile; /* Would be nice to be rid of this... */
1269 const cpp_token *token;
1270 unsigned char *buffer;
1272 switch (TOKEN_SPELL (token))
1274 case SPELL_OPERATOR:
1276 const unsigned char *spelling;
1277 unsigned char c;
1279 if (token->flags & DIGRAPH)
1280 spelling
1281 = digraph_spellings[(int) token->type - (int) CPP_FIRST_DIGRAPH];
1282 else if (token->flags & NAMED_OP)
1283 goto spell_ident;
1284 else
1285 spelling = TOKEN_NAME (token);
1287 while ((c = *spelling++) != '\0')
1288 *buffer++ = c;
1290 break;
1292 case SPELL_CHAR:
1293 *buffer++ = token->val.c;
1294 break;
1296 spell_ident:
1297 case SPELL_IDENT:
1298 memcpy (buffer, NODE_NAME (token->val.node), NODE_LEN (token->val.node));
1299 buffer += NODE_LEN (token->val.node);
1300 break;
1302 case SPELL_NUMBER:
1303 memcpy (buffer, token->val.str.text, token->val.str.len);
1304 buffer += token->val.str.len;
1305 break;
1307 case SPELL_STRING:
1309 int left, right, tag;
1310 switch (token->type)
1312 case CPP_STRING: left = '"'; right = '"'; tag = '\0'; break;
1313 case CPP_WSTRING: left = '"'; right = '"'; tag = 'L'; break;
1314 case CPP_CHAR: left = '\''; right = '\''; tag = '\0'; break;
1315 case CPP_WCHAR: left = '\''; right = '\''; tag = 'L'; break;
1316 case CPP_HEADER_NAME: left = '<'; right = '>'; tag = '\0'; break;
1317 default:
1318 cpp_ice (pfile, "unknown string token %s\n", TOKEN_NAME (token));
1319 return buffer;
1321 if (tag) *buffer++ = tag;
1322 *buffer++ = left;
1323 memcpy (buffer, token->val.str.text, token->val.str.len);
1324 buffer += token->val.str.len;
1325 *buffer++ = right;
1327 break;
1329 case SPELL_NONE:
1330 cpp_ice (pfile, "unspellable token %s", TOKEN_NAME (token));
1331 break;
1334 return buffer;
1337 /* Returns TOKEN spelt as a null-terminated string. The string is
1338 freed when the reader is destroyed. Useful for diagnostics. */
1339 unsigned char *
1340 cpp_token_as_text (pfile, token)
1341 cpp_reader *pfile;
1342 const cpp_token *token;
1344 unsigned int len = cpp_token_len (token);
1345 unsigned char *start = _cpp_unaligned_alloc (pfile, len), *end;
1347 end = cpp_spell_token (pfile, token, start);
1348 end[0] = '\0';
1350 return start;
1353 /* Used by C front ends, which really should move to using
1354 cpp_token_as_text. */
1355 const char *
1356 cpp_type2name (type)
1357 enum cpp_ttype type;
1359 return (const char *) token_spellings[type].name;
1362 /* Writes the spelling of token to FP, without any preceding space.
1363 Separated from cpp_spell_token for efficiency - to avoid stdio
1364 double-buffering. */
1365 void
1366 cpp_output_token (token, fp)
1367 const cpp_token *token;
1368 FILE *fp;
1370 switch (TOKEN_SPELL (token))
1372 case SPELL_OPERATOR:
1374 const unsigned char *spelling;
1375 int c;
1377 if (token->flags & DIGRAPH)
1378 spelling
1379 = digraph_spellings[(int) token->type - (int) CPP_FIRST_DIGRAPH];
1380 else if (token->flags & NAMED_OP)
1381 goto spell_ident;
1382 else
1383 spelling = TOKEN_NAME (token);
1385 c = *spelling;
1387 putc (c, fp);
1388 while ((c = *++spelling) != '\0');
1390 break;
1392 case SPELL_CHAR:
1393 putc (token->val.c, fp);
1394 break;
1396 spell_ident:
1397 case SPELL_IDENT:
1398 fwrite (NODE_NAME (token->val.node), 1, NODE_LEN (token->val.node), fp);
1399 break;
1401 case SPELL_NUMBER:
1402 fwrite (token->val.str.text, 1, token->val.str.len, fp);
1403 break;
1405 case SPELL_STRING:
1407 int left, right, tag;
1408 switch (token->type)
1410 case CPP_STRING: left = '"'; right = '"'; tag = '\0'; break;
1411 case CPP_WSTRING: left = '"'; right = '"'; tag = 'L'; break;
1412 case CPP_CHAR: left = '\''; right = '\''; tag = '\0'; break;
1413 case CPP_WCHAR: left = '\''; right = '\''; tag = 'L'; break;
1414 case CPP_HEADER_NAME: left = '<'; right = '>'; tag = '\0'; break;
1415 default:
1416 fprintf (stderr, "impossible STRING token %s\n", TOKEN_NAME (token));
1417 return;
1419 if (tag) putc (tag, fp);
1420 putc (left, fp);
1421 fwrite (token->val.str.text, 1, token->val.str.len, fp);
1422 putc (right, fp);
1424 break;
1426 case SPELL_NONE:
1427 /* An error, most probably. */
1428 break;
1432 /* Compare two tokens. */
1434 _cpp_equiv_tokens (a, b)
1435 const cpp_token *a, *b;
1437 if (a->type == b->type && a->flags == b->flags)
1438 switch (TOKEN_SPELL (a))
1440 default: /* Keep compiler happy. */
1441 case SPELL_OPERATOR:
1442 return 1;
1443 case SPELL_CHAR:
1444 return a->val.c == b->val.c; /* Character. */
1445 case SPELL_NONE:
1446 return (a->type != CPP_MACRO_ARG || a->val.arg_no == b->val.arg_no);
1447 case SPELL_IDENT:
1448 return a->val.node == b->val.node;
1449 case SPELL_NUMBER:
1450 case SPELL_STRING:
1451 return (a->val.str.len == b->val.str.len
1452 && !memcmp (a->val.str.text, b->val.str.text,
1453 a->val.str.len));
1456 return 0;
1459 /* Returns nonzero if a space should be inserted to avoid an
1460 accidental token paste for output. For simplicity, it is
1461 conservative, and occasionally advises a space where one is not
1462 needed, e.g. "." and ".2". */
1464 cpp_avoid_paste (pfile, token1, token2)
1465 cpp_reader *pfile;
1466 const cpp_token *token1, *token2;
1468 enum cpp_ttype a = token1->type, b = token2->type;
1469 cppchar_t c;
1471 if (token1->flags & NAMED_OP)
1472 a = CPP_NAME;
1473 if (token2->flags & NAMED_OP)
1474 b = CPP_NAME;
1476 c = EOF;
1477 if (token2->flags & DIGRAPH)
1478 c = digraph_spellings[(int) b - (int) CPP_FIRST_DIGRAPH][0];
1479 else if (token_spellings[b].category == SPELL_OPERATOR)
1480 c = token_spellings[b].name[0];
1482 /* Quickly get everything that can paste with an '='. */
1483 if ((int) a <= (int) CPP_LAST_EQ && c == '=')
1484 return 1;
1486 switch (a)
1488 case CPP_GREATER: return c == '>' || c == '?';
1489 case CPP_LESS: return c == '<' || c == '?' || c == '%' || c == ':';
1490 case CPP_PLUS: return c == '+';
1491 case CPP_MINUS: return c == '-' || c == '>';
1492 case CPP_DIV: return c == '/' || c == '*'; /* Comments. */
1493 case CPP_MOD: return c == ':' || c == '>';
1494 case CPP_AND: return c == '&';
1495 case CPP_OR: return c == '|';
1496 case CPP_COLON: return c == ':' || c == '>';
1497 case CPP_DEREF: return c == '*';
1498 case CPP_DOT: return c == '.' || c == '%' || b == CPP_NUMBER;
1499 case CPP_HASH: return c == '#' || c == '%'; /* Digraph form. */
1500 case CPP_NAME: return ((b == CPP_NUMBER
1501 && name_p (pfile, &token2->val.str))
1502 || b == CPP_NAME
1503 || b == CPP_CHAR || b == CPP_STRING); /* L */
1504 case CPP_NUMBER: return (b == CPP_NUMBER || b == CPP_NAME
1505 || c == '.' || c == '+' || c == '-');
1506 case CPP_OTHER: return (CPP_OPTION (pfile, objc)
1507 && token1->val.c == '@'
1508 && (b == CPP_NAME || b == CPP_STRING));
1509 default: break;
1512 return 0;
1515 /* Output all the remaining tokens on the current line, and a newline
1516 character, to FP. Leading whitespace is removed. If there are
1517 macros, special token padding is not performed. */
1518 void
1519 cpp_output_line (pfile, fp)
1520 cpp_reader *pfile;
1521 FILE *fp;
1523 const cpp_token *token;
1525 token = cpp_get_token (pfile);
1526 while (token->type != CPP_EOF)
1528 cpp_output_token (token, fp);
1529 token = cpp_get_token (pfile);
1530 if (token->flags & PREV_WHITE)
1531 putc (' ', fp);
1534 putc ('\n', fp);
/* Return the value of the hexadecimal digit C.  Aborts if C is not a
   hexadecimal digit.  */
static unsigned int
hex_digit_value (c)
     unsigned int c;
{
  if (!hex_p (c))
    abort ();

  return hex_value (c);
}
1548 /* Parse a '\uNNNN' or '\UNNNNNNNN' sequence. Returns 1 to indicate
1549 failure if cpplib is not parsing C++ or C99. Such failure is
1550 silent, and no variables are updated. Otherwise returns 0, and
1551 warns if -Wtraditional.
1553 [lex.charset]: The character designated by the universal character
1554 name \UNNNNNNNN is that character whose character short name in
1555 ISO/IEC 10646 is NNNNNNNN; the character designated by the
1556 universal character name \uNNNN is that character whose character
1557 short name in ISO/IEC 10646 is 0000NNNN. If the hexadecimal value
1558 for a universal character name is less than 0x20 or in the range
1559 0x7F-0x9F (inclusive), or if the universal character name
1560 designates a character in the basic source character set, then the
1561 program is ill-formed.
1563 We assume that wchar_t is Unicode, so we don't need to do any
1564 mapping. Is this ever wrong?
1566 PC points to the 'u' or 'U', PSTR is points to the byte after PC,
1567 LIMIT is the end of the string or charconst. PSTR is updated to
1568 point after the UCS on return, and the UCS is written into PC. */
1570 static int
1571 maybe_read_ucs (pfile, pstr, limit, pc)
1572 cpp_reader *pfile;
1573 const unsigned char **pstr;
1574 const unsigned char *limit;
1575 unsigned int *pc;
1577 const unsigned char *p = *pstr;
1578 unsigned int code = 0;
1579 unsigned int c = *pc, length;
1581 /* Only attempt to interpret a UCS for C++ and C99. */
1582 if (! (CPP_OPTION (pfile, cplusplus) || CPP_OPTION (pfile, c99)))
1583 return 1;
1585 if (CPP_WTRADITIONAL (pfile))
1586 cpp_warning (pfile, "the meaning of '\\%c' is different in traditional C", c);
1588 length = (c == 'u' ? 4: 8);
1590 if ((size_t) (limit - p) < length)
1592 cpp_error (pfile, "incomplete universal-character-name");
1593 /* Skip to the end to avoid more diagnostics. */
1594 p = limit;
1596 else
1598 for (; length; length--, p++)
1600 c = *p;
1601 if (ISXDIGIT (c))
1602 code = (code << 4) + hex_digit_value (c);
1603 else
1605 cpp_error (pfile,
1606 "non-hex digit '%c' in universal-character-name", c);
1607 /* We shouldn't skip in case there are multibyte chars. */
1608 break;
1613 #ifdef TARGET_EBCDIC
1614 cpp_error (pfile, "universal-character-name on EBCDIC target");
1615 code = 0x3f; /* EBCDIC invalid character */
1616 #else
1617 /* True extended characters are OK. */
1618 if (code >= 0xa0
1619 && !(code & 0x80000000)
1620 && !(code >= 0xD800 && code <= 0xDFFF))
1622 /* The standard permits $, @ and ` to be specified as UCNs. We use
1623 hex escapes so that this also works with EBCDIC hosts. */
1624 else if (code == 0x24 || code == 0x40 || code == 0x60)
1626 /* Don't give another error if one occurred above. */
1627 else if (length == 0)
1628 cpp_error (pfile, "universal-character-name out of range");
1629 #endif
1631 *pstr = p;
1632 *pc = code;
1633 return 0;
1636 /* Interpret an escape sequence, and return its value. PSTR points to
1637 the input pointer, which is just after the backslash. LIMIT is how
1638 much text we have. MASK is a bitmask for the precision for the
1639 destination type (char or wchar_t).
1641 Handles all relevant diagnostics. */
1642 unsigned int
1643 cpp_parse_escape (pfile, pstr, limit, mask)
1644 cpp_reader *pfile;
1645 const unsigned char **pstr;
1646 const unsigned char *limit;
1647 unsigned HOST_WIDE_INT mask;
1649 int unknown = 0;
1650 const unsigned char *str = *pstr;
1651 unsigned int c = *str++;
1653 switch (c)
1655 case '\\': case '\'': case '"': case '?': break;
1656 case 'b': c = TARGET_BS; break;
1657 case 'f': c = TARGET_FF; break;
1658 case 'n': c = TARGET_NEWLINE; break;
1659 case 'r': c = TARGET_CR; break;
1660 case 't': c = TARGET_TAB; break;
1661 case 'v': c = TARGET_VT; break;
1663 case '(': case '{': case '[': case '%':
1664 /* '\(', etc, are used at beginning of line to avoid confusing Emacs.
1665 '\%' is used to prevent SCCS from getting confused. */
1666 unknown = CPP_PEDANTIC (pfile);
1667 break;
1669 case 'a':
1670 if (CPP_WTRADITIONAL (pfile))
1671 cpp_warning (pfile, "the meaning of '\\a' is different in traditional C");
1672 c = TARGET_BELL;
1673 break;
1675 case 'e': case 'E':
1676 if (CPP_PEDANTIC (pfile))
1677 cpp_pedwarn (pfile, "non-ISO-standard escape sequence, '\\%c'", c);
1678 c = TARGET_ESC;
1679 break;
1681 case 'u': case 'U':
1682 unknown = maybe_read_ucs (pfile, &str, limit, &c);
1683 break;
1685 case 'x':
1686 if (CPP_WTRADITIONAL (pfile))
1687 cpp_warning (pfile, "the meaning of '\\x' is different in traditional C");
1690 unsigned int i = 0, overflow = 0;
1691 int digits_found = 0;
1693 while (str < limit)
1695 c = *str;
1696 if (! ISXDIGIT (c))
1697 break;
1698 str++;
1699 overflow |= i ^ (i << 4 >> 4);
1700 i = (i << 4) + hex_digit_value (c);
1701 digits_found = 1;
1704 if (!digits_found)
1705 cpp_error (pfile, "\\x used with no following hex digits");
1707 if (overflow | (i != (i & mask)))
1709 cpp_pedwarn (pfile, "hex escape sequence out of range");
1710 i &= mask;
1712 c = i;
1714 break;
1716 case '0': case '1': case '2': case '3':
1717 case '4': case '5': case '6': case '7':
1719 unsigned int i = c - '0';
1720 int count = 0;
1722 while (str < limit && ++count < 3)
1724 c = *str;
1725 if (c < '0' || c > '7')
1726 break;
1727 str++;
1728 i = (i << 3) + c - '0';
1731 if (i != (i & mask))
1733 cpp_pedwarn (pfile, "octal escape sequence out of range");
1734 i &= mask;
1736 c = i;
1738 break;
1740 default:
1741 unknown = 1;
1742 break;
1745 if (unknown)
1747 if (ISGRAPH (c))
1748 cpp_pedwarn (pfile, "unknown escape sequence '\\%c'", c);
1749 else
1750 cpp_pedwarn (pfile, "unknown escape sequence: '\\%03o'", c);
1753 if (c > mask)
1754 cpp_pedwarn (pfile, "escape sequence out of range for character");
1756 *pstr = str;
1757 return c;
/* Fall back to CHAR_TYPE_SIZE and WCHAR_TYPE_SIZE when the maximum
   character widths have not already been defined.  */
#if !defined (MAX_CHAR_TYPE_SIZE)
#define MAX_CHAR_TYPE_SIZE CHAR_TYPE_SIZE
#endif

#if !defined (MAX_WCHAR_TYPE_SIZE)
#define MAX_WCHAR_TYPE_SIZE WCHAR_TYPE_SIZE
#endif
1768 /* Interpret a (possibly wide) character constant in TOKEN.
1769 WARN_MULTI warns about multi-character charconsts. PCHARS_SEEN points
1770 to a variable that is filled in with the number of characters seen. */
1771 HOST_WIDE_INT
1772 cpp_interpret_charconst (pfile, token, warn_multi, pchars_seen)
1773 cpp_reader *pfile;
1774 const cpp_token *token;
1775 int warn_multi;
1776 unsigned int *pchars_seen;
1778 const unsigned char *str = token->val.str.text;
1779 const unsigned char *limit = str + token->val.str.len;
1780 unsigned int chars_seen = 0;
1781 unsigned int width, max_chars, c;
1782 unsigned HOST_WIDE_INT mask;
1783 HOST_WIDE_INT result = 0;
1784 bool unsigned_p;
1786 #ifdef MULTIBYTE_CHARS
1787 (void) local_mbtowc (NULL, NULL, 0);
1788 #endif
1790 /* Width in bits. */
1791 if (token->type == CPP_CHAR)
1793 width = MAX_CHAR_TYPE_SIZE;
1794 unsigned_p = CPP_OPTION (pfile, signed_char) == 0;
1796 else
1798 width = MAX_WCHAR_TYPE_SIZE;
1799 unsigned_p = WCHAR_UNSIGNED;
1802 if (width < HOST_BITS_PER_WIDE_INT)
1803 mask = ((unsigned HOST_WIDE_INT) 1 << width) - 1;
1804 else
1805 mask = ~0;
1806 max_chars = HOST_BITS_PER_WIDE_INT / width;
1808 while (str < limit)
1810 #ifdef MULTIBYTE_CHARS
1811 wchar_t wc;
1812 int char_len;
1814 char_len = local_mbtowc (&wc, str, limit - str);
1815 if (char_len == -1)
1817 cpp_warning (pfile, "ignoring invalid multibyte character");
1818 c = *str++;
1820 else
1822 str += char_len;
1823 c = wc;
1825 #else
1826 c = *str++;
1827 #endif
1829 if (c == '\\')
1830 c = cpp_parse_escape (pfile, &str, limit, mask);
1832 #ifdef MAP_CHARACTER
1833 if (ISPRINT (c))
1834 c = MAP_CHARACTER (c);
1835 #endif
1837 /* Merge character into result; ignore excess chars. */
1838 if (++chars_seen <= max_chars)
1840 if (width < HOST_BITS_PER_WIDE_INT)
1841 result = (result << width) | (c & mask);
1842 else
1843 result = c;
1847 if (chars_seen == 0)
1848 cpp_error (pfile, "empty character constant");
1849 else if (chars_seen > max_chars)
1851 chars_seen = max_chars;
1852 cpp_warning (pfile, "character constant too long");
1854 else if (chars_seen > 1 && warn_multi)
1855 cpp_warning (pfile, "multi-character character constant");
1857 /* If relevant type is signed, sign-extend the constant. */
1858 if (chars_seen)
1860 unsigned int nbits = chars_seen * width;
1862 mask = (unsigned HOST_WIDE_INT) ~0 >> (HOST_BITS_PER_WIDE_INT - nbits);
1863 if (unsigned_p || ((result >> (nbits - 1)) & 1) == 0)
1864 result &= mask;
1865 else
1866 result |= ~mask;
1869 *pchars_seen = chars_seen;
1870 return result;
/* Memory buffers.  Changing these three constants can have a dramatic
   effect on performance.  The values here are reasonable defaults,
   but might be tuned.  If you adjust them, be sure to test across a
   range of uses of cpplib, including heavy nested function-like macro
   expansion.  Also check the change in peak memory usage (NJAMD is a
   good tool for this).

   MIN_BUFF_SIZE is the smallest buffer new_buff will create.
   BUFF_SIZE_UPPER_BOUND (MIN_SIZE) is the largest free buffer that
   _cpp_get_buff will hand out for a request of MIN_SIZE bytes.
   EXTENDED_BUFF_SIZE (BUFF, MIN_EXTRA) is the size requested when
   BUFF is extended: MIN_EXTRA plus twice the room left in BUFF.  Both
   macro arguments are parenthesized so that any expression may be
   passed; BUFF is evaluated twice.  */
#define MIN_BUFF_SIZE 8000
#define BUFF_SIZE_UPPER_BOUND(MIN_SIZE) (MIN_BUFF_SIZE + (MIN_SIZE) * 3 / 2)
#define EXTENDED_BUFF_SIZE(BUFF, MIN_EXTRA) \
	((MIN_EXTRA) + ((BUFF)->limit - (BUFF)->cur) * 2)

#if MIN_BUFF_SIZE > BUFF_SIZE_UPPER_BOUND (0)
#error BUFF_SIZE_UPPER_BOUND must be at least as large as MIN_BUFF_SIZE!
#endif

/* Layout probe: the offset of U within this structure is used below as
   DEFAULT_ALIGNMENT.  */
struct dummy
{
  char c;
  union
  {
    double d;
    int *p;
  } u;
};

#define DEFAULT_ALIGNMENT (offsetof (struct dummy, u))

/* Round SIZE up to a multiple of ALIGN.  The mask arithmetic is only
   correct when ALIGN is a power of two.  */
#define CPP_ALIGN(size, align) (((size) + ((align) - 1)) & ~((align) - 1))
1901 /* Create a new allocation buffer. Place the control block at the end
1902 of the buffer, so that buffer overflows will cause immediate chaos. */
1903 static _cpp_buff *
1904 new_buff (len)
1905 size_t len;
1907 _cpp_buff *result;
1908 unsigned char *base;
1910 if (len < MIN_BUFF_SIZE)
1911 len = MIN_BUFF_SIZE;
1912 len = CPP_ALIGN (len, DEFAULT_ALIGNMENT);
1914 base = xmalloc (len + sizeof (_cpp_buff));
1915 result = (_cpp_buff *) (base + len);
1916 result->base = base;
1917 result->cur = base;
1918 result->limit = base + len;
1919 result->next = NULL;
1920 return result;
1923 /* Place a chain of unwanted allocation buffers on the free list. */
1924 void
1925 _cpp_release_buff (pfile, buff)
1926 cpp_reader *pfile;
1927 _cpp_buff *buff;
1929 _cpp_buff *end = buff;
1931 while (end->next)
1932 end = end->next;
1933 end->next = pfile->free_buffs;
1934 pfile->free_buffs = buff;
1937 /* Return a free buffer of size at least MIN_SIZE. */
1938 _cpp_buff *
1939 _cpp_get_buff (pfile, min_size)
1940 cpp_reader *pfile;
1941 size_t min_size;
1943 _cpp_buff *result, **p;
1945 for (p = &pfile->free_buffs;; p = &(*p)->next)
1947 size_t size;
1949 if (*p == NULL)
1950 return new_buff (min_size);
1951 result = *p;
1952 size = result->limit - result->base;
1953 /* Return a buffer that's big enough, but don't waste one that's
1954 way too big. */
1955 if (size >= min_size && size <= BUFF_SIZE_UPPER_BOUND (min_size))
1956 break;
1959 *p = result->next;
1960 result->next = NULL;
1961 result->cur = result->base;
1962 return result;
1965 /* Creates a new buffer with enough space to hold the uncommitted
1966 remaining bytes of BUFF, and at least MIN_EXTRA more bytes. Copies
1967 the excess bytes to the new buffer. Chains the new buffer after
1968 BUFF, and returns the new buffer. */
1969 _cpp_buff *
1970 _cpp_append_extend_buff (pfile, buff, min_extra)
1971 cpp_reader *pfile;
1972 _cpp_buff *buff;
1973 size_t min_extra;
1975 size_t size = EXTENDED_BUFF_SIZE (buff, min_extra);
1976 _cpp_buff *new_buff = _cpp_get_buff (pfile, size);
1978 buff->next = new_buff;
1979 memcpy (new_buff->base, buff->cur, BUFF_ROOM (buff));
1980 return new_buff;
1983 /* Creates a new buffer with enough space to hold the uncommitted
1984 remaining bytes of the buffer pointed to by BUFF, and at least
1985 MIN_EXTRA more bytes. Copies the excess bytes to the new buffer.
1986 Chains the new buffer before the buffer pointed to by BUFF, and
1987 updates the pointer to point to the new buffer. */
1988 void
1989 _cpp_extend_buff (pfile, pbuff, min_extra)
1990 cpp_reader *pfile;
1991 _cpp_buff **pbuff;
1992 size_t min_extra;
1994 _cpp_buff *new_buff, *old_buff = *pbuff;
1995 size_t size = EXTENDED_BUFF_SIZE (old_buff, min_extra);
1997 new_buff = _cpp_get_buff (pfile, size);
1998 memcpy (new_buff->base, old_buff->cur, BUFF_ROOM (old_buff));
1999 new_buff->next = old_buff;
2000 *pbuff = new_buff;
2003 /* Free a chain of buffers starting at BUFF. */
2004 void
2005 _cpp_free_buff (buff)
2006 _cpp_buff *buff;
2008 _cpp_buff *next;
2010 for (; buff; buff = next)
2012 next = buff->next;
2013 free (buff->base);
2017 /* Allocate permanent, unaligned storage of length LEN. */
2018 unsigned char *
2019 _cpp_unaligned_alloc (pfile, len)
2020 cpp_reader *pfile;
2021 size_t len;
2023 _cpp_buff *buff = pfile->u_buff;
2024 unsigned char *result = buff->cur;
2026 if (len > (size_t) (buff->limit - result))
2028 buff = _cpp_get_buff (pfile, len);
2029 buff->next = pfile->u_buff;
2030 pfile->u_buff = buff;
2031 result = buff->cur;
2034 buff->cur = result + len;
2035 return result;
2038 /* Allocate permanent, unaligned storage of length LEN from a_buff.
2039 That buffer is used for growing allocations when saving macro
2040 replacement lists in a #define, and when parsing an answer to an
2041 assertion in #assert, #unassert or #if (and therefore possibly
2042 whilst expanding macros). It therefore must not be used by any
2043 code that they might call: specifically the lexer and the guts of
2044 the macro expander.
2046 All existing other uses clearly fit this restriction: storing
2047 registered pragmas during initialization. */
2048 unsigned char *
2049 _cpp_aligned_alloc (pfile, len)
2050 cpp_reader *pfile;
2051 size_t len;
2053 _cpp_buff *buff = pfile->a_buff;
2054 unsigned char *result = buff->cur;
2056 if (len > (size_t) (buff->limit - result))
2058 buff = _cpp_get_buff (pfile, len);
2059 buff->next = pfile->a_buff;
2060 pfile->a_buff = buff;
2061 result = buff->cur;
2064 buff->cur = result + len;
2065 return result;