* java/util/Properties.java (load): Only skip line if the first
[official-gcc.git] / gcc / cpplex.c
blob169730d53aec29bb1d23454c8f2b5b63405d3f77
/* CPP Library - lexical analysis.
   Copyright (C) 2000, 2001, 2002 Free Software Foundation, Inc.
   Contributed by Per Bothner, 1994-95.
   Based on CCCP program by Paul Rubin, June 1986
   Adapted to ANSI C, Richard Stallman, Jan 1987
   Broken out to separate file, Zack Weinberg, Mar 2000
   Single-pass line tokenization by Neil Booth, April 2000

This program is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the
Free Software Foundation; either version 2, or (at your option) any
later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.  */
23 #include "config.h"
24 #include "system.h"
25 #include "coretypes.h"
26 #include "tm.h"
27 #include "cpplib.h"
28 #include "cpphash.h"
30 #ifdef MULTIBYTE_CHARS
31 #include "mbchar.h"
32 #include <locale.h>
33 #endif
/* Spelling category recorded for each token type in the
   token_spellings table.  Tokens with SPELL_STRING store their
   spelling in the token list, and its length in the
   token->val.name.len.  */
enum spell_type
{
  SPELL_OPERATOR = 0,
  SPELL_CHAR,
  SPELL_IDENT,
  SPELL_NUMBER,
  SPELL_STRING,
  SPELL_NONE
};
47 struct token_spelling
49 enum spell_type category;
50 const unsigned char *name;
53 static const unsigned char *const digraph_spellings[] =
54 { U"%:", U"%:%:", U"<:", U":>", U"<%", U"%>" };
56 #define OP(e, s) { SPELL_OPERATOR, U s },
57 #define TK(e, s) { s, U STRINGX (e) },
58 static const struct token_spelling token_spellings[N_TTYPES] = { TTYPE_TABLE };
59 #undef OP
60 #undef TK
62 #define TOKEN_SPELL(token) (token_spellings[(token)->type].category)
63 #define TOKEN_NAME(token) (token_spellings[(token)->type].name)
64 #define BACKUP() do {buffer->cur = buffer->backup_to;} while (0)
66 static void handle_newline PARAMS ((cpp_reader *));
67 static cppchar_t skip_escaped_newlines PARAMS ((cpp_reader *));
68 static cppchar_t get_effective_char PARAMS ((cpp_reader *));
70 static int skip_block_comment PARAMS ((cpp_reader *));
71 static int skip_line_comment PARAMS ((cpp_reader *));
72 static void adjust_column PARAMS ((cpp_reader *));
73 static int skip_whitespace PARAMS ((cpp_reader *, cppchar_t));
74 static cpp_hashnode *parse_identifier PARAMS ((cpp_reader *));
75 static uchar *parse_slow PARAMS ((cpp_reader *, const uchar *, int,
76 unsigned int *));
77 static void parse_number PARAMS ((cpp_reader *, cpp_string *, int));
78 static int unescaped_terminator_p PARAMS ((cpp_reader *, const uchar *));
79 static void parse_string PARAMS ((cpp_reader *, cpp_token *, cppchar_t));
80 static bool trigraph_p PARAMS ((cpp_reader *));
81 static void save_comment PARAMS ((cpp_reader *, cpp_token *, const uchar *,
82 cppchar_t));
83 static bool continue_after_nul PARAMS ((cpp_reader *));
84 static int name_p PARAMS ((cpp_reader *, const cpp_string *));
85 static int maybe_read_ucs PARAMS ((cpp_reader *, const unsigned char **,
86 const unsigned char *, cppchar_t *));
87 static tokenrun *next_tokenrun PARAMS ((tokenrun *));
89 static unsigned int hex_digit_value PARAMS ((unsigned int));
90 static _cpp_buff *new_buff PARAMS ((size_t));
92 /* Utility routine:
94 Compares, the token TOKEN to the NUL-terminated string STRING.
95 TOKEN must be a CPP_NAME. Returns 1 for equal, 0 for unequal. */
96 int
97 cpp_ideq (token, string)
98 const cpp_token *token;
99 const char *string;
101 if (token->type != CPP_NAME)
102 return 0;
104 return !ustrcmp (NODE_NAME (token->val.node), (const uchar *) string);
107 /* Call when meeting a newline, assumed to be in buffer->cur[-1].
108 Returns with buffer->cur pointing to the character immediately
109 following the newline (combination). */
110 static void
111 handle_newline (pfile)
112 cpp_reader *pfile;
114 cpp_buffer *buffer = pfile->buffer;
116 /* Handle CR-LF and LF-CR. Most other implementations (e.g. java)
117 only accept CR-LF; maybe we should fall back to that behavior? */
118 if (buffer->cur[-1] + buffer->cur[0] == '\r' + '\n')
119 buffer->cur++;
121 buffer->line_base = buffer->cur;
122 buffer->col_adjust = 0;
123 pfile->line++;
126 /* Subroutine of skip_escaped_newlines; called when a 3-character
127 sequence beginning with "??" is encountered. buffer->cur points to
128 the second '?'.
130 Warn if necessary, and returns true if the sequence forms a
131 trigraph and the trigraph should be honored. */
132 static bool
133 trigraph_p (pfile)
134 cpp_reader *pfile;
136 cpp_buffer *buffer = pfile->buffer;
137 cppchar_t from_char = buffer->cur[1];
138 bool accept;
140 if (!_cpp_trigraph_map[from_char])
141 return false;
143 accept = CPP_OPTION (pfile, trigraphs);
145 /* Don't warn about trigraphs in comments. */
146 if (CPP_OPTION (pfile, warn_trigraphs) && !pfile->state.lexing_comment)
148 if (accept)
149 cpp_error_with_line (pfile, DL_WARNING,
150 pfile->line, CPP_BUF_COL (buffer) - 1,
151 "trigraph ??%c converted to %c",
152 (int) from_char,
153 (int) _cpp_trigraph_map[from_char]);
154 else if (buffer->cur != buffer->last_Wtrigraphs)
156 buffer->last_Wtrigraphs = buffer->cur;
157 cpp_error_with_line (pfile, DL_WARNING,
158 pfile->line, CPP_BUF_COL (buffer) - 1,
159 "trigraph ??%c ignored", (int) from_char);
163 return accept;
166 /* Skips any escaped newlines introduced by '?' or a '\\', assumed to
167 lie in buffer->cur[-1]. Returns the next byte, which will be in
168 buffer->cur[-1]. This routine performs preprocessing stages 1 and
169 2 of the ISO C standard. */
170 static cppchar_t
171 skip_escaped_newlines (pfile)
172 cpp_reader *pfile;
174 cpp_buffer *buffer = pfile->buffer;
175 cppchar_t next = buffer->cur[-1];
177 /* Only do this if we apply stages 1 and 2. */
178 if (!buffer->from_stage3)
180 const unsigned char *saved_cur;
181 cppchar_t next1;
185 if (next == '?')
187 if (buffer->cur[0] != '?' || !trigraph_p (pfile))
188 break;
190 /* Translate the trigraph. */
191 next = _cpp_trigraph_map[buffer->cur[1]];
192 buffer->cur += 2;
193 if (next != '\\')
194 break;
197 if (buffer->cur == buffer->rlimit)
198 break;
200 /* We have a backslash, and room for at least one more
201 character. Skip horizontal whitespace. */
202 saved_cur = buffer->cur;
204 next1 = *buffer->cur++;
205 while (is_nvspace (next1) && buffer->cur < buffer->rlimit);
207 if (!is_vspace (next1))
209 buffer->cur = saved_cur;
210 break;
213 if (saved_cur != buffer->cur - 1
214 && !pfile->state.lexing_comment)
215 cpp_error (pfile, DL_WARNING,
216 "backslash and newline separated by space");
218 handle_newline (pfile);
219 buffer->backup_to = buffer->cur;
220 if (buffer->cur == buffer->rlimit)
222 cpp_error (pfile, DL_PEDWARN,
223 "backslash-newline at end of file");
224 next = EOF;
226 else
227 next = *buffer->cur++;
229 while (next == '\\' || next == '?');
232 return next;
235 /* Obtain the next character, after trigraph conversion and skipping
236 an arbitrarily long string of escaped newlines. The common case of
237 no trigraphs or escaped newlines falls through quickly. On return,
238 buffer->backup_to points to where to return to if the character is
239 not to be processed. */
240 static cppchar_t
241 get_effective_char (pfile)
242 cpp_reader *pfile;
244 cppchar_t next;
245 cpp_buffer *buffer = pfile->buffer;
247 buffer->backup_to = buffer->cur;
248 next = *buffer->cur++;
249 if (__builtin_expect (next == '?' || next == '\\', 0))
250 next = skip_escaped_newlines (pfile);
252 return next;
255 /* Skip a C-style block comment. We find the end of the comment by
256 seeing if an asterisk is before every '/' we encounter. Returns
257 nonzero if comment terminated by EOF, zero otherwise. */
258 static int
259 skip_block_comment (pfile)
260 cpp_reader *pfile;
262 cpp_buffer *buffer = pfile->buffer;
263 cppchar_t c = EOF, prevc = EOF;
265 pfile->state.lexing_comment = 1;
266 while (buffer->cur != buffer->rlimit)
268 prevc = c, c = *buffer->cur++;
270 /* FIXME: For speed, create a new character class of characters
271 of interest inside block comments. */
272 if (c == '?' || c == '\\')
273 c = skip_escaped_newlines (pfile);
275 /* People like decorating comments with '*', so check for '/'
276 instead for efficiency. */
277 if (c == '/')
279 if (prevc == '*')
280 break;
282 /* Warn about potential nested comments, but not if the '/'
283 comes immediately before the true comment delimiter.
284 Don't bother to get it right across escaped newlines. */
285 if (CPP_OPTION (pfile, warn_comments)
286 && buffer->cur[0] == '*' && buffer->cur[1] != '/')
287 cpp_error_with_line (pfile, DL_WARNING,
288 pfile->line, CPP_BUF_COL (buffer),
289 "\"/*\" within comment");
291 else if (is_vspace (c))
292 handle_newline (pfile);
293 else if (c == '\t')
294 adjust_column (pfile);
297 pfile->state.lexing_comment = 0;
298 return c != '/' || prevc != '*';
301 /* Skip a C++ line comment, leaving buffer->cur pointing to the
302 terminating newline. Handles escaped newlines. Returns nonzero
303 if a multiline comment. */
304 static int
305 skip_line_comment (pfile)
306 cpp_reader *pfile;
308 cpp_buffer *buffer = pfile->buffer;
309 unsigned int orig_line = pfile->line;
310 cppchar_t c;
311 #ifdef MULTIBYTE_CHARS
312 wchar_t wc;
313 int char_len;
314 #endif
316 pfile->state.lexing_comment = 1;
317 #ifdef MULTIBYTE_CHARS
318 /* Reset multibyte conversion state. */
319 (void) local_mbtowc (NULL, NULL, 0);
320 #endif
323 if (buffer->cur == buffer->rlimit)
324 goto at_eof;
326 #ifdef MULTIBYTE_CHARS
327 char_len = local_mbtowc (&wc, (const char *) buffer->cur,
328 buffer->rlimit - buffer->cur);
329 if (char_len == -1)
331 cpp_error (pfile, DL_WARNING,
332 "ignoring invalid multibyte character");
333 char_len = 1;
334 c = *buffer->cur++;
336 else
338 buffer->cur += char_len;
339 c = wc;
341 #else
342 c = *buffer->cur++;
343 #endif
344 if (c == '?' || c == '\\')
345 c = skip_escaped_newlines (pfile);
347 while (!is_vspace (c));
349 /* Step back over the newline, except at EOF. */
350 buffer->cur--;
351 at_eof:
353 pfile->state.lexing_comment = 0;
354 return orig_line != pfile->line;
357 /* pfile->buffer->cur is one beyond the \t character. Update
358 col_adjust so we track the column correctly. */
359 static void
360 adjust_column (pfile)
361 cpp_reader *pfile;
363 cpp_buffer *buffer = pfile->buffer;
364 unsigned int col = CPP_BUF_COL (buffer) - 1; /* Zero-based column. */
366 /* Round it up to multiple of the tabstop, but subtract 1 since the
367 tab itself occupies a character position. */
368 buffer->col_adjust += (CPP_OPTION (pfile, tabstop)
369 - col % CPP_OPTION (pfile, tabstop)) - 1;
372 /* Skips whitespace, saving the next non-whitespace character.
373 Adjusts pfile->col_adjust to account for tabs. Without this,
374 tokens might be assigned an incorrect column. */
375 static int
376 skip_whitespace (pfile, c)
377 cpp_reader *pfile;
378 cppchar_t c;
380 cpp_buffer *buffer = pfile->buffer;
381 unsigned int warned = 0;
385 /* Horizontal space always OK. */
386 if (c == ' ')
388 else if (c == '\t')
389 adjust_column (pfile);
390 /* Just \f \v or \0 left. */
391 else if (c == '\0')
393 if (buffer->cur - 1 == buffer->rlimit)
394 return 0;
395 if (!warned)
397 cpp_error (pfile, DL_WARNING, "null character(s) ignored");
398 warned = 1;
401 else if (pfile->state.in_directive && CPP_PEDANTIC (pfile))
402 cpp_error_with_line (pfile, DL_PEDWARN, pfile->line,
403 CPP_BUF_COL (buffer),
404 "%s in preprocessing directive",
405 c == '\f' ? "form feed" : "vertical tab");
407 c = *buffer->cur++;
409 /* We only want non-vertical space, i.e. ' ' \t \f \v \0. */
410 while (is_nvspace (c));
412 buffer->cur--;
413 return 1;
416 /* See if the characters of a number token are valid in a name (no
417 '.', '+' or '-'). */
418 static int
419 name_p (pfile, string)
420 cpp_reader *pfile;
421 const cpp_string *string;
423 unsigned int i;
425 for (i = 0; i < string->len; i++)
426 if (!is_idchar (string->text[i]))
427 return 0;
429 return 1;
432 /* Parse an identifier, skipping embedded backslash-newlines. This is
433 a critical inner loop. The common case is an identifier which has
434 not been split by backslash-newline, does not contain a dollar
435 sign, and has already been scanned (roughly 10:1 ratio of
436 seen:unseen identifiers in normal code; the distribution is
437 Poisson-like). Second most common case is a new identifier, not
438 split and no dollar sign. The other possibilities are rare and
439 have been relegated to parse_slow. */
440 static cpp_hashnode *
441 parse_identifier (pfile)
442 cpp_reader *pfile;
444 cpp_hashnode *result;
445 const uchar *cur, *base;
447 /* Fast-path loop. Skim over a normal identifier.
448 N.B. ISIDNUM does not include $. */
449 cur = pfile->buffer->cur;
450 while (ISIDNUM (*cur))
451 cur++;
453 /* Check for slow-path cases. */
454 if (*cur == '?' || *cur == '\\' || *cur == '$')
456 unsigned int len;
458 base = parse_slow (pfile, cur, 0, &len);
459 result = (cpp_hashnode *)
460 ht_lookup (pfile->hash_table, base, len, HT_ALLOCED);
462 else
464 base = pfile->buffer->cur - 1;
465 pfile->buffer->cur = cur;
466 result = (cpp_hashnode *)
467 ht_lookup (pfile->hash_table, base, cur - base, HT_ALLOC);
470 /* Rarely, identifiers require diagnostics when lexed.
471 XXX Has to be forced out of the fast path. */
472 if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
473 && !pfile->state.skipping, 0))
475 /* It is allowed to poison the same identifier twice. */
476 if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
477 cpp_error (pfile, DL_ERROR, "attempt to use poisoned \"%s\"",
478 NODE_NAME (result));
480 /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
481 replacement list of a variadic macro. */
482 if (result == pfile->spec_nodes.n__VA_ARGS__
483 && !pfile->state.va_args_ok)
484 cpp_error (pfile, DL_PEDWARN,
485 "__VA_ARGS__ can only appear in the expansion of a C99 variadic macro");
488 return result;
491 /* Slow path. This handles numbers and identifiers which have been
492 split, or contain dollar signs. The part of the token from
493 PFILE->buffer->cur-1 to CUR has already been scanned. NUMBER_P is
494 1 if it's a number, and 2 if it has a leading period. Returns a
495 pointer to the token's NUL-terminated spelling in permanent
496 storage, and sets PLEN to its length. */
497 static uchar *
498 parse_slow (pfile, cur, number_p, plen)
499 cpp_reader *pfile;
500 const uchar *cur;
501 int number_p;
502 unsigned int *plen;
504 cpp_buffer *buffer = pfile->buffer;
505 const uchar *base = buffer->cur - 1;
506 struct obstack *stack = &pfile->hash_table->stack;
507 unsigned int c, prevc, saw_dollar = 0;
509 /* Place any leading period. */
510 if (number_p == 2)
511 obstack_1grow (stack, '.');
513 /* Copy the part of the token which is known to be okay. */
514 obstack_grow (stack, base, cur - base);
516 /* Now process the part which isn't. We are looking at one of
517 '$', '\\', or '?' on entry to this loop. */
518 prevc = cur[-1];
519 c = *cur++;
520 buffer->cur = cur;
521 for (;;)
523 /* Potential escaped newline? */
524 buffer->backup_to = buffer->cur - 1;
525 if (c == '?' || c == '\\')
526 c = skip_escaped_newlines (pfile);
528 if (!is_idchar (c))
530 if (!number_p)
531 break;
532 if (c != '.' && !VALID_SIGN (c, prevc))
533 break;
536 /* Handle normal identifier characters in this loop. */
539 prevc = c;
540 obstack_1grow (stack, c);
542 if (c == '$')
543 saw_dollar++;
545 c = *buffer->cur++;
547 while (is_idchar (c));
550 /* Step back over the unwanted char. */
551 BACKUP ();
553 /* $ is not an identifier character in the standard, but is commonly
554 accepted as an extension. Don't warn about it in skipped
555 conditional blocks. */
556 if (saw_dollar && CPP_PEDANTIC (pfile) && ! pfile->state.skipping)
557 cpp_error (pfile, DL_PEDWARN, "'$' character(s) in identifier or number");
559 /* Identifiers and numbers are null-terminated. */
560 *plen = obstack_object_size (stack);
561 obstack_1grow (stack, '\0');
562 return obstack_finish (stack);
565 /* Parse a number, beginning with character C, skipping embedded
566 backslash-newlines. LEADING_PERIOD is nonzero if there was a "."
567 before C. Place the result in NUMBER. */
568 static void
569 parse_number (pfile, number, leading_period)
570 cpp_reader *pfile;
571 cpp_string *number;
572 int leading_period;
574 const uchar *cur;
576 /* Fast-path loop. Skim over a normal number.
577 N.B. ISIDNUM does not include $. */
578 cur = pfile->buffer->cur;
579 while (ISIDNUM (*cur) || *cur == '.' || VALID_SIGN (*cur, cur[-1]))
580 cur++;
582 /* Check for slow-path cases. */
583 if (*cur == '?' || *cur == '\\' || *cur == '$')
584 number->text = parse_slow (pfile, cur, 1 + leading_period, &number->len);
585 else
587 const uchar *base = pfile->buffer->cur - 1;
588 uchar *dest;
590 number->len = cur - base + leading_period;
591 dest = _cpp_unaligned_alloc (pfile, number->len + 1);
592 dest[number->len] = '\0';
593 number->text = dest;
595 if (leading_period)
596 *dest++ = '.';
597 memcpy (dest, base, cur - base);
598 pfile->buffer->cur = cur;
602 /* Subroutine of parse_string. */
603 static int
604 unescaped_terminator_p (pfile, dest)
605 cpp_reader *pfile;
606 const unsigned char *dest;
608 const unsigned char *start, *temp;
610 /* In #include-style directives, terminators are not escapable. */
611 if (pfile->state.angled_headers)
612 return 1;
614 start = BUFF_FRONT (pfile->u_buff);
616 /* An odd number of consecutive backslashes represents an escaped
617 terminator. */
618 for (temp = dest; temp > start && temp[-1] == '\\'; temp--)
621 return ((dest - temp) & 1) == 0;
624 /* Parses a string, character constant, or angle-bracketed header file
625 name. Handles embedded trigraphs and escaped newlines. The stored
626 string is guaranteed NUL-terminated, but it is not guaranteed that
627 this is the first NUL since embedded NULs are preserved.
629 When this function returns, buffer->cur points to the next
630 character to be processed. */
631 static void
632 parse_string (pfile, token, terminator)
633 cpp_reader *pfile;
634 cpp_token *token;
635 cppchar_t terminator;
637 cpp_buffer *buffer = pfile->buffer;
638 unsigned char *dest, *limit;
639 cppchar_t c;
640 bool warned_nulls = false;
641 #ifdef MULTIBYTE_CHARS
642 wchar_t wc;
643 int char_len;
644 #endif
646 dest = BUFF_FRONT (pfile->u_buff);
647 limit = BUFF_LIMIT (pfile->u_buff);
649 #ifdef MULTIBYTE_CHARS
650 /* Reset multibyte conversion state. */
651 (void) local_mbtowc (NULL, NULL, 0);
652 #endif
653 for (;;)
655 /* We need room for another char, possibly the terminating NUL. */
656 if ((size_t) (limit - dest) < 1)
658 size_t len_so_far = dest - BUFF_FRONT (pfile->u_buff);
659 _cpp_extend_buff (pfile, &pfile->u_buff, 2);
660 dest = BUFF_FRONT (pfile->u_buff) + len_so_far;
661 limit = BUFF_LIMIT (pfile->u_buff);
664 #ifdef MULTIBYTE_CHARS
665 char_len = local_mbtowc (&wc, (const char *) buffer->cur,
666 buffer->rlimit - buffer->cur);
667 if (char_len == -1)
669 cpp_error (pfile, DL_WARNING,
670 "ignoring invalid multibyte character");
671 char_len = 1;
672 c = *buffer->cur++;
674 else
676 buffer->cur += char_len;
677 c = wc;
679 #else
680 c = *buffer->cur++;
681 #endif
683 /* Handle trigraphs, escaped newlines etc. */
684 if (c == '?' || c == '\\')
685 c = skip_escaped_newlines (pfile);
687 if (c == terminator)
689 if (unescaped_terminator_p (pfile, dest))
690 break;
692 else if (is_vspace (c))
694 /* No string literal may extend over multiple lines. In
695 assembly language, suppress the error except for <>
696 includes. This is a kludge around not knowing where
697 comments are. */
698 unterminated:
699 if (CPP_OPTION (pfile, lang) != CLK_ASM || terminator == '>')
700 cpp_error (pfile, DL_ERROR, "missing terminating %c character",
701 (int) terminator);
702 buffer->cur--;
703 break;
705 else if (c == '\0')
707 if (buffer->cur - 1 == buffer->rlimit)
708 goto unterminated;
709 if (!warned_nulls)
711 warned_nulls = true;
712 cpp_error (pfile, DL_WARNING,
713 "null character(s) preserved in literal");
716 #ifdef MULTIBYTE_CHARS
717 if (char_len > 1)
719 for ( ; char_len > 0; --char_len)
720 *dest++ = (*buffer->cur - char_len);
722 else
723 #endif
724 *dest++ = c;
727 *dest = '\0';
729 token->val.str.text = BUFF_FRONT (pfile->u_buff);
730 token->val.str.len = dest - BUFF_FRONT (pfile->u_buff);
731 BUFF_FRONT (pfile->u_buff) = dest + 1;
734 /* The stored comment includes the comment start and any terminator. */
735 static void
736 save_comment (pfile, token, from, type)
737 cpp_reader *pfile;
738 cpp_token *token;
739 const unsigned char *from;
740 cppchar_t type;
742 unsigned char *buffer;
743 unsigned int len, clen;
745 len = pfile->buffer->cur - from + 1; /* + 1 for the initial '/'. */
747 /* C++ comments probably (not definitely) have moved past a new
748 line, which we don't want to save in the comment. */
749 if (is_vspace (pfile->buffer->cur[-1]))
750 len--;
752 /* If we are currently in a directive, then we need to store all
753 C++ comments as C comments internally, and so we need to
754 allocate a little extra space in that case.
756 Note that the only time we encounter a directive here is
757 when we are saving comments in a "#define". */
758 clen = (pfile->state.in_directive && type == '/') ? len + 2 : len;
760 buffer = _cpp_unaligned_alloc (pfile, clen);
762 token->type = CPP_COMMENT;
763 token->val.str.len = clen;
764 token->val.str.text = buffer;
766 buffer[0] = '/';
767 memcpy (buffer + 1, from, len - 1);
769 /* Finish conversion to a C comment, if necessary. */
770 if (pfile->state.in_directive && type == '/')
772 buffer[1] = '*';
773 buffer[clen - 2] = '*';
774 buffer[clen - 1] = '/';
778 /* Allocate COUNT tokens for RUN. */
779 void
780 _cpp_init_tokenrun (run, count)
781 tokenrun *run;
782 unsigned int count;
784 run->base = xnewvec (cpp_token, count);
785 run->limit = run->base + count;
786 run->next = NULL;
789 /* Returns the next tokenrun, or creates one if there is none. */
790 static tokenrun *
791 next_tokenrun (run)
792 tokenrun *run;
794 if (run->next == NULL)
796 run->next = xnew (tokenrun);
797 run->next->prev = run;
798 _cpp_init_tokenrun (run->next, 250);
801 return run->next;
804 /* Allocate a single token that is invalidated at the same time as the
805 rest of the tokens on the line. Has its line and col set to the
806 same as the last lexed token, so that diagnostics appear in the
807 right place. */
808 cpp_token *
809 _cpp_temp_token (pfile)
810 cpp_reader *pfile;
812 cpp_token *old, *result;
814 old = pfile->cur_token - 1;
815 if (pfile->cur_token == pfile->cur_run->limit)
817 pfile->cur_run = next_tokenrun (pfile->cur_run);
818 pfile->cur_token = pfile->cur_run->base;
821 result = pfile->cur_token++;
822 result->line = old->line;
823 result->col = old->col;
824 return result;
827 /* Lex a token into RESULT (external interface). Takes care of issues
828 like directive handling, token lookahead, multiple include
829 optimization and skipping. */
830 const cpp_token *
831 _cpp_lex_token (pfile)
832 cpp_reader *pfile;
834 cpp_token *result;
836 for (;;)
838 if (pfile->cur_token == pfile->cur_run->limit)
840 pfile->cur_run = next_tokenrun (pfile->cur_run);
841 pfile->cur_token = pfile->cur_run->base;
844 if (pfile->lookaheads)
846 pfile->lookaheads--;
847 result = pfile->cur_token++;
849 else
850 result = _cpp_lex_direct (pfile);
852 if (result->flags & BOL)
854 /* Is this a directive. If _cpp_handle_directive returns
855 false, it is an assembler #. */
856 if (result->type == CPP_HASH
857 /* 6.10.3 p 11: Directives in a list of macro arguments
858 gives undefined behavior. This implementation
859 handles the directive as normal. */
860 && pfile->state.parsing_args != 1
861 && _cpp_handle_directive (pfile, result->flags & PREV_WHITE))
862 continue;
863 if (pfile->cb.line_change && !pfile->state.skipping)
864 (*pfile->cb.line_change)(pfile, result, pfile->state.parsing_args);
867 /* We don't skip tokens in directives. */
868 if (pfile->state.in_directive)
869 break;
871 /* Outside a directive, invalidate controlling macros. At file
872 EOF, _cpp_lex_direct takes care of popping the buffer, so we never
873 get here and MI optimisation works. */
874 pfile->mi_valid = false;
876 if (!pfile->state.skipping || result->type == CPP_EOF)
877 break;
880 return result;
883 /* A NUL terminates the current buffer. For ISO preprocessing this is
884 EOF, but for traditional preprocessing it indicates we need a line
885 refill. Returns TRUE to continue preprocessing a new buffer, FALSE
886 to return a CPP_EOF to the caller. */
887 static bool
888 continue_after_nul (pfile)
889 cpp_reader *pfile;
891 cpp_buffer *buffer = pfile->buffer;
892 bool more = false;
894 buffer->saved_flags = BOL;
895 if (CPP_OPTION (pfile, traditional))
897 if (pfile->state.in_directive)
898 return false;
900 _cpp_remove_overlay (pfile);
901 more = _cpp_read_logical_line_trad (pfile);
902 _cpp_overlay_buffer (pfile, pfile->out.base,
903 pfile->out.cur - pfile->out.base);
904 pfile->line = pfile->out.first_line;
906 else
908 /* Stop parsing arguments with a CPP_EOF. When we finally come
909 back here, do the work of popping the buffer. */
910 if (!pfile->state.parsing_args)
912 if (buffer->cur != buffer->line_base)
914 /* Non-empty files should end in a newline. Don't warn
915 for command line and _Pragma buffers. */
916 if (!buffer->from_stage3)
917 cpp_error (pfile, DL_PEDWARN, "no newline at end of file");
918 handle_newline (pfile);
921 /* Similarly, finish an in-progress directive with CPP_EOF
922 before popping the buffer. */
923 if (!pfile->state.in_directive && buffer->prev)
925 more = !buffer->return_at_eof;
926 _cpp_pop_buffer (pfile);
931 return more;
/* Set result->type to THEN_TYPE if the next effective character is
   CHAR (consuming it); otherwise un-read that character and set
   result->type to ELSE_TYPE.  Requires locals named `pfile', `buffer'
   and `result' to be in scope at the point of use.  */
#define IF_NEXT_IS(CHAR, THEN_TYPE, ELSE_TYPE)  \
  do {                                          \
    if (get_effective_char (pfile) == CHAR)     \
      result->type = THEN_TYPE;                 \
    else                                        \
      {                                         \
        BACKUP ();                              \
        result->type = ELSE_TYPE;               \
      }                                         \
  } while (0)
945 /* Lex a token into pfile->cur_token, which is also incremented, to
946 get diagnostics pointing to the correct location.
948 Does not handle issues such as token lookahead, multiple-include
949 optimisation, directives, skipping etc. This function is only
950 suitable for use by _cpp_lex_token, and in special cases like
951 lex_expansion_token which doesn't care for any of these issues.
953 When meeting a newline, returns CPP_EOF if parsing a directive,
954 otherwise returns to the start of the token buffer if permissible.
955 Returns the location of the lexed token. */
956 cpp_token *
957 _cpp_lex_direct (pfile)
958 cpp_reader *pfile;
960 cppchar_t c;
961 cpp_buffer *buffer;
962 const unsigned char *comment_start;
963 cpp_token *result = pfile->cur_token++;
965 fresh_line:
966 buffer = pfile->buffer;
967 result->flags = buffer->saved_flags;
968 buffer->saved_flags = 0;
969 update_tokens_line:
970 result->line = pfile->line;
972 skipped_white:
973 c = *buffer->cur++;
974 result->col = CPP_BUF_COLUMN (buffer, buffer->cur);
976 trigraph:
977 switch (c)
979 case ' ': case '\t': case '\f': case '\v': case '\0':
980 result->flags |= PREV_WHITE;
981 if (skip_whitespace (pfile, c))
982 goto skipped_white;
984 /* End of buffer. */
985 buffer->cur--;
986 if (continue_after_nul (pfile))
987 goto fresh_line;
988 result->type = CPP_EOF;
989 break;
991 case '\n': case '\r':
992 handle_newline (pfile);
993 buffer->saved_flags = BOL;
994 if (! pfile->state.in_directive)
996 if (pfile->state.parsing_args == 2)
997 buffer->saved_flags |= PREV_WHITE;
998 if (!pfile->keep_tokens)
1000 pfile->cur_run = &pfile->base_run;
1001 result = pfile->base_run.base;
1002 pfile->cur_token = result + 1;
1004 goto fresh_line;
1006 result->type = CPP_EOF;
1007 break;
1009 case '?':
1010 case '\\':
1011 /* These could start an escaped newline, or '?' a trigraph. Let
1012 skip_escaped_newlines do all the work. */
1014 unsigned int line = pfile->line;
1016 c = skip_escaped_newlines (pfile);
1017 if (line != pfile->line)
1019 buffer->cur--;
1020 /* We had at least one escaped newline of some sort.
1021 Update the token's line and column. */
1022 goto update_tokens_line;
1026 /* We are either the original '?' or '\\', or a trigraph. */
1027 if (c == '?')
1028 result->type = CPP_QUERY;
1029 else if (c == '\\')
1030 goto random_char;
1031 else
1032 goto trigraph;
1033 break;
1035 case '0': case '1': case '2': case '3': case '4':
1036 case '5': case '6': case '7': case '8': case '9':
1037 result->type = CPP_NUMBER;
1038 parse_number (pfile, &result->val.str, 0);
1039 break;
1041 case 'L':
1042 /* 'L' may introduce wide characters or strings. */
1044 const unsigned char *pos = buffer->cur;
1046 c = get_effective_char (pfile);
1047 if (c == '\'' || c == '"')
1049 result->type = (c == '"' ? CPP_WSTRING: CPP_WCHAR);
1050 parse_string (pfile, result, c);
1051 break;
1053 buffer->cur = pos;
1055 /* Fall through. */
1057 start_ident:
1058 case '_':
1059 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
1060 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
1061 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
1062 case 's': case 't': case 'u': case 'v': case 'w': case 'x':
1063 case 'y': case 'z':
1064 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
1065 case 'G': case 'H': case 'I': case 'J': case 'K':
1066 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
1067 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
1068 case 'Y': case 'Z':
1069 result->type = CPP_NAME;
1070 result->val.node = parse_identifier (pfile);
1072 /* Convert named operators to their proper types. */
1073 if (result->val.node->flags & NODE_OPERATOR)
1075 result->flags |= NAMED_OP;
1076 result->type = result->val.node->directive_index;
1078 break;
1080 case '\'':
1081 case '"':
1082 result->type = c == '"' ? CPP_STRING: CPP_CHAR;
1083 parse_string (pfile, result, c);
1084 break;
1086 case '/':
1087 /* A potential block or line comment. */
1088 comment_start = buffer->cur;
1089 c = get_effective_char (pfile);
1091 if (c == '*')
1093 if (skip_block_comment (pfile))
1094 cpp_error (pfile, DL_ERROR, "unterminated comment");
1096 else if (c == '/' && (CPP_OPTION (pfile, cplusplus_comments)
1097 || CPP_IN_SYSTEM_HEADER (pfile)))
1099 /* Warn about comments only if pedantically GNUC89, and not
1100 in system headers. */
1101 if (CPP_OPTION (pfile, lang) == CLK_GNUC89 && CPP_PEDANTIC (pfile)
1102 && ! buffer->warned_cplusplus_comments)
1104 cpp_error (pfile, DL_PEDWARN,
1105 "C++ style comments are not allowed in ISO C90");
1106 cpp_error (pfile, DL_PEDWARN,
1107 "(this will be reported only once per input file)");
1108 buffer->warned_cplusplus_comments = 1;
1111 if (skip_line_comment (pfile) && CPP_OPTION (pfile, warn_comments))
1112 cpp_error (pfile, DL_WARNING, "multi-line comment");
1114 else if (c == '=')
1116 result->type = CPP_DIV_EQ;
1117 break;
1119 else
1121 BACKUP ();
1122 result->type = CPP_DIV;
1123 break;
1126 if (!pfile->state.save_comments)
1128 result->flags |= PREV_WHITE;
1129 goto update_tokens_line;
1132 /* Save the comment as a token in its own right. */
1133 save_comment (pfile, result, comment_start, c);
1134 break;
1136 case '<':
1137 if (pfile->state.angled_headers)
1139 result->type = CPP_HEADER_NAME;
1140 parse_string (pfile, result, '>');
1141 break;
1144 c = get_effective_char (pfile);
1145 if (c == '=')
1146 result->type = CPP_LESS_EQ;
1147 else if (c == '<')
1148 IF_NEXT_IS ('=', CPP_LSHIFT_EQ, CPP_LSHIFT);
1149 else if (c == '?' && CPP_OPTION (pfile, cplusplus))
1150 IF_NEXT_IS ('=', CPP_MIN_EQ, CPP_MIN);
1151 else if (c == ':' && CPP_OPTION (pfile, digraphs))
1153 result->type = CPP_OPEN_SQUARE;
1154 result->flags |= DIGRAPH;
1156 else if (c == '%' && CPP_OPTION (pfile, digraphs))
1158 result->type = CPP_OPEN_BRACE;
1159 result->flags |= DIGRAPH;
1161 else
1163 BACKUP ();
1164 result->type = CPP_LESS;
1166 break;
1168 case '>':
1169 c = get_effective_char (pfile);
1170 if (c == '=')
1171 result->type = CPP_GREATER_EQ;
1172 else if (c == '>')
1173 IF_NEXT_IS ('=', CPP_RSHIFT_EQ, CPP_RSHIFT);
1174 else if (c == '?' && CPP_OPTION (pfile, cplusplus))
1175 IF_NEXT_IS ('=', CPP_MAX_EQ, CPP_MAX);
1176 else
1178 BACKUP ();
1179 result->type = CPP_GREATER;
1181 break;
1183 case '%':
1184 c = get_effective_char (pfile);
1185 if (c == '=')
1186 result->type = CPP_MOD_EQ;
1187 else if (CPP_OPTION (pfile, digraphs) && c == ':')
1189 result->flags |= DIGRAPH;
1190 result->type = CPP_HASH;
1191 if (get_effective_char (pfile) == '%')
1193 const unsigned char *pos = buffer->cur;
1195 if (get_effective_char (pfile) == ':')
1196 result->type = CPP_PASTE;
1197 else
1198 buffer->cur = pos - 1;
1200 else
1201 BACKUP ();
1203 else if (CPP_OPTION (pfile, digraphs) && c == '>')
1205 result->flags |= DIGRAPH;
1206 result->type = CPP_CLOSE_BRACE;
1208 else
1210 BACKUP ();
1211 result->type = CPP_MOD;
1213 break;
1215 case '.':
1216 result->type = CPP_DOT;
1217 c = get_effective_char (pfile);
1218 if (c == '.')
1220 const unsigned char *pos = buffer->cur;
1222 if (get_effective_char (pfile) == '.')
1223 result->type = CPP_ELLIPSIS;
1224 else
1225 buffer->cur = pos - 1;
1227 /* All known character sets have 0...9 contiguous. */
1228 else if (ISDIGIT (c))
1230 result->type = CPP_NUMBER;
1231 parse_number (pfile, &result->val.str, 1);
1233 else if (c == '*' && CPP_OPTION (pfile, cplusplus))
1234 result->type = CPP_DOT_STAR;
1235 else
1236 BACKUP ();
1237 break;
1239 case '+':
1240 c = get_effective_char (pfile);
1241 if (c == '+')
1242 result->type = CPP_PLUS_PLUS;
1243 else if (c == '=')
1244 result->type = CPP_PLUS_EQ;
1245 else
1247 BACKUP ();
1248 result->type = CPP_PLUS;
1250 break;
1252 case '-':
1253 c = get_effective_char (pfile);
1254 if (c == '>')
1256 result->type = CPP_DEREF;
1257 if (CPP_OPTION (pfile, cplusplus))
1259 if (get_effective_char (pfile) == '*')
1260 result->type = CPP_DEREF_STAR;
1261 else
1262 BACKUP ();
1265 else if (c == '-')
1266 result->type = CPP_MINUS_MINUS;
1267 else if (c == '=')
1268 result->type = CPP_MINUS_EQ;
1269 else
1271 BACKUP ();
1272 result->type = CPP_MINUS;
1274 break;
1276 case '&':
1277 c = get_effective_char (pfile);
1278 if (c == '&')
1279 result->type = CPP_AND_AND;
1280 else if (c == '=')
1281 result->type = CPP_AND_EQ;
1282 else
1284 BACKUP ();
1285 result->type = CPP_AND;
1287 break;
1289 case '|':
1290 c = get_effective_char (pfile);
1291 if (c == '|')
1292 result->type = CPP_OR_OR;
1293 else if (c == '=')
1294 result->type = CPP_OR_EQ;
1295 else
1297 BACKUP ();
1298 result->type = CPP_OR;
1300 break;
1302 case ':':
1303 c = get_effective_char (pfile);
1304 if (c == ':' && CPP_OPTION (pfile, cplusplus))
1305 result->type = CPP_SCOPE;
1306 else if (c == '>' && CPP_OPTION (pfile, digraphs))
1308 result->flags |= DIGRAPH;
1309 result->type = CPP_CLOSE_SQUARE;
1311 else
1313 BACKUP ();
1314 result->type = CPP_COLON;
1316 break;
1318 case '*': IF_NEXT_IS ('=', CPP_MULT_EQ, CPP_MULT); break;
1319 case '=': IF_NEXT_IS ('=', CPP_EQ_EQ, CPP_EQ); break;
1320 case '!': IF_NEXT_IS ('=', CPP_NOT_EQ, CPP_NOT); break;
1321 case '^': IF_NEXT_IS ('=', CPP_XOR_EQ, CPP_XOR); break;
1322 case '#': IF_NEXT_IS ('#', CPP_PASTE, CPP_HASH); break;
1324 case '~': result->type = CPP_COMPL; break;
1325 case ',': result->type = CPP_COMMA; break;
1326 case '(': result->type = CPP_OPEN_PAREN; break;
1327 case ')': result->type = CPP_CLOSE_PAREN; break;
1328 case '[': result->type = CPP_OPEN_SQUARE; break;
1329 case ']': result->type = CPP_CLOSE_SQUARE; break;
1330 case '{': result->type = CPP_OPEN_BRACE; break;
1331 case '}': result->type = CPP_CLOSE_BRACE; break;
1332 case ';': result->type = CPP_SEMICOLON; break;
1334 /* @ is a punctuator in Objective-C. */
1335 case '@': result->type = CPP_ATSIGN; break;
1337 case '$':
1338 if (CPP_OPTION (pfile, dollars_in_ident))
1339 goto start_ident;
1340 /* Fall through... */
1342 random_char:
1343 default:
1344 result->type = CPP_OTHER;
1345 result->val.c = c;
1346 break;
1349 return result;
1352 /* An upper bound on the number of bytes needed to spell TOKEN,
1353 including preceding whitespace. */
1354 unsigned int
1355 cpp_token_len (token)
1356 const cpp_token *token;
1358 unsigned int len;
1360 switch (TOKEN_SPELL (token))
1362 default: len = 0; break;
1363 case SPELL_NUMBER:
1364 case SPELL_STRING: len = token->val.str.len; break;
1365 case SPELL_IDENT: len = NODE_LEN (token->val.node); break;
1367 /* 1 for whitespace, 4 for comment delimiters. */
1368 return len + 5;
1371 /* Write the spelling of a token TOKEN to BUFFER. The buffer must
1372 already contain the enough space to hold the token's spelling.
1373 Returns a pointer to the character after the last character
1374 written. */
1375 unsigned char *
1376 cpp_spell_token (pfile, token, buffer)
1377 cpp_reader *pfile; /* Would be nice to be rid of this... */
1378 const cpp_token *token;
1379 unsigned char *buffer;
1381 switch (TOKEN_SPELL (token))
1383 case SPELL_OPERATOR:
1385 const unsigned char *spelling;
1386 unsigned char c;
1388 if (token->flags & DIGRAPH)
1389 spelling
1390 = digraph_spellings[(int) token->type - (int) CPP_FIRST_DIGRAPH];
1391 else if (token->flags & NAMED_OP)
1392 goto spell_ident;
1393 else
1394 spelling = TOKEN_NAME (token);
1396 while ((c = *spelling++) != '\0')
1397 *buffer++ = c;
1399 break;
1401 case SPELL_CHAR:
1402 *buffer++ = token->val.c;
1403 break;
1405 spell_ident:
1406 case SPELL_IDENT:
1407 memcpy (buffer, NODE_NAME (token->val.node), NODE_LEN (token->val.node));
1408 buffer += NODE_LEN (token->val.node);
1409 break;
1411 case SPELL_NUMBER:
1412 memcpy (buffer, token->val.str.text, token->val.str.len);
1413 buffer += token->val.str.len;
1414 break;
1416 case SPELL_STRING:
1418 int left, right, tag;
1419 switch (token->type)
1421 case CPP_STRING: left = '"'; right = '"'; tag = '\0'; break;
1422 case CPP_WSTRING: left = '"'; right = '"'; tag = 'L'; break;
1423 case CPP_CHAR: left = '\''; right = '\''; tag = '\0'; break;
1424 case CPP_WCHAR: left = '\''; right = '\''; tag = 'L'; break;
1425 case CPP_HEADER_NAME: left = '<'; right = '>'; tag = '\0'; break;
1426 default:
1427 cpp_error (pfile, DL_ICE, "unknown string token %s\n",
1428 TOKEN_NAME (token));
1429 return buffer;
1431 if (tag) *buffer++ = tag;
1432 *buffer++ = left;
1433 memcpy (buffer, token->val.str.text, token->val.str.len);
1434 buffer += token->val.str.len;
1435 *buffer++ = right;
1437 break;
1439 case SPELL_NONE:
1440 cpp_error (pfile, DL_ICE, "unspellable token %s", TOKEN_NAME (token));
1441 break;
1444 return buffer;
1447 /* Returns TOKEN spelt as a null-terminated string. The string is
1448 freed when the reader is destroyed. Useful for diagnostics. */
1449 unsigned char *
1450 cpp_token_as_text (pfile, token)
1451 cpp_reader *pfile;
1452 const cpp_token *token;
1454 unsigned int len = cpp_token_len (token);
1455 unsigned char *start = _cpp_unaligned_alloc (pfile, len), *end;
1457 end = cpp_spell_token (pfile, token, start);
1458 end[0] = '\0';
1460 return start;
1463 /* Used by C front ends, which really should move to using
1464 cpp_token_as_text. */
1465 const char *
1466 cpp_type2name (type)
1467 enum cpp_ttype type;
1469 return (const char *) token_spellings[type].name;
1472 /* Writes the spelling of token to FP, without any preceding space.
1473 Separated from cpp_spell_token for efficiency - to avoid stdio
1474 double-buffering. */
1475 void
1476 cpp_output_token (token, fp)
1477 const cpp_token *token;
1478 FILE *fp;
1480 switch (TOKEN_SPELL (token))
1482 case SPELL_OPERATOR:
1484 const unsigned char *spelling;
1485 int c;
1487 if (token->flags & DIGRAPH)
1488 spelling
1489 = digraph_spellings[(int) token->type - (int) CPP_FIRST_DIGRAPH];
1490 else if (token->flags & NAMED_OP)
1491 goto spell_ident;
1492 else
1493 spelling = TOKEN_NAME (token);
1495 c = *spelling;
1497 putc (c, fp);
1498 while ((c = *++spelling) != '\0');
1500 break;
1502 case SPELL_CHAR:
1503 putc (token->val.c, fp);
1504 break;
1506 spell_ident:
1507 case SPELL_IDENT:
1508 fwrite (NODE_NAME (token->val.node), 1, NODE_LEN (token->val.node), fp);
1509 break;
1511 case SPELL_NUMBER:
1512 fwrite (token->val.str.text, 1, token->val.str.len, fp);
1513 break;
1515 case SPELL_STRING:
1517 int left, right, tag;
1518 switch (token->type)
1520 case CPP_STRING: left = '"'; right = '"'; tag = '\0'; break;
1521 case CPP_WSTRING: left = '"'; right = '"'; tag = 'L'; break;
1522 case CPP_CHAR: left = '\''; right = '\''; tag = '\0'; break;
1523 case CPP_WCHAR: left = '\''; right = '\''; tag = 'L'; break;
1524 case CPP_HEADER_NAME: left = '<'; right = '>'; tag = '\0'; break;
1525 default:
1526 fprintf (stderr, "impossible STRING token %s\n", TOKEN_NAME (token));
1527 return;
1529 if (tag) putc (tag, fp);
1530 putc (left, fp);
1531 fwrite (token->val.str.text, 1, token->val.str.len, fp);
1532 putc (right, fp);
1534 break;
1536 case SPELL_NONE:
1537 /* An error, most probably. */
1538 break;
1542 /* Compare two tokens. */
1544 _cpp_equiv_tokens (a, b)
1545 const cpp_token *a, *b;
1547 if (a->type == b->type && a->flags == b->flags)
1548 switch (TOKEN_SPELL (a))
1550 default: /* Keep compiler happy. */
1551 case SPELL_OPERATOR:
1552 return 1;
1553 case SPELL_CHAR:
1554 return a->val.c == b->val.c; /* Character. */
1555 case SPELL_NONE:
1556 return (a->type != CPP_MACRO_ARG || a->val.arg_no == b->val.arg_no);
1557 case SPELL_IDENT:
1558 return a->val.node == b->val.node;
1559 case SPELL_NUMBER:
1560 case SPELL_STRING:
1561 return (a->val.str.len == b->val.str.len
1562 && !memcmp (a->val.str.text, b->val.str.text,
1563 a->val.str.len));
1566 return 0;
1569 /* Returns nonzero if a space should be inserted to avoid an
1570 accidental token paste for output. For simplicity, it is
1571 conservative, and occasionally advises a space where one is not
1572 needed, e.g. "." and ".2". */
1574 cpp_avoid_paste (pfile, token1, token2)
1575 cpp_reader *pfile;
1576 const cpp_token *token1, *token2;
1578 enum cpp_ttype a = token1->type, b = token2->type;
1579 cppchar_t c;
1581 if (token1->flags & NAMED_OP)
1582 a = CPP_NAME;
1583 if (token2->flags & NAMED_OP)
1584 b = CPP_NAME;
1586 c = EOF;
1587 if (token2->flags & DIGRAPH)
1588 c = digraph_spellings[(int) b - (int) CPP_FIRST_DIGRAPH][0];
1589 else if (token_spellings[b].category == SPELL_OPERATOR)
1590 c = token_spellings[b].name[0];
1592 /* Quickly get everything that can paste with an '='. */
1593 if ((int) a <= (int) CPP_LAST_EQ && c == '=')
1594 return 1;
1596 switch (a)
1598 case CPP_GREATER: return c == '>' || c == '?';
1599 case CPP_LESS: return c == '<' || c == '?' || c == '%' || c == ':';
1600 case CPP_PLUS: return c == '+';
1601 case CPP_MINUS: return c == '-' || c == '>';
1602 case CPP_DIV: return c == '/' || c == '*'; /* Comments. */
1603 case CPP_MOD: return c == ':' || c == '>';
1604 case CPP_AND: return c == '&';
1605 case CPP_OR: return c == '|';
1606 case CPP_COLON: return c == ':' || c == '>';
1607 case CPP_DEREF: return c == '*';
1608 case CPP_DOT: return c == '.' || c == '%' || b == CPP_NUMBER;
1609 case CPP_HASH: return c == '#' || c == '%'; /* Digraph form. */
1610 case CPP_NAME: return ((b == CPP_NUMBER
1611 && name_p (pfile, &token2->val.str))
1612 || b == CPP_NAME
1613 || b == CPP_CHAR || b == CPP_STRING); /* L */
1614 case CPP_NUMBER: return (b == CPP_NUMBER || b == CPP_NAME
1615 || c == '.' || c == '+' || c == '-');
1616 case CPP_OTHER: return (CPP_OPTION (pfile, objc)
1617 && token1->val.c == '@'
1618 && (b == CPP_NAME || b == CPP_STRING));
1619 default: break;
1622 return 0;
1625 /* Output all the remaining tokens on the current line, and a newline
1626 character, to FP. Leading whitespace is removed. If there are
1627 macros, special token padding is not performed. */
1628 void
1629 cpp_output_line (pfile, fp)
1630 cpp_reader *pfile;
1631 FILE *fp;
1633 const cpp_token *token;
1635 token = cpp_get_token (pfile);
1636 while (token->type != CPP_EOF)
1638 cpp_output_token (token, fp);
1639 token = cpp_get_token (pfile);
1640 if (token->flags & PREV_WHITE)
1641 putc (' ', fp);
1644 putc ('\n', fp);
/* Return the numeric value of the hexadecimal digit C.  Aborts if C
   is not a hexadecimal digit.  */
static unsigned int
hex_digit_value (c)
     unsigned int c;
{
  if (!hex_p (c))
    abort ();

  return hex_value (c);
}
1658 /* Parse a '\uNNNN' or '\UNNNNNNNN' sequence. Returns 1 to indicate
1659 failure if cpplib is not parsing C++ or C99. Such failure is
1660 silent, and no variables are updated. Otherwise returns 0, and
1661 warns if -Wtraditional.
1663 [lex.charset]: The character designated by the universal character
1664 name \UNNNNNNNN is that character whose character short name in
1665 ISO/IEC 10646 is NNNNNNNN; the character designated by the
1666 universal character name \uNNNN is that character whose character
1667 short name in ISO/IEC 10646 is 0000NNNN. If the hexadecimal value
1668 for a universal character name is less than 0x20 or in the range
1669 0x7F-0x9F (inclusive), or if the universal character name
1670 designates a character in the basic source character set, then the
1671 program is ill-formed.
1673 We assume that wchar_t is Unicode, so we don't need to do any
1674 mapping. Is this ever wrong?
1676 PC points to the 'u' or 'U', PSTR is points to the byte after PC,
1677 LIMIT is the end of the string or charconst. PSTR is updated to
1678 point after the UCS on return, and the UCS is written into PC. */
1680 static int
1681 maybe_read_ucs (pfile, pstr, limit, pc)
1682 cpp_reader *pfile;
1683 const unsigned char **pstr;
1684 const unsigned char *limit;
1685 cppchar_t *pc;
1687 const unsigned char *p = *pstr;
1688 unsigned int code = 0;
1689 unsigned int c = *pc, length;
1691 /* Only attempt to interpret a UCS for C++ and C99. */
1692 if (! (CPP_OPTION (pfile, cplusplus) || CPP_OPTION (pfile, c99)))
1693 return 1;
1695 if (CPP_WTRADITIONAL (pfile))
1696 cpp_error (pfile, DL_WARNING,
1697 "the meaning of '\\%c' is different in traditional C", c);
1699 length = (c == 'u' ? 4: 8);
1701 if ((size_t) (limit - p) < length)
1703 cpp_error (pfile, DL_ERROR, "incomplete universal-character-name");
1704 /* Skip to the end to avoid more diagnostics. */
1705 p = limit;
1707 else
1709 for (; length; length--, p++)
1711 c = *p;
1712 if (ISXDIGIT (c))
1713 code = (code << 4) + hex_digit_value (c);
1714 else
1716 cpp_error (pfile, DL_ERROR,
1717 "non-hex digit '%c' in universal-character-name", c);
1718 /* We shouldn't skip in case there are multibyte chars. */
1719 break;
1724 #ifdef TARGET_EBCDIC
1725 cpp_error (pfile, DL_ERROR, "universal-character-name on EBCDIC target");
1726 code = 0x3f; /* EBCDIC invalid character */
1727 #else
1728 /* True extended characters are OK. */
1729 if (code >= 0xa0
1730 && !(code & 0x80000000)
1731 && !(code >= 0xD800 && code <= 0xDFFF))
1733 /* The standard permits $, @ and ` to be specified as UCNs. We use
1734 hex escapes so that this also works with EBCDIC hosts. */
1735 else if (code == 0x24 || code == 0x40 || code == 0x60)
1737 /* Don't give another error if one occurred above. */
1738 else if (length == 0)
1739 cpp_error (pfile, DL_ERROR, "universal-character-name out of range");
1740 #endif
1742 *pstr = p;
1743 *pc = code;
1744 return 0;
1747 /* Returns the value of an escape sequence, truncated to the correct
1748 target precision. PSTR points to the input pointer, which is just
1749 after the backslash. LIMIT is how much text we have. WIDE is true
1750 if the escape sequence is part of a wide character constant or
1751 string literal. Handles all relevant diagnostics. */
1752 cppchar_t
1753 cpp_parse_escape (pfile, pstr, limit, wide)
1754 cpp_reader *pfile;
1755 const unsigned char **pstr;
1756 const unsigned char *limit;
1757 int wide;
1759 int unknown = 0;
1760 const unsigned char *str = *pstr;
1761 cppchar_t c, mask;
1762 unsigned int width;
1764 if (wide)
1765 width = CPP_OPTION (pfile, wchar_precision);
1766 else
1767 width = CPP_OPTION (pfile, char_precision);
1768 if (width < BITS_PER_CPPCHAR_T)
1769 mask = ((cppchar_t) 1 << width) - 1;
1770 else
1771 mask = ~0;
1773 c = *str++;
1774 switch (c)
1776 case '\\': case '\'': case '"': case '?': break;
1777 case 'b': c = TARGET_BS; break;
1778 case 'f': c = TARGET_FF; break;
1779 case 'n': c = TARGET_NEWLINE; break;
1780 case 'r': c = TARGET_CR; break;
1781 case 't': c = TARGET_TAB; break;
1782 case 'v': c = TARGET_VT; break;
1784 case '(': case '{': case '[': case '%':
1785 /* '\(', etc, are used at beginning of line to avoid confusing Emacs.
1786 '\%' is used to prevent SCCS from getting confused. */
1787 unknown = CPP_PEDANTIC (pfile);
1788 break;
1790 case 'a':
1791 if (CPP_WTRADITIONAL (pfile))
1792 cpp_error (pfile, DL_WARNING,
1793 "the meaning of '\\a' is different in traditional C");
1794 c = TARGET_BELL;
1795 break;
1797 case 'e': case 'E':
1798 if (CPP_PEDANTIC (pfile))
1799 cpp_error (pfile, DL_PEDWARN,
1800 "non-ISO-standard escape sequence, '\\%c'", (int) c);
1801 c = TARGET_ESC;
1802 break;
1804 case 'u': case 'U':
1805 unknown = maybe_read_ucs (pfile, &str, limit, &c);
1806 break;
1808 case 'x':
1809 if (CPP_WTRADITIONAL (pfile))
1810 cpp_error (pfile, DL_WARNING,
1811 "the meaning of '\\x' is different in traditional C");
1814 cppchar_t i = 0, overflow = 0;
1815 int digits_found = 0;
1817 while (str < limit)
1819 c = *str;
1820 if (! ISXDIGIT (c))
1821 break;
1822 str++;
1823 overflow |= i ^ (i << 4 >> 4);
1824 i = (i << 4) + hex_digit_value (c);
1825 digits_found = 1;
1828 if (!digits_found)
1829 cpp_error (pfile, DL_ERROR,
1830 "\\x used with no following hex digits");
1832 if (overflow | (i != (i & mask)))
1834 cpp_error (pfile, DL_PEDWARN,
1835 "hex escape sequence out of range");
1836 i &= mask;
1838 c = i;
1840 break;
1842 case '0': case '1': case '2': case '3':
1843 case '4': case '5': case '6': case '7':
1845 size_t count = 0;
1846 cppchar_t i = c - '0';
1848 while (str < limit && ++count < 3)
1850 c = *str;
1851 if (c < '0' || c > '7')
1852 break;
1853 str++;
1854 i = (i << 3) + c - '0';
1857 if (i != (i & mask))
1859 cpp_error (pfile, DL_PEDWARN,
1860 "octal escape sequence out of range");
1861 i &= mask;
1863 c = i;
1865 break;
1867 default:
1868 unknown = 1;
1869 break;
1872 if (unknown)
1874 if (ISGRAPH (c))
1875 cpp_error (pfile, DL_PEDWARN,
1876 "unknown escape sequence '\\%c'", (int) c);
1877 else
1878 cpp_error (pfile, DL_PEDWARN,
1879 "unknown escape sequence: '\\%03o'", (int) c);
1882 if (c > mask)
1884 cpp_error (pfile, DL_PEDWARN, "escape sequence out of range for its type");
1885 c &= mask;
1888 *pstr = str;
1889 return c;
1892 /* Interpret a (possibly wide) character constant in TOKEN.
1893 WARN_MULTI warns about multi-character charconsts. PCHARS_SEEN
1894 points to a variable that is filled in with the number of
1895 characters seen, and UNSIGNEDP to a variable that indicates whether
1896 the result has signed type. */
1897 cppchar_t
1898 cpp_interpret_charconst (pfile, token, pchars_seen, unsignedp)
1899 cpp_reader *pfile;
1900 const cpp_token *token;
1901 unsigned int *pchars_seen;
1902 int *unsignedp;
1904 const unsigned char *str = token->val.str.text;
1905 const unsigned char *limit = str + token->val.str.len;
1906 unsigned int chars_seen = 0;
1907 size_t width, max_chars;
1908 cppchar_t c, mask, result = 0;
1909 bool unsigned_p;
1911 #ifdef MULTIBYTE_CHARS
1912 (void) local_mbtowc (NULL, NULL, 0);
1913 #endif
1915 /* Width in bits. */
1916 if (token->type == CPP_CHAR)
1918 width = CPP_OPTION (pfile, char_precision);
1919 max_chars = CPP_OPTION (pfile, int_precision) / width;
1920 unsigned_p = CPP_OPTION (pfile, unsigned_char);
1922 else
1924 width = CPP_OPTION (pfile, wchar_precision);
1925 max_chars = 1;
1926 unsigned_p = CPP_OPTION (pfile, unsigned_wchar);
1929 if (width < BITS_PER_CPPCHAR_T)
1930 mask = ((cppchar_t) 1 << width) - 1;
1931 else
1932 mask = ~0;
1934 while (str < limit)
1936 #ifdef MULTIBYTE_CHARS
1937 wchar_t wc;
1938 int char_len;
1940 char_len = local_mbtowc (&wc, (const char *)str, limit - str);
1941 if (char_len == -1)
1943 cpp_error (pfile, DL_WARNING,
1944 "ignoring invalid multibyte character");
1945 c = *str++;
1947 else
1949 str += char_len;
1950 c = wc;
1952 #else
1953 c = *str++;
1954 #endif
1956 if (c == '\\')
1957 c = cpp_parse_escape (pfile, &str, limit, token->type == CPP_WCHAR);
1959 #ifdef MAP_CHARACTER
1960 if (ISPRINT (c))
1961 c = MAP_CHARACTER (c);
1962 #endif
1964 chars_seen++;
1966 /* Truncate the character, scale the result and merge the two. */
1967 c &= mask;
1968 if (width < BITS_PER_CPPCHAR_T)
1969 result = (result << width) | c;
1970 else
1971 result = c;
1974 if (chars_seen == 0)
1975 cpp_error (pfile, DL_ERROR, "empty character constant");
1976 else if (chars_seen > 1)
1978 /* Multichar charconsts are of type int and therefore signed. */
1979 unsigned_p = 0;
1981 if (chars_seen > max_chars)
1983 chars_seen = max_chars;
1984 cpp_error (pfile, DL_WARNING,
1985 "character constant too long for its type");
1987 else if (CPP_OPTION (pfile, warn_multichar))
1988 cpp_error (pfile, DL_WARNING, "multi-character character constant");
1991 /* Sign-extend or truncate the constant to cppchar_t. The value is
1992 in WIDTH bits, but for multi-char charconsts it's value is the
1993 full target type's width. */
1994 if (chars_seen > 1)
1995 width *= max_chars;
1996 if (width < BITS_PER_CPPCHAR_T)
1998 mask = ((cppchar_t) 1 << width) - 1;
1999 if (unsigned_p || !(result & (1 << (width - 1))))
2000 result &= mask;
2001 else
2002 result |= ~mask;
2005 *pchars_seen = chars_seen;
2006 *unsignedp = unsigned_p;
2007 return result;
/* Memory buffers.  Changing these three constants can have a dramatic
   effect on performance.  The values here are reasonable defaults,
   but might be tuned.  If you adjust them, be sure to test across a
   range of uses of cpplib, including heavy nested function-like macro
   expansion.  Also check the change in peak memory usage (NJAMD is a
   good tool for this).

   MIN_BUFF_SIZE is the smallest buffer ever allocated.
   BUFF_SIZE_UPPER_BOUND (MIN_SIZE) is the largest free buffer worth
   reusing for a request of MIN_SIZE bytes.
   EXTENDED_BUFF_SIZE (BUFF, MIN_EXTRA) is the size to request when
   growing BUFF: MIN_EXTRA plus twice the span from BUFF's cur to its
   limit.  Every macro argument is parenthesized so that an expression
   argument cannot re-associate with the surrounding arithmetic.  */
#define MIN_BUFF_SIZE 8000
#define BUFF_SIZE_UPPER_BOUND(MIN_SIZE) (MIN_BUFF_SIZE + (MIN_SIZE) * 3 / 2)
#define EXTENDED_BUFF_SIZE(BUFF, MIN_EXTRA) \
        ((MIN_EXTRA) + ((BUFF)->limit - (BUFF)->cur) * 2)

#if MIN_BUFF_SIZE > BUFF_SIZE_UPPER_BOUND (0)
#error BUFF_SIZE_UPPER_BOUND must be at least as large as MIN_BUFF_SIZE!
#endif
2025 /* Create a new allocation buffer. Place the control block at the end
2026 of the buffer, so that buffer overflows will cause immediate chaos. */
2027 static _cpp_buff *
2028 new_buff (len)
2029 size_t len;
2031 _cpp_buff *result;
2032 unsigned char *base;
2034 if (len < MIN_BUFF_SIZE)
2035 len = MIN_BUFF_SIZE;
2036 len = CPP_ALIGN (len);
2038 base = xmalloc (len + sizeof (_cpp_buff));
2039 result = (_cpp_buff *) (base + len);
2040 result->base = base;
2041 result->cur = base;
2042 result->limit = base + len;
2043 result->next = NULL;
2044 return result;
2047 /* Place a chain of unwanted allocation buffers on the free list. */
2048 void
2049 _cpp_release_buff (pfile, buff)
2050 cpp_reader *pfile;
2051 _cpp_buff *buff;
2053 _cpp_buff *end = buff;
2055 while (end->next)
2056 end = end->next;
2057 end->next = pfile->free_buffs;
2058 pfile->free_buffs = buff;
2061 /* Return a free buffer of size at least MIN_SIZE. */
2062 _cpp_buff *
2063 _cpp_get_buff (pfile, min_size)
2064 cpp_reader *pfile;
2065 size_t min_size;
2067 _cpp_buff *result, **p;
2069 for (p = &pfile->free_buffs;; p = &(*p)->next)
2071 size_t size;
2073 if (*p == NULL)
2074 return new_buff (min_size);
2075 result = *p;
2076 size = result->limit - result->base;
2077 /* Return a buffer that's big enough, but don't waste one that's
2078 way too big. */
2079 if (size >= min_size && size <= BUFF_SIZE_UPPER_BOUND (min_size))
2080 break;
2083 *p = result->next;
2084 result->next = NULL;
2085 result->cur = result->base;
2086 return result;
2089 /* Creates a new buffer with enough space to hold the uncommitted
2090 remaining bytes of BUFF, and at least MIN_EXTRA more bytes. Copies
2091 the excess bytes to the new buffer. Chains the new buffer after
2092 BUFF, and returns the new buffer. */
2093 _cpp_buff *
2094 _cpp_append_extend_buff (pfile, buff, min_extra)
2095 cpp_reader *pfile;
2096 _cpp_buff *buff;
2097 size_t min_extra;
2099 size_t size = EXTENDED_BUFF_SIZE (buff, min_extra);
2100 _cpp_buff *new_buff = _cpp_get_buff (pfile, size);
2102 buff->next = new_buff;
2103 memcpy (new_buff->base, buff->cur, BUFF_ROOM (buff));
2104 return new_buff;
2107 /* Creates a new buffer with enough space to hold the uncommitted
2108 remaining bytes of the buffer pointed to by BUFF, and at least
2109 MIN_EXTRA more bytes. Copies the excess bytes to the new buffer.
2110 Chains the new buffer before the buffer pointed to by BUFF, and
2111 updates the pointer to point to the new buffer. */
2112 void
2113 _cpp_extend_buff (pfile, pbuff, min_extra)
2114 cpp_reader *pfile;
2115 _cpp_buff **pbuff;
2116 size_t min_extra;
2118 _cpp_buff *new_buff, *old_buff = *pbuff;
2119 size_t size = EXTENDED_BUFF_SIZE (old_buff, min_extra);
2121 new_buff = _cpp_get_buff (pfile, size);
2122 memcpy (new_buff->base, old_buff->cur, BUFF_ROOM (old_buff));
2123 new_buff->next = old_buff;
2124 *pbuff = new_buff;
2127 /* Free a chain of buffers starting at BUFF. */
2128 void
2129 _cpp_free_buff (buff)
2130 _cpp_buff *buff;
2132 _cpp_buff *next;
2134 for (; buff; buff = next)
2136 next = buff->next;
2137 free (buff->base);
2141 /* Allocate permanent, unaligned storage of length LEN. */
2142 unsigned char *
2143 _cpp_unaligned_alloc (pfile, len)
2144 cpp_reader *pfile;
2145 size_t len;
2147 _cpp_buff *buff = pfile->u_buff;
2148 unsigned char *result = buff->cur;
2150 if (len > (size_t) (buff->limit - result))
2152 buff = _cpp_get_buff (pfile, len);
2153 buff->next = pfile->u_buff;
2154 pfile->u_buff = buff;
2155 result = buff->cur;
2158 buff->cur = result + len;
2159 return result;
2162 /* Allocate permanent, unaligned storage of length LEN from a_buff.
2163 That buffer is used for growing allocations when saving macro
2164 replacement lists in a #define, and when parsing an answer to an
2165 assertion in #assert, #unassert or #if (and therefore possibly
2166 whilst expanding macros). It therefore must not be used by any
2167 code that they might call: specifically the lexer and the guts of
2168 the macro expander.
2170 All existing other uses clearly fit this restriction: storing
2171 registered pragmas during initialization. */
2172 unsigned char *
2173 _cpp_aligned_alloc (pfile, len)
2174 cpp_reader *pfile;
2175 size_t len;
2177 _cpp_buff *buff = pfile->a_buff;
2178 unsigned char *result = buff->cur;
2180 if (len > (size_t) (buff->limit - result))
2182 buff = _cpp_get_buff (pfile, len);
2183 buff->next = pfile->a_buff;
2184 pfile->a_buff = buff;
2185 result = buff->cur;
2188 buff->cur = result + len;
2189 return result;