Fix PR43464: loop close phi nodes can contain more than one argument.
[official-gcc/graphite-test-results.git] / libcpp / lex.c
blobac28f92e64093a7dfeb82bd9d40b831630b6c948
1 /* CPP Library - lexical analysis.
2 Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2007, 2008, 2009
3 Free Software Foundation, Inc.
4 Contributed by Per Bothner, 1994-95.
5 Based on CCCP program by Paul Rubin, June 1986
6 Adapted to ANSI C, Richard Stallman, Jan 1987
7 Broken out to separate file, Zack Weinberg, Mar 2000
9 This program is free software; you can redistribute it and/or modify it
10 under the terms of the GNU General Public License as published by the
11 Free Software Foundation; either version 3, or (at your option) any
12 later version.
14 This program is distributed in the hope that it will be useful,
15 but WITHOUT ANY WARRANTY; without even the implied warranty of
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 GNU General Public License for more details.
19 You should have received a copy of the GNU General Public License
20 along with this program; see the file COPYING3. If not see
21 <http://www.gnu.org/licenses/>. */
23 #include "config.h"
24 #include "system.h"
25 #include "cpplib.h"
26 #include "internal.h"
/* Spelling category of a token type; stored in the `category' field
   of each token_spellings entry.  */
enum spell_type
{
  SPELL_OPERATOR = 0,
  SPELL_IDENT,
  SPELL_LITERAL,
  SPELL_NONE
};
36 struct token_spelling
38 enum spell_type category;
39 const unsigned char *name;
42 static const unsigned char *const digraph_spellings[] =
43 { UC"%:", UC"%:%:", UC"<:", UC":>", UC"<%", UC"%>" };
45 #define OP(e, s) { SPELL_OPERATOR, UC s },
46 #define TK(e, s) { SPELL_ ## s, UC #e },
47 static const struct token_spelling token_spellings[N_TTYPES] = { TTYPE_TABLE };
48 #undef OP
49 #undef TK
51 #define TOKEN_SPELL(token) (token_spellings[(token)->type].category)
52 #define TOKEN_NAME(token) (token_spellings[(token)->type].name)
54 static void add_line_note (cpp_buffer *, const uchar *, unsigned int);
55 static int skip_line_comment (cpp_reader *);
56 static void skip_whitespace (cpp_reader *, cppchar_t);
57 static void lex_string (cpp_reader *, cpp_token *, const uchar *);
58 static void save_comment (cpp_reader *, cpp_token *, const uchar *, cppchar_t);
59 static void store_comment (cpp_reader *, cpp_token *);
60 static void create_literal (cpp_reader *, cpp_token *, const uchar *,
61 unsigned int, enum cpp_ttype);
62 static bool warn_in_comment (cpp_reader *, _cpp_line_note *);
63 static int name_p (cpp_reader *, const cpp_string *);
64 static tokenrun *next_tokenrun (tokenrun *);
66 static _cpp_buff *new_buff (size_t);
69 /* Utility routine:
71 Compares, the token TOKEN to the NUL-terminated string STRING.
72 TOKEN must be a CPP_NAME. Returns 1 for equal, 0 for unequal. */
73 int
74 cpp_ideq (const cpp_token *token, const char *string)
76 if (token->type != CPP_NAME)
77 return 0;
79 return !ustrcmp (NODE_NAME (token->val.node.node), (const uchar *) string);
82 /* Record a note TYPE at byte POS into the current cleaned logical
83 line. */
84 static void
85 add_line_note (cpp_buffer *buffer, const uchar *pos, unsigned int type)
87 if (buffer->notes_used == buffer->notes_cap)
89 buffer->notes_cap = buffer->notes_cap * 2 + 200;
90 buffer->notes = XRESIZEVEC (_cpp_line_note, buffer->notes,
91 buffer->notes_cap);
94 buffer->notes[buffer->notes_used].pos = pos;
95 buffer->notes[buffer->notes_used].type = type;
96 buffer->notes_used++;
99 /* Prepare the next logical line of PFILE's current buffer, starting
   at buffer->next_line, so that it contains no escaped newlines and
   (when the trigraphs option is on) no trigraphs.  Every escaped
   newline and every trigraph seen is recorded with add_line_note.
   On return the line ends in '\n', a sentinel note follows it, and
   buffer->next_line points just past the consumed input.  This is a
   time-critical inner loop.

   NOTE(review): this listing appears to have lost its brace-only and
   `do' lines (e.g. the dangling `while' after `s++' near the end);
   confirm the block structure against the real file.  */
101 void
102 _cpp_clean_line (cpp_reader *pfile)
104 cpp_buffer *buffer;
105 const uchar *s;
106 uchar c, *d, *p;
/* S is the read cursor; D is the write cursor used once text has to
   be compacted; P is a scratch pointer for scanning backwards.  */
108 buffer = pfile->buffer;
/* Start with an empty note list and the line beginning at next_line.  */
109 buffer->cur_note = buffer->notes_used = 0;
110 buffer->cur = buffer->line_base = buffer->next_line;
111 buffer->need_line = false;
/* One before the line start, because the loops pre-increment S.  */
112 s = buffer->next_line - 1;
114 if (!buffer->from_stage3)
/* Position of the most recent backslash seen on the fast path.  */
116 const uchar *pbackslash = NULL;
118 /* Short circuit for the common case of an un-escaped line with
   no trigraphs.  The primary win here is by not writing any
   data back to memory until we have to.  */
121 for (;;)
123 c = *++s;
124 if (__builtin_expect (c == '\n', false)
125 || __builtin_expect (c == '\r', false))
127 d = (uchar *) s;
/* Line terminator sitting exactly at rlimit: nothing more to check.  */
129 if (__builtin_expect (s == buffer->rlimit, false))
130 goto done;
132 /* DOS line ending?  Skip the '\n' of a "\r\n" pair.  */
133 if (__builtin_expect (c == '\r', false)
134 && s[1] == '\n')
136 s++;
137 if (s == buffer->rlimit)
138 goto done;
/* No backslash on this line, so the newline cannot be escaped.  */
141 if (__builtin_expect (pbackslash == NULL, true))
142 goto done;
144 /* Check for escaped newline: step back over non-vertical
   whitespace and see whether the last backslash is right there.  */
145 p = d;
146 while (is_nvspace (p[-1]))
147 p--;
148 if (p - 1 != pbackslash)
149 goto done;
151 /* Have an escaped newline; process it and proceed to
   the slow path.  The note type is ' ' when whitespace separated
   the backslash from the newline, '\\' otherwise.  */
153 add_line_note (buffer, p - 1, p != d ? ' ' : '\\');
154 d = p - 2;
155 buffer->next_line = p - 1;
156 break;
158 if (__builtin_expect (c == '\\', false))
159 pbackslash = s;
160 else if (__builtin_expect (c == '?', false)
161 && __builtin_expect (s[1] == '?', false)
162 && _cpp_trigraph_map[s[2]])
164 /* Have a trigraph.  We may or may not have to convert
   it.  Add a line note regardless, for -Wtrigraphs.  */
166 add_line_note (buffer, s, s[2]);
167 if (CPP_OPTION (pfile, trigraphs))
169 /* We do, and that means we have to switch to the
   slow path.  */
171 d = (uchar *) s;
172 *d = _cpp_trigraph_map[s[2]];
173 s += 2;
174 break;
/* Slow path: copy each character from S to D, dropping escaped
   newlines and replacing trigraphs as they are met.  */
180 for (;;)
182 c = *++s;
183 *++d = c;
185 if (c == '\n' || c == '\r')
187 /* Handle DOS line endings.  */
188 if (c == '\r' && s != buffer->rlimit && s[1] == '\n')
189 s++;
190 if (s == buffer->rlimit)
191 break;
193 /* Escaped?  Unlike the fast path, the backward scan is bounded
   by next_line, which earlier splices in this call may have moved.  */
194 p = d;
195 while (p != buffer->next_line && is_nvspace (p[-1]))
196 p--;
197 if (p == buffer->next_line || p[-1] != '\\')
198 break;
200 add_line_note (buffer, p - 1, p != d ? ' ': '\\');
201 d = p - 2;
202 buffer->next_line = p - 1;
204 else if (c == '?' && s[1] == '?' && _cpp_trigraph_map[s[2]])
206 /* Add a note regardless, for the benefit of -Wtrigraphs.  */
207 add_line_note (buffer, d, s[2]);
208 if (CPP_OPTION (pfile, trigraphs))
210 *d = _cpp_trigraph_map[s[2]];
211 s += 2;
/* from_stage3 buffers: no escaped-newline or trigraph processing,
   just locate the end of the line.  */
216 else
219 s++;
220 while (*s != '\n' && *s != '\r');
221 d = (uchar *) s;
223 /* Handle DOS line endings.  */
224 if (*s == '\r' && s != buffer->rlimit && s[1] == '\n')
225 s++;
228 done:
/* Terminate the cleaned line with a plain '\n'.  */
229 *d = '\n';
230 /* A sentinel note that should never be processed.  */
231 add_line_note (buffer, d + 1, '\n');
232 buffer->next_line = s + 1;
235 /* Return true if the trigraph indicated by NOTE should be warned
236 about in a comment. */
237 static bool
238 warn_in_comment (cpp_reader *pfile, _cpp_line_note *note)
240 const uchar *p;
242 /* Within comments we don't warn about trigraphs, unless the
243 trigraph forms an escaped newline, as that may change
244 behavior. */
245 if (note->type != '/')
246 return false;
248 /* If -trigraphs, then this was an escaped newline iff the next note
249 is coincident. */
250 if (CPP_OPTION (pfile, trigraphs))
251 return note[1].pos == note->pos;
253 /* Otherwise, see if this forms an escaped newline. */
254 p = note->pos + 3;
255 while (is_nvspace (*p))
256 p++;
258 /* There might have been escaped newlines between the trigraph and the
259 newline we found. Hence the position test. */
260 return (*p == '\n' && p < note[1].pos);
263 /* Process the notes created by add_line_note as far as the current
264 location. */
265 void
266 _cpp_process_line_notes (cpp_reader *pfile, int in_comment)
268 cpp_buffer *buffer = pfile->buffer;
270 for (;;)
272 _cpp_line_note *note = &buffer->notes[buffer->cur_note];
273 unsigned int col;
275 if (note->pos > buffer->cur)
276 break;
278 buffer->cur_note++;
279 col = CPP_BUF_COLUMN (buffer, note->pos + 1);
281 if (note->type == '\\' || note->type == ' ')
283 if (note->type == ' ' && !in_comment)
284 cpp_error_with_line (pfile, CPP_DL_WARNING, pfile->line_table->highest_line, col,
285 "backslash and newline separated by space");
287 if (buffer->next_line > buffer->rlimit)
289 cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line, col,
290 "backslash-newline at end of file");
291 /* Prevent "no newline at end of file" warning. */
292 buffer->next_line = buffer->rlimit;
295 buffer->line_base = note->pos;
296 CPP_INCREMENT_LINE (pfile, 0);
298 else if (_cpp_trigraph_map[note->type])
300 if (CPP_OPTION (pfile, warn_trigraphs)
301 && (!in_comment || warn_in_comment (pfile, note)))
303 if (CPP_OPTION (pfile, trigraphs))
304 cpp_error_with_line (pfile, CPP_DL_WARNING, pfile->line_table->highest_line, col,
305 "trigraph ??%c converted to %c",
306 note->type,
307 (int) _cpp_trigraph_map[note->type]);
308 else
310 cpp_error_with_line
311 (pfile, CPP_DL_WARNING, pfile->line_table->highest_line, col,
312 "trigraph ??%c ignored, use -trigraphs to enable",
313 note->type);
317 else
318 abort ();
322 /* Skip a C-style block comment. We find the end of the comment by
323 seeing if an asterisk is before every '/' we encounter. Returns
324 nonzero if comment terminated by EOF, zero otherwise.
326 Buffer->cur points to the initial asterisk of the comment. */
327 bool
328 _cpp_skip_block_comment (cpp_reader *pfile)
330 cpp_buffer *buffer = pfile->buffer;
331 const uchar *cur = buffer->cur;
332 uchar c;
334 cur++;
335 if (*cur == '/')
336 cur++;
338 for (;;)
340 /* People like decorating comments with '*', so check for '/'
341 instead for efficiency. */
342 c = *cur++;
344 if (c == '/')
346 if (cur[-2] == '*')
347 break;
349 /* Warn about potential nested comments, but not if the '/'
350 comes immediately before the true comment delimiter.
351 Don't bother to get it right across escaped newlines. */
352 if (CPP_OPTION (pfile, warn_comments)
353 && cur[0] == '*' && cur[1] != '/')
355 buffer->cur = cur;
356 cpp_error_with_line (pfile, CPP_DL_WARNING,
357 pfile->line_table->highest_line, CPP_BUF_COL (buffer),
358 "\"/*\" within comment");
361 else if (c == '\n')
363 unsigned int cols;
364 buffer->cur = cur - 1;
365 _cpp_process_line_notes (pfile, true);
366 if (buffer->next_line >= buffer->rlimit)
367 return true;
368 _cpp_clean_line (pfile);
370 cols = buffer->next_line - buffer->line_base;
371 CPP_INCREMENT_LINE (pfile, cols);
373 cur = buffer->cur;
377 buffer->cur = cur;
378 _cpp_process_line_notes (pfile, true);
379 return false;
382 /* Skip a C++ line comment, leaving buffer->cur pointing to the
383 terminating newline. Handles escaped newlines. Returns nonzero
384 if a multiline comment. */
385 static int
386 skip_line_comment (cpp_reader *pfile)
388 cpp_buffer *buffer = pfile->buffer;
389 source_location orig_line = pfile->line_table->highest_line;
391 while (*buffer->cur != '\n')
392 buffer->cur++;
394 _cpp_process_line_notes (pfile, true);
395 return orig_line != pfile->line_table->highest_line;
398 /* Skips whitespace, saving the next non-whitespace character. */
399 static void
400 skip_whitespace (cpp_reader *pfile, cppchar_t c)
402 cpp_buffer *buffer = pfile->buffer;
403 bool saw_NUL = false;
407 /* Horizontal space always OK. */
408 if (c == ' ' || c == '\t')
410 /* Just \f \v or \0 left. */
411 else if (c == '\0')
412 saw_NUL = true;
413 else if (pfile->state.in_directive && CPP_PEDANTIC (pfile))
414 cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line,
415 CPP_BUF_COL (buffer),
416 "%s in preprocessing directive",
417 c == '\f' ? "form feed" : "vertical tab");
419 c = *buffer->cur++;
421 /* We only want non-vertical space, i.e. ' ' \t \f \v \0. */
422 while (is_nvspace (c));
424 if (saw_NUL)
425 cpp_error (pfile, CPP_DL_WARNING, "null character(s) ignored");
427 buffer->cur--;
430 /* See if the characters of a number token are valid in a name (no
431 '.', '+' or '-'). */
432 static int
433 name_p (cpp_reader *pfile, const cpp_string *string)
435 unsigned int i;
437 for (i = 0; i < string->len; i++)
438 if (!is_idchar (string->text[i]))
439 return 0;
441 return 1;
444 /* After parsing an identifier or other sequence, produce a warning about
445 sequences not in NFC/NFKC. */
446 static void
447 warn_about_normalization (cpp_reader *pfile,
448 const cpp_token *token,
449 const struct normalize_state *s)
451 if (CPP_OPTION (pfile, warn_normalize) < NORMALIZE_STATE_RESULT (s)
452 && !pfile->state.skipping)
454 /* Make sure that the token is printed using UCNs, even
455 if we'd otherwise happily print UTF-8. */
456 unsigned char *buf = XNEWVEC (unsigned char, cpp_token_len (token));
457 size_t sz;
459 sz = cpp_spell_token (pfile, token, buf, false) - buf;
460 if (NORMALIZE_STATE_RESULT (s) == normalized_C)
461 cpp_error_with_line (pfile, CPP_DL_WARNING, token->src_loc, 0,
462 "`%.*s' is not in NFKC", (int) sz, buf);
463 else
464 cpp_error_with_line (pfile, CPP_DL_WARNING, token->src_loc, 0,
465 "`%.*s' is not in NFC", (int) sz, buf);
469 /* Returns TRUE if the sequence starting at buffer->cur is invalid in
470 an identifier. FIRST is TRUE if this starts an identifier. */
471 static bool
472 forms_identifier_p (cpp_reader *pfile, int first,
473 struct normalize_state *state)
475 cpp_buffer *buffer = pfile->buffer;
477 if (*buffer->cur == '$')
479 if (!CPP_OPTION (pfile, dollars_in_ident))
480 return false;
482 buffer->cur++;
483 if (CPP_OPTION (pfile, warn_dollars) && !pfile->state.skipping)
485 CPP_OPTION (pfile, warn_dollars) = 0;
486 cpp_error (pfile, CPP_DL_PEDWARN, "'$' in identifier or number");
489 return true;
492 /* Is this a syntactically valid UCN? */
493 if (CPP_OPTION (pfile, extended_identifiers)
494 && *buffer->cur == '\\'
495 && (buffer->cur[1] == 'u' || buffer->cur[1] == 'U'))
497 buffer->cur += 2;
498 if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
499 state))
500 return true;
501 buffer->cur -= 2;
504 return false;
507 /* Helper function to get the cpp_hashnode of the identifier BASE. */
508 static cpp_hashnode *
509 lex_identifier_intern (cpp_reader *pfile, const uchar *base)
511 cpp_hashnode *result;
512 const uchar *cur;
513 unsigned int len;
514 unsigned int hash = HT_HASHSTEP (0, *base);
516 cur = base + 1;
517 while (ISIDNUM (*cur))
519 hash = HT_HASHSTEP (hash, *cur);
520 cur++;
522 len = cur - base;
523 hash = HT_HASHFINISH (hash, len);
524 result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
525 base, len, hash, HT_ALLOC));
527 /* Rarely, identifiers require diagnostics when lexed. */
528 if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
529 && !pfile->state.skipping, 0))
531 /* It is allowed to poison the same identifier twice. */
532 if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
533 cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
534 NODE_NAME (result));
536 /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
537 replacement list of a variadic macro. */
538 if (result == pfile->spec_nodes.n__VA_ARGS__
539 && !pfile->state.va_args_ok)
540 cpp_error (pfile, CPP_DL_PEDWARN,
541 "__VA_ARGS__ can only appear in the expansion"
542 " of a C99 variadic macro");
544 /* For -Wc++-compat, warn about use of C++ named operators. */
545 if (result->flags & NODE_WARN_OPERATOR)
546 cpp_error (pfile, CPP_DL_WARNING,
547 "identifier \"%s\" is a special operator name in C++",
548 NODE_NAME (result));
551 return result;
554 /* Get the cpp_hashnode of an identifier specified by NAME in
555 the current cpp_reader object. If none is found, NULL is returned. */
556 cpp_hashnode *
557 _cpp_lex_identifier (cpp_reader *pfile, const char *name)
559 cpp_hashnode *result;
560 result = lex_identifier_intern (pfile, (uchar *) name);
561 return result;
564 /* Lex an identifier starting at BUFFER->CUR - 1. */
565 static cpp_hashnode *
566 lex_identifier (cpp_reader *pfile, const uchar *base, bool starts_ucn,
567 struct normalize_state *nst)
569 cpp_hashnode *result;
570 const uchar *cur;
571 unsigned int len;
572 unsigned int hash = HT_HASHSTEP (0, *base);
574 cur = pfile->buffer->cur;
575 if (! starts_ucn)
576 while (ISIDNUM (*cur))
578 hash = HT_HASHSTEP (hash, *cur);
579 cur++;
581 pfile->buffer->cur = cur;
582 if (starts_ucn || forms_identifier_p (pfile, false, nst))
584 /* Slower version for identifiers containing UCNs (or $). */
585 do {
586 while (ISIDNUM (*pfile->buffer->cur))
588 pfile->buffer->cur++;
589 NORMALIZE_STATE_UPDATE_IDNUM (nst);
591 } while (forms_identifier_p (pfile, false, nst));
592 result = _cpp_interpret_identifier (pfile, base,
593 pfile->buffer->cur - base);
595 else
597 len = cur - base;
598 hash = HT_HASHFINISH (hash, len);
600 result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
601 base, len, hash, HT_ALLOC));
604 /* Rarely, identifiers require diagnostics when lexed. */
605 if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
606 && !pfile->state.skipping, 0))
608 /* It is allowed to poison the same identifier twice. */
609 if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
610 cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
611 NODE_NAME (result));
613 /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
614 replacement list of a variadic macro. */
615 if (result == pfile->spec_nodes.n__VA_ARGS__
616 && !pfile->state.va_args_ok)
617 cpp_error (pfile, CPP_DL_PEDWARN,
618 "__VA_ARGS__ can only appear in the expansion"
619 " of a C99 variadic macro");
621 /* For -Wc++-compat, warn about use of C++ named operators. */
622 if (result->flags & NODE_WARN_OPERATOR)
623 cpp_error (pfile, CPP_DL_WARNING,
624 "identifier \"%s\" is a special operator name in C++",
625 NODE_NAME (result));
628 return result;
631 /* Lex a number to NUMBER starting at BUFFER->CUR - 1. */
632 static void
633 lex_number (cpp_reader *pfile, cpp_string *number,
634 struct normalize_state *nst)
636 const uchar *cur;
637 const uchar *base;
638 uchar *dest;
640 base = pfile->buffer->cur - 1;
643 cur = pfile->buffer->cur;
645 /* N.B. ISIDNUM does not include $. */
646 while (ISIDNUM (*cur) || *cur == '.' || VALID_SIGN (*cur, cur[-1]))
648 cur++;
649 NORMALIZE_STATE_UPDATE_IDNUM (nst);
652 pfile->buffer->cur = cur;
654 while (forms_identifier_p (pfile, false, nst));
656 number->len = cur - base;
657 dest = _cpp_unaligned_alloc (pfile, number->len + 1);
658 memcpy (dest, base, number->len);
659 dest[number->len] = '\0';
660 number->text = dest;
663 /* Create a token of type TYPE with a literal spelling. */
664 static void
665 create_literal (cpp_reader *pfile, cpp_token *token, const uchar *base,
666 unsigned int len, enum cpp_ttype type)
668 uchar *dest = _cpp_unaligned_alloc (pfile, len + 1);
670 memcpy (dest, base, len);
671 dest[len] = '\0';
672 token->type = type;
673 token->val.str.len = len;
674 token->val.str.text = dest;
677 /* Lexes a raw string.  The stored string contains the spelling, including
   double quotes, delimiter string, '[' and ']', any leading
   'L', 'u', 'U' or 'u8' and 'R' modifier.  The result is stored in
   TOKEN: its type is the literal type selected by the prefix at BASE,
   CPP_OTHER if the literal was not properly terminated or its
   delimiter was invalid, or CPP_EOF if input ran out first.  CUR is
   handed in by lex_string after it has read the 'R'; the delimiter is
   scanned from CUR + 1.

   The spelling is NUL-terminated, but it is not guaranteed that this
   is the first NUL since embedded NULs are preserved.

   NOTE(review): this listing appears to have lost its brace-only
   lines; confirm the block structure against the real file.  */
685 static void
686 lex_raw_string (cpp_reader *pfile, cpp_token *token, const uchar *base,
687 const uchar *cur)
/* Location of the first embedded NUL, or 0 if none has been seen.  */
689 source_location saw_NUL = 0;
690 const uchar *raw_prefix;
691 unsigned int raw_prefix_len = 0;
692 enum cpp_ttype type;
/* Bytes of earlier lines already copied into the buffer chain.  */
693 size_t total_len = 0;
694 _cpp_buff *first_buff = NULL, *last_buff = NULL;
/* Pick the string type from the encoding prefix at BASE.  */
696 type = (*base == 'L' ? CPP_WSTRING :
697 *base == 'U' ? CPP_STRING32 :
698 *base == 'u' ? (base[1] == '8' ? CPP_UTF8STRING : CPP_STRING16)
699 : CPP_STRING);
/* Measure the delimiter: at most 16 characters drawn from the set
   listed below.  */
701 raw_prefix = cur + 1;
702 while (raw_prefix_len < 16)
704 switch (raw_prefix[raw_prefix_len])
706 case ' ': case '[': case ']': case '\t':
707 case '\v': case '\f': case '\n': default:
708 break;
709 /* Basic source charset except the above chars.  */
710 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
711 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
712 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
713 case 's': case 't': case 'u': case 'v': case 'w': case 'x':
714 case 'y': case 'z':
715 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
716 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
717 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
718 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
719 case 'Y': case 'Z':
720 case '0': case '1': case '2': case '3': case '4': case '5':
721 case '6': case '7': case '8': case '9':
722 case '_': case '{': case '}': case '#': case '(': case ')':
723 case '<': case '>': case '%': case ':': case ';': case '.':
724 case '?': case '*': case '+': case '-': case '/': case '^':
725 case '&': case '|': case '~': case '!': case '=': case ',':
726 case '\\': case '"': case '\'':
727 raw_prefix_len++;
728 continue;
730 break;
/* The delimiter must be followed by '['.  If it is not, report why
   and emit what precedes the delimiter as a CPP_OTHER token.  */
733 if (raw_prefix[raw_prefix_len] != '[')
735 int col = CPP_BUF_COLUMN (pfile->buffer, raw_prefix + raw_prefix_len)
736 + 1;
737 if (raw_prefix_len == 16)
738 cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, col,
739 "raw string delimiter longer than 16 characters");
740 else
741 cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, col,
742 "invalid character '%c' in raw string delimiter",
743 (int) raw_prefix[raw_prefix_len]);
744 pfile->buffer->cur = raw_prefix - 1;
745 create_literal (pfile, token, base, raw_prefix - 1 - base, CPP_OTHER);
746 return;
/* Scan the body, starting just after the '['.  */
749 cur = raw_prefix + raw_prefix_len + 1;
750 for (;;)
752 cppchar_t c = *cur++;
/* The literal ends at ']' followed by the delimiter and '"'.  */
754 if (c == ']'
755 && strncmp ((const char *) cur, (const char *) raw_prefix,
756 raw_prefix_len) == 0
757 && cur[raw_prefix_len] == '"')
759 cur += raw_prefix_len + 1;
760 break;
762 else if (c == '\n')
/* A raw string is not continued onto the next line inside a
   directive, while collecting macro arguments, or in a deferred
   pragma: treat it as unterminated.  */
764 if (pfile->state.in_directive
765 || pfile->state.parsing_args
766 || pfile->state.in_deferred_pragma)
768 cur--;
769 type = CPP_OTHER;
770 cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, 0,
771 "unterminated raw string");
772 break;
775 /* raw strings allow embedded non-escaped newlines, which
   complicates this routine a lot.  */
777 if (first_buff == NULL)
/* First line break: start the buffer chain with the text lexed so
   far and repoint raw_prefix at the copy of the delimiter.  */
779 total_len = cur - base;
780 first_buff = last_buff = _cpp_get_buff (pfile, total_len);
781 memcpy (BUFF_FRONT (last_buff), base, total_len);
782 raw_prefix = BUFF_FRONT (last_buff) + (raw_prefix - base);
783 BUFF_FRONT (last_buff) += total_len;
785 else
/* Later line breaks: append this line, filling what room is left in
   the last buffer and extending the chain for the remainder.  */
787 size_t len = cur - base;
788 size_t cur_len = len > BUFF_ROOM (last_buff)
789 ? BUFF_ROOM (last_buff) : len;
791 total_len += len;
792 memcpy (BUFF_FRONT (last_buff), base, cur_len);
793 BUFF_FRONT (last_buff) += cur_len;
794 if (len > cur_len)
796 last_buff = _cpp_append_extend_buff (pfile, last_buff,
797 len - cur_len);
798 memcpy (BUFF_FRONT (last_buff), base + cur_len,
799 len - cur_len);
800 BUFF_FRONT (last_buff) += len - cur_len;
/* Move on to the next physical line of input.  */
804 if (pfile->buffer->cur < pfile->buffer->rlimit)
805 CPP_INCREMENT_LINE (pfile, 0);
806 pfile->buffer->need_line = true;
808 if (!_cpp_get_fresh_line (pfile))
/* Out of input: turn TOKEN into CPP_EOF, drop the partial copy and
   report the error at the literal's original location.  */
810 source_location src_loc = token->src_loc;
811 token->type = CPP_EOF;
812 /* Tell the compiler the line number of the EOF token.  */
813 token->src_loc = pfile->line_table->highest_line;
814 token->flags = BOL;
815 if (first_buff != NULL)
816 _cpp_release_buff (pfile, first_buff);
817 cpp_error_with_line (pfile, CPP_DL_ERROR, src_loc, 0,
818 "unterminated raw string");
819 return;
/* From here on BASE marks the start of the current line.  */
822 cur = base = pfile->buffer->cur;
/* Remember where the first embedded NUL was seen.  */
824 else if (c == '\0' && !saw_NUL)
825 LINEMAP_POSITION_FOR_COLUMN (saw_NUL, pfile->line_table,
826 CPP_BUF_COLUMN (pfile->buffer, cur));
829 if (saw_NUL && !pfile->state.skipping)
830 cpp_error_with_line (pfile, CPP_DL_WARNING, saw_NUL, 0,
831 "null character(s) preserved in literal");
833 pfile->buffer->cur = cur;
/* Single-line literal: copy it straight out of the input buffer.  */
834 if (first_buff == NULL)
835 create_literal (pfile, token, base, cur - base, type);
836 else
/* Multi-line literal: concatenate the chained buffers followed by
   the tail on the current line, then NUL-terminate.  */
838 uchar *dest = _cpp_unaligned_alloc (pfile, total_len + (cur - base) + 1);
840 token->type = type;
841 token->val.str.len = total_len + (cur - base);
842 token->val.str.text = dest;
843 last_buff = first_buff;
844 while (last_buff != NULL)
846 memcpy (dest, last_buff->base,
847 BUFF_FRONT (last_buff) - last_buff->base);
848 dest += BUFF_FRONT (last_buff) - last_buff->base;
849 last_buff = last_buff->next;
851 _cpp_release_buff (pfile, first_buff);
852 memcpy (dest, base, cur - base);
853 dest[cur - base] = '\0';
857 /* Lexes a string, character constant, or angle-bracketed header file
858 name. The stored string contains the spelling, including opening
859 quote and any leading 'L', 'u', 'U' or 'u8' and optional
860 'R' modifier. It returns the type of the literal, or CPP_OTHER
861 if it was not properly terminated, or CPP_LESS for an unterminated
862 header name which must be relexed as normal tokens.
864 The spelling is NUL-terminated, but it is not guaranteed that this
865 is the first NUL since embedded NULs are preserved. */
866 static void
867 lex_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
869 bool saw_NUL = false;
870 const uchar *cur;
871 cppchar_t terminator;
872 enum cpp_ttype type;
874 cur = base;
875 terminator = *cur++;
876 if (terminator == 'L' || terminator == 'U')
877 terminator = *cur++;
878 else if (terminator == 'u')
880 terminator = *cur++;
881 if (terminator == '8')
882 terminator = *cur++;
884 if (terminator == 'R')
886 lex_raw_string (pfile, token, base, cur);
887 return;
889 if (terminator == '"')
890 type = (*base == 'L' ? CPP_WSTRING :
891 *base == 'U' ? CPP_STRING32 :
892 *base == 'u' ? (base[1] == '8' ? CPP_UTF8STRING : CPP_STRING16)
893 : CPP_STRING);
894 else if (terminator == '\'')
895 type = (*base == 'L' ? CPP_WCHAR :
896 *base == 'U' ? CPP_CHAR32 :
897 *base == 'u' ? CPP_CHAR16 : CPP_CHAR);
898 else
899 terminator = '>', type = CPP_HEADER_NAME;
901 for (;;)
903 cppchar_t c = *cur++;
905 /* In #include-style directives, terminators are not escapable. */
906 if (c == '\\' && !pfile->state.angled_headers && *cur != '\n')
907 cur++;
908 else if (c == terminator)
909 break;
910 else if (c == '\n')
912 cur--;
913 /* Unmatched quotes always yield undefined behavior, but
914 greedy lexing means that what appears to be an unterminated
915 header name may actually be a legitimate sequence of tokens. */
916 if (terminator == '>')
918 token->type = CPP_LESS;
919 return;
921 type = CPP_OTHER;
922 break;
924 else if (c == '\0')
925 saw_NUL = true;
928 if (saw_NUL && !pfile->state.skipping)
929 cpp_error (pfile, CPP_DL_WARNING,
930 "null character(s) preserved in literal");
932 if (type == CPP_OTHER && CPP_OPTION (pfile, lang) != CLK_ASM)
933 cpp_error (pfile, CPP_DL_PEDWARN, "missing terminating %c character",
934 (int) terminator);
936 pfile->buffer->cur = cur;
937 create_literal (pfile, token, base, cur - base, type);
940 /* Return the comment table. The client may not make any assumption
941 about the ordering of the table. */
942 cpp_comment_table *
943 cpp_get_comments (cpp_reader *pfile)
945 return &pfile->comments;
948 /* Append a comment to the end of the comment table. */
949 static void
950 store_comment (cpp_reader *pfile, cpp_token *token)
952 int len;
954 if (pfile->comments.allocated == 0)
956 pfile->comments.allocated = 256;
957 pfile->comments.entries = (cpp_comment *) xmalloc
958 (pfile->comments.allocated * sizeof (cpp_comment));
961 if (pfile->comments.count == pfile->comments.allocated)
963 pfile->comments.allocated *= 2;
964 pfile->comments.entries = (cpp_comment *) xrealloc
965 (pfile->comments.entries,
966 pfile->comments.allocated * sizeof (cpp_comment));
969 len = token->val.str.len;
971 /* Copy comment. Note, token may not be NULL terminated. */
972 pfile->comments.entries[pfile->comments.count].comment =
973 (char *) xmalloc (sizeof (char) * (len + 1));
974 memcpy (pfile->comments.entries[pfile->comments.count].comment,
975 token->val.str.text, len);
976 pfile->comments.entries[pfile->comments.count].comment[len] = '\0';
978 /* Set source location. */
979 pfile->comments.entries[pfile->comments.count].sloc = token->src_loc;
981 /* Increment the count of entries in the comment table. */
982 pfile->comments.count++;
985 /* The stored comment includes the comment start and any terminator. */
986 static void
987 save_comment (cpp_reader *pfile, cpp_token *token, const unsigned char *from,
988 cppchar_t type)
990 unsigned char *buffer;
991 unsigned int len, clen;
993 len = pfile->buffer->cur - from + 1; /* + 1 for the initial '/'. */
995 /* C++ comments probably (not definitely) have moved past a new
996 line, which we don't want to save in the comment. */
997 if (is_vspace (pfile->buffer->cur[-1]))
998 len--;
1000 /* If we are currently in a directive, then we need to store all
1001 C++ comments as C comments internally, and so we need to
1002 allocate a little extra space in that case.
1004 Note that the only time we encounter a directive here is
1005 when we are saving comments in a "#define". */
1006 clen = (pfile->state.in_directive && type == '/') ? len + 2 : len;
1008 buffer = _cpp_unaligned_alloc (pfile, clen);
1010 token->type = CPP_COMMENT;
1011 token->val.str.len = clen;
1012 token->val.str.text = buffer;
1014 buffer[0] = '/';
1015 memcpy (buffer + 1, from, len - 1);
1017 /* Finish conversion to a C comment, if necessary. */
1018 if (pfile->state.in_directive && type == '/')
1020 buffer[1] = '*';
1021 buffer[clen - 2] = '*';
1022 buffer[clen - 1] = '/';
1025 /* Finally store this comment for use by clients of libcpp. */
1026 store_comment (pfile, token);
1029 /* Allocate COUNT tokens for RUN. */
1030 void
1031 _cpp_init_tokenrun (tokenrun *run, unsigned int count)
1033 run->base = XNEWVEC (cpp_token, count);
1034 run->limit = run->base + count;
1035 run->next = NULL;
1038 /* Returns the next tokenrun, or creates one if there is none. */
1039 static tokenrun *
1040 next_tokenrun (tokenrun *run)
1042 if (run->next == NULL)
1044 run->next = XNEW (tokenrun);
1045 run->next->prev = run;
1046 _cpp_init_tokenrun (run->next, 250);
1049 return run->next;
1052 /* Look ahead in the input stream. */
1053 const cpp_token *
1054 cpp_peek_token (cpp_reader *pfile, int index)
1056 cpp_context *context = pfile->context;
1057 const cpp_token *peektok;
1058 int count;
1060 /* First, scan through any pending cpp_context objects. */
1061 while (context->prev)
1063 ptrdiff_t sz = (context->direct_p
1064 ? LAST (context).token - FIRST (context).token
1065 : LAST (context).ptoken - FIRST (context).ptoken);
1067 if (index < (int) sz)
1068 return (context->direct_p
1069 ? FIRST (context).token + index
1070 : *(FIRST (context).ptoken + index));
1072 index -= (int) sz;
1073 context = context->prev;
1076 /* We will have to read some new tokens after all (and do so
1077 without invalidating preceding tokens). */
1078 count = index;
1079 pfile->keep_tokens++;
1083 peektok = _cpp_lex_token (pfile);
1084 if (peektok->type == CPP_EOF)
1085 return peektok;
1087 while (index--);
1089 _cpp_backup_tokens_direct (pfile, count + 1);
1090 pfile->keep_tokens--;
1092 return peektok;
1095 /* Allocate a single token that is invalidated at the same time as the
   rest of the tokens on the line.  Has its line and col set to the
   same as the last lexed token, so that diagnostics appear in the
   right place.  The token is carved out at pfile->cur_token, which is
   advanced past it; any lookahead tokens stored there are shifted up
   by one slot first.

   NOTE(review): this listing appears to have lost its brace-only
   lines; confirm the block structure against the real file.  */
1099 cpp_token *
1100 _cpp_temp_token (cpp_reader *pfile)
1102 cpp_token *old, *result;
/* SZ is the number of slots left in the current run from cur_token
   on; LA is the number of pending lookahead tokens.  */
1103 ptrdiff_t sz = pfile->cur_run->limit - pfile->cur_token;
1104 ptrdiff_t la = (ptrdiff_t) pfile->lookaheads;
/* The last lexed token, whose location the new token inherits.  */
1106 old = pfile->cur_token - 1;
1107 /* Any pre-existing lookaheads must not be clobbered.  */
1108 if (la)
/* The lookaheads reach the end of this run (or beyond), so shifting
   them up by one spills a token into the next run.  */
1110 if (sz <= la)
1112 tokenrun *next = next_tokenrun (pfile->cur_run);
/* Some lookaheads already live in the next run: make room at its
   base for the token about to spill over.  */
1114 if (sz < la)
1115 memmove (next->base + 1, next->base,
1116 (la - sz) * sizeof (cpp_token));
/* Carry the last token of the current run into the next one.  */
1118 next->base[0] = pfile->cur_run->limit[-1];
/* Shift the lookaheads that stay in the current run up one slot.  */
1121 if (sz > 1)
1122 memmove (pfile->cur_token + 1, pfile->cur_token,
1123 MIN (la, sz - 1) * sizeof (cpp_token));
/* No room left in the current run: continue in the next one.  */
1126 if (!sz && pfile->cur_token == pfile->cur_run->limit)
1128 pfile->cur_run = next_tokenrun (pfile->cur_run);
1129 pfile->cur_token = pfile->cur_run->base;
1132 result = pfile->cur_token++;
1133 result->src_loc = old->src_loc;
1134 return result;
1137 /* Lex a token into RESULT (external interface). Takes care of issues
1138 like directive handling, token lookahead, multiple include
1139 optimization and skipping. */
1140 const cpp_token *
1141 _cpp_lex_token (cpp_reader *pfile)
1143 cpp_token *result;
1145 for (;;)
1147 if (pfile->cur_token == pfile->cur_run->limit)
1149 pfile->cur_run = next_tokenrun (pfile->cur_run);
1150 pfile->cur_token = pfile->cur_run->base;
1152 /* We assume that the current token is somewhere in the current
1153 run. */
1154 if (pfile->cur_token < pfile->cur_run->base
1155 || pfile->cur_token >= pfile->cur_run->limit)
1156 abort ();
1158 if (pfile->lookaheads)
1160 pfile->lookaheads--;
1161 result = pfile->cur_token++;
1163 else
1164 result = _cpp_lex_direct (pfile);
1166 if (result->flags & BOL)
1168 /* Is this a directive. If _cpp_handle_directive returns
1169 false, it is an assembler #. */
1170 if (result->type == CPP_HASH
1171 /* 6.10.3 p 11: Directives in a list of macro arguments
1172 gives undefined behavior. This implementation
1173 handles the directive as normal. */
1174 && pfile->state.parsing_args != 1)
1176 if (_cpp_handle_directive (pfile, result->flags & PREV_WHITE))
1178 if (pfile->directive_result.type == CPP_PADDING)
1179 continue;
1180 result = &pfile->directive_result;
1183 else if (pfile->state.in_deferred_pragma)
1184 result = &pfile->directive_result;
1186 if (pfile->cb.line_change && !pfile->state.skipping)
1187 pfile->cb.line_change (pfile, result, pfile->state.parsing_args);
1190 /* We don't skip tokens in directives. */
1191 if (pfile->state.in_directive || pfile->state.in_deferred_pragma)
1192 break;
1194 /* Outside a directive, invalidate controlling macros. At file
1195 EOF, _cpp_lex_direct takes care of popping the buffer, so we never
1196 get here and MI optimization works. */
1197 pfile->mi_valid = false;
1199 if (!pfile->state.skipping || result->type == CPP_EOF)
1200 break;
1203 return result;
1206 /* Returns true if a fresh line has been loaded. */
1207 bool
1208 _cpp_get_fresh_line (cpp_reader *pfile)
1210 int return_at_eof;
1212 /* We can't get a new line until we leave the current directive. */
1213 if (pfile->state.in_directive)
1214 return false;
1216 for (;;)
1218 cpp_buffer *buffer = pfile->buffer;
1220 if (!buffer->need_line)
1221 return true;
1223 if (buffer->next_line < buffer->rlimit)
1225 _cpp_clean_line (pfile);
1226 return true;
1229 /* First, get out of parsing arguments state. */
1230 if (pfile->state.parsing_args)
1231 return false;
1233 /* End of buffer. Non-empty files should end in a newline. */
1234 if (buffer->buf != buffer->rlimit
1235 && buffer->next_line > buffer->rlimit
1236 && !buffer->from_stage3)
1238 /* Clip to buffer size. */
1239 buffer->next_line = buffer->rlimit;
1242 return_at_eof = buffer->return_at_eof;
1243 _cpp_pop_buffer (pfile);
1244 if (pfile->buffer == NULL || return_at_eof)
1245 return false;
/* Helper for two-character operators.  Expects `result' and `buffer'
   to be in scope at the point of use.  Sets result->type to ELSE_TYPE;
   then, if the next unread character is CHAR, consumes it and sets
   result->type to THEN_TYPE instead.  Arguments are parenthesized so
   that arbitrary expressions can be passed safely.  */
#define IF_NEXT_IS(CHAR, THEN_TYPE, ELSE_TYPE)  \
  do                                            \
    {                                           \
      result->type = (ELSE_TYPE);               \
      if (*buffer->cur == (CHAR))               \
        {                                       \
          buffer->cur++;                        \
          result->type = (THEN_TYPE);           \
        }                                       \
    }                                           \
  while (0)
1258 /* Lex a token into pfile->cur_token, which is also incremented, to
1259 get diagnostics pointing to the correct location.
1261 Does not handle issues such as token lookahead, multiple-include
1262 optimization, directives, skipping etc. This function is only
1263 suitable for use by _cpp_lex_token, and in special cases like
1264 lex_expansion_token which doesn't care for any of these issues.
1266 When meeting a newline, returns CPP_EOF if parsing a directive,
1267 otherwise returns to the start of the token buffer if permissible.
1268 Returns the location of the lexed token. */
1269 cpp_token *
1270 _cpp_lex_direct (cpp_reader *pfile)
1272 cppchar_t c;
1273 cpp_buffer *buffer;
1274 const unsigned char *comment_start;
1275 cpp_token *result = pfile->cur_token++;
1277 fresh_line:
1278 result->flags = 0;
1279 buffer = pfile->buffer;
1280 if (buffer->need_line)
1282 if (pfile->state.in_deferred_pragma)
1284 result->type = CPP_PRAGMA_EOL;
1285 pfile->state.in_deferred_pragma = false;
1286 if (!pfile->state.pragma_allow_expansion)
1287 pfile->state.prevent_expansion--;
1288 return result;
1290 if (!_cpp_get_fresh_line (pfile))
1292 result->type = CPP_EOF;
1293 if (!pfile->state.in_directive)
1295 /* Tell the compiler the line number of the EOF token. */
1296 result->src_loc = pfile->line_table->highest_line;
1297 result->flags = BOL;
1299 return result;
1301 if (!pfile->keep_tokens)
1303 pfile->cur_run = &pfile->base_run;
1304 result = pfile->base_run.base;
1305 pfile->cur_token = result + 1;
1307 result->flags = BOL;
1308 if (pfile->state.parsing_args == 2)
1309 result->flags |= PREV_WHITE;
1311 buffer = pfile->buffer;
1312 update_tokens_line:
1313 result->src_loc = pfile->line_table->highest_line;
1315 skipped_white:
1316 if (buffer->cur >= buffer->notes[buffer->cur_note].pos
1317 && !pfile->overlaid_buffer)
1319 _cpp_process_line_notes (pfile, false);
1320 result->src_loc = pfile->line_table->highest_line;
1322 c = *buffer->cur++;
1324 LINEMAP_POSITION_FOR_COLUMN (result->src_loc, pfile->line_table,
1325 CPP_BUF_COLUMN (buffer, buffer->cur));
1327 switch (c)
1329 case ' ': case '\t': case '\f': case '\v': case '\0':
1330 result->flags |= PREV_WHITE;
1331 skip_whitespace (pfile, c);
1332 goto skipped_white;
1334 case '\n':
1335 if (buffer->cur < buffer->rlimit)
1336 CPP_INCREMENT_LINE (pfile, 0);
1337 buffer->need_line = true;
1338 goto fresh_line;
1340 case '0': case '1': case '2': case '3': case '4':
1341 case '5': case '6': case '7': case '8': case '9':
1343 struct normalize_state nst = INITIAL_NORMALIZE_STATE;
1344 result->type = CPP_NUMBER;
1345 lex_number (pfile, &result->val.str, &nst);
1346 warn_about_normalization (pfile, result, &nst);
1347 break;
1350 case 'L':
1351 case 'u':
1352 case 'U':
1353 case 'R':
1354 /* 'L', 'u', 'U', 'u8' or 'R' may introduce wide characters,
1355 wide strings or raw strings. */
1356 if (c == 'L' || CPP_OPTION (pfile, uliterals))
1358 if ((*buffer->cur == '\'' && c != 'R')
1359 || *buffer->cur == '"'
1360 || (*buffer->cur == 'R'
1361 && c != 'R'
1362 && buffer->cur[1] == '"'
1363 && CPP_OPTION (pfile, uliterals))
1364 || (*buffer->cur == '8'
1365 && c == 'u'
1366 && (buffer->cur[1] == '"'
1367 || (buffer->cur[1] == 'R' && buffer->cur[2] == '"'))))
1369 lex_string (pfile, result, buffer->cur - 1);
1370 break;
1373 /* Fall through. */
1375 case '_':
1376 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
1377 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
1378 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
1379 case 's': case 't': case 'v': case 'w': case 'x':
1380 case 'y': case 'z':
1381 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
1382 case 'G': case 'H': case 'I': case 'J': case 'K':
1383 case 'M': case 'N': case 'O': case 'P': case 'Q':
1384 case 'S': case 'T': case 'V': case 'W': case 'X':
1385 case 'Y': case 'Z':
1386 result->type = CPP_NAME;
1388 struct normalize_state nst = INITIAL_NORMALIZE_STATE;
1389 result->val.node.node = lex_identifier (pfile, buffer->cur - 1, false,
1390 &nst);
1391 warn_about_normalization (pfile, result, &nst);
1394 /* Convert named operators to their proper types. */
1395 if (result->val.node.node->flags & NODE_OPERATOR)
1397 result->flags |= NAMED_OP;
1398 result->type = (enum cpp_ttype) result->val.node.node->directive_index;
1400 break;
1402 case '\'':
1403 case '"':
1404 lex_string (pfile, result, buffer->cur - 1);
1405 break;
1407 case '/':
1408 /* A potential block or line comment. */
1409 comment_start = buffer->cur;
1410 c = *buffer->cur;
1412 if (c == '*')
1414 if (_cpp_skip_block_comment (pfile))
1415 cpp_error (pfile, CPP_DL_ERROR, "unterminated comment");
1417 else if (c == '/' && (CPP_OPTION (pfile, cplusplus_comments)
1418 || cpp_in_system_header (pfile)))
1420 /* Warn about comments only if pedantically GNUC89, and not
1421 in system headers. */
1422 if (CPP_OPTION (pfile, lang) == CLK_GNUC89 && CPP_PEDANTIC (pfile)
1423 && ! buffer->warned_cplusplus_comments)
1425 cpp_error (pfile, CPP_DL_PEDWARN,
1426 "C++ style comments are not allowed in ISO C90");
1427 cpp_error (pfile, CPP_DL_PEDWARN,
1428 "(this will be reported only once per input file)");
1429 buffer->warned_cplusplus_comments = 1;
1432 if (skip_line_comment (pfile) && CPP_OPTION (pfile, warn_comments))
1433 cpp_error (pfile, CPP_DL_WARNING, "multi-line comment");
1435 else if (c == '=')
1437 buffer->cur++;
1438 result->type = CPP_DIV_EQ;
1439 break;
1441 else
1443 result->type = CPP_DIV;
1444 break;
1447 if (!pfile->state.save_comments)
1449 result->flags |= PREV_WHITE;
1450 goto update_tokens_line;
1453 /* Save the comment as a token in its own right. */
1454 save_comment (pfile, result, comment_start, c);
1455 break;
1457 case '<':
1458 if (pfile->state.angled_headers)
1460 lex_string (pfile, result, buffer->cur - 1);
1461 if (result->type != CPP_LESS)
1462 break;
1465 result->type = CPP_LESS;
1466 if (*buffer->cur == '=')
1467 buffer->cur++, result->type = CPP_LESS_EQ;
1468 else if (*buffer->cur == '<')
1470 buffer->cur++;
1471 IF_NEXT_IS ('=', CPP_LSHIFT_EQ, CPP_LSHIFT);
1473 else if (CPP_OPTION (pfile, digraphs))
1475 if (*buffer->cur == ':')
1477 buffer->cur++;
1478 result->flags |= DIGRAPH;
1479 result->type = CPP_OPEN_SQUARE;
1481 else if (*buffer->cur == '%')
1483 buffer->cur++;
1484 result->flags |= DIGRAPH;
1485 result->type = CPP_OPEN_BRACE;
1488 break;
1490 case '>':
1491 result->type = CPP_GREATER;
1492 if (*buffer->cur == '=')
1493 buffer->cur++, result->type = CPP_GREATER_EQ;
1494 else if (*buffer->cur == '>')
1496 buffer->cur++;
1497 IF_NEXT_IS ('=', CPP_RSHIFT_EQ, CPP_RSHIFT);
1499 break;
1501 case '%':
1502 result->type = CPP_MOD;
1503 if (*buffer->cur == '=')
1504 buffer->cur++, result->type = CPP_MOD_EQ;
1505 else if (CPP_OPTION (pfile, digraphs))
1507 if (*buffer->cur == ':')
1509 buffer->cur++;
1510 result->flags |= DIGRAPH;
1511 result->type = CPP_HASH;
1512 if (*buffer->cur == '%' && buffer->cur[1] == ':')
1513 buffer->cur += 2, result->type = CPP_PASTE, result->val.token_no = 0;
1515 else if (*buffer->cur == '>')
1517 buffer->cur++;
1518 result->flags |= DIGRAPH;
1519 result->type = CPP_CLOSE_BRACE;
1522 break;
1524 case '.':
1525 result->type = CPP_DOT;
1526 if (ISDIGIT (*buffer->cur))
1528 struct normalize_state nst = INITIAL_NORMALIZE_STATE;
1529 result->type = CPP_NUMBER;
1530 lex_number (pfile, &result->val.str, &nst);
1531 warn_about_normalization (pfile, result, &nst);
1533 else if (*buffer->cur == '.' && buffer->cur[1] == '.')
1534 buffer->cur += 2, result->type = CPP_ELLIPSIS;
1535 else if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
1536 buffer->cur++, result->type = CPP_DOT_STAR;
1537 break;
1539 case '+':
1540 result->type = CPP_PLUS;
1541 if (*buffer->cur == '+')
1542 buffer->cur++, result->type = CPP_PLUS_PLUS;
1543 else if (*buffer->cur == '=')
1544 buffer->cur++, result->type = CPP_PLUS_EQ;
1545 break;
1547 case '-':
1548 result->type = CPP_MINUS;
1549 if (*buffer->cur == '>')
1551 buffer->cur++;
1552 result->type = CPP_DEREF;
1553 if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
1554 buffer->cur++, result->type = CPP_DEREF_STAR;
1556 else if (*buffer->cur == '-')
1557 buffer->cur++, result->type = CPP_MINUS_MINUS;
1558 else if (*buffer->cur == '=')
1559 buffer->cur++, result->type = CPP_MINUS_EQ;
1560 break;
1562 case '&':
1563 result->type = CPP_AND;
1564 if (*buffer->cur == '&')
1565 buffer->cur++, result->type = CPP_AND_AND;
1566 else if (*buffer->cur == '=')
1567 buffer->cur++, result->type = CPP_AND_EQ;
1568 break;
1570 case '|':
1571 result->type = CPP_OR;
1572 if (*buffer->cur == '|')
1573 buffer->cur++, result->type = CPP_OR_OR;
1574 else if (*buffer->cur == '=')
1575 buffer->cur++, result->type = CPP_OR_EQ;
1576 break;
1578 case ':':
1579 result->type = CPP_COLON;
1580 if (*buffer->cur == ':' && CPP_OPTION (pfile, cplusplus))
1581 buffer->cur++, result->type = CPP_SCOPE;
1582 else if (*buffer->cur == '>' && CPP_OPTION (pfile, digraphs))
1584 buffer->cur++;
1585 result->flags |= DIGRAPH;
1586 result->type = CPP_CLOSE_SQUARE;
1588 break;
1590 case '*': IF_NEXT_IS ('=', CPP_MULT_EQ, CPP_MULT); break;
1591 case '=': IF_NEXT_IS ('=', CPP_EQ_EQ, CPP_EQ); break;
1592 case '!': IF_NEXT_IS ('=', CPP_NOT_EQ, CPP_NOT); break;
1593 case '^': IF_NEXT_IS ('=', CPP_XOR_EQ, CPP_XOR); break;
1594 case '#': IF_NEXT_IS ('#', CPP_PASTE, CPP_HASH); result->val.token_no = 0; break;
1596 case '?': result->type = CPP_QUERY; break;
1597 case '~': result->type = CPP_COMPL; break;
1598 case ',': result->type = CPP_COMMA; break;
1599 case '(': result->type = CPP_OPEN_PAREN; break;
1600 case ')': result->type = CPP_CLOSE_PAREN; break;
1601 case '[': result->type = CPP_OPEN_SQUARE; break;
1602 case ']': result->type = CPP_CLOSE_SQUARE; break;
1603 case '{': result->type = CPP_OPEN_BRACE; break;
1604 case '}': result->type = CPP_CLOSE_BRACE; break;
1605 case ';': result->type = CPP_SEMICOLON; break;
1607 /* @ is a punctuator in Objective-C. */
1608 case '@': result->type = CPP_ATSIGN; break;
1610 case '$':
1611 case '\\':
1613 const uchar *base = --buffer->cur;
1614 struct normalize_state nst = INITIAL_NORMALIZE_STATE;
1616 if (forms_identifier_p (pfile, true, &nst))
1618 result->type = CPP_NAME;
1619 result->val.node.node = lex_identifier (pfile, base, true, &nst);
1620 warn_about_normalization (pfile, result, &nst);
1621 break;
1623 buffer->cur++;
1626 default:
1627 create_literal (pfile, result, buffer->cur - 1, 1, CPP_OTHER);
1628 break;
1631 return result;
1634 /* An upper bound on the number of bytes needed to spell TOKEN.
1635 Does not include preceding whitespace. */
1636 unsigned int
1637 cpp_token_len (const cpp_token *token)
1639 unsigned int len;
1641 switch (TOKEN_SPELL (token))
1643 default: len = 6; break;
1644 case SPELL_LITERAL: len = token->val.str.len; break;
1645 case SPELL_IDENT: len = NODE_LEN (token->val.node.node) * 10; break;
1648 return len;
/* Decode the UTF-8 sequence starting at NAME and store the matching
   \UXXXXXXXX escape (always exactly 10 bytes, lower-case hex, no
   terminating NUL) in BUFFER.  Returns the number of bytes of NAME
   that made up the sequence.  Aborts if a continuation byte is
   ill-formed.  */

static size_t
utf8_to_ucn (unsigned char *buffer, const unsigned char *name)
{
  static const char hex_digits[] = "0123456789abcdef";
  unsigned long code_point;
  unsigned lead;
  int seq_len = 0;
  int consumed;
  int shift;

  /* The number of leading one bits in the first byte gives the
     length of the sequence.  */
  lead = *name;
  while (lead & 0x80)
    {
      seq_len++;
      lead <<= 1;
    }

  /* Payload bits of the first byte, then six bits per trailing byte.  */
  code_point = *name & (0x7F >> seq_len);
  consumed = 1;
  while (consumed < seq_len)
    {
      name++;
      code_point = (code_point << 6) | (*name & 0x3F);

      /* Ill-formed UTF-8: trailing bytes must look like 10xxxxxx.  */
      if ((*name & ~0x3F) != 0x80)
        abort ();

      consumed++;
    }

  buffer[0] = '\\';
  buffer[1] = 'U';
  buffer += 2;
  for (shift = 28; shift >= 0; shift -= 4)
    *buffer++ = hex_digits[(code_point >> shift) & 0xF];

  return seq_len;
}
1685 /* Given a token TYPE corresponding to a digraph, return a pointer to
1686 the spelling of the digraph. */
1687 static const unsigned char *
1688 cpp_digraph2name (enum cpp_ttype type)
1690 return digraph_spellings[(int) type - (int) CPP_FIRST_DIGRAPH];
1693 /* Write the spelling of a token TOKEN to BUFFER. The buffer must
1694 already contain the enough space to hold the token's spelling.
1695 Returns a pointer to the character after the last character written.
1696 FORSTRING is true if this is to be the spelling after translation
1697 phase 1 (this is different for UCNs).
1698 FIXME: Would be nice if we didn't need the PFILE argument. */
1699 unsigned char *
1700 cpp_spell_token (cpp_reader *pfile, const cpp_token *token,
1701 unsigned char *buffer, bool forstring)
1703 switch (TOKEN_SPELL (token))
1705 case SPELL_OPERATOR:
1707 const unsigned char *spelling;
1708 unsigned char c;
1710 if (token->flags & DIGRAPH)
1711 spelling = cpp_digraph2name (token->type);
1712 else if (token->flags & NAMED_OP)
1713 goto spell_ident;
1714 else
1715 spelling = TOKEN_NAME (token);
1717 while ((c = *spelling++) != '\0')
1718 *buffer++ = c;
1720 break;
1722 spell_ident:
1723 case SPELL_IDENT:
1724 if (forstring)
1726 memcpy (buffer, NODE_NAME (token->val.node.node),
1727 NODE_LEN (token->val.node.node));
1728 buffer += NODE_LEN (token->val.node.node);
1730 else
1732 size_t i;
1733 const unsigned char * name = NODE_NAME (token->val.node.node);
1735 for (i = 0; i < NODE_LEN (token->val.node.node); i++)
1736 if (name[i] & ~0x7F)
1738 i += utf8_to_ucn (buffer, name + i) - 1;
1739 buffer += 10;
1741 else
1742 *buffer++ = NODE_NAME (token->val.node.node)[i];
1744 break;
1746 case SPELL_LITERAL:
1747 memcpy (buffer, token->val.str.text, token->val.str.len);
1748 buffer += token->val.str.len;
1749 break;
1751 case SPELL_NONE:
1752 cpp_error (pfile, CPP_DL_ICE,
1753 "unspellable token %s", TOKEN_NAME (token));
1754 break;
1757 return buffer;
1760 /* Returns TOKEN spelt as a null-terminated string. The string is
1761 freed when the reader is destroyed. Useful for diagnostics. */
1762 unsigned char *
1763 cpp_token_as_text (cpp_reader *pfile, const cpp_token *token)
1765 unsigned int len = cpp_token_len (token) + 1;
1766 unsigned char *start = _cpp_unaligned_alloc (pfile, len), *end;
1768 end = cpp_spell_token (pfile, token, start, false);
1769 end[0] = '\0';
1771 return start;
1774 /* Returns a pointer to a string which spells the token defined by
1775 TYPE and FLAGS. Used by C front ends, which really should move to
1776 using cpp_token_as_text. */
1777 const char *
1778 cpp_type2name (enum cpp_ttype type, unsigned char flags)
1780 if (flags & DIGRAPH)
1781 return (const char *) cpp_digraph2name (type);
1782 else if (flags & NAMED_OP)
1783 return cpp_named_operator2name (type);
1785 return (const char *) token_spellings[type].name;
1788 /* Writes the spelling of token to FP, without any preceding space.
1789 Separated from cpp_spell_token for efficiency - to avoid stdio
1790 double-buffering. */
1791 void
1792 cpp_output_token (const cpp_token *token, FILE *fp)
1794 switch (TOKEN_SPELL (token))
1796 case SPELL_OPERATOR:
1798 const unsigned char *spelling;
1799 int c;
1801 if (token->flags & DIGRAPH)
1802 spelling = cpp_digraph2name (token->type);
1803 else if (token->flags & NAMED_OP)
1804 goto spell_ident;
1805 else
1806 spelling = TOKEN_NAME (token);
1808 c = *spelling;
1810 putc (c, fp);
1811 while ((c = *++spelling) != '\0');
1813 break;
1815 spell_ident:
1816 case SPELL_IDENT:
1818 size_t i;
1819 const unsigned char * name = NODE_NAME (token->val.node.node);
1821 for (i = 0; i < NODE_LEN (token->val.node.node); i++)
1822 if (name[i] & ~0x7F)
1824 unsigned char buffer[10];
1825 i += utf8_to_ucn (buffer, name + i) - 1;
1826 fwrite (buffer, 1, 10, fp);
1828 else
1829 fputc (NODE_NAME (token->val.node.node)[i], fp);
1831 break;
1833 case SPELL_LITERAL:
1834 fwrite (token->val.str.text, 1, token->val.str.len, fp);
1835 break;
1837 case SPELL_NONE:
1838 /* An error, most probably. */
1839 break;
1843 /* Compare two tokens. */
1845 _cpp_equiv_tokens (const cpp_token *a, const cpp_token *b)
1847 if (a->type == b->type && a->flags == b->flags)
1848 switch (TOKEN_SPELL (a))
1850 default: /* Keep compiler happy. */
1851 case SPELL_OPERATOR:
1852 /* token_no is used to track where multiple consecutive ##
1853 tokens were originally located. */
1854 return (a->type != CPP_PASTE || a->val.token_no == b->val.token_no);
1855 case SPELL_NONE:
1856 return (a->type != CPP_MACRO_ARG
1857 || a->val.macro_arg.arg_no == b->val.macro_arg.arg_no);
1858 case SPELL_IDENT:
1859 return a->val.node.node == b->val.node.node;
1860 case SPELL_LITERAL:
1861 return (a->val.str.len == b->val.str.len
1862 && !memcmp (a->val.str.text, b->val.str.text,
1863 a->val.str.len));
1866 return 0;
1869 /* Returns nonzero if a space should be inserted to avoid an
1870 accidental token paste for output. For simplicity, it is
1871 conservative, and occasionally advises a space where one is not
1872 needed, e.g. "." and ".2". */
1874 cpp_avoid_paste (cpp_reader *pfile, const cpp_token *token1,
1875 const cpp_token *token2)
1877 enum cpp_ttype a = token1->type, b = token2->type;
1878 cppchar_t c;
1880 if (token1->flags & NAMED_OP)
1881 a = CPP_NAME;
1882 if (token2->flags & NAMED_OP)
1883 b = CPP_NAME;
1885 c = EOF;
1886 if (token2->flags & DIGRAPH)
1887 c = digraph_spellings[(int) b - (int) CPP_FIRST_DIGRAPH][0];
1888 else if (token_spellings[b].category == SPELL_OPERATOR)
1889 c = token_spellings[b].name[0];
1891 /* Quickly get everything that can paste with an '='. */
1892 if ((int) a <= (int) CPP_LAST_EQ && c == '=')
1893 return 1;
1895 switch (a)
1897 case CPP_GREATER: return c == '>';
1898 case CPP_LESS: return c == '<' || c == '%' || c == ':';
1899 case CPP_PLUS: return c == '+';
1900 case CPP_MINUS: return c == '-' || c == '>';
1901 case CPP_DIV: return c == '/' || c == '*'; /* Comments. */
1902 case CPP_MOD: return c == ':' || c == '>';
1903 case CPP_AND: return c == '&';
1904 case CPP_OR: return c == '|';
1905 case CPP_COLON: return c == ':' || c == '>';
1906 case CPP_DEREF: return c == '*';
1907 case CPP_DOT: return c == '.' || c == '%' || b == CPP_NUMBER;
1908 case CPP_HASH: return c == '#' || c == '%'; /* Digraph form. */
1909 case CPP_NAME: return ((b == CPP_NUMBER
1910 && name_p (pfile, &token2->val.str))
1911 || b == CPP_NAME
1912 || b == CPP_CHAR || b == CPP_STRING); /* L */
1913 case CPP_NUMBER: return (b == CPP_NUMBER || b == CPP_NAME
1914 || c == '.' || c == '+' || c == '-');
1915 /* UCNs */
1916 case CPP_OTHER: return ((token1->val.str.text[0] == '\\'
1917 && b == CPP_NAME)
1918 || (CPP_OPTION (pfile, objc)
1919 && token1->val.str.text[0] == '@'
1920 && (b == CPP_NAME || b == CPP_STRING)));
1921 default: break;
1924 return 0;
1927 /* Output all the remaining tokens on the current line, and a newline
1928 character, to FP. Leading whitespace is removed. If there are
1929 macros, special token padding is not performed. */
1930 void
1931 cpp_output_line (cpp_reader *pfile, FILE *fp)
1933 const cpp_token *token;
1935 token = cpp_get_token (pfile);
1936 while (token->type != CPP_EOF)
1938 cpp_output_token (token, fp);
1939 token = cpp_get_token (pfile);
1940 if (token->flags & PREV_WHITE)
1941 putc (' ', fp);
1944 putc ('\n', fp);
1947 /* Return a string representation of all the remaining tokens on the
1948 current line. The result is allocated using xmalloc and must be
1949 freed by the caller. */
1950 unsigned char *
1951 cpp_output_line_to_string (cpp_reader *pfile, const unsigned char *dir_name)
1953 const cpp_token *token;
1954 unsigned int out = dir_name ? ustrlen (dir_name) : 0;
1955 unsigned int alloced = 120 + out;
1956 unsigned char *result = (unsigned char *) xmalloc (alloced);
1958 /* If DIR_NAME is empty, there are no initial contents. */
1959 if (dir_name)
1961 sprintf ((char *) result, "#%s ", dir_name);
1962 out += 2;
1965 token = cpp_get_token (pfile);
1966 while (token->type != CPP_EOF)
1968 unsigned char *last;
1969 /* Include room for a possible space and the terminating nul. */
1970 unsigned int len = cpp_token_len (token) + 2;
1972 if (out + len > alloced)
1974 alloced *= 2;
1975 if (out + len > alloced)
1976 alloced = out + len;
1977 result = (unsigned char *) xrealloc (result, alloced);
1980 last = cpp_spell_token (pfile, token, &result[out], 0);
1981 out = last - result;
1983 token = cpp_get_token (pfile);
1984 if (token->flags & PREV_WHITE)
1985 result[out++] = ' ';
1988 result[out] = '\0';
1989 return result;
/* Memory buffers.  Changing these three constants can have a dramatic
   effect on performance.  The values here are reasonable defaults,
   but might be tuned.  If you adjust them, be sure to test across a
   range of uses of cpplib, including heavy nested function-like macro
   expansion.  Also check the change in peak memory usage (NJAMD is a
   good tool for this).

   MIN_BUFF_SIZE is the smallest size new_buff will allocate.
   BUFF_SIZE_UPPER_BOUND (MIN_SIZE) is the largest free buffer that
   _cpp_get_buff will hand out for a request of MIN_SIZE bytes.
   EXTENDED_BUFF_SIZE (BUFF, MIN_EXTRA) is the size requested when
   extending BUFF: MIN_EXTRA plus twice the distance from BUFF's cur
   to its limit.  Macro arguments are fully parenthesized so that any
   expression may be passed.  */
#define MIN_BUFF_SIZE 8000
#define BUFF_SIZE_UPPER_BOUND(MIN_SIZE) (MIN_BUFF_SIZE + (MIN_SIZE) * 3 / 2)
#define EXTENDED_BUFF_SIZE(BUFF, MIN_EXTRA) \
        ((MIN_EXTRA) + ((BUFF)->limit - (BUFF)->cur) * 2)

#if MIN_BUFF_SIZE > BUFF_SIZE_UPPER_BOUND (0)
#error BUFF_SIZE_UPPER_BOUND must be at least as large as MIN_BUFF_SIZE!
#endif
2007 /* Create a new allocation buffer. Place the control block at the end
2008 of the buffer, so that buffer overflows will cause immediate chaos. */
2009 static _cpp_buff *
2010 new_buff (size_t len)
2012 _cpp_buff *result;
2013 unsigned char *base;
2015 if (len < MIN_BUFF_SIZE)
2016 len = MIN_BUFF_SIZE;
2017 len = CPP_ALIGN (len);
2019 base = XNEWVEC (unsigned char, len + sizeof (_cpp_buff));
2020 result = (_cpp_buff *) (base + len);
2021 result->base = base;
2022 result->cur = base;
2023 result->limit = base + len;
2024 result->next = NULL;
2025 return result;
2028 /* Place a chain of unwanted allocation buffers on the free list. */
2029 void
2030 _cpp_release_buff (cpp_reader *pfile, _cpp_buff *buff)
2032 _cpp_buff *end = buff;
2034 while (end->next)
2035 end = end->next;
2036 end->next = pfile->free_buffs;
2037 pfile->free_buffs = buff;
2040 /* Return a free buffer of size at least MIN_SIZE. */
2041 _cpp_buff *
2042 _cpp_get_buff (cpp_reader *pfile, size_t min_size)
2044 _cpp_buff *result, **p;
2046 for (p = &pfile->free_buffs;; p = &(*p)->next)
2048 size_t size;
2050 if (*p == NULL)
2051 return new_buff (min_size);
2052 result = *p;
2053 size = result->limit - result->base;
2054 /* Return a buffer that's big enough, but don't waste one that's
2055 way too big. */
2056 if (size >= min_size && size <= BUFF_SIZE_UPPER_BOUND (min_size))
2057 break;
2060 *p = result->next;
2061 result->next = NULL;
2062 result->cur = result->base;
2063 return result;
2066 /* Creates a new buffer with enough space to hold the uncommitted
2067 remaining bytes of BUFF, and at least MIN_EXTRA more bytes. Copies
2068 the excess bytes to the new buffer. Chains the new buffer after
2069 BUFF, and returns the new buffer. */
2070 _cpp_buff *
2071 _cpp_append_extend_buff (cpp_reader *pfile, _cpp_buff *buff, size_t min_extra)
2073 size_t size = EXTENDED_BUFF_SIZE (buff, min_extra);
2074 _cpp_buff *new_buff = _cpp_get_buff (pfile, size);
2076 buff->next = new_buff;
2077 memcpy (new_buff->base, buff->cur, BUFF_ROOM (buff));
2078 return new_buff;
2081 /* Creates a new buffer with enough space to hold the uncommitted
2082 remaining bytes of the buffer pointed to by BUFF, and at least
2083 MIN_EXTRA more bytes. Copies the excess bytes to the new buffer.
2084 Chains the new buffer before the buffer pointed to by BUFF, and
2085 updates the pointer to point to the new buffer. */
2086 void
2087 _cpp_extend_buff (cpp_reader *pfile, _cpp_buff **pbuff, size_t min_extra)
2089 _cpp_buff *new_buff, *old_buff = *pbuff;
2090 size_t size = EXTENDED_BUFF_SIZE (old_buff, min_extra);
2092 new_buff = _cpp_get_buff (pfile, size);
2093 memcpy (new_buff->base, old_buff->cur, BUFF_ROOM (old_buff));
2094 new_buff->next = old_buff;
2095 *pbuff = new_buff;
2098 /* Free a chain of buffers starting at BUFF. */
2099 void
2100 _cpp_free_buff (_cpp_buff *buff)
2102 _cpp_buff *next;
2104 for (; buff; buff = next)
2106 next = buff->next;
2107 free (buff->base);
2111 /* Allocate permanent, unaligned storage of length LEN. */
2112 unsigned char *
2113 _cpp_unaligned_alloc (cpp_reader *pfile, size_t len)
2115 _cpp_buff *buff = pfile->u_buff;
2116 unsigned char *result = buff->cur;
2118 if (len > (size_t) (buff->limit - result))
2120 buff = _cpp_get_buff (pfile, len);
2121 buff->next = pfile->u_buff;
2122 pfile->u_buff = buff;
2123 result = buff->cur;
2126 buff->cur = result + len;
2127 return result;
2130 /* Allocate permanent, unaligned storage of length LEN from a_buff.
2131 That buffer is used for growing allocations when saving macro
2132 replacement lists in a #define, and when parsing an answer to an
2133 assertion in #assert, #unassert or #if (and therefore possibly
2134 whilst expanding macros). It therefore must not be used by any
2135 code that they might call: specifically the lexer and the guts of
2136 the macro expander.
2138 All existing other uses clearly fit this restriction: storing
2139 registered pragmas during initialization. */
2140 unsigned char *
2141 _cpp_aligned_alloc (cpp_reader *pfile, size_t len)
2143 _cpp_buff *buff = pfile->a_buff;
2144 unsigned char *result = buff->cur;
2146 if (len > (size_t) (buff->limit - result))
2148 buff = _cpp_get_buff (pfile, len);
2149 buff->next = pfile->a_buff;
2150 pfile->a_buff = buff;
2151 result = buff->cur;
2154 buff->cur = result + len;
2155 return result;
2158 /* Say which field of TOK is in use. */
2160 enum cpp_token_fld_kind
2161 cpp_token_val_index (cpp_token *tok)
2163 switch (TOKEN_SPELL (tok))
2165 case SPELL_IDENT:
2166 return CPP_TOKEN_FLD_NODE;
2167 case SPELL_LITERAL:
2168 return CPP_TOKEN_FLD_STR;
2169 case SPELL_OPERATOR:
2170 if (tok->type == CPP_PASTE)
2171 return CPP_TOKEN_FLD_TOKEN_NO;
2172 else
2173 return CPP_TOKEN_FLD_NONE;
2174 case SPELL_NONE:
2175 if (tok->type == CPP_MACRO_ARG)
2176 return CPP_TOKEN_FLD_ARG_NO;
2177 else if (tok->type == CPP_PADDING)
2178 return CPP_TOKEN_FLD_SOURCE;
2179 else if (tok->type == CPP_PRAGMA)
2180 return CPP_TOKEN_FLD_PRAGMA;
2181 /* else fall through */
2182 default:
2183 return CPP_TOKEN_FLD_NONE;