gccrs: Added newline to get more readable lexdump
[official-gcc.git] / gcc / rust / lex / rust-lex.cc
blobbf6bf4c84466470a5bf2ed34517fc48ff4b124b3
1 // Copyright (C) 2020-2024 Free Software Foundation, Inc.
3 // This file is part of GCC.
5 // GCC is free software; you can redistribute it and/or modify it under
6 // the terms of the GNU General Public License as published by the Free
7 // Software Foundation; either version 3, or (at your option) any later
8 // version.
10 // GCC is distributed in the hope that it will be useful, but WITHOUT ANY
11 // WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 // FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
13 // for more details.
15 // You should have received a copy of the GNU General Public License
16 // along with GCC; see the file COPYING3. If not see
17 // <http://www.gnu.org/licenses/>.
19 #include "rust-codepoint.h"
20 #include "rust-system.h"
21 #include "rust-lex.h"
22 #include "rust-diagnostics.h"
23 #include "rust-linemap.h"
24 #include "rust-session-manager.h"
25 #include "safe-ctype.h"
26 #include "cpplib.h"
27 #include "rust-keyword-values.h"
29 namespace Rust {
30 // TODO: move to separate compilation unit?
31 // overload += for uint32_t to allow 32-bit encoded utf-8 to be added
32 std::string &
33 operator+= (std::string &str, Codepoint char32)
35 if (char32.value < 0x80)
37 str += static_cast<char> (char32.value);
39 else if (char32.value < (0x1F + 1) << (1 * 6))
41 str += static_cast<char> (0xC0 | ((char32.value >> 6) & 0x1F));
42 str += static_cast<char> (0x80 | ((char32.value >> 0) & 0x3F));
44 else if (char32.value < (0x0F + 1) << (2 * 6))
46 str += static_cast<char> (0xE0 | ((char32.value >> 12) & 0x0F));
47 str += static_cast<char> (0x80 | ((char32.value >> 6) & 0x3F));
48 str += static_cast<char> (0x80 | ((char32.value >> 0) & 0x3F));
50 else if (char32.value < (0x07 + 1) << (3 * 6))
52 str += static_cast<char> (0xF0 | ((char32.value >> 18) & 0x07));
53 str += static_cast<char> (0x80 | ((char32.value >> 12) & 0x3F));
54 str += static_cast<char> (0x80 | ((char32.value >> 6) & 0x3F));
55 str += static_cast<char> (0x80 | ((char32.value >> 0) & 0x3F));
57 else
59 rust_debug ("Invalid unicode codepoint found: '%u' ", char32.value);
61 return str;
64 std::string
65 Codepoint::as_string ()
67 std::string str;
69 // str += Codepoint (value);
70 str += *this;
72 return str;
/* Includes all allowable float digits EXCEPT _ and . as that needs lookahead
 * for handling.
 *
 * Returns true for the ASCII digits '0'..'9' and for the exponent markers
 * 'E' and 'e'.  The digit test is an explicit range check rather than
 * safe-ctype's ISDIGIT: NUMBER is a full 32-bit code point, and the
 * safe-ctype macros look at the low 8 bits of their argument only, so a
 * code point such as U+0130 would otherwise be classified as '0'.  */
bool
is_float_digit (uint32_t number)
{
  const bool is_ascii_digit = number >= '0' && number <= '9';
  return is_ascii_digit || number == 'E' || number == 'e';
}
/* Returns true when NUMBER is an ASCII hexadecimal digit
 * ('0'..'9', 'a'..'f' or 'A'..'F').
 *
 * This is deliberately not ISXDIGIT from safe-ctype: NUMBER is a full 32-bit
 * code point, and the safe-ctype macros look at the low 8 bits of their
 * argument only, so e.g. U+0141 would otherwise be accepted as 'A'.  */
bool
is_x_digit (uint32_t number)
{
  const bool decimal = number >= '0' && number <= '9';
  const bool lower_hex = number >= 'a' && number <= 'f';
  const bool upper_hex = number >= 'A' && number <= 'F';
  return decimal || lower_hex || upper_hex;
}
/* Returns true when NUMBER is one of the characters '0' through '7'.  */
bool
is_octal_digit (uint32_t number)
{
  const bool out_of_range = number < '0' || number > '7';
  return !out_of_range;
}
/* Returns true when NUMBER is the character '0' or '1'.  */
bool
is_bin_digit (uint32_t number)
{
  switch (number)
    {
    case '0':
    case '1':
      return true;
    default:
      return false;
    }
}
103 bool
104 check_valid_float_dot_end (uint32_t character)
106 return character != '.' && character != '_' && !ISALPHA (character);
/* Returns true when CHARACTER is one of the code points listed as
 * whitespace in https://doc.rust-lang.org/reference/whitespace.html  */
bool
is_whitespace (uint32_t character)
{
  switch (character)
    {
    case '\t':
    case '\n':
    case '\v':
    case '\f':
    case '\r':
    case ' ':
    case 0x0085: // next line
    case 0x200e: // left-to-right mark
    case 0x200f: // right-to-left mark
    case 0x2028: // line separator
    case 0x2029: // paragraph separator
      return true;
    default:
      return false;
    }
}
/* Returns true when CHARACTER is 'x', 'o' or 'b' (lower case only).  */
bool
is_non_decimal_int_literal_separator (uint32_t character)
{
  switch (character)
    {
    case 'x':
    case 'o':
    case 'b':
      return true;
    default:
      return false;
    }
}
128 bool
129 is_identifier_start (uint32_t codepoint)
131 return (cpp_check_xid_property (codepoint) & CPP_XID_START) || codepoint == '_';
134 bool
135 is_identifier_continue (uint32_t codepoint)
137 return cpp_check_xid_property (codepoint) & CPP_XID_CONTINUE;
// Constructs a lexer that reads from an in-memory string.
// The `input` member is initialised from RAIIFile::create_error (), no lex
// dump stream is set, and code points are supplied by a BufferInputSource
// built from the `input` parameter.  Line and column both start at 1.
// Note: inside the initialiser expressions the name `input` refers to the
// constructor parameter (the string), not to the member of the same name.
140 Lexer::Lexer (const std::string &input, Linemap *linemap)
141 : input (RAIIFile::create_error ()), current_line (1), current_column (1),
142 line_map (linemap), dump_lex_out ({}),
143 raw_input_source (new BufferInputSource (input, 0)),
144 input_queue{*raw_input_source}, token_queue (TokenSource (this))
// Constructs a lexer that reads from a file.
// Takes ownership of `file_input` (moved into the `input` member), stores
// the optional lex dump stream, and reads code points through a
// FileInputSource built from input.get_raw ().  Line and column both start
// at 1.  When a linemap was supplied, the file start is registered with it
// using `filename` and the initial line number.
147 Lexer::Lexer (const char *filename, RAIIFile file_input, Linemap *linemap,
148 tl::optional<std::ofstream &> dump_lex_opt)
149 : input (std::move (file_input)), current_line (1), current_column (1),
150 line_map (linemap), dump_lex_out (dump_lex_opt),
151 raw_input_source (new FileInputSource (input.get_raw ())),
152 input_queue{*raw_input_source}, token_queue (TokenSource (this))
154 // inform line_table that file is being entered and is in line 1
155 if (linemap)
156 line_map->start_file (filename, current_line);
// Destructor.  Performs no explicit work: the only statement, the call to
// line_map->stop (), is commented out for the reason given below.
159 Lexer::~Lexer ()
161 /* ok apparently stop (which is equivalent of original code in destructor) is
162 * meant to be called after all files have finished parsing, for cleanup. On
163 * the other hand, actual code that it calls to leave a certain line map is
164 * mentioned in GCC docs as being useful for "just leaving an included header"
165 * and stuff like that, so this line mapping functionality may need fixing.
166 * FIXME: find out whether this occurs. */
168 // line_map->stop();
171 bool
172 Lexer::input_source_is_valid_utf8 ()
174 return raw_input_source->is_valid ();
177 location_t
178 Lexer::get_current_location ()
180 if (line_map)
181 return linemap_position_for_column (line_table, current_column);
182 else
183 // If we have no linemap, we're lexing something without proper locations
184 return UNDEF_LOCATION;
187 Codepoint
188 Lexer::peek_input (int n)
190 return input_queue.peek (n);
193 Codepoint
194 Lexer::peek_input ()
196 return peek_input (0);
199 void
200 Lexer::skip_input (int n)
202 input_queue.skip (n);
205 void
206 Lexer::skip_input ()
208 skip_input (0);
211 void
212 Lexer::skip_token (int n)
214 // dump tokens if dump-lex option is enabled
215 if (dump_lex_out.has_value ())
216 dump_and_skip (n);
217 else
218 token_queue.skip (n);
221 void
222 Lexer::dump_and_skip (int n)
224 std::ofstream &out = dump_lex_out.value ();
225 bool found_eof = false;
226 const_TokenPtr tok;
227 for (int i = 0; i < n + 1; i++)
229 if (!found_eof)
231 tok = peek_token ();
232 found_eof |= tok->get_id () == Rust::END_OF_FILE;
234 location_t loc = tok->get_locus ();
236 out << "<id=";
237 out << tok->token_id_to_str ();
238 out << (tok->has_str () ? (std::string (", text=") + tok->get_str ()
239 + std::string (", typehint=")
240 + std::string (tok->get_type_hint_str ()))
241 : "")
242 << " ";
243 out << Linemap::location_to_string (loc) << '\n';
246 token_queue.skip (0);
250 void
251 Lexer::replace_current_token (TokenPtr replacement)
253 token_queue.replace_current_value (replacement);
255 rust_debug ("called 'replace_current_token' - this is deprecated");
258 /* Determines whether the string passed in is a keyword or not. If it is, it
259 * returns the keyword name. */
260 TokenId
261 Lexer::classify_keyword (const std::string &str)
263 auto &keywords = Rust::Values::Keywords::keywords_tokens;
264 auto keyword = keywords.find (str);
266 if (keyword == keywords.end ())
267 return IDENTIFIER;
269 auto id = keyword->second;
271 // We now have the expected token ID of the reserved keyword. However, some
272 // keywords are reserved starting in certain editions. For example, `try` is
273 // only a reserved keyword in editions >=2018. The language might gain new
274 // reserved keywords in the future.
276 // https://doc.rust-lang.org/reference/keywords.html#reserved-keywords
278 // `try` is not a reserved keyword before 2018
279 if (Session::get_instance ().options.get_edition ()
280 == CompileOptions::Edition::E2015
281 && id == TRY)
282 return IDENTIFIER;
284 return id;
// Lexes and returns the next token from the input queue.
// Each loop iteration takes one code point (peek_input + skip_input) and:
//  - on line 1 / column 1, checks for a `#!` shebang line and drops that
//    line unless the next character after whitespace and non-doc comments
//    is `[` (in which case it is treated as an attribute);
//  - returns an END_OF_FILE token when the code point is EOF;
//  - skips whitespace and non-doc comments while updating current_line /
//    current_column (calling start_line whenever a new line begins);
//  - returns punctuation/operator tokens and doc-comment tokens directly;
//  - hands byte/raw/string/char/numeric literals, lifetimes, identifiers
//    and keywords to the corresponding parse_* helpers;
//  - otherwise reports "unexpected character" and keeps looping.
287 TokenPtr
288 Lexer::build_token ()
290 // loop to go through multiple characters to build a single token
291 while (true)
293 location_t loc = get_current_location ();
295 current_char = peek_input ();
296 skip_input ();
298 // detect shebang
299 // Must be the first thing on the first line, starting with #!
300 // But since an attribute can also start with an #! we don't count it as a
301 // shebang line when after any whitespace or comments there is a [. If it
302 // is a shebang line we simple drop the line. Otherwise we don't consume
303 // any characters and fall through to the real tokenizer.
304 if (current_line == 1 && current_column == 1 && current_char == '#'
305 && peek_input () == '!')
// n is a look-ahead index; index 0 is the `!` matched above, so scanning
// starts at index 1.
307 int n = 1;
308 while (true)
310 Codepoint next_char = peek_input (n);
311 if (is_whitespace (next_char.value))
312 n++;
313 else if ((next_char == '/' && peek_input (n + 1) == '/'
314 && peek_input (n + 2) != '!'
315 && peek_input (n + 2) != '/')
316 || (next_char == '/' && peek_input (n + 1) == '/'
317 && peek_input (n + 2) == '/'
318 && peek_input (n + 3) == '/'))
320 // two // or four ////
321 // A single line comment
322 // (but not an inner or outer doc comment)
323 n += 2;
324 next_char = peek_input (n);
325 while (next_char != '\n' && !next_char.is_eof ())
327 n++;
328 next_char = peek_input (n);
330 if (next_char == '\n')
331 n++;
333 else if (next_char == '/' && peek_input (n + 1) == '*'
334 && peek_input (n + 2) == '*'
335 && peek_input (n + 3) == '/')
337 /**/
338 n += 4;
340 else if (next_char == '/' && peek_input (n + 1) == '*'
341 && peek_input (n + 2) == '*' && peek_input (n + 3) == '*'
342 && peek_input (n + 4) == '/')
344 /***/
345 n += 5;
347 else if ((next_char == '/' && peek_input (n + 1) == '*'
348 && peek_input (n + 2) != '*'
349 && peek_input (n + 2) != '!')
350 || (next_char == '/' && peek_input (n + 1) == '*'
351 && peek_input (n + 2) == '*'
352 && peek_input (n + 3) == '*'))
354 // one /* or three /***
355 // Start of a block comment
356 // (but not an inner or outer doc comment)
357 n += 2;
358 int level = 1;
359 while (level > 0)
361 if (peek_input (n).is_eof ())
362 break;
363 else if (peek_input (n) == '/'
364 && peek_input (n + 1) == '*')
366 n += 2;
367 level += 1;
369 else if (peek_input (n) == '*'
370 && peek_input (n + 1) == '/')
372 n += 2;
373 level -= 1;
375 else
376 n++;
379 else if (next_char != '[')
381 // definitely shebang, ignore the first line
382 while (current_char != '\n' && !current_char.is_eof ())
384 current_char = peek_input ();
385 skip_input ();
388 // newline
389 current_line++;
390 current_column = 1;
391 // tell line_table that new line starts
392 start_line (current_line, max_column_hint);
393 break;
395 else
396 break; /* Definitely not a shebang line. */
400 // return end of file token if end of file
401 if (current_char.is_eof ())
402 return Token::make (END_OF_FILE, loc);
404 // if not end of file, start tokenising
405 switch (current_char.value)
407 /* ignore whitespace characters for tokens but continue updating
408 * location */
409 case '\n': // newline
410 case 0x0085: // next line
411 case 0x2028: // line separator
412 case 0x2029: // paragraph separator
413 current_line++;
414 current_column = 1;
415 // tell line_table that new line starts
416 start_line (current_line, max_column_hint);
417 continue;
418 case '\r': // cr
419 // Ignore, we expect a newline (lf) soon.
420 continue;
421 case ' ': // space
422 current_column++;
423 continue;
424 case '\t': // horizontal tab
425 // width of a tab is not well-defined, assume 8 spaces
426 current_column += 8;
427 continue;
428 case '\v': // vertical tab
429 case 0x000c: // form feed
430 case 0x200e: // left-to-right mark
431 case 0x200f: // right-to-left mark
432 // Ignored.
433 continue;
435 // punctuation - actual tokens
// For multi-character operators below, the extra characters are consumed
// with skip_input, current_column advances by the operator length, and loc
// is bumped by (length - 1) before the token is made.
436 case '=':
437 if (peek_input () == '>')
439 // match arm arrow
440 skip_input ();
441 current_column += 2;
442 loc += 1;
444 return Token::make (MATCH_ARROW, loc);
446 else if (peek_input () == '=')
448 // equality operator
449 skip_input ();
450 current_column += 2;
451 loc += 1;
453 return Token::make (EQUAL_EQUAL, loc);
455 else
457 // assignment operator
458 current_column++;
459 return Token::make (EQUAL, loc);
461 case '(':
462 current_column++;
463 return Token::make (LEFT_PAREN, loc);
464 case '-':
465 if (peek_input () == '>')
467 // return type specifier
468 skip_input ();
469 current_column += 2;
470 loc += 1;
472 return Token::make (RETURN_TYPE, loc);
474 else if (peek_input () == '=')
476 // minus-assign
477 skip_input ();
478 current_column += 2;
479 loc += 1;
481 return Token::make (MINUS_EQ, loc);
483 else
485 // minus
486 current_column++;
487 return Token::make (MINUS, loc);
489 case '+':
490 if (peek_input () == '=')
492 // add-assign
493 skip_input ();
494 current_column += 2;
495 loc += 1;
497 return Token::make (PLUS_EQ, loc);
499 else
501 // add
502 current_column++;
503 return Token::make (PLUS, loc);
505 case ')':
506 current_column++;
507 return Token::make (RIGHT_PAREN, loc);
508 case ';':
509 current_column++;
510 return Token::make (SEMICOLON, loc);
511 case '*':
512 if (peek_input () == '=')
514 // multiplication-assign
515 skip_input ();
516 current_column += 2;
517 loc += 1;
519 return Token::make (ASTERISK_EQ, loc);
521 else
523 // multiplication
524 current_column++;
525 return Token::make (ASTERISK, loc);
527 case ',':
528 current_column++;
529 return Token::make (COMMA, loc);
530 case '/':
531 if (peek_input () == '=')
533 // division-assign
534 skip_input ();
535 current_column += 2;
536 loc += 1;
538 return Token::make (DIV_EQ, loc);
540 else if ((peek_input () == '/' && peek_input (1) != '!'
541 && peek_input (1) != '/')
542 || (peek_input () == '/' && peek_input (1) == '/'
543 && peek_input (2) == '/'))
545 // two // or four ////
546 // single line comment
547 // (but not an inner or outer doc comment)
548 skip_input ();
549 current_column += 2;
550 current_char = peek_input ();
552 // basically ignore until line finishes
553 while (current_char != '\n' && !current_char.is_eof ())
555 skip_input ();
556 current_column++; // not used
557 current_char = peek_input ();
559 continue;
561 else if (peek_input () == '/'
562 && (peek_input (1) == '!' || peek_input (1) == '/'))
564 /* single line doc comment, inner or outer. */
565 bool is_inner = peek_input (1) == '!';
566 skip_input (1);
567 current_column += 3;
569 std::string str;
570 str.reserve (32);
571 current_char = peek_input ();
572 while (current_char != '\n')
574 skip_input ();
575 if (current_char == '\r')
577 Codepoint next_char = peek_input ();
578 if (next_char == '\n')
580 current_char = '\n';
581 break;
583 rust_error_at (
584 loc, "Isolated CR %<\\r%> not allowed in doc comment");
585 current_char = next_char;
586 continue;
588 if (current_char.is_eof ())
590 rust_error_at (
591 loc, "unexpected EOF while looking for end of comment");
592 break;
594 str += current_char;
595 current_char = peek_input ();
597 skip_input ();
598 current_line++;
599 current_column = 1;
600 // tell line_table that new line starts
601 start_line (current_line, max_column_hint);
603 str.shrink_to_fit ();
// NOTE(review): str.size () is unsigned, so for an empty doc comment
// str.size () - 1 wraps around before being added to loc -- confirm this is
// the intended location adjustment.
605 loc += str.size () - 1;
606 if (is_inner)
607 return Token::make_inner_doc_comment (loc, std::move (str));
608 else
609 return Token::make_outer_doc_comment (loc, std::move (str));
611 else if (peek_input () == '*' && peek_input (1) == '*'
612 && peek_input (2) == '/')
614 /**/
615 skip_input (2);
616 current_column += 4;
617 continue;
619 else if (peek_input () == '*' && peek_input (1) == '*'
620 && peek_input (2) == '*' && peek_input (3) == '/')
622 /***/
623 skip_input (3);
624 current_column += 5;
625 continue;
627 else if ((peek_input () == '*' && peek_input (1) != '!'
628 && peek_input (1) != '*')
629 || (peek_input () == '*' && peek_input (1) == '*'
630 && peek_input (2) == '*'))
632 // one /* or three /***
633 // block comment
634 // (but not an inner or outer doc comment)
635 skip_input ();
636 current_column += 2;
// level tracks the nesting depth of /* ... */ pairs; the comment ends when
// it drops back to 0.
638 int level = 1;
639 while (level > 0)
641 current_char = peek_input ();
643 if (current_char.is_eof ())
645 rust_error_at (
646 loc, "unexpected EOF while looking for end of comment");
647 break;
650 // if /* found
651 if (current_char == '/' && peek_input (1) == '*')
653 // skip /* characters
654 skip_input (1);
656 current_column += 2;
658 level += 1;
659 continue;
662 // ignore until */ is found
663 if (current_char == '*' && peek_input (1) == '/')
665 // skip */ characters
666 skip_input (1);
668 current_column += 2;
670 level -= 1;
671 continue;
674 if (current_char == '\n')
676 skip_input ();
677 current_line++;
678 current_column = 1;
679 // tell line_table that new line starts
680 start_line (current_line, max_column_hint);
681 continue;
684 skip_input ();
685 current_column++;
688 // refresh new token
689 continue;
691 else if (peek_input () == '*'
692 && (peek_input (1) == '!' || peek_input (1) == '*'))
694 // block doc comment, inner /*! or outer /**
695 bool is_inner = peek_input (1) == '!';
696 skip_input (1);
697 current_column += 3;
699 std::string str;
700 str.reserve (96);
702 int level = 1;
703 while (level > 0)
705 current_char = peek_input ();
707 if (current_char.is_eof ())
709 rust_error_at (
710 loc, "unexpected EOF while looking for end of comment");
711 break;
714 // if /* found
715 if (current_char == '/' && peek_input (1) == '*')
717 // skip /* characters
718 skip_input (1);
719 current_column += 2;
721 level += 1;
722 str += "/*";
723 continue;
726 // ignore until */ is found
727 if (current_char == '*' && peek_input (1) == '/')
729 // skip */ characters
730 skip_input (1);
731 current_column += 2;
733 level -= 1;
// the outermost closing */ is not part of the comment text
734 if (level > 0)
735 str += "*/";
736 continue;
739 if (current_char == '\r' && peek_input (1) != '\n')
740 rust_error_at (
741 loc, "Isolated CR %<\\r%> not allowed in doc comment");
743 if (current_char == '\n')
745 skip_input ();
746 current_line++;
747 current_column = 1;
748 // tell line_table that new line starts
749 start_line (current_line, max_column_hint);
750 str += '\n';
751 continue;
754 str += current_char;
755 skip_input ();
756 current_column++;
759 str.shrink_to_fit ();
// NOTE(review): same unsigned wrap as for single-line doc comments when str
// is empty -- confirm.
761 loc += str.size () - 1;
762 if (is_inner)
763 return Token::make_inner_doc_comment (loc, std::move (str));
764 else
765 return Token::make_outer_doc_comment (loc, std::move (str));
767 else
769 // division
770 current_column++;
771 return Token::make (DIV, loc);
773 case '%':
774 if (peek_input () == '=')
776 // modulo-assign
777 skip_input ();
778 current_column += 2;
779 loc += 1;
781 return Token::make (PERCENT_EQ, loc);
783 else
785 // modulo
786 current_column++;
787 return Token::make (PERCENT, loc);
789 case '^':
790 if (peek_input () == '=')
792 // xor-assign?
793 skip_input ();
794 current_column += 2;
795 loc += 1;
797 return Token::make (CARET_EQ, loc);
799 else
801 // xor?
802 current_column++;
803 return Token::make (CARET, loc);
805 case '<':
806 if (peek_input () == '<')
808 if (peek_input (1) == '=')
810 // left-shift assign
811 skip_input (1);
812 current_column += 3;
813 loc += 2;
815 return Token::make (LEFT_SHIFT_EQ, loc);
817 else
819 // left-shift
820 skip_input ();
821 current_column += 2;
822 loc += 1;
824 return Token::make (LEFT_SHIFT, loc);
827 else if (peek_input () == '=')
829 // smaller than or equal to
830 skip_input ();
831 current_column += 2;
832 loc += 1;
834 return Token::make (LESS_OR_EQUAL, loc);
836 else
838 // smaller than
839 current_column++;
840 return Token::make (LEFT_ANGLE, loc);
842 break;
843 case '>':
844 if (peek_input () == '>')
846 if (peek_input (1) == '=')
848 // right-shift-assign
849 skip_input (1);
850 current_column += 3;
851 loc += 2;
853 return Token::make (RIGHT_SHIFT_EQ, loc);
855 else
857 // right-shift
858 skip_input ();
859 current_column += 2;
860 loc += 1;
862 return Token::make (RIGHT_SHIFT, loc);
865 else if (peek_input () == '=')
867 // larger than or equal to
868 skip_input ();
869 current_column += 2;
870 loc += 1;
872 return Token::make (GREATER_OR_EQUAL, loc);
874 else
876 // larger than
877 current_column++;
878 return Token::make (RIGHT_ANGLE, loc);
880 case ':':
881 if (peek_input () == ':')
883 // scope resolution ::
884 skip_input ();
885 current_column += 2;
886 loc += 1;
888 return Token::make (SCOPE_RESOLUTION, loc);
890 else
892 // single colon :
893 current_column++;
894 return Token::make (COLON, loc);
896 case '!':
897 // no special handling for macros in lexer?
898 if (peek_input () == '=')
900 // not equal boolean operator
901 skip_input ();
902 current_column += 2;
903 loc += 1;
905 return Token::make (NOT_EQUAL, loc);
907 else
909 // not equal unary operator
910 current_column++;
912 return Token::make (EXCLAM, loc);
914 case '?':
915 current_column++;
916 return Token::make (QUESTION_MARK, loc);
917 case '#':
918 current_column++;
919 return Token::make (HASH, loc);
920 case '[':
921 current_column++;
922 return Token::make (LEFT_SQUARE, loc);
923 case ']':
924 current_column++;
925 return Token::make (RIGHT_SQUARE, loc);
926 case '{':
927 current_column++;
928 return Token::make (LEFT_CURLY, loc);
929 case '}':
930 current_column++;
931 return Token::make (RIGHT_CURLY, loc);
932 case '@':
933 current_column++;
934 return Token::make (PATTERN_BIND, loc);
935 case '$':
936 current_column++;
937 return Token::make (DOLLAR_SIGN, loc);
938 case '~':
939 current_column++;
940 return Token::make (TILDE, loc);
941 case '\\':
942 current_column++;
943 return Token::make (BACKSLASH, loc);
944 case '`':
945 current_column++;
946 return Token::make (BACKTICK, loc);
947 case '|':
948 if (peek_input () == '=')
950 // bitwise or-assign?
951 skip_input ();
952 current_column += 2;
953 loc += 1;
955 return Token::make (PIPE_EQ, loc);
957 else if (peek_input () == '|')
959 // logical or
960 skip_input ();
961 current_column += 2;
962 loc += 1;
964 return Token::make (OR, loc);
966 else
968 // bitwise or
969 current_column++;
971 return Token::make (PIPE, loc);
973 case '&':
974 if (peek_input () == '=')
976 // bitwise and-assign?
977 skip_input ();
978 current_column += 2;
979 loc += 1;
981 return Token::make (AMP_EQ, loc);
983 else if (peek_input () == '&')
985 // logical and
986 skip_input ();
987 current_column += 2;
988 loc += 1;
990 return Token::make (LOGICAL_AND, loc);
992 else
994 // bitwise and/reference
995 current_column++;
997 return Token::make (AMP, loc);
999 case '.':
1000 if (peek_input () == '.')
1002 if (peek_input (1) == '.')
1004 // ellipsis
1005 skip_input (1);
1006 current_column += 3;
1007 loc += 2;
1009 return Token::make (ELLIPSIS, loc);
1011 else if (peek_input (1) == '=')
1013 // ..=
1014 skip_input (1);
1015 current_column += 3;
1016 loc += 2;
1018 return Token::make (DOT_DOT_EQ, loc);
1020 else
1022 // ..
1023 skip_input ();
1024 current_column += 2;
1025 loc += 1;
1027 return Token::make (DOT_DOT, loc);
1030 else /*if (!ISDIGIT (peek_input ()))*/
1032 // single dot .
1033 // Only if followed by a non-number - otherwise is float
1034 // nope, float cannot start with '.'.
1035 current_column++;
1036 return Token::make (DOT, loc);
1039 // TODO: special handling of _ in the lexer? instead of being identifier
1041 // byte character, byte string and raw byte string literals
1042 if (current_char == 'b')
1044 if (peek_input () == '\'')
1045 return parse_byte_char (loc);
1046 else if (peek_input () == '"')
1047 return parse_byte_string (loc);
1048 else if (peek_input () == 'r'
1049 && (peek_input (1) == '#' || peek_input (1) == '"'))
1050 return parse_raw_byte_string (loc);
1053 // raw identifiers and raw strings
1054 if (current_char == 'r')
1056 Codepoint peek = peek_input ();
1057 Codepoint peek1 = peek_input (1);
1059 // TODO (tamaron) parse Unicode ident
1060 if (peek == '#' && is_identifier_start (peek1.value))
1062 TokenPtr raw_ident_ptr = parse_raw_identifier (loc);
1063 if (raw_ident_ptr != nullptr)
1064 return raw_ident_ptr;
1065 else
1066 continue; /* input got parsed, it just wasn't valid. An error
1067 was produced. */
1069 else
1071 TokenPtr maybe_raw_string_ptr = maybe_parse_raw_string (loc);
1072 if (maybe_raw_string_ptr != nullptr)
1073 return maybe_raw_string_ptr;
1077 // find identifiers and keywords.
1078 if (is_identifier_start (current_char.value))
1079 return parse_identifier_or_keyword (loc);
1081 // int and float literals
1082 if (ISDIGIT (current_char.value))
1083 { // _ not allowed as first char
1084 if (current_char == '0'
1085 && is_non_decimal_int_literal_separator (peek_input ().value))
1087 // handle binary, octal, hex literals
1088 TokenPtr non_dec_int_lit_ptr
1089 = parse_non_decimal_int_literals (loc);
1090 if (non_dec_int_lit_ptr != nullptr)
1091 return non_dec_int_lit_ptr;
1093 else
1095 // handle decimals (integer or float)
1096 TokenPtr decimal_or_float_ptr = parse_decimal_int_or_float (loc);
1097 if (decimal_or_float_ptr != nullptr)
1098 return decimal_or_float_ptr;
1102 // string literals
1103 if (current_char == '"')
1104 return parse_string (loc);
1106 // char literals and lifetime names
1107 if (current_char == '\'')
1109 TokenPtr char_or_lifetime_ptr = parse_char_or_lifetime (loc);
1110 if (char_or_lifetime_ptr != nullptr)
1111 return char_or_lifetime_ptr;
1114 // DEBUG: check for specific character problems:
1115 if (current_char == '0')
1116 rust_debug ("'0' uncaught before unexpected character");
1117 else if (current_char == ']')
1118 rust_debug ("']' uncaught before unexpected character");
// NOTE(review): 0x5d is the code for ']', which the previous branch already
// handles, so this branch looks unreachable.
1119 else if (current_char == 0x5d)
1120 rust_debug ("whatever 0x5d is (not '0' or ']') uncaught before "
1121 "unexpected character");
1123 // didn't match anything so error
1124 rust_error_at (loc, "unexpected character %<%x%>", current_char.value);
1125 current_column++;
1129 // Parses in a type suffix.
1130 std::pair<PrimitiveCoreType, int>
1131 Lexer::parse_in_type_suffix ()
1133 std::string suffix;
1134 suffix.reserve (5);
1136 int additional_length_offset = 0;
1138 // get suffix
1139 while (ISALPHA (current_char.value) || ISDIGIT (current_char.value)
1140 || current_char == '_')
1142 if (current_char == '_')
1144 // don't add _ to suffix
1145 skip_input ();
1146 current_char = peek_input ();
1148 additional_length_offset++;
1150 continue;
1153 additional_length_offset++;
1155 suffix += current_char;
1156 skip_input ();
1157 current_char = peek_input ();
1160 if (suffix.empty ())
1162 // no type suffix: do nothing but also no error
1163 return std::make_pair (CORETYPE_UNKNOWN, additional_length_offset);
1165 else if (suffix == "f32")
1167 return std::make_pair (CORETYPE_F32, additional_length_offset);
1169 else if (suffix == "f64")
1171 return std::make_pair (CORETYPE_F64, additional_length_offset);
1173 else if (suffix == "i8")
1175 return std::make_pair (CORETYPE_I8, additional_length_offset);
1177 else if (suffix == "i16")
1179 return std::make_pair (CORETYPE_I16, additional_length_offset);
1181 else if (suffix == "i32")
1183 return std::make_pair (CORETYPE_I32, additional_length_offset);
1185 else if (suffix == "i64")
1187 return std::make_pair (CORETYPE_I64, additional_length_offset);
1189 else if (suffix == "i128")
1191 return std::make_pair (CORETYPE_I128, additional_length_offset);
1193 else if (suffix == "isize")
1195 return std::make_pair (CORETYPE_ISIZE, additional_length_offset);
1197 else if (suffix == "u8")
1199 return std::make_pair (CORETYPE_U8, additional_length_offset);
1201 else if (suffix == "u16")
1203 return std::make_pair (CORETYPE_U16, additional_length_offset);
1205 else if (suffix == "u32")
1207 return std::make_pair (CORETYPE_U32, additional_length_offset);
1209 else if (suffix == "u64")
1211 return std::make_pair (CORETYPE_U64, additional_length_offset);
1213 else if (suffix == "u128")
1215 return std::make_pair (CORETYPE_U128, additional_length_offset);
1217 else if (suffix == "usize")
1219 return std::make_pair (CORETYPE_USIZE, additional_length_offset);
1221 else
1223 rust_error_at (get_current_location (), "unknown number suffix %qs",
1224 suffix.c_str ());
1226 return std::make_pair (CORETYPE_UNKNOWN, additional_length_offset);
1230 // Parses in the exponent part (if any) of a float literal.
1231 std::pair<std::string, int>
1232 Lexer::parse_in_exponent_part ()
1234 int additional_length_offset = 0;
1235 std::string str;
1236 if (current_char == 'E' || current_char == 'e')
1238 // add exponent to string as strtod works with it
1239 str += current_char;
1240 skip_input ();
1241 current_char = peek_input ();
1243 additional_length_offset++;
1245 // special - and + handling
1246 if (current_char == '-')
1248 str += '-';
1250 skip_input ();
1251 current_char = peek_input ();
1253 additional_length_offset++;
1255 else if (current_char == '+')
1257 // don't add + but still skip input
1258 skip_input ();
1259 current_char = peek_input ();
1261 additional_length_offset++;
1264 // parse another decimal number for exponent
1265 auto str_length = parse_in_decimal ();
1266 str += std::get<0> (str_length);
1267 additional_length_offset += std::get<1> (str_length);
1269 return std::make_pair (str, additional_length_offset);
1272 // Parses a decimal integer.
1273 std::tuple<std::string, int, bool>
1274 Lexer::parse_in_decimal ()
1276 /* A pure decimal contains only digits. */
1277 bool pure_decimal = true;
1278 int additional_length_offset = 0;
1279 std::string str;
1280 while (ISDIGIT (current_char.value) || current_char.value == '_')
1282 if (current_char == '_')
1284 pure_decimal = false;
1285 // don't add _ to number
1286 skip_input ();
1287 current_char = peek_input ();
1289 additional_length_offset++;
1291 continue;
1294 additional_length_offset++;
1296 str += current_char;
1297 skip_input ();
1298 current_char = peek_input ();
1300 return std::make_tuple (str, additional_length_offset, pure_decimal);
1303 /* Parses escapes (and string continues) in "byte" strings and characters. Does
1304 * not support unicode. */
1305 std::tuple<char, int, bool>
1306 Lexer::parse_escape (char opening_char)
1308 int additional_length_offset = 0;
1309 char output_char = 0;
1311 // skip to actual letter
1312 skip_input ();
1313 current_char = peek_input ();
1314 additional_length_offset++;
1316 switch (current_char.value)
1318 case 'x': {
1319 auto hex_escape_pair = parse_partial_hex_escape ();
1320 long hexLong = hex_escape_pair.first;
1321 additional_length_offset += hex_escape_pair.second;
1323 if (hexLong > 255 || hexLong < 0)
1324 rust_error_at (
1325 get_current_location (),
1326 "byte \\x escape %<\\x%x%> out of range - allows up to %<\\xFF%>",
1327 static_cast<unsigned int> (hexLong));
1328 /* TODO: restore capital for escape output - gcc pretty-printer doesn't
1329 * support %X directly */
1330 char hexChar = static_cast<char> (hexLong);
1332 output_char = hexChar;
1334 break;
1335 case 'n':
1336 output_char = '\n';
1337 break;
1338 case 'r':
1339 output_char = '\r';
1340 break;
1341 case 't':
1342 output_char = '\t';
1343 break;
1344 case '\\':
1345 output_char = '\\';
1346 break;
1347 case '0':
1348 output_char = '\0';
1349 break;
1350 case '\'':
1351 output_char = '\'';
1352 break;
1353 case '"':
1354 output_char = '"';
1355 break;
1356 case 'u':
1357 rust_error_at (get_current_location (),
1358 "cannot have a unicode escape \\u in a byte %s",
1359 opening_char == '\'' ? "character" : "string");
1360 // Try to parse it anyway, just to skip it
1361 parse_partial_unicode_escape ();
1362 return std::make_tuple (output_char, additional_length_offset, false);
1363 case '\r':
1364 case '\n':
1365 // string continue
1366 return std::make_tuple (0, parse_partial_string_continue (), true);
1367 default:
1368 rust_error_at (get_current_location (),
1369 "unknown escape sequence %<\\%s%>",
1370 current_char.as_string ().c_str ());
1371 // returns false if no parsing could be done
1372 // return false;
1373 return std::make_tuple (output_char, additional_length_offset, false);
1374 break;
1376 // all non-special cases (string continue) should skip their used char
1377 skip_input ();
1378 current_char = peek_input ();
1379 additional_length_offset++;
1381 // returns true if parsing was successful
1382 // return true;
1383 return std::make_tuple (output_char, additional_length_offset, false);
1386 /* Parses an escape (or string continue) in a string or character. Supports
1387 * unicode escapes. */
1388 std::tuple<Codepoint, int, bool>
1389 Lexer::parse_utf8_escape ()
1391 Codepoint output_char;
1392 int additional_length_offset = 0;
1394 // skip to actual letter
1395 skip_input ();
1396 current_char = peek_input ();
1397 additional_length_offset++;
1399 switch (current_char.value)
1401 case 'x': {
1402 auto hex_escape_pair = parse_partial_hex_escape ();
1403 long hexLong = hex_escape_pair.first;
1404 additional_length_offset += hex_escape_pair.second;
1406 if (hexLong > 127 || hexLong < 0)
1407 rust_error_at (
1408 get_current_location (),
1409 "ascii \\x escape %<\\x%x%> out of range - allows up to %<\\x7F%>",
1410 static_cast<unsigned int> (hexLong));
1411 /* TODO: restore capital for escape output - gcc pretty-printer doesn't
1412 * support %X directly */
1413 char hexChar = static_cast<char> (hexLong);
1415 output_char = hexChar;
1417 break;
1418 case 'n':
1419 output_char = '\n';
1420 break;
1421 case 'r':
1422 output_char = '\r';
1423 break;
1424 case 't':
1425 output_char = '\t';
1426 break;
1427 case '\\':
1428 output_char = '\\';
1429 break;
1430 case '0':
1431 output_char = '\0';
1432 break;
1433 case '\'':
1434 output_char = '\'';
1435 break;
1436 case '"':
1437 output_char = '"';
1438 break;
1439 case 'u': {
1440 auto unicode_escape_pair = parse_partial_unicode_escape ();
1441 output_char = unicode_escape_pair.first;
1442 additional_length_offset += unicode_escape_pair.second;
1444 return std::make_tuple (output_char, additional_length_offset, false);
1446 break;
1447 case '\r':
1448 case '\n':
1449 // string continue
1450 return std::make_tuple (0, parse_partial_string_continue (), true);
1451 default:
1452 rust_error_at (get_current_location (),
1453 "unknown escape sequence %<\\%s%>",
1454 current_char.as_string ().c_str ());
1455 // returns false if no parsing could be done
1456 // return false;
1457 return std::make_tuple (output_char, additional_length_offset, false);
1458 break;
1460 /* all non-special cases (unicode, string continue) should skip their used
1461 * char */
1462 skip_input ();
1463 current_char = peek_input ();
1464 additional_length_offset++;
1466 // returns true if parsing was successful
1467 // return true;
1468 return std::make_tuple (output_char, additional_length_offset, false);
1471 // Parses the body of a string continue that has been found in an escape.
1473 Lexer::parse_partial_string_continue ()
1475 int additional_length_offset = 1;
1477 // string continue
1478 // TODO use utf-8 codepoint to skip whitespaces
1479 while (is_whitespace (current_char.value))
1481 if (current_char == '\n')
1483 current_line++;
1484 current_column = 1;
1485 // tell line_table that new line starts
1486 start_line (current_line, max_column_hint);
1488 // reset "length"
1489 additional_length_offset = 1;
1491 // get next char
1492 skip_input ();
1493 current_char = peek_input ();
1495 continue;
1498 skip_input ();
1499 current_char = peek_input ();
1500 additional_length_offset++;
1503 return additional_length_offset;
1506 /* Parses the body of a '\x' escape. Note that it does not check that the number
1507 * is valid and smaller than 255. */
1508 std::pair<long, int>
1509 Lexer::parse_partial_hex_escape ()
1511 // hex char string (null-terminated)
1512 char hexNum[3] = {0, 0, 0};
1514 // first hex char
1515 current_char = peek_input (1);
1516 int additional_length_offset = 1;
1518 if (!is_x_digit (current_char.value))
1520 rust_error_at (get_current_location (),
1521 "invalid character %<\\x%s%> in \\x sequence",
1522 current_char.as_string ().c_str ());
1523 return std::make_pair (0, 0);
1525 hexNum[0] = current_char.value;
1527 // second hex char
1528 skip_input ();
1529 current_char = peek_input (1);
1530 additional_length_offset++;
1532 if (!is_x_digit (current_char.value))
1534 rust_error_at (get_current_location (),
1535 "invalid character %<\\x%c%s%> in \\x sequence", hexNum[0],
1536 current_char.as_string ().c_str ());
1537 return std::make_pair (0, 1);
1539 skip_input ();
1540 hexNum[1] = current_char.value;
1542 long hexLong = std::strtol (hexNum, nullptr, 16);
1544 return std::make_pair (hexLong, additional_length_offset);
1547 // Parses the body of a unicode escape.
1548 std::pair<Codepoint, int>
1549 Lexer::parse_partial_unicode_escape ()
1551 skip_input ();
1552 current_char = peek_input ();
1553 int additional_length_offset = 0;
1555 if (current_char != '{')
1557 rust_error_at (get_current_location (),
1558 "unicode escape should start with %<{%>");
1559 /* Skip what should probaby have been between brackets. */
1560 while (is_x_digit (current_char.value) || current_char == '_')
1562 skip_input ();
1563 current_char = peek_input ();
1564 additional_length_offset++;
1566 return std::make_pair (Codepoint (0), additional_length_offset);
1569 skip_input ();
1570 current_char = peek_input ();
1571 additional_length_offset++;
1573 if (current_char == '_')
1575 rust_error_at (get_current_location (),
1576 "unicode escape cannot start with %<_%>");
1577 skip_input ();
1578 current_char = peek_input ();
1579 additional_length_offset++;
1580 // fallthrough and try to parse the rest anyway
1583 // parse unicode escape - 1-6 hex digits
1584 std::string num_str;
1585 num_str.reserve (6);
1587 // loop through to add entire hex number to string
1588 while (is_x_digit (current_char.value) || current_char.value == '_')
1590 if (current_char == '_')
1592 // don't add _ to number
1593 skip_input ();
1594 current_char = peek_input ();
1596 additional_length_offset++;
1598 continue;
1601 additional_length_offset++;
1603 // add raw hex numbers
1604 num_str += current_char;
1606 skip_input ();
1607 current_char = peek_input ();
1610 if (current_char == '}')
1612 skip_input ();
1613 current_char = peek_input ();
1614 additional_length_offset++;
1616 else
1618 // actually an error, but allow propagation anyway Assume that
1619 // wrong bracketm whitespace or single/double quotes are wrong
1620 // termination, otherwise it is a wrong character, then skip to the actual
1621 // terminator.
1622 // TODO use utf-8 codepoint to skip whitespaces
1623 if (current_char == '{' || is_whitespace (current_char.value)
1624 || current_char == '\'' || current_char == '"')
1626 rust_error_at (get_current_location (),
1627 "expected terminating %<}%> in unicode escape");
1628 return std::make_pair (Codepoint (0), additional_length_offset);
1630 else
1632 rust_error_at (get_current_location (),
1633 "invalid character %<%s%> in unicode escape",
1634 current_char.as_string ().c_str ());
1635 // TODO use utf-8 codepoint to skip whitespaces
1636 while (current_char != '}' && current_char != '{'
1637 && !is_whitespace (current_char.value) && current_char != '\''
1638 && current_char != '"')
1640 skip_input ();
1641 current_char = peek_input ();
1642 additional_length_offset++;
1644 // Consume the actual closing bracket if found
1645 if (current_char == '}')
1647 skip_input ();
1648 current_char = peek_input ();
1649 additional_length_offset++;
1651 return std::make_pair (Codepoint (0), additional_length_offset);
1655 // ensure 1-6 hex characters
1656 if (num_str.length () > 6 || num_str.length () < 1)
1658 rust_error_at (get_current_location (),
1659 "unicode escape should be between 1 and 6 hex "
1660 "characters; it is %lu",
1661 (unsigned long) num_str.length ());
1662 // return false;
1663 return std::make_pair (Codepoint (0), additional_length_offset);
1666 unsigned long hex_num = std::strtoul (num_str.c_str (), nullptr, 16);
1668 if (hex_num > 0xd7ff && hex_num < 0xe000)
1670 rust_error_at (
1671 get_current_location (),
1672 "unicode escape cannot be a surrogate value (D800 to DFFF)");
1673 return std::make_pair (Codepoint (0), additional_length_offset);
1676 if (hex_num > 0x10ffff)
1678 rust_error_at (get_current_location (),
1679 "unicode escape cannot be larger than 10FFFF");
1680 return std::make_pair (Codepoint (0), additional_length_offset);
1683 // return true;
1684 return std::make_pair (Codepoint (static_cast<uint32_t> (hex_num)),
1685 additional_length_offset);
1688 // Parses a byte character.
1689 TokenPtr
1690 Lexer::parse_byte_char (location_t loc)
1692 skip_input ();
1693 current_column++;
1694 // make current char the next character
1695 current_char = peek_input ();
1697 int length = 1;
1699 // char to save
1700 Codepoint byte_char = 0;
1702 // detect escapes
1703 if (current_char == '\\')
1705 auto escape_length_pair = parse_escape ('\'');
1706 byte_char = std::get<0> (escape_length_pair);
1707 length += std::get<1> (escape_length_pair);
1709 current_char = peek_input ();
1711 if (current_char != '\'')
1713 rust_error_at (get_current_location (), "unclosed %<byte char%>");
1716 skip_input ();
1717 current_char = peek_input ();
1718 length++; // go to next char
1720 else if (current_char != '\'')
1722 // otherwise, get character from direct input character
1723 byte_char = current_char;
1725 if (!byte_char.is_ascii ())
1727 rust_error_at (get_current_location (),
1728 "non-ASCII character in %<byte char%>");
1731 skip_input ();
1732 current_char = peek_input ();
1733 length++;
1735 if (current_char != '\'')
1737 rust_error_at (get_current_location (), "unclosed %<byte char%>");
1740 skip_input ();
1741 current_char = peek_input ();
1742 length++; // go to next char
1744 else
1746 rust_error_at (get_current_location (),
1747 "no character inside %<%> for %<byte char%>");
1750 current_column += length;
1752 loc += length - 1;
1753 return Token::make_byte_char (loc, byte_char.value);
1756 // Parses a byte string.
1757 TokenPtr
1758 Lexer::parse_byte_string (location_t loc)
1760 // byte string
1762 // skip quote character
1763 skip_input ();
1764 current_column++;
1766 std::string str;
1767 str.reserve (16); // some sensible default
1769 current_char = peek_input ();
1771 const location_t string_begin_locus = get_current_location ();
1773 while (current_char != '"' && !current_char.is_eof ())
1775 if (current_char == '\\')
1777 int length = 1;
1778 auto escape_length_pair = parse_escape ('"');
1779 char output_char = std::get<0> (escape_length_pair);
1781 if (output_char == 0 && std::get<2> (escape_length_pair))
1782 length = std::get<1> (escape_length_pair) - 1;
1783 else
1784 length += std::get<1> (escape_length_pair);
1786 if (output_char != 0 || !std::get<2> (escape_length_pair))
1787 str += output_char;
1789 current_column += length;
1791 continue;
1794 current_column++;
1795 if (current_char.value == '\n')
1797 current_line++;
1798 current_column = 1;
1799 // tell line_table that new line starts
1800 start_line (current_line, max_column_hint);
1803 str += current_char;
1804 skip_input ();
1805 current_char = peek_input ();
1808 if (current_char == '"')
1810 current_column++;
1812 skip_input ();
1813 current_char = peek_input ();
1815 else if (current_char.is_eof ())
1817 rust_error_at (string_begin_locus, "unended byte string literal");
1818 return Token::make (END_OF_FILE, get_current_location ());
1820 else
1822 rust_unreachable ();
1825 str.shrink_to_fit ();
1826 loc += str.size () - 1;
1828 return Token::make_byte_string (loc, std::move (str));
1831 // Parses a raw byte string.
1832 TokenPtr
1833 Lexer::parse_raw_byte_string (location_t loc)
1835 // raw byte string literals
1836 std::string str;
1837 str.reserve (16); // some sensible default
1839 int length = 1;
1840 int hash_count = 0;
1842 // get hash count at beginnning
1843 skip_input ();
1844 current_char = peek_input ();
1845 length++;
1846 while (current_char == '#')
1848 hash_count++;
1849 length++;
1851 skip_input ();
1852 current_char = peek_input ();
1855 if (current_char != '"')
1857 rust_error_at (get_current_location (),
1858 "raw byte string has no opening %<\"%>");
1861 skip_input ();
1862 current_char = peek_input ();
1863 length++;
1865 while (true)
1867 if (current_char == '"')
1869 bool enough_hashes = true;
1871 for (int i = 0; i < hash_count; i++)
1873 if (peek_input (i + 1) != '#')
1875 enough_hashes = false;
1876 break;
1880 if (enough_hashes)
1882 // skip enough input and peek enough input
1883 skip_input (hash_count);
1884 current_char = peek_input ();
1885 length += hash_count + 1;
1886 break;
1890 if (current_char.value > 127)
1892 rust_error_at (get_current_location (),
1893 "character %<%s%> in raw byte string out of range",
1894 current_char.as_string ().c_str ());
1895 current_char = 0;
1898 length++;
1900 str += current_char;
1901 skip_input ();
1902 current_char = peek_input ();
1905 current_column += length;
1907 loc += length - 1;
1909 str.shrink_to_fit ();
1911 return Token::make_byte_string (loc, std::move (str));
1914 // Parses a raw identifier.
1915 TokenPtr
1916 Lexer::parse_raw_identifier (location_t loc)
1918 // raw identifier
1919 std::string str;
1920 str.reserve (16); // default
1922 skip_input ();
1923 current_char = peek_input ();
1925 current_column += 2;
1927 bool first_is_underscore = current_char == '_';
1929 int length = 0;
1930 current_char = peek_input ();
1931 // loop through entire name
1932 while (is_identifier_continue (current_char.value))
1934 length++;
1936 str += current_char;
1937 skip_input ();
1938 current_char = peek_input ();
1941 current_column += length;
1943 rust_debug ("raw ident: %s", str.c_str ());
1945 // if just a single underscore, not an identifier
1946 if (first_is_underscore && length == 1)
1947 rust_error_at (get_current_location (),
1948 "%<_%> is not a valid raw identifier");
1950 using namespace Rust::Values;
1951 std::set<std::string> invalid{
1952 Keywords::CRATE, Keywords::EXTERN_KW, Keywords::SELF,
1953 Keywords::SUPER, Keywords::SELF_ALIAS,
1956 if (invalid.find (str) != invalid.end ())
1958 rust_error_at (get_current_location (),
1959 "%qs is a forbidden raw identifier", str.c_str ());
1961 return nullptr;
1963 else
1965 str.shrink_to_fit ();
1966 loc += length - 1;
1968 return Token::make_identifier (loc, std::move (str));
1972 // skip broken string input (unterminated strings)
1973 void
1974 Lexer::skip_broken_string_input (Codepoint current_char)
1976 while (current_char != '"' && !current_char.is_eof ())
1978 if (current_char == '\n')
1980 current_line++;
1981 current_column = 1;
1983 else
1985 current_column++;
1987 skip_input ();
1988 current_char = peek_input ();
1990 if (current_char == '"')
1992 current_column++;
1994 skip_input ();
1995 current_char = peek_input ();
1997 rust_debug ("skipped to %d:%d due to bad quotes", current_line,
1998 current_column);
2001 // Parses a string.
2002 TokenPtr
2003 Lexer::parse_string (location_t loc)
2005 std::string str;
2006 str.reserve (16); // some sensible default
2008 current_char = peek_input ();
2010 const location_t string_begin_locus = get_current_location ();
2012 // FIXME: This fails if the input ends. How do we check for EOF?
2013 while (current_char.value != '"' && !current_char.is_eof ())
2015 if (current_char.value == '\\')
2017 int length = 1;
2019 // parse escape
2020 auto utf8_escape_pair = parse_utf8_escape ();
2021 current_char = std::get<0> (utf8_escape_pair);
2023 if (current_char == Codepoint (0) && std::get<2> (utf8_escape_pair))
2024 length = std::get<1> (utf8_escape_pair) - 1;
2025 else
2026 length += std::get<1> (utf8_escape_pair);
2028 if (current_char != Codepoint (0) || !std::get<2> (utf8_escape_pair))
2029 str += current_char.as_string ();
2031 current_column += length;
2033 // FIXME: should remove this but can't.
2034 // `parse_utf8_escape` does not update `current_char` correctly.
2035 current_char = peek_input ();
2036 continue;
2039 current_column++;
2040 if (current_char.value == '\n')
2042 current_line++;
2043 current_column = 1;
2044 // tell line_table that new line starts
2045 start_line (current_line, max_column_hint);
2048 str += current_char;
2049 skip_input ();
2050 current_char = peek_input ();
2053 if (current_char.value == '"')
2055 current_column++;
2057 skip_input ();
2058 current_char = peek_input ();
2060 else if (current_char.is_eof ())
2062 rust_error_at (string_begin_locus, "unended string literal");
2063 return Token::make (END_OF_FILE, get_current_location ());
2065 else
2067 rust_unreachable ();
2070 str.shrink_to_fit ();
2072 return Token::make_string (loc, std::move (str));
2075 // Parses an identifier or keyword.
2076 TokenPtr
2077 Lexer::parse_identifier_or_keyword (location_t loc)
2079 std::string str;
2080 str.reserve (16); // default
2081 str += current_char.as_string ();
2083 bool first_is_underscore = current_char == '_';
2085 int length = 1;
2086 current_char = peek_input ();
2088 // loop through entire name
2089 while (is_identifier_continue (current_char.value))
2091 auto s = current_char.as_string ();
2092 length++;
2094 str += current_char.as_string ();
2095 skip_input ();
2096 current_char = peek_input ();
2099 current_column += length;
2101 // if just a single underscore, not an identifier
2102 if (first_is_underscore && length == 1)
2103 return Token::make (UNDERSCORE, loc);
2105 str.shrink_to_fit ();
2107 loc += length - 1;
2109 TokenId keyword = classify_keyword (str);
2110 if (keyword == IDENTIFIER)
2111 return Token::make_identifier (loc, std::move (str));
2112 else
2113 return Token::make (keyword, loc);
2116 // Possibly returns a raw string token if it exists - otherwise returns null.
2117 TokenPtr
2118 Lexer::maybe_parse_raw_string (location_t loc)
2120 int peek_index = 0;
2121 while (peek_input (peek_index) == '#')
2122 peek_index++;
2124 if (peek_input (peek_index) == '"')
2125 return parse_raw_string (loc, peek_index);
2126 else
2127 return nullptr;
2130 // Returns a raw string token.
2131 TokenPtr
2132 Lexer::parse_raw_string (location_t loc, int initial_hash_count)
2134 // raw string literals
2135 std::string str;
2136 str.reserve (16); // some sensible default
2138 int length = 1 + initial_hash_count;
2140 if (initial_hash_count > 0)
2141 skip_input (initial_hash_count - 1);
2143 current_char = peek_input ();
2145 if (current_char != '"')
2146 rust_error_at (get_current_location (), "raw string has no opening %<\"%>");
2148 length++;
2149 skip_input ();
2150 current_char = peek_input ();
2152 while (!current_char.is_eof ())
2154 if (current_char.value == '"')
2156 bool enough_hashes = true;
2158 for (int i = 0; i < initial_hash_count; i++)
2160 if (peek_input (i + 1) != '#')
2162 enough_hashes = false;
2163 break;
2167 if (enough_hashes)
2169 // skip enough input and peek enough input
2170 skip_input (initial_hash_count);
2171 current_char = peek_input ();
2172 length += initial_hash_count + 1;
2173 break;
2177 length++;
2179 str += current_char.as_string ();
2180 skip_input ();
2181 current_char = peek_input ();
2184 current_column += length;
2186 loc += length - 1;
2188 str.shrink_to_fit ();
2190 return Token::make_string (loc, std::move (str));
2193 template <typename IsDigitFunc>
2194 TokenPtr
2195 Lexer::parse_non_decimal_int_literal (location_t loc, IsDigitFunc is_digit_func,
2196 std::string existent_str, int base)
2198 int length = 1;
2200 skip_input ();
2201 current_char = peek_input ();
2203 length++;
2205 // loop through to add entire number to string
2206 while (is_digit_func (current_char.value) || current_char == '_')
2208 if (current_char == '_')
2210 // don't add _ to number
2211 skip_input ();
2212 current_char = peek_input ();
2214 length++;
2216 continue;
2219 length++;
2221 // add raw numbers
2222 existent_str += current_char;
2223 skip_input ();
2224 current_char = peek_input ();
2227 // convert value to decimal representation
2228 long dec_num = std::strtol (existent_str.c_str (), nullptr, base);
2230 existent_str = std::to_string (dec_num);
2232 // parse in type suffix if it exists
2233 auto type_suffix_pair = parse_in_type_suffix ();
2234 PrimitiveCoreType type_hint = type_suffix_pair.first;
2235 length += type_suffix_pair.second;
2237 current_column += length;
2239 if (type_hint == CORETYPE_F32 || type_hint == CORETYPE_F64)
2241 rust_error_at (get_current_location (),
2242 "invalid type suffix %qs for integer (%s) literal",
2243 get_type_hint_string (type_hint),
2244 base == 16
2245 ? "hex"
2246 : (base == 8 ? "octal"
2247 : (base == 2 ? "binary"
2248 : "<insert unknown base>")));
2249 return nullptr;
2252 loc += length - 1;
2254 return Token::make_int (loc, std::move (existent_str), type_hint);
2257 // Parses a hex, binary or octal int literal.
2258 TokenPtr
2259 Lexer::parse_non_decimal_int_literals (location_t loc)
2261 std::string str;
2262 str.reserve (16); // some sensible default
2263 str += current_char;
2265 current_char = peek_input ();
2267 if (current_char == 'x')
2269 // hex (integer only)
2270 return parse_non_decimal_int_literal (loc, is_x_digit, str + "x", 16);
2272 else if (current_char == 'o')
2274 // octal (integer only)
2275 return parse_non_decimal_int_literal (loc, is_octal_digit,
2276 std::move (str), 8);
2278 else if (current_char == 'b')
2280 // binary (integer only)
2281 return parse_non_decimal_int_literal (loc, is_bin_digit, std::move (str),
2284 else
2286 return nullptr;
2290 // Parses a decimal-based int literal or float literal.
2291 TokenPtr
2292 Lexer::parse_decimal_int_or_float (location_t loc)
2294 std::string str;
2295 str.reserve (16); // some sensible default
2296 str += current_char;
2298 int length = 1;
2299 bool first_zero = current_char == '0';
2301 current_char = peek_input ();
2303 // parse initial decimal integer (or first integer part of float) literal
2304 auto initial_decimal = parse_in_decimal ();
2305 str += std::get<0> (initial_decimal);
2306 length += std::get<1> (initial_decimal);
2308 // detect float literal
2310 // Note:
2312 // We should not use is_float_digit () for this verification but instead
2313 // directly ISDIGIT because rust does not support non digit values right after
2314 // a dot.
2315 // The following value is not legal in rust:
2316 // let a = 3.e1;
2317 // A `0` should be put between the dot and the exponent to be valid
2318 // (eg. 3.0e1).
2319 if (current_char == '.' && ISDIGIT (peek_input (1).value))
2321 // float with a '.', parse another decimal into it
2323 // add . to str
2324 str += current_char;
2325 skip_input ();
2326 current_char = peek_input ();
2327 length++;
2329 // parse another decimal number for float
2330 auto second_decimal = parse_in_decimal ();
2331 str += std::get<0> (second_decimal);
2332 length += std::get<1> (second_decimal);
2334 // parse in exponent part if it exists
2335 auto exponent_pair = parse_in_exponent_part ();
2336 str += exponent_pair.first;
2337 length += exponent_pair.second;
2339 // parse in type suffix if it exists
2340 auto type_suffix_pair = parse_in_type_suffix ();
2341 PrimitiveCoreType type_hint = type_suffix_pair.first;
2342 length += type_suffix_pair.second;
2344 if (type_hint != CORETYPE_F32 && type_hint != CORETYPE_F64
2345 && type_hint != CORETYPE_UNKNOWN)
2347 rust_error_at (get_current_location (),
2348 "invalid type suffix %qs for floating-point literal",
2349 get_type_hint_string (type_hint));
2350 // ignore invalid type suffix as everything else seems fine
2351 type_hint = CORETYPE_UNKNOWN;
2354 current_column += length;
2356 loc += length - 1;
2358 str.shrink_to_fit ();
2359 return Token::make_float (loc, std::move (str), type_hint);
2361 else if (current_char == '.'
2362 && check_valid_float_dot_end (peek_input (1).value))
2364 // float that is just an integer with a terminating '.' character
2366 // add . to str
2367 str += current_char;
2368 skip_input ();
2369 current_char = peek_input ();
2370 length++;
2372 // type hint not allowed
2374 current_column += length;
2376 loc += length - 1;
2378 str.shrink_to_fit ();
2379 return Token::make_float (loc, std::move (str), CORETYPE_UNKNOWN);
2381 else if (current_char == 'E' || current_char == 'e')
2383 // exponent float with no '.' character
2385 // parse exponent part
2386 auto exponent_pair = parse_in_exponent_part ();
2387 str += exponent_pair.first;
2388 length += exponent_pair.second;
2390 // parse in type suffix if it exists
2391 auto type_suffix_pair = parse_in_type_suffix ();
2392 PrimitiveCoreType type_hint = type_suffix_pair.first;
2393 length += type_suffix_pair.second;
2395 if (type_hint != CORETYPE_F32 && type_hint != CORETYPE_F64
2396 && type_hint != CORETYPE_UNKNOWN)
2398 rust_error_at (get_current_location (),
2399 "invalid type suffix %qs for floating-point literal",
2400 get_type_hint_string (type_hint));
2401 // ignore invalid type suffix as everything else seems fine
2402 type_hint = CORETYPE_UNKNOWN;
2405 current_column += length;
2407 loc += length - 1;
2409 str.shrink_to_fit ();
2410 return Token::make_float (loc, std::move (str), type_hint);
2412 else
2414 // is an integer
2416 // parse in type suffix if it exists
2417 auto type_suffix_pair = parse_in_type_suffix ();
2418 PrimitiveCoreType type_hint = type_suffix_pair.first;
2419 /* A "real" pure decimal doesn't have a suffix and no zero prefix. */
2420 if (type_hint == CORETYPE_UNKNOWN)
2422 bool pure_decimal = std::get<2> (initial_decimal);
2423 if (pure_decimal && (!first_zero || str.size () == 1))
2424 type_hint = CORETYPE_PURE_DECIMAL;
2426 length += type_suffix_pair.second;
2428 current_column += length;
2430 loc += length - 1;
2432 str.shrink_to_fit ();
2433 return Token::make_int (loc, std::move (str), type_hint);
2437 TokenPtr
2438 Lexer::parse_char_or_lifetime (location_t loc)
2440 int length = 1;
2442 current_char = peek_input ();
2443 if (current_char.is_eof ())
2444 return nullptr;
2446 // parse escaped char literal
2447 if (current_char.value == '\\')
2449 // parse escape
2450 auto utf8_escape_pair = parse_utf8_escape ();
2451 Codepoint escaped_char = std::get<0> (utf8_escape_pair);
2452 length += std::get<1> (utf8_escape_pair);
2454 if (peek_input ().value != '\'')
2456 rust_error_at (get_current_location (), "unended character literal");
2458 else
2460 skip_input ();
2461 current_char = peek_input ();
2462 length++;
2465 current_column += length;
2467 loc += length - 1;
2469 return Token::make_char (loc, escaped_char);
2471 else
2473 skip_input ();
2475 if (peek_input ().value == '\'')
2477 // parse non-escaped char literal
2478 Codepoint non_escaped_char = current_char;
2480 // skip the ' character
2481 skip_input ();
2482 current_char = peek_input ();
2484 // TODO fix due to different widths of utf-8 chars?
2485 current_column += 3;
2487 loc += 2;
2489 return Token::make_char (loc, non_escaped_char);
2491 else if (is_identifier_start (current_char.value))
2493 // parse lifetime name
2494 std::string str;
2495 str += current_char.as_string ();
2496 length++;
2498 current_char = peek_input ();
2499 while (is_identifier_continue (current_char.value))
2501 str += current_char.as_string ();
2502 skip_input ();
2503 current_char = peek_input ();
2504 length++;
2507 current_column += length;
2509 loc += length - 1;
2511 // TODO some keywords cannot be used for a lifetime label #2306
2512 // https://doc.rust-lang.org/reference/tokens.html
2514 str.shrink_to_fit ();
2515 return Token::make_lifetime (loc, std::move (str));
2517 else
2519 rust_error_at (
2520 get_current_location (),
2521 "expected %' after character constant in character literal");
2522 return nullptr;
2527 void
2528 Lexer::split_current_token (TokenId new_left, TokenId new_right)
2530 /* TODO: assert that this TokenId is a "simple token" like punctuation and not
2531 * like "IDENTIFIER"? */
2532 location_t current_loc = peek_token ()->get_locus ();
2533 TokenPtr new_left_tok = Token::make (new_left, current_loc);
2534 TokenPtr new_right_tok = Token::make (new_right, current_loc + 1);
2536 token_queue.replace_current_value (std::move (new_left_tok));
2537 token_queue.insert (1, std::move (new_right_tok));
2540 void
2541 Lexer::split_current_token (std::vector<TokenPtr> new_tokens)
2543 rust_assert (new_tokens.size () > 0);
2544 token_queue.replace_current_value (new_tokens[0]);
2546 for (size_t i = 1; i < new_tokens.size (); i++)
2548 token_queue.insert (i, new_tokens[i]);
2552 void
2553 Lexer::start_line (int current_line, int current_column)
2555 if (line_map)
2556 linemap_line_start (line_table, current_line, current_column);
2559 } // namespace Rust
2561 #if CHECKING_P
2563 namespace selftest {
2565 // Checks if `src` has the same contents as the given characters
2566 static void
2567 assert_source_content (Rust::InputSource &src,
2568 const std::vector<uint32_t> &expected)
2570 Rust::Codepoint src_char = src.next ();
2571 for (auto expected_char : expected)
2573 // Make sure that `src` is not shorter than `expected`
2574 ASSERT_FALSE (src_char.is_eof ());
2575 // Checks skipped character is expeceted one.
2576 ASSERT_EQ (src_char.value, expected_char);
2577 src_char = src.next ();
2579 // Checks if `src` and `chars` has the same length.
2580 ASSERT_TRUE (src_char.is_eof ());
2583 static void
2584 test_buffer_input_source (std::string str,
2585 const std::vector<uint32_t> &expected)
2587 Rust::BufferInputSource source (str, 0);
2588 assert_source_content (source, expected);
2591 static void
2592 test_file_input_source (std::string str, const std::vector<uint32_t> &expected)
2594 FILE *tmpf = tmpfile ();
2595 // Moves to the first character
2596 fputs (str.c_str (), tmpf);
2597 std::rewind (tmpf);
2598 Rust::FileInputSource source (tmpf);
2599 assert_source_content (source, expected);
2602 void
2603 rust_input_source_test ()
2605 // ASCII
2606 std::string src = u8"_abcde\tXYZ\v\f";
2607 std::vector<uint32_t> expected
2608 = {'_', 'a', 'b', 'c', 'd', 'e', '\t', 'X', 'Y', 'Z', '\v', '\f'};
2609 test_buffer_input_source (src, expected);
2611 // BOM
2612 src = u8"\xef\xbb\xbfOK";
2613 expected = {'O', 'K'};
2614 test_buffer_input_source (src, expected);
2616 // Russian
2617 src = u8"приве́т";
2618 expected = {L'п',
2619 L'р',
2620 L'и',
2621 L'в',
2622 0x0435 /* CYRILLIC SMALL LETTER IE е */,
2623 0x301 /* COMBINING ACUTE ACCENT ́ */,
2624 L'т'};
2625 test_buffer_input_source (src, expected);
2627 src = u8"❤️🦀";
2628 expected = {0x2764 /* HEAVY BLACK HEART */,
2629 0xfe0f /* VARIATION SELECTOR-16 */, L'🦀'};
2630 test_buffer_input_source (src, expected);
2632 src = u8"こんにちは";
2633 expected = {L'こ', L'ん', L'に', L'ち', L'は'};
2634 test_file_input_source (src, expected);
2636 src = u8"👮‍♂👩‍⚕";
2637 expected
2638 = {0x1f46e /* POLICE OFFICER */, 0x200d /* ZERO WIDTH JOINER */,
2639 0x2642 /* MALE SIGN */, 0x1f469 /* WOMAN */,
2640 0x200d /* ZERO WIDTH JOINER */, 0x2695 /* STAFF OF AESCULAPIUS */};
2641 test_file_input_source (src, expected);
2644 } // namespace selftest
2646 #endif // CHECKING_P