1 // Copyright (C) 2020-2024 Free Software Foundation, Inc.
3 // This file is part of GCC.
5 // GCC is free software; you can redistribute it and/or modify it under
6 // the terms of the GNU General Public License as published by the Free
7 // Software Foundation; either version 3, or (at your option) any later
10 // GCC is distributed in the hope that it will be useful, but WITHOUT ANY
11 // WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 // FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
15 // You should have received a copy of the GNU General Public License
16 // along with GCC; see the file COPYING3. If not see
17 // <http://www.gnu.org/licenses/>.
19 #include "rust-codepoint.h"
20 #include "rust-system.h"
22 #include "rust-diagnostics.h"
23 #include "rust-linemap.h"
24 #include "rust-session-manager.h"
25 #include "safe-ctype.h"
27 #include "rust-keyword-values.h"
30 // TODO: move to separate compilation unit?
31 // overload += for uint32_t to allow 32-bit encoded utf-8 to be added
33 operator+= (std::string
&str
, Codepoint char32
)
35 if (char32
.value
< 0x80)
37 str
+= static_cast<char> (char32
.value
);
39 else if (char32
.value
< (0x1F + 1) << (1 * 6))
41 str
+= static_cast<char> (0xC0 | ((char32
.value
>> 6) & 0x1F));
42 str
+= static_cast<char> (0x80 | ((char32
.value
>> 0) & 0x3F));
44 else if (char32
.value
< (0x0F + 1) << (2 * 6))
46 str
+= static_cast<char> (0xE0 | ((char32
.value
>> 12) & 0x0F));
47 str
+= static_cast<char> (0x80 | ((char32
.value
>> 6) & 0x3F));
48 str
+= static_cast<char> (0x80 | ((char32
.value
>> 0) & 0x3F));
50 else if (char32
.value
< (0x07 + 1) << (3 * 6))
52 str
+= static_cast<char> (0xF0 | ((char32
.value
>> 18) & 0x07));
53 str
+= static_cast<char> (0x80 | ((char32
.value
>> 12) & 0x3F));
54 str
+= static_cast<char> (0x80 | ((char32
.value
>> 6) & 0x3F));
55 str
+= static_cast<char> (0x80 | ((char32
.value
>> 0) & 0x3F));
59 rust_debug ("Invalid unicode codepoint found: '%u' ", char32
.value
);
65 Codepoint::as_string ()
69 // str += Codepoint (value);
75 /* Includes all allowable float digits EXCEPT _ and . as that needs lookahead
78 is_float_digit (uint32_t number
)
80 return ISDIGIT (number
) || number
== 'E' || number
== 'e';
83 /* Basically ISXDIGIT from safe-ctype but may change if Rust's encoding or
84 * whatever is different */
86 is_x_digit (uint32_t number
)
88 return ISXDIGIT (number
);
/* Returns true when NUMBER is one of the characters '0' through '7'.  */
bool
is_octal_digit (uint32_t number)
{
  return number >= '0' && number <= '7';
}
/* Returns true when NUMBER is the character '0' or '1'.  */
bool
is_bin_digit (uint32_t number)
{
  return number == '0' || number == '1';
}
104 check_valid_float_dot_end (uint32_t character
)
106 return character
!= '.' && character
!= '_' && !ISALPHA (character
);
/* Returns true when CHARACTER is one of the codepoints the Rust reference
 * lists as whitespace.  */
bool
is_whitespace (uint32_t character)
{
  // https://doc.rust-lang.org/reference/whitespace.html
  return character == '\t' || character == '\n' || character == '\v'
         || character == '\f' || character == '\r' || character == ' '
         || character == 0x0085  // next line
         || character == 0x200e  // left-to-right mark
         || character == 0x200f  // right-to-left mark
         || character == 0x2028  // line separator
         || character == 0x2029; // paragraph separator
}
/* Returns true when CHARACTER is 'x', 'o' or 'b', i.e. the letter that may
 * follow a leading '0' in a non-decimal integer literal.  */
bool
is_non_decimal_int_literal_separator (uint32_t character)
{
  return character == 'x' || character == 'o' || character == 'b';
}
129 is_identifier_start (uint32_t codepoint
)
131 return (cpp_check_xid_property (codepoint
) & CPP_XID_START
) || codepoint
== '_';
135 is_identifier_continue (uint32_t codepoint
)
137 return cpp_check_xid_property (codepoint
) & CPP_XID_CONTINUE
;
140 Lexer::Lexer (const std::string
&input
, Linemap
*linemap
)
141 : input (RAIIFile::create_error ()), current_line (1), current_column (1),
142 line_map (linemap
), dump_lex_out ({}),
143 raw_input_source (new BufferInputSource (input
, 0)),
144 input_queue
{*raw_input_source
}, token_queue (TokenSource (this))
147 Lexer::Lexer (const char *filename
, RAIIFile file_input
, Linemap
*linemap
,
148 tl::optional
<std::ofstream
&> dump_lex_opt
)
149 : input (std::move (file_input
)), current_line (1), current_column (1),
150 line_map (linemap
), dump_lex_out (dump_lex_opt
),
151 raw_input_source (new FileInputSource (input
.get_raw ())),
152 input_queue
{*raw_input_source
}, token_queue (TokenSource (this))
154 // inform line_table that file is being entered and is in line 1
156 line_map
->start_file (filename
, current_line
);
161 /* ok apparently stop (which is equivalent of original code in destructor) is
162 * meant to be called after all files have finished parsing, for cleanup. On
163 * the other hand, actual code that it calls to leave a certain line map is
164 * mentioned in GCC docs as being useful for "just leaving an included header"
165 * and stuff like that, so this line mapping functionality may need fixing.
166 * FIXME: find out whether this occurs. */
172 Lexer::input_source_is_valid_utf8 ()
174 return raw_input_source
->is_valid ();
178 Lexer::get_current_location ()
181 return linemap_position_for_column (line_table
, current_column
);
183 // If we have no linemap, we're lexing something without proper locations
184 return UNDEF_LOCATION
;
188 Lexer::peek_input (int n
)
190 return input_queue
.peek (n
);
196 return peek_input (0);
200 Lexer::skip_input (int n
)
202 input_queue
.skip (n
);
212 Lexer::skip_token (int n
)
214 // dump tokens if dump-lex option is enabled
215 if (dump_lex_out
.has_value ())
218 token_queue
.skip (n
);
/* Writes a description of the tokens being skipped to the lex dump stream
 * and skips them.  The loop runs N + 1 times; each visible iteration ends
 * with token_queue.skip (0).  For a dumped token the output contains its
 * token id string, optionally ", text=" and ", typehint=" fields when the
 * token has a string, and its location followed by a newline.
 * NOTE(review): FOUND_EOF records that END_OF_FILE was seen; how it gates
 * the dumping is not visible in this excerpt - confirm.  */
222 Lexer::dump_and_skip (int n
)
224 std::ofstream
&out
= dump_lex_out
.value ();
225 bool found_eof
= false;
227 for (int i
= 0; i
< n
+ 1; i
++)
232 found_eof
|= tok
->get_id () == Rust::END_OF_FILE
;
234 location_t loc
= tok
->get_locus ();
237 out
<< tok
->token_id_to_str ();
238 out
<< (tok
->has_str () ? (std::string (", text=") + tok
->get_str ()
239 + std::string (", typehint=")
240 + std::string (tok
->get_type_hint_str ()))
243 out
<< Linemap::location_to_string (loc
) << '\n';
246 token_queue
.skip (0);
251 Lexer::replace_current_token (TokenPtr replacement
)
253 token_queue
.replace_current_value (replacement
);
255 rust_debug ("called 'replace_current_token' - this is deprecated");
258 /* Determines whether the string passed in is a keyword or not. If it is, it
259 * returns the keyword name. */
// Looks STR up in Rust::Values::Keywords::keywords_tokens and, when found,
// takes the mapped token id.  The session's edition is then consulted so
// that edition-dependent keywords (`try`) can be treated differently in 2015.
// NOTE(review): what is returned for a missing entry, and what the 2015
// branch does, are not visible in this excerpt - confirm.
261 Lexer::classify_keyword (const std::string
&str
)
263 auto &keywords
= Rust::Values::Keywords::keywords_tokens
;
264 auto keyword
= keywords
.find (str
);
266 if (keyword
== keywords
.end ())
269 auto id
= keyword
->second
;
271 // We now have the expected token ID of the reserved keyword. However, some
272 // keywords are reserved starting in certain editions. For example, `try` is
273 // only a reserved keyword in editions >=2018. The language might gain new
274 // reserved keywords in the future.
276 // https://doc.rust-lang.org/reference/keywords.html#reserved-keywords
278 // `try` is not a reserved keyword before 2018
279 if (Session::get_instance ().options
.get_edition ()
280 == CompileOptions::Edition::E2015
/* Lexes and returns the next token from the input.
 *
 * Visible stages, in order:
 *  - at line 1, column 1, a leading "#!" is examined: whitespace and
 *    non-doc comments after it are looked past, and unless a '[' follows
 *    the line is treated as a shebang and dropped;
 *  - end of input yields an END_OF_FILE token;
 *  - whitespace and line terminators are skipped, with start_line called
 *    when a new line begins;
 *  - punctuation and operators are matched using up to a few codepoints of
 *    lookahead;
 *  - "//" and block comments are skipped, while doc comments (inner "//!",
 *    the block form with '!', and their outer counterparts) become
 *    inner/outer doc comment tokens; EOF inside a comment and an isolated
 *    CR inside a doc comment are diagnosed;
 *  - byte chars/strings, raw byte strings, raw identifiers, raw strings,
 *    identifiers/keywords, numeric literals, strings and chars/lifetimes
 *    are delegated to the parse_* helpers;
 *  - anything else is reported with rust_error_at as an unexpected
 *    character.  */
288 Lexer::build_token ()
290 // loop to go through multiple characters to build a single token
293 location_t loc
= get_current_location ();
295 current_char
= peek_input ();
299 // Must be the first thing on the first line, starting with #!
300 // But since an attribute can also start with an #! we don't count it as a
301 // shebang line when after any whitespace or comments there is a [. If it
302 // is a shebang line we simple drop the line. Otherwise we don't consume
303 // any characters and fall through to the real tokenizer.
304 if (current_line
== 1 && current_column
== 1 && current_char
== '#'
305 && peek_input () == '!')
310 Codepoint next_char
= peek_input (n
);
311 if (is_whitespace (next_char
.value
))
313 else if ((next_char
== '/' && peek_input (n
+ 1) == '/'
314 && peek_input (n
+ 2) != '!'
315 && peek_input (n
+ 2) != '/')
316 || (next_char
== '/' && peek_input (n
+ 1) == '/'
317 && peek_input (n
+ 2) == '/'
318 && peek_input (n
+ 3) == '/'))
320 // two // or four ////
321 // A single line comment
322 // (but not an inner or outer doc comment)
324 next_char
= peek_input (n
);
325 while (next_char
!= '\n' && !next_char
.is_eof ())
328 next_char
= peek_input (n
);
330 if (next_char
== '\n')
333 else if (next_char
== '/' && peek_input (n
+ 1) == '*'
334 && peek_input (n
+ 2) == '*'
335 && peek_input (n
+ 3) == '/')
340 else if (next_char
== '/' && peek_input (n
+ 1) == '*'
341 && peek_input (n
+ 2) == '*' && peek_input (n
+ 3) == '*'
342 && peek_input (n
+ 4) == '/')
347 else if ((next_char
== '/' && peek_input (n
+ 1) == '*'
348 && peek_input (n
+ 2) != '*'
349 && peek_input (n
+ 2) != '!')
350 || (next_char
== '/' && peek_input (n
+ 1) == '*'
351 && peek_input (n
+ 2) == '*'
352 && peek_input (n
+ 3) == '*'))
354 // one /* or three /***
355 // Start of a block comment
356 // (but not an inner or outer doc comment)
361 if (peek_input (n
).is_eof ())
363 else if (peek_input (n
) == '/'
364 && peek_input (n
+ 1) == '*')
369 else if (peek_input (n
) == '*'
370 && peek_input (n
+ 1) == '/')
379 else if (next_char
!= '[')
381 // definitely shebang, ignore the first line
382 while (current_char
!= '\n' && !current_char
.is_eof ())
384 current_char
= peek_input ();
391 // tell line_table that new line starts
392 start_line (current_line
, max_column_hint
);
396 break; /* Definitely not a shebang line. */
400 // return end of file token if end of file
401 if (current_char
.is_eof ())
402 return Token::make (END_OF_FILE
, loc
);
404 // if not end of file, start tokenising
405 switch (current_char
.value
)
407 /* ignore whitespace characters for tokens but continue updating
409 case '\n': // newline
410 case 0x0085: // next line
411 case 0x2028: // line separator
412 case 0x2029: // paragraph separator
415 // tell line_table that new line starts
416 start_line (current_line
, max_column_hint
);
419 // Ignore, we expect a newline (lf) soon.
424 case '\t': // horizontal tab
425 // width of a tab is not well-defined, assume 8 spaces
428 case '\v': // vertical tab
429 case 0x000c: // form feed
430 case 0x200e: // left-to-right mark
431 case 0x200f: // right-to-left mark
435 // punctuation - actual tokens
437 if (peek_input () == '>')
444 return Token::make (MATCH_ARROW
, loc
);
446 else if (peek_input () == '=')
453 return Token::make (EQUAL_EQUAL
, loc
);
457 // assignment operator
459 return Token::make (EQUAL
, loc
);
463 return Token::make (LEFT_PAREN
, loc
);
465 if (peek_input () == '>')
467 // return type specifier
472 return Token::make (RETURN_TYPE
, loc
);
474 else if (peek_input () == '=')
481 return Token::make (MINUS_EQ
, loc
);
487 return Token::make (MINUS
, loc
);
490 if (peek_input () == '=')
497 return Token::make (PLUS_EQ
, loc
);
503 return Token::make (PLUS
, loc
);
507 return Token::make (RIGHT_PAREN
, loc
);
510 return Token::make (SEMICOLON
, loc
);
512 if (peek_input () == '=')
514 // multiplication-assign
519 return Token::make (ASTERISK_EQ
, loc
);
525 return Token::make (ASTERISK
, loc
);
529 return Token::make (COMMA
, loc
);
531 if (peek_input () == '=')
538 return Token::make (DIV_EQ
, loc
);
540 else if ((peek_input () == '/' && peek_input (1) != '!'
541 && peek_input (1) != '/')
542 || (peek_input () == '/' && peek_input (1) == '/'
543 && peek_input (2) == '/'))
545 // two // or four ////
546 // single line comment
547 // (but not an inner or outer doc comment)
550 current_char
= peek_input ();
552 // basically ignore until line finishes
553 while (current_char
!= '\n' && !current_char
.is_eof ())
556 current_column
++; // not used
557 current_char
= peek_input ();
561 else if (peek_input () == '/'
562 && (peek_input (1) == '!' || peek_input (1) == '/'))
564 /* single line doc comment, inner or outer. */
565 bool is_inner
= peek_input (1) == '!';
571 current_char
= peek_input ();
572 while (current_char
!= '\n')
575 if (current_char
== '\r')
577 Codepoint next_char
= peek_input ();
578 if (next_char
== '\n')
584 loc
, "Isolated CR %<\\r%> not allowed in doc comment");
585 current_char
= next_char
;
588 if (current_char
.is_eof ())
591 loc
, "unexpected EOF while looking for end of comment");
595 current_char
= peek_input ();
600 // tell line_table that new line starts
601 start_line (current_line
, max_column_hint
);
603 str
.shrink_to_fit ();
605 loc
+= str
.size () - 1;
607 return Token::make_inner_doc_comment (loc
, std::move (str
));
609 return Token::make_outer_doc_comment (loc
, std::move (str
));
611 else if (peek_input () == '*' && peek_input (1) == '*'
612 && peek_input (2) == '/')
619 else if (peek_input () == '*' && peek_input (1) == '*'
620 && peek_input (2) == '*' && peek_input (3) == '/')
627 else if ((peek_input () == '*' && peek_input (1) != '!'
628 && peek_input (1) != '*')
629 || (peek_input () == '*' && peek_input (1) == '*'
630 && peek_input (2) == '*'))
632 // one /* or three /***
634 // (but not an inner or outer doc comment)
641 current_char
= peek_input ();
643 if (current_char
.is_eof ())
646 loc
, "unexpected EOF while looking for end of comment");
651 if (current_char
== '/' && peek_input (1) == '*')
653 // skip /* characters
662 // ignore until */ is found
663 if (current_char
== '*' && peek_input (1) == '/')
665 // skip */ characters
674 if (current_char
== '\n')
679 // tell line_table that new line starts
680 start_line (current_line
, max_column_hint
);
691 else if (peek_input () == '*'
692 && (peek_input (1) == '!' || peek_input (1) == '*'))
694 // block doc comment, inner /*! or outer /**
695 bool is_inner
= peek_input (1) == '!';
705 current_char
= peek_input ();
707 if (current_char
.is_eof ())
710 loc
, "unexpected EOF while looking for end of comment");
715 if (current_char
== '/' && peek_input (1) == '*')
717 // skip /* characters
726 // ignore until */ is found
727 if (current_char
== '*' && peek_input (1) == '/')
729 // skip */ characters
739 if (current_char
== '\r' && peek_input (1) != '\n')
741 loc
, "Isolated CR %<\\r%> not allowed in doc comment");
743 if (current_char
== '\n')
748 // tell line_table that new line starts
749 start_line (current_line
, max_column_hint
);
759 str
.shrink_to_fit ();
761 loc
+= str
.size () - 1;
763 return Token::make_inner_doc_comment (loc
, std::move (str
));
765 return Token::make_outer_doc_comment (loc
, std::move (str
));
771 return Token::make (DIV
, loc
);
774 if (peek_input () == '=')
781 return Token::make (PERCENT_EQ
, loc
);
787 return Token::make (PERCENT
, loc
);
790 if (peek_input () == '=')
797 return Token::make (CARET_EQ
, loc
);
803 return Token::make (CARET
, loc
);
806 if (peek_input () == '<')
808 if (peek_input (1) == '=')
815 return Token::make (LEFT_SHIFT_EQ
, loc
);
824 return Token::make (LEFT_SHIFT
, loc
);
827 else if (peek_input () == '=')
829 // smaller than or equal to
834 return Token::make (LESS_OR_EQUAL
, loc
);
840 return Token::make (LEFT_ANGLE
, loc
);
844 if (peek_input () == '>')
846 if (peek_input (1) == '=')
848 // right-shift-assign
853 return Token::make (RIGHT_SHIFT_EQ
, loc
);
862 return Token::make (RIGHT_SHIFT
, loc
);
865 else if (peek_input () == '=')
867 // larger than or equal to
872 return Token::make (GREATER_OR_EQUAL
, loc
);
878 return Token::make (RIGHT_ANGLE
, loc
);
881 if (peek_input () == ':')
883 // scope resolution ::
888 return Token::make (SCOPE_RESOLUTION
, loc
);
894 return Token::make (COLON
, loc
);
897 // no special handling for macros in lexer?
898 if (peek_input () == '=')
900 // not equal boolean operator
905 return Token::make (NOT_EQUAL
, loc
);
909 // not equal unary operator
912 return Token::make (EXCLAM
, loc
);
916 return Token::make (QUESTION_MARK
, loc
);
919 return Token::make (HASH
, loc
);
922 return Token::make (LEFT_SQUARE
, loc
);
925 return Token::make (RIGHT_SQUARE
, loc
);
928 return Token::make (LEFT_CURLY
, loc
);
931 return Token::make (RIGHT_CURLY
, loc
);
934 return Token::make (PATTERN_BIND
, loc
);
937 return Token::make (DOLLAR_SIGN
, loc
);
940 return Token::make (TILDE
, loc
);
943 return Token::make (BACKSLASH
, loc
);
946 return Token::make (BACKTICK
, loc
);
948 if (peek_input () == '=')
950 // bitwise or-assign?
955 return Token::make (PIPE_EQ
, loc
);
957 else if (peek_input () == '|')
964 return Token::make (OR
, loc
);
971 return Token::make (PIPE
, loc
);
974 if (peek_input () == '=')
976 // bitwise and-assign?
981 return Token::make (AMP_EQ
, loc
);
983 else if (peek_input () == '&')
990 return Token::make (LOGICAL_AND
, loc
);
994 // bitwise and/reference
997 return Token::make (AMP
, loc
);
1000 if (peek_input () == '.')
1002 if (peek_input (1) == '.')
1006 current_column
+= 3;
1009 return Token::make (ELLIPSIS
, loc
);
1011 else if (peek_input (1) == '=')
1015 current_column
+= 3;
1018 return Token::make (DOT_DOT_EQ
, loc
);
1024 current_column
+= 2;
1027 return Token::make (DOT_DOT
, loc
);
1030 else /*if (!ISDIGIT (peek_input ()))*/
1033 // Only if followed by a non-number - otherwise is float
1034 // nope, float cannot start with '.'.
1036 return Token::make (DOT
, loc
);
1039 // TODO: special handling of _ in the lexer? instead of being identifier
1041 // byte character, byte string and raw byte string literals
1042 if (current_char
== 'b')
1044 if (peek_input () == '\'')
1045 return parse_byte_char (loc
);
1046 else if (peek_input () == '"')
1047 return parse_byte_string (loc
);
1048 else if (peek_input () == 'r'
1049 && (peek_input (1) == '#' || peek_input (1) == '"'))
1050 return parse_raw_byte_string (loc
);
1053 // raw identifiers and raw strings
1054 if (current_char
== 'r')
1056 Codepoint peek
= peek_input ();
1057 Codepoint peek1
= peek_input (1);
1059 // TODO (tamaron) parse Unicode ident
1060 if (peek
== '#' && is_identifier_start (peek1
.value
))
1062 TokenPtr raw_ident_ptr
= parse_raw_identifier (loc
);
1063 if (raw_ident_ptr
!= nullptr)
1064 return raw_ident_ptr
;
1066 continue; /* input got parsed, it just wasn't valid. An error
1071 TokenPtr maybe_raw_string_ptr
= maybe_parse_raw_string (loc
);
1072 if (maybe_raw_string_ptr
!= nullptr)
1073 return maybe_raw_string_ptr
;
1077 // find identifiers and keywords.
1078 if (is_identifier_start (current_char
.value
))
1079 return parse_identifier_or_keyword (loc
);
1081 // int and float literals
1082 if (ISDIGIT (current_char
.value
))
1083 { // _ not allowed as first char
1084 if (current_char
== '0'
1085 && is_non_decimal_int_literal_separator (peek_input ().value
))
1087 // handle binary, octal, hex literals
1088 TokenPtr non_dec_int_lit_ptr
1089 = parse_non_decimal_int_literals (loc
);
1090 if (non_dec_int_lit_ptr
!= nullptr)
1091 return non_dec_int_lit_ptr
;
1095 // handle decimals (integer or float)
1096 TokenPtr decimal_or_float_ptr
= parse_decimal_int_or_float (loc
);
1097 if (decimal_or_float_ptr
!= nullptr)
1098 return decimal_or_float_ptr
;
1103 if (current_char
== '"')
1104 return parse_string (loc
);
1106 // char literals and lifetime names
1107 if (current_char
== '\'')
1109 TokenPtr char_or_lifetime_ptr
= parse_char_or_lifetime (loc
);
1110 if (char_or_lifetime_ptr
!= nullptr)
1111 return char_or_lifetime_ptr
;
1114 // DEBUG: check for specific character problems:
1115 if (current_char
== '0')
1116 rust_debug ("'0' uncaught before unexpected character");
1117 else if (current_char
== ']')
1118 rust_debug ("']' uncaught before unexpected character");
1119 else if (current_char
== 0x5d)
1120 rust_debug ("whatever 0x5d is (not '0' or ']') uncaught before "
1121 "unexpected character");
1123 // didn't match anything so error
1124 rust_error_at (loc
, "unexpected character %<%x%>", current_char
.value
);
1129 // Parses in a type suffix.
1130 std::pair
<PrimitiveCoreType
, int>
1131 Lexer::parse_in_type_suffix ()
1136 int additional_length_offset
= 0;
1139 while (ISALPHA (current_char
.value
) || ISDIGIT (current_char
.value
)
1140 || current_char
== '_')
1142 if (current_char
== '_')
1144 // don't add _ to suffix
1146 current_char
= peek_input ();
1148 additional_length_offset
++;
1153 additional_length_offset
++;
1155 suffix
+= current_char
;
1157 current_char
= peek_input ();
1160 if (suffix
.empty ())
1162 // no type suffix: do nothing but also no error
1163 return std::make_pair (CORETYPE_UNKNOWN
, additional_length_offset
);
1165 else if (suffix
== "f32")
1167 return std::make_pair (CORETYPE_F32
, additional_length_offset
);
1169 else if (suffix
== "f64")
1171 return std::make_pair (CORETYPE_F64
, additional_length_offset
);
1173 else if (suffix
== "i8")
1175 return std::make_pair (CORETYPE_I8
, additional_length_offset
);
1177 else if (suffix
== "i16")
1179 return std::make_pair (CORETYPE_I16
, additional_length_offset
);
1181 else if (suffix
== "i32")
1183 return std::make_pair (CORETYPE_I32
, additional_length_offset
);
1185 else if (suffix
== "i64")
1187 return std::make_pair (CORETYPE_I64
, additional_length_offset
);
1189 else if (suffix
== "i128")
1191 return std::make_pair (CORETYPE_I128
, additional_length_offset
);
1193 else if (suffix
== "isize")
1195 return std::make_pair (CORETYPE_ISIZE
, additional_length_offset
);
1197 else if (suffix
== "u8")
1199 return std::make_pair (CORETYPE_U8
, additional_length_offset
);
1201 else if (suffix
== "u16")
1203 return std::make_pair (CORETYPE_U16
, additional_length_offset
);
1205 else if (suffix
== "u32")
1207 return std::make_pair (CORETYPE_U32
, additional_length_offset
);
1209 else if (suffix
== "u64")
1211 return std::make_pair (CORETYPE_U64
, additional_length_offset
);
1213 else if (suffix
== "u128")
1215 return std::make_pair (CORETYPE_U128
, additional_length_offset
);
1217 else if (suffix
== "usize")
1219 return std::make_pair (CORETYPE_USIZE
, additional_length_offset
);
1223 rust_error_at (get_current_location (), "unknown number suffix %qs",
1226 return std::make_pair (CORETYPE_UNKNOWN
, additional_length_offset
);
1230 // Parses in the exponent part (if any) of a float literal.
1231 std::pair
<std::string
, int>
1232 Lexer::parse_in_exponent_part ()
1234 int additional_length_offset
= 0;
1236 if (current_char
== 'E' || current_char
== 'e')
1238 // add exponent to string as strtod works with it
1239 str
+= current_char
;
1241 current_char
= peek_input ();
1243 additional_length_offset
++;
1245 // special - and + handling
1246 if (current_char
== '-')
1251 current_char
= peek_input ();
1253 additional_length_offset
++;
1255 else if (current_char
== '+')
1257 // don't add + but still skip input
1259 current_char
= peek_input ();
1261 additional_length_offset
++;
1264 // parse another decimal number for exponent
1265 auto str_length
= parse_in_decimal ();
1266 str
+= std::get
<0> (str_length
);
1267 additional_length_offset
+= std::get
<1> (str_length
);
1269 return std::make_pair (str
, additional_length_offset
);
1272 // Parses a decimal integer.
1273 std::tuple
<std::string
, int, bool>
1274 Lexer::parse_in_decimal ()
1276 /* A pure decimal contains only digits. */
1277 bool pure_decimal
= true;
1278 int additional_length_offset
= 0;
1280 while (ISDIGIT (current_char
.value
) || current_char
.value
== '_')
1282 if (current_char
== '_')
1284 pure_decimal
= false;
1285 // don't add _ to number
1287 current_char
= peek_input ();
1289 additional_length_offset
++;
1294 additional_length_offset
++;
1296 str
+= current_char
;
1298 current_char
= peek_input ();
1300 return std::make_tuple (str
, additional_length_offset
, pure_decimal
);
1303 /* Parses escapes (and string continues) in "byte" strings and characters. Does
1304 * not support unicode. */
1305 std::tuple
<char, int, bool>
1306 Lexer::parse_escape (char opening_char
)
1308 int additional_length_offset
= 0;
1309 char output_char
= 0;
1311 // skip to actual letter
1313 current_char
= peek_input ();
1314 additional_length_offset
++;
1316 switch (current_char
.value
)
1319 auto hex_escape_pair
= parse_partial_hex_escape ();
1320 long hexLong
= hex_escape_pair
.first
;
1321 additional_length_offset
+= hex_escape_pair
.second
;
1323 if (hexLong
> 255 || hexLong
< 0)
1325 get_current_location (),
1326 "byte \\x escape %<\\x%x%> out of range - allows up to %<\\xFF%>",
1327 static_cast<unsigned int> (hexLong
));
1328 /* TODO: restore capital for escape output - gcc pretty-printer doesn't
1329 * support %X directly */
1330 char hexChar
= static_cast<char> (hexLong
);
1332 output_char
= hexChar
;
1357 rust_error_at (get_current_location (),
1358 "cannot have a unicode escape \\u in a byte %s",
1359 opening_char
== '\'' ? "character" : "string");
1360 // Try to parse it anyway, just to skip it
1361 parse_partial_unicode_escape ();
1362 return std::make_tuple (output_char
, additional_length_offset
, false);
1366 return std::make_tuple (0, parse_partial_string_continue (), true);
1368 rust_error_at (get_current_location (),
1369 "unknown escape sequence %<\\%s%>",
1370 current_char
.as_string ().c_str ());
1371 // returns false if no parsing could be done
1373 return std::make_tuple (output_char
, additional_length_offset
, false);
1376 // all non-special cases (string continue) should skip their used char
1378 current_char
= peek_input ();
1379 additional_length_offset
++;
1381 // returns true if parsing was successful
1383 return std::make_tuple (output_char
, additional_length_offset
, false);
1386 /* Parses an escape (or string continue) in a string or character. Supports
1387 * unicode escapes. */
1388 std::tuple
<Codepoint
, int, bool>
1389 Lexer::parse_utf8_escape ()
1391 Codepoint output_char
;
1392 int additional_length_offset
= 0;
1394 // skip to actual letter
1396 current_char
= peek_input ();
1397 additional_length_offset
++;
1399 switch (current_char
.value
)
1402 auto hex_escape_pair
= parse_partial_hex_escape ();
1403 long hexLong
= hex_escape_pair
.first
;
1404 additional_length_offset
+= hex_escape_pair
.second
;
1406 if (hexLong
> 127 || hexLong
< 0)
1408 get_current_location (),
1409 "ascii \\x escape %<\\x%x%> out of range - allows up to %<\\x7F%>",
1410 static_cast<unsigned int> (hexLong
));
1411 /* TODO: restore capital for escape output - gcc pretty-printer doesn't
1412 * support %X directly */
1413 char hexChar
= static_cast<char> (hexLong
);
1415 output_char
= hexChar
;
1440 auto unicode_escape_pair
= parse_partial_unicode_escape ();
1441 output_char
= unicode_escape_pair
.first
;
1442 additional_length_offset
+= unicode_escape_pair
.second
;
1444 return std::make_tuple (output_char
, additional_length_offset
, false);
1450 return std::make_tuple (0, parse_partial_string_continue (), true);
1452 rust_error_at (get_current_location (),
1453 "unknown escape sequence %<\\%s%>",
1454 current_char
.as_string ().c_str ());
1455 // returns false if no parsing could be done
1457 return std::make_tuple (output_char
, additional_length_offset
, false);
1460 /* all non-special cases (unicode, string continue) should skip their used
1463 current_char
= peek_input ();
1464 additional_length_offset
++;
1466 // returns true if parsing was successful
1468 return std::make_tuple (output_char
, additional_length_offset
, false);
1471 // Parses the body of a string continue that has been found in an escape.
1473 Lexer::parse_partial_string_continue ()
1475 int additional_length_offset
= 1;
1478 // TODO use utf-8 codepoint to skip whitespaces
1479 while (is_whitespace (current_char
.value
))
1481 if (current_char
== '\n')
1485 // tell line_table that new line starts
1486 start_line (current_line
, max_column_hint
);
1489 additional_length_offset
= 1;
1493 current_char
= peek_input ();
1499 current_char
= peek_input ();
1500 additional_length_offset
++;
1503 return additional_length_offset
;
1506 /* Parses the body of a '\x' escape. Note that it does not check that the number
1507 * is valid and smaller than 255. */
1508 std::pair
<long, int>
1509 Lexer::parse_partial_hex_escape ()
1511 // hex char string (null-terminated)
1512 char hexNum
[3] = {0, 0, 0};
1515 current_char
= peek_input (1);
1516 int additional_length_offset
= 1;
1518 if (!is_x_digit (current_char
.value
))
1520 rust_error_at (get_current_location (),
1521 "invalid character %<\\x%s%> in \\x sequence",
1522 current_char
.as_string ().c_str ());
1523 return std::make_pair (0, 0);
1525 hexNum
[0] = current_char
.value
;
1529 current_char
= peek_input (1);
1530 additional_length_offset
++;
1532 if (!is_x_digit (current_char
.value
))
1534 rust_error_at (get_current_location (),
1535 "invalid character %<\\x%c%s%> in \\x sequence", hexNum
[0],
1536 current_char
.as_string ().c_str ());
1537 return std::make_pair (0, 1);
1540 hexNum
[1] = current_char
.value
;
1542 long hexLong
= std::strtol (hexNum
, nullptr, 16);
1544 return std::make_pair (hexLong
, additional_length_offset
);
1547 // Parses the body of a unicode escape.
1548 std::pair
<Codepoint
, int>
1549 Lexer::parse_partial_unicode_escape ()
1552 current_char
= peek_input ();
1553 int additional_length_offset
= 0;
1555 if (current_char
!= '{')
1557 rust_error_at (get_current_location (),
1558 "unicode escape should start with %<{%>");
1559 /* Skip what should probaby have been between brackets. */
1560 while (is_x_digit (current_char
.value
) || current_char
== '_')
1563 current_char
= peek_input ();
1564 additional_length_offset
++;
1566 return std::make_pair (Codepoint (0), additional_length_offset
);
1570 current_char
= peek_input ();
1571 additional_length_offset
++;
1573 if (current_char
== '_')
1575 rust_error_at (get_current_location (),
1576 "unicode escape cannot start with %<_%>");
1578 current_char
= peek_input ();
1579 additional_length_offset
++;
1580 // fallthrough and try to parse the rest anyway
1583 // parse unicode escape - 1-6 hex digits
1584 std::string num_str
;
1585 num_str
.reserve (6);
1587 // loop through to add entire hex number to string
1588 while (is_x_digit (current_char
.value
) || current_char
.value
== '_')
1590 if (current_char
== '_')
1592 // don't add _ to number
1594 current_char
= peek_input ();
1596 additional_length_offset
++;
1601 additional_length_offset
++;
1603 // add raw hex numbers
1604 num_str
+= current_char
;
1607 current_char
= peek_input ();
1610 if (current_char
== '}')
1613 current_char
= peek_input ();
1614 additional_length_offset
++;
1618 // actually an error, but allow propagation anyway Assume that
1619 // wrong bracketm whitespace or single/double quotes are wrong
1620 // termination, otherwise it is a wrong character, then skip to the actual
1622 // TODO use utf-8 codepoint to skip whitespaces
1623 if (current_char
== '{' || is_whitespace (current_char
.value
)
1624 || current_char
== '\'' || current_char
== '"')
1626 rust_error_at (get_current_location (),
1627 "expected terminating %<}%> in unicode escape");
1628 return std::make_pair (Codepoint (0), additional_length_offset
);
1632 rust_error_at (get_current_location (),
1633 "invalid character %<%s%> in unicode escape",
1634 current_char
.as_string ().c_str ());
1635 // TODO use utf-8 codepoint to skip whitespaces
1636 while (current_char
!= '}' && current_char
!= '{'
1637 && !is_whitespace (current_char
.value
) && current_char
!= '\''
1638 && current_char
!= '"')
1641 current_char
= peek_input ();
1642 additional_length_offset
++;
1644 // Consume the actual closing bracket if found
1645 if (current_char
== '}')
1648 current_char
= peek_input ();
1649 additional_length_offset
++;
1651 return std::make_pair (Codepoint (0), additional_length_offset
);
1655 // ensure 1-6 hex characters
1656 if (num_str
.length () > 6 || num_str
.length () < 1)
1658 rust_error_at (get_current_location (),
1659 "unicode escape should be between 1 and 6 hex "
1660 "characters; it is %lu",
1661 (unsigned long) num_str
.length ());
1663 return std::make_pair (Codepoint (0), additional_length_offset
);
1666 unsigned long hex_num
= std::strtoul (num_str
.c_str (), nullptr, 16);
1668 if (hex_num
> 0xd7ff && hex_num
< 0xe000)
1671 get_current_location (),
1672 "unicode escape cannot be a surrogate value (D800 to DFFF)");
1673 return std::make_pair (Codepoint (0), additional_length_offset
);
1676 if (hex_num
> 0x10ffff)
1678 rust_error_at (get_current_location (),
1679 "unicode escape cannot be larger than 10FFFF");
1680 return std::make_pair (Codepoint (0), additional_length_offset
);
1684 return std::make_pair (Codepoint (static_cast<uint32_t> (hex_num
)),
1685 additional_length_offset
);
1688 // Parses a byte character.
// Reads the character that follows the opening quote and builds the token
// with Token::make_byte_char (loc, byte_char.value).  The value comes either
// from parse_escape (when the character is a backslash) or from the input
// character itself; in both cases a closing single quote must follow.
// NOTE(review): some short lines of this function (braces, `else`, possibly
// short statements) are not visible in this listing; the comments below
// describe the visible statements only.
1690 Lexer::parse_byte_char (location_t loc
)
1694 // make current char the next character
1695 current_char
= peek_input ();
// Decoded value of the literal; stays 0 unless one of the branches sets it.
1700 Codepoint byte_char
= 0;
// Escaped form: parse_escape is told that a single quote terminates the
// literal.  Element 0 of its result is the decoded value and element 1 is
// added to `length` (presumably the width of the escape - confirm against
// parse_escape).
1703 if (current_char
== '\\')
1705 auto escape_length_pair
= parse_escape ('\'');
1706 byte_char
= std::get
<0> (escape_length_pair
);
1707 length
+= std::get
<1> (escape_length_pair
);
1709 current_char
= peek_input ();
// After the escape the very next character has to be the closing quote.
1711 if (current_char
!= '\'')
1713 rust_error_at (get_current_location (), "unclosed %<byte char%>");
1717 current_char
= peek_input ();
1718 length
++; // go to next char
// Unescaped form: anything other than an immediate closing quote.
1720 else if (current_char
!= '\'')
1722 // otherwise, get character from direct input character
1723 byte_char
= current_char
;
// A byte literal can only hold an ASCII character.
1725 if (!byte_char
.is_ascii ())
1727 rust_error_at (get_current_location (),
1728 "non-ASCII character in %<byte char%>");
1732 current_char
= peek_input ();
1735 if (current_char
!= '\'')
1737 rust_error_at (get_current_location (), "unclosed %<byte char%>");
1741 current_char
= peek_input ();
1742 length
++; // go to next char
// Presumably the remaining case (closing quote right after the opening one);
// the line introducing this branch is not visible here.
1746 rust_error_at (get_current_location (),
1747 "no character inside %<%> for %<byte char%>");
// Account for everything consumed by this literal in the column counter.
1750 current_column
+= length
;
1753 return Token::make_byte_char (loc
, byte_char
.value
);
1756 // Parses a byte string.
// Accumulates characters into `str` until a closing double quote or end of
// input, decoding backslash escapes through parse_escape, and returns
// Token::make_byte_string (loc, str).  Reaching end of input first reports
// "unended byte string literal" at the literal's start and returns an
// END_OF_FILE token instead.
// NOTE(review): some short lines of this function (braces, `else`, possibly
// short statements) are not visible in this listing; the comments below
// describe the visible statements only.
1758 Lexer::parse_byte_string (location_t loc
)
1762 // skip quote character
1767 str
.reserve (16); // some sensible default
1769 current_char
= peek_input ();
// Remembered so that an unterminated literal is reported where it started.
1771 const location_t string_begin_locus
= get_current_location ();
1773 while (current_char
!= '"' && !current_char
.is_eof ())
1775 if (current_char
== '\\')
// Element 0: decoded character, element 1: a length value, element 2: a
// flag.  Presumably the flag marks an escape that contributes no character
// to the string - confirm against parse_escape.
1778 auto escape_length_pair
= parse_escape ('"');
1779 char output_char
= std::get
<0> (escape_length_pair
);
// A zero character with the flag set replaces `length`; the next visible
// statement adds to it instead (presumably the alternative branch).
1781 if (output_char
== 0 && std::get
<2> (escape_length_pair
))
1782 length
= std::get
<1> (escape_length_pair
) - 1;
1784 length
+= std::get
<1> (escape_length_pair
);
// Guard for a statement that is not visible here; judging by the condition
// it covers every escape except the flagged zero one.
1786 if (output_char
!= 0 || !std::get
<2> (escape_length_pair
))
1789 current_column
+= length
;
// Literal newlines are allowed inside the string, so line bookkeeping has to
// be kept up to date.
1795 if (current_char
.value
== '\n')
1799 // tell line_table that new line starts
1800 start_line (current_line
, max_column_hint
);
1803 str
+= current_char
;
1805 current_char
= peek_input ();
// The loop ends on either a closing quote or end of input.
1808 if (current_char
== '"')
1813 current_char
= peek_input ();
1815 else if (current_char
.is_eof ())
1817 rust_error_at (string_begin_locus
, "unended byte string literal");
1818 return Token::make (END_OF_FILE
, get_current_location ());
// Presumably the final else: the loop condition allows no third outcome.
1822 rust_unreachable ();
1825 str
.shrink_to_fit ();
// NOTE(review): if `str` is empty, `str.size () - 1` wraps around when size ()
// is unsigned (as for std::string) - confirm that this is intended.
1826 loc
+= str
.size () - 1;
1828 return Token::make_byte_string (loc
, std::move (str
));
1831 // Parses a raw byte string.
// Counts the leading hash characters, requires an opening double quote, then
// collects characters into `str` without any escape processing.  A double
// quote followed by `hash_count` hashes is treated as the terminator.
// Characters with a value above 127 are reported as out of range.  Returns
// Token::make_byte_string (loc, str).
// NOTE(review): some short lines of this function (braces, loop header of the
// main scan, `break`/`else`, possibly short statements) are not visible in
// this listing; the comments below describe the visible statements only.
1833 Lexer::parse_raw_byte_string (location_t loc
)
1835 // raw byte string literals
1837 str
.reserve (16); // some sensible default
1842 // get hash count at beginning
1844 current_char
= peek_input ();
1846 while (current_char
== '#')
1852 current_char
= peek_input ();
// After the hashes the literal must open with a double quote.
1855 if (current_char
!= '"')
1857 rust_error_at (get_current_location (),
1858 "raw byte string has no opening %<\"%>");
1862 current_char
= peek_input ();
// A double quote only closes the literal when it is followed by as many
// hashes as opened it, so look ahead without consuming anything.
1867 if (current_char
== '"')
1869 bool enough_hashes
= true;
1871 for (int i
= 0; i
< hash_count
; i
++)
1873 if (peek_input (i
+ 1) != '#')
1875 enough_hashes
= false;
1882 // skip enough input and peek enough input
1883 skip_input (hash_count
);
1884 current_char
= peek_input ();
// The closing quote plus its hashes.
1885 length
+= hash_count
+ 1;
// Only 7-bit values are accepted in a raw byte string.
1890 if (current_char
.value
> 127)
1892 rust_error_at (get_current_location (),
1893 "character %<%s%> in raw byte string out of range",
1894 current_char
.as_string ().c_str ());
1900 str
+= current_char
;
1902 current_char
= peek_input ();
1905 current_column
+= length
;
1909 str
.shrink_to_fit ();
1911 return Token::make_byte_string (loc
, std::move (str
));
1914 // Parses a raw identifier.
// Collects identifier-continue characters into `str`, rejects a lone
// underscore and a fixed set of keywords that may not be used as raw
// identifiers, and otherwise returns Token::make_identifier (loc, str).
// NOTE(review): some short lines of this function (braces, `else`, possibly
// short statements and early returns) are not visible in this listing; the
// comments below describe the visible statements only.
1916 Lexer::parse_raw_identifier (location_t loc
)
1920 str
.reserve (16); // default
1923 current_char
= peek_input ();
// Presumably accounts for the two-character raw-identifier prefix - confirm
// against the caller.
1925 current_column
+= 2;
1927 bool first_is_underscore
= current_char
== '_';
1930 current_char
= peek_input ();
1931 // loop through entire name
1932 while (is_identifier_continue (current_char
.value
))
1936 str
+= current_char
;
1938 current_char
= peek_input ();
1941 current_column
+= length
;
1943 rust_debug ("raw ident: %s", str
.c_str ());
1945 // if just a single underscore, not an identifier
1946 if (first_is_underscore
&& length
== 1)
1947 rust_error_at (get_current_location (),
1948 "%<_%> is not a valid raw identifier");
// Keywords that are rejected as raw identifiers.
// NOTE(review): this set is constructed on every call; it could be built once.
1950 using namespace Rust::Values
;
1951 std::set
<std::string
> invalid
{
1952 Keywords::CRATE
, Keywords::EXTERN_KW
, Keywords::SELF
,
1953 Keywords::SUPER
, Keywords::SELF_ALIAS
,
1956 if (invalid
.find (str
) != invalid
.end ())
1958 rust_error_at (get_current_location (),
1959 "%qs is a forbidden raw identifier", str
.c_str ());
1965 str
.shrink_to_fit ();
1968 return Token::make_identifier (loc
, std::move (str
));
1972 // skip broken string input (unterminated strings)
// Advances through the input until a double quote or end of input is seen,
// then logs the position that was reached.
// NOTE(review): the parameter is a by-value Codepoint named `current_char`,
// so the assignments below update that local copy.  Other methods in this
// file assign an identically named variable without declaring it - if that
// is a member, it is shadowed here; confirm this is intended.
// NOTE(review): some short lines of this function (braces, possibly short
// statements, the tail of the rust_debug call) are not visible in this
// listing.
1974 Lexer::skip_broken_string_input (Codepoint current_char
)
1976 while (current_char
!= '"' && !current_char
.is_eof ())
// Newlines get special handling; the statements of this branch are not
// visible here.
1978 if (current_char
== '\n')
1988 current_char
= peek_input ();
// Step past the quote that ended the scan, if there was one.
1990 if (current_char
== '"')
1995 current_char
= peek_input ();
1997 rust_debug ("skipped to %d:%d due to bad quotes", current_line
,
// Parses a string literal.
// Accumulates characters into `str` until a closing double quote or end of
// input, decoding backslash escapes through parse_utf8_escape, and returns
// Token::make_string (loc, str).  Reaching end of input first reports
// "unended string literal" at the literal's start and returns an END_OF_FILE
// token instead.
// NOTE(review): some short lines of this function (braces, `else`, possibly
// short statements) are not visible in this listing; the comments below
// describe the visible statements only.
2003 Lexer::parse_string (location_t loc
)
2006 str
.reserve (16); // some sensible default
2008 current_char
= peek_input ();
// Remembered so that an unterminated literal is reported where it started.
2010 const location_t string_begin_locus
= get_current_location ();
2012 // FIXME: This fails if the input ends. How do we check for EOF?
// NOTE(review): the condition below does test is_eof (), so the FIXME above
// may be stale.
2013 while (current_char
.value
!= '"' && !current_char
.is_eof ())
2015 if (current_char
.value
== '\\')
// Element 0: decoded code point, element 1: a length value, element 2: a
// flag.  Presumably the flag marks an escape that contributes no character
// to the string - confirm against parse_utf8_escape.
2020 auto utf8_escape_pair
= parse_utf8_escape ();
2021 current_char
= std::get
<0> (utf8_escape_pair
);
// A zero code point with the flag set replaces `length`; the next visible
// statement adds to it instead (presumably the alternative branch).
2023 if (current_char
== Codepoint (0) && std::get
<2> (utf8_escape_pair
))
2024 length
= std::get
<1> (utf8_escape_pair
) - 1;
2026 length
+= std::get
<1> (utf8_escape_pair
);
// Append the decoded code point unless it is the flagged zero case.
2028 if (current_char
!= Codepoint (0) || !std::get
<2> (utf8_escape_pair
))
2029 str
+= current_char
.as_string ();
2031 current_column
+= length
;
2033 // FIXME: should remove this but can't.
2034 // `parse_utf8_escape` does not update `current_char` correctly.
2035 current_char
= peek_input ();
// Literal newlines are allowed inside the string, so line bookkeeping has to
// be kept up to date.
2040 if (current_char
.value
== '\n')
2044 // tell line_table that new line starts
2045 start_line (current_line
, max_column_hint
);
2048 str
+= current_char
;
2050 current_char
= peek_input ();
// The loop ends on either a closing quote or end of input.
2053 if (current_char
.value
== '"')
2058 current_char
= peek_input ();
2060 else if (current_char
.is_eof ())
2062 rust_error_at (string_begin_locus
, "unended string literal");
2063 return Token::make (END_OF_FILE
, get_current_location ());
// Presumably the final else: the loop condition allows no third outcome.
2067 rust_unreachable ();
2070 str
.shrink_to_fit ();
2072 return Token::make_string (loc
, std::move (str
));
2075 // Parses an identifier or keyword.
// Starts from the current character, appends every following
// identifier-continue character, and then returns one of three tokens: an
// UNDERSCORE token for a lone underscore, an identifier token when
// classify_keyword reports IDENTIFIER, or a token of the keyword's id.
// NOTE(review): some short lines of this function (braces, `else`, possibly
// short statements) are not visible in this listing; the comments below
// describe the visible statements only.
2077 Lexer::parse_identifier_or_keyword (location_t loc
)
2080 str
.reserve (16); // default
// The first character has already been read by the caller.
2081 str
+= current_char
.as_string ();
2083 bool first_is_underscore
= current_char
== '_';
2086 current_char
= peek_input ();
2088 // loop through entire name
2089 while (is_identifier_continue (current_char
.value
))
// NOTE(review): `s` is not referenced by any visible statement; the append
// below calls as_string () again.
2091 auto s
= current_char
.as_string ();
2094 str
+= current_char
.as_string ();
2096 current_char
= peek_input ();
2099 current_column
+= length
;
2101 // if just a single underscore, not an identifier
2102 if (first_is_underscore
&& length
== 1)
2103 return Token::make (UNDERSCORE
, loc
);
2105 str
.shrink_to_fit ();
// Keywords carry no text of their own, so only identifiers keep `str`.
2109 TokenId keyword
= classify_keyword (str
);
2110 if (keyword
== IDENTIFIER
)
2111 return Token::make_identifier (loc
, std::move (str
));
2113 return Token::make (keyword
, loc
);
2116 // Possibly returns a raw string token if it exists - otherwise returns null.
// Looks ahead over a run of hash characters without consuming input; when the
// run is followed by a double quote, parsing is handed to parse_raw_string
// with `peek_index` as the hash count.
// NOTE(review): the declaration and update of `peek_index` and the fallback
// return are on lines that are not visible in this listing.
2118 Lexer::maybe_parse_raw_string (location_t loc
)
2121 while (peek_input (peek_index
) == '#')
2124 if (peek_input (peek_index
) == '"')
2125 return parse_raw_string (loc
, peek_index
);
2130 // Returns a raw string token.
// `initial_hash_count` is the number of hash characters that precede the
// opening double quote.  Characters are collected into `str` without any
// escape processing until a double quote followed by that many hashes is
// found or the input ends; the result is Token::make_string (loc, str).
// NOTE(review): some short lines of this function (braces, `break`/`else`,
// possibly short statements) are not visible in this listing; the comments
// below describe the visible statements only.
2132 Lexer::parse_raw_string (location_t loc
, int initial_hash_count
)
2134 // raw string literals
2136 str
.reserve (16); // some sensible default
// One character plus the opening hashes.
2138 int length
= 1 + initial_hash_count
;
// Skip all but one of the hashes here; presumably a skip on a line that is
// not visible consumes the remaining one - confirm.
2140 if (initial_hash_count
> 0)
2141 skip_input (initial_hash_count
- 1);
2143 current_char
= peek_input ();
2145 if (current_char
!= '"')
2146 rust_error_at (get_current_location (), "raw string has no opening %<\"%>");
2150 current_char
= peek_input ();
2152 while (!current_char
.is_eof ())
// A double quote only closes the literal when it is followed by as many
// hashes as opened it, so look ahead without consuming anything.
2154 if (current_char
.value
== '"')
2156 bool enough_hashes
= true;
2158 for (int i
= 0; i
< initial_hash_count
; i
++)
2160 if (peek_input (i
+ 1) != '#')
2162 enough_hashes
= false;
2169 // skip enough input and peek enough input
2170 skip_input (initial_hash_count
);
2171 current_char
= peek_input ();
// The closing quote plus its hashes.
2172 length
+= initial_hash_count
+ 1;
2179 str
+= current_char
.as_string ();
2181 current_char
= peek_input ();
2184 current_column
+= length
;
2188 str
.shrink_to_fit ();
2190 return Token::make_string (loc
, std::move (str
));
// Parses the digits of a non-decimal integer literal.
// `is_digit_func` decides which characters count as digits, `existent_str`
// holds the text collected so far by the caller, and `base` is the radix
// passed to the numeric conversion.  Underscores are accepted between digits
// but not stored.  The collected text is converted to its decimal spelling,
// an optional type suffix is read, and the result is returned through
// Token::make_int (loc, existent_str, type_hint).  A float type suffix is
// reported as an error.
// NOTE(review): some short lines of this function (braces, `continue`,
// possibly short statements and an early return) are not visible in this
// listing; the comments below describe the visible statements only.
2193 template <typename IsDigitFunc
>
2195 Lexer::parse_non_decimal_int_literal (location_t loc
, IsDigitFunc is_digit_func
,
2196 std::string existent_str
, int base
)
2201 current_char
= peek_input ();
2205 // loop through to add entire number to string
2206 while (is_digit_func (current_char
.value
) || current_char
== '_')
2208 if (current_char
== '_')
2210 // don't add _ to number
2212 current_char
= peek_input ();
2222 existent_str
+= current_char
;
2224 current_char
= peek_input ();
2227 // convert value to decimal representation
// NOTE(review): the value is held in a signed long, and strtol clamps to
// LONG_MAX on overflow, so literals above LONG_MAX are not preserved here -
// confirm whether wide unsigned literals need a different conversion.
2228 long dec_num
= std::strtol (existent_str
.c_str (), nullptr, base
);
2230 existent_str
= std::to_string (dec_num
);
2232 // parse in type suffix if it exists
2233 auto type_suffix_pair
= parse_in_type_suffix ();
2234 PrimitiveCoreType type_hint
= type_suffix_pair
.first
;
2235 length
+= type_suffix_pair
.second
;
2237 current_column
+= length
;
// Float suffixes make no sense on a hex/octal/binary literal.
2239 if (type_hint
== CORETYPE_F32
|| type_hint
== CORETYPE_F64
)
2241 rust_error_at (get_current_location (),
2242 "invalid type suffix %qs for integer (%s) literal",
2243 get_type_hint_string (type_hint
),
// Human-readable name of the radix for the message above; the leading part
// of this conditional expression is not visible in this listing.
2246 : (base
== 8 ? "octal"
2247 : (base
== 2 ? "binary"
2248 : "<insert unknown base>")));
2254 return Token::make_int (loc
, std::move (existent_str
), type_hint
);
2257 // Parses a hex, binary or octal int literal.
// Dispatches on the character after the leading one: x selects hexadecimal
// (base 16), o selects octal (base 8) and b selects binary.  Each case
// forwards to parse_non_decimal_int_literal with the matching digit
// predicate.
// NOTE(review): some short lines of this function (braces, the tail of the
// binary call, any fallback branch) are not visible in this listing.
2259 Lexer::parse_non_decimal_int_literals (location_t loc
)
2262 str
.reserve (16); // some sensible default
// Keep the character that was current on entry as the start of the text.
2263 str
+= current_char
;
2265 current_char
= peek_input ();
2267 if (current_char
== 'x')
2269 // hex (integer only)
// Unlike the other two cases, the hex case passes a copy of `str` with an x
// appended rather than `str` itself.
2270 return parse_non_decimal_int_literal (loc
, is_x_digit
, str
+ "x", 16);
2272 else if (current_char
== 'o')
2274 // octal (integer only)
2275 return parse_non_decimal_int_literal (loc
, is_octal_digit
,
2276 std::move (str
), 8);
2278 else if (current_char
== 'b')
2280 // binary (integer only)
2281 return parse_non_decimal_int_literal (loc
, is_bin_digit
, std::move (str
),
2290 // Parses a decimal-based int literal or float literal.
// After reading the leading run of decimal digits the function picks one of
// four shapes:
//   1. a dot followed by a digit: fractional float, with optional exponent
//      and type suffix;
//   2. a dot accepted by check_valid_float_dot_end: float that ends in a dot,
//      no suffix;
//   3. an E or e: exponent-only float, with optional type suffix;
//   4. anything else: integer, with optional type suffix.
// Float shapes return Token::make_float, the integer shape returns
// Token::make_int.
// NOTE(review): some short lines of this function (braces, `else`, possibly
// short statements) are not visible in this listing; the comments below
// describe the visible statements only.
2292 Lexer::parse_decimal_int_or_float (location_t loc
)
2295 str
.reserve (16); // some sensible default
2296 str
+= current_char
;
// Remembered for the pure-decimal classification at the end.
2299 bool first_zero
= current_char
== '0';
2301 current_char
= peek_input ();
2303 // parse initial decimal integer (or first integer part of float) literal
// Element 0 is the digit text and element 1 is added to `length`; element 2
// is read further down as a "pure decimal" flag.
2304 auto initial_decimal
= parse_in_decimal ();
2305 str
+= std::get
<0> (initial_decimal
);
2306 length
+= std::get
<1> (initial_decimal
);
2308 // detect float literal
2312 // We should not use is_float_digit () for this verification but instead
2313 // directly ISDIGIT because rust does not support non digit values right after
2315 // The following value is not legal in rust:
2317 // A `0` should be put between the dot and the exponent to be valid
2319 if (current_char
== '.' && ISDIGIT (peek_input (1).value
))
2321 // float with a '.', parse another decimal into it
2324 str
+= current_char
;
2326 current_char
= peek_input ();
2329 // parse another decimal number for float
2330 auto second_decimal
= parse_in_decimal ();
2331 str
+= std::get
<0> (second_decimal
);
2332 length
+= std::get
<1> (second_decimal
);
2334 // parse in exponent part if it exists
2335 auto exponent_pair
= parse_in_exponent_part ();
2336 str
+= exponent_pair
.first
;
2337 length
+= exponent_pair
.second
;
2339 // parse in type suffix if it exists
2340 auto type_suffix_pair
= parse_in_type_suffix ();
2341 PrimitiveCoreType type_hint
= type_suffix_pair
.first
;
2342 length
+= type_suffix_pair
.second
;
// Only the two float suffixes, or no suffix at all, are valid on a float.
2344 if (type_hint
!= CORETYPE_F32
&& type_hint
!= CORETYPE_F64
2345 && type_hint
!= CORETYPE_UNKNOWN
)
2347 rust_error_at (get_current_location (),
2348 "invalid type suffix %qs for floating-point literal",
2349 get_type_hint_string (type_hint
));
2350 // ignore invalid type suffix as everything else seems fine
2351 type_hint
= CORETYPE_UNKNOWN
;
2354 current_column
+= length
;
2358 str
.shrink_to_fit ();
2359 return Token::make_float (loc
, std::move (str
), type_hint
);
2361 else if (current_char
== '.'
2362 && check_valid_float_dot_end (peek_input (1).value
))
2364 // float that is just an integer with a terminating '.' character
2367 str
+= current_char
;
2369 current_char
= peek_input ();
2372 // type hint not allowed
2374 current_column
+= length
;
2378 str
.shrink_to_fit ();
2379 return Token::make_float (loc
, std::move (str
), CORETYPE_UNKNOWN
);
2381 else if (current_char
== 'E' || current_char
== 'e')
2383 // exponent float with no '.' character
2385 // parse exponent part
2386 auto exponent_pair
= parse_in_exponent_part ();
2387 str
+= exponent_pair
.first
;
2388 length
+= exponent_pair
.second
;
2390 // parse in type suffix if it exists
2391 auto type_suffix_pair
= parse_in_type_suffix ();
2392 PrimitiveCoreType type_hint
= type_suffix_pair
.first
;
2393 length
+= type_suffix_pair
.second
;
// Same suffix rule as for the fractional float shape.
2395 if (type_hint
!= CORETYPE_F32
&& type_hint
!= CORETYPE_F64
2396 && type_hint
!= CORETYPE_UNKNOWN
)
2398 rust_error_at (get_current_location (),
2399 "invalid type suffix %qs for floating-point literal",
2400 get_type_hint_string (type_hint
));
2401 // ignore invalid type suffix as everything else seems fine
2402 type_hint
= CORETYPE_UNKNOWN
;
2405 current_column
+= length
;
2409 str
.shrink_to_fit ();
2410 return Token::make_float (loc
, std::move (str
), type_hint
);
// Integer shape (presumably the final else; its introducing line is not
// visible here).
2416 // parse in type suffix if it exists
2417 auto type_suffix_pair
= parse_in_type_suffix ();
2418 PrimitiveCoreType type_hint
= type_suffix_pair
.first
;
2419 /* A "real" pure decimal doesn't have a suffix and no zero prefix. */
2420 if (type_hint
== CORETYPE_UNKNOWN
)
2422 bool pure_decimal
= std::get
<2> (initial_decimal
);
// A single "0" still counts as pure even though it starts with a zero.
2423 if (pure_decimal
&& (!first_zero
|| str
.size () == 1))
2424 type_hint
= CORETYPE_PURE_DECIMAL
;
2426 length
+= type_suffix_pair
.second
;
2428 current_column
+= length
;
2432 str
.shrink_to_fit ();
2433 return Token::make_int (loc
, std::move (str
), type_hint
);
// Parses a character literal or a lifetime.
// Three outcomes are visible:
//   - a backslash starts an escaped character literal, decoded through
//     parse_utf8_escape and returned via Token::make_char;
//   - a character directly followed by a single quote is an unescaped
//     character literal, also returned via Token::make_char;
//   - an identifier-start character begins a lifetime name, returned via
//     Token::make_lifetime.
// Anything else is reported with the "expected %' after character constant"
// diagnostic.
// NOTE(review): some short lines of this function (braces, `else`, early
// returns, possibly short statements) are not visible in this listing; the
// comments below describe the visible statements only.
2438 Lexer::parse_char_or_lifetime (location_t loc
)
2442 current_char
= peek_input ();
// End of input right after the opening quote; the handling statement is not
// visible here.
2443 if (current_char
.is_eof ())
2446 // parse escaped char literal
2447 if (current_char
.value
== '\\')
// Element 0: decoded code point; element 1 is added to `length`.
2450 auto utf8_escape_pair
= parse_utf8_escape ();
2451 Codepoint escaped_char
= std::get
<0> (utf8_escape_pair
);
2452 length
+= std::get
<1> (utf8_escape_pair
);
// The escape must be followed directly by the closing quote.
2454 if (peek_input ().value
!= '\'')
2456 rust_error_at (get_current_location (), "unended character literal");
2461 current_char
= peek_input ();
2465 current_column
+= length
;
2469 return Token::make_char (loc
, escaped_char
);
// A closing quote right after the current character means this is a
// one-character literal rather than a lifetime.
2475 if (peek_input ().value
== '\'')
2477 // parse non-escaped char literal
2478 Codepoint non_escaped_char
= current_char
;
2480 // skip the ' character
2482 current_char
= peek_input ();
2484 // TODO fix due to different widths of utf-8 chars?
// Fixed width: presumably the two quotes plus one character.
2485 current_column
+= 3;
2489 return Token::make_char (loc
, non_escaped_char
);
2491 else if (is_identifier_start (current_char
.value
))
2493 // parse lifetime name
2495 str
+= current_char
.as_string ();
2498 current_char
= peek_input ();
2499 while (is_identifier_continue (current_char
.value
))
2501 str
+= current_char
.as_string ();
2503 current_char
= peek_input ();
2507 current_column
+= length
;
2511 // TODO some keywords cannot be used for a lifetime label #2306
2512 // https://doc.rust-lang.org/reference/tokens.html
2514 str
.shrink_to_fit ();
2515 return Token::make_lifetime (loc
, std::move (str
));
// Arguments of the fallback diagnostic; the line that opens this call is not
// visible in this listing.
2520 get_current_location (),
2521 "expected %' after character constant in character literal");
2528 Lexer::split_current_token (TokenId new_left
, TokenId new_right
)
2530 /* TODO: assert that this TokenId is a "simple token" like punctuation and not
2531 * like "IDENTIFIER"? */
2532 location_t current_loc
= peek_token ()->get_locus ();
2533 TokenPtr new_left_tok
= Token::make (new_left
, current_loc
);
2534 TokenPtr new_right_tok
= Token::make (new_right
, current_loc
+ 1);
2536 token_queue
.replace_current_value (std::move (new_left_tok
));
2537 token_queue
.insert (1, std::move (new_right_tok
));
2541 Lexer::split_current_token (std::vector
<TokenPtr
> new_tokens
)
2543 rust_assert (new_tokens
.size () > 0);
2544 token_queue
.replace_current_value (new_tokens
[0]);
2546 for (size_t i
= 1; i
< new_tokens
.size (); i
++)
2548 token_queue
.insert (i
, new_tokens
[i
]);
// Tells the line map that a new source line begins.
// Forwards `current_line` and `current_column` to linemap_line_start together
// with line_table.  Both parameters are plain by-value ints that share their
// names with the counters used by the other lexer methods.
// NOTE(review): the lines between the signature and the call are not visible
// in this listing; confirm whether the call is made unconditionally.
2553 Lexer::start_line (int current_line
, int current_column
)
2556 linemap_line_start (line_table
, current_line
, current_column
);
// Self-tests for the lexer's input sources: each case feeds a UTF-8 string to
// a buffer- or file-backed source and compares the decoded code points with
// an expected list.
// NOTE(review): some short lines of this namespace (return types, braces,
// several assignments to `src` and `expected`, possibly other short
// statements) are not visible in this listing; the comments below describe
// the visible statements only.
2563 namespace selftest
{
2565 // Checks if `src` has the same contents as the given characters
2567 assert_source_content (Rust::InputSource
&src
,
2568 const std::vector
<uint32_t> &expected
)
2570 Rust::Codepoint src_char
= src
.next ();
2571 for (auto expected_char
: expected
)
2573 // Make sure that `src` is not shorter than `expected`
2574 ASSERT_FALSE (src_char
.is_eof ());
2575 // Checks that the character just read is the expected one.
2576 ASSERT_EQ (src_char
.value
, expected_char
);
2577 src_char
= src
.next ();
2579 // Checks that `src` and `expected` have the same length.
2580 ASSERT_TRUE (src_char
.is_eof ());
// Runs assert_source_content on a BufferInputSource built from `str`.
2584 test_buffer_input_source (std::string str
,
2585 const std::vector
<uint32_t> &expected
)
2587 Rust::BufferInputSource
source (str
, 0);
2588 assert_source_content (source
, expected
);
// Runs assert_source_content on a FileInputSource backed by a temporary file
// that holds `str`.
// NOTE(review): no null check on the tmpfile () result and no fclose are
// visible here - confirm how the handle is validated and released.
2592 test_file_input_source (std::string str
, const std::vector
<uint32_t> &expected
)
2594 FILE *tmpf
= tmpfile ();
2595 // Moves to the first character
// NOTE(review): the statement the comment above refers to (repositioning to
// the start of the file) is not visible in this listing; the visible call
// below writes `str` to the file.
2596 fputs (str
.c_str (), tmpf
);
2598 Rust::FileInputSource
source (tmpf
);
2599 assert_source_content (source
, expected
);
// Entry point of the input-source self-tests.
2603 rust_input_source_test ()
// Plain ASCII, including tab, vertical tab and form feed.
2606 std::string src
= u8
"_abcde\tXYZ\v\f";
2607 std::vector
<uint32_t> expected
2608 = {'_', 'a', 'b', 'c', 'd', 'e', '\t', 'X', 'Y', 'Z', '\v', '\f'};
2609 test_buffer_input_source (src
, expected
);
// Input that starts with the UTF-8 byte order mark: the expected list does
// not contain it, so the source is expected to drop it.
2612 src
= u8
"\xef\xbb\xbfOK";
2613 expected
= {'O', 'K'};
2614 test_buffer_input_source (src
, expected
);
// A base letter followed by a combining mark is expected as two separate
// code points.
2622 0x0435 /* CYRILLIC SMALL LETTER IE е */,
2623 0x301 /* COMBINING ACUTE ACCENT ́ */,
2625 test_buffer_input_source (src
, expected
);
// Code points outside the Basic Multilingual Plane and variation selectors.
2628 expected
= {0x2764 /* HEAVY BLACK HEART */,
2629 0xfe0f /* VARIATION SELECTOR-16 */, L
'🦀'};
2630 test_buffer_input_source (src
, expected
);
// The remaining cases go through the file-backed source.
2633 expected
= {L
'こ', L
'ん', L
'に', L
'ち', L
'は'};
2634 test_file_input_source (src
, expected
);
// Sequences joined by ZERO WIDTH JOINER are expected as individual code
// points.
2638 = {0x1f46e /* POLICE OFFICER */, 0x200d /* ZERO WIDTH JOINER */,
2639 0x2642 /* MALE SIGN */, 0x1f469 /* WOMAN */,
2640 0x200d /* ZERO WIDTH JOINER */, 0x2695 /* STAFF OF AESCULAPIUS */};
2641 test_file_input_source (src
, expected
);
2644 } // namespace selftest
2646 #endif // CHECKING_P