gcc/rust/lex/rust-lex.cc

   1 // Copyright (C) 2020-2024 Free Software Foundation, Inc.
   2
   3 // This file is part of GCC.
   4
   5 // GCC is free software; you can redistribute it and/or modify it under
   6 // the terms of the GNU General Public License as published by the Free
   7 // Software Foundation; either version 3, or (at your option) any later
   8 // version.
   9
  10 // GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  11 // WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12 // FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13 // for more details.
  14
  15 // You should have received a copy of the GNU General Public License
  16 // along with GCC; see the file COPYING3.  If not see
  17 // <http://www.gnu.org/licenses/>.
  18
  19 #include "rust-codepoint.h"
  20 #include "rust-system.h"
  21 #include "rust-lex.h"
  22 #include "rust-diagnostics.h"
  23 #include "rust-linemap.h"
  24 #include "rust-session-manager.h"
  25 #include "safe-ctype.h"
  26 #include "cpplib.h"
  27 #include "rust-keyword-values.h"
  28
  29 namespace Rust {
  30 // TODO: move to separate compilation unit?
  31 // overload += for uint32_t to allow 32-bit encoded utf-8 to be added
  32 std::string &
  33 operator+= (std::string &str, Codepoint char32)
  34 {
  35   if (char32.value < 0x80)
  36     {
  37       str += static_cast<char> (char32.value);
  38     }
  39   else if (char32.value < (0x1F + 1) << (1 * 6))
  40     {
  41       str += static_cast<char> (0xC0 | ((char32.value >> 6) & 0x1F));
  42       str += static_cast<char> (0x80 | ((char32.value >> 0) & 0x3F));
  43     }
  44   else if (char32.value < (0x0F + 1) << (2 * 6))
  45     {
  46       str += static_cast<char> (0xE0 | ((char32.value >> 12) & 0x0F));
  47       str += static_cast<char> (0x80 | ((char32.value >> 6) & 0x3F));
  48       str += static_cast<char> (0x80 | ((char32.value >> 0) & 0x3F));
  49     }
  50   else if (char32.value < (0x07 + 1) << (3 * 6))
  51     {
  52       str += static_cast<char> (0xF0 | ((char32.value >> 18) & 0x07));
  53       str += static_cast<char> (0x80 | ((char32.value >> 12) & 0x3F));
  54       str += static_cast<char> (0x80 | ((char32.value >> 6) & 0x3F));
  55       str += static_cast<char> (0x80 | ((char32.value >> 0) & 0x3F));
  56     }
  57   else
  58     {
  59       rust_debug ("Invalid unicode codepoint found: '%u' ", char32.value);
  60     }
  61   return str;
  62 }
  63
  64 std::string
  65 Codepoint::as_string ()
  66 {
  67   std::string str;
  68
  69   // str += Codepoint (value);
  70   str += *this;
  71
  72   return str;
  73 }
  74
  75 /* Includes all allowable float digits EXCEPT _ and . as that needs lookahead
  76  * for handling. */
  77 bool
  78 is_float_digit (uint32_t number)
  79 {
  80   return ISDIGIT (number) || number == 'E' || number == 'e';
  81 }
  82
  83 /* Basically ISXDIGIT from safe-ctype but may change if Rust's encoding or
  84  * whatever is different */
  85 bool
  86 is_x_digit (uint32_t number)
  87 {
  88   return ISXDIGIT (number);
  89 }
  90
  91 bool
  92 is_octal_digit (uint32_t number)
  93 {
  94   return number >= '0' && number <= '7';
  95 }
  96
  97 bool
  98 is_bin_digit (uint32_t number)
  99 {
 100   return number == '0' || number == '1';
 101 }
 102
 103 bool
 104 check_valid_float_dot_end (uint32_t character)
 105 {
 106   return character != '.' && character != '_' && !ISALPHA (character);
 107 }
 108
 109 bool
 110 is_whitespace (uint32_t character)
 111 {
 112   // https://doc.rust-lang.org/reference/whitespace.html
 113   return character == '\t' || character == '\n' || character == '\v'
 114          || character == '\f' || character == '\r' || character == ' '
 115          || character == 0x0085  // next line
 116          || character == 0x200e  // left-to-right mark
 117          || character == 0x200f  // right-to-left mark
 118          || character == 0x2028  // line separator
 119          || character == 0x2029; // pragraph separator
 120 }
 121
 122 bool
 123 is_non_decimal_int_literal_separator (uint32_t character)
 124 {
 125   return character == 'x' || character == 'o' || character == 'b';
 126 }
 127
 128 bool
 129 is_identifier_start (uint32_t codepoint)
 130 {
 131   return (cpp_check_xid_property (codepoint) & CPP_XID_START) || codepoint == '_';
 132 }
 133
 134 bool
 135 is_identifier_continue (uint32_t codepoint)
 136 {
 137   return cpp_check_xid_property (codepoint) & CPP_XID_CONTINUE;
 138 }
 139
 140 Lexer::Lexer (const std::string &input, Linemap *linemap)
 141   : input (RAIIFile::create_error ()), current_line (1), current_column (1),
 142     line_map (linemap), dump_lex_out ({}),
 143     raw_input_source (new BufferInputSource (input, 0)),
 144     input_queue{*raw_input_source}, token_queue (TokenSource (this))
 145 {}
 146
 147 Lexer::Lexer (const char *filename, RAIIFile file_input, Linemap *linemap,
 148               tl::optional<std::ofstream &> dump_lex_opt)
 149   : input (std::move (file_input)), current_line (1), current_column (1),
 150     line_map (linemap), dump_lex_out (dump_lex_opt),
 151     raw_input_source (new FileInputSource (input.get_raw ())),
 152     input_queue{*raw_input_source}, token_queue (TokenSource (this))
 153 {
 154   // inform line_table that file is being entered and is in line 1
 155   if (linemap)
 156     line_map->start_file (filename, current_line);
 157 }
 158
 159 Lexer::~Lexer ()
 160 {
 161   /* ok apparently stop (which is equivalent of original code in destructor) is
 162    * meant to be called after all files have finished parsing, for cleanup. On
 163    * the other hand, actual code that it calls to leave a certain line map is
 164    * mentioned in GCC docs as being useful for "just leaving an included header"
 165    * and stuff like that, so this line mapping functionality may need fixing.
 166    * FIXME: find out whether this occurs. */
 167
 168   // line_map->stop();
 169 }
 170
 171 bool
 172 Lexer::input_source_is_valid_utf8 ()
 173 {
 174   return raw_input_source->is_valid ();
 175 }
 176
 177 location_t
 178 Lexer::get_current_location ()
 179 {
 180   if (line_map)
 181     return linemap_position_for_column (line_table, current_column);
 182   else
 183     // If we have no linemap, we're lexing something without proper locations
 184     return UNDEF_LOCATION;
 185 }
 186
 187 Codepoint
 188 Lexer::peek_input (int n)
 189 {
 190   return input_queue.peek (n);
 191 }
 192
 193 Codepoint
 194 Lexer::peek_input ()
 195 {
 196   return peek_input (0);
 197 }
 198
 199 void
 200 Lexer::skip_input (int n)
 201 {
 202   input_queue.skip (n);
 203 }
 204
 205 void
 206 Lexer::skip_input ()
 207 {
 208   skip_input (0);
 209 }
 210
 211 void
 212 Lexer::skip_token (int n)
 213 {
 214   // dump tokens if dump-lex option is enabled
 215   if (dump_lex_out.has_value ())
 216     dump_and_skip (n);
 217   else
 218     token_queue.skip (n);
 219 }
 220
 221 void
 222 Lexer::dump_and_skip (int n)
 223 {
 224   std::ofstream &out = dump_lex_out.value ();
 225   bool found_eof = false;
 226   const_TokenPtr tok;
 227   for (int i = 0; i < n + 1; i++)
 228     {
 229       if (!found_eof)
 230         {
 231           tok = peek_token ();
 232           found_eof |= tok->get_id () == Rust::END_OF_FILE;
 233
 234           location_t loc = tok->get_locus ();
 235
 236           out << "<id=";
 237           out << tok->token_id_to_str ();
 238           out << (tok->has_str () ? (std::string (", text=") + tok->get_str ()
 239                                      + std::string (", typehint=")
 240                                      + std::string (tok->get_type_hint_str ()))
 241                                   : "")
 242               << " ";
 243           out << Linemap::location_to_string (loc) << '\n';
 244         }
 245
 246       token_queue.skip (0);
 247     }
 248 }
 249
 250 void
 251 Lexer::replace_current_token (TokenPtr replacement)
 252 {
 253   token_queue.replace_current_value (replacement);
 254
 255   rust_debug ("called 'replace_current_token' - this is deprecated");
 256 }
 257
 258 /* Determines whether the string passed in is a keyword or not. If it is, it
 259  * returns the keyword name.  */
 260 TokenId
 261 Lexer::classify_keyword (const std::string &str)
 262 {
 263   auto &keywords = Rust::Values::Keywords::keywords_tokens;
 264   auto keyword = keywords.find (str);
 265
 266   if (keyword == keywords.end ())
 267     return IDENTIFIER;
 268
 269   auto id = keyword->second;
 270
 271   // We now have the expected token ID of the reserved keyword. However, some
 272   // keywords are reserved starting in certain editions. For example, `try` is
 273   // only a reserved keyword in editions >=2018. The language might gain new
 274   // reserved keywords in the future.
 275   //
 276   // https://doc.rust-lang.org/reference/keywords.html#reserved-keywords
 277
 278   // `try` is not a reserved keyword before 2018
 279   if (Session::get_instance ().options.get_edition ()
 280         == CompileOptions::Edition::E2015
 281       && id == TRY)
 282     return IDENTIFIER;
 283
 284   return id;
 285 }
 286
 287 TokenPtr
 288 Lexer::build_token ()
 289 {
 290   // loop to go through multiple characters to build a single token
 291   while (true)
 292     {
 293       location_t loc = get_current_location ();
 294
 295       current_char = peek_input ();
 296       skip_input ();
 297
 298       // detect shebang
 299       // Must be the first thing on the first line, starting with #!
 300       // But since an attribute can also start with an #! we don't count it as a
 301       // shebang line when after any whitespace or comments there is a [. If it
 302       // is a shebang line we simple drop the line. Otherwise we don't consume
 303       // any characters and fall through to the real tokenizer.
 304       if (current_line == 1 && current_column == 1 && current_char == '#'
 305           && peek_input () == '!')
 306         {
 307           int n = 1;
 308           while (true)
 309             {
 310               Codepoint next_char = peek_input (n);
 311               if (is_whitespace (next_char.value))
 312                 n++;
 313               else if ((next_char == '/' && peek_input (n + 1) == '/'
 314                         && peek_input (n + 2) != '!'
 315                         && peek_input (n + 2) != '/')
 316                        || (next_char == '/' && peek_input (n + 1) == '/'
 317                            && peek_input (n + 2) == '/'
 318                            && peek_input (n + 3) == '/'))
 319                 {
 320                   // two // or four ////
 321                   // A single line comment
 322                   // (but not an inner or outer doc comment)
 323                   n += 2;
 324                   next_char = peek_input (n);
 325                   while (next_char != '\n' && !next_char.is_eof ())
 326                     {
 327                       n++;
 328                       next_char = peek_input (n);
 329                     }
 330                   if (next_char == '\n')
 331                     n++;
 332                 }
 333               else if (next_char == '/' && peek_input (n + 1) == '*'
 334                        && peek_input (n + 2) == '*'
 335                        && peek_input (n + 3) == '/')
 336                 {
 337                   /**/
 338                   n += 4;
 339                 }
 340               else if (next_char == '/' && peek_input (n + 1) == '*'
 341                        && peek_input (n + 2) == '*' && peek_input (n + 3) == '*'
 342                        && peek_input (n + 4) == '/')
 343                 {
 344                   /***/
 345                   n += 5;
 346                 }
 347               else if ((next_char == '/' && peek_input (n + 1) == '*'
 348                         && peek_input (n + 2) != '*'
 349                         && peek_input (n + 2) != '!')
 350                        || (next_char == '/' && peek_input (n + 1) == '*'
 351                            && peek_input (n + 2) == '*'
 352                            && peek_input (n + 3) == '*'))
 353                 {
 354                   // one /* or three /***
 355                   // Start of a block comment
 356                   // (but not an inner or outer doc comment)
 357                   n += 2;
 358                   int level = 1;
 359                   while (level > 0)
 360                     {
 361                       if (peek_input (n).is_eof ())
 362                         break;
 363                       else if (peek_input (n) == '/'
 364                                && peek_input (n + 1) == '*')
 365                         {
 366                           n += 2;
 367                           level += 1;
 368                         }
 369                       else if (peek_input (n) == '*'
 370                                && peek_input (n + 1) == '/')
 371                         {
 372                           n += 2;
 373                           level -= 1;
 374                         }
 375                       else
 376                         n++;
 377                     }
 378                 }
 379               else if (next_char != '[')
 380                 {
 381                   // definitely shebang, ignore the first line
 382                   while (current_char != '\n' && !current_char.is_eof ())
 383                     {
 384                       current_char = peek_input ();
 385                       skip_input ();
 386                     }
 387
 388                   // newline
 389                   current_line++;
 390                   current_column = 1;
 391                   // tell line_table that new line starts
 392                   start_line (current_line, max_column_hint);
 393                   break;
 394                 }
 395               else
 396                 break; /* Definitely not a shebang line. */
 397             }
 398         }
 399
 400       // return end of file token if end of file
 401       if (current_char.is_eof ())
 402         return Token::make (END_OF_FILE, loc);
 403
 404       // if not end of file, start tokenising
 405       switch (current_char.value)
 406         {
 407         /* ignore whitespace characters for tokens but continue updating
 408          * location */
 409         case '\n':   // newline
 410         case 0x0085: // next line
 411         case 0x2028: // line separator
 412         case 0x2029: // paragraph separator
 413           current_line++;
 414           current_column = 1;
 415           // tell line_table that new line starts
 416           start_line (current_line, max_column_hint);
 417           continue;
 418         case '\r': // cr
 419           // Ignore, we expect a newline (lf) soon.
 420           continue;
 421         case ' ': // space
 422           current_column++;
 423           continue;
 424         case '\t': // horizontal tab
 425           // width of a tab is not well-defined, assume 8 spaces
 426           current_column += 8;
 427           continue;
 428         case '\v':   // vertical tab
 429         case 0x000c: // form feed
 430         case 0x200e: // left-to-right mark
 431         case 0x200f: // right-to-left mark
 432           // Ignored.
 433           continue;
 434
 435         // punctuation - actual tokens
 436         case '=':
 437           if (peek_input () == '>')
 438             {
 439               // match arm arrow
 440               skip_input ();
 441               current_column += 2;
 442               loc += 1;
 443
 444               return Token::make (MATCH_ARROW, loc);
 445             }
 446           else if (peek_input () == '=')
 447             {
 448               // equality operator
 449               skip_input ();
 450               current_column += 2;
 451               loc += 1;
 452
 453               return Token::make (EQUAL_EQUAL, loc);
 454             }
 455           else
 456             {
 457               // assignment operator
 458               current_column++;
 459               return Token::make (EQUAL, loc);
 460             }
 461         case '(':
 462           current_column++;
 463           return Token::make (LEFT_PAREN, loc);
 464         case '-':
 465           if (peek_input () == '>')
 466             {
 467               // return type specifier
 468               skip_input ();
 469               current_column += 2;
 470               loc += 1;
 471
 472               return Token::make (RETURN_TYPE, loc);
 473             }
 474           else if (peek_input () == '=')
 475             {
 476               // minus-assign
 477               skip_input ();
 478               current_column += 2;
 479               loc += 1;
 480
 481               return Token::make (MINUS_EQ, loc);
 482             }
 483           else
 484             {
 485               // minus
 486               current_column++;
 487               return Token::make (MINUS, loc);
 488             }
 489         case '+':
 490           if (peek_input () == '=')
 491             {
 492               // add-assign
 493               skip_input ();
 494               current_column += 2;
 495               loc += 1;
 496
 497               return Token::make (PLUS_EQ, loc);
 498             }
 499           else
 500             {
 501               // add
 502               current_column++;
 503               return Token::make (PLUS, loc);
 504             }
 505         case ')':
 506           current_column++;
 507           return Token::make (RIGHT_PAREN, loc);
 508         case ';':
 509           current_column++;
 510           return Token::make (SEMICOLON, loc);
 511         case '*':
 512           if (peek_input () == '=')
 513             {
 514               // multiplication-assign
 515               skip_input ();
 516               current_column += 2;
 517               loc += 1;
 518
 519               return Token::make (ASTERISK_EQ, loc);
 520             }
 521           else
 522             {
 523               // multiplication
 524               current_column++;
 525               return Token::make (ASTERISK, loc);
 526             }
 527         case ',':
 528           current_column++;
 529           return Token::make (COMMA, loc);
 530         case '/':
 531           if (peek_input () == '=')
 532             {
 533               // division-assign
 534               skip_input ();
 535               current_column += 2;
 536               loc += 1;
 537
 538               return Token::make (DIV_EQ, loc);
 539             }
 540           else if ((peek_input () == '/' && peek_input (1) != '!'
 541                     && peek_input (1) != '/')
 542                    || (peek_input () == '/' && peek_input (1) == '/'
 543                        && peek_input (2) == '/'))
 544             {
 545               // two // or four ////
 546               // single line comment
 547               // (but not an inner or outer doc comment)
 548               skip_input ();
 549               current_column += 2;
 550               current_char = peek_input ();
 551
 552               // basically ignore until line finishes
 553               while (current_char != '\n' && !current_char.is_eof ())
 554                 {
 555                   skip_input ();
 556                   current_column++; // not used
 557                   current_char = peek_input ();
 558                 }
 559               continue;
 560             }
 561           else if (peek_input () == '/'
 562                    && (peek_input (1) == '!' || peek_input (1) == '/'))
 563             {
 564               /* single line doc comment, inner or outer.  */
 565               bool is_inner = peek_input (1) == '!';
 566               skip_input (1);
 567               current_column += 3;
 568
 569               std::string str;
 570               str.reserve (32);
 571               current_char = peek_input ();
 572               while (current_char != '\n')
 573                 {
 574                   skip_input ();
 575                   if (current_char == '\r')
 576                     {
 577                       Codepoint next_char = peek_input ();
 578                       if (next_char == '\n')
 579                         {
 580                           current_char = '\n';
 581                           break;
 582                         }
 583                       rust_error_at (
 584                         loc, "Isolated CR %<\\r%> not allowed in doc comment");
 585                       current_char = next_char;
 586                       continue;
 587                     }
 588                   if (current_char.is_eof ())
 589                     {
 590                       rust_error_at (
 591                         loc, "unexpected EOF while looking for end of comment");
 592                       break;
 593                     }
 594                   str += current_char;
 595                   current_char = peek_input ();
 596                 }
 597               skip_input ();
 598               current_line++;
 599               current_column = 1;
 600               // tell line_table that new line starts
 601               start_line (current_line, max_column_hint);
 602
 603               str.shrink_to_fit ();
 604
 605               loc += str.size () - 1;
 606               if (is_inner)
 607                 return Token::make_inner_doc_comment (loc, std::move (str));
 608               else
 609                 return Token::make_outer_doc_comment (loc, std::move (str));
 610             }
 611           else if (peek_input () == '*' && peek_input (1) == '*'
 612                    && peek_input (2) == '/')
 613             {
 614               /**/
 615               skip_input (2);
 616               current_column += 4;
 617               continue;
 618             }
 619           else if (peek_input () == '*' && peek_input (1) == '*'
 620                    && peek_input (2) == '*' && peek_input (3) == '/')
 621             {
 622               /***/
 623               skip_input (3);
 624               current_column += 5;
 625               continue;
 626             }
 627           else if ((peek_input () == '*' && peek_input (1) != '!'
 628                     && peek_input (1) != '*')
 629                    || (peek_input () == '*' && peek_input (1) == '*'
 630                        && peek_input (2) == '*'))
 631             {
 632               // one /* or three /***
 633               // block comment
 634               // (but not an inner or outer doc comment)
 635               skip_input ();
 636               current_column += 2;
 637
 638               int level = 1;
 639               while (level > 0)
 640                 {
 641                   current_char = peek_input ();
 642
 643                   if (current_char.is_eof ())
 644                     {
 645                       rust_error_at (
 646                         loc, "unexpected EOF while looking for end of comment");
 647                       break;
 648                     }
 649
 650                   // if /* found
 651                   if (current_char == '/' && peek_input (1) == '*')
 652                     {
 653                       // skip /* characters
 654                       skip_input (1);
 655
 656                       current_column += 2;
 657
 658                       level += 1;
 659                       continue;
 660                     }
 661
 662                   // ignore until */ is found
 663                   if (current_char == '*' && peek_input (1) == '/')
 664                     {
 665                       // skip */ characters
 666                       skip_input (1);
 667
 668                       current_column += 2;
 669
 670                       level -= 1;
 671                       continue;
 672                     }
 673
 674                   if (current_char == '\n')
 675                     {
 676                       skip_input ();
 677                       current_line++;
 678                       current_column = 1;
 679                       // tell line_table that new line starts
 680                       start_line (current_line, max_column_hint);
 681                       continue;
 682                     }
 683
 684                   skip_input ();
 685                   current_column++;
 686                 }
 687
 688               // refresh new token
 689               continue;
 690             }
 691           else if (peek_input () == '*'
 692                    && (peek_input (1) == '!' || peek_input (1) == '*'))
 693             {
 694               // block doc comment, inner /*! or outer /**
 695               bool is_inner = peek_input (1) == '!';
 696               skip_input (1);
 697               current_column += 3;
 698
 699               std::string str;
 700               str.reserve (96);
 701
 702               int level = 1;
 703               while (level > 0)
 704                 {
 705                   current_char = peek_input ();
 706
 707                   if (current_char.is_eof ())
 708                     {
 709                       rust_error_at (
 710                         loc, "unexpected EOF while looking for end of comment");
 711                       break;
 712                     }
 713
 714                   // if /* found
 715                   if (current_char == '/' && peek_input (1) == '*')
 716                     {
 717                       // skip /* characters
 718                       skip_input (1);
 719                       current_column += 2;
 720
 721                       level += 1;
 722                       str += "/*";
 723                       continue;
 724                     }
 725
 726                   // ignore until */ is found
 727                   if (current_char == '*' && peek_input (1) == '/')
 728                     {
 729                       // skip */ characters
 730                       skip_input (1);
 731                       current_column += 2;
 732
 733                       level -= 1;
 734                       if (level > 0)
 735                         str += "*/";
 736                       continue;
 737                     }
 738
 739                   if (current_char == '\r' && peek_input (1) != '\n')
 740                     rust_error_at (
 741                       loc, "Isolated CR %<\\r%> not allowed in doc comment");
 742
 743                   if (current_char == '\n')
 744                     {
 745                       skip_input ();
 746                       current_line++;
 747                       current_column = 1;
 748                       // tell line_table that new line starts
 749                       start_line (current_line, max_column_hint);
 750                       str += '\n';
 751                       continue;
 752                     }
 753
 754                   str += current_char;
 755                   skip_input ();
 756                   current_column++;
 757                 }
 758
 759               str.shrink_to_fit ();
 760
 761               loc += str.size () - 1;
 762               if (is_inner)
 763                 return Token::make_inner_doc_comment (loc, std::move (str));
 764               else
 765                 return Token::make_outer_doc_comment (loc, std::move (str));
 766             }
 767           else
 768             {
 769               // division
 770               current_column++;
 771               return Token::make (DIV, loc);
 772             }
 773         case '%':
 774           if (peek_input () == '=')
 775             {
 776               // modulo-assign
 777               skip_input ();
 778               current_column += 2;
 779               loc += 1;
 780
 781               return Token::make (PERCENT_EQ, loc);
 782             }
 783           else
 784             {
 785               // modulo
 786               current_column++;
 787               return Token::make (PERCENT, loc);
 788             }
 789         case '^':
 790           if (peek_input () == '=')
 791             {
 792               // xor-assign?
 793               skip_input ();
 794               current_column += 2;
 795               loc += 1;
 796
 797               return Token::make (CARET_EQ, loc);
 798             }
 799           else
 800             {
 801               // xor?
 802               current_column++;
 803               return Token::make (CARET, loc);
 804             }
 805         case '<':
 806           if (peek_input () == '<')
 807             {
 808               if (peek_input (1) == '=')
 809                 {
 810                   // left-shift assign
 811                   skip_input (1);
 812                   current_column += 3;
 813                   loc += 2;
 814
 815                   return Token::make (LEFT_SHIFT_EQ, loc);
 816                 }
 817               else
 818                 {
 819                   // left-shift
 820                   skip_input ();
 821                   current_column += 2;
 822                   loc += 1;
 823
 824                   return Token::make (LEFT_SHIFT, loc);
 825                 }
 826             }
 827           else if (peek_input () == '=')
 828             {
 829               // smaller than or equal to
 830               skip_input ();
 831               current_column += 2;
 832               loc += 1;
 833
 834               return Token::make (LESS_OR_EQUAL, loc);
 835             }
 836           else
 837             {
 838               // smaller than
 839               current_column++;
 840               return Token::make (LEFT_ANGLE, loc);
 841             }
 842           break;
 843         case '>':
 844           if (peek_input () == '>')
 845             {
 846               if (peek_input (1) == '=')
 847                 {
 848                   // right-shift-assign
 849                   skip_input (1);
 850                   current_column += 3;
 851                   loc += 2;
 852
 853                   return Token::make (RIGHT_SHIFT_EQ, loc);
 854                 }
 855               else
 856                 {
 857                   // right-shift
 858                   skip_input ();
 859                   current_column += 2;
 860                   loc += 1;
 861
 862                   return Token::make (RIGHT_SHIFT, loc);
 863                 }
 864             }
 865           else if (peek_input () == '=')
 866             {
 867               // larger than or equal to
 868               skip_input ();
 869               current_column += 2;
 870               loc += 1;
 871
 872               return Token::make (GREATER_OR_EQUAL, loc);
 873             }
 874           else
 875             {
 876               // larger than
 877               current_column++;
 878               return Token::make (RIGHT_ANGLE, loc);
 879             }
 880         case ':':
 881           if (peek_input () == ':')
 882             {
 883               // scope resolution ::
 884               skip_input ();
 885               current_column += 2;
 886               loc += 1;
 887
 888               return Token::make (SCOPE_RESOLUTION, loc);
 889             }
 890           else
 891             {
 892               // single colon :
 893               current_column++;
 894               return Token::make (COLON, loc);
 895             }
 896         case '!':
 897           // no special handling for macros in lexer?
 898           if (peek_input () == '=')
 899             {
 900               // not equal boolean operator
 901               skip_input ();
 902               current_column += 2;
 903               loc += 1;
 904
 905               return Token::make (NOT_EQUAL, loc);
 906             }
 907           else
 908             {
 909               // not equal unary operator
 910               current_column++;
 911
 912               return Token::make (EXCLAM, loc);
 913             }
 914         case '?':
 915           current_column++;
 916           return Token::make (QUESTION_MARK, loc);
 917         case '#':
 918           current_column++;
 919           return Token::make (HASH, loc);
 920         case '[':
 921           current_column++;
 922           return Token::make (LEFT_SQUARE, loc);
 923         case ']':
 924           current_column++;
 925           return Token::make (RIGHT_SQUARE, loc);
 926         case '{':
 927           current_column++;
 928           return Token::make (LEFT_CURLY, loc);
 929         case '}':
 930           current_column++;
 931           return Token::make (RIGHT_CURLY, loc);
 932         case '@':
 933           current_column++;
 934           return Token::make (PATTERN_BIND, loc);
 935         case '$':
 936           current_column++;
 937           return Token::make (DOLLAR_SIGN, loc);
 938         case '~':
 939           current_column++;
 940           return Token::make (TILDE, loc);
 941         case '\\':
 942           current_column++;
 943           return Token::make (BACKSLASH, loc);
 944         case '`':
 945           current_column++;
 946           return Token::make (BACKTICK, loc);
 947         case '|':
 948           if (peek_input () == '=')
 949             {
 950               // bitwise or-assign?
 951               skip_input ();
 952               current_column += 2;
 953               loc += 1;
 954
 955               return Token::make (PIPE_EQ, loc);
 956             }
 957           else if (peek_input () == '|')
 958             {
 959               // logical or
 960               skip_input ();
 961               current_column += 2;
 962               loc += 1;
 963
 964               return Token::make (OR, loc);
 965             }
 966           else
 967             {
 968               // bitwise or
 969               current_column++;
 970
 971               return Token::make (PIPE, loc);
 972             }
 973         case '&':
 974           if (peek_input () == '=')
 975             {
 976               // bitwise and-assign?
 977               skip_input ();
 978               current_column += 2;
 979               loc += 1;
 980
 981               return Token::make (AMP_EQ, loc);
 982             }
 983           else if (peek_input () == '&')
 984             {
 985               // logical and
 986               skip_input ();
 987               current_column += 2;
 988               loc += 1;
 989
 990               return Token::make (LOGICAL_AND, loc);
 991             }
 992           else
 993             {
 994               // bitwise and/reference
 995               current_column++;
 996
 997               return Token::make (AMP, loc);
 998             }
 999         case '.':
1000           if (peek_input () == '.')
1001             {
1002               if (peek_input (1) == '.')
1003                 {
1004                   // ellipsis
1005                   skip_input (1);
1006                   current_column += 3;
1007                   loc += 2;
1008
1009                   return Token::make (ELLIPSIS, loc);
1010                 }
1011               else if (peek_input (1) == '=')
1012                 {
1013                   // ..=
1014                   skip_input (1);
1015                   current_column += 3;
1016                   loc += 2;
1017
1018                   return Token::make (DOT_DOT_EQ, loc);
1019                 }
1020               else
1021                 {
1022                   // ..
1023                   skip_input ();
1024                   current_column += 2;
1025                   loc += 1;
1026
1027                   return Token::make (DOT_DOT, loc);
1028                 }
1029             }
1030           else /*if (!ISDIGIT (peek_input ()))*/
1031             {
1032               // single dot .
1033               // Only if followed by a non-number - otherwise is float
1034               // nope, float cannot start with '.'.
1035               current_column++;
1036               return Token::make (DOT, loc);
1037             }
1038         }
1039       // TODO: special handling of _ in the lexer? instead of being identifier
1040
1041       // byte character, byte string and raw byte string literals
1042       if (current_char == 'b')
1043         {
1044           if (peek_input () == '\'')
1045             return parse_byte_char (loc);
1046           else if (peek_input () == '"')
1047             return parse_byte_string (loc);
1048           else if (peek_input () == 'r'
1049                    && (peek_input (1) == '#' || peek_input (1) == '"'))
1050             return parse_raw_byte_string (loc);
1051         }
1052
1053       // raw identifiers and raw strings
1054       if (current_char == 'r')
1055         {
1056           Codepoint peek = peek_input ();
1057           Codepoint peek1 = peek_input (1);
1058
1059           // TODO (tamaron) parse Unicode ident
1060           if (peek == '#' && is_identifier_start (peek1.value))
1061             {
1062               TokenPtr raw_ident_ptr = parse_raw_identifier (loc);
1063               if (raw_ident_ptr != nullptr)
1064                 return raw_ident_ptr;
1065               else
1066                 continue; /* input got parsed, it just wasn't valid. An error
1067                              was produced. */
1068             }
1069           else
1070             {
1071               TokenPtr maybe_raw_string_ptr = maybe_parse_raw_string (loc);
1072               if (maybe_raw_string_ptr != nullptr)
1073                 return maybe_raw_string_ptr;
1074             }
1075         }
1076
1077       // find identifiers and keywords.
1078       if (is_identifier_start (current_char.value))
1079         return parse_identifier_or_keyword (loc);
1080
1081       // int and float literals
1082       if (ISDIGIT (current_char.value))
1083         { //  _ not allowed as first char
1084           if (current_char == '0'
1085               && is_non_decimal_int_literal_separator (peek_input ().value))
1086             {
1087               // handle binary, octal, hex literals
1088               TokenPtr non_dec_int_lit_ptr
1089                 = parse_non_decimal_int_literals (loc);
1090               if (non_dec_int_lit_ptr != nullptr)
1091                 return non_dec_int_lit_ptr;
1092             }
1093           else
1094             {
1095               // handle decimals (integer or float)
1096               TokenPtr decimal_or_float_ptr = parse_decimal_int_or_float (loc);
1097               if (decimal_or_float_ptr != nullptr)
1098                 return decimal_or_float_ptr;
1099             }
1100         }
1101
1102       // string literals
1103       if (current_char == '"')
1104         return parse_string (loc);
1105
1106       // char literals and lifetime names
1107       if (current_char == '\'')
1108         {
1109           TokenPtr char_or_lifetime_ptr = parse_char_or_lifetime (loc);
1110           if (char_or_lifetime_ptr != nullptr)
1111             return char_or_lifetime_ptr;
1112         }
1113
1114       // DEBUG: check for specific character problems:
1115       if (current_char == '0')
1116         rust_debug ("'0' uncaught before unexpected character");
1117       else if (current_char == ']')
1118         rust_debug ("']' uncaught before unexpected character");
1119       else if (current_char == 0x5d)
1120         rust_debug ("whatever 0x5d is (not '0' or ']') uncaught before "
1121                     "unexpected character");
1122
1123       // didn't match anything so error
1124       rust_error_at (loc, "unexpected character %<%x%>", current_char.value);
1125       current_column++;
1126     }
1127 }
1128
1129 // Parses in a type suffix.
1130 std::pair<PrimitiveCoreType, int>
1131 Lexer::parse_in_type_suffix ()
1132 {
1133   std::string suffix;
1134   suffix.reserve (5);
1135
1136   int additional_length_offset = 0;
1137
1138   // get suffix
1139   while (ISALPHA (current_char.value) || ISDIGIT (current_char.value)
1140          || current_char == '_')
1141     {
1142       if (current_char == '_')
1143         {
1144           // don't add _ to suffix
1145           skip_input ();
1146           current_char = peek_input ();
1147
1148           additional_length_offset++;
1149
1150           continue;
1151         }
1152
1153       additional_length_offset++;
1154
1155       suffix += current_char;
1156       skip_input ();
1157       current_char = peek_input ();
1158     }
1159
1160   if (suffix.empty ())
1161     {
1162       // no type suffix: do nothing but also no error
1163       return std::make_pair (CORETYPE_UNKNOWN, additional_length_offset);
1164     }
1165   else if (suffix == "f32")
1166     {
1167       return std::make_pair (CORETYPE_F32, additional_length_offset);
1168     }
1169   else if (suffix == "f64")
1170     {
1171       return std::make_pair (CORETYPE_F64, additional_length_offset);
1172     }
1173   else if (suffix == "i8")
1174     {
1175       return std::make_pair (CORETYPE_I8, additional_length_offset);
1176     }
1177   else if (suffix == "i16")
1178     {
1179       return std::make_pair (CORETYPE_I16, additional_length_offset);
1180     }
1181   else if (suffix == "i32")
1182     {
1183       return std::make_pair (CORETYPE_I32, additional_length_offset);
1184     }
1185   else if (suffix == "i64")
1186     {
1187       return std::make_pair (CORETYPE_I64, additional_length_offset);
1188     }
1189   else if (suffix == "i128")
1190     {
1191       return std::make_pair (CORETYPE_I128, additional_length_offset);
1192     }
1193   else if (suffix == "isize")
1194     {
1195       return std::make_pair (CORETYPE_ISIZE, additional_length_offset);
1196     }
1197   else if (suffix == "u8")
1198     {
1199       return std::make_pair (CORETYPE_U8, additional_length_offset);
1200     }
1201   else if (suffix == "u16")
1202     {
1203       return std::make_pair (CORETYPE_U16, additional_length_offset);
1204     }
1205   else if (suffix == "u32")
1206     {
1207       return std::make_pair (CORETYPE_U32, additional_length_offset);
1208     }
1209   else if (suffix == "u64")
1210     {
1211       return std::make_pair (CORETYPE_U64, additional_length_offset);
1212     }
1213   else if (suffix == "u128")
1214     {
1215       return std::make_pair (CORETYPE_U128, additional_length_offset);
1216     }
1217   else if (suffix == "usize")
1218     {
1219       return std::make_pair (CORETYPE_USIZE, additional_length_offset);
1220     }
1221   else
1222     {
1223       rust_error_at (get_current_location (), "unknown number suffix %qs",
1224                      suffix.c_str ());
1225
1226       return std::make_pair (CORETYPE_UNKNOWN, additional_length_offset);
1227     }
1228 }
1229
1230 // Parses in the exponent part (if any) of a float literal.
1231 std::pair<std::string, int>
1232 Lexer::parse_in_exponent_part ()
1233 {
1234   int additional_length_offset = 0;
1235   std::string str;
1236   if (current_char == 'E' || current_char == 'e')
1237     {
1238       // add exponent to string as strtod works with it
1239       str += current_char;
1240       skip_input ();
1241       current_char = peek_input ();
1242
1243       additional_length_offset++;
1244
1245       // special - and + handling
1246       if (current_char == '-')
1247         {
1248           str += '-';
1249
1250           skip_input ();
1251           current_char = peek_input ();
1252
1253           additional_length_offset++;
1254         }
1255       else if (current_char == '+')
1256         {
1257           // don't add + but still skip input
1258           skip_input ();
1259           current_char = peek_input ();
1260
1261           additional_length_offset++;
1262         }
1263
1264       // parse another decimal number for exponent
1265       auto str_length = parse_in_decimal ();
1266       str += std::get<0> (str_length);
1267       additional_length_offset += std::get<1> (str_length);
1268     }
1269   return std::make_pair (str, additional_length_offset);
1270 }
1271
1272 // Parses a decimal integer.
1273 std::tuple<std::string, int, bool>
1274 Lexer::parse_in_decimal ()
1275 {
1276   /* A pure decimal contains only digits.  */
1277   bool pure_decimal = true;
1278   int additional_length_offset = 0;
1279   std::string str;
1280   while (ISDIGIT (current_char.value) || current_char.value == '_')
1281     {
1282       if (current_char == '_')
1283         {
1284           pure_decimal = false;
1285           // don't add _ to number
1286           skip_input ();
1287           current_char = peek_input ();
1288
1289           additional_length_offset++;
1290
1291           continue;
1292         }
1293
1294       additional_length_offset++;
1295
1296       str += current_char;
1297       skip_input ();
1298       current_char = peek_input ();
1299     }
1300   return std::make_tuple (str, additional_length_offset, pure_decimal);
1301 }
1302
1303 /* Parses escapes (and string continues) in "byte" strings and characters. Does
1304  * not support unicode. */
1305 std::tuple<char, int, bool>
1306 Lexer::parse_escape (char opening_char)
1307 {
1308   int additional_length_offset = 0;
1309   char output_char = 0;
1310
1311   // skip to actual letter
1312   skip_input ();
1313   current_char = peek_input ();
1314   additional_length_offset++;
1315
1316   switch (current_char.value)
1317     {
1318       case 'x': {
1319         auto hex_escape_pair = parse_partial_hex_escape ();
1320         long hexLong = hex_escape_pair.first;
1321         additional_length_offset += hex_escape_pair.second;
1322
1323         if (hexLong > 255 || hexLong < 0)
1324           rust_error_at (
1325             get_current_location (),
1326             "byte \\x escape %<\\x%x%> out of range - allows up to %<\\xFF%>",
1327             static_cast<unsigned int> (hexLong));
1328         /* TODO: restore capital for escape output - gcc pretty-printer doesn't
1329          * support %X directly */
1330         char hexChar = static_cast<char> (hexLong);
1331
1332         output_char = hexChar;
1333       }
1334       break;
1335     case 'n':
1336       output_char = '\n';
1337       break;
1338     case 'r':
1339       output_char = '\r';
1340       break;
1341     case 't':
1342       output_char = '\t';
1343       break;
1344     case '\\':
1345       output_char = '\\';
1346       break;
1347     case '0':
1348       output_char = '\0';
1349       break;
1350     case '\'':
1351       output_char = '\'';
1352       break;
1353     case '"':
1354       output_char = '"';
1355       break;
1356     case 'u':
1357       rust_error_at (get_current_location (),
1358                      "cannot have a unicode escape \\u in a byte %s",
1359                      opening_char == '\'' ? "character" : "string");
1360       // Try to parse it anyway, just to skip it
1361       parse_partial_unicode_escape ();
1362       return std::make_tuple (output_char, additional_length_offset, false);
1363     case '\r':
1364     case '\n':
1365       // string continue
1366       return std::make_tuple (0, parse_partial_string_continue (), true);
1367     default:
1368       rust_error_at (get_current_location (),
1369                      "unknown escape sequence %<\\%s%>",
1370                      current_char.as_string ().c_str ());
1371       // returns false if no parsing could be done
1372       // return false;
1373       return std::make_tuple (output_char, additional_length_offset, false);
1374       break;
1375     }
1376   // all non-special cases (string continue) should skip their used char
1377   skip_input ();
1378   current_char = peek_input ();
1379   additional_length_offset++;
1380
1381   // returns true if parsing was successful
1382   // return true;
1383   return std::make_tuple (output_char, additional_length_offset, false);
1384 }
1385
1386 /* Parses an escape (or string continue) in a string or character. Supports
1387  * unicode escapes. */
1388 std::tuple<Codepoint, int, bool>
1389 Lexer::parse_utf8_escape ()
1390 {
1391   Codepoint output_char;
1392   int additional_length_offset = 0;
1393
1394   // skip to actual letter
1395   skip_input ();
1396   current_char = peek_input ();
1397   additional_length_offset++;
1398
1399   switch (current_char.value)
1400     {
1401       case 'x': {
1402         auto hex_escape_pair = parse_partial_hex_escape ();
1403         long hexLong = hex_escape_pair.first;
1404         additional_length_offset += hex_escape_pair.second;
1405
1406         if (hexLong > 127 || hexLong < 0)
1407           rust_error_at (
1408             get_current_location (),
1409             "ascii \\x escape %<\\x%x%> out of range - allows up to %<\\x7F%>",
1410             static_cast<unsigned int> (hexLong));
1411         /* TODO: restore capital for escape output - gcc pretty-printer doesn't
1412          * support %X directly */
1413         char hexChar = static_cast<char> (hexLong);
1414
1415         output_char = hexChar;
1416       }
1417       break;
1418     case 'n':
1419       output_char = '\n';
1420       break;
1421     case 'r':
1422       output_char = '\r';
1423       break;
1424     case 't':
1425       output_char = '\t';
1426       break;
1427     case '\\':
1428       output_char = '\\';
1429       break;
1430     case '0':
1431       output_char = '\0';
1432       break;
1433     case '\'':
1434       output_char = '\'';
1435       break;
1436     case '"':
1437       output_char = '"';
1438       break;
1439       case 'u': {
1440         auto unicode_escape_pair = parse_partial_unicode_escape ();
1441         output_char = unicode_escape_pair.first;
1442         additional_length_offset += unicode_escape_pair.second;
1443
1444         return std::make_tuple (output_char, additional_length_offset, false);
1445       }
1446       break;
1447     case '\r':
1448     case '\n':
1449       // string continue
1450       return std::make_tuple (0, parse_partial_string_continue (), true);
1451     default:
1452       rust_error_at (get_current_location (),
1453                      "unknown escape sequence %<\\%s%>",
1454                      current_char.as_string ().c_str ());
1455       // returns false if no parsing could be done
1456       // return false;
1457       return std::make_tuple (output_char, additional_length_offset, false);
1458       break;
1459     }
1460   /* all non-special cases (unicode, string continue) should skip their used
1461    * char */
1462   skip_input ();
1463   current_char = peek_input ();
1464   additional_length_offset++;
1465
1466   // returns true if parsing was successful
1467   // return true;
1468   return std::make_tuple (output_char, additional_length_offset, false);
1469 }
1470
1471 // Parses the body of a string continue that has been found in an escape.
1472 int
1473 Lexer::parse_partial_string_continue ()
1474 {
1475   int additional_length_offset = 1;
1476
1477   // string continue
1478   // TODO use utf-8 codepoint to skip whitespaces
1479   while (is_whitespace (current_char.value))
1480     {
1481       if (current_char == '\n')
1482         {
1483           current_line++;
1484           current_column = 1;
1485           // tell line_table that new line starts
1486           start_line (current_line, max_column_hint);
1487
1488           // reset "length"
1489           additional_length_offset = 1;
1490
1491           // get next char
1492           skip_input ();
1493           current_char = peek_input ();
1494
1495           continue;
1496         }
1497
1498       skip_input ();
1499       current_char = peek_input ();
1500       additional_length_offset++;
1501     }
1502
1503   return additional_length_offset;
1504 }
1505
1506 /* Parses the body of a '\x' escape. Note that it does not check that the number
1507  * is valid and smaller than 255. */
1508 std::pair<long, int>
1509 Lexer::parse_partial_hex_escape ()
1510 {
1511   // hex char string (null-terminated)
1512   char hexNum[3] = {0, 0, 0};
1513
1514   // first hex char
1515   current_char = peek_input (1);
1516   int additional_length_offset = 1;
1517
1518   if (!is_x_digit (current_char.value))
1519     {
1520       rust_error_at (get_current_location (),
1521                      "invalid character %<\\x%s%> in \\x sequence",
1522                      current_char.as_string ().c_str ());
1523       return std::make_pair (0, 0);
1524     }
1525   hexNum[0] = current_char.value;
1526
1527   // second hex char
1528   skip_input ();
1529   current_char = peek_input (1);
1530   additional_length_offset++;
1531
1532   if (!is_x_digit (current_char.value))
1533     {
1534       rust_error_at (get_current_location (),
1535                      "invalid character %<\\x%c%s%> in \\x sequence", hexNum[0],
1536                      current_char.as_string ().c_str ());
1537       return std::make_pair (0, 1);
1538     }
1539   skip_input ();
1540   hexNum[1] = current_char.value;
1541
1542   long hexLong = std::strtol (hexNum, nullptr, 16);
1543
1544   return std::make_pair (hexLong, additional_length_offset);
1545 }
1546
1547 // Parses the body of a unicode escape.
1548 std::pair<Codepoint, int>
1549 Lexer::parse_partial_unicode_escape ()
1550 {
1551   skip_input ();
1552   current_char = peek_input ();
1553   int additional_length_offset = 0;
1554
1555   if (current_char != '{')
1556     {
1557       rust_error_at (get_current_location (),
1558                      "unicode escape should start with %<{%>");
1559       /* Skip what should probaby have been between brackets.  */
1560       while (is_x_digit (current_char.value) || current_char == '_')
1561         {
1562           skip_input ();
1563           current_char = peek_input ();
1564           additional_length_offset++;
1565         }
1566       return std::make_pair (Codepoint (0), additional_length_offset);
1567     }
1568
1569   skip_input ();
1570   current_char = peek_input ();
1571   additional_length_offset++;
1572
1573   if (current_char == '_')
1574     {
1575       rust_error_at (get_current_location (),
1576                      "unicode escape cannot start with %<_%>");
1577       skip_input ();
1578       current_char = peek_input ();
1579       additional_length_offset++;
1580       // fallthrough and try to parse the rest anyway
1581     }
1582
1583   // parse unicode escape - 1-6 hex digits
1584   std::string num_str;
1585   num_str.reserve (6);
1586
1587   // loop through to add entire hex number to string
1588   while (is_x_digit (current_char.value) || current_char.value == '_')
1589     {
1590       if (current_char == '_')
1591         {
1592           // don't add _ to number
1593           skip_input ();
1594           current_char = peek_input ();
1595
1596           additional_length_offset++;
1597
1598           continue;
1599         }
1600
1601       additional_length_offset++;
1602
1603       // add raw hex numbers
1604       num_str += current_char;
1605
1606       skip_input ();
1607       current_char = peek_input ();
1608     }
1609
1610   if (current_char == '}')
1611     {
1612       skip_input ();
1613       current_char = peek_input ();
1614       additional_length_offset++;
1615     }
1616   else
1617     {
1618       // actually an error, but allow propagation anyway Assume that
1619       // wrong bracketm whitespace or single/double quotes are wrong
1620       // termination, otherwise it is a wrong character, then skip to the actual
1621       // terminator.
1622       // TODO use utf-8 codepoint to skip whitespaces
1623       if (current_char == '{' || is_whitespace (current_char.value)
1624           || current_char == '\'' || current_char == '"')
1625         {
1626           rust_error_at (get_current_location (),
1627                          "expected terminating %<}%> in unicode escape");
1628           return std::make_pair (Codepoint (0), additional_length_offset);
1629         }
1630       else
1631         {
1632           rust_error_at (get_current_location (),
1633                          "invalid character %<%s%> in unicode escape",
1634                          current_char.as_string ().c_str ());
1635           // TODO use utf-8 codepoint to skip whitespaces
1636           while (current_char != '}' && current_char != '{'
1637                  && !is_whitespace (current_char.value) && current_char != '\''
1638                  && current_char != '"')
1639             {
1640               skip_input ();
1641               current_char = peek_input ();
1642               additional_length_offset++;
1643             }
1644           // Consume the actual closing bracket if found
1645           if (current_char == '}')
1646             {
1647               skip_input ();
1648               current_char = peek_input ();
1649               additional_length_offset++;
1650             }
1651           return std::make_pair (Codepoint (0), additional_length_offset);
1652         }
1653     }
1654
1655   // ensure 1-6 hex characters
1656   if (num_str.length () > 6 || num_str.length () < 1)
1657     {
1658       rust_error_at (get_current_location (),
1659                      "unicode escape should be between 1 and 6 hex "
1660                      "characters; it is %lu",
1661                      (unsigned long) num_str.length ());
1662       // return false;
1663       return std::make_pair (Codepoint (0), additional_length_offset);
1664     }
1665
1666   unsigned long hex_num = std::strtoul (num_str.c_str (), nullptr, 16);
1667
1668   if (hex_num > 0xd7ff && hex_num < 0xe000)
1669     {
1670       rust_error_at (
1671         get_current_location (),
1672         "unicode escape cannot be a surrogate value (D800 to DFFF)");
1673       return std::make_pair (Codepoint (0), additional_length_offset);
1674     }
1675
1676   if (hex_num > 0x10ffff)
1677     {
1678       rust_error_at (get_current_location (),
1679                      "unicode escape cannot be larger than 10FFFF");
1680       return std::make_pair (Codepoint (0), additional_length_offset);
1681     }
1682
1683   // return true;
1684   return std::make_pair (Codepoint (static_cast<uint32_t> (hex_num)),
1685                          additional_length_offset);
1686 }
1687
1688 // Parses a byte character.
1689 TokenPtr
1690 Lexer::parse_byte_char (location_t loc)
1691 {
1692   skip_input ();
1693   current_column++;
1694   // make current char the next character
1695   current_char = peek_input ();
1696
1697   int length = 1;
1698
1699   // char to save
1700   Codepoint byte_char = 0;
1701
1702   // detect escapes
1703   if (current_char == '\\')
1704     {
1705       auto escape_length_pair = parse_escape ('\'');
1706       byte_char = std::get<0> (escape_length_pair);
1707       length += std::get<1> (escape_length_pair);
1708
1709       current_char = peek_input ();
1710
1711       if (current_char != '\'')
1712         {
1713           rust_error_at (get_current_location (), "unclosed %<byte char%>");
1714         }
1715
1716       skip_input ();
1717       current_char = peek_input ();
1718       length++; // go to next char
1719     }
1720   else if (current_char != '\'')
1721     {
1722       // otherwise, get character from direct input character
1723       byte_char = current_char;
1724
1725       if (!byte_char.is_ascii ())
1726         {
1727           rust_error_at (get_current_location (),
1728                          "non-ASCII character in %<byte char%>");
1729         }
1730
1731       skip_input ();
1732       current_char = peek_input ();
1733       length++;
1734
1735       if (current_char != '\'')
1736         {
1737           rust_error_at (get_current_location (), "unclosed %<byte char%>");
1738         }
1739
1740       skip_input ();
1741       current_char = peek_input ();
1742       length++; // go to next char
1743     }
1744   else
1745     {
1746       rust_error_at (get_current_location (),
1747                      "no character inside %<%> for %<byte char%>");
1748     }
1749
1750   current_column += length;
1751
1752   loc += length - 1;
1753   return Token::make_byte_char (loc, byte_char.value);
1754 }
1755
1756 // Parses a byte string.
1757 TokenPtr
1758 Lexer::parse_byte_string (location_t loc)
1759 {
1760   // byte string
1761
1762   // skip quote character
1763   skip_input ();
1764   current_column++;
1765
1766   std::string str;
1767   str.reserve (16); // some sensible default
1768
1769   current_char = peek_input ();
1770
1771   const location_t string_begin_locus = get_current_location ();
1772
1773   while (current_char != '"' && !current_char.is_eof ())
1774     {
1775       if (current_char == '\\')
1776         {
1777           int length = 1;
1778           auto escape_length_pair = parse_escape ('"');
1779           char output_char = std::get<0> (escape_length_pair);
1780
1781           if (output_char == 0 && std::get<2> (escape_length_pair))
1782             length = std::get<1> (escape_length_pair) - 1;
1783           else
1784             length += std::get<1> (escape_length_pair);
1785
1786           if (output_char != 0 || !std::get<2> (escape_length_pair))
1787             str += output_char;
1788
1789           current_column += length;
1790
1791           continue;
1792         }
1793
1794       current_column++;
1795       if (current_char.value == '\n')
1796         {
1797           current_line++;
1798           current_column = 1;
1799           // tell line_table that new line starts
1800           start_line (current_line, max_column_hint);
1801         }
1802
1803       str += current_char;
1804       skip_input ();
1805       current_char = peek_input ();
1806     }
1807
1808   if (current_char == '"')
1809     {
1810       current_column++;
1811
1812       skip_input ();
1813       current_char = peek_input ();
1814     }
1815   else if (current_char.is_eof ())
1816     {
1817       rust_error_at (string_begin_locus, "unended byte string literal");
1818       return Token::make (END_OF_FILE, get_current_location ());
1819     }
1820   else
1821     {
1822       rust_unreachable ();
1823     }
1824
1825   str.shrink_to_fit ();
1826   loc += str.size () - 1;
1827
1828   return Token::make_byte_string (loc, std::move (str));
1829 }
1830
1831 // Parses a raw byte string.
1832 TokenPtr
1833 Lexer::parse_raw_byte_string (location_t loc)
1834 {
1835   // raw byte string literals
1836   std::string str;
1837   str.reserve (16); // some sensible default
1838
1839   int length = 1;
1840   int hash_count = 0;
1841
1842   // get hash count at beginnning
1843   skip_input ();
1844   current_char = peek_input ();
1845   length++;
1846   while (current_char == '#')
1847     {
1848       hash_count++;
1849       length++;
1850
1851       skip_input ();
1852       current_char = peek_input ();
1853     }
1854
1855   if (current_char != '"')
1856     {
1857       rust_error_at (get_current_location (),
1858                      "raw byte string has no opening %<\"%>");
1859     }
1860
1861   skip_input ();
1862   current_char = peek_input ();
1863   length++;
1864
1865   while (true)
1866     {
1867       if (current_char == '"')
1868         {
1869           bool enough_hashes = true;
1870
1871           for (int i = 0; i < hash_count; i++)
1872             {
1873               if (peek_input (i + 1) != '#')
1874                 {
1875                   enough_hashes = false;
1876                   break;
1877                 }
1878             }
1879
1880           if (enough_hashes)
1881             {
1882               // skip enough input and peek enough input
1883               skip_input (hash_count);
1884               current_char = peek_input ();
1885               length += hash_count + 1;
1886               break;
1887             }
1888         }
1889
1890       if (current_char.value > 127)
1891         {
1892           rust_error_at (get_current_location (),
1893                          "character %<%s%> in raw byte string out of range",
1894                          current_char.as_string ().c_str ());
1895           current_char = 0;
1896         }
1897
1898       length++;
1899
1900       str += current_char;
1901       skip_input ();
1902       current_char = peek_input ();
1903     }
1904
1905   current_column += length;
1906
1907   loc += length - 1;
1908
1909   str.shrink_to_fit ();
1910
1911   return Token::make_byte_string (loc, std::move (str));
1912 }
1913
1914 // Parses a raw identifier.
1915 TokenPtr
1916 Lexer::parse_raw_identifier (location_t loc)
1917 {
1918   // raw identifier
1919   std::string str;
1920   str.reserve (16); // default
1921
1922   skip_input ();
1923   current_char = peek_input ();
1924
1925   current_column += 2;
1926
1927   bool first_is_underscore = current_char == '_';
1928
1929   int length = 0;
1930   current_char = peek_input ();
1931   // loop through entire name
1932   while (is_identifier_continue (current_char.value))
1933     {
1934       length++;
1935
1936       str += current_char;
1937       skip_input ();
1938       current_char = peek_input ();
1939     }
1940
1941   current_column += length;
1942
1943   rust_debug ("raw ident: %s", str.c_str ());
1944
1945   // if just a single underscore, not an identifier
1946   if (first_is_underscore && length == 1)
1947     rust_error_at (get_current_location (),
1948                    "%<_%> is not a valid raw identifier");
1949
1950   using namespace Rust::Values;
1951   std::set<std::string> invalid{
1952     Keywords::CRATE, Keywords::EXTERN_KW,  Keywords::SELF,
1953     Keywords::SUPER, Keywords::SELF_ALIAS,
1954   };
1955
1956   if (invalid.find (str) != invalid.end ())
1957     {
1958       rust_error_at (get_current_location (),
1959                      "%qs is a forbidden raw identifier", str.c_str ());
1960
1961       return nullptr;
1962     }
1963   else
1964     {
1965       str.shrink_to_fit ();
1966       loc += length - 1;
1967
1968       return Token::make_identifier (loc, std::move (str));
1969     }
1970 }
1971
1972 // skip broken string input (unterminated strings)
1973 void
1974 Lexer::skip_broken_string_input (Codepoint current_char)
1975 {
1976   while (current_char != '"' && !current_char.is_eof ())
1977     {
1978       if (current_char == '\n')
1979         {
1980           current_line++;
1981           current_column = 1;
1982         }
1983       else
1984         {
1985           current_column++;
1986         }
1987       skip_input ();
1988       current_char = peek_input ();
1989     }
1990   if (current_char == '"')
1991     {
1992       current_column++;
1993
1994       skip_input ();
1995       current_char = peek_input ();
1996     }
1997   rust_debug ("skipped to %d:%d due to bad quotes", current_line,
1998               current_column);
1999 }
2000
2001 // Parses a string.
2002 TokenPtr
2003 Lexer::parse_string (location_t loc)
2004 {
2005   std::string str;
2006   str.reserve (16); // some sensible default
2007
2008   current_char = peek_input ();
2009
2010   const location_t string_begin_locus = get_current_location ();
2011
2012   // FIXME: This fails if the input ends. How do we check for EOF?
2013   while (current_char.value != '"' && !current_char.is_eof ())
2014     {
2015       if (current_char.value == '\\')
2016         {
2017           int length = 1;
2018
2019           // parse escape
2020           auto utf8_escape_pair = parse_utf8_escape ();
2021           current_char = std::get<0> (utf8_escape_pair);
2022
2023           if (current_char == Codepoint (0) && std::get<2> (utf8_escape_pair))
2024             length = std::get<1> (utf8_escape_pair) - 1;
2025           else
2026             length += std::get<1> (utf8_escape_pair);
2027
2028           if (current_char != Codepoint (0) || !std::get<2> (utf8_escape_pair))
2029             str += current_char.as_string ();
2030
2031           current_column += length;
2032
2033           // FIXME: should remove this but can't.
2034           // `parse_utf8_escape` does not update `current_char` correctly.
2035           current_char = peek_input ();
2036           continue;
2037         }
2038
2039       current_column++;
2040       if (current_char.value == '\n')
2041         {
2042           current_line++;
2043           current_column = 1;
2044           // tell line_table that new line starts
2045           start_line (current_line, max_column_hint);
2046         }
2047
2048       str += current_char;
2049       skip_input ();
2050       current_char = peek_input ();
2051     }
2052
2053   if (current_char.value == '"')
2054     {
2055       current_column++;
2056
2057       skip_input ();
2058       current_char = peek_input ();
2059     }
2060   else if (current_char.is_eof ())
2061     {
2062       rust_error_at (string_begin_locus, "unended string literal");
2063       return Token::make (END_OF_FILE, get_current_location ());
2064     }
2065   else
2066     {
2067       rust_unreachable ();
2068     }
2069
2070   str.shrink_to_fit ();
2071
2072   return Token::make_string (loc, std::move (str));
2073 }
2074
2075 // Parses an identifier or keyword.
2076 TokenPtr
2077 Lexer::parse_identifier_or_keyword (location_t loc)
2078 {
2079   std::string str;
2080   str.reserve (16); // default
2081   str += current_char.as_string ();
2082
2083   bool first_is_underscore = current_char == '_';
2084
2085   int length = 1;
2086   current_char = peek_input ();
2087
2088   // loop through entire name
2089   while (is_identifier_continue (current_char.value))
2090     {
2091       auto s = current_char.as_string ();
2092       length++;
2093
2094       str += current_char.as_string ();
2095       skip_input ();
2096       current_char = peek_input ();
2097     }
2098
2099   current_column += length;
2100
2101   // if just a single underscore, not an identifier
2102   if (first_is_underscore && length == 1)
2103     return Token::make (UNDERSCORE, loc);
2104
2105   str.shrink_to_fit ();
2106
2107   loc += length - 1;
2108
2109   TokenId keyword = classify_keyword (str);
2110   if (keyword == IDENTIFIER)
2111     return Token::make_identifier (loc, std::move (str));
2112   else
2113     return Token::make (keyword, loc);
2114 }
2115
2116 // Possibly returns a raw string token if it exists - otherwise returns null.
2117 TokenPtr
2118 Lexer::maybe_parse_raw_string (location_t loc)
2119 {
2120   int peek_index = 0;
2121   while (peek_input (peek_index) == '#')
2122     peek_index++;
2123
2124   if (peek_input (peek_index) == '"')
2125     return parse_raw_string (loc, peek_index);
2126   else
2127     return nullptr;
2128 }
2129
2130 // Returns a raw string token.
2131 TokenPtr
2132 Lexer::parse_raw_string (location_t loc, int initial_hash_count)
2133 {
2134   // raw string literals
2135   std::string str;
2136   str.reserve (16); // some sensible default
2137
2138   int length = 1 + initial_hash_count;
2139
2140   if (initial_hash_count > 0)
2141     skip_input (initial_hash_count - 1);
2142
2143   current_char = peek_input ();
2144
2145   if (current_char != '"')
2146     rust_error_at (get_current_location (), "raw string has no opening %<\"%>");
2147
2148   length++;
2149   skip_input ();
2150   current_char = peek_input ();
2151
2152   while (!current_char.is_eof ())
2153     {
2154       if (current_char.value == '"')
2155         {
2156           bool enough_hashes = true;
2157
2158           for (int i = 0; i < initial_hash_count; i++)
2159             {
2160               if (peek_input (i + 1) != '#')
2161                 {
2162                   enough_hashes = false;
2163                   break;
2164                 }
2165             }
2166
2167           if (enough_hashes)
2168             {
2169               // skip enough input and peek enough input
2170               skip_input (initial_hash_count);
2171               current_char = peek_input ();
2172               length += initial_hash_count + 1;
2173               break;
2174             }
2175         }
2176
2177       length++;
2178
2179       str += current_char.as_string ();
2180       skip_input ();
2181       current_char = peek_input ();
2182     }
2183
2184   current_column += length;
2185
2186   loc += length - 1;
2187
2188   str.shrink_to_fit ();
2189
2190   return Token::make_string (loc, std::move (str));
2191 }
2192
2193 template <typename IsDigitFunc>
2194 TokenPtr
2195 Lexer::parse_non_decimal_int_literal (location_t loc, IsDigitFunc is_digit_func,
2196                                       std::string existent_str, int base)
2197 {
2198   int length = 1;
2199
2200   skip_input ();
2201   current_char = peek_input ();
2202
2203   length++;
2204
2205   // loop through to add entire number to string
2206   while (is_digit_func (current_char.value) || current_char == '_')
2207     {
2208       if (current_char == '_')
2209         {
2210           // don't add _ to number
2211           skip_input ();
2212           current_char = peek_input ();
2213
2214           length++;
2215
2216           continue;
2217         }
2218
2219       length++;
2220
2221       // add raw numbers
2222       existent_str += current_char;
2223       skip_input ();
2224       current_char = peek_input ();
2225     }
2226
2227   // convert value to decimal representation
2228   long dec_num = std::strtol (existent_str.c_str (), nullptr, base);
2229
2230   existent_str = std::to_string (dec_num);
2231
2232   // parse in type suffix if it exists
2233   auto type_suffix_pair = parse_in_type_suffix ();
2234   PrimitiveCoreType type_hint = type_suffix_pair.first;
2235   length += type_suffix_pair.second;
2236
2237   current_column += length;
2238
2239   if (type_hint == CORETYPE_F32 || type_hint == CORETYPE_F64)
2240     {
2241       rust_error_at (get_current_location (),
2242                      "invalid type suffix %qs for integer (%s) literal",
2243                      get_type_hint_string (type_hint),
2244                      base == 16
2245                        ? "hex"
2246                        : (base == 8 ? "octal"
2247                                     : (base == 2 ? "binary"
2248                                                  : "<insert unknown base>")));
2249       return nullptr;
2250     }
2251
2252   loc += length - 1;
2253
2254   return Token::make_int (loc, std::move (existent_str), type_hint);
2255 }
2256
2257 // Parses a hex, binary or octal int literal.
2258 TokenPtr
2259 Lexer::parse_non_decimal_int_literals (location_t loc)
2260 {
2261   std::string str;
2262   str.reserve (16); // some sensible default
2263   str += current_char;
2264
2265   current_char = peek_input ();
2266
2267   if (current_char == 'x')
2268     {
2269       // hex (integer only)
2270       return parse_non_decimal_int_literal (loc, is_x_digit, str + "x", 16);
2271     }
2272   else if (current_char == 'o')
2273     {
2274       // octal (integer only)
2275       return parse_non_decimal_int_literal (loc, is_octal_digit,
2276                                             std::move (str), 8);
2277     }
2278   else if (current_char == 'b')
2279     {
2280       // binary (integer only)
2281       return parse_non_decimal_int_literal (loc, is_bin_digit, std::move (str),
2282                                             2);
2283     }
2284   else
2285     {
2286       return nullptr;
2287     }
2288 }
2289
2290 // Parses a decimal-based int literal or float literal.
2291 TokenPtr
2292 Lexer::parse_decimal_int_or_float (location_t loc)
2293 {
2294   std::string str;
2295   str.reserve (16); // some sensible default
2296   str += current_char;
2297
2298   int length = 1;
2299   bool first_zero = current_char == '0';
2300
2301   current_char = peek_input ();
2302
2303   // parse initial decimal integer (or first integer part of float) literal
2304   auto initial_decimal = parse_in_decimal ();
2305   str += std::get<0> (initial_decimal);
2306   length += std::get<1> (initial_decimal);
2307
2308   // detect float literal
2309   //
2310   // Note:
2311   //
2312   // We should not use is_float_digit () for this verification but instead
2313   // directly ISDIGIT because rust does not support non digit values right after
2314   // a dot.
2315   // The following value is not legal in rust:
2316   // let a = 3.e1;
2317   // A `0` should be put between the dot and the exponent to be valid
2318   // (eg. 3.0e1).
2319   if (current_char == '.' && ISDIGIT (peek_input (1).value))
2320     {
2321       // float with a '.', parse another decimal into it
2322
2323       // add . to str
2324       str += current_char;
2325       skip_input ();
2326       current_char = peek_input ();
2327       length++;
2328
2329       // parse another decimal number for float
2330       auto second_decimal = parse_in_decimal ();
2331       str += std::get<0> (second_decimal);
2332       length += std::get<1> (second_decimal);
2333
2334       // parse in exponent part if it exists
2335       auto exponent_pair = parse_in_exponent_part ();
2336       str += exponent_pair.first;
2337       length += exponent_pair.second;
2338
2339       // parse in type suffix if it exists
2340       auto type_suffix_pair = parse_in_type_suffix ();
2341       PrimitiveCoreType type_hint = type_suffix_pair.first;
2342       length += type_suffix_pair.second;
2343
2344       if (type_hint != CORETYPE_F32 && type_hint != CORETYPE_F64
2345           && type_hint != CORETYPE_UNKNOWN)
2346         {
2347           rust_error_at (get_current_location (),
2348                          "invalid type suffix %qs for floating-point literal",
2349                          get_type_hint_string (type_hint));
2350           // ignore invalid type suffix as everything else seems fine
2351           type_hint = CORETYPE_UNKNOWN;
2352         }
2353
2354       current_column += length;
2355
2356       loc += length - 1;
2357
2358       str.shrink_to_fit ();
2359       return Token::make_float (loc, std::move (str), type_hint);
2360     }
2361   else if (current_char == '.'
2362            && check_valid_float_dot_end (peek_input (1).value))
2363     {
2364       // float that is just an integer with a terminating '.' character
2365
2366       // add . to str
2367       str += current_char;
2368       skip_input ();
2369       current_char = peek_input ();
2370       length++;
2371
2372       // type hint not allowed
2373
2374       current_column += length;
2375
2376       loc += length - 1;
2377
2378       str.shrink_to_fit ();
2379       return Token::make_float (loc, std::move (str), CORETYPE_UNKNOWN);
2380     }
2381   else if (current_char == 'E' || current_char == 'e')
2382     {
2383       // exponent float with no '.' character
2384
2385       // parse exponent part
2386       auto exponent_pair = parse_in_exponent_part ();
2387       str += exponent_pair.first;
2388       length += exponent_pair.second;
2389
2390       // parse in type suffix if it exists
2391       auto type_suffix_pair = parse_in_type_suffix ();
2392       PrimitiveCoreType type_hint = type_suffix_pair.first;
2393       length += type_suffix_pair.second;
2394
2395       if (type_hint != CORETYPE_F32 && type_hint != CORETYPE_F64
2396           && type_hint != CORETYPE_UNKNOWN)
2397         {
2398           rust_error_at (get_current_location (),
2399                          "invalid type suffix %qs for floating-point literal",
2400                          get_type_hint_string (type_hint));
2401           // ignore invalid type suffix as everything else seems fine
2402           type_hint = CORETYPE_UNKNOWN;
2403         }
2404
2405       current_column += length;
2406
2407       loc += length - 1;
2408
2409       str.shrink_to_fit ();
2410       return Token::make_float (loc, std::move (str), type_hint);
2411     }
2412   else
2413     {
2414       // is an integer
2415
2416       // parse in type suffix if it exists
2417       auto type_suffix_pair = parse_in_type_suffix ();
2418       PrimitiveCoreType type_hint = type_suffix_pair.first;
2419       /* A "real" pure decimal doesn't have a suffix and no zero prefix.  */
2420       if (type_hint == CORETYPE_UNKNOWN)
2421         {
2422           bool pure_decimal = std::get<2> (initial_decimal);
2423           if (pure_decimal && (!first_zero || str.size () == 1))
2424             type_hint = CORETYPE_PURE_DECIMAL;
2425         }
2426       length += type_suffix_pair.second;
2427
2428       current_column += length;
2429
2430       loc += length - 1;
2431
2432       str.shrink_to_fit ();
2433       return Token::make_int (loc, std::move (str), type_hint);
2434     }
2435 }
2436
2437 TokenPtr
2438 Lexer::parse_char_or_lifetime (location_t loc)
2439 {
2440   int length = 1;
2441
2442   current_char = peek_input ();
2443   if (current_char.is_eof ())
2444     return nullptr;
2445
2446   // parse escaped char literal
2447   if (current_char.value == '\\')
2448     {
2449       // parse escape
2450       auto utf8_escape_pair = parse_utf8_escape ();
2451       Codepoint escaped_char = std::get<0> (utf8_escape_pair);
2452       length += std::get<1> (utf8_escape_pair);
2453
2454       if (peek_input ().value != '\'')
2455         {
2456           rust_error_at (get_current_location (), "unended character literal");
2457         }
2458       else
2459         {
2460           skip_input ();
2461           current_char = peek_input ();
2462           length++;
2463         }
2464
2465       current_column += length;
2466
2467       loc += length - 1;
2468
2469       return Token::make_char (loc, escaped_char);
2470     }
2471   else
2472     {
2473       skip_input ();
2474
2475       if (peek_input ().value == '\'')
2476         {
2477           // parse non-escaped char literal
2478           Codepoint non_escaped_char = current_char;
2479
2480           // skip the ' character
2481           skip_input ();
2482           current_char = peek_input ();
2483
2484           // TODO fix due to different widths of utf-8 chars?
2485           current_column += 3;
2486
2487           loc += 2;
2488
2489           return Token::make_char (loc, non_escaped_char);
2490         }
2491       else if (is_identifier_start (current_char.value))
2492         {
2493           // parse lifetime name
2494           std::string str;
2495           str += current_char.as_string ();
2496           length++;
2497
2498           current_char = peek_input ();
2499           while (is_identifier_continue (current_char.value))
2500             {
2501               str += current_char.as_string ();
2502               skip_input ();
2503               current_char = peek_input ();
2504               length++;
2505             }
2506
2507           current_column += length;
2508
2509           loc += length - 1;
2510
2511           // TODO some keywords cannot be used for a lifetime label #2306
2512           // https://doc.rust-lang.org/reference/tokens.html
2513
2514           str.shrink_to_fit ();
2515           return Token::make_lifetime (loc, std::move (str));
2516         }
2517       else
2518         {
2519           rust_error_at (
2520             get_current_location (),
2521             "expected %' after character constant in character literal");
2522           return nullptr;
2523         }
2524     }
2525 }
2526
2527 void
2528 Lexer::split_current_token (TokenId new_left, TokenId new_right)
2529 {
2530   /* TODO: assert that this TokenId is a "simple token" like punctuation and not
2531    * like "IDENTIFIER"? */
2532   location_t current_loc = peek_token ()->get_locus ();
2533   TokenPtr new_left_tok = Token::make (new_left, current_loc);
2534   TokenPtr new_right_tok = Token::make (new_right, current_loc + 1);
2535
2536   token_queue.replace_current_value (std::move (new_left_tok));
2537   token_queue.insert (1, std::move (new_right_tok));
2538 }
2539
2540 void
2541 Lexer::split_current_token (std::vector<TokenPtr> new_tokens)
2542 {
2543   rust_assert (new_tokens.size () > 0);
2544   token_queue.replace_current_value (new_tokens[0]);
2545
2546   for (size_t i = 1; i < new_tokens.size (); i++)
2547     {
2548       token_queue.insert (i, new_tokens[i]);
2549     }
2550 }
2551
2552 void
2553 Lexer::start_line (int current_line, int current_column)
2554 {
2555   if (line_map)
2556     linemap_line_start (line_table, current_line, current_column);
2557 }
2558
2559 } // namespace Rust
2560
2561 #if CHECKING_P
2562
2563 namespace selftest {
2564
2565 // Checks if `src` has the same contents as the given characters
2566 static void
2567 assert_source_content (Rust::InputSource &src,
2568                        const std::vector<uint32_t> &expected)
2569 {
2570   Rust::Codepoint src_char = src.next ();
2571   for (auto expected_char : expected)
2572     {
2573       // Make sure that `src` is not shorter than `expected`
2574       ASSERT_FALSE (src_char.is_eof ());
2575       // Checks skipped character is expeceted one.
2576       ASSERT_EQ (src_char.value, expected_char);
2577       src_char = src.next ();
2578     }
2579   // Checks if `src` and `chars` has the same length.
2580   ASSERT_TRUE (src_char.is_eof ());
2581 }
2582
2583 static void
2584 test_buffer_input_source (std::string str,
2585                           const std::vector<uint32_t> &expected)
2586 {
2587   Rust::BufferInputSource source (str, 0);
2588   assert_source_content (source, expected);
2589 }
2590
2591 static void
2592 test_file_input_source (std::string str, const std::vector<uint32_t> &expected)
2593 {
2594   FILE *tmpf = tmpfile ();
2595   // Moves to the first character
2596   fputs (str.c_str (), tmpf);
2597   std::rewind (tmpf);
2598   Rust::FileInputSource source (tmpf);
2599   assert_source_content (source, expected);
2600 }
2601
2602 void
2603 rust_input_source_test ()
2604 {
2605   // ASCII
2606   std::string src = u8"_abcde\tXYZ\v\f";
2607   std::vector<uint32_t> expected
2608     = {'_', 'a', 'b', 'c', 'd', 'e', '\t', 'X', 'Y', 'Z', '\v', '\f'};
2609   test_buffer_input_source (src, expected);
2610
2611   // BOM
2612   src = u8"\xef\xbb\xbfOK";
2613   expected = {'O', 'K'};
2614   test_buffer_input_source (src, expected);
2615
2616   // Russian
2617   src = u8"приве́т";
2618   expected = {L'п',
2619               L'р',
2620               L'и',
2621               L'в',
2622               0x0435 /* CYRILLIC SMALL LETTER IE е */,
2623               0x301 /* COMBINING ACUTE ACCENT ́ */,
2624               L'т'};
2625   test_buffer_input_source (src, expected);
2626
2627   src = u8"❤️🦀";
2628   expected = {0x2764 /* HEAVY BLACK HEART */,
2629               0xfe0f /* VARIATION SELECTOR-16 */, L'🦀'};
2630   test_buffer_input_source (src, expected);
2631
2632   src = u8"こんにちは";
2633   expected = {L'こ', L'ん', L'に', L'ち', L'は'};
2634   test_file_input_source (src, expected);
2635
2636   src = u8"👮‍♂👩‍⚕";
2637   expected
2638     = {0x1f46e /* POLICE OFFICER */,   0x200d /* ZERO WIDTH JOINER */,
2639        0x2642 /* MALE SIGN */,         0x1f469 /* WOMAN */,
2640        0x200d /* ZERO WIDTH JOINER */, 0x2695 /* STAFF OF AESCULAPIUS */};
2641   test_file_input_source (src, expected);
2642 }
2643
2644 } // namespace selftest
2645
2646 #endif // CHECKING_P