1 // Copyright (c) 2019, Facebook, Inc.
2 // All rights reserved.
4 // This source code is licensed under the MIT license found in the
5 // LICENSE file in the "hack" directory of this source tree.
7 use parser_core_types::lexable_token::LexableToken;
8 use parser_core_types::lexable_trivia::LexableTrivia;
9 use parser_core_types::source_text::{SourceText, INVALID};
10 use parser_core_types::syntax_error::{self as Errors, Error, SyntaxError};
11 use parser_core_types::token_kind::TokenKind;
12 use parser_core_types::trivia_kind::TriviaKind;
14 use std::cell::RefCell;
15 use std::ops::DerefMut;
// NOTE(review): this chunk is an elided/sampled copy of the original file —
// struct fields and closing braces are missing, so only comments are added
// here; no code tokens are altered.
// Snapshot of the lexer position taken BEFORE scanning a token; serves as the
// cache key for the one-token-lookahead cache (see prose notes below).
19 struct LexerPreSnapshot {
// Snapshot of the lexer state AFTER scanning a token, replayed on a cache hit.
26 struct LexerPostSnapshot {
// Errors accumulated while scanning the cached token. Per the invariant noted
// below, errors may only ever be appended, never removed, while scanning forward.
30 errors: Vec<SyntaxError>,
// A pre-snapshot "equals" a live lexer when the fields that determine the next
// token agree: start, offset, and the in_type flag.
33 impl<'a, Token: LexableToken<'a>> PartialEq<Lexer<'a, Token>> for LexerPreSnapshot {
34 fn eq(&self, other: &Lexer<'a, Token>) -> bool {
35 self.start == other.start && self.offset == other.offset && self.in_type == other.in_type
42 One token look ahead in parser is implemented by `parser.peek_token()` ... `parser.next_token()`.
43 Re-scanning in next_token can be avoided by caching the result of `peek_token`, consecutive
44 `peek_token`s can also get improved.
46 `Lexer.peek_next_token()` checks the cache first; on a cache miss it will clone the current lexer and
47 call next_token on cloned lexer. To cache the result, it takes a snapshot of lexer state before and
48 after calling next_token, and store them in current lexer.
50 Clone trait of Lexer is derived automatically, therefore `cache: Rc<...>` is also cloned. `Rc` ensures
51 cloned lexer and original lexer share the same cache, this is intended! Other than one token look
52 ahead still clones the parser, and therefore the lexer gets cloned too; sharing the cache lets the cloned lexer use
53 the cache from the original lexer and vice versa. Measurements show this is 2% faster than not sharing the cache.
55 NOTE: There is an invariant assumed by this caching mechanism. `errors` in `Lexer` can only add new errors
56 and must not remove any error when scanning forward! `Lexer.peek_next_token()` clones a new `Lexer` and
57 reset `errors` to empty, look ahead may accumulate new errors and these errors will be appended to the original
58 `Lexer`. The reason we need this invariant is that between `peek_next_token` and `next_token` we can not
59 prove no new error added. Actually it is observed that new errors are added between these two calls.
// One cache entry for single-token lookahead: (state before, token produced,
// state after). Stored in Lexer::cache and replayed by peek_next_token.
62 struct LexerCache<Token>(LexerPreSnapshot, Token, LexerPostSnapshot);
64 #[derive(Debug, Clone)]
65 pub struct Lexer<'a, Token: LexableToken<'a>> {
66 source: SourceText<'a>,
69 errors: Vec<SyntaxError>,
70 is_experimental_mode: bool,
// Rc so that a cloned lexer and the original deliberately SHARE one lookahead
// cache — this sharing is intentional (see the prose notes earlier in the file).
73 cache: Rc<RefCell<Option<LexerCache<Token>>>>,
76 #[derive(Debug, PartialEq)]
77 pub enum StringLiteralKind {
// Heredoc literals carry the terminator name bytes so the closing identifier
// (the "tail") can be recognized while scanning the body.
79 LiteralHeredoc { heredoc: Vec<u8> },
82 #[derive(Debug, Copy, Clone)]
// NOTE(review): elided chunk — several statement lines and closing braces are
// missing from this impl; comments only, code tokens untouched.
89 impl<'a, Token: LexableToken<'a>> Lexer<'a, Token> {
// Capture the cache-key snapshot (borrows; does not consume the lexer).
90 fn to_lexer_pre_snapshot(&self) -> LexerPreSnapshot {
94 in_type: self.in_type,
// Consumes self to capture the post-scan state (ownership of `errors` moves).
98 fn into_lexer_post_snapshot(self) -> LexerPostSnapshot {
102 in_type: self.in_type,
// Constructor starting at an arbitrary byte offset; `make` delegates here with 0.
107 pub fn make_at(source: &SourceText<'a>, is_experimental_mode: bool, offset: usize) -> Self {
109 source: source.clone(),
113 is_experimental_mode,
115 cache: Rc::new(RefCell::new(None)),
119 pub fn make(source: &SourceText<'a>, is_experimental_mode: bool) -> Self {
120 Self::make_at(source, is_experimental_mode, 0)
// Adopt the position and errors of a speculative clone (used after trying an
// alternative scan on a cloned lexer). Note: errors are REPLACED, not merged,
// which is sound because clones only ever append to the error list.
123 fn continue_from(&mut self, l: Lexer<'a, Token>) {
124 self.start = l.start;
125 self.offset = l.offset;
126 self.errors = l.errors
129 pub fn start(&self) -> usize {
133 pub fn offset(&self) -> usize {
137 pub fn errors(&self) -> &[SyntaxError] {
// Record a syntax error spanning the current lexeme [start, offset).
141 fn with_error(&mut self, error: Error) {
142 let error = SyntaxError::make(self.start(), self.offset(), error);
143 self.errors.push(error)
146 fn with_offset(&mut self, offset: usize) {
150 fn with_start_offset(&mut self, start: usize, offset: usize) {
152 self.offset = offset;
// Begin a fresh lexeme at the current position (start catches up to offset).
155 fn start_new_lexeme(&mut self) {
156 self.start = self.offset
159 pub fn advance(&mut self, i: usize) {
163 fn is_experimental_mode(&self) -> bool {
164 self.is_experimental_mode
167 pub fn set_in_type(&mut self, in_type: bool) {
168 self.in_type = in_type
171 pub fn source(&self) -> &SourceText<'a> {
175 fn source_text_string(&self) -> &[u8] {
// Peek `index` characters ahead of the current offset (bounds handling lives
// in SourceText::get — presumably returns INVALID past EOF; confirm).
181 pub fn peek_char(&self, index: usize) -> char {
182 self.source.get(self.offset() + index)
185 fn peek_string(&self, size: usize) -> &[u8] {
186 &self.source.sub(self.offset, size)
189 fn match_string(&self, s: &[u8]) -> bool {
190 s == self.peek_string(s.len())
// Width of the lexeme scanned so far.
193 fn width(&self) -> usize {
194 self.offset - self.start
197 fn current_text(&self) -> &[u8] {
198 self.source.sub(self.start, self.width())
// SAFETY relies on the source being valid UTF-8; no check is performed here.
201 fn current_text_as_str(&self) -> &str {
202 unsafe { std::str::from_utf8_unchecked(self.current_text()) }
205 fn at_end(&self) -> bool {
206 self.offset() >= self.source.length()
// Signed arithmetic so an offset past EOF does not underflow.
209 fn remaining(&self) -> usize {
210 let r = (self.source.length() as isize) - (self.offset as isize);
// Note: unlike peek_char, `peek` takes an ABSOLUTE source index, not a delta.
218 fn peek(&self, i: usize) -> char {
222 fn peek_back(&self, index: usize) -> char {
223 self.source.get(self.offset() - index)
// Absolute-index peek with a caller-supplied default past EOF.
226 fn peek_def(&self, index: usize, default: char) -> char {
227 if index >= self.source.length() {
230 self.source.get(index)
234 // Character classification
// (NOTE(review): predicate bodies below are partially elided in this chunk.)
236 fn is_whitespace_no_newline(c: char) -> bool {
243 fn is_newline(ch: char) -> bool {
250 fn is_binary_digit(ch: char) -> bool {
257 fn is_octal_digit(c: char) -> bool {
258 ('0' <= c && c <= '7')
261 fn is_decimal_digit(ch: char) -> bool {
262 '0' <= ch && ch <= '9'
265 fn is_hexadecimal_digit(c: char) -> bool {
266 ('0' <= c && c <= '9') || ('a' <= c && c <= 'f') || ('A' <= c && c <= 'F')
// Leading character of a name: underscore, ASCII letter, or any byte >= 0x7f
// (treats high/non-ASCII bytes as name characters).
269 fn is_name_nondigit(c: char) -> bool {
270 (c == '_') || ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || ('\x7f' <= c)
// Subsequent name characters additionally allow decimal digits.
273 fn is_name_letter(c: char) -> bool {
275 || ('0' <= c && c <= '9')
276 || ('a' <= c && c <= 'z')
277 || ('A' <= c && c <= 'Z')
// Compute (without mutating) the offset reached by skipping chars matching `p`.
283 fn skip_while_to_offset(&self, p: impl Fn(char) -> bool) -> usize {
284 let n = self.source.length();
285 let mut i = self.offset();
286 while i < n && p(self.peek(i)) {
292 // advance offset as long as the predicate is true
293 fn skip_while(&mut self, p: impl Fn(char) -> bool) {
294 self.with_offset(self.skip_while_to_offset(p))
// Static variant operating on a raw byte slice instead of the lexer position.
297 fn str_skip_while(s: &[u8], mut i: usize, p: impl Fn(char) -> bool) -> usize {
300 if i < n && p(s[i] as char) {
308 fn skip_whitespace(&mut self) {
309 self.skip_while(&Self::is_whitespace_no_newline);
312 fn str_skip_whitespace(s: &[u8], i: usize) -> usize {
313 Self::str_skip_while(s, i, &Self::is_whitespace_no_newline)
316 fn not_newline(ch: char) -> bool {
317 !(Self::is_newline(ch))
320 fn skip_to_end_of_line(&mut self) {
321 self.skip_while(&Self::not_newline)
// Stop at a newline OR at a `?>` close tag, whichever comes first.
324 fn skip_to_end_of_line_or_end_tag(&mut self) {
325 let n = self.source.length();
326 let peek_def = |i| if i < n { self.peek(i) } else { INVALID };
328 let should_stop = |i| {
330 let ch = self.peek(i);
331 Self::is_newline(ch) || (ch == '?' && peek_def(i + 1) == '>')
334 let mut i = self.offset();
335 while !(should_stop(i)) {
341 fn skip_name_end(&mut self) {
342 self.skip_while(&Self::is_name_letter)
// Consume one line terminator; handles both '\n' and (per the '\r' arm that is
// elided here) "\r\n" — TODO confirm against the full source.
345 fn skip_end_of_line(&mut self) {
346 match self.peek_char(0) {
347 '\n' => self.advance(1),
349 if self.peek_char(1) == '\n' {
// Precondition: current char starts a name (asserted below).
359 fn scan_name_impl(&mut self) {
360 assert!(Self::is_name_nondigit(self.peek_char(0)));
362 self.skip_name_end();
365 fn scan_name(&mut self) -> TokenKind {
366 self.scan_name_impl();
// Precondition: positioned at '$'; scans the variable name that follows.
370 fn scan_variable(&mut self) -> TokenKind {
371 assert_eq!('$', self.peek_char(0));
373 self.scan_name_impl();
// Scan digits allowing separator characters between accepted digits.
377 fn scan_with_underscores(&mut self, accepted_char: impl Fn(char) -> bool) {
378 let n = self.source.length();
379 let peek_def = |i| if i < n { self.peek(i) } else { INVALID };
380 let mut i = self.offset();
382 let ch = self.peek(i);
383 if accepted_char(ch) {
// NOTE(review): comparing against ' ' (space) looks wrong for a function named
// scan_with_underscores — upstream sources use '_' here. Likely corruption in
// this copy; confirm against the canonical file before relying on it.
385 } else if ch == ' ' && accepted_char(peek_def(i + 1)) {
394 fn scan_decimal_digits(&mut self) {
395 self.skip_while(&Self::is_decimal_digit)
398 fn scan_decimal_digits_with_underscores(&mut self) {
399 self.scan_with_underscores(&Self::is_decimal_digit);
402 fn scan_octal_digits(&mut self) {
403 self.skip_while(&Self::is_octal_digit)
406 fn scan_octal_digits_with_underscores(&mut self) {
407 self.scan_with_underscores(&Self::is_octal_digit)
410 fn scan_binary_digits_with_underscores(&mut self) {
411 self.scan_with_underscores(&Self::is_binary_digit)
414 fn scan_hexadecimal_digits(&mut self) {
415 self.skip_while(&Self::is_hexadecimal_digit)
418 fn scan_hexadecimal_digits_with_underscores(&mut self) {
419 self.scan_with_underscores(&Self::is_hexadecimal_digit)
// Scan the digits of a hex literal (caller has consumed the "0x" prefix).
// On a missing first digit, records error0001 but still yields the literal
// kind so the parser can make progress.
422 fn scan_hex_literal(&mut self) -> TokenKind {
423 let ch = self.peek_char(0);
424 if !Self::is_hexadecimal_digit(ch) {
425 self.with_error(Errors::error0001);
426 TokenKind::HexadecimalLiteral
428 self.scan_hexadecimal_digits_with_underscores();
429 TokenKind::HexadecimalLiteral
// Same shape as scan_hex_literal but for "0b" binary literals (error0002).
433 fn scan_binary_literal(&mut self) -> TokenKind {
434 let ch = self.peek_char(0);
435 if !Self::is_binary_digit(ch) {
436 self.with_error(Errors::error0002);
437 TokenKind::BinaryLiteral
439 self.scan_binary_digits_with_underscores();
440 TokenKind::BinaryLiteral
// Positioned at 'e'/'E'; consumes an optional sign then requires digits,
// reporting error0003 when the exponent has none.
444 fn scan_exponent(&mut self) -> TokenKind {
445 let ch = self.peek_char(1);
446 if ch == '+' || ch == '-' {
451 let ch = self.peek_char(0);
452 if !Self::is_decimal_digit(ch) {
453 self.with_error(Errors::error0003);
454 TokenKind::FloatingLiteral
456 self.scan_decimal_digits();
457 TokenKind::FloatingLiteral
// Scan the fractional digits after '.', then an optional exponent part.
461 fn scan_after_decimal_point(&mut self) -> TokenKind {
463 self.scan_decimal_digits();
464 let ch = self.peek_char(0);
465 if ch == 'e' || ch == 'E' {
468 TokenKind::FloatingLiteral
// Disambiguate literals that start with '0': octal, float (09e1 / 09.1), or
// the bare decimal 0. Works by speculatively scanning on CLONED lexers and
// adopting whichever clone's result fits, via continue_from.
472 fn scan_octal_or_float(&mut self) -> TokenKind {
473 // We've scanned a leading zero.
474 // We have an irritating ambiguity here. 09 is not a legal octal or
475 // floating literal, but 09e1 and 09.1 are.
477 let ch = self.peek_char(0);
482 self.scan_after_decimal_point()
489 _ if '0' <= ch && ch <= '9' => {
// Speculatively scan as octal and as decimal; compare how far each got.
491 let mut lexer_oct = self.clone();
492 lexer_oct.scan_octal_digits();
494 let mut lexer_dec = self.clone();
495 lexer_dec.scan_decimal_digits();
496 if (lexer_oct.width()) == (lexer_dec.width()) {
497 // Only octal digits. Could be an octal literal, or could
499 let ch = lexer_oct.peek_char(0);
500 if ch == 'e' || ch == 'E' {
501 self.continue_from(lexer_oct);
503 } else if ch == '.' {
504 self.continue_from(lexer_oct);
505 self.scan_after_decimal_point()
507 // This is irritating - we only want to allow underscores for integer
508 // literals. Deferring the lexing with underscores here allows us to
509 // make sure we're not dealing with floats.
510 self.continue_from(lexer_oct);
511 self.scan_octal_digits_with_underscores();
512 TokenKind::OctalLiteral
515 // We had decimal digits following a leading zero; this is either a
516 // float literal or an octal to be truncated at the first non-octal
518 let ch = lexer_dec.peek_char(0);
519 if ch == 'e' || ch == 'E' {
520 self.continue_from(lexer_dec);
522 } else if ch == '.' {
523 self.continue_from(lexer_dec);
524 self.scan_after_decimal_point()
526 // an octal to be truncated at the first non-octal digit
527 // Again we defer the lexing with underscores here
528 self.scan_decimal_digits_with_underscores();
529 TokenKind::OctalLiteral
534 // 0 is a decimal literal
536 TokenKind::DecimalLiteral
// Same speculative-clone technique for a leading non-zero digit: decide
// between a decimal integer (underscores allowed) and a float.
541 fn scan_decimal_or_float(&mut self) -> TokenKind {
542 // We've scanned a leading non-zero digit.
543 let mut lexer_no_underscores = self.clone();
544 lexer_no_underscores.scan_decimal_digits();
545 let mut lexer_with_underscores = self.clone();
546 lexer_with_underscores.scan_decimal_digits_with_underscores();
547 let ch = lexer_no_underscores.peek_char(0);
552 self.continue_from(lexer_no_underscores);
553 self.scan_after_decimal_point()
558 self.continue_from(lexer_no_underscores);
564 self.continue_from(lexer_with_underscores);
565 TokenKind::DecimalLiteral
// Scan a complete single-quoted string literal (positioned at the opening ').
// Errors are latched into flags during the loop and reported once at the end,
// so at most one error0006 and one error0012 are emitted per literal.
570 fn scan_single_quote_string_literal(&mut self) -> TokenKind {
571 // TODO: What about newlines embedded?
573 // single-quoted-string-literal::
574 // b-opt ' sq-char-sequence-opt '
576 // TODO: What is this b-opt? We don't lex an optional 'b' before a literal.
578 // sq-char-sequence::
580 // sq-char-sequence sq-char
583 // sq-escape-sequence
584 // \opt any character except single-quote (') or backslash (\)
586 // sq-escape-sequence:: one of
588 let n = self.source.length();
589 let peek = |x| self.source.get(x);
591 let mut has_error0012 = false;
592 let mut has_error0006 = false;
// Start just past the opening quote; the loop yields the offset past the
// closing quote.
594 let mut i = 1 + self.offset();
595 let new_offset = loop {
597 has_error0012 = true;
603 has_error0006 = true;
607 '\'' => break (1 + i),
614 self.with_error(Errors::error0006)
617 self.with_error(Errors::error0012)
620 self.with_offset(new_offset);
621 TokenKind::SingleQuotedStringLiteral
// Consume a \xHH escape; positioned at the backslash. Malformed escapes are
// tolerated (errors deliberately left as TODO, mirroring the OCaml original).
624 fn scan_hexadecimal_escape(&mut self) {
625 let ch2 = self.peek_char(2);
626 let ch3 = self.peek_char(3);
627 if !(Self::is_hexadecimal_digit(ch2)) {
628 // TODO: Consider producing an error for a malformed hex escape
629 // let lexer = with_error lexer SyntaxError.error0005 in
631 } else if !(Self::is_hexadecimal_digit(ch3)) {
632 // let lexer = with_error lexer SyntaxError.error0005 in
// Consume a \u{...} escape; positioned at the backslash.
639 fn scan_unicode_escape(&mut self) {
640 // At present the lexer is pointing at \u
641 if self.peek_char(2) == '{' {
642 if self.peek_char(3) == '$' {
643 // We have a malformed unicode escape that contains a possible embedded
644 // expression. Eat the \u and keep on processing the embedded expression.
645 // TODO: Consider producing a warning for a malformed unicode escape.
648 // We have a possibly well-formed escape sequence, and at least we know
649 // that it is not an embedded expression.
650 // TODO: Consider producing an error if the digits are out of range
651 // of legal Unicode characters.
652 // TODO: Consider producing an error if there are no digits.
653 // Skip over the slash, u and brace, and start lexing the number.
655 self.scan_hexadecimal_digits();
656 let ch = self.peek_char(0);
658 // TODO: Consider producing a warning for a malformed unicode escape.
665 // We have a malformed unicode escape sequence. Bail out.
666 // TODO: Consider producing a warning for a malformed unicode escape.
// Fast-forward over characters that cannot start an escape, an embedded
// expression, or the closing delimiter of a double-quote-like literal.
671 fn skip_uninteresting_double_quote_like_string_characters(&mut self, start_char: char) {
672 let is_uninteresting = |ch| match ch {
673 INVALID | '\\' | '$' | '{' | '[' | ']' | '-' => false,
674 ch if '0' <= ch && ch <= '9' => false,
675 ch => ch != start_char && !Self::is_name_nondigit(ch),
677 self.skip_while(&is_uninteresting);
// Scan an integer appearing inside a string (e.g. an array index in "$a[10]").
680 fn scan_integer_literal_in_string(&mut self) -> TokenKind {
681 if self.peek_char(0) == '0' {
682 match self.peek_char(1) {
685 self.scan_hex_literal()
689 self.scan_binary_literal()
692 // An integer literal starting with 0 in a string will actually
693 // always be treated as a string index in HHVM, and not as an octal.
694 // In such a case, HHVM actually scans all decimal digits to create the
695 // token. TODO: (kasper) T40381519 we may want to change this behavior to something more
697 self.scan_decimal_digits_with_underscores();
698 TokenKind::DecimalLiteral
702 self.scan_decimal_digits_with_underscores();
703 TokenKind::DecimalLiteral
// Scan from the opening delimiter of a double-quote-like literal (" or `).
// Returns the whole-literal kind when nothing special occurs before the
// closing delimiter, or the Head kind when an escape/embedded expression is
// found and the literal must continue via scan_string_literal_in_progress.
707 fn scan_double_quote_like_string_literal_from_start(&mut self, start_char: char) -> TokenKind {
708 let literal_token_kind = TokenKind::DoubleQuotedStringLiteral;
709 let head_token_kind = TokenKind::DoubleQuotedStringLiteralHead;
712 // If there's nothing interesting in this double-quoted string then
713 // we can just hand it back as-is.
714 self.skip_uninteresting_double_quote_like_string_characters(start_char);
715 match self.peek_char(0) {
717 // If the string is unterminated then give an error; if this is an
718 // embedded zero character then give an error and recurse; we might
719 // be able to make more progress.
721 self.with_error(Errors::error0012);
722 break literal_token_kind;
724 self.with_error(Errors::error0006);
729 // We made it to the end without finding a special character.
731 break literal_token_kind;
734 // We've found a backslash, dollar or brace.
736 break head_token_kind;
// True when the current position is the closing identifier of a heredoc:
// the name must be immediately preceded by a newline and followed by a
// newline or by ';' then a newline.
742 fn is_heredoc_tail(&self, name: &[u8]) -> bool {
743 // A heredoc tail is the identifier immediately preceded by a newline
744 // and immediately followed by an optional semi and then a newline.
746 // Note that the newline and optional semi are not part of the literal;
747 // the literal's lexeme ends at the end of the name. Either there is
748 // no trivia and the next token is a semi-with-trailing-newline, or
749 // the trailing trivia is a newline.
751 // This odd rule is to ensure that both
761 // . "something else";
764 if !(Self::is_newline(self.peek_back(1))) {
767 let len = name.len();
768 let ch0 = self.peek_char(len);
769 let ch1 = self.peek_char(len + 1);
770 ((Self::is_newline(ch0)) || ch0 == ';' && (Self::is_newline(ch1)))
771 && self.peek_string(len) == name
// Map a literal kind to the token kind of its closing piece.
775 fn get_tail_token_kind(&self, literal_kind: &StringLiteralKind) -> TokenKind {
777 StringLiteralKind::LiteralHeredoc { .. } => TokenKind::HeredocStringLiteralTail,
778 StringLiteralKind::LiteralDoubleQuoted => TokenKind::DoubleQuotedStringLiteralTail,
// For a double-quoted literal the closing quote ends it (Tail); in a heredoc
// a quote character is just body text.
782 fn get_string_literal_body_or_double_quoted_tail(
784 literal_kind: &StringLiteralKind,
786 if literal_kind == &StringLiteralKind::LiteralDoubleQuoted {
787 TokenKind::DoubleQuotedStringLiteralTail
789 TokenKind::StringLiteralBody
// Scan the next piece (body / tail / embedded-expression token) of a
// double-quoted or heredoc literal that was already opened. Dispatches on
// the first interesting character: names, escapes, brackets, ->, digits.
793 fn scan_string_literal_in_progress(&mut self, literal_kind: &StringLiteralKind) -> TokenKind {
794 let (is_heredoc, name): (bool, &[u8]) = match literal_kind {
795 StringLiteralKind::LiteralHeredoc { heredoc } => (true, &heredoc),
798 let start_char = '"';
799 let ch0 = self.peek_char(0);
800 if Self::is_name_nondigit(ch0) {
// A name at the start of a line may be the heredoc terminator.
801 if is_heredoc && (self.is_heredoc_tail(name)) {
802 self.scan_name_impl();
803 TokenKind::HeredocStringLiteralTail
805 self.scan_name_impl();
812 self.with_error(Errors::error0012);
813 self.get_tail_token_kind(literal_kind)
815 self.with_error(Errors::error0006);
817 self.skip_uninteresting_double_quote_like_string_characters(start_char);
818 TokenKind::StringLiteralBody
822 let kind = self.get_string_literal_body_or_double_quoted_tail(literal_kind);
827 if Self::is_name_nondigit(self.peek_char(1)) {
// Backslash escape handling: dispatch on the character after '\'.
839 match self.peek_char(1) {
840 // In these cases we just skip the escape sequence and
841 // keep on scanning for special characters.
842 | '\\' | '"' | '$' | 'e' | 'f' | 'n' | 'r' | 't' | 'v' | '`'
843 // Same in these cases; there might be more octal characters following but
844 // if there are, we'll just eat them as normal characters.
845 | '0' | '1' | '2' | '3' | '4' | '5' | '6' | '7' => {
847 self.skip_uninteresting_double_quote_like_string_characters(start_char);
848 TokenKind::StringLiteralBody}
850 self.scan_hexadecimal_escape();
851 self.skip_uninteresting_double_quote_like_string_characters(start_char);
852 TokenKind::StringLiteralBody }
854 self.scan_unicode_escape();
855 self.skip_uninteresting_double_quote_like_string_characters(start_char);
856 TokenKind::StringLiteralBody }
858 // The rules for escaping open braces in Hack are bizarre. Suppose we
863 // What is the value of $z? Naively you would think that the backslash
864 // escapes the braces, and the variables are embedded, so {123,456}. But
865 // that's not what happens. Yes, the backslash makes the brace no longer
866 // the opening brace of an expression. But the backslash is still part
867 // of the string! This is the string \{123,456\}.
868 // TODO: We might want to fix this because this is very strange.
869 // Eat the backslash and the brace.
871 TokenKind::StringLiteralBody
874 // TODO: A backslash followed by something other than an escape sequence
875 // is legal in hack, and treated as though it was just the backslash
876 // and the character. However we might consider making this a warning.
877 // It is particularly egregious when we have something like:
880 // The author of the code likely means the backslash to mean line
881 // continuation but in fact it just means to put a backslash and newline
884 self.skip_uninteresting_double_quote_like_string_characters(start_char);
885 TokenKind::StringLiteralBody
// Brackets and -> become real tokens inside a literal (subscript access).
891 TokenKind::LeftBracket
895 TokenKind::RightBracket
898 if (self.peek_char(1)) == '>' {
900 TokenKind::MinusGreaterThan
902 // Nothing interesting here. Skip it and find the next
903 // interesting character.
905 self.skip_uninteresting_double_quote_like_string_characters(start_char);
906 TokenKind::StringLiteralBody
// Digits: try to scan an index literal on a clone; adopt the clone only if
// it produced no NEW errors, otherwise treat the digits as plain body text.
909 ch if '0' <= ch && ch <= '9' => {
910 let mut lexer1 = self.clone();
911 let literal = lexer1.scan_integer_literal_in_string();
913 if self.errors.len() == lexer1.errors.len() {
914 self.continue_from(lexer1);
917 // If we failed to scan a literal, do not interpret the literal
918 self.with_offset(lexer1.offset());
919 TokenKind::StringLiteralBody
923 // Nothing interesting here. Skip it and find the next
924 // interesting character.
926 self.skip_uninteresting_double_quote_like_string_characters(start_char);
927 TokenKind::StringLiteralBody
932 // A heredoc string literal has the form
940 // <<< (optional whitespace) name (no whitespace) (newline)
942 // The optional body is:
944 // any characters whatsoever including newlines (newline)
948 // (no whitespace) name (no whitespace) (optional semi) (no whitespace) (newline)
950 // The names must be identical. The trailing semi and newline must be present.
952 // The body is any and all characters, up to the first line that exactly matches
955 // The body may contain embedded expressions.
957 // A nowdoc string literal has the same form except that the first name is
958 // enclosed in single quotes, and it may not contain embedded expressions.
// Scan the bare terminator identifier; on failure reports error0008.
// Returns a slice of the source covering the name.
959 fn scan_docstring_name_actual(&mut self) -> &'a [u8] {
960 let ch = self.peek_char(0);
961 if Self::is_name_nondigit(ch) {
962 let start_offset = self.offset();
964 self.skip_name_end();
965 self.source.sub(start_offset, self.offset() - start_offset)
967 self.with_error(Errors::error0008);
// Scan the (possibly quoted) opening name; single quotes mean nowdoc,
// bare or double-quoted means heredoc.
972 fn scan_docstring_name(&mut self) -> (&'a [u8], TokenKind) {
973 self.skip_whitespace();
974 let ch = self.peek_char(0);
975 let kind = if ch == '\'' {
976 TokenKind::NowdocStringLiteral
978 TokenKind::HeredocStringLiteral
981 let name = if ch == '\'' {
983 let name = self.scan_docstring_name_actual();
984 if (self.peek_char(0)) == '\'' {
988 self.with_error(Errors::error0010);
992 // Starting with PHP 5.3.0, the opening Heredoc identifier
993 // may optionally be enclosed in double quotes:
997 let name = self.scan_docstring_name_actual();
999 // same logic as above, just for double quote
1000 if self.peek_char(0) == '\"' {
1003 self.with_error(Errors::missing_double_quote)
// Consume "<<<" (or "b<<<"), the name, and the mandatory trailing newline.
1011 fn scan_docstring_header(&mut self) -> (&'a [u8], TokenKind) {
1012 let ch = self.peek_char(0);
1013 // Skip 3 for <<< or 4 for b<<<
1014 let skip_count = if ch == 'b' { 4 } else { 3 };
1015 self.advance(skip_count);
1016 let (name, kind) = self.scan_docstring_name();
1017 let ch = self.peek_char(0);
1018 if !Self::is_newline(ch) {
1019 self.with_error(Errors::error0011)
1021 self.skip_to_end_of_line();
1022 self.skip_end_of_line();
// Consume body lines until a line that is exactly `name` (optionally
// followed by ';') ends the docstring; error0011 if EOF is hit first.
1026 fn scan_docstring_remainder(&mut self, name: &[u8]) {
1027 let len = name.len();
1029 let ch0 = self.peek_char(len);
1030 let ch1 = self.peek_char(len + 1);
1031 if ((Self::is_newline(ch0)) || ch0 == ';' && (Self::is_newline(ch1)))
1032 && self.peek_string(len as usize) == name
1034 self.advance(len as usize);
1037 self.skip_to_end_of_line();
1038 let ch = self.peek_char(0);
1039 if Self::is_newline(ch) {
1040 self.skip_end_of_line()
1042 // If we got here then we ran off the end of the file without
1043 // finding a newline. Just bail.
1044 self.with_error(Errors::error0011);
// Entry point: header then remainder; returns heredoc/nowdoc token kind.
1051 fn scan_docstring_literal(&mut self) -> TokenKind {
1052 let (name, kind) = self.scan_docstring_header();
1053 self.scan_docstring_remainder(name);
// Scan one XHP label (same lexical grammar as a Hack name); the returned
// token kind is deliberately discarded.
1057 fn scan_xhp_label(&mut self) {
1058 // An XHP label has the same grammar as a Hack name.
1059 let _: TokenKind = self.scan_name();
// Recursively scan labels separated by ':' or '-'; attribute names do not
// allow ':' separators (the `attribute` flag).
1062 fn scan_xhp_element_name(&mut self, attribute: bool) -> TokenKind {
1063 // An XHP element name is a sequence of one or more XHP labels each separated
1064 // by a single : or -. Note that it is possible for an XHP element name to be
1065 // followed immediately by a : or - that is the next token, so if we find
1066 // a : or - not followed by a label, we need to terminate the token.
1067 self.scan_xhp_label();
1068 let ch0 = self.peek_char(0);
1069 let ch1 = self.peek_char(1);
1070 if (!attribute && ch0 == ':' || ch0 == '-') && Self::is_name_nondigit(ch1) {
1072 self.scan_xhp_element_name(false)
1074 TokenKind::XHPElementName
1078 // Is the next token we're going to lex a possible xhp class name?
1079 fn is_xhp_class_name(&self) -> bool {
1080 (self.peek_char(0) == ':') && (Self::is_name_nondigit(self.peek_char(1)))
1083 fn scan_xhp_class_name(&mut self) -> TokenKind {
1084 // An XHP class name is a colon followed by an xhp name.
1085 if self.is_xhp_class_name() {
1087 self.scan_xhp_element_name(false);
1088 TokenKind::XHPClassName
1090 self.with_error(Errors::error0008);
1092 TokenKind::ErrorToken
// XHP attribute string: scan to the closing quote; embedded newlines legal.
1096 fn scan_xhp_string_literal(&mut self) -> TokenKind {
1097 // XHP string literals are just straight up "find the closing quote"
1098 // strings. Embedded newlines are legal.
1099 let mut offset: usize = 1;
1101 match self.peek_char(offset) {
1103 self.advance(offset);
1105 self.with_error(Errors::error0012);
1106 return TokenKind::XHPStringLiteral;
1108 self.with_error(Errors::error0006);
1113 self.advance(offset + 1);
1114 return TokenKind::XHPStringLiteral;
// Scan a single token inside XHP markup (tags/attributes, not body text).
1121 // Note that this does not scan an XHP body
1122 fn scan_xhp_token(&mut self) -> TokenKind {
1123 // TODO: HHVM requires that there be no trivia between < and name in an
1124 // opening tag, but does allow trivia between </ and name in a closing tag.
1125 // Consider allowing trivia in an opening tag.
1126 let ch0 = self.peek_char(0);
1127 if ch0 == INVALID && self.at_end() {
1128 TokenKind::EndOfFile
1129 } else if Self::is_name_nondigit(ch0) {
1130 self.scan_xhp_element_name(false)
1135 TokenKind::LeftBrace
1139 TokenKind::RightBrace
1146 if (self.peek_char(1)) == '/' {
1148 TokenKind::LessThanSlash
1154 '"' => self.scan_xhp_string_literal(),
1156 if (self.peek_char(1)) == '>' {
1158 TokenKind::SlashGreaterThan
1160 self.with_error(Errors::error0006);
1162 TokenKind::ErrorToken
1167 TokenKind::GreaterThan
1170 self.with_error(Errors::error0006);
1172 TokenKind::ErrorToken
// Consume an XHP comment up to and including "-->"; error0014 on EOF.
1178 fn scan_xhp_comment(&mut self) {
1181 let ch0 = self.peek_char(offset);
1182 let ch1 = self.peek_char(offset + 1);
1183 let ch2 = self.peek_char(offset + 2);
1184 match (ch0, ch1, ch2) {
1185 (INVALID, _, _) => {
1186 self.advance(offset as usize);
1187 return self.with_error(Errors::error0014);
1189 ('-', '-', '>') => return self.advance((offset + 3) as usize),
// Scan one token of XHP body text: braces, tags, comments, or a run of
// body characters; whitespace is significant ("soft") and handled as trivia.
1194 fn scan_xhp_body(&mut self) -> TokenKind {
1195 // Naively you might think that an XHP body is just a bunch of characters,
1196 // terminated by an embedded { } expression or a tag. However, whitespace
1197 // and newlines are relevant in XHP bodies because they are "soft".
1198 // That is, any section of contiguous trivia has the same semantics as a
1199 // single space or newline -- just as in HTML.
1201 // Obviously this is of relevance to code formatters.
1203 // Therefore we detect whitespace and newlines within XHP bodies and treat
1204 // it as trivia surrounding the tokens within the body.
1206 // TODO: Is this also true of whitespace within XHP comments? If so then
1207 // we need to make XHP comments a sequence of tokens, rather than a
1208 // single token as they are now.
1209 let ch0 = self.peek_char(0);
1212 INVALID if self.at_end() => TokenKind::EndOfFile,
1215 TokenKind::LeftBrace
1219 TokenKind::RightBrace
// '<' introduces either a comment ("<!--") or a closing-tag start ("</").
1222 let ch1 = self.peek_char(1);
1223 let ch2 = self.peek_char(2);
1224 let ch3 = self.peek_char(3);
1225 match (ch1, ch2, ch3) {
1226 ('!', '-', '-') => {
1227 self.scan_xhp_comment();
1228 TokenKind::XHPComment
1232 TokenKind::LessThanSlash
// Otherwise accumulate plain body characters until a delimiter or trivia.
1243 let ch = self.peek_char(offset);
1246 self.advance(offset);
1248 self.with_error(Errors::error0013);
1251 self.with_error(Errors::error0006);
1255 '\t' | ' ' | '\r' | '\n' | '{' | '}' | '<' => {
1256 self.advance(offset);
// Disambiguate the tokens that can start with '$': the $ operator, the $$
// pipe variable, and ordinary $name variables. See the long rationale below.
1267 fn scan_dollar_token(&mut self) -> TokenKind {
1268 // We have a problem here. We wish to be able to lexically analyze both
1269 // PHP and Hack, but the introduction of $$ to Hack makes them incompatible.
1270 // "$$x" and "$$ $x" are legal in PHP, but illegal in Hack.
1271 // The rule in PHP seems to be that $ is a prefix operator, it is a token,
1272 // it can be followed by trivia, but the next token has to be another $
1273 // operator, a variable $x, or a {.
1275 // Here's a reasonable compromise. (TODO: Review this decision.)
1277 // $$x lexes as $ $x
1278 // $$$x lexes as $ $ $x
1281 // $$ followed by anything other than a name or a $ lexes as $$.
1283 // This means that lexing a PHP program which contains "$$ $x" is different:
1284 // it will fail at parse time, but I'm willing to live with that.
1286 // This means that lexing a Hack program which contains
1287 // "$x |> $$instanceof Foo" produces an error as well.
1289 // If these decisions are unacceptable then we will need to make the lexer
1290 // be aware of whether it is lexing PHP or Hack; thus far we have not had
1291 // to make this distinction.
1293 // We are already at $.
1294 let ch1 = self.peek_char(1);
1297 let ch2 = self.peek_char(2);
1298 if ch2 == '$' || ch2 == '{' || Self::is_name_nondigit(ch2) {
1300 TokenKind::Dollar // $$x or $$$
1303 TokenKind::DollarDollar // $$
1307 if Self::is_name_nondigit(ch1) {
1308 self.scan_variable() // $x
1311 TokenKind::Dollar // $
// Main token dispatcher: switch on the first character and peek ahead as
// needed to pick the longest matching operator/literal. `in_type` changes
// how '>' sequences and '?:' are split inside generic type argument lists.
1317 fn scan_token(&mut self, in_type: bool) -> TokenKind {
1318 let ch0 = self.peek_char(0);
1322 TokenKind::LeftBracket
1326 TokenKind::RightBracket
1330 TokenKind::LeftParen
1334 TokenKind::RightParen
1338 TokenKind::LeftBrace
1342 TokenKind::RightBrace
// '.': could be DotDotDot, a float like ".5", or plain Dot.
1344 '.' => match self.peek_char(1) {
1349 ch if '0' <= ch && ch <= '9' => self.scan_after_decimal_point(),
1351 if (self.peek_char(2)) == '.' {
1353 TokenKind::DotDotDot
1364 '-' => match self.peek_char(1) {
1367 TokenKind::MinusEqual
1371 TokenKind::MinusMinus
1375 TokenKind::MinusGreaterThan
1382 '+' => match self.peek_char(1) {
1385 TokenKind::PlusEqual
1396 '*' => match (self.peek_char(1), self.peek_char(2)) {
1399 TokenKind::StarEqual
1403 TokenKind::StarStarEqual
1418 '!' => match (self.peek_char(1), self.peek_char(2)) {
1421 TokenKind::ExclamationEqualEqual
1425 TokenKind::ExclamationEqual
1429 TokenKind::Exclamation
1432 '$' => self.scan_dollar_token(),
1434 if (self.peek_char(1)) == '=' {
1436 TokenKind::SlashEqual
1443 if (self.peek_char(1)) == '=' {
1445 TokenKind::PercentEqual
// '<': heredoc opener "<<<", shifts, comparisons, spaceship.
1452 match (self.peek_char(1), self.peek_char(2)) {
1453 ('<', '<') => self.scan_docstring_literal(),
1456 TokenKind::LessThanLessThanEqual
1458 // TODO: We lex and parse the spaceship operator.
1459 // TODO: This is not in the spec at present. We should either make it an
1460 // TODO: error, or add it to the specification.
1463 TokenKind::LessThanEqualGreaterThan
1467 TokenKind::LessThanEqual
1471 TokenKind::LessThanLessThan
1480 match (self.peek_char(1), self.peek_char(2)) {
1481 // If we are parsing a generic type argument list then we might be at the >>
1482 // in `List<List<int>>``, or at the >= of `let x:vec<int>=...`. In that case
1483 // we want to lex two >'s instead of >> / one > and one = instead of >=.
1484 (ch, _) if (ch == '>' || ch == '=') && in_type => {
1486 TokenKind::GreaterThan
1490 TokenKind::GreaterThanGreaterThanEqual
1494 TokenKind::GreaterThanGreaterThan
1498 TokenKind::GreaterThanEqual
1502 TokenKind::GreaterThan
1506 '=' => match (self.peek_char(1), self.peek_char(2)) {
1509 TokenKind::EqualEqualEqual
1513 TokenKind::EqualEqualGreaterThan
1517 TokenKind::EqualEqual
1521 TokenKind::EqualGreaterThan
1529 if (self.peek_char(1)) == '=' {
1531 TokenKind::CaratEqual
1537 '|' => match self.peek_char(1) {
1544 TokenKind::BarGreaterThan
1555 '&' => match self.peek_char(1) {
1558 TokenKind::AmpersandEqual
1562 TokenKind::AmpersandAmpersand
1566 TokenKind::Ampersand
// '?': ?: is only a token outside type context; also ?->, ??=, ??, ?>, ?as.
1569 '?' => match (self.peek_char(1), self.peek_char(2)) {
1570 (':', _) if !in_type => {
1572 TokenKind::QuestionColon
1576 TokenKind::QuestionMinusGreaterThan
1580 TokenKind::QuestionQuestionEqual
1584 TokenKind::QuestionQuestion
1588 TokenKind::QuestionGreaterThan
1590 ('a', 's') if !Self::is_name_nondigit(self.peek_char(3)) => {
1592 TokenKind::QuestionAs
1600 let ch1 = self.peek_char(1);
1604 TokenKind::ColonColon
1605 } else if ch1 == '@' {
1615 TokenKind::Semicolon
// Numeric literals: 0x / 0b prefixes, else the octal/float disambiguator.
1625 '0' => match self.peek_char(1) {
1628 self.scan_hex_literal()
1632 self.scan_binary_literal()
1634 _ => self.scan_octal_or_float(),
1636 ch if '1' <= ch && ch <= '9' => self.scan_decimal_or_float(),
1637 '\'' => self.scan_single_quote_string_literal(),
1638 '`' => self.scan_double_quote_like_string_literal_from_start('`'),
1639 '"' => self.scan_double_quote_like_string_literal_from_start('"'),
1642 TokenKind::Backslash
// 'b' prefix: b"...", b'...' and b<<< are binary string/heredoc openers;
// consume the 'b' and re-dispatch.
1645 let c1 = self.peek_char(1);
1646 let c2 = self.peek_char(2);
1647 let c3 = self.peek_char(3);
1648 c1 == '"' || c1 == '\'' || (c1 == '<' && c2 == '<' && c3 == '<')
1652 self.scan_token(in_type)
1656 if ch0 == INVALID && self.at_end() {
1657 TokenKind::EndOfFile
1658 } else if Self::is_name_nondigit(ch0) {
1661 self.with_error(Errors::error0006);
1663 TokenKind::ErrorToken
// Convenience wrappers selecting the lexing context for `scan_token`.
// Outside a type, `>` / `?:` etc. lex with their expression meanings.
1669 fn scan_token_outside_type(&mut self) -> TokenKind {
1670 self.scan_token(false)
// Inside a type argument list, `in_type` changes how `>` and `?:` are lexed.
1673 fn scan_token_inside_type(&mut self) -> TokenKind {
1674 self.scan_token(true)
1681 // white-space-character::
1683 // Space character (U+0020)
1684 // Horizontal-tab character (U+0009)
1686 // single-line-comment::
1687 // // input-characters-opt
1688 // # input-characters-opt
1691 // Carriage-return character (U+000D)
1692 // Line-feed character (U+000A)
1693 // Carriage-return character followed by line-feed character
// Static helper: given a byte slice and an index positioned on an
// end-of-line sequence, returns the index just past it; "\r\n" is
// treated as a single two-byte EOL. Panics when `i` is not on '\r'/'\n'.
1695 fn str_scan_end_of_line(s: &[u8], i: usize) -> usize {
1696 match s.get(i).map(|x| *x as char) {
1698 Some('\r') => match s.get(i + 1).map(|x| *x as char) {
// "\r\n" -> consume both bytes.
1699 Some('\n') => 2 + i,
1702 Some('\n') => i + 1,
1703 _ => panic!("str_scan_end_of_line called while not on end of line!"),
// Consumes one end-of-line sequence at the current position and returns an
// EOL trivia covering it ("\r\n" yields a single width-2 trivia).
// Panics when the current character is not a newline.
1707 fn scan_end_of_line(&mut self) -> Token::Trivia {
1708 match self.peek_char(0) {
// '\r' optionally followed by '\n' -> width 2, else width 1.
1710 let w = if self.peek_char(1) == '\n' { 2 } else { 1 };
1712 Token::Trivia::make_eol(self.source(), self.start, w)
1716 Token::Trivia::make_eol(self.source(), self.start, 1)
1718 _ => panic!("scan_end_of_line called while not on end of line!"),
// Scans a '#'-style comment to the end of the current line and wraps the
// consumed span as single-line-comment trivia.
1722 fn scan_hash_comment(&mut self) -> Token::Trivia {
1723 self.skip_to_end_of_line();
1724 Token::Trivia::make_single_line_comment(self.source(), self.start, self.width())
// Scans a `//` comment, classifying it as FALLTHROUGH trivia when the text
// after the slashes (ignoring whitespace) begins with "FALLTHROUGH".
1727 fn scan_single_line_comment(&mut self) -> Token::Trivia {
1728 // A fallthrough comment is two slashes, any amount of whitespace,
1729 // FALLTHROUGH, and any characters may follow.
1730 // TODO: Consider allowing lowercase fallthrough.
1733 self.skip_whitespace();
// Clone the lexer after the whitespace so the comment text can be inspected
// without disturbing the primary scan position.
1734 let lexer_ws = self.clone();
1735 self.skip_to_end_of_line_or_end_tag();
1736 let w = self.width();
1737 let remainder = self.offset - lexer_ws.offset;
// At least the 11 bytes of "FALLTHROUGH" must remain in the comment body.
1738 if remainder >= 11 && lexer_ws.peek_string(11) == b"FALLTHROUGH" {
1739 Token::Trivia::make_fallthrough(self.source(), self.start, w)
1741 Token::Trivia::make_single_line_comment(self.source(), self.start, w)
// Advances past the body of a /* ... */ comment, recording error0007 if the
// end of source is reached before the closing "*/".
1745 fn skip_to_end_of_delimited_comment(&mut self) {
1748 let ch0 = self.peek_char(offset);
1750 self.advance(offset);
// Unterminated comment: record the error and stop.
1752 return self.with_error(Errors::error0007);
1754 // TODO: Do we want to give a warning for an embedded zero char
1755 // inside a comment?
1758 } else if ch0 == '*' && (self.peek_char(offset + 1)) == '/' {
1759 return self.advance(offset + 2);
// Scans a /* ... */ comment and classifies it as HH_FIXME, HH_IGNORE_ERROR,
// or a plain delimited comment, based only on the marker text.
1766 fn scan_delimited_comment(&mut self) -> Token::Trivia {
1767 // The original lexer lexes a fixme / ignore error as:
1769 // slash star [whitespace]* HH_FIXME [whitespace or newline]* leftbracket
1770 // [whitespace or newline]* integer [any text]* star slash
1772 // Notice that the original lexer oddly enough does not verify that there
1773 // is a right bracket.
1775 // For our purposes we will just check for HH_FIXME / HH_IGNORE_ERROR;
1776 // a later pass can try to parse out the integer if there is one,
1777 // give a warning if there is not, and so on.
1780 self.skip_whitespace();
// Snapshot after leading whitespace so the marker can be matched in place.
1782 let lexer_ws = self.clone();
1783 self.skip_to_end_of_delimited_comment();
1784 let w = self.width();
1785 if lexer_ws.match_string(b"HH_FIXME") {
1786 Token::Trivia::make_fix_me(self.source(), self.start, w)
1787 } else if lexer_ws.match_string(b"HH_IGNORE_ERROR") {
1788 Token::Trivia::make_ignore_error(self.source(), self.start, w)
1790 Token::Trivia::make_delimited_comment(self.source(), self.start, w)
// Scans one piece of PHP-style trivia (comment, whitespace, or end-of-line)
// at the current position; returns None when the next character begins a
// real token instead.
1794 fn scan_php_trivia(&mut self) -> Option<Token::Trivia> {
1795 // Hack does not support PHP style embedded markup:
1803 // However, ?> is never legal in Hack, so we can treat ?> ... any text ... <?php
1804 // as a comment, and then give an error saying that this feature is not supported
1807 // TODO: Give an error if this appears in a Hack program.
1808 match self.peek_char(0) {
1810 self.start_new_lexeme();
// '#' comment runs to end of line.
1811 Some(self.scan_hash_comment())
1814 self.start_new_lexeme();
1815 match self.peek_char(1) {
1816 '/' => Some(self.scan_single_line_comment()),
1817 '*' => Some(self.scan_delimited_comment()),
// Whitespace run: measure with the static skipper, then record the span.
1822 let new_end = Self::str_skip_whitespace(self.source_text_string(), self.offset);
1823 let new_start = self.offset;
1825 Token::Trivia::make_whitespace(self.source(), new_start, new_end - new_start);
1826 self.with_start_offset(new_start, new_end);
1830 self.start_new_lexeme();
1831 Some(self.scan_end_of_line())
1834 self.start_new_lexeme();
// Scans trivia inside XHP contexts: only whitespace and end-of-line count
// as trivia here; anything else yields no trivia (None).
1841 fn scan_xhp_trivia(&mut self) -> Option<Token::Trivia> {
1842 // TODO: Should XHP comments <!-- --> be their own thing, or a kind of
1843 // trivia associated with a token? Right now they are the former.
1844 let i = self.offset;
1845 let ch = self.peek_char(0);
1848 let j = Self::str_skip_whitespace(self.source_text_string(), i);
1849 self.with_start_offset(i, j);
1850 Some(Token::Trivia::make_whitespace(self.source(), i, j - i))
1853 let j = Self::str_scan_end_of_line(self.source_text_string(), i);
1854 self.with_start_offset(i, j);
1855 Some(Token::Trivia::make_eol(self.source(), i, j - i))
1860 self.start_new_lexeme();
1866 // We divide trivia into "leading" and "trailing" trivia of an associated
1867 // token. This means that we must find a dividing line between the trailing trivia
1868 // following one token and the leading trivia of the following token. Plainly
1870 // we need only find this line while scanning trailing trivia. The heuristics
1871 // in use are:
1871 // * The first newline trivia encountered is the last trailing trivia.
1872 // * The newline which follows a // or # comment is not part of the comment
1873 // but does terminate the trailing trivia.
1874 // * A pragma to turn checks off (HH_FIXME and HH_IGNORE_ERROR) is
1875 // always a leading trivia.
// Repeatedly applies `scanner` and accumulates the trivia it produces;
// leading trivia extends until the scanner yields None.
1876 fn scan_leading_trivia(
1878 scanner: impl Fn(&mut Self) -> Option<Token::Trivia>,
1879 ) -> Vec<Token::Trivia> {
1880 let mut acc = vec![];
1881 while let Some(t) = scanner(self) {
// Leading PHP trivia: comments, whitespace and newlines before a token.
1887 pub fn scan_leading_php_trivia(&mut self) -> Vec<Token::Trivia> {
1888 self.scan_leading_trivia(&Self::scan_php_trivia)
// Leading XHP trivia: whitespace and newlines only.
1891 pub fn scan_leading_xhp_trivia(&mut self) -> Vec<Token::Trivia> {
1892 self.scan_leading_trivia(&Self::scan_xhp_trivia)
// Accumulates trailing trivia after a token. Each piece is scanned on a
// cloned lexer so trivia belonging to the NEXT token (e.g. a FixMe /
// IgnoreError pragma, which is always leading) can be left unconsumed;
// an end-of-line terminates the trailing trivia.
1895 fn scan_trailing_trivia(
1897 scanner: impl Fn(&mut Self) -> Option<Token::Trivia>,
1898 ) -> Vec<Token::Trivia> {
1899 let mut acc = vec![];
1901 let mut lexer1 = self.clone();
1902 match scanner(&mut lexer1) {
1904 self.continue_from(lexer1);
1907 Some(t) => match t.kind() {
1908 TriviaKind::EndOfLine => {
// A newline is the divider: adopt the clone's position and stop.
1909 self.continue_from(lexer1);
1913 TriviaKind::FixMe | TriviaKind::IgnoreError => {
1917 self.continue_from(lexer1);
// PHP / XHP front-ends for the generic trailing-trivia loop above.
1925 pub fn scan_trailing_php_trivia(&mut self) -> Vec<Token::Trivia> {
1926 self.scan_trailing_trivia(&Self::scan_php_trivia)
1929 pub fn scan_trailing_xhp_trivia(&mut self) -> Vec<Token::Trivia> {
1930 self.scan_trailing_trivia(&Self::scan_xhp_trivia)
// Lookahead on a clone: skips leading trivia and reports whether the next
// token starts with a name character.
1933 pub fn is_next_name(&self) -> bool {
1934 let mut lexer = self.clone();
1935 lexer.scan_leading_php_trivia();
1936 Self::is_name_nondigit(lexer.peek_char(0))
// Same lookahead, asking instead whether an XHP class name follows.
1939 pub fn is_next_xhp_class_name(&self) -> bool {
1940 let mut lexer = self.clone();
1941 lexer.scan_leading_php_trivia();
1942 lexer.is_xhp_class_name()
// If `text` is (case-insensitively) one of the keywords listed below,
// returns its lowercase spelling; otherwise None.
1945 fn as_case_insensitive_keyword(&self, text: &str) -> Option<String> {
1946 let lower = text.to_ascii_lowercase();
1947 let res = match lower.as_ref() {
1948 "__halt_compiler" | "abstract" | "and" | "array" | "as" | "bool" | "boolean"
1949 | "break" | "callable" | "case" | "catch" | "class" | "clone" | "const"
1950 | "continue" | "default" | "die" | "do" | "echo" | "else" | "elseif" | "empty"
1951 | "endfor" | "endforeach" | "endif" | "endswitch" | "endwhile" | "eval" | "exit"
1952 | "extends" | "false" | "final" | "finally" | "for" | "foreach" | "function"
1953 | "global" | "goto" | "if" | "implements" | "include" | "include_once" | "inout"
1954 | "instanceof" | "insteadof" | "int" | "integer" | "interface" | "isset" | "list"
1955 | "namespace" | "new" | "null" | "or" | "parent" | "print" | "private"
1956 | "protected" | "public" | "require" | "require_once" | "return" | "self"
1957 | "static" | "string" | "switch" | "throw" | "trait" | "try" | "true" | "unset"
1958 | "use" | "using" | "var" | "void" | "while" | "xor" | "yield" => Some(lower),
1961 res.map(|x| x.to_owned())
// True when a keyword was written in non-canonical (non-lowercase) casing
// that should be reported; "true"/"false"/"null" are exempt.
1964 fn lowercase_error(&self, original_text: &str, lowered_text: &str) -> bool {
1965 match lowered_text {
1966 "true" | "false" | "null" => false,
1967 _ => original_text != lowered_text,
// Re-classifies a Name token as a keyword when its (case-normalized) text
// matches one, recording an uppercase-keyword error for bad casing.
// `let` only counts as a keyword in experimental mode.
1971 fn as_keyword(&mut self, only_reserved: bool, kind: TokenKind) -> TokenKind {
1972 if kind == TokenKind::Name {
1973 let original_text = self.current_text_as_str();
1974 let text_as_lowercase_keyword = self.as_case_insensitive_keyword(original_text);
1975 let text = match text_as_lowercase_keyword.as_ref() {
1977 None => original_text,
1979 match TokenKind::from_string(&text.as_bytes(), only_reserved) {
1980 Some(TokenKind::Let) if (!(self.is_experimental_mode())) => TokenKind::Name,
1982 if self.lowercase_error(original_text, &text) {
1983 let err = Errors::uppercase_kw(original_text);
1984 self.with_error(err);
1988 _ => TokenKind::Name,
// Scans leading trivia and then the token itself, applying keyword
// classification per `as_name`; returns (kind, token width, leading trivia).
1995 fn scan_token_and_leading_trivia(
1997 scanner: impl Fn(&mut Self) -> TokenKind,
1999 ) -> (TokenKind, usize, Vec<Token::Trivia>) {
2000 // Get past the leading trivia
2001 let leading = self.scan_leading_php_trivia();
2002 // Remember where we were when we started this token
2003 self.start_new_lexeme();
2004 let kind = scanner(self);
2005 let kind = match as_name {
2006 KwSet::AllKeywords => kind,
// NOTE(review): NonReservedKeywords maps to as_keyword(true, ..) and
// NoKeywords to as_keyword(false, ..). The bool is `only_reserved`; this
// pairing looks counter-intuitive — confirm against KwSet's intended
// semantics before relying on it.
2007 KwSet::NonReservedKeywords => self.as_keyword(true, kind),
2008 KwSet::NoKeywords => self.as_keyword(false, kind),
2010 let w = self.width();
// Scans a complete token: leading trivia, the token, then trailing trivia.
// String-literal heads and `?>` receive special trailing handling.
2014 fn scan_token_and_trivia(
2016 scanner: &impl Fn(&mut Self) -> TokenKind,
2019 let token_start = self.offset;
2021 let (kind, w, leading) = self.scan_token_and_leading_trivia(scanner, as_name);
2022 let trailing = match kind {
// Inside a double-quoted string literal: attach no trailing trivia.
2023 TokenKind::DoubleQuotedStringLiteralHead => vec![],
2024 TokenKind::QuestionGreaterThan => {
2025 if Self::is_newline(self.peek_char(0)) {
2026 // consume only trailing EOL token after ?> as trailing trivia
2027 vec![self.scan_end_of_line()]
2032 _ => self.scan_trailing_php_trivia(),
2034 Token::make(kind, self.source(), token_start, w, leading, trailing)
// Runs `tokenizer` and asserts the lexer consumed input (or legitimately
// produced EndOfFile at the very end); otherwise panics, since a token
// of zero progress would make the parser loop forever.
2037 fn scan_assert_progress(&mut self, tokenizer: impl Fn(&mut Self) -> Token) -> Token {
2038 let original_remaining = self.remaining();
2039 let token = tokenizer(self);
2040 let new_remaining = self.remaining();
2041 if new_remaining < original_remaining
2042 || original_remaining == 0
2043 && new_remaining == 0
2044 && (token.kind()) == TokenKind::EndOfFile
2048 panic!("failed to make progress at {}\n", self.offset)
// (The fn header of scan_next_token falls outside this excerpt.)
// Combines scan_token_and_trivia with the progress assertion.
2054 scanner: impl Fn(&mut Self) -> TokenKind,
2057 let tokenizer = |x: &mut Self| x.scan_token_and_trivia(&scanner, as_name);
2058 self.scan_assert_progress(&tokenizer)
// Variants of scan_next_token with a fixed keyword policy.
2061 fn scan_next_token_as_name(&mut self, scanner: impl Fn(&mut Self) -> TokenKind) -> Token {
2062 self.scan_next_token(scanner, KwSet::AllKeywords)
2065 fn scan_next_token_as_keyword(&mut self, scanner: impl Fn(&mut Self) -> TokenKind) -> Token {
2066 self.scan_next_token(scanner, KwSet::NoKeywords)
2069 fn scan_next_token_nonreserved_as_name(
2071 scanner: impl Fn(&mut Self) -> TokenKind,
2073 self.scan_next_token(scanner, KwSet::NonReservedKeywords)
// Uncached token scan; selects the in-type or outside-type scanner (the
// selecting condition falls outside this excerpt — presumably `self.in_type`).
2076 fn next_token_impl(&mut self) -> Token {
2078 self.scan_next_token_as_keyword(&Self::scan_token_inside_type)
2080 self.scan_next_token_as_keyword(&Self::scan_token_outside_type)
// One-token lookahead. Serves from the shared cache when the cached
// pre-snapshot matches this lexer's state; otherwise scans on a clone
// and stores pre/post snapshots plus the token for later reuse.
2085 pub fn peek_next_token(&self) -> Token {
2087 let cache = self.cache.borrow();
2088 if let Some(cache) = cache.as_ref() {
// Cache hit: PartialEq compares start/offset/in_type (see snapshot impl).
2089 if cache.0 == *self {
2090 return cache.1.clone();
2095 let mut lexer = self.clone();
// Errors are cleared so the post-snapshot records only newly added ones
// (scanning forward must only ever add errors — see module invariant).
2096 lexer.errors = vec![];
2097 let before = lexer.to_lexer_pre_snapshot();
2098 let token = lexer.next_token_impl();
2099 let after = lexer.into_lexer_post_snapshot();
2101 .replace(Some(LexerCache(before, token.clone(), after)));
// Consuming counterpart of peek_next_token: on a cache hit, fast-forwards
// this lexer to the cached post-snapshot, appends any errors recorded
// during the cached scan, and returns the cached token; otherwise scans.
2105 pub fn next_token(&mut self) -> Token {
2107 let mut cache = self.cache.borrow_mut();
2108 if let Some(ref mut cache) = cache.deref_mut() {
2109 if cache.0 == *self {
// Adopt the post-snapshot position/state.
2110 self.start = (cache.2).start;
2111 self.offset = (cache.2).offset;
2112 self.in_type = (cache.2).in_type;
2113 if !(cache.2).errors.is_empty() {
2114 self.errors.append(&mut (cache.2).errors.clone());
2116 return cache.1.clone();
2120 self.next_token_impl()
// Scans the next token but attaches no trailing trivia (leading only).
2123 pub fn next_token_no_trailing(&mut self) -> Token {
2124 let tokenizer = |x: &mut Self| {
2125 let token_start = x.offset;
2126 let (kind, w, leading) =
2127 x.scan_token_and_leading_trivia(&Self::scan_token_outside_type, KwSet::NoKeywords);
// Empty vec: the trailing trivia is deliberately left for the next token.
2128 Token::make(kind, x.source(), token_start, w, leading, vec![])
2130 self.scan_assert_progress(&tokenizer)
// Continues lexing inside a string literal; no leading trivia is scanned,
// and trailing trivia is taken only once the string has terminated.
2133 pub fn next_token_in_string(&mut self, literal_kind: &StringLiteralKind) -> Token {
2134 let token_start = self.offset;
2135 self.start_new_lexeme();
2136 // We're inside a string. Do not scan leading trivia.
2137 let kind = self.scan_string_literal_in_progress(literal_kind);
2138 let w = self.width();
2139 // Only scan trailing trivia if we've finished the string.
2140 let trailing = match kind {
2141 TokenKind::DoubleQuotedStringLiteralTail | TokenKind::HeredocStringLiteralTail => {
2142 self.scan_trailing_php_trivia()
2146 Token::make(kind, self.source(), token_start, w, vec![], trailing)
// Scans a heredoc header (<<<NAME), returning the header token and the
// heredoc name bytes; trailing trivia is deliberately not scanned.
2149 pub fn next_docstring_header(&mut self) -> (Token, &'a [u8]) {
2150 // We're at the beginning of a heredoc string literal. Scan leading
2151 // trivia but not trailing trivia.
2152 let token_start = self.offset;
2153 let leading = self.scan_leading_php_trivia();
2154 self.start_new_lexeme();
2155 let (name, _) = self.scan_docstring_header();
2156 let w = self.width();
2157 let token = Token::make(
2158 TokenKind::HeredocStringLiteralHead,
// Public wrappers over the generic scanner with fixed keyword policies.
2168 pub fn next_token_as_name(&mut self) -> Token {
2169 self.scan_next_token_as_name(&Self::scan_token_outside_type)
2172 pub fn next_token_non_reserved_as_name(&mut self) -> Token {
2173 self.scan_next_token_nonreserved_as_name(&Self::scan_token_outside_type)
// Scans an XHP element token, optionally suppressing trailing trivia after
// `>` / `/>` so that following whitespace becomes leading trivia of the
// XHP body. Returns the token together with its source text slice.
2176 pub fn next_xhp_element_token(&mut self, no_trailing: bool) -> (Token, &[u8]) {
2177 // XHP elements have whitespace, newlines and Hack comments.
2178 let tokenizer = |lexer: &mut Self| {
2179 let token_start = lexer.offset;
2180 let (kind, w, leading) =
2181 lexer.scan_token_and_leading_trivia(&Self::scan_xhp_token, KwSet::AllKeywords);
2182 // We do not scan trivia after an XHPOpen's >. If that is the beginning of
2183 // an XHP body then we want any whitespace or newlines to be leading trivia
2184 // of the body token.
2186 TokenKind::GreaterThan | TokenKind::SlashGreaterThan if no_trailing => {
2187 Token::make(kind, lexer.source(), token_start, w, leading, vec![])
2190 let trailing = lexer.scan_trailing_php_trivia();
2191 Token::make(kind, lexer.source(), token_start, w, leading, trailing)
2195 let token = self.scan_assert_progress(&tokenizer);
2196 let token_width = token.width();
2197 let trailing_width = token.trailing_width();
// Recover the token's start by backing out the trailing trivia and the
// token width from the current offset.
2198 let token_start_offset = (self.offset) - trailing_width - token_width;
2199 let token_text = self.source.sub(token_start_offset, token_width);
// Scans an XHP body token; trivia is significant inside XHP bodies, so
// trailing trivia is kept only on genuine XHPBody tokens.
2203 pub fn next_xhp_body_token(&mut self) -> Token {
2204 let scanner = |lexer: &mut Self| {
2205 let token_start = lexer.offset;
2206 let leading = lexer.scan_leading_xhp_trivia();
2207 lexer.start_new_lexeme();
2208 let kind = lexer.scan_xhp_body();
2209 let w = lexer.width();
2211 // Trivia (leading and trailing) is semantically
2212 // significant for XHPBody tokens. When we find elements or
2213 // braced expressions inside the body, the trivia should be
2214 // seen as leading the next token, but we should certainly
2215 // keep it trailing if this is an XHPBody token.
2216 if kind == TokenKind::XHPBody
2217 { lexer.scan_trailing_xhp_trivia() }
2219 Token::make(kind, lexer.source(), token_start, w, leading, trailing)
2221 self.scan_assert_progress(&scanner)
// Scans the next token as an XHP class name / XHP element name; neither
// treats any spelling as a keyword.
2224 pub fn next_xhp_class_name(&mut self) -> Token {
2225 self.scan_token_and_trivia(&Self::scan_xhp_class_name, KwSet::NoKeywords)
2228 pub fn next_xhp_name(&mut self) -> Token {
2229 let scanner = |x: &mut Self| x.scan_xhp_element_name(false);
2230 self.scan_token_and_trivia(&scanner, KwSet::NoKeywords)
// Builds the markup token for leading inline text (body outside excerpt).
2233 fn make_markup_token(&self) -> Token {
// Parameters/body of make_long_tag (header partially outside excerpt):
// constructs the `<?hh` / `<?php`-style suffix, attaching trailing trivia
// to the language-name token.
2246 name_token_offset: usize,
2249 less_than_question_token: Token,
2250 ) -> (Token, Option<(Token, Option<Token>)>) {
2253 // single line comments that follow the language in leading markup_text
2254 // determine the file check mode, read the trailing trivia and attach it
2255 // to the language token
2256 let trailing = self.scan_trailing_php_trivia();
2257 let name = Token::make(
2265 (markup_text, Some((less_than_question_token, Some(name))))
// After the markup text, builds the `<?` token and classifies the suffix:
// `<?hh`, `<?php`, `<?=` (an Equal token), or a bare `<?`.
2268 fn make_markup_and_suffix(&mut self) -> (Token, Option<(Token, Option<Token>)>) {
2269 let markup_text = self.make_markup_token();
2270 let less_than_question_token = Token::make(
2271 TokenKind::LessThanQuestion,
2280 let name_token_offset = self.offset;
// Language names are matched case-insensitively.
2281 let ch0 = self.peek_char(0).to_ascii_lowercase();
2282 let ch1 = self.peek_char(1).to_ascii_lowercase();
2283 let ch2 = self.peek_char(2).to_ascii_lowercase();
2284 match (ch0, ch1, ch2) {
2286 self.make_long_tag(name_token_offset, 2, markup_text, less_than_question_token)
2288 ('p', 'h', 'p') => {
2289 self.make_long_tag(name_token_offset, 3, markup_text, less_than_question_token)
2294 let equal = Token::make(
2303 (markup_text, Some((less_than_question_token, Some(equal))))
2305 _ => (markup_text, Some((less_than_question_token, (None)))),
// Finds where the leading markup section ends: skips an optional `#!`
// shebang line and leading whitespace, then checks for a `<?` open tag.
2309 fn skip_to_end_of_markup(&mut self) -> (Token, Option<(Token, Option<Token>)>) {
2310 let start_offset = {
2311 // if leading section starts with #! - it should span the entire line
2312 let index = self.offset;
// Header lexing only makes sense at offset 0.
2314 panic!("Should only try to lex header at start of document")
2316 if self.peek_def(index, INVALID) == '#' && self.peek_def(index + 1, INVALID) == '!' {
// Shebang: consume the whole first line (+1 steps past the newline).
2317 self.skip_while_to_offset(&Self::not_newline) + 1
2319 // this should really just be `index` - but, skip whitespace as the FFP
2320 // tests use magic comments in leading markup to set flags, but blank
2321 // them out before parsing; the newlines are kept to provide correct line
2322 // numbers in errors
2323 self.skip_while_to_offset(&|x| {
2324 Self::is_newline(x) || Self::is_whitespace_no_newline(x)
2328 if self.peek(start_offset) == '<' && self.peek_def(start_offset + 1, INVALID) == '?' {
2329 self.with_offset(start_offset);
2330 self.make_markup_and_suffix()
2332 (self.make_markup_token(), None)
// Entry point for lexing the file header (markup text + optional `<?` suffix).
2336 pub fn scan_header(&mut self) -> (Token, Option<(Token, Option<Token>)>) {
2337 self.start_new_lexeme();
2338 self.skip_to_end_of_markup()
// Lookahead on a clone: is the next token an XHP category name, i.e. a
// '%' immediately followed by a name character?
2341 pub fn is_next_xhp_category_name(&self) -> bool {
2342 let mut lexer = self.clone();
2343 let _ = lexer.scan_leading_php_trivia();
2344 // An XHP category is an xhp element name preceded by a %.
2345 let ch0 = lexer.peek_char(0);
2346 let ch1 = lexer.peek_char(1);
2347 ch0 == '%' && Self::is_name_nondigit(ch1)
// Scans an XHP category name ('%' + element name) when one is next;
// otherwise falls back to ordinary (outside-type) token scanning.
2350 fn scan_xhp_category_name(&mut self) -> TokenKind {
2351 if self.is_next_xhp_category_name() {
2353 let _ = self.scan_xhp_element_name(false);
2354 TokenKind::XHPCategoryName
2356 self.scan_token(false)
// Public wrapper adding trivia handling around the category-name scanner.
2360 pub fn next_xhp_category_name(&mut self) -> Token {
2361 self.scan_token_and_trivia(&Self::scan_xhp_category_name, KwSet::NoKeywords)
// __halt_compiler handling: packs everything remaining in the source into
// a single "after halt compiler" trailing trivia attached to `last_token`,
// then moves the lexer to end of input.
2364 pub fn rescan_halt_compiler(&mut self, last_token: Token) -> Token {
2365 // __halt_compiler stops parsing of the file.
2366 // In order to preserve the full-fidelity aspect of the parser
2367 // we pack everything that follows __halt_compiler as
2368 // separate opaque kind of trivia - it will be attached as a trailing trivia
2369 // to the last_token and existing trailing trivia will be merged in.
2371 // This is incorrect for minimal token
2372 let leading_start_offset = last_token.leading_start_offset().unwrap_or(0);
// End of the token proper = leading start + leading width + token width.
2373 let start_offset = leading_start_offset + last_token.leading_width() + last_token.width();
2375 let length = self.source.length();
2376 let trailing = Token::Trivia::make_after_halt_compiler(
2379 length - start_offset,
2381 self.with_offset(length);
2382 last_token.with_trailing(vec![trailing])