hphp/hack/src/parser/core/lexer.rs

   1 // Copyright (c) 2019, Facebook, Inc.
   2 // All rights reserved.
   3 //
   4 // This source code is licensed under the MIT license found in the
   5 // LICENSE file in the "hack" directory of this source tree.
   6
   7 use parser_core_types::{
   8     lexable_token::LexableToken,
   9     lexable_trivia::{LexableTrivia, LexableTrivium},
  10     source_text::{SourceText, INVALID},
  11     syntax_error::{self as Errors, Error, SyntaxError},
  12     token_factory::{TokenFactory, Trivia, Trivium},
  13     token_kind::TokenKind,
  14     trivia_factory::TriviaFactory,
  15     trivia_kind::TriviaKind,
  16 };
  17 use static_assertions::*;
  18
  19 use std::cell::RefCell;
  20 use std::ops::DerefMut;
  21 use std::rc::Rc;
  22
  23 #[derive(Debug)]
  24 struct LexerPreSnapshot {
  25     start: usize,
  26     offset: usize,
  27     in_type: bool,
  28 }
  29
  30 #[derive(Debug)]
  31 struct LexerPostSnapshot {
  32     start: usize,
  33     offset: usize,
  34     in_type: bool,
  35     errors: Vec<SyntaxError>,
  36 }
  37
  38 impl<'a, TF> PartialEq<Lexer<'a, TF>> for LexerPreSnapshot
  39 where
  40     TF: TokenFactory,
  41 {
  42     fn eq(&self, other: &Lexer<'a, TF>) -> bool {
  43         self.start == other.start && self.offset == other.offset && self.in_type == other.in_type
  44     }
  45 }
  46
  47 /*
  48 Lexer Caching
  49
  50 One token look ahead in parser is implemented by `parser.peek_token()` ... `parser.next_token()`.
  51 Re-scanning in next_token can be avoided by caching the result of `peek_token`, consecutive
  52 `peek_token`s can also get improved.
  53
  54 `Lexer.peek_next_token()` checks cache first if cache misses it will clone of the current lexer and
  55 call next_token on cloned lexer. To cache the result, it takes a snapshot of lexer state before and
  56 after calling next_token, and store them in current lexer.
  57
  58 Clone trait of Lexer is derived automatically, therefore `cache: Rc<...>` is also cloned. `Rc` ensures
  59 cloned lexer and original lexer share the same cache, this is intended! Other than one token look
  60 ahead still clones parser, therefore lexer get cloned, sharing cache allows cloned lexer uses
  61 cache from original lexer and vise versa. It is measured that 2% faster than not sharing cache.
  62
  63 NOTE: There is an invariant assumed by this caching mechanism. `errors` in `Lexer` can only add new errors
  64 and must not remove any error when scanning forward! `Lexer.peek_next_token()` clones a new `Lexer` and
  65 reset `errors` to empty, look ahead may accumulate new errors and these errors will be appended to the original
  66 `Lexer`. The reason we need this invariant is that between `peek_next_token` and `next_token` we can not
  67 prove no new error added. Actually it is observed that new errors are added between these two calls.
  68 */
  69 #[derive(Debug)]
  70 struct LexerCache<Token>(LexerPreSnapshot, Token, LexerPostSnapshot);
  71
  72 #[derive(Debug, Clone)]
  73 pub struct Lexer<'a, TF>
  74 where
  75     TF: TokenFactory,
  76 {
  77     source: SourceText<'a>,
  78     start: usize,
  79     offset: usize,
  80     errors: Vec<SyntaxError>,
  81     in_type: bool,
  82     token_factory: TF,
  83     cache: Rc<RefCell<Option<LexerCache<TF::Token>>>>,
  84 }
  85
  86 #[derive(Debug, PartialEq)]
  87 pub enum StringLiteralKind {
  88     LiteralDoubleQuoted,
  89     LiteralHeredoc { heredoc: Vec<u8> },
  90 }
  91
  92 #[derive(Debug, Copy, Clone)]
  93 pub enum KwSet {
  94     AllKeywords,
  95     NonReservedKeywords,
  96     NoKeywords,
  97 }
  98
  99 macro_rules! as_case_insensitive_keyword {
 100     ($size:tt, $size_type:ty $(, $keyword:tt)+) => {
 101         fn as_case_insensitive_keyword(&self, text: &str) -> Option<(&'static str, bool)> {
 102             use heapless::consts::*;
 103
 104             // - The $size should be greater than or equal to the each length of keyword
 105             // - The $size should be equal to at least one of the length of a keyword
 106             // Therefore, $size is equal to the length of the longest keyword.
 107             $(
 108                 const_assert!($size >= $keyword.len());
 109             )*
 110             const_assert!(
 111                 $(
 112                     $size == $keyword.len() ||
 113                 )*
 114                 false
 115             );
 116
 117             if text.len() > $size {
 118                 None
 119             } else {
 120                 let mut t: heapless::String<$size_type> = text.into();
 121                 let t: &mut str = t.as_mut_str();
 122                 t.make_ascii_lowercase();
 123                 let has_upper = t != text;
 124                 let t: &str = t as &str;
 125                 match t {
 126                     $(
 127                         $keyword => Some(($keyword, has_upper)),
 128                     )*
 129                     _ => None,
 130                 }
 131             }
 132         }
 133     }
 134 }
 135
 136 impl<'a, TF> Lexer<'a, TF>
 137 where
 138     TF: TokenFactory,
 139 {
 140     fn to_lexer_pre_snapshot(&self) -> LexerPreSnapshot {
 141         LexerPreSnapshot {
 142             start: self.start,
 143             offset: self.offset,
 144             in_type: self.in_type,
 145         }
 146     }
 147
 148     fn into_lexer_post_snapshot(self) -> LexerPostSnapshot {
 149         LexerPostSnapshot {
 150             start: self.start,
 151             offset: self.offset,
 152             in_type: self.in_type,
 153             errors: self.errors,
 154         }
 155     }
 156
 157     pub fn make_at(source: &SourceText<'a>, offset: usize, token_factory: TF) -> Self {
 158         Self {
 159             source: source.clone(),
 160             start: offset,
 161             offset,
 162             errors: vec![],
 163             in_type: false,
 164             cache: Rc::new(RefCell::new(None)),
 165             token_factory,
 166         }
 167     }
 168
 169     pub fn make(source: &SourceText<'a>, token_factory: TF) -> Self {
 170         Self::make_at(source, 0, token_factory)
 171     }
 172
 173     fn continue_from(&mut self, l: Lexer<'a, TF>) {
 174         self.start = l.start;
 175         self.offset = l.offset;
 176         self.errors = l.errors
 177     }
 178
 179     pub fn start(&self) -> usize {
 180         self.start
 181     }
 182
 183     pub fn offset(&self) -> usize {
 184         self.offset
 185     }
 186
 187     pub fn errors(&self) -> &[SyntaxError] {
 188         &self.errors
 189     }
 190
 191     fn with_error(&mut self, error: Error) {
 192         let error = SyntaxError::make(self.start(), self.offset(), error);
 193         self.errors.push(error)
 194     }
 195
 196     fn with_offset(&mut self, offset: usize) {
 197         self.offset = offset
 198     }
 199
 200     fn with_start_offset(&mut self, start: usize, offset: usize) {
 201         self.start = start;
 202         self.offset = offset;
 203     }
 204
 205     fn start_new_lexeme(&mut self) {
 206         self.start = self.offset
 207     }
 208
 209     pub fn advance(&mut self, i: usize) {
 210         self.offset += i
 211     }
 212
 213     pub fn set_in_type(&mut self, in_type: bool) {
 214         self.in_type = in_type
 215     }
 216
 217     pub fn source(&self) -> &SourceText<'a> {
 218         &self.source
 219     }
 220
 221     fn source_text_string(&self) -> &[u8] {
 222         self.source.text()
 223     }
 224
 225     // Housekeeping
 226
 227     pub fn peek_char(&self, index: usize) -> char {
 228         self.source.get(self.offset() + index)
 229     }
 230
 231     fn peek_string(&self, size: usize) -> &[u8] {
 232         self.source.sub(self.offset, size)
 233     }
 234
 235     fn match_string(&self, s: &[u8]) -> bool {
 236         s == self.peek_string(s.len())
 237     }
 238
 239     fn width(&self) -> usize {
 240         self.offset - self.start
 241     }
 242
 243     fn current_text(&self) -> &[u8] {
 244         self.source.sub(self.start, self.width())
 245     }
 246
 247     fn current_text_as_str(&self) -> &str {
 248         unsafe { std::str::from_utf8_unchecked(self.current_text()) }
 249     }
 250
 251     fn at_end(&self) -> bool {
 252         self.offset() >= self.source.length()
 253     }
 254
 255     fn remaining(&self) -> usize {
 256         let r = (self.source.length() as isize) - (self.offset as isize);
 257         if r < 0 { 0 } else { r as usize }
 258     }
 259
 260     fn peek(&self, i: usize) -> char {
 261         self.source.get(i)
 262     }
 263
 264     fn peek_back(&self, index: usize) -> char {
 265         self.source.get(self.offset() - index)
 266     }
 267
 268     fn peek_def(&self, index: usize, default: char) -> char {
 269         if index >= self.source.length() {
 270             default
 271         } else {
 272             self.source.get(index)
 273         }
 274     }
 275
 276     // Character classification
 277
 278     fn is_whitespace_no_newline(c: char) -> bool {
 279         match c {
 280             ' ' | '\t' => true,
 281             _ => false,
 282         }
 283     }
 284
 285     fn is_newline(ch: char) -> bool {
 286         match ch {
 287             '\r' | '\n' => true,
 288             _ => false,
 289         }
 290     }
 291
 292     fn is_binary_digit(ch: char) -> bool {
 293         match ch {
 294             '0' | '1' => true,
 295             _ => false,
 296         }
 297     }
 298
 299     fn is_octal_digit(c: char) -> bool {
 300         ('0'..='7').contains(&c)
 301     }
 302
 303     fn is_decimal_digit(ch: char) -> bool {
 304         ('0'..='9').contains(&ch)
 305     }
 306
 307     fn is_hexadecimal_digit(c: char) -> bool {
 308         ('0'..='9').contains(&c) || ('a'..='f').contains(&c) || ('A'..='F').contains(&c)
 309     }
 310
 311     fn is_name_nondigit(c: char) -> bool {
 312         (c == '_') || ('a'..='z').contains(&c) || ('A'..='Z').contains(&c) || ('\x7f' <= c)
 313     }
 314
 315     fn is_name_letter(c: char) -> bool {
 316         (c == '_')
 317             || ('0'..='9').contains(&c)
 318             || ('a'..='z').contains(&c)
 319             || ('A'..='Z').contains(&c)
 320             || ('\x7f' <= c)
 321     }
 322
 323     // Lexing
 324
 325     fn skip_while_to_offset(&self, p: impl Fn(char) -> bool) -> usize {
 326         let n = self.source.length();
 327         let mut i = self.offset();
 328         while i < n && p(self.peek(i)) {
 329             i += 1;
 330         }
 331         i
 332     }
 333
 334     // advance offset as long as the predicate is true
 335     fn skip_while(&mut self, p: impl Fn(char) -> bool) {
 336         self.with_offset(self.skip_while_to_offset(p))
 337     }
 338
 339     fn str_skip_while(s: &[u8], mut i: usize, p: impl Fn(char) -> bool) -> usize {
 340         let n = s.len();
 341         loop {
 342             if i < n && p(s[i] as char) {
 343                 i += 1
 344             } else {
 345                 return i;
 346             }
 347         }
 348     }
 349
 350     fn skip_whitespace(&mut self) {
 351         self.skip_while(&Self::is_whitespace_no_newline);
 352     }
 353
 354     fn str_skip_whitespace(s: &[u8], i: usize) -> usize {
 355         Self::str_skip_while(s, i, &Self::is_whitespace_no_newline)
 356     }
 357
 358     fn not_newline(ch: char) -> bool {
 359         !(Self::is_newline(ch))
 360     }
 361
 362     fn skip_to_end_of_line(&mut self) {
 363         self.skip_while(&Self::not_newline)
 364     }
 365
 366     fn skip_name_end(&mut self) {
 367         self.skip_while(&Self::is_name_letter)
 368     }
 369
 370     fn skip_end_of_line(&mut self) {
 371         match self.peek_char(0) {
 372             '\n' => self.advance(1),
 373             '\r' => {
 374                 if self.peek_char(1) == '\n' {
 375                     self.advance(2)
 376                 } else {
 377                     self.advance(1)
 378                 }
 379             }
 380             _ => {}
 381         }
 382     }
 383
 384     fn scan_name_impl(&mut self) {
 385         assert!(Self::is_name_nondigit(self.peek_char(0)));
 386         self.advance(1);
 387         self.skip_name_end();
 388     }
 389
 390     fn scan_name(&mut self) -> TokenKind {
 391         self.scan_name_impl();
 392         TokenKind::Name
 393     }
 394
 395     fn scan_variable(&mut self) -> TokenKind {
 396         assert_eq!('$', self.peek_char(0));
 397         self.advance(1);
 398         self.scan_name_impl();
 399         TokenKind::Variable
 400     }
 401
 402     fn scan_with_underscores(&mut self, accepted_char: impl Fn(char) -> bool) {
 403         let n = self.source.length();
 404         let peek_def = |i| if i < n { self.peek(i) } else { INVALID };
 405         let mut i = self.offset();
 406         while i < n {
 407             let ch = self.peek(i);
 408             if accepted_char(ch) {
 409                 i += 1
 410             } else if ch == ' ' && accepted_char(peek_def(i + 1)) {
 411                 i += 2;
 412             } else {
 413                 break;
 414             }
 415         }
 416         self.with_offset(i);
 417     }
 418
 419     fn scan_decimal_digits(&mut self) {
 420         self.skip_while(&Self::is_decimal_digit)
 421     }
 422
 423     fn scan_decimal_digits_with_underscores(&mut self) {
 424         self.scan_with_underscores(&Self::is_decimal_digit);
 425     }
 426
 427     fn scan_octal_digits(&mut self) {
 428         self.skip_while(&Self::is_octal_digit)
 429     }
 430
 431     fn scan_octal_digits_with_underscores(&mut self) {
 432         self.scan_with_underscores(&Self::is_octal_digit)
 433     }
 434
 435     fn scan_binary_digits_with_underscores(&mut self) {
 436         self.scan_with_underscores(&Self::is_binary_digit)
 437     }
 438
 439     fn scan_hexadecimal_digits(&mut self) {
 440         self.skip_while(&Self::is_hexadecimal_digit)
 441     }
 442
 443     fn scan_hexadecimal_digits_with_underscores(&mut self) {
 444         self.scan_with_underscores(&Self::is_hexadecimal_digit)
 445     }
 446
 447     fn scan_hex_literal(&mut self) -> TokenKind {
 448         let ch = self.peek_char(0);
 449         if !Self::is_hexadecimal_digit(ch) {
 450             self.with_error(Errors::error0001);
 451             TokenKind::HexadecimalLiteral
 452         } else {
 453             self.scan_hexadecimal_digits_with_underscores();
 454             TokenKind::HexadecimalLiteral
 455         }
 456     }
 457
 458     fn scan_binary_literal(&mut self) -> TokenKind {
 459         let ch = self.peek_char(0);
 460         if !Self::is_binary_digit(ch) {
 461             self.with_error(Errors::error0002);
 462             TokenKind::BinaryLiteral
 463         } else {
 464             self.scan_binary_digits_with_underscores();
 465             TokenKind::BinaryLiteral
 466         }
 467     }
 468
 469     fn scan_exponent(&mut self) -> TokenKind {
 470         let ch = self.peek_char(1);
 471         if ch == '+' || ch == '-' {
 472             self.advance(2)
 473         } else {
 474             self.advance(1)
 475         }
 476         let ch = self.peek_char(0);
 477         if !Self::is_decimal_digit(ch) {
 478             self.with_error(Errors::error0003);
 479             TokenKind::FloatingLiteral
 480         } else {
 481             self.scan_decimal_digits();
 482             TokenKind::FloatingLiteral
 483         }
 484     }
 485
 486     fn scan_after_decimal_point(&mut self) -> TokenKind {
 487         self.advance(1);
 488         self.scan_decimal_digits();
 489         let ch = self.peek_char(0);
 490         if ch == 'e' || ch == 'E' {
 491             self.scan_exponent()
 492         } else {
 493             TokenKind::FloatingLiteral
 494         }
 495     }
 496
 497     fn scan_octal_or_float(&mut self) -> TokenKind {
 498         // We've scanned a leading zero.
 499         // We have an irritating ambiguity here.  09 is not a legal octal or
 500         // floating literal, but 09e1 and 09.1 are.
 501         self.advance(1);
 502         let ch = self.peek_char(0);
 503         match ch {
 504             '.' =>
 505             // 0.
 506             {
 507                 self.scan_after_decimal_point()
 508             }
 509             'e' | 'E' =>
 510             // 0e
 511             {
 512                 self.scan_exponent()
 513             }
 514             _ if ('0'..='9').contains(&ch) => {
 515                 // 05
 516                 let mut lexer_oct = self.clone();
 517                 lexer_oct.scan_octal_digits();
 518
 519                 let mut lexer_dec = self.clone();
 520                 lexer_dec.scan_decimal_digits();
 521                 if (lexer_oct.width()) == (lexer_dec.width()) {
 522                     // Only octal digits. Could be an octal literal, or could
 523                     // be a float.
 524                     let ch = lexer_oct.peek_char(0);
 525                     if ch == 'e' || ch == 'E' {
 526                         self.continue_from(lexer_oct);
 527                         self.scan_exponent()
 528                     } else if ch == '.' {
 529                         self.continue_from(lexer_oct);
 530                         self.scan_after_decimal_point()
 531                     } else {
 532                         // This is irritating - we only want to allow underscores for integer
 533                         // literals. Deferring the lexing with underscores here allows us to
 534                         // make sure we're not dealing with floats.
 535                         self.continue_from(lexer_oct);
 536                         self.scan_octal_digits_with_underscores();
 537                         TokenKind::OctalLiteral
 538                     }
 539                 } else {
 540                     // We had decimal digits following a leading zero; this is either a
 541                     // float literal or an octal to be truncated at the first non-octal
 542                     // digit.
 543                     let ch = lexer_dec.peek_char(0);
 544                     if ch == 'e' || ch == 'E' {
 545                         self.continue_from(lexer_dec);
 546                         self.scan_exponent()
 547                     } else if ch == '.' {
 548                         self.continue_from(lexer_dec);
 549                         self.scan_after_decimal_point()
 550                     } else {
 551                         // an octal to be truncated at the first non-octal digit
 552                         // Again we differ the lexing with underscores here
 553                         self.scan_decimal_digits_with_underscores();
 554                         TokenKind::OctalLiteral
 555                     }
 556                 }
 557             }
 558             _ =>
 559             // 0 is a decimal literal
 560             {
 561                 TokenKind::DecimalLiteral
 562             }
 563         }
 564     }
 565
 566     fn scan_decimal_or_float(&mut self) -> TokenKind {
 567         // We've scanned a leading non-zero digit.
 568         let mut lexer_no_underscores = self.clone();
 569         lexer_no_underscores.scan_decimal_digits();
 570         let mut lexer_with_underscores = self.clone();
 571         lexer_with_underscores.scan_decimal_digits_with_underscores();
 572         let ch = lexer_no_underscores.peek_char(0);
 573         match ch {
 574             '.' =>
 575             // 123.
 576             {
 577                 self.continue_from(lexer_no_underscores);
 578                 self.scan_after_decimal_point()
 579             }
 580             'e' | 'E' =>
 581             // 123e
 582             {
 583                 self.continue_from(lexer_no_underscores);
 584                 self.scan_exponent()
 585             }
 586             _ =>
 587             // 123
 588             {
 589                 self.continue_from(lexer_with_underscores);
 590                 TokenKind::DecimalLiteral
 591             }
 592         }
 593     }
 594
 595     fn scan_single_quote_string_literal(&mut self) -> TokenKind {
 596         // TODO: What about newlines embedded?
 597         // SPEC:
 598         // single-quoted-string-literal::
 599         //   b-opt  ' sq-char-sequence-opt  '
 600         //
 601         // TODO: What is this b-opt?  We don't lex an optional 'b' before a literal.
 602         //
 603         // sq-char-sequence::
 604         //   sq-char
 605         //   sq-char-sequence   sq-char
 606         //
 607         // sq-char::
 608         //   sq-escape-sequence
 609         //   \opt   any character except single-quote (') or backslash (\)
 610         //
 611         // sq-escape-sequence:: one of
 612         //   \'  \\
 613         let n = self.source.length();
 614         let peek = |x| self.source.get(x);
 615
 616         let mut has_error0012 = false;
 617         let mut has_error0006 = false;
 618
 619         let mut i = 1 + self.offset();
 620         let new_offset = loop {
 621             if i >= n {
 622                 has_error0012 = true;
 623                 break n;
 624             } else {
 625                 let ch = peek(i);
 626                 match ch {
 627                     INVALID => {
 628                         has_error0006 = true;
 629                         i += 1
 630                     }
 631                     '\\' => i += 2,
 632                     '\'' => break (1 + i),
 633                     _ => i += 1,
 634                 }
 635             }
 636         };
 637
 638         if has_error0006 {
 639             self.with_error(Errors::error0006)
 640         }
 641         if has_error0012 {
 642             self.with_error(Errors::error0012)
 643         }
 644
 645         self.with_offset(new_offset);
 646         TokenKind::SingleQuotedStringLiteral
 647     }
 648
 649     fn scan_hexadecimal_escape(&mut self) {
 650         let ch2 = self.peek_char(2);
 651         let ch3 = self.peek_char(3);
 652         if !(Self::is_hexadecimal_digit(ch2)) {
 653             // TODO: Consider producing an error for a malformed hex escape
 654             // let lexer = with_error lexer SyntaxError.error0005 in
 655             self.advance(2);
 656         } else if !(Self::is_hexadecimal_digit(ch3)) {
 657             // let lexer = with_error lexer SyntaxError.error0005 in
 658             self.advance(3)
 659         } else {
 660             self.advance(4)
 661         }
 662     }
 663
 664     fn scan_unicode_escape(&mut self) {
 665         // At present the lexer is pointing at \u
 666         if self.peek_char(2) == '{' {
 667             if self.peek_char(3) == '$' {
 668                 // We have a malformed unicode escape that contains a possible embedded
 669                 // expression. Eat the \u and keep on processing the embedded expression.
 670                 // TODO: Consider producing a warning for a malformed unicode escape.
 671                 self.advance(2)
 672             } else {
 673                 // We have a possibly well-formed escape sequence, and at least we know
 674                 // that it is not an embedded expression.
 675                 // TODO: Consider producing an error if the digits are out of range
 676                 // of legal Unicode characters.
 677                 // TODO: Consider producing an error if there are no digits.
 678                 // Skip over the slash, u and brace, and start lexing the number.
 679                 self.advance(3);
 680                 self.scan_hexadecimal_digits();
 681                 let ch = self.peek_char(0);
 682                 if ch != '}' {
 683                     // TODO: Consider producing a warning for a malformed unicode escape.
 684                     {}
 685                 } else {
 686                     self.advance(1)
 687                 }
 688             }
 689         } else {
 690             // We have a malformed unicode escape sequence. Bail out.
 691             // TODO: Consider producing a warning for a malformed unicode escape.
 692             self.advance(2)
 693         }
 694     }
 695
 696     fn skip_uninteresting_double_quote_like_string_characters(&mut self) {
 697         let is_uninteresting = |ch| match ch {
 698             INVALID | '\\' | '$' | '{' | '[' | ']' | '-' => false,
 699             ch if ('0'..='9').contains(&ch) => false,
 700             ch => ch != '"' && !Self::is_name_nondigit(ch),
 701         };
 702         self.skip_while(&is_uninteresting);
 703     }
 704
 705     fn scan_integer_literal_in_string(&mut self) -> TokenKind {
 706         if self.peek_char(0) == '0' {
 707             match self.peek_char(1) {
 708                 'x' | 'X' => {
 709                     self.advance(2);
 710                     self.scan_hex_literal()
 711                 }
 712                 'b' | 'B' => {
 713                     self.advance(2);
 714                     self.scan_binary_literal()
 715                 }
 716                 _ => {
 717                     // An integer literal starting with 0 in a string will actually
 718                     // always be treated as a string index in HHVM, and not as an octal.
 719                     // In such a case, HHVM actually scans all decimal digits to create the
 720                     // token. TODO: (kasper) T40381519 we may want to change this behavior to something more
 721                     // sensible
 722                     self.scan_decimal_digits_with_underscores();
 723                     TokenKind::DecimalLiteral
 724                 }
 725             }
 726         } else {
 727             self.scan_decimal_digits_with_underscores();
 728             TokenKind::DecimalLiteral
 729         }
 730     }
 731
 732     fn scan_double_quote_like_string_literal_from_start(&mut self) -> TokenKind {
 733         let literal_token_kind = TokenKind::DoubleQuotedStringLiteral;
 734         let head_token_kind = TokenKind::DoubleQuotedStringLiteralHead;
 735         self.advance(1);
 736         loop {
 737             // If there's nothing interesting in this double-quoted string then
 738             // we can just hand it back as-is.
 739             self.skip_uninteresting_double_quote_like_string_characters();
 740             match self.peek_char(0) {
 741                 INVALID => {
 742                     // If the string is unterminated then give an error; if this is an
 743                     // embedded zero character then give an error and recurse; we might
 744                     // be able to make more progress.
 745                     if self.at_end() {
 746                         self.with_error(Errors::error0012);
 747                         break literal_token_kind;
 748                     } else {
 749                         self.with_error(Errors::error0006);
 750                         self.advance(1)
 751                     }
 752                 }
 753                 '"' => {
 754                     // We made it to the end without finding a special character.
 755                     self.advance(1);
 756                     break literal_token_kind;
 757                 }
 758                 _ =>
 759                 // We've found a backslash, dollar or brace.
 760                 {
 761                     break head_token_kind;
 762                 }
 763             }
 764         }
 765     }
 766
 767     fn is_heredoc_tail(&self, name: &[u8]) -> bool {
 768         // A heredoc tail is the identifier immediately preceded by a newline
 769         // and immediately followed by an optional semi and then a newline.
 770         //
 771         // Note that the newline and optional semi are not part of the literal;
 772         // the literal's lexeme ends at the end of the name. Either there is
 773         // no trivia and the next token is a semi-with-trailing-newline, or
 774         // the trailing trivia is a newline.
 775         //
 776         // This odd rule is to ensure that both
 777         // $x = <<<HERE
 778         // something
 779         // HERE;
 780         //
 781         // and
 782         //
 783         // $x = <<<HERE
 784         // something
 785         // HERE
 786         // . "something else";
 787         //
 788         // are legal.
 789         if !(Self::is_newline(self.peek_back(1))) {
 790             false
 791         } else {
 792             let len = name.len();
 793             let ch0 = self.peek_char(len);
 794             let ch1 = self.peek_char(len + 1);
 795             ((Self::is_newline(ch0)) || ch0 == ';' && (Self::is_newline(ch1)))
 796                 && self.peek_string(len) == name
 797         }
 798     }
 799
 800     fn get_tail_token_kind(&self, literal_kind: &StringLiteralKind) -> TokenKind {
 801         match literal_kind {
 802             StringLiteralKind::LiteralHeredoc { .. } => TokenKind::HeredocStringLiteralTail,
 803             StringLiteralKind::LiteralDoubleQuoted => TokenKind::DoubleQuotedStringLiteralTail,
 804         }
 805     }
 806
 807     fn get_string_literal_body_or_double_quoted_tail(
 808         &self,
 809         literal_kind: &StringLiteralKind,
 810     ) -> TokenKind {
 811         if literal_kind == &StringLiteralKind::LiteralDoubleQuoted {
 812             TokenKind::DoubleQuotedStringLiteralTail
 813         } else {
 814             TokenKind::StringLiteralBody
 815         }
 816     }
 817
 818     fn scan_string_literal_in_progress(&mut self, literal_kind: &StringLiteralKind) -> TokenKind {
 819         let (is_heredoc, name): (bool, &[u8]) = match literal_kind {
 820             StringLiteralKind::LiteralHeredoc { heredoc } => (true, heredoc),
 821             _ => (false, b""),
 822         };
 823         let ch0 = self.peek_char(0);
 824         if Self::is_name_nondigit(ch0) {
 825             if is_heredoc && (self.is_heredoc_tail(name)) {
 826                 self.scan_name_impl();
 827                 TokenKind::HeredocStringLiteralTail
 828             } else {
 829                 self.scan_name_impl();
 830                 TokenKind::Name
 831             }
 832         } else {
 833             match ch0 {
 834                 INVALID => {
 835                     if self.at_end() {
 836                         self.with_error(Errors::error0012);
 837                         self.get_tail_token_kind(literal_kind)
 838                     } else {
 839                         self.with_error(Errors::error0006);
 840                         self.advance(1);
 841                         self.skip_uninteresting_double_quote_like_string_characters();
 842                         TokenKind::StringLiteralBody
 843                     }
 844                 }
 845                 '"' => {
 846                     let kind = self.get_string_literal_body_or_double_quoted_tail(literal_kind);
 847                     self.advance(1);
 848                     kind
 849                 }
 850                 '$' => {
 851                     if Self::is_name_nondigit(self.peek_char(1)) {
 852                         self.scan_variable()
 853                     } else {
 854                         self.advance(1);
 855                         TokenKind::Dollar
 856                     }
 857                 }
 858                 '{' => {
 859                     self.advance(1);
 860                     TokenKind::LeftBrace
 861                 }
 862                 '\\' => {
 863                     match self.peek_char(1) {
 864                         // In these cases we just skip the escape sequence and
 865                         // keep on scanning for special characters.
 866                         | '\\' | '"' | '$' | 'e' | 'f' | 'n' | 'r' | 't' | 'v' | '`'
 867                         // Same in these cases; there might be more octal characters following but
 868                         // if there are, we'll just eat them as normal characters.
 869                         | '0' | '1' | '2' | '3' | '4' | '5' | '6' | '7' => {
 870                             self.advance(2);
 871                             self.skip_uninteresting_double_quote_like_string_characters();
 872                             TokenKind::StringLiteralBody}
 873                         | 'x' => {
 874                             self.scan_hexadecimal_escape();
 875                             self.skip_uninteresting_double_quote_like_string_characters();
 876                             TokenKind::StringLiteralBody }
 877                         | 'u' => {
 878                             self.scan_unicode_escape();
 879                             self.skip_uninteresting_double_quote_like_string_characters();
 880                             TokenKind::StringLiteralBody }
 881                         | '{' => {
 882                             // The rules for escaping open braces in Hack are bizarre. Suppose we
 883                             // have
 884                             // $x = 123;
 885                             // $y = 456;
 886                             // $z = "\{$x,$y\}";
 887                             // What is the value of $z?  Naively you would think that the backslash
 888                             // escapes the braces, and the variables are embedded, so {123,456}. But
 889                             // that's not what happens. Yes, the backslash makes the brace no longer
 890                             // the opening brace of an expression. But the backslash is still part
 891                             // of the string!  This is the string \{123,456\}.
 892                             // TODO: We might want to fix this because this is very strange.
 893                             // Eat the backslash and the brace.
 894                             self.advance(2);
 895                             TokenKind::StringLiteralBody
 896                         }
 897                     | _ => {
 898                        // TODO: A backslash followed by something other than an escape sequence
 899                        // is legal in hack, and treated as though it was just the backslash
 900                        // and the character. However we might consider making this a warning.
 901                        // It is particularly egregious when we have something like:
 902                        // $x = "abcdef \
 903                        //       ghi";
 904                        // The author of the code likely means the backslash to mean line
 905                        // continuation but in fact it just means to put a backslash and newline
 906                        // in the string.
 907                           self.advance(1);
 908                           self.skip_uninteresting_double_quote_like_string_characters();
 909                           TokenKind::StringLiteralBody
 910                       }
 911                    }
 912                 }
 913                 '[' => {
 914                     self.advance(1);
 915                     TokenKind::LeftBracket
 916                 }
 917                 ']' => {
 918                     self.advance(1);
 919                     TokenKind::RightBracket
 920                 }
 921                 '-' => {
 922                     if (self.peek_char(1)) == '>' {
 923                         self.advance(2);
 924                         TokenKind::MinusGreaterThan
 925                     } else {
 926                         // Nothing interesting here. Skip it and find the next
 927                         // interesting character.
 928                         self.advance(1);
 929                         self.skip_uninteresting_double_quote_like_string_characters();
 930                         TokenKind::StringLiteralBody
 931                     }
 932                 }
 933                 ch if ('0'..='9').contains(&ch) => {
 934                     let mut lexer1 = self.clone();
 935                     let literal = lexer1.scan_integer_literal_in_string();
 936
 937                     if self.errors.len() == lexer1.errors.len() {
 938                         self.continue_from(lexer1);
 939                         literal
 940                     } else {
 941                         // If we failed to scan a literal, do not interpret the literal
 942                         self.with_offset(lexer1.offset());
 943                         TokenKind::StringLiteralBody
 944                     }
 945                 }
 946                 _ => {
 947                     // Nothing interesting here. Skip it and find the next
 948                     // interesting character.
 949                     self.advance(1);
 950                     self.skip_uninteresting_double_quote_like_string_characters();
 951                     TokenKind::StringLiteralBody
 952                 }
 953             }
 954         }
 955     }
 956     // A heredoc string literal has the form
 957     //
 958     // header
 959     // optional body
 960     // trailer
 961     //
 962     // The header is:
 963     //
 964     // <<< (optional whitespace) name (no whitespace) (newline)
 965     //
 966     // The optional body is:
 967     //
 968     // any characters whatsoever including newlines (newline)
 969     //
 970     // The trailer is:
 971     //
 972     // (no whitespace) name (no whitespace) (optional semi) (no whitespace) (newline)
 973     //
 974     // The names must be identical.  The trailing semi and newline must be present.
 975     //
 976     // The body is any and all characters, up to the first line that exactly matches
 977     // the trailer.
 978     //
 979     // The body may contain embedded expressions.
 980     //
 981     // A nowdoc string literal has the same form except that the first name is
 982     // enclosed in single quotes, and it may not contain embedded expressions.
 983     fn scan_docstring_name_actual(&mut self) -> &'a [u8] {
 984         let ch = self.peek_char(0);
 985         if Self::is_name_nondigit(ch) {
 986             let start_offset = self.offset();
 987             self.advance(1);
 988             self.skip_name_end();
 989             self.source.sub(start_offset, self.offset() - start_offset)
 990         } else {
 991             self.with_error(Errors::error0008);
 992             b""
 993         }
 994     }
 995
 996     fn scan_docstring_name(&mut self) -> (&'a [u8], TokenKind) {
 997         self.skip_whitespace();
 998         let ch = self.peek_char(0);
 999         let kind = if ch == '\'' {
1000             TokenKind::NowdocStringLiteral
1001         } else {
1002             TokenKind::HeredocStringLiteral
1003         };
1004
1005         let name = if ch == '\'' {
1006             self.advance(1);
1007             let name = self.scan_docstring_name_actual();
1008             if (self.peek_char(0)) == '\'' {
1009                 self.advance(1);
1010                 name
1011             } else {
1012                 self.with_error(Errors::error0010);
1013                 name
1014             }
1015         } else {
1016             // Starting with PHP 5.3.0, the opening Heredoc identifier
1017             // may optionally be enclosed in double quotes:
1018             if ch == '"' {
1019                 self.advance(1)
1020             };
1021             let name = self.scan_docstring_name_actual();
1022             if ch == '"' {
1023                 // same logic as above, just for double quote
1024                 if self.peek_char(0) == '\"' {
1025                     self.advance(1);
1026                 } else {
1027                     self.with_error(Errors::missing_double_quote)
1028                 }
1029             }
1030             name
1031         };
1032         (name, kind)
1033     }
1034
1035     fn scan_docstring_header(&mut self) -> (&'a [u8], TokenKind) {
1036         let ch = self.peek_char(0);
1037         // Skip 3 for <<< or 4 for b<<<
1038         let skip_count = if ch == 'b' { 4 } else { 3 };
1039         self.advance(skip_count);
1040         let (name, kind) = self.scan_docstring_name();
1041         let ch = self.peek_char(0);
1042         if !Self::is_newline(ch) {
1043             self.with_error(Errors::error0011)
1044         }
1045         self.skip_to_end_of_line();
1046         self.skip_end_of_line();
1047         (name, kind)
1048     }
1049
1050     fn scan_docstring_remainder(&mut self, name: &[u8]) {
1051         let len = name.len();
1052         loop {
1053             let ch0 = self.peek_char(len);
1054             let ch1 = self.peek_char(len + 1);
1055             if ((Self::is_newline(ch0)) || ch0 == ';' && (Self::is_newline(ch1)))
1056                 && self.peek_string(len as usize) == name
1057             {
1058                 self.advance(len as usize);
1059                 break;
1060             } else {
1061                 self.skip_to_end_of_line();
1062                 let ch = self.peek_char(0);
1063                 if Self::is_newline(ch) {
1064                     self.skip_end_of_line()
1065                 } else {
1066                     // If we got here then we ran off the end of the file without
1067                     // finding a newline. Just bail.
1068                     self.with_error(Errors::error0011);
1069                     break;
1070                 }
1071             }
1072         }
1073     }
1074
1075     fn scan_docstring_literal(&mut self) -> TokenKind {
1076         let (name, kind) = self.scan_docstring_header();
1077         self.scan_docstring_remainder(name);
1078         kind
1079     }
1080
1081     fn scan_xhp_label(&mut self) {
1082         self.advance(1);
1083         self.skip_name_end();
1084     }
1085
1086     fn scan_xhp_element_name(&mut self, attribute: bool) -> TokenKind {
1087         // An XHP element name is a sequence of one or more XHP labels each separated
1088         // by a single : or -.  Note that it is possible for an XHP element name to be
1089         // followed immediately by a : or - that is the next token, so if we find
1090         // a : or - not followed by a label, we need to terminate the token.
1091         self.scan_xhp_label();
1092         let ch0 = self.peek_char(0);
1093         let ch1 = self.peek_char(1);
1094         if (!attribute && ch0 == ':' || ch0 == '-') && Self::is_name_nondigit(ch1) {
1095             self.advance(1);
1096             self.scan_xhp_element_name(false)
1097         } else {
1098             TokenKind::XHPElementName
1099         }
1100     }
1101
1102     fn scan_xhp_class_no_dash(&mut self) -> TokenKind {
1103         self.scan_xhp_label();
1104         let ch0 = self.peek_char(0);
1105         let ch1 = self.peek_char(1);
1106         if ch0 == ':' && Self::is_name_nondigit(ch1) {
1107             self.advance(1);
1108             self.scan_xhp_class_no_dash()
1109         } else {
1110             TokenKind::XHPElementName
1111         }
1112     }
1113
1114     // Is the next token we're going to lex a possible xhp class name?
1115     fn is_xhp_class_name(&self) -> bool {
1116         (self.peek_char(0) == ':') && (Self::is_name_nondigit(self.peek_char(1)))
1117     }
1118
1119     fn scan_xhp_class_name(&mut self) -> TokenKind {
1120         // An XHP class name is a colon followed by an xhp name.
1121         if self.is_xhp_class_name() {
1122             self.advance(1);
1123             self.scan_xhp_element_name(false);
1124             TokenKind::XHPClassName
1125         } else {
1126             self.with_error(Errors::error0008);
1127             self.advance(1);
1128             TokenKind::ErrorToken
1129         }
1130     }
1131
1132     // To support xhp class style class definitions we don't require a : prefix
1133     fn scan_xhp_modifier_class_name(&mut self) -> TokenKind {
1134         // we don't want to allow xhp names with a : prefix here
1135         if self.peek_char(0) == ':' {
1136             self.with_error(Errors::error0008);
1137             TokenKind::ErrorToken
1138         } else {
1139             self.scan_xhp_class_no_dash();
1140             TokenKind::XHPClassName
1141         }
1142     }
1143
1144     fn scan_xhp_string_literal(&mut self) -> TokenKind {
1145         // XHP string literals are just straight up "find the closing quote"
1146         // strings.  Embedded newlines are legal.
1147         let mut offset: usize = 1;
1148         loop {
1149             match self.peek_char(offset) {
1150                 INVALID => {
1151                     self.advance(offset);
1152                     if self.at_end() {
1153                         self.with_error(Errors::error0012);
1154                         return TokenKind::XHPStringLiteral;
1155                     } else {
1156                         self.with_error(Errors::error0006);
1157                         offset = 1
1158                     }
1159                 }
1160                 '"' => {
1161                     self.advance(offset + 1);
1162                     return TokenKind::XHPStringLiteral;
1163                 }
1164                 _ => offset += 1,
1165             }
1166         }
1167     }
1168
1169     // Note that this does not scan an XHP body
1170     fn scan_xhp_token(&mut self) -> TokenKind {
1171         // TODO: HHVM requires that there be no trivia between < and name in an
1172         // opening tag, but does allow trivia between </ and name in a closing tag.
1173         // Consider allowing trivia in an opening tag.
1174         let ch0 = self.peek_char(0);
1175         if ch0 == INVALID && self.at_end() {
1176             TokenKind::EndOfFile
1177         } else if self.is_xhp_class_name() || Self::is_name_nondigit(ch0) {
1178             self.scan_xhp_element_name(false)
1179         } else {
1180             match ch0 {
1181                 '{' => {
1182                     self.advance(1);
1183                     TokenKind::LeftBrace
1184                 }
1185                 '}' => {
1186                     self.advance(1);
1187                     TokenKind::RightBrace
1188                 }
1189                 '=' => {
1190                     self.advance(1);
1191                     TokenKind::Equal
1192                 }
1193                 '<' => {
1194                     if (self.peek_char(1)) == '/' {
1195                         self.advance(2);
1196                         TokenKind::LessThanSlash
1197                     } else {
1198                         self.advance(1);
1199                         TokenKind::LessThan
1200                     }
1201                 }
1202                 '"' => self.scan_xhp_string_literal(),
1203                 '/' => {
1204                     if (self.peek_char(1)) == '>' {
1205                         self.advance(2);
1206                         TokenKind::SlashGreaterThan
1207                     } else {
1208                         self.with_error(Errors::error0006);
1209                         self.advance(1);
1210                         TokenKind::ErrorToken
1211                     }
1212                 }
1213                 '>' => {
1214                     self.advance(1);
1215                     TokenKind::GreaterThan
1216                 }
1217                 _ => {
1218                     self.with_error(Errors::error0006);
1219                     self.advance(1);
1220                     TokenKind::ErrorToken
1221                 }
1222             }
1223         }
1224     }
1225
1226     fn scan_xhp_comment(&mut self) {
1227         let mut offset = 4;
1228         loop {
1229             let ch0 = self.peek_char(offset);
1230             let ch1 = self.peek_char(offset + 1);
1231             let ch2 = self.peek_char(offset + 2);
1232             match (ch0, ch1, ch2) {
1233                 (INVALID, _, _) => {
1234                     self.advance(offset as usize);
1235                     return self.with_error(Errors::error0014);
1236                 }
1237                 ('-', '-', '>') => return self.advance((offset + 3) as usize),
1238                 _ => offset += 1,
1239             }
1240         }
1241     }
1242     fn scan_xhp_body(&mut self) -> TokenKind {
1243         // Naively you might think that an XHP body is just a bunch of characters,
1244         // terminated by an embedded { } expression or a tag.  However, whitespace
1245         // and newlines are relevant in XHP bodies because they are "soft".
1246         // That is, any section of contiguous trivia has the same semantics as a
1247         // single space or newline -- just as in HTML.
1248         //
1249         // Obviously this is of relevance to code formatters.
1250         //
1251         // Therefore we detect whitespace and newlines within XHP bodies and treat
1252         // it as trivia surrounding the tokens within the body.
1253         //
1254         // TODO: Is this also true of whitespace within XHP comments? If so then
1255         // we need to make XHP comments a sequence of tokens, rather than a
1256         // single token as they are now.
1257         let ch0 = self.peek_char(0);
1258
1259         match ch0 {
1260             INVALID if self.at_end() => TokenKind::EndOfFile,
1261             '{' => {
1262                 self.advance(1);
1263                 TokenKind::LeftBrace
1264             }
1265             '}' => {
1266                 self.advance(1);
1267                 TokenKind::RightBrace
1268             }
1269             '<' => {
1270                 let ch1 = self.peek_char(1);
1271                 let ch2 = self.peek_char(2);
1272                 let ch3 = self.peek_char(3);
1273                 match (ch1, ch2, ch3) {
1274                     ('!', '-', '-') => {
1275                         self.scan_xhp_comment();
1276                         TokenKind::XHPComment
1277                     }
1278                     ('/', _, _) => {
1279                         self.advance(2);
1280                         TokenKind::LessThanSlash
1281                     }
1282                     _ => {
1283                         self.advance(1);
1284                         TokenKind::LessThan
1285                     }
1286                 }
1287             }
1288             _ => {
1289                 let mut offset = 0;
1290                 loop {
1291                     let ch = self.peek_char(offset);
1292                     match ch {
1293                         INVALID => {
1294                             self.advance(offset);
1295                             if self.at_end() {
1296                                 self.with_error(Errors::error0013);
1297                                 break;
1298                             } else {
1299                                 self.with_error(Errors::error0006);
1300                                 offset = 1
1301                             }
1302                         }
1303                         '\t' | ' ' | '\r' | '\n' | '{' | '}' | '<' => {
1304                             self.advance(offset);
1305                             break;
1306                         }
1307                         _ => offset += 1,
1308                     }
1309                 }
1310                 TokenKind::XHPBody
1311             }
1312         }
1313     }
1314
1315     fn scan_dollar_token(&mut self) -> TokenKind {
1316         // We have a problem here.  We wish to be able to lexically analyze both
1317         // PHP and Hack, but the introduction of $$ to Hack makes them incompatible.
1318         // "$$x" and "$$ $x" are legal in PHP, but illegal in Hack.
1319         // The rule in PHP seems to be that $ is a prefix operator, it is a token,
1320         // it can be followed by trivia, but the next token has to be another $
1321         // operator, a variable $x, or a {.
1322         //
1323         // Here's a reasonable compromise.  (TODO: Review this decision.)
1324         //
1325         // $$x lexes as $ $x
1326         // $$$x lexes as $ $ $x
1327         // and so on.
1328         //
1329         // $$ followed by anything other than a name or a $ lexes as $$.
1330         //
1331         // This means that lexing a PHP program which contains "$$ $x" is different
1332         // will fail at parse time, but I'm willing to live with that.
1333         //
1334         // This means that lexing a Hack program which contains
1335         // "$x |> $$instanceof Foo" produces an error as well.
1336         //
1337         // If these decisions are unacceptable then we will need to make the lexer
1338         // be aware of whether it is lexing PHP or Hack; thus far we have not had
1339         // to make this distinction.
1340
1341         // We are already at $.
1342         let ch1 = self.peek_char(1);
1343         match ch1 {
1344             '$' => {
1345                 let ch2 = self.peek_char(2);
1346                 if ch2 == '$' || ch2 == '{' || Self::is_name_nondigit(ch2) {
1347                     self.advance(1);
1348                     TokenKind::Dollar // $$x or $$$
1349                 } else {
1350                     self.advance(2);
1351                     TokenKind::DollarDollar // $$
1352                 }
1353             }
1354             _ => {
1355                 if Self::is_name_nondigit(ch1) {
1356                     self.scan_variable() // $x
1357                 } else {
1358                     self.advance(1);
1359                     TokenKind::Dollar // $
1360                 }
1361             }
1362         }
1363     }
1364
1365     fn scan_token(&mut self, in_type: bool) -> TokenKind {
1366         let ch0 = self.peek_char(0);
1367         match ch0 {
1368             '[' => {
1369                 self.advance(1);
1370                 TokenKind::LeftBracket
1371             }
1372             ']' => {
1373                 self.advance(1);
1374                 TokenKind::RightBracket
1375             }
1376             '(' => {
1377                 self.advance(1);
1378                 TokenKind::LeftParen
1379             }
1380             ')' => {
1381                 self.advance(1);
1382                 TokenKind::RightParen
1383             }
1384             '{' => {
1385                 self.advance(1);
1386                 TokenKind::LeftBrace
1387             }
1388             '}' => {
1389                 self.advance(1);
1390                 TokenKind::RightBrace
1391             }
1392             '.' => match self.peek_char(1) {
1393                 '=' => {
1394                     self.advance(2);
1395                     TokenKind::DotEqual
1396                 }
1397                 ch if ('0'..='9').contains(&ch) => self.scan_after_decimal_point(),
1398                 '.' => {
1399                     if (self.peek_char(2)) == '.' {
1400                         self.advance(3);
1401                         TokenKind::DotDotDot
1402                     } else {
1403                         self.advance(1);
1404                         TokenKind::Dot
1405                     }
1406                 }
1407                 _ => {
1408                     self.advance(1);
1409                     TokenKind::Dot
1410                 }
1411             },
1412             '-' => match self.peek_char(1) {
1413                 '=' => {
1414                     self.advance(2);
1415                     TokenKind::MinusEqual
1416                 }
1417                 '-' => {
1418                     self.advance(2);
1419                     TokenKind::MinusMinus
1420                 }
1421                 '>' => {
1422                     self.advance(2);
1423                     TokenKind::MinusGreaterThan
1424                 }
1425                 _ => {
1426                     self.advance(1);
1427                     TokenKind::Minus
1428                 }
1429             },
1430             '+' => match self.peek_char(1) {
1431                 '=' => {
1432                     self.advance(2);
1433                     TokenKind::PlusEqual
1434                 }
1435                 '+' => {
1436                     self.advance(2);
1437                     TokenKind::PlusPlus
1438                 }
1439                 _ => {
1440                     self.advance(1);
1441                     TokenKind::Plus
1442                 }
1443             },
1444             '*' => match (self.peek_char(1), self.peek_char(2)) {
1445                 ('=', _) => {
1446                     self.advance(2);
1447                     TokenKind::StarEqual
1448                 }
1449                 ('*', '=') => {
1450                     self.advance(3);
1451                     TokenKind::StarStarEqual
1452                 }
1453                 ('*', _) => {
1454                     self.advance(2);
1455                     TokenKind::StarStar
1456                 }
1457                 _ => {
1458                     self.advance(1);
1459                     TokenKind::Star
1460                 }
1461             },
1462             '~' => {
1463                 self.advance(1);
1464                 TokenKind::Tilde
1465             }
1466             '!' => match (self.peek_char(1), self.peek_char(2)) {
1467                 ('=', '=') => {
1468                     self.advance(3);
1469                     TokenKind::ExclamationEqualEqual
1470                 }
1471                 ('=', _) => {
1472                     self.advance(2);
1473                     TokenKind::ExclamationEqual
1474                 }
1475                 _ => {
1476                     self.advance(1);
1477                     TokenKind::Exclamation
1478                 }
1479             },
1480             '$' => self.scan_dollar_token(),
1481             '/' => {
1482                 if (self.peek_char(1)) == '=' {
1483                     self.advance(2);
1484                     TokenKind::SlashEqual
1485                 } else {
1486                     self.advance(1);
1487                     TokenKind::Slash
1488                 }
1489             }
1490             '%' => {
1491                 if (self.peek_char(1)) == '=' {
1492                     self.advance(2);
1493                     TokenKind::PercentEqual
1494                 } else {
1495                     self.advance(1);
1496                     TokenKind::Percent
1497                 }
1498             }
1499             '<' => {
1500                 match (self.peek_char(1), self.peek_char(2)) {
1501                     ('<', '<') => self.scan_docstring_literal(),
1502                     ('<', '=') => {
1503                         self.advance(3);
1504                         TokenKind::LessThanLessThanEqual
1505                     }
1506                     // TODO: We lex and parse the spaceship operator.
1507                     // TODO: This is not in the spec at present.  We should either make it an
1508                     // TODO: error, or add it to the specification.
1509                     ('=', '>') => {
1510                         self.advance(3);
1511                         TokenKind::LessThanEqualGreaterThan
1512                     }
1513                     ('=', _) => {
1514                         self.advance(2);
1515                         TokenKind::LessThanEqual
1516                     }
1517                     ('<', _) => {
1518                         self.advance(2);
1519                         TokenKind::LessThanLessThan
1520                     }
1521                     _ => {
1522                         self.advance(1);
1523                         TokenKind::LessThan
1524                     }
1525                 }
1526             }
1527             '>' => {
1528                 match (self.peek_char(1), self.peek_char(2)) {
1529                     // If we are parsing a generic type argument list then we might be at the >>
1530                     // in `List<List<int>>``, or at the >= of `let x:vec<int>=...`. In that case
1531                     // we want to lex two >'s instead of >> / one > and one = instead of >=.
1532                     (ch, _) if (ch == '>' || ch == '=') && in_type => {
1533                         self.advance(1);
1534                         TokenKind::GreaterThan
1535                     }
1536                     ('>', '=') => {
1537                         self.advance(3);
1538                         TokenKind::GreaterThanGreaterThanEqual
1539                     }
1540                     ('>', _) => {
1541                         self.advance(2);
1542                         TokenKind::GreaterThanGreaterThan
1543                     }
1544                     ('=', _) => {
1545                         self.advance(2);
1546                         TokenKind::GreaterThanEqual
1547                     }
1548                     _ => {
1549                         self.advance(1);
1550                         TokenKind::GreaterThan
1551                     }
1552                 }
1553             }
1554             '=' => match (self.peek_char(1), self.peek_char(2)) {
1555                 ('=', '=') => {
1556                     self.advance(3);
1557                     TokenKind::EqualEqualEqual
1558                 }
1559                 ('=', '>') => {
1560                     self.advance(3);
1561                     TokenKind::EqualEqualGreaterThan
1562                 }
1563                 ('=', _) => {
1564                     self.advance(2);
1565                     TokenKind::EqualEqual
1566                 }
1567                 ('>', _) => {
1568                     self.advance(2);
1569                     TokenKind::EqualGreaterThan
1570                 }
1571                 _ => {
1572                     self.advance(1);
1573                     TokenKind::Equal
1574                 }
1575             },
1576             '^' => {
1577                 if (self.peek_char(1)) == '=' {
1578                     self.advance(2);
1579                     TokenKind::CaratEqual
1580                 } else {
1581                     self.advance(1);
1582                     TokenKind::Carat
1583                 }
1584             }
1585             '|' => match self.peek_char(1) {
1586                 '=' => {
1587                     self.advance(2);
1588                     TokenKind::BarEqual
1589                 }
1590                 '>' => {
1591                     self.advance(2);
1592                     TokenKind::BarGreaterThan
1593                 }
1594                 '|' => {
1595                     self.advance(2);
1596                     TokenKind::BarBar
1597                 }
1598                 _ => {
1599                     self.advance(1);
1600                     TokenKind::Bar
1601                 }
1602             },
1603             '&' => match self.peek_char(1) {
1604                 '=' => {
1605                     self.advance(2);
1606                     TokenKind::AmpersandEqual
1607                 }
1608                 '&' => {
1609                     self.advance(2);
1610                     TokenKind::AmpersandAmpersand
1611                 }
1612                 _ => {
1613                     self.advance(1);
1614                     TokenKind::Ampersand
1615                 }
1616             },
1617             '?' => match (self.peek_char(1), self.peek_char(2)) {
1618                 (':', _) if !in_type => {
1619                     self.advance(2);
1620                     TokenKind::QuestionColon
1621                 }
1622                 ('-', '>') => {
1623                     self.advance(3);
1624                     TokenKind::QuestionMinusGreaterThan
1625                 }
1626                 ('?', '=') => {
1627                     self.advance(3);
1628                     TokenKind::QuestionQuestionEqual
1629                 }
1630                 ('?', _) => {
1631                     self.advance(2);
1632                     TokenKind::QuestionQuestion
1633                 }
1634                 ('a', 's') if !Self::is_name_nondigit(self.peek_char(3)) => {
1635                     self.advance(3);
1636                     TokenKind::QuestionAs
1637                 }
1638                 _ => {
1639                     self.advance(1);
1640                     TokenKind::Question
1641                 }
1642             },
1643             ':' => {
1644                 let ch1 = self.peek_char(1);
1645
1646                 if ch1 == ':' {
1647                     self.advance(2);
1648                     TokenKind::ColonColon
1649                 } else {
1650                     self.advance(1);
1651                     TokenKind::Colon
1652                 }
1653             }
1654             ';' => {
1655                 self.advance(1);
1656                 TokenKind::Semicolon
1657             }
1658             ',' => {
1659                 self.advance(1);
1660                 TokenKind::Comma
1661             }
1662             '@' => {
1663                 self.advance(1);
1664                 TokenKind::At
1665             }
1666             '0' => match self.peek_char(1) {
1667                 'x' | 'X' => {
1668                     self.advance(2);
1669                     self.scan_hex_literal()
1670                 }
1671                 'b' | 'B' => {
1672                     self.advance(2);
1673                     self.scan_binary_literal()
1674                 }
1675                 _ => self.scan_octal_or_float(),
1676             },
1677             ch if ('1'..='9').contains(&ch) => self.scan_decimal_or_float(),
1678             '\'' => self.scan_single_quote_string_literal(),
1679             '"' => self.scan_double_quote_like_string_literal_from_start(),
1680             '`' => {
1681                 self.advance(1);
1682                 TokenKind::Backtick
1683             }
1684             '\\' => {
1685                 self.advance(1);
1686                 TokenKind::Backslash
1687             }
1688             '#' => {
1689                 self.advance(1);
1690                 TokenKind::Hash
1691             }
1692             'b' if {
1693                 let c1 = self.peek_char(1);
1694                 let c2 = self.peek_char(2);
1695                 let c3 = self.peek_char(3);
1696                 c1 == '"' || c1 == '\'' || (c1 == '<' && c2 == '<' && c3 == '<')
1697             } =>
1698             {
1699                 self.advance(1);
1700                 self.scan_token(in_type)
1701             }
1702             // Names
1703             _ => {
1704                 if ch0 == INVALID && self.at_end() {
1705                     TokenKind::EndOfFile
1706                 } else if Self::is_name_nondigit(ch0) {
1707                     self.scan_name()
1708                 } else {
1709                     self.with_error(Errors::error0006);
1710                     self.advance(1);
1711                     TokenKind::ErrorToken
1712                 }
1713             }
1714         }
1715     }
1716
1717     fn scan_token_outside_type(&mut self) -> TokenKind {
1718         self.scan_token(false)
1719     }
1720
1721     fn scan_token_inside_type(&mut self) -> TokenKind {
1722         self.scan_token(true)
1723     }
1724
1725     // Lexing trivia
1726
1727     // SPEC:
1728     //
1729     // white-space-character::
1730     //   new-line
1731     //   Space character (U+0020)
1732     //   Horizontal-tab character (U+0009)
1733     //
1734     // single-line-comment::
1735     //   //   input-characters-opt
1736     //   #    input-characters-opt
1737     //
1738     // new-line::
1739     //   Carriage-return character (U+000D)
1740     //   Line-feed character (U+000A)
1741     //   Carriage-return character followed by line-feed character
1742
1743     fn str_scan_end_of_line(s: &[u8], i: usize) -> usize {
1744         match s.get(i).map(|x| *x as char) {
1745             None => i + 1,
1746             Some('\r') => match s.get(i + 1).map(|x| *x as char) {
1747                 Some('\n') => 2 + i,
1748                 _ => i + 1,
1749             },
1750             Some('\n') => i + 1,
1751             _ => panic!("str_scan_end_of_line called while not on end of line!"),
1752         }
1753     }
1754
1755     fn scan_end_of_line(&mut self) -> Trivium<TF> {
1756         match self.peek_char(0) {
1757             '\r' => {
1758                 let w = if self.peek_char(1) == '\n' { 2 } else { 1 };
1759                 self.advance(w);
1760                 Trivia::<TF>::make_eol(self.start, w)
1761             }
1762             '\n' => {
1763                 self.advance(1);
1764                 Trivia::<TF>::make_eol(self.start, 1)
1765             }
1766             _ => panic!("scan_end_of_line called while not on end of line!"),
1767         }
1768     }
1769
1770     fn scan_single_line_comment(&mut self) -> Trivium<TF> {
1771         // A fallthrough comment is two slashes, any amount of whitespace,
1772         // FALLTHROUGH, and any characters may follow.
1773         // TODO: Consider allowing lowercase fallthrough.
1774
1775         self.advance(2);
1776         self.skip_whitespace();
1777         let lexer_ws = self.clone();
1778         self.skip_to_end_of_line();
1779         let w = self.width();
1780         let remainder = self.offset - lexer_ws.offset;
1781         if remainder >= 11 && lexer_ws.peek_string(11) == b"FALLTHROUGH" {
1782             Trivia::<TF>::make_fallthrough(self.start, w)
1783         } else {
1784             Trivia::<TF>::make_single_line_comment(self.start, w)
1785         }
1786     }
1787
1788     fn skip_to_end_of_delimited_comment(&mut self) {
1789         let mut offset = 0;
1790         loop {
1791             let ch0 = self.peek_char(offset);
1792             if ch0 == INVALID {
1793                 self.advance(offset);
1794                 if self.at_end() {
1795                     return self.with_error(Errors::error0007);
1796                 } else {
1797                     // TODO: Do we want to give a warning for an embedded zero char
1798                     // inside a comment?
1799                     offset = 1;
1800                 }
1801             } else if ch0 == '*' && (self.peek_char(offset + 1)) == '/' {
1802                 return self.advance(offset + 2);
1803             } else {
1804                 offset += 1
1805             }
1806         }
1807     }
1808
1809     fn scan_delimited_comment(&mut self) -> Trivium<TF> {
1810         // The original lexer lexes a fixme / ignore error as:
1811         //
1812         // slash star [whitespace]* HH_FIXME [whitespace or newline]* leftbracket
1813         // [whitespace or newline]* integer [any text]* star slash
1814         //
1815         // Notice that the original lexer oddly enough does not verify that there
1816         // is a right bracket.
1817         //
1818         // For our purposes we will just check for HH_FIXME / HH_IGNORE_ERROR;
1819         // a later pass can try to parse out the integer if there is one,
1820         // give a warning if there is not, and so on.
1821
1822         self.advance(2);
1823         self.skip_whitespace();
1824
1825         let lexer_ws = self.clone();
1826         self.skip_to_end_of_delimited_comment();
1827         let w = self.width();
1828         if lexer_ws.match_string(b"HH_FIXME") {
1829             Trivia::<TF>::make_fix_me(self.start, w)
1830         } else if lexer_ws.match_string(b"HH_IGNORE_ERROR") {
1831             Trivia::<TF>::make_ignore_error(self.start, w)
1832         } else {
1833             Trivia::<TF>::make_delimited_comment(self.start, w)
1834         }
1835     }
1836
1837     fn scan_php_trivium(&mut self) -> Option<Trivium<TF>> {
1838         match self.peek_char(0) {
1839             '#' => {
1840                 self.start_new_lexeme();
1841                 // Not trivia
1842                 None
1843             }
1844             '/' => {
1845                 self.start_new_lexeme();
1846                 match self.peek_char(1) {
1847                     '/' => Some(self.scan_single_line_comment()),
1848                     '*' => Some(self.scan_delimited_comment()),
1849                     _ => None,
1850                 }
1851             }
1852             ' ' | '\t' => {
1853                 let new_end = Self::str_skip_whitespace(self.source_text_string(), self.offset);
1854                 let new_start = self.offset;
1855                 let new_trivia = Trivia::<TF>::make_whitespace(new_start, new_end - new_start);
1856                 self.with_start_offset(new_start, new_end);
1857                 Some(new_trivia)
1858             }
1859             '\r' | '\n' => {
1860                 self.start_new_lexeme();
1861                 Some(self.scan_end_of_line())
1862             }
1863             _ => {
1864                 self.start_new_lexeme();
1865                 // Not trivia
1866                 None
1867             }
1868         }
1869     }
1870
1871     fn scan_xhp_trivium(&mut self) -> Option<Trivium<TF>> {
1872         // TODO: Should XHP comments <!-- --> be their own thing, or a kind of
1873         // trivia associated with a token? Right now they are the former.
1874         let i = self.offset;
1875         let ch = self.peek_char(0);
1876         match ch {
1877             ' ' | '\t' => {
1878                 let j = Self::str_skip_whitespace(self.source_text_string(), i);
1879                 self.with_start_offset(i, j);
1880                 Some(Trivia::<TF>::make_whitespace(i, j - i))
1881             }
1882             '\r' | '\n' => {
1883                 let j = Self::str_scan_end_of_line(self.source_text_string(), i);
1884                 self.with_start_offset(i, j);
1885                 Some(Trivia::<TF>::make_eol(i, j - i))
1886             }
1887             _ =>
1888             // Not trivia
1889             {
1890                 self.start_new_lexeme();
1891                 None
1892             }
1893         }
1894     }
1895
1896     // We divide trivia into "leading" and "trailing" trivia of an associated
1897     // token. This means that we must find a dividing line between the trailing trivia
1898     // following one token and the leading trivia of the following token. Plainly
1899     // we need only find this line while scanning trailing trivia. The heuristics
1900     // we use are:
1901     // * The first newline trivia encountered is the last trailing trivia.
1902     // * The newline which follows a // or # comment is not part of the comment
1903     //   but does terminate the trailing trivia.
1904     // * A pragma to turn checks off (HH_FIXME and HH_IGNORE_ERROR) is
1905     //   always a leading trivia.
1906     fn scan_leading_trivia(
1907         &mut self,
1908         scanner: impl Fn(&mut Self) -> Option<Trivium<TF>>,
1909     ) -> Trivia<TF> {
1910         let mut acc = self.token_factory.trivia_factory_mut().make();
1911         while let Some(t) = scanner(self) {
1912             acc.push(t)
1913         }
1914         acc
1915     }
1916
1917     fn scan_leading_trivia_with_width(
1918         &mut self,
1919         scanner: impl Fn(&mut Self) -> Option<Trivium<TF>>,
1920         mut width: usize,
1921     ) -> Trivia<TF> {
1922         let mut acc = self.token_factory.trivia_factory_mut().make();
1923         let mut extra_token_error_width = 0;
1924         let mut extra_token_error_offset = self.offset();
1925         loop {
1926             if width == 0 {
1927                 if extra_token_error_width > 0 {
1928                     acc.push(Trivia::<TF>::make_extra_token_error(
1929                         extra_token_error_offset,
1930                         extra_token_error_width,
1931                     ));
1932                 }
1933                 break acc;
1934             }
1935             if let Some(t) = scanner(self) {
1936                 if extra_token_error_width > 0 {
1937                     acc.push(Trivia::<TF>::make_extra_token_error(
1938                         extra_token_error_offset,
1939                         extra_token_error_width,
1940                     ));
1941                     extra_token_error_width = 0;
1942                     extra_token_error_offset = self.start();
1943                 }
1944                 width -= t.width();
1945                 acc.push(t);
1946             } else {
1947                 self.advance(1);
1948                 width -= 1;
1949                 extra_token_error_width += 1;
1950             }
1951         }
1952     }
1953
1954     pub fn scan_leading_php_trivia_with_width(
1955         &mut self,
1956         width: usize,
1957     ) -> <TF::Token as LexableToken>::Trivia {
1958         self.scan_leading_trivia_with_width(&Self::scan_php_trivium, width)
1959     }
1960
1961     pub fn scan_leading_xhp_trivia_with_width(
1962         &mut self,
1963         width: usize,
1964     ) -> <TF::Token as LexableToken>::Trivia {
1965         self.scan_leading_trivia_with_width(&Self::scan_xhp_trivium, width)
1966     }
1967
1968     pub(crate) fn scan_leading_php_trivia(&mut self) -> <TF::Token as LexableToken>::Trivia {
1969         self.scan_leading_trivia(&Self::scan_php_trivium)
1970     }
1971
1972     pub(crate) fn scan_leading_xhp_trivia(&mut self) -> <TF::Token as LexableToken>::Trivia {
1973         self.scan_leading_trivia(&Self::scan_xhp_trivium)
1974     }
1975
1976     fn scan_trailing_trivia(
1977         &mut self,
1978         scanner: impl Fn(&mut Self) -> Option<Trivium<TF>>,
1979     ) -> <TF::Token as LexableToken>::Trivia {
1980         let mut acc = self.token_factory.trivia_factory_mut().make();
1981         loop {
1982             let mut lexer1 = self.clone();
1983             match scanner(&mut lexer1) {
1984                 None => {
1985                     self.continue_from(lexer1);
1986                     return acc;
1987                 }
1988                 Some(t) => match t.kind() {
1989                     TriviaKind::EndOfLine => {
1990                         self.continue_from(lexer1);
1991                         acc.push(t);
1992                         return acc;
1993                     }
1994                     TriviaKind::FixMe | TriviaKind::IgnoreError => {
1995                         return acc;
1996                     }
1997                     _ => {
1998                         self.continue_from(lexer1);
1999                         acc.push(t)
2000                     }
2001                 },
2002             }
2003         }
2004     }
2005
2006     pub fn scan_trailing_php_trivia(&mut self) -> <TF::Token as LexableToken>::Trivia {
2007         self.scan_trailing_trivia(&Self::scan_php_trivium)
2008     }
2009
2010     pub fn scan_trailing_xhp_trivia(&mut self) -> <TF::Token as LexableToken>::Trivia {
2011         self.scan_trailing_trivia(&Self::scan_xhp_trivium)
2012     }
2013
2014     pub fn is_next_name(&self) -> bool {
2015         let mut lexer = self.clone();
2016         lexer.scan_leading_php_trivia();
2017         Self::is_name_nondigit(lexer.peek_char(0))
2018     }
2019
2020     pub fn is_next_xhp_class_name(&self) -> bool {
2021         let mut lexer = self.clone();
2022         lexer.scan_leading_php_trivia();
2023         lexer.is_xhp_class_name()
2024     }
2025
2026     as_case_insensitive_keyword!(
2027         12,
2028         U12,
2029         "abstract",
2030         "and",
2031         "as",
2032         "bool",
2033         "boolean",
2034         "break",
2035         "callable",
2036         "case",
2037         "catch",
2038         "class",
2039         "clone",
2040         "const",
2041         "continue",
2042         "default",
2043         "die",
2044         "do",
2045         "echo",
2046         "else",
2047         "elseif",
2048         "empty",
2049         "endfor",
2050         "endforeach",
2051         "endif",
2052         "endswitch",
2053         "endwhile",
2054         "eval",
2055         "exit",
2056         "extends",
2057         "false",
2058         "final",
2059         "finally",
2060         "for",
2061         "foreach",
2062         "function",
2063         "global",
2064         "if",
2065         "implements",
2066         "include",
2067         "include_once",
2068         "inout",
2069         "instanceof",
2070         "insteadof",
2071         "int",
2072         "integer",
2073         "interface",
2074         "isset",
2075         "list",
2076         "namespace",
2077         "new",
2078         "null",
2079         "or",
2080         "parent",
2081         "print",
2082         "private",
2083         "protected",
2084         "public",
2085         "require",
2086         "require_once",
2087         "return",
2088         "self",
2089         "static",
2090         "string",
2091         "switch",
2092         "throw",
2093         "trait",
2094         "try",
2095         "true",
2096         "unset",
2097         "use",
2098         "using",
2099         "var",
2100         "void",
2101         "while",
2102         "xor",
2103         "yield"
2104     );
2105
2106     fn as_keyword(&mut self, only_reserved: bool, kind: TokenKind) -> TokenKind {
2107         if kind == TokenKind::Name {
2108             let original_text = self.current_text_as_str();
2109             let (text, has_upper) = self
2110                 .as_case_insensitive_keyword(original_text)
2111                 .unwrap_or((original_text, false));
2112             match TokenKind::from_string(text.as_bytes(), only_reserved) {
2113                 Some(keyword) => {
2114                     if has_upper && text != "true" && text != "false" && text != "null" {
2115                         let err = Errors::uppercase_kw(original_text);
2116                         self.with_error(err);
2117                     }
2118                     keyword
2119                 }
2120                 _ => TokenKind::Name,
2121             }
2122         } else {
2123             kind
2124         }
2125     }
2126
2127     fn scan_token_and_leading_trivia(
2128         &mut self,
2129         scanner: impl Fn(&mut Self) -> TokenKind,
2130         as_name: KwSet,
2131     ) -> (TokenKind, usize, <TF::Token as LexableToken>::Trivia) {
2132         // Get past the leading trivia
2133         let leading = self.scan_leading_php_trivia();
2134         // Remember where we were when we started this token
2135         self.start_new_lexeme();
2136         let kind = scanner(self);
2137         let kind = match as_name {
2138             KwSet::AllKeywords => kind,
2139             KwSet::NonReservedKeywords => self.as_keyword(true, kind),
2140             KwSet::NoKeywords => self.as_keyword(false, kind),
2141         };
2142         let w = self.width();
2143         (kind, w, leading)
2144     }
2145
2146     fn scan_token_and_trivia(
2147         &mut self,
2148         scanner: &impl Fn(&mut Self) -> TokenKind,
2149         as_name: KwSet,
2150     ) -> TF::Token {
2151         let token_start = self.offset;
2152
2153         let (kind, w, leading) = self.scan_token_and_leading_trivia(scanner, as_name);
2154         let trailing = match kind {
2155             TokenKind::DoubleQuotedStringLiteralHead => {
2156                 self.token_factory.trivia_factory_mut().make()
2157             }
2158             _ => self.scan_trailing_php_trivia(),
2159         };
2160         self.token_factory
2161             .make(kind, token_start, w, leading, trailing)
2162     }
2163
2164     fn scan_assert_progress(&mut self, tokenizer: impl Fn(&mut Self) -> TF::Token) -> TF::Token {
2165         let original_remaining = self.remaining();
2166         let token = tokenizer(self);
2167         let new_remaining = self.remaining();
2168         if new_remaining < original_remaining
2169             || original_remaining == 0
2170                 && new_remaining == 0
2171                 && (token.kind()) == TokenKind::EndOfFile
2172         {
2173             token
2174         } else {
2175             panic!(
2176                 "failed to make progress at {} {} {} {:?}\n",
2177                 self.offset,
2178                 original_remaining,
2179                 new_remaining,
2180                 token.kind()
2181             )
2182         }
2183     }
2184
2185     fn scan_next_token(
2186         &mut self,
2187         scanner: impl Fn(&mut Self) -> TokenKind,
2188         as_name: KwSet,
2189     ) -> TF::Token {
2190         let tokenizer = |x: &mut Self| x.scan_token_and_trivia(&scanner, as_name);
2191         self.scan_assert_progress(&tokenizer)
2192     }
2193
2194     fn scan_next_token_as_name(&mut self, scanner: impl Fn(&mut Self) -> TokenKind) -> TF::Token {
2195         self.scan_next_token(scanner, KwSet::AllKeywords)
2196     }
2197
2198     fn scan_next_token_as_keyword(
2199         &mut self,
2200         scanner: impl Fn(&mut Self) -> TokenKind,
2201     ) -> TF::Token {
2202         self.scan_next_token(scanner, KwSet::NoKeywords)
2203     }
2204
2205     fn scan_next_token_nonreserved_as_name(
2206         &mut self,
2207         scanner: impl Fn(&mut Self) -> TokenKind,
2208     ) -> TF::Token {
2209         self.scan_next_token(scanner, KwSet::NonReservedKeywords)
2210     }
2211
2212     fn next_token_impl(&mut self) -> TF::Token {
2213         if self.in_type {
2214             self.scan_next_token_as_keyword(&Self::scan_token_inside_type)
2215         } else {
2216             self.scan_next_token_as_keyword(&Self::scan_token_outside_type)
2217         }
2218     }
2219
2220     // Entrypoints
2221     pub fn peek_next_token(&self) -> TF::Token {
2222         {
2223             let cache = self.cache.borrow();
2224             if let Some(cache) = cache.as_ref() {
2225                 if cache.0 == *self {
2226                     return cache.1.clone();
2227                 }
2228             }
2229         }
2230
2231         let mut lexer = self.clone();
2232         lexer.errors = vec![];
2233         let before = lexer.to_lexer_pre_snapshot();
2234         let token = lexer.next_token_impl();
2235         let after = lexer.into_lexer_post_snapshot();
2236         self.cache
2237             .replace(Some(LexerCache(before, token.clone(), after)));
2238         token
2239     }
2240
2241     pub fn next_token(&mut self) -> TF::Token {
2242         {
2243             let mut cache = self.cache.borrow_mut();
2244             if let Some(ref mut cache) = cache.deref_mut() {
2245                 if cache.0 == *self {
2246                     self.start = (cache.2).start;
2247                     self.offset = (cache.2).offset;
2248                     self.in_type = (cache.2).in_type;
2249                     if !(cache.2).errors.is_empty() {
2250                         self.errors.append(&mut (cache.2).errors.clone());
2251                     }
2252                     return cache.1.clone();
2253                 }
2254             }
2255         }
2256         self.next_token_impl()
2257     }
2258
2259     pub fn next_token_no_trailing(&mut self) -> TF::Token {
2260         let tokenizer = |x: &mut Self| {
2261             let token_start = x.offset;
2262             let (kind, w, leading) =
2263                 x.scan_token_and_leading_trivia(&Self::scan_token_outside_type, KwSet::NoKeywords);
2264             let trailing = x.token_factory.trivia_factory_mut().make();
2265             x.token_factory
2266                 .make(kind, token_start, w, leading, trailing)
2267         };
2268         self.scan_assert_progress(&tokenizer)
2269     }
2270
2271     pub fn next_token_in_string(&mut self, literal_kind: &StringLiteralKind) -> TF::Token {
2272         let token_start = self.offset;
2273         self.start_new_lexeme();
2274         // We're inside a string. Do not scan leading trivia.
2275         let kind = self.scan_string_literal_in_progress(literal_kind);
2276         let w = self.width();
2277         // Only scan trailing trivia if we've finished the string.
2278         let trailing = match kind {
2279             TokenKind::DoubleQuotedStringLiteralTail | TokenKind::HeredocStringLiteralTail => {
2280                 self.scan_trailing_php_trivia()
2281             }
2282             _ => self.token_factory.trivia_factory_mut().make(),
2283         };
2284         let leading = self.token_factory.trivia_factory_mut().make();
2285         self.token_factory
2286             .make(kind, token_start, w, leading, trailing)
2287     }
2288
2289     pub fn next_docstring_header(&mut self) -> (TF::Token, &'a [u8]) {
2290         // We're at the beginning of a heredoc string literal. Scan leading
2291         // trivia but not trailing trivia.
2292         let token_start = self.offset;
2293         let leading = self.scan_leading_php_trivia();
2294         self.start_new_lexeme();
2295         let (name, _) = self.scan_docstring_header();
2296         let w = self.width();
2297         let trailing = self.token_factory.trivia_factory_mut().make();
2298         let token = self.token_factory.make(
2299             TokenKind::HeredocStringLiteralHead,
2300             token_start,
2301             w,
2302             leading,
2303             trailing,
2304         );
2305         (token, name)
2306     }
2307
2308     pub fn next_token_as_name(&mut self) -> TF::Token {
2309         self.scan_next_token_as_name(&Self::scan_token_outside_type)
2310     }
2311
2312     pub fn next_token_non_reserved_as_name(&mut self) -> TF::Token {
2313         self.scan_next_token_nonreserved_as_name(&Self::scan_token_outside_type)
2314     }
2315
2316     pub fn next_xhp_element_token(&mut self, no_trailing: bool) -> (TF::Token, &[u8]) {
2317         // XHP elements have whitespace, newlines and Hack comments.
2318         let tokenizer = |lexer: &mut Self| {
2319             let token_start = lexer.offset;
2320             let (kind, w, leading) =
2321                 lexer.scan_token_and_leading_trivia(&Self::scan_xhp_token, KwSet::AllKeywords);
2322             // We do not scan trivia after an XHPOpen's >. If that is the beginning of
2323             // an XHP body then we want any whitespace or newlines to be leading trivia
2324             // of the body token.
2325             match kind {
2326                 TokenKind::GreaterThan | TokenKind::SlashGreaterThan if no_trailing => {
2327                     let trailing = lexer.token_factory.trivia_factory_mut().make();
2328                     lexer
2329                         .token_factory
2330                         .make(kind, token_start, w, leading, trailing)
2331                 }
2332                 _ => {
2333                     let trailing = lexer.scan_trailing_php_trivia();
2334                     lexer
2335                         .token_factory
2336                         .make(kind, token_start, w, leading, trailing)
2337                 }
2338             }
2339         };
2340         let token = self.scan_assert_progress(&tokenizer);
2341         let token_width = token.width();
2342         let trailing_width = token.trailing_width();
2343         let token_start_offset = (self.offset) - trailing_width - token_width;
2344         let token_text = self.source.sub(token_start_offset, token_width);
2345         (token, token_text)
2346     }
2347
2348     pub fn next_xhp_body_token(&mut self) -> TF::Token {
2349         let scanner = |lexer: &mut Self| {
2350             let token_start = lexer.offset;
2351             let leading = lexer.scan_leading_xhp_trivia();
2352             lexer.start_new_lexeme();
2353             let kind = lexer.scan_xhp_body();
2354             let w = lexer.width();
2355             let trailing =
2356                 // Trivia (leading and trailing) is semantically
2357                 // significant for XHPBody tokens. When we find elements or
2358                 // braced expressions inside the body, the trivia should be
2359                 // seen as leading the next token, but we should certainly
2360                 // keep it trailing if this is an XHPBody token.
2361                 if kind == TokenKind::XHPBody {
2362                     lexer.scan_trailing_xhp_trivia()
2363                 } else {
2364                     lexer.token_factory.trivia_factory_mut().make()
2365                 };
2366             lexer
2367                 .token_factory
2368                 .make(kind, token_start, w, leading, trailing)
2369         };
2370         self.scan_assert_progress(&scanner)
2371     }
2372
2373     //
2374     // When the xhp modifier is used for declaring xhp classes
2375     // we do not allow colon prefixes or dashes.
2376     //
2377     // This ensures that the syntax is closer to regular classes.
2378     //
2379     pub fn next_xhp_modifier_class_name(&mut self) -> TF::Token {
2380         self.scan_token_and_trivia(&Self::scan_xhp_modifier_class_name, KwSet::NoKeywords)
2381     }
2382
2383     pub fn next_xhp_class_name(&mut self) -> TF::Token {
2384         self.scan_token_and_trivia(&Self::scan_xhp_class_name, KwSet::NoKeywords)
2385     }
2386
2387     pub fn next_xhp_name(&mut self) -> TF::Token {
2388         let scanner = |x: &mut Self| x.scan_xhp_element_name(false);
2389         self.scan_token_and_trivia(&scanner, KwSet::NoKeywords)
2390     }
2391
2392     fn make_hashbang_token(&mut self) -> TF::Token {
2393         let leading = self.token_factory.trivia_factory_mut().make();
2394         self.skip_to_end_of_line();
2395         let token_start = self.start;
2396         let token_width = self.width();
2397         let trailing = self.scan_trailing_php_trivia();
2398         self.start_new_lexeme();
2399         self.token_factory.make(
2400             TokenKind::Hashbang,
2401             token_start,
2402             token_width,
2403             leading,
2404             trailing,
2405         )
2406     }
2407
2408     fn make_long_tag(
2409         &mut self,
2410         name_token_offset: usize,
2411         size: usize,
2412         less_than_question_token: TF::Token,
2413     ) -> (TF::Token, Option<TF::Token>) {
2414         // skip name
2415         self.advance(size);
2416         // single line comments that follow the language in leading markup_text
2417         // determine the file check mode, read the trailing trivia and attach it
2418         // to the language token
2419         let trailing = self.scan_trailing_php_trivia();
2420         let leading = self.token_factory.trivia_factory_mut().make();
2421         let name =
2422             self.token_factory
2423                 .make(TokenKind::Name, name_token_offset, size, leading, trailing);
2424         (less_than_question_token, Some(name))
2425     }
2426
2427     fn make_markup_suffix(&mut self) -> (TF::Token, Option<TF::Token>) {
2428         let leading = self.token_factory.trivia_factory_mut().make();
2429         let trailing = self.token_factory.trivia_factory_mut().make();
2430         let less_than_question_token = self.token_factory.make(
2431             TokenKind::LessThanQuestion,
2432             self.offset,
2433             2,
2434             leading,
2435             trailing,
2436         );
2437         // skip <?
2438         self.advance(2);
2439         let name_token_offset = self.offset;
2440         let ch0 = self.peek_char(0).to_ascii_lowercase();
2441         let ch1 = self.peek_char(1).to_ascii_lowercase();
2442         match (ch0, ch1) {
2443             ('h', 'h') => self.make_long_tag(name_token_offset, 2, less_than_question_token),
2444             _ => (less_than_question_token, (None)),
2445         }
2446     }
2447
2448     fn skip_to_end_of_header(
2449         &mut self,
2450     ) -> (Option<TF::Token>, Option<(TF::Token, Option<TF::Token>)>) {
2451         let start_offset = {
2452             // if leading section starts with #! - it should span the entire line
2453             if self.offset != 0 {
2454                 panic!("Should only try to lex header at start of document")
2455             };
2456             // this should really just be `self.offset` - but, skip whitespace as the FFP
2457             // tests use magic comments in leading markup to set flags, but blank
2458             // them out before parsing; the newlines are kept to provide correct line
2459             // numbers in errors
2460             self.skip_while_to_offset(&|x| Self::is_newline(x) || Self::is_whitespace_no_newline(x))
2461         };
2462         let hashbang = if self.peek_def(start_offset, INVALID) == '#'
2463             && self.peek_def(start_offset + 1, INVALID) == '!'
2464         {
2465             self.with_offset(start_offset);
2466             Some(self.make_hashbang_token())
2467         } else {
2468             None
2469         };
2470
2471         let start_offset = self
2472             .skip_while_to_offset(&|x| Self::is_newline(x) || Self::is_whitespace_no_newline(x));
2473         let suffix = if self.peek_def(start_offset, INVALID) == '<'
2474             && self.peek_def(start_offset + 1, INVALID) == '?'
2475         {
2476             self.with_offset(start_offset);
2477             Some(self.make_markup_suffix())
2478         } else {
2479             None
2480         };
2481
2482         (hashbang, suffix)
2483     }
2484
2485     pub fn scan_header(&mut self) -> (Option<TF::Token>, Option<(TF::Token, Option<TF::Token>)>) {
2486         self.start_new_lexeme();
2487         self.skip_to_end_of_header()
2488     }
2489
2490     pub fn is_next_xhp_category_name(&self) -> bool {
2491         let mut lexer = self.clone();
2492         let _ = lexer.scan_leading_php_trivia();
2493         // An XHP category is an xhp element name preceded by a %.
2494         let ch0 = lexer.peek_char(0);
2495         let ch1 = lexer.peek_char(1);
2496         ch0 == '%' && Self::is_name_nondigit(ch1)
2497     }
2498
2499     fn scan_xhp_category_name(&mut self) -> TokenKind {
2500         if self.is_next_xhp_category_name() {
2501             self.advance(1);
2502             let _ = self.scan_xhp_element_name(false);
2503             TokenKind::XHPCategoryName
2504         } else {
2505             self.scan_token(false)
2506         }
2507     }
2508
2509     pub fn next_xhp_category_name(&mut self) -> TF::Token {
2510         self.scan_token_and_trivia(&Self::scan_xhp_category_name, KwSet::NoKeywords)
2511     }
2512 }