1 // Copyright (c) 2019, Facebook, Inc.
2 // All rights reserved.
4 // This source code is licensed under the MIT license found in the
5 // LICENSE file in the "hack" directory of this source tree.
7 use crate::lexable_token::LexableToken;
8 use crate::lexable_trivia::LexableTrivia;
9 use crate::source_text::{SourceText, INVALID};
10 use crate::syntax_error::{self as Errors, Error, SyntaxError};
11 use crate::token_kind::TokenKind;
12 use crate::trivia_kind::TriviaKind;
14 use std::marker::PhantomData;
// NOTE(review): this excerpt is extraction-damaged — several field/variant
// lines and the closing braces of these declarations are missing. Comments
// describe only what is visible.

// The lexer walks a SourceText, tracking the current lexeme as a
// [start, offset) window and accumulating SyntaxErrors as it scans.
16 #[derive(Debug, Clone)]
17 pub struct Lexer<'a, Token: LexableToken> {
// Backing source buffer being lexed.
18 source: &'a SourceText<'a>,
// Errors discovered so far; appended by with_error.
21 errors: Vec<SyntaxError>,
// Enables experimental-mode-only constructs (e.g. the ':@' atom path
// referenced later in scan_token).
22 is_experimental_mode: bool,
// Token is only used through associated items (Token::Trivia), so it is
// carried via PhantomData rather than stored.
24 _phantom: PhantomData<Token>,

// Flavor of string literal being resumed by scan_string_literal_in_progress;
// heredocs carry the terminator name bytes.
27 #[derive(Debug, PartialEq)]
28 pub enum StringLiteralKind {
30 LiteralHeredoc { heredoc: Vec<u8> },

// NOTE(review): the item this derive attaches to is not visible in this excerpt.
33 #[derive(Debug, Copy, Clone)]
40 impl<'a, Token: LexableToken> Lexer<'a, Token> {
// Constructor parameter lists; the constructor bodies are largely missing
// from this excerpt.
42 source: &'a SourceText<'a>,
43 is_experimental_mode: bool,
53 _phantom: PhantomData,
58 source: &'a SourceText<'a>,
59 is_experimental_mode: bool,

// Adopts another lexer's position and error list — used by the speculative
// scanning pattern where a clone is advanced and then committed
// (see scan_octal_or_float / scan_decimal_or_float).
68 fn continue_from(&mut self, l: Lexer<Token>) {
70 self.offset = l.offset;
71 self.errors = l.errors

// Accessors for the lexeme window and accumulated errors (bodies missing
// here, but presumably trivial field reads — confirm).
74 pub fn start(&self) -> usize {
78 pub fn offset(&self) -> usize {
82 pub fn errors(&self) -> &[SyntaxError] {

// Records a syntax error spanning the current lexeme [start, offset).
86 fn with_error(&mut self, error: Error) {
87 let error = SyntaxError::make(self.start(), self.offset(), error);
88 self.errors.push(error)

// Position mutators (bodies missing from this excerpt).
91 fn with_offset(&mut self, offset: usize) {
95 fn with_start_offset(&mut self, start: usize, offset: usize) {

// Begins a new lexeme at the current cursor position.
100 fn start_new_lexeme(&mut self) {
101 self.start = self.offset
104 pub fn advance(&mut self, i: usize) {
105 self.offset = self.offset + i
// True when experimental-mode-only syntax should be accepted.
108 fn is_experimental_mode(&self) -> bool {
109 self.is_experimental_mode
// Toggles type-context lexing; in_type affects how '>' / '>>' / '>=' are
// split in scan_token when inside generic type argument lists.
112 pub fn set_in_type(&mut self, in_type: bool) {
113 self.in_type = in_type
// Source accessors (bodies missing from this excerpt).
116 pub fn source(&self) -> &SourceText<'a> {
120 fn source_text_string(&self) -> &[u8] {
// Character `index` positions past the current offset; presumably returns
// INVALID past end-of-file — confirm against SourceText::get.
126 pub fn peek_char(&self, index: usize) -> char {
127 self.source.get(self.offset() + index)
130 fn peek_string(&self, size: usize) -> &[u8] {
131 &self.source.sub(self.offset, size)
// True when the upcoming source bytes equal `s`.
134 fn match_string(&self, s: &[u8]) -> bool {
135 s == self.peek_string(s.len())
// Width of the current lexeme window.
138 fn width(&self) -> usize {
139 self.offset - self.start
// Bytes of the current lexeme.
142 fn current_text(&self) -> &[u8] {
143 self.source.sub(self.start, self.width())
// SAFETY-sensitive: assumes the lexeme bytes are valid UTF-8; no check is
// performed here — confirm callers only use this on ASCII-verified lexemes.
146 fn current_text_as_str(&self) -> &str {
147 unsafe { std::str::from_utf8_unchecked(self.current_text()) }
// True once the cursor has reached (or passed) end of source.
150 fn at_end(&self) -> bool {
151 self.offset() >= self.source.length()
// Remaining characters; computed in isize first, presumably clamped to zero
// on the missing lines — confirm.
154 fn remaining(&self) -> usize {
155 let r = (self.source.length() as isize) - (self.offset as isize);
// Absolute-index peeks (bodies partly missing from this excerpt).
163 fn peek(&self, i: usize) -> char {
167 fn peek_back(&self, index: usize) -> char {
168 self.source.get(self.offset() - index)
171 fn peek_def(&self, index: usize, default: char) -> char {
172 if index >= self.source.length() {
175 self.source.get(index)

179 // Character classification
// NOTE(review): the bodies of the next three classifiers are missing from
// this excerpt.
181 fn is_whitespace_no_newline(c: char) -> bool {
188 fn is_newline(ch: char) -> bool {
195 fn is_binary_digit(ch: char) -> bool {
/// True for ASCII octal digits `'0'..='7'`.
fn is_octal_digit(c: char) -> bool {
    // Range-contains reads more directly than the chained comparisons and
    // restores the closing brace dropped from the excerpt.
    ('0'..='7').contains(&c)
}
/// True for ASCII decimal digits `'0'..='9'`.
fn is_decimal_digit(ch: char) -> bool {
    // Equivalent to `'0' <= ch && ch <= '9'`; the std predicate states intent.
    ch.is_ascii_digit()
}
/// True for ASCII hexadecimal digits `0-9`, `a-f`, `A-F`.
fn is_hexadecimal_digit(c: char) -> bool {
    // Exactly the three ranges the original spelled out by hand.
    c.is_ascii_hexdigit()
}
/// True for characters that may begin a name: underscore, ASCII letters,
/// and any character at or above `\x7f` (presumably to admit non-ASCII
/// identifier bytes — confirm against the Hack lexical grammar).
fn is_name_nondigit(c: char) -> bool {
    c == '_' || c.is_ascii_alphabetic() || c >= '\x7f'
}
// True for characters valid *inside* a name: name-nondigits plus decimal
// digits. NOTE(review): the first operand of the disjunction (line 219,
// presumably `Self::is_name_nondigit(c)`) is missing from this excerpt.
218 fn is_name_letter(c: char) -> bool {
220 || ('0' <= c && c <= '9')
221 || ('a' <= c && c <= 'z')
222 || ('A' <= c && c <= 'Z')

// Returns the first offset at or after the cursor whose character fails `p`;
// does not mutate the lexer.
228 fn skip_while_to_offset(&self, p: &Fn(char) -> bool) -> usize {
229 let n = self.source.length();
230 let mut i = self.offset();
231 while i < n && p(self.peek(i)) {

237 // advance offset as long as the predicate is true
238 fn skip_while(&mut self, p: &Fn(char) -> bool) {
239 self.with_offset(self.skip_while_to_offset(p))

// Static variant of skip_while over an arbitrary byte slice.
242 fn str_skip_while(s: &[u8], mut i: usize, p: &Fn(char) -> bool) -> usize {
245 if i < n && p(s[i] as char) {

// Skips spaces/tabs but not newlines.
253 fn skip_whitespace(&mut self) {
254 self.skip_while(&Self::is_whitespace_no_newline);
257 fn str_skip_whitespace(s: &[u8], i: usize) -> usize {
258 Self::str_skip_while(s, i, &Self::is_whitespace_no_newline)

261 fn not_newline(ch: char) -> bool {
262 !(Self::is_newline(ch))
// Consumes everything up to (not including) the next newline.
265 fn skip_to_end_of_line(&mut self) {
266 self.skip_while(&Self::not_newline)

// Like skip_to_end_of_line, but also stops before a '?>' close tag.
269 fn skip_to_end_of_line_or_end_tag(&mut self) {
270 let n = self.source.length();
271 let peek_def = |i| if i < n { self.peek(i) } else { INVALID };
273 let should_stop = |i| {
275 let ch = self.peek(i);
276 Self::is_newline(ch) || (ch == '?' && peek_def(i + 1) == '>')
279 let mut i = self.offset();
280 while !(should_stop(i)) {

// Consumes the tail of a name (letters/digits/underscore).
286 fn skip_name_end(&mut self) {
287 self.skip_while(&Self::is_name_letter)

// Consumes one line terminator: '\n', or '\r' optionally followed by '\n'.
290 fn skip_end_of_line(&mut self) {
291 match self.peek_char(0) {
292 '\n' => self.advance(1),
294 if self.peek_char(1) == '\n' {
// Consumes one name; precondition (asserted) is that the cursor sits on a
// name-nondigit character.
304 fn scan_name_impl(&mut self) {
305 assert!(Self::is_name_nondigit(self.peek_char(0)));
307 self.skip_name_end();

// Scans a name token. NOTE(review): the returned TokenKind line is missing
// from this excerpt.
310 fn scan_name(&mut self) -> TokenKind {
311 self.scan_name_impl();

// Scans a '$'-prefixed variable token; asserts the leading '$'.
315 fn scan_variable(&mut self) -> TokenKind {
316 assert_eq!('$', self.peek_char(0));
318 self.scan_name_impl();
322 fn scan_with_underscores(&mut self, accepted_char: &Fn(char) -> bool) {
323 let n = self.source.length();
324 let peek_def = |i| if i < n { self.peek(i) } else { INVALID };
325 let mut i = self.offset();
327 let ch = self.peek(i);
328 if accepted_char(ch) {
330 } else if ch == ' ' && accepted_char(peek_def(i + 1)) {
// Thin wrappers pairing each digit class with plain or underscore-separated
// scanning.
339 fn scan_decimal_digits(&mut self) {
340 self.skip_while(&Self::is_decimal_digit)
343 fn scan_decimal_digits_with_underscores(&mut self) {
344 self.scan_with_underscores(&Self::is_decimal_digit);
347 fn scan_octal_digits(&mut self) {
348 self.skip_while(&Self::is_octal_digit)
351 fn scan_octal_digits_with_underscores(&mut self) {
352 self.scan_with_underscores(&Self::is_octal_digit)
355 fn scan_binary_digits_with_underscores(&mut self) {
356 self.scan_with_underscores(&Self::is_binary_digit)
359 fn scan_hexadecimal_digits(&mut self) {
360 self.skip_while(&Self::is_hexadecimal_digit)
363 fn scan_hexadecimal_digits_with_underscores(&mut self) {
364 self.scan_with_underscores(&Self::is_hexadecimal_digit)
367 fn scan_hex_literal(&mut self) -> TokenKind {
368 let ch = self.peek_char(0);
369 if !Self::is_hexadecimal_digit(ch) {
370 self.with_error(Errors::error0001);
371 TokenKind::HexadecimalLiteral
373 self.scan_hexadecimal_digits_with_underscores();
374 TokenKind::HexadecimalLiteral
378 fn scan_binary_literal(&mut self) -> TokenKind {
379 let ch = self.peek_char(0);
380 if !Self::is_binary_digit(ch) {
381 self.with_error(Errors::error0002);
382 TokenKind::BinaryLiteral
384 self.scan_binary_digits_with_underscores();
385 TokenKind::BinaryLiteral
// Scans the exponent part of a float ('e'/'E', optional sign, digits).
// NOTE(review): the advance-past-sign lines (392-395) are missing from this
// excerpt.
389 fn scan_exponent(&mut self) -> TokenKind {
390 let ch = self.peek_char(1);
391 if ch == '+' || ch == '-' {
396 let ch = self.peek_char(0);
397 if !Self::is_decimal_digit(ch) {
398 self.with_error(Errors::error0003);
399 TokenKind::FloatingLiteral
401 self.scan_decimal_digits();
402 TokenKind::FloatingLiteral

// Scans the fraction (and optional exponent) after a '.' has been seen.
406 fn scan_after_decimal_point(&mut self) -> TokenKind {
408 self.scan_decimal_digits();
409 let ch = self.peek_char(0);
410 if ch == 'e' || ch == 'E' {
413 TokenKind::FloatingLiteral

417 fn scan_octal_or_float(&mut self) -> TokenKind {
418 // We've scanned a leading zero.
419 // We have an irritating ambiguity here. 09 is not a legal octal or
420 // floating literal, but 09e1 and 09.1 are.
422 let ch = self.peek_char(0);
427 self.scan_after_decimal_point()
434 _ if '0' <= ch && ch <= '9' => {
// Speculatively scan with two cloned lexers: one taking only octal digits,
// one taking all decimal digits, then compare how far each got.
436 let mut lexer_oct = self.clone();
437 lexer_oct.scan_octal_digits();
439 let mut lexer_dec = self.clone();
440 lexer_dec.scan_decimal_digits();
441 if (lexer_oct.width()) == (lexer_dec.width()) {
442 // Only octal digits. Could be an octal literal, or could
444 let ch = lexer_oct.peek_char(0);
445 if ch == 'e' || ch == 'E' {
446 self.continue_from(lexer_oct);
448 } else if ch == '.' {
449 self.continue_from(lexer_oct);
450 self.scan_after_decimal_point()
452 // This is irritating - we only want to allow underscores for integer
453 // literals. Deferring the lexing with underscores here allows us to
454 // make sure we're not dealing with floats.
455 self.continue_from(lexer_oct);
456 self.scan_octal_digits_with_underscores();
457 TokenKind::OctalLiteral
460 // We had decimal digits following a leading zero; this is either a
461 // float literal or an octal to be truncated at the first non-octal
463 let ch = lexer_dec.peek_char(0);
464 if ch == 'e' || ch == 'E' {
465 self.continue_from(lexer_dec);
467 } else if ch == '.' {
468 self.continue_from(lexer_dec);
469 self.scan_after_decimal_point()
471 // an octal to be truncated at the first non-octal digit
472 // Again we defer the lexing with underscores here
473 self.scan_decimal_digits_with_underscores();
474 TokenKind::OctalLiteral
479 // 0 is a decimal literal
481 TokenKind::DecimalLiteral

// Scans a number starting with a non-zero digit; same clone-and-commit
// technique as scan_octal_or_float.
486 fn scan_decimal_or_float(&mut self) -> TokenKind {
487 // We've scanned a leading non-zero digit.
488 let mut lexer_no_underscores = self.clone();
489 lexer_no_underscores.scan_decimal_digits();
490 let mut lexer_with_underscores = self.clone();
491 lexer_with_underscores.scan_decimal_digits_with_underscores();
492 let ch = lexer_no_underscores.peek_char(0);
497 self.continue_from(lexer_no_underscores);
498 self.scan_after_decimal_point()
503 self.continue_from(lexer_no_underscores);
509 self.continue_from(lexer_with_underscores);
510 TokenKind::DecimalLiteral
// Scans a complete single-quoted string literal; the loop (partially
// missing here) walks until the closing quote, recording at most one
// occurrence of each error kind.
515 fn scan_single_quote_string_literal(&mut self) -> TokenKind {
516 // TODO: What about newlines embedded?
518 // single-quoted-string-literal::
519 // b-opt ' sq-char-sequence-opt '
521 // TODO: What is this b-opt? We don't lex an optional 'b' before a literal.
523 // sq-char-sequence::
525 // sq-char-sequence sq-char
528 // sq-escape-sequence
529 // \opt any character except single-quote (') or backslash (\)
531 // sq-escape-sequence:: one of
533 let n = self.source.length();
534 let peek = |x| self.source.get(x);
// Error flags deferred so each error is reported at most once per literal.
536 let mut has_error0012 = false;
537 let mut has_error0006 = false;
539 let mut i = 1 + self.offset();
540 let new_offset = loop {
542 has_error0012 = true;
548 has_error0006 = true;
552 '\'' => break (1 + i),
559 self.with_error(Errors::error0006)
562 self.with_error(Errors::error0012)
565 self.with_offset(new_offset);
566 TokenKind::SingleQuotedStringLiteral

// Consumes a \xHH escape; tolerant of malformed escapes (errors are only
// TODOs, per the comments retained from the original).
569 fn scan_hexadecimal_escape(&mut self) {
570 let ch2 = self.peek_char(2);
571 let ch3 = self.peek_char(3);
572 if !(Self::is_hexadecimal_digit(ch2)) {
573 // TODO: Consider producing an error for a malformed hex escape
574 // let lexer = with_error lexer SyntaxError.error0005 in
576 } else if !(Self::is_hexadecimal_digit(ch3)) {
577 // let lexer = with_error lexer SyntaxError.error0005 in

// Consumes a \u{...} escape, tolerating malformed forms.
584 fn scan_unicode_escape(&mut self) {
585 // At present the lexer is pointing at \u
586 if self.peek_char(2) == '{' {
587 if self.peek_char(3) == '$' {
588 // We have a malformed unicode escape that contains a possible embedded
589 // expression. Eat the \u and keep on processing the embedded expression.
590 // TODO: Consider producing a warning for a malformed unicode escape.
593 // We have a possibly well-formed escape sequence, and at least we know
594 // that it is not an embedded expression.
595 // TODO: Consider producing an error if the digits are out of range
596 // of legal Unicode characters.
597 // TODO: Consider producing an error if there are no digits.
598 // Skip over the slash, u and brace, and start lexing the number.
600 self.scan_hexadecimal_digits();
601 let ch = self.peek_char(0);
603 // TODO: Consider producing a warning for a malformed unicode escape.
610 // We have a malformed unicode escape sequence. Bail out.
611 // TODO: Consider producing a warning for a malformed unicode escape.

// Fast-forwards over characters that cannot start an escape, an embedded
// expression, a digit, a name, or the closing delimiter.
616 fn skip_uninteresting_double_quote_like_string_characters(&mut self, start_char: char) {
617 let is_uninteresting = |ch| match ch {
618 INVALID | '\\' | '$' | '{' | '[' | ']' | '-' => false,
619 ch if '0' <= ch && ch <= '9' => false,
620 ch => ch != start_char && !Self::is_name_nondigit(ch),
622 self.skip_while(&is_uninteresting);

// Scans an integer literal appearing inside a string (e.g. an array index).
625 fn scan_integer_literal_in_string(&mut self) -> TokenKind {
626 if self.peek_char(0) == '0' {
627 match self.peek_char(1) {
630 self.scan_hex_literal()
634 self.scan_binary_literal()
637 // An integer literal starting with 0 in a string will actually
638 // always be treated as a string index in HHVM, and not as an octal.
639 // In such a case, HHVM actually scans all decimal digits to create the
640 // token. TODO: (kasper) T40381519 we may want to change this behavior to something more
642 self.scan_decimal_digits_with_underscores();
643 TokenKind::DecimalLiteral
647 self.scan_decimal_digits_with_underscores();
648 TokenKind::DecimalLiteral

// Scans the opening portion of a double-quoted (or backtick) literal.
652 fn scan_double_quote_like_string_literal_from_start(&mut self, start_char: char) -> TokenKind {
653 let literal_token_kind = TokenKind::DoubleQuotedStringLiteral;
654 let head_token_kind = TokenKind::DoubleQuotedStringLiteralHead;
657 // If there's nothing interesting in this double-quoted string then
658 // we can just hand it back as-is.
659 self.skip_uninteresting_double_quote_like_string_characters(start_char);
660 match self.peek_char(0) {
662 // If the string is unterminated then give an error; if this is an
663 // embedded zero character then give an error and recurse; we might
664 // be able to make more progress.
666 self.with_error(Errors::error0012);
667 break literal_token_kind;
669 self.with_error(Errors::error0006);
674 // We made it to the end without finding a special character.
676 break literal_token_kind;
679 // We've found a backslash, dollar or brace.
681 break head_token_kind;

687 fn is_heredoc_tail(&self, name: &[u8]) -> bool {
688 // A heredoc tail is the identifier immediately preceded by a newline
689 // and immediately followed by an optional semi and then a newline.
691 // Note that the newline and optional semi are not part of the literal;
692 // the literal's lexeme ends at the end of the name. Either there is
693 // no trivia and the next token is a semi-with-trailing-newline, or
694 // the trailing trivia is a newline.
696 // This odd rule is to ensure that both
706 // . "something else";
709 if !(Self::is_newline(self.peek_back(1))) {
712 let len = name.len();
713 let ch0 = self.peek_char(len);
714 let ch1 = self.peek_char(len + 1);
715 ((Self::is_newline(ch0)) || ch0 == ';' && (Self::is_newline(ch1)))
716 && self.peek_string(len) == name

// Maps a literal kind to the token kind used for its terminating piece.
720 fn get_tail_token_kind(&self, literal_kind: &StringLiteralKind) -> TokenKind {
722 StringLiteralKind::LiteralHeredoc { heredoc: _ } => TokenKind::HeredocStringLiteralTail,
723 StringLiteralKind::LiteralDoubleQuoted => TokenKind::DoubleQuotedStringLiteralTail,

727 fn get_string_literal_body_or_double_quoted_tail(
729 literal_kind: &StringLiteralKind,
731 if literal_kind == &StringLiteralKind::LiteralDoubleQuoted {
732 TokenKind::DoubleQuotedStringLiteralTail
734 TokenKind::StringLiteralBody
// Resumes scanning inside a double-quoted or heredoc literal that was split
// by an embedded expression, returning the next literal piece's token kind.
// NOTE(review): many match arms and their pattern lines are missing from
// this excerpt; comments annotate only the visible fragments.
738 fn scan_string_literal_in_progress(&mut self, literal_kind: &StringLiteralKind) -> TokenKind {
739 let (is_heredoc, name): (bool, &[u8]) = match literal_kind {
740 StringLiteralKind::LiteralHeredoc { heredoc } => (true, &heredoc),
741 _ => (false, "".as_bytes()),
743 let start_char = '"';
744 let ch0 = self.peek_char(0);
// A name at the cursor: either the heredoc terminator, or an ordinary name.
745 if Self::is_name_nondigit(ch0) {
746 if is_heredoc && (self.is_heredoc_tail(name)) {
747 self.scan_name_impl();
748 TokenKind::HeredocStringLiteralTail
750 self.scan_name_impl();
757 self.with_error(Errors::error0012);
758 self.get_tail_token_kind(literal_kind)
760 self.with_error(Errors::error0006);
762 self.skip_uninteresting_double_quote_like_string_characters(start_char);
763 TokenKind::StringLiteralBody
767 let kind = self.get_string_literal_body_or_double_quoted_tail(literal_kind);
772 if Self::is_name_nondigit(self.peek_char(1)) {
// Backslash handling: dispatch on the escaped character.
784 match self.peek_char(1) {
785 // In these cases we just skip the escape sequence and
786 // keep on scanning for special characters.
787 | '\\' | '"' | '$' | 'e' | 'f' | 'n' | 'r' | 't' | 'v' | '`'
788 // Same in these cases; there might be more octal characters following but
789 // if there are, we'll just eat them as normal characters.
790 | '0' | '1' | '2' | '3' | '4' | '5' | '6' | '7' => {
792 self.skip_uninteresting_double_quote_like_string_characters(start_char);
793 TokenKind::StringLiteralBody}
795 self.scan_hexadecimal_escape();
796 self.skip_uninteresting_double_quote_like_string_characters(start_char);
797 TokenKind::StringLiteralBody }
799 self.scan_unicode_escape();
800 self.skip_uninteresting_double_quote_like_string_characters(start_char);
801 TokenKind::StringLiteralBody }
803 // The rules for escaping open braces in Hack are bizarre. Suppose we
808 // What is the value of $z? Naively you would think that the backslash
809 // escapes the braces, and the variables are embedded, so {123,456}. But
810 // that's not what happens. Yes, the backslash makes the brace no longer
811 // the opening brace of an expression. But the backslash is still part
812 // of the string! This is the string \{123,456\}.
813 // TODO: We might want to fix this because this is very strange.
814 // Eat the backslash and the brace.
816 TokenKind::StringLiteralBody
819 // TODO: A backslash followed by something other than an escape sequence
820 // is legal in hack, and treated as though it was just the backslash
821 // and the character. However we might consider making this a warning.
822 // It is particularly egregious when we have something like:
825 // The author of the code likely means the backslash to mean line
826 // continuation but in fact it just means to put a backslash and newline
829 self.skip_uninteresting_double_quote_like_string_characters(start_char);
830 TokenKind::StringLiteralBody
// Bracket / arrow tokens surface subscripting and member access inside
// interpolated strings.
836 TokenKind::LeftBracket
840 TokenKind::RightBracket
843 if (self.peek_char(1)) == '>' {
845 TokenKind::MinusGreaterThan
847 // Nothing interesting here. Skip it and find the next
848 // interesting character.
850 self.skip_uninteresting_double_quote_like_string_characters(start_char);
851 TokenKind::StringLiteralBody
// Digits: speculatively scan an integer literal on a clone; commit only if
// it produced no new errors.
854 ch if '0' <= ch && ch <= '9' => {
855 let mut lexer1 = self.clone();
856 let literal = lexer1.scan_integer_literal_in_string();
858 if self.errors.len() == lexer1.errors.len() {
859 self.continue_from(lexer1);
862 // If we failed to scan a literal, do not interpret the literal
863 self.with_offset(lexer1.offset());
864 TokenKind::StringLiteralBody
868 // Nothing interesting here. Skip it and find the next
869 // interesting character.
871 self.skip_uninteresting_double_quote_like_string_characters(start_char);
872 TokenKind::StringLiteralBody
877 // A heredoc string literal has the form
885 // <<< (optional whitespace) name (no whitespace) (newline)
887 // The optional body is:
889 // any characters whatsoever including newlines (newline)
893 // (no whitespace) name (no whitespace) (optional semi) (no whitespace) (newline)
895 // The names must be identical. The trailing semi and newline must be present.
897 // The body is any and all characters, up to the first line that exactly matches
900 // The body may contain embedded expressions.
902 // A nowdoc string literal has the same form except that the first name is
903 // enclosed in single quotes, and it may not contain embedded expressions.
// Scans the heredoc/nowdoc terminator name and returns its bytes; reports
// error0008 when the cursor is not on a valid name start.
904 fn scan_docstring_name_actual(&mut self) -> &'a [u8] {
905 let ch = self.peek_char(0);
906 if Self::is_name_nondigit(ch) {
907 let start_offset = self.offset();
909 self.skip_name_end();
910 let name = self.source.sub(start_offset, self.offset() - start_offset);
913 self.with_error(Errors::error0008);

// Scans the (possibly quoted) docstring name; single quotes select nowdoc,
// otherwise heredoc.
918 fn scan_docstring_name(&mut self) -> (&'a [u8], TokenKind) {
919 self.skip_whitespace();
920 let ch = self.peek_char(0);
921 let kind = if ch == '\'' {
922 TokenKind::NowdocStringLiteral
924 TokenKind::HeredocStringLiteral
927 let name = if ch == '\'' {
929 let name = self.scan_docstring_name_actual();
930 if (self.peek_char(0)) == '\'' {
934 self.with_error(Errors::error0010);
938 // Starting with PHP 5.3.0, the opening Heredoc identifier
939 // may optionally be enclosed in double quotes:
943 let name = self.scan_docstring_name_actual();
945 // same logic as above, just for double quote
946 if self.peek_char(0) == '\"' {
949 self.with_error(Errors::missing_double_quote)

// Scans '<<<' (or 'b<<<'), the name, and the mandatory trailing newline.
957 fn scan_docstring_header(&mut self) -> (&'a [u8], TokenKind) {
958 let ch = self.peek_char(0);
959 // Skip 3 for <<< or 4 for b<<<
960 let skip_count = if ch == 'b' { 4 } else { 3 };
961 self.advance(skip_count);
962 let (name, kind) = self.scan_docstring_name();
963 let ch = self.peek_char(0);
964 if !Self::is_newline(ch) {
965 self.with_error(Errors::error0011)
967 self.skip_to_end_of_line();
968 self.skip_end_of_line();

// Consumes body lines until one starts with `name` (optionally followed by
// ';') and a newline; errors with error0011 on running off the end.
972 fn scan_docstring_remainder(&mut self, name: &[u8]) {
973 let len = name.len();
975 let ch0 = self.peek_char(len);
976 let ch1 = self.peek_char(len + 1);
977 if ((Self::is_newline(ch0)) || ch0 == ';' && (Self::is_newline(ch1)))
978 && self.peek_string(len as usize) == name
980 self.advance(len as usize);
983 self.skip_to_end_of_line();
984 let ch = self.peek_char(0);
985 if Self::is_newline(ch) {
986 self.skip_end_of_line()
988 // If we got here then we ran off the end of the file without
989 // finding a newline. Just bail.
990 self.with_error(Errors::error0011);

// Entry point: header then body/terminator.
997 fn scan_docstring_literal(&mut self) -> TokenKind {
998 let (name, kind) = self.scan_docstring_header();
999 self.scan_docstring_remainder(name);
1003 fn scan_xhp_label(&mut self) {
1004 // An XHP label has the same grammar as a Hack name.
1005 let _: TokenKind = self.scan_name();

1008 fn scan_xhp_element_name(&mut self, attribute: bool) -> TokenKind {
1009 // An XHP element name is a sequence of one or more XHP labels each separated
1010 // by a single : or -. Note that it is possible for an XHP element name to be
1011 // followed immediately by a : or - that is the next token, so if we find
1012 // a : or - not followed by a label, we need to terminate the token.
1013 self.scan_xhp_label();
1014 let ch0 = self.peek_char(0);
1015 let ch1 = self.peek_char(1);
// Recurses to consume further label segments; ':' separators are only
// allowed outside attribute position.
1016 if (!attribute && ch0 == ':' || ch0 == '-') && Self::is_name_nondigit(ch1) {
1018 self.scan_xhp_element_name(false)
1020 TokenKind::XHPElementName

1024 // Is the next token we're going to lex a possible xhp class name?
1025 fn is_xhp_class_name(&self) -> bool {
1026 (self.peek_char(0) == ':') && (Self::is_name_nondigit(self.peek_char(1)))

1029 fn scan_xhp_class_name(&mut self) -> TokenKind {
1030 // An XHP class name is a colon followed by an xhp name.
1031 if self.is_xhp_class_name() {
1033 self.scan_xhp_element_name(false);
1034 TokenKind::XHPClassName
1036 self.with_error(Errors::error0008);
1038 TokenKind::ErrorToken

1042 fn scan_xhp_string_literal(&mut self) -> TokenKind {
1043 // XHP string literals are just straight up "find the closing quote"
1044 // strings. Embedded newlines are legal.
1045 let mut offset: usize = 1;
1047 match self.peek_char(offset) {
1049 self.advance(offset);
1051 self.with_error(Errors::error0012);
1052 return TokenKind::XHPStringLiteral;
1054 self.with_error(Errors::error0006);
1059 self.advance(offset + 1);
1060 return TokenKind::XHPStringLiteral;
1062 _ => offset = offset + 1,

1067 // Note that this does not scan an XHP body
1068 fn scan_xhp_token(&mut self) -> TokenKind {
1069 // TODO: HHVM requires that there be no trivia between < and name in an
1070 // opening tag, but does allow trivia between </ and name in a closing tag.
1071 // Consider allowing trivia in an opening tag.
1072 let ch0 = self.peek_char(0);
1073 if ch0 == INVALID && self.at_end() {
1074 TokenKind::EndOfFile
1075 } else if Self::is_name_nondigit(ch0) {
1076 self.scan_xhp_element_name(false)
1081 TokenKind::LeftBrace
1085 TokenKind::RightBrace
1092 if (self.peek_char(1)) == '/' {
1094 TokenKind::LessThanSlash
1100 '"' => self.scan_xhp_string_literal(),
1102 if (self.peek_char(1)) == '>' {
1104 TokenKind::SlashGreaterThan
1106 self.with_error(Errors::error0006);
1108 TokenKind::ErrorToken
1113 TokenKind::GreaterThan
1116 self.with_error(Errors::error0006);
1118 TokenKind::ErrorToken

// Consumes an XHP comment through its '-->' terminator; error0014 on EOF.
1124 fn scan_xhp_comment(&mut self) {
1127 let ch0 = self.peek_char(offset);
1128 let ch1 = self.peek_char(offset + 1);
1129 let ch2 = self.peek_char(offset + 2);
1130 match (ch0, ch1, ch2) {
1131 (INVALID, _, _) => {
1132 self.advance(offset as usize);
1133 return self.with_error(Errors::error0014);
1135 ('-', '-', '>') => return self.advance((offset + 3) as usize),
1136 _ => offset = offset + 1,

1140 fn scan_xhp_body(&mut self) -> TokenKind {
1141 // Naively you might think that an XHP body is just a bunch of characters,
1142 // terminated by an embedded { } expression or a tag. However, whitespace
1143 // and newlines are relevant in XHP bodies because they are "soft".
1144 // That is, any section of contiguous trivia has the same semantics as a
1145 // single space or newline -- just as in HTML.
1147 // Obviously this is of relevance to code formatters.
1149 // Therefore we detect whitespace and newlines within XHP bodies and treat
1150 // it as trivia surrounding the tokens within the body.
1152 // TODO: Is this also true of whitespace within XHP comments? If so then
1153 // we need to make XHP comments a sequence of tokens, rather than a
1154 // single token as they are now.
1155 let ch0 = self.peek_char(0);
1158 INVALID if self.at_end() => TokenKind::EndOfFile,
1161 TokenKind::LeftBrace
1165 TokenKind::RightBrace
1168 let ch1 = self.peek_char(1);
1169 let ch2 = self.peek_char(2);
1170 let ch3 = self.peek_char(3);
1171 match (ch1, ch2, ch3) {
1172 ('!', '-', '-') => {
1173 self.scan_xhp_comment();
1174 TokenKind::XHPComment
1178 TokenKind::LessThanSlash
1189 let ch = self.peek_char(offset);
1192 self.advance(offset);
1194 self.with_error(Errors::error0013);
1197 self.with_error(Errors::error0006);
1201 '\t' | ' ' | '\r' | '\n' | '{' | '}' | '<' => {
1202 self.advance(offset);
1205 _ => offset = offset + 1,
// Disambiguates '$', '$$' and '$name' per the compromise documented below.
1213 fn scan_dollar_token(&mut self) -> TokenKind {
1214 // We have a problem here. We wish to be able to lexically analyze both
1215 // PHP and Hack, but the introduction of $$ to Hack makes them incompatible.
1216 // "$$x" and "$$ $x" are legal in PHP, but illegal in Hack.
1217 // The rule in PHP seems to be that $ is a prefix operator, it is a token,
1218 // it can be followed by trivia, but the next token has to be another $
1219 // operator, a variable $x, or a {.
1221 // Here's a reasonable compromise. (TODO: Review this decision.)
1223 // $$x lexes as $ $x
1224 // $$$x lexes as $ $ $x
1227 // $$ followed by anything other than a name or a $ lexes as $$.
1229 // This means that lexing a PHP program which contains "$$ $x" is different
1230 // will fail at parse time, but I'm willing to live with that.
1232 // This means that lexing a Hack program which contains
1233 // "$x |> $$instanceof Foo" produces an error as well.
1235 // If these decisions are unacceptable then we will need to make the lexer
1236 // be aware of whether it is lexing PHP or Hack; thus far we have not had
1237 // to make this distinction.
1239 // We are already at $.
1240 let ch1 = self.peek_char(1);
1243 let ch2 = self.peek_char(2);
1244 if ch2 == '$' || ch2 == '{' || Self::is_name_nondigit(ch2) {
1246 TokenKind::Dollar // $$x or $$$
1249 TokenKind::DollarDollar // $$
1253 if Self::is_name_nondigit(ch1) {
1254 self.scan_variable() // $x
1257 TokenKind::Dollar // $
// Main token dispatcher: switches on the first character and hands off to
// the specialized scanners above. `in_type` changes '>'-family and '?:'
// lexing inside generic type argument lists.
// NOTE(review): large portions of this match (patterns, advances, closing
// braces) are missing from this excerpt; only the visible arms are shown.
1263 fn scan_token(&mut self, in_type: bool) -> TokenKind {
1264 let ch0 = self.peek_char(0);
1268 TokenKind::LeftBracket
1272 TokenKind::RightBracket
1276 TokenKind::LeftParen
1280 TokenKind::RightParen
1284 TokenKind::LeftBrace
1288 TokenKind::RightBrace
// '.' may start a float (".5"), '...', or plain '.'.
1290 '.' => match self.peek_char(1) {
1295 ch if '0' <= ch && ch <= '9' => self.scan_after_decimal_point(),
1297 if (self.peek_char(2)) == '.' {
1299 TokenKind::DotDotDot
1310 '-' => match self.peek_char(1) {
1313 TokenKind::MinusEqual
1317 TokenKind::MinusMinus
1321 TokenKind::MinusGreaterThan
1328 '+' => match self.peek_char(1) {
1331 TokenKind::PlusEqual
1342 '*' => match (self.peek_char(1), self.peek_char(2)) {
1345 TokenKind::StarEqual
1349 TokenKind::StarStarEqual
1364 '!' => match (self.peek_char(1), self.peek_char(2)) {
1367 TokenKind::ExclamationEqualEqual
1371 TokenKind::ExclamationEqual
1375 TokenKind::Exclamation
1378 '$' => self.scan_dollar_token(),
1380 if (self.peek_char(1)) == '=' {
1382 TokenKind::SlashEqual
1389 if (self.peek_char(1)) == '=' {
1391 TokenKind::PercentEqual
// '<' family: heredoc opener, shifts, comparisons, spaceship.
1398 match (self.peek_char(1), self.peek_char(2)) {
1399 ('<', '<') => self.scan_docstring_literal(),
1402 TokenKind::LessThanLessThanEqual
1404 // TODO: We lex and parse the spaceship operator.
1405 // TODO: This is not in the spec at present. We should either make it an
1406 // TODO: error, or add it to the specification.
1409 TokenKind::LessThanEqualGreaterThan
1413 TokenKind::LessThanEqual
1417 TokenKind::LessThanLessThan
1426 match (self.peek_char(1), self.peek_char(2)) {
1427 // If we are parsing a generic type argument list then we might be at the >>
1428 // in `List<List<int>>``, or at the >= of `let x:vec<int>=...`. In that case
1429 // we want to lex two >'s instead of >> / one > and one = instead of >=.
1430 (ch, _) if (ch == '>' || ch == '=') && in_type => {
1432 TokenKind::GreaterThan
1436 TokenKind::GreaterThanGreaterThanEqual
1440 TokenKind::GreaterThanGreaterThan
1444 TokenKind::GreaterThanEqual
1448 TokenKind::GreaterThan
1452 '=' => match (self.peek_char(1), self.peek_char(2)) {
1455 TokenKind::EqualEqualEqual
1459 TokenKind::EqualEqualGreaterThan
1463 TokenKind::EqualEqual
1467 TokenKind::EqualGreaterThan
1475 if (self.peek_char(1)) == '=' {
1477 TokenKind::CaratEqual
1483 '|' => match self.peek_char(1) {
1490 TokenKind::BarGreaterThan
1501 '&' => match self.peek_char(1) {
1504 TokenKind::AmpersandEqual
1508 TokenKind::AmpersandAmpersand
1512 TokenKind::Ampersand
1515 '?' => match (self.peek_char(1), self.peek_char(2)) {
1516 (':', _) if !in_type => {
1518 TokenKind::QuestionColon
1522 TokenKind::QuestionMinusGreaterThan
1526 TokenKind::QuestionQuestionEqual
1530 TokenKind::QuestionQuestion
1534 TokenKind::QuestionGreaterThan
1536 ('a', 's') if !Self::is_name_nondigit(self.peek_char(3)) => {
1538 TokenKind::QuestionAs
1546 // In experimental mode only: try to scan for a pocket universes atom
1548 let ch1 = self.peek_char(1);
1552 TokenKind::ColonColon
1553 } else if self.is_experimental_mode && ch1 == '@' {
1563 TokenKind::Semicolon
// Numeric literals: leading '0' selects hex/binary/octal-or-float.
1573 '0' => match self.peek_char(1) {
1576 self.scan_hex_literal()
1580 self.scan_binary_literal()
1582 _ => self.scan_octal_or_float(),
1584 ch if '1' <= ch && ch <= '9' => self.scan_decimal_or_float(),
1585 '\'' => self.scan_single_quote_string_literal(),
1586 '`' => self.scan_double_quote_like_string_literal_from_start('`'),
1587 '"' => self.scan_double_quote_like_string_literal_from_start('"'),
1590 TokenKind::Backslash
// 'b' prefix: b"..." / b'...' / b<<< binary-string forms are re-dispatched.
1593 let c1 = self.peek_char(1);
1594 let c2 = self.peek_char(2);
1595 let c3 = self.peek_char(3);
1596 c1 == '"' || c1 == '\'' || (c1 == '<' && c2 == '<' && c3 == '<')
1600 self.scan_token(in_type)
1604 if ch0 == INVALID && self.at_end() {
1605 TokenKind::EndOfFile
1606 } else if Self::is_name_nondigit(ch0) {
1609 self.with_error(Errors::error0006);
1611 TokenKind::ErrorToken
// Convenience wrappers fixing the `in_type` context of scan_token.
1617 fn scan_token_outside_type(&mut self) -> TokenKind {
1618 self.scan_token(false)
1621 fn scan_token_inside_type(&mut self) -> TokenKind {
1622 self.scan_token(true)
1629 // white-space-character::
1631 // Space character (U+0020)
1632 // Horizontal-tab character (U+0009)
1634 // single-line-comment::
1635 // // input-characters-opt
1636 // # input-characters-opt
1639 // Carriage-return character (U+000D)
1640 // Line-feed character (U+000A)
1641 // Carriage-return character followed by line-feed character
// Returns the index just past the end-of-line sequence beginning at `i`:
// "\r\n" counts as one two-byte end of line; a lone "\n" as one byte.
// Panics if `i` does not point at an end-of-line character (caller contract).
1643 fn str_scan_end_of_line(s: &[u8], i: usize) -> usize {
1644 match s.get(i).map(|x| *x as char) {
1646 Some('\r') => match s.get(i + 1).map(|x| *x as char) {
1647 Some('\n') => 2 + i,
// NOTE(review): the arm for a lone '\r' is not visible in this excerpt;
// presumably it yields i + 1 — confirm against the full source.
1650 Some('\n') => i + 1,
1651 _ => panic!("str_scan_end_of_line called while not on end of line!"),
// Builds an end-of-line trivium for the EOL sequence at the current offset.
// Panics if the current character is not an end-of-line character.
1655 fn scan_end_of_line(&mut self) -> Token::Trivia {
1656 match self.peek_char(0) {
// '\r' optionally followed by '\n' forms a single EOL of width 1 or 2.
1658 let w = if self.peek_char(1) == '\n' { 2 } else { 1 };
1660 Token::Trivia::make_eol(self.source(), self.start, w)
1664 Token::Trivia::make_eol(self.source(), self.start, 1)
1666 _ => panic!("scan_end_of_line called while not on end of line!"),
// Scans a '#'-style single-line comment: consumes up to (not including)
// the end of line, then wraps the lexeme as single-line-comment trivia.
1670 fn scan_hash_comment(&mut self) -> Token::Trivia {
1671 self.skip_to_end_of_line();
1672 Token::Trivia::make_single_line_comment(self.source(), self.start, self.width())
// Scans a `//` comment, distinguishing the special FALLTHROUGH marker
// comment (used to suppress switch-fallthrough errors) from an ordinary one.
1675 fn scan_single_line_comment(&mut self) -> Token::Trivia {
1676 // A fallthrough comment is two slashes, any amount of whitespace,
1677 // FALLTHROUGH, and any characters may follow.
1678 // TODO: Consider allowing lowercase fallthrough.
1681 self.skip_whitespace();
// Snapshot the lexer right after the post-slash whitespace so we can
// later test whether the comment body starts with "FALLTHROUGH".
1682 let lexer_ws = self.clone();
1683 self.skip_to_end_of_line_or_end_tag();
1684 let w = self.width();
// `remainder` = number of bytes in the comment body after the whitespace;
// it must be at least 11 ("FALLTHROUGH".len()) for the marker to fit.
1685 let remainder = self.offset - lexer_ws.offset;
1686 if remainder >= 11 && lexer_ws.peek_string(11) == "FALLTHROUGH".as_bytes() {
1687 Token::Trivia::make_fallthrough(self.source(), self.start, w)
1689 Token::Trivia::make_single_line_comment(self.source(), self.start, w)
// Advances past the closing "*/" of a delimited comment; if end of file is
// reached first, records error0007 (unterminated comment) instead.
1693 fn skip_to_end_of_delimited_comment(&mut self) {
1696 let ch0 = self.peek_char(offset);
// INVALID char here means either true end-of-file (unterminated comment)
// or an embedded NUL; the at_end branch below distinguishes the two.
1698 self.advance(offset);
1700 return self.with_error(Errors::error0007);
1702 // TODO: Do we want to give a warning for an embedded zero char
1703 // inside a comment?
// "*/" closes the comment: consume through both characters and return.
1706 } else if ch0 == '*' && (self.peek_char(offset + 1)) == '/' {
1707 return self.advance(offset + 2);
// Scans a /* ... */ comment and classifies it as an HH_FIXME pragma, an
// HH_IGNORE_ERROR pragma, or a plain delimited comment, based on the text
// that follows the opening "/*" and any whitespace.
1714 fn scan_delimited_comment(&mut self) -> Token::Trivia {
1715 // The original lexer lexes a fixme / ignore error as:
1717 // slash star [whitespace]* HH_FIXME [whitespace or newline]* leftbracket
1718 // [whitespace or newline]* integer [any text]* star slash
1720 // Notice that the original lexer oddly enough does not verify that there
1721 // is a right bracket.
1723 // For our purposes we will just check for HH_FIXME / HH_IGNORE_ERROR;
1724 // a later pass can try to parse out the integer if there is one,
1725 // give a warning if there is not, and so on.
1728 self.skip_whitespace();
// Snapshot taken after the leading whitespace so the pragma keyword can be
// matched even though `self` then consumes the rest of the comment.
1730 let lexer_ws = self.clone();
1731 self.skip_to_end_of_delimited_comment();
1732 let w = self.width();
1733 if lexer_ws.match_string("HH_FIXME".as_bytes()) {
1734 Token::Trivia::make_fix_me(self.source(), self.start, w)
1735 } else if lexer_ws.match_string("HH_IGNORE_ERROR".as_bytes()) {
1736 Token::Trivia::make_ignore_error(self.source(), self.start, w)
1738 Token::Trivia::make_delimited_comment(self.source(), self.start, w)
// Scans at most one piece of Hack trivia (comment, whitespace, or end of
// line) at the current position; returns None when the next character
// begins a real token (the fall-through arms are not visible here).
1742 fn scan_php_trivia(&mut self) -> Option<Token::Trivia> {
1743 // Hack does not support PHP style embedded markup:
1751 // However, ?> is never legal in Hack, so we can treat ?> ... any text ... <?php
1752 // as a comment, and then give an error saying that this feature is not supported
1755 // TODO: Give an error if this appears in a Hack program.
1756 match self.peek_char(0) {
// '#'-style comment.
1758 self.start_new_lexeme();
1759 Some(self.scan_hash_comment())
// '/': either a // single-line or a /* */ delimited comment.
1762 self.start_new_lexeme();
1763 match self.peek_char(1) {
1764 '/' => Some(self.scan_single_line_comment()),
1765 '*' => Some(self.scan_delimited_comment()),
// Run of horizontal whitespace: measure it without per-char advancing.
1770 let new_end = Self::str_skip_whitespace(self.source_text_string(), self.offset);
1771 let new_start = self.offset;
1773 Token::Trivia::make_whitespace(self.source(), new_start, new_end - new_start);
1774 self.with_start_offset(new_start, new_end);
// End-of-line trivia.
1778 self.start_new_lexeme();
1779 Some(self.scan_end_of_line())
1782 self.start_new_lexeme();
// Scans at most one piece of XHP trivia — whitespace or an end of line.
// XHP bodies do not have comments-as-trivia (see TODO below).
1789 fn scan_xhp_trivia(&mut self) -> Option<Token::Trivia> {
1790 // TODO: Should XHP comments <!-- --> be their own thing, or a kind of
1791 // trivia associated with a token? Right now they are the former.
1792 let i = self.offset;
1793 let ch = self.peek_char(0);
// Whitespace run: compute its end in one pass over the raw source bytes.
1796 let j = Self::str_skip_whitespace(self.source_text_string(), i);
1797 self.with_start_offset(i, j);
1798 Some(Token::Trivia::make_whitespace(self.source(), i, j - i))
// End-of-line sequence ("\r\n", "\r", or "\n").
1801 let j = Self::str_scan_end_of_line(self.source_text_string(), i);
1802 self.with_start_offset(i, j);
1803 Some(Token::Trivia::make_eol(self.source(), i, j - i))
// Anything else is not trivia; presumably the missing arm returns None.
1808 self.start_new_lexeme();
1814 // We divide trivia into "leading" and "trailing" trivia of an associated
1815 // token. This means that we must find a dividing line between the trailing trivia
1816 // following one token and the leading trivia of the following token. Plainly
1817 // we need only find this line while scanning trailing trivia. The heuristics
1819 // * The first newline trivia encountered is the last trailing trivia.
1820 // * The newline which follows a // or # comment is not part of the comment
1821 // but does terminate the trailing trivia.
1822 // * A pragma to turn checks off (HH_FIXME and HH_IGNORE_ERROR) is
1823 // always a leading trivia.
// Repeatedly applies `scanner` and accumulates the trivia it produces,
// stopping when it yields None (loop construct not visible in this excerpt).
1824 fn scan_leading_trivia(
1826 scanner: &Fn(&mut Self) -> Option<Token::Trivia>,
1827 ) -> Vec<Token::Trivia> {
1828 let mut acc = vec![];
1830 match scanner(self) {
1832 Some(t) => acc.push(t),
// Collects all leading trivia before a Hack token.
1837 pub fn scan_leading_php_trivia(&mut self) -> Vec<Token::Trivia> {
1838 self.scan_leading_trivia(&Self::scan_php_trivia)
// Collects all leading trivia before an XHP body token.
1841 pub fn scan_leading_xhp_trivia(&mut self) -> Vec<Token::Trivia> {
1842 self.scan_leading_trivia(&Self::scan_xhp_trivia)
// Collects trailing trivia after a token, applying the dividing-line rules
// documented above: each piece is scanned speculatively on a cloned lexer
// and only committed (via continue_from) when it belongs to the trailing set.
1845 fn scan_trailing_trivia(
1847 scanner: &Fn(&mut Self) -> Option<Token::Trivia>,
1848 ) -> Vec<Token::Trivia> {
1849 let mut acc = vec![];
// Speculative scan: `lexer1` advances; `self` commits only on acceptance.
1851 let mut lexer1 = self.clone();
1852 match scanner(&mut lexer1) {
1854 self.continue_from(lexer1);
1857 Some(t) => match t.kind() {
// First newline terminates the trailing trivia (it is still included).
1858 TriviaKind::EndOfLine => {
1859 self.continue_from(lexer1);
// FixMe / IgnoreError pragmas always belong to the NEXT token's leading
// trivia; the (not fully visible) handling here must leave them unconsumed.
1863 TriviaKind::FixMe | TriviaKind::IgnoreError => {
1867 self.continue_from(lexer1);
// Collects trailing trivia after a Hack token.
1875 pub fn scan_trailing_php_trivia(&mut self) -> Vec<Token::Trivia> {
1876 self.scan_trailing_trivia(&Self::scan_php_trivia)
// Collects trailing trivia after an XHP body token.
1879 pub fn scan_trailing_xhp_trivia(&mut self) -> Vec<Token::Trivia> {
1880 self.scan_trailing_trivia(&Self::scan_xhp_trivia)
// Lookahead: true if, after any leading trivia, the next character can
// start a name. Works on a clone so `self` is left unchanged.
1883 pub fn is_next_name(&self) -> bool {
1884 let mut lexer = self.clone();
1885 lexer.scan_leading_php_trivia();
1886 Self::is_name_nondigit(lexer.peek_char(0))
// Lookahead: true if, after any leading trivia, an XHP class name follows.
// Non-mutating: operates on a clone of the lexer.
1889 pub fn is_next_xhp_class_name(&self) -> bool {
1890 let mut lexer = self.clone();
1891 lexer.scan_leading_php_trivia();
1892 lexer.is_xhp_class_name()
// If `text` is (case-insensitively) one of the keywords below, returns its
// lowercase form; otherwise (missing arm) presumably None.
1895 fn as_case_insensitive_keyword(&self, text: &str) -> Option<String> {
1896 let lower = text.to_ascii_lowercase();
1897 let res = match lower.as_ref() {
1898 "__halt_compiler" | "abstract" | "and" | "array" | "as" | "bool" | "boolean"
1899 | "break" | "callable" | "case" | "catch" | "class" | "clone" | "const"
1900 | "continue" | "default" | "die" | "do" | "echo" | "else" | "elseif"
1901 | "empty" | "endfor" | "endforeach" | "endif" | "endswitch"
1902 | "endwhile" | "eval" | "exit" | "extends" | "false" | "final" | "finally" | "for"
1903 | "foreach" | "function" | "global" | "goto" | "if" | "implements" | "include"
1904 | "include_once" | "inout" | "instanceof" | "insteadof" | "int" | "integer" | "interface"
1905 | "isset" | "list" | "namespace" | "new" | "null" | "or" | "parent" | "print"
1906 | "private" | "protected" | "public" | "require" | "require_once" | "return"
1907 | "self" | "static" | "string" | "switch" | "throw" | "trait" | "try" | "true"
1908 | "unset" | "use" | "using" | "var" | "void" | "while" | "xor" | "yield" => Some(lower),
// NOTE(review): `res` is already Option<String>, so `.to_owned()` clones a
// String that could be returned directly — consider `res` alone.
1911 res.map(|x| x.to_owned())
// True when a keyword was written with non-lowercase letters and that
// spelling should be reported. "true"/"false"/"null" are exempt: they are
// legal in any case.
1914 fn lowercase_error(&self, original_text: &str, lowered_text: &str) -> bool {
1915 match lowered_text {
1916 "true" | "false" | "null" => false,
1917 _ => original_text != lowered_text,
// Reclassifies a Name token as a keyword when its (case-normalized) text
// matches one; `only_reserved` restricts the match to reserved keywords.
// Records an error for keywords written in the wrong case.
1921 fn as_keyword(&mut self, only_reserved: bool, kind: TokenKind) -> TokenKind {
1922 if kind == TokenKind::Name {
1923 let original_text = self.current_text_as_str();
1924 let text_as_lowercase_keyword = self.as_case_insensitive_keyword(original_text);
1925 let text = match text_as_lowercase_keyword.as_ref() {
1927 None => original_text,
1929 match TokenKind::from_string(&text.as_bytes(), only_reserved) {
// `let` is a keyword only under experimental mode; otherwise stay a Name.
1930 Some(TokenKind::Let) if (!(self.is_experimental_mode())) => TokenKind::Name,
1932 if self.lowercase_error(original_text, &text) {
1933 let err = Errors::uppercase_kw(original_text);
1934 self.with_error(err);
1938 _ => TokenKind::Name,
// Scans leading trivia, then one token via `scanner`, applying the keyword
// policy selected by `as_name`. Returns (kind, width, leading trivia).
1945 fn scan_token_and_leading_trivia(
1947 scanner: &Fn(&mut Self) -> TokenKind,
1949 ) -> (TokenKind, usize, Vec<Token::Trivia>) {
1950 // Get past the leading trivia
1951 let leading = self.scan_leading_php_trivia();
1952 // Remember where we were when we started this token
1953 self.start_new_lexeme();
1954 let kind = scanner(self);
// Map the raw kind through the requested keyword policy; the bool passed to
// as_keyword is its `only_reserved` flag.
1955 let kind = match as_name {
1956 KwSet::AllKeywords => kind,
1957 KwSet::NonReservedKeywords => self.as_keyword(true, kind),
1958 KwSet::NoKeywords => self.as_keyword(false, kind),
1960 let w = self.width();
// Scans a full token: leading trivia, the token itself, and trailing trivia.
// Trailing trivia is suppressed for string-literal heads (the string scanner
// owns what follows) and limited to one EOL after a `?>` close tag.
1964 fn scan_token_and_trivia(
1966 scanner: &Fn(&mut Self) -> TokenKind,
1969 let token_start = self.offset;
1971 let (kind, w, leading) = self.scan_token_and_leading_trivia(scanner, as_name);
1972 let trailing = match kind {
1973 TokenKind::DoubleQuotedStringLiteralHead => vec![],
1974 TokenKind::QuestionGreaterThan => {
1975 if Self::is_newline(self.peek_char(0)) {
1976 // consume only trailing EOL token after ?> as trailing trivia
1977 vec![self.scan_end_of_line()]
1982 _ => self.scan_trailing_php_trivia(),
1984 Token::make(kind, token_start, w, leading, trailing)
// Runs `tokenizer` and asserts it consumed input (or legitimately produced
// EndOfFile at the end), panicking otherwise — a guard against the lexer
// looping forever without advancing.
1987 fn scan_assert_progress(&mut self, tokenizer: &Fn(&mut Self) -> Token) -> Token {
1988 let original_remaining = self.remaining();
1989 let token = tokenizer(self);
1990 let new_remaining = self.remaining();
// Note precedence: `||` binds looser than `&&`, so this reads as
// "consumed something, OR (was at EOF, still at EOF, and produced EndOfFile)".
1991 if new_remaining < original_remaining
1992 || original_remaining == 0
1993 && new_remaining == 0
1994 && (token.kind()) == TokenKind::EndOfFile
1998 panic!("failed to make progress at {}\n", self.offset)
// Scans the next token (with trivia) under the given keyword policy,
// wrapped in the no-progress assertion.
2002 fn scan_next_token(&mut self, scanner: &Fn(&mut Self) -> TokenKind, as_name: KwSet) -> Token {
2003 let tokenizer = |x: &mut Self| x.scan_token_and_trivia(scanner, as_name);
2004 self.scan_assert_progress(&tokenizer)
// Scans the next token, leaving every keyword-spelled identifier as a Name.
2007 fn scan_next_token_as_name(&mut self, scanner: &Fn(&mut Self) -> TokenKind) -> Token {
2008 self.scan_next_token(scanner, KwSet::AllKeywords)
// Scans the next token with full keyword recognition (KwSet::NoKeywords
// policy — see scan_token_and_leading_trivia for the mapping).
2011 fn scan_next_token_as_keyword(&mut self, scanner: &Fn(&mut Self) -> TokenKind) -> Token {
2012 self.scan_next_token(scanner, KwSet::NoKeywords)
// Scans the next token, recognizing only reserved keywords; non-reserved
// keyword spellings stay Names.
2015 fn scan_next_token_nonreserved_as_name(
2017 scanner: &Fn(&mut Self) -> TokenKind,
2019 self.scan_next_token(scanner, KwSet::NonReservedKeywords)
// Public entry point: next token with keyword recognition. Dispatches on
// the lexer's type-context flag (the branching condition — presumably
// `self.in_type` — is not visible in this excerpt).
2024 pub fn next_token(&mut self) -> Token {
2026 self.scan_next_token_as_keyword(&Self::scan_token_inside_type)
2028 self.scan_next_token_as_keyword(&Self::scan_token_outside_type)
// Next token with leading trivia only; trailing trivia is deliberately left
// unscanned (callers use this when the following text must not be consumed).
2032 pub fn next_token_no_trailing(&mut self) -> Token {
2033 let tokenizer = |x: &mut Self| {
2034 let token_start = x.offset;
2035 let (kind, w, leading) =
2036 x.scan_token_and_leading_trivia(&Self::scan_token_outside_type, KwSet::NoKeywords);
// Empty trailing-trivia vector by construction.
2037 Token::make(kind, token_start, w, leading, vec![])
2039 self.scan_assert_progress(&tokenizer)
// Next token while positioned inside a string literal of the given kind.
// No leading trivia is scanned (it would be string content), and trailing
// trivia is scanned only once the literal's tail token has been produced.
2042 pub fn next_token_in_string(&mut self, literal_kind: &StringLiteralKind) -> Token {
2043 let token_start = self.offset;
2044 self.start_new_lexeme();
2045 // We're inside a string. Do not scan leading trivia.
2046 let kind = self.scan_string_literal_in_progress(literal_kind);
2047 let w = self.width();
2048 // Only scan trailing trivia if we've finished the string.
2049 let trailing = match kind {
2050 TokenKind::DoubleQuotedStringLiteralTail | TokenKind::HeredocStringLiteralTail => {
2051 self.scan_trailing_php_trivia()
2055 Token::make(kind, token_start, w, vec![], trailing)
// Scans the opening line of a heredoc ("<<<NAME") and returns the header
// token together with the heredoc name, which the caller needs to find the
// matching terminator. Trailing trivia is not scanned.
2058 pub fn next_docstring_header(&mut self) -> (Token, &'a [u8]) {
2059 // We're at the beginning of a heredoc string literal. Scan leading
2060 // trivia but not trailing trivia.
2061 let token_start = self.offset;
2062 let leading = self.scan_leading_php_trivia();
2063 self.start_new_lexeme();
2064 let (name, _) = self.scan_docstring_header();
2065 let w = self.width();
2066 let token = Token::make(
2067 TokenKind::HeredocStringLiteralHead,
// Next token with all keyword spellings kept as Names.
2076 pub fn next_token_as_name(&mut self) -> Token {
2077 self.scan_next_token_as_name(&Self::scan_token_outside_type)
// Next token recognizing only reserved keywords.
2080 pub fn next_token_non_reserved_as_name(&mut self) -> Token {
2081 self.scan_next_token_nonreserved_as_name(&Self::scan_token_outside_type)
// Next token inside an XHP element, returned together with its source text.
// With `no_trailing`, the `>` / `/>` that closes an opener keeps no trailing
// trivia so the following whitespace leads the body token instead.
2084 pub fn next_xhp_element_token(&mut self, no_trailing: bool) -> (Token, &[u8]) {
2085 // XHP elements have whitespace, newlines and Hack comments.
2086 let tokenizer = |lexer: &mut Self| {
2087 let token_start = lexer.offset;
2088 let (kind, w, leading) =
2089 lexer.scan_token_and_leading_trivia(&Self::scan_xhp_token, KwSet::AllKeywords);
2090 // We do not scan trivia after an XHPOpen's >. If that is the beginning of
2091 // an XHP body then we want any whitespace or newlines to be leading trivia
2092 // of the body token.
2094 TokenKind::GreaterThan | TokenKind::SlashGreaterThan if no_trailing => {
2095 Token::make(kind, token_start, w, leading, vec![])
2098 let trailing = lexer.scan_trailing_php_trivia();
2099 Token::make(kind, token_start, w, leading, trailing)
2103 let token = self.scan_assert_progress(&tokenizer);
2104 let token_width = token.width();
2105 let trailing_width = token.trailing_width();
// Recover the token's start by backing off the widths just consumed, then
// slice its exact text (excluding trivia) out of the source buffer.
2106 let token_start_offset = (self.offset) - trailing_width - token_width;
2107 let token_text = self.source.sub(token_start_offset, token_width);
// Next token inside an XHP body. Trivia is significant here, so trailing
// trivia is kept only on actual XHPBody tokens — anything else (an element
// or brace) lets the trivia lead the following token.
2111 pub fn next_xhp_body_token(&mut self) -> Token {
2112 let scanner = |lexer: &mut Self| {
2113 let token_start = lexer.offset;
2114 let leading = lexer.scan_leading_xhp_trivia();
2115 lexer.start_new_lexeme();
2116 let kind = lexer.scan_xhp_body();
2117 let w = lexer.width();
2119 // Trivia (leading and trailing) is semantically
2120 // significant for XHPBody tokens. When we find elements or
2121 // braced expressions inside the body, the trivia should be
2122 // seen as leading the next token, but we should certainly
2123 // keep it trailing if this is an XHPBody token.
2124 if kind == TokenKind::XHPBody
2125 { lexer.scan_trailing_xhp_trivia() }
2127 Token::make(kind, token_start, w, leading, trailing)
2129 self.scan_assert_progress(&scanner)
// Next token scanned specifically as an XHP class name (keywords disabled).
2132 pub fn next_xhp_class_name(&mut self) -> Token {
2133 self.scan_token_and_trivia(&Self::scan_xhp_class_name, KwSet::NoKeywords)
// Next token scanned as an XHP element name (the `false` argument's meaning
// is defined by scan_xhp_element_name, not visible here).
2136 pub fn next_xhp_name(&mut self) -> Token {
2137 let scanner = |x: &mut Self| x.scan_xhp_element_name(false);
2138 self.scan_token_and_trivia(&scanner, KwSet::NoKeywords)
// Wraps the current lexeme (start..offset) as a Markup token with no trivia.
2141 fn make_markup_token(&self) -> Token {
2142 Token::make(TokenKind::Markup, self.start, self.width(), vec![], vec![])
// NOTE(review): the opening of this signature (the fn name — presumably a
// long-open-tag helper — and its earlier parameters) is not visible in this
// excerpt; the fragment builds the language-name token of `<?hh` / `<?php`.
2147 name_token_offset: usize,
2150 less_than_question_token: Token,
2151 ) -> (Token, Option<(Token, Option<Token>)>) {
2154 // single line comments that follow the language in leading markup_text
2155 // determine the file check mode, read the trailing trivia and attach it
2156 // to the language token
2157 let trailing = self.scan_trailing_php_trivia();
2158 let name = Token::make(TokenKind::Name, name_token_offset, size, vec![], trailing);
2159 (markup_text, Some((less_than_question_token, Some(name))))
// Builds the markup token plus its `<?` suffix: the LessThanQuestion token
// and, depending on what follows, a language name ("hh"/"php"), an `=`
// (short echo tag), or no suffix token at all.
2162 fn make_markup_and_suffix(&mut self) -> (Token, Option<(Token, Option<Token>)>) {
2163 let markup_text = self.make_markup_token();
2164 let less_than_question_token =
2165 Token::make(TokenKind::LessThanQuestion, self.offset, 2, vec![], vec![]);
2168 let name_token_offset = self.offset;
// The language id after `<?` is matched case-insensitively.
2169 let ch0 = self.peek_char(0).to_ascii_lowercase();
2170 let ch1 = self.peek_char(1).to_ascii_lowercase();
2171 let ch2 = self.peek_char(2).to_ascii_lowercase();
2172 match (ch0, ch1, ch2) {
// Two-character language tag (the match pattern is not visible; size 2).
2174 self.make_long_tag(name_token_offset, 2, markup_text, less_than_question_token)
2176 ('p', 'h', 'p') => {
2177 self.make_long_tag(name_token_offset, 3, markup_text, less_than_question_token)
// `<?=` short echo tag: the suffix token is a width-1 Equal.
2182 let equal = Token::make(TokenKind::Equal, name_token_offset, 1, vec![], vec![]);
2184 (markup_text, Some((less_than_question_token, Some(equal))))
2187 (markup_text, Some((less_than_question_token, (None))))
// Skips the leading markup section of a file (shebang line and/or leading
// whitespace) and, if a `<?` open tag follows, hands off to
// make_markup_and_suffix; otherwise the whole span is one Markup token.
2192 fn skip_to_end_of_markup(&mut self) -> (Token, Option<(Token, Option<Token>)>) {
2193 let start_offset = {
2194 // if leading section starts with #! - it should span the entire line
2195 let index = self.offset;
// Header scanning is only valid at offset 0 (the panic's condition line is
// not visible in this excerpt).
2197 panic!("Should only try to lex header at start of document")
2199 if self.peek_def(index, INVALID) == '#' && self.peek_def(index + 1, INVALID) == '!' {
// Shebang: consume to end of line, +1 to step past the newline itself.
2200 self.skip_while_to_offset(&Self::not_newline) + 1
2202 // this should really just be `index` - but, skip whitespace as the FFP
2203 // tests use magic comments in leading markup to set flags, but blank
2204 // them out before parsing; the newlines are kept to provide correct line
2205 // numbers in errors
2206 self.skip_while_to_offset(&|x| {
2207 Self::is_newline(x) || Self::is_whitespace_no_newline(x)
2211 if self.peek(start_offset) == '<' && self.peek_def(start_offset + 1, INVALID) == '?' {
2212 self.with_offset(start_offset);
2213 self.make_markup_and_suffix()
2215 (self.make_markup_token(), None)
// Public entry point for lexing the file header: markup token plus optional
// `<?` open-tag tokens. Must be called at the start of the document.
2219 pub fn scan_header(&mut self) -> (Token, Option<(Token, Option<Token>)>) {
2220 self.start_new_lexeme();
2221 self.skip_to_end_of_markup()
// Lookahead: true if, after leading trivia, a '%' followed by a name start
// character appears — the shape of an XHP category name. Non-mutating.
2224 pub fn is_next_xhp_category_name(&self) -> bool {
2225 let mut lexer = self.clone();
2226 let _ = lexer.scan_leading_php_trivia();
2227 // An XHP category is an xhp element name preceded by a %.
2228 let ch0 = lexer.peek_char(0);
2229 let ch1 = lexer.peek_char(1);
2230 ch0 == '%' && Self::is_name_nondigit(ch1)
// Scans an XHP category name (%name) when one is next; otherwise falls back
// to ordinary token scanning. The element-name result is discarded — only
// the consumed span matters for the XHPCategoryName token.
2233 fn scan_xhp_category_name(&mut self) -> TokenKind {
2234 if self.is_next_xhp_category_name() {
2236 let _ = self.scan_xhp_element_name(false);
2237 TokenKind::XHPCategoryName
2239 self.scan_token(false)
// Next token scanned as an XHP category name, with trivia.
2243 pub fn next_xhp_category_name(&mut self) -> Token {
2244 self.scan_token_and_trivia(&Self::scan_xhp_category_name, KwSet::NoKeywords)
// After __halt_compiler, packs the entire rest of the source into a single
// after-halt-compiler trivium attached to `last_token`, and moves the lexer
// to end of file.
2247 pub fn rescan_halt_compiler(&mut self, last_token: Token) -> Token {
2248 // __halt_compiler stops parsing of the file.
2249 // In order to preserve fill fidelity aspect of the parser
2250 // we pack everything that follows __halt_compiler as
2251 // separate opaque kind of trivia - it will be attached as a trailing trivia
2252 // to the last_token and existing trailing trivia will be merged in.
2254 // This is incorrect for minimal token
// Compute where last_token's text ends: leading start + leading + token width.
2255 let leading_start_offset = last_token.leading_start_offset().unwrap_or(0);
2256 let start_offset = leading_start_offset + last_token.leading_width() + last_token.width();
2258 let length = self.source.length();
// The trivium spans from the end of the token to the end of the source.
2259 let trailing = Token::Trivia::make_after_halt_compiler(
2262 length - start_offset,
// Park the lexer at EOF so no further tokens are produced.
2264 self.with_offset(length);
2265 last_token.with_trailing(vec![trailing])