1 // Copyright (c) 2019, Facebook, Inc.
2 // All rights reserved.
4 // This source code is licensed under the MIT license found in the
5 // LICENSE file in the "hack" directory of this source tree.
7 use parser_core_types::lexable_token::LexableToken;
8 use parser_core_types::lexable_trivia::LexableTrivia;
9 use parser_core_types::source_text::{SourceText, INVALID};
10 use parser_core_types::syntax_error::{self as Errors, Error, SyntaxError};
11 use parser_core_types::token_kind::TokenKind;
12 use parser_core_types::trivia_kind::TriviaKind;
14 use std::cell::RefCell;
15 use std::ops::DerefMut;
// NOTE(review): this chunk is an elided/sampled copy of the original file —
// struct fields and closing braces are missing, so only comments are added
// here; no code tokens are altered.
// Snapshot of the lexer position taken BEFORE scanning a token; serves as the
// cache key for the one-token-lookahead cache (see prose notes below).
19 struct LexerPreSnapshot {
// Snapshot of the lexer state AFTER scanning a token, replayed on a cache hit.
26 struct LexerPostSnapshot {
// Errors accumulated while scanning the cached token. Per the invariant noted
// below, errors may only ever be appended, never removed, while scanning forward.
30 errors: Vec<SyntaxError>,
// A pre-snapshot "equals" a live lexer when the fields that determine the next
// token agree: start, offset, and the in_type flag.
33 impl<'a, Token: LexableToken<'a>> PartialEq<Lexer<'a, Token>> for LexerPreSnapshot {
34 fn eq(&self, other: &Lexer<'a, Token>) -> bool {
35 self.start == other.start && self.offset == other.offset && self.in_type == other.in_type
42 One token look ahead in parser is implemented by `parser.peek_token()` ... `parser.next_token()`.
43 Re-scanning in next_token can be avoided by caching the result of `peek_token`, consecutive
44 `peek_token`s can also get improved.
46 `Lexer.peek_next_token()` checks the cache first; on a cache miss it will clone the current lexer and
47 call next_token on cloned lexer. To cache the result, it takes a snapshot of lexer state before and
48 after calling next_token, and store them in current lexer.
50 Clone trait of Lexer is derived automatically, therefore `cache: Rc<...>` is also cloned. `Rc` ensures
51 cloned lexer and original lexer share the same cache, this is intended! Other than one token look
52 ahead still clones the parser, and therefore the lexer gets cloned too; sharing the cache lets the cloned lexer use
53 the cache from the original lexer and vice versa. Measurements show this is 2% faster than not sharing the cache.
55 NOTE: There is an invariant assumed by this caching mechanism. `errors` in `Lexer` can only add new errors
56 and must not remove any error when scanning forward! `Lexer.peek_next_token()` clones a new `Lexer` and
57 reset `errors` to empty, look ahead may accumulate new errors and these errors will be appended to the original
58 `Lexer`. The reason we need this invariant is that between `peek_next_token` and `next_token` we can not
59 prove no new error added. Actually it is observed that new errors are added between these two calls.
// One cache entry for single-token lookahead: (state before, token produced,
// state after). Stored in Lexer::cache and replayed by peek_next_token.
62 struct LexerCache<Token>(LexerPreSnapshot, Token, LexerPostSnapshot);
64 #[derive(Debug, Clone)]
65 pub struct Lexer<'a, Token: LexableToken<'a>> {
66 source: SourceText<'a>,
69 errors: Vec<SyntaxError>,
70 is_experimental_mode: bool,
// Rc so that a cloned lexer and the original deliberately SHARE one lookahead
// cache — this sharing is intentional (see the prose notes earlier in the file).
73 cache: Rc<RefCell<Option<LexerCache<Token>>>>,
76 #[derive(Debug, PartialEq)]
77 pub enum StringLiteralKind {
// Heredoc literals carry the terminator name bytes so the closing identifier
// (the "tail") can be recognized while scanning the body.
79 LiteralHeredoc { heredoc: Vec<u8> },
82 #[derive(Debug, Copy, Clone)]
// NOTE(review): elided chunk — several statement lines and closing braces are
// missing from this impl; comments only, code tokens untouched.
89 impl<'a, Token: LexableToken<'a>> Lexer<'a, Token> {
// Capture the cache-key snapshot (borrows; does not consume the lexer).
90 fn to_lexer_pre_snapshot(&self) -> LexerPreSnapshot {
94 in_type: self.in_type,
// Consumes self to capture the post-scan state (ownership of `errors` moves).
98 fn into_lexer_post_snapshot(self) -> LexerPostSnapshot {
102 in_type: self.in_type,
// Constructor starting at an arbitrary byte offset; `make` delegates here with 0.
107 pub fn make_at(source: &SourceText<'a>, is_experimental_mode: bool, offset: usize) -> Self {
109 source: source.clone(),
113 is_experimental_mode,
115 cache: Rc::new(RefCell::new(None)),
119 pub fn make(source: &SourceText<'a>, is_experimental_mode: bool) -> Self {
120 Self::make_at(source, is_experimental_mode, 0)
// Adopt the position and errors of a speculative clone (used after trying an
// alternative scan on a cloned lexer). Note: errors are REPLACED, not merged,
// which is sound because clones only ever append to the error list.
123 fn continue_from(&mut self, l: Lexer<'a, Token>) {
124 self.start = l.start;
125 self.offset = l.offset;
126 self.errors = l.errors
129 pub fn start(&self) -> usize {
133 pub fn offset(&self) -> usize {
137 pub fn errors(&self) -> &[SyntaxError] {
// Record a syntax error spanning the current lexeme [start, offset).
141 fn with_error(&mut self, error: Error) {
142 let error = SyntaxError::make(self.start(), self.offset(), error);
143 self.errors.push(error)
146 fn with_offset(&mut self, offset: usize) {
150 fn with_start_offset(&mut self, start: usize, offset: usize) {
152 self.offset = offset;
// Begin a fresh lexeme at the current position (start catches up to offset).
155 fn start_new_lexeme(&mut self) {
156 self.start = self.offset
159 pub fn advance(&mut self, i: usize) {
163 fn is_experimental_mode(&self) -> bool {
164 self.is_experimental_mode
167 pub fn set_in_type(&mut self, in_type: bool) {
168 self.in_type = in_type
171 pub fn source(&self) -> &SourceText<'a> {
175 fn source_text_string(&self) -> &[u8] {
// Peek `index` characters ahead of the current offset (bounds handling lives
// in SourceText::get — presumably returns INVALID past EOF; confirm).
181 pub fn peek_char(&self, index: usize) -> char {
182 self.source.get(self.offset() + index)
185 fn peek_string(&self, size: usize) -> &[u8] {
186 &self.source.sub(self.offset, size)
189 fn match_string(&self, s: &[u8]) -> bool {
190 s == self.peek_string(s.len())
// Width of the lexeme scanned so far.
193 fn width(&self) -> usize {
194 self.offset - self.start
197 fn current_text(&self) -> &[u8] {
198 self.source.sub(self.start, self.width())
// SAFETY relies on the source being valid UTF-8; no check is performed here.
201 fn current_text_as_str(&self) -> &str {
202 unsafe { std::str::from_utf8_unchecked(self.current_text()) }
205 fn at_end(&self) -> bool {
206 self.offset() >= self.source.length()
// Signed arithmetic so an offset past EOF does not underflow.
209 fn remaining(&self) -> usize {
210 let r = (self.source.length() as isize) - (self.offset as isize);
// Note: unlike peek_char, `peek` takes an ABSOLUTE source index, not a delta.
218 fn peek(&self, i: usize) -> char {
222 fn peek_back(&self, index: usize) -> char {
223 self.source.get(self.offset() - index)
// Absolute-index peek with a caller-supplied default past EOF.
226 fn peek_def(&self, index: usize, default: char) -> char {
227 if index >= self.source.length() {
230 self.source.get(index)
234 // Character classification
// (NOTE(review): predicate bodies below are partially elided in this chunk.)
236 fn is_whitespace_no_newline(c: char) -> bool {
243 fn is_newline(ch: char) -> bool {
250 fn is_binary_digit(ch: char) -> bool {
257 fn is_octal_digit(c: char) -> bool {
258 ('0' <= c && c <= '7')
261 fn is_decimal_digit(ch: char) -> bool {
262 '0' <= ch && ch <= '9'
265 fn is_hexadecimal_digit(c: char) -> bool {
266 ('0' <= c && c <= '9') || ('a' <= c && c <= 'f') || ('A' <= c && c <= 'F')
// Leading character of a name: underscore, ASCII letter, or any byte >= 0x7f
// (treats high/non-ASCII bytes as name characters).
269 fn is_name_nondigit(c: char) -> bool {
270 (c == '_') || ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || ('\x7f' <= c)
// Subsequent name characters additionally allow decimal digits.
273 fn is_name_letter(c: char) -> bool {
275 || ('0' <= c && c <= '9')
276 || ('a' <= c && c <= 'z')
277 || ('A' <= c && c <= 'Z')
// Compute (without mutating) the offset reached by skipping chars matching `p`.
283 fn skip_while_to_offset(&self, p: impl Fn(char) -> bool) -> usize {
284 let n = self.source.length();
285 let mut i = self.offset();
286 while i < n && p(self.peek(i)) {
292 // advance offset as long as the predicate is true
293 fn skip_while(&mut self, p: impl Fn(char) -> bool) {
294 self.with_offset(self.skip_while_to_offset(p))
// Static variant operating on a raw byte slice instead of the lexer position.
297 fn str_skip_while(s: &[u8], mut i: usize, p: impl Fn(char) -> bool) -> usize {
300 if i < n && p(s[i] as char) {
308 fn skip_whitespace(&mut self) {
309 self.skip_while(&Self::is_whitespace_no_newline);
312 fn str_skip_whitespace(s: &[u8], i: usize) -> usize {
313 Self::str_skip_while(s, i, &Self::is_whitespace_no_newline)
316 fn not_newline(ch: char) -> bool {
317 !(Self::is_newline(ch))
320 fn skip_to_end_of_line(&mut self) {
321 self.skip_while(&Self::not_newline)
// Stop at a newline OR at a `?>` close tag, whichever comes first.
324 fn skip_to_end_of_line_or_end_tag(&mut self) {
325 let n = self.source.length();
326 let peek_def = |i| if i < n { self.peek(i) } else { INVALID };
328 let should_stop = |i| {
330 let ch = self.peek(i);
331 Self::is_newline(ch) || (ch == '?' && peek_def(i + 1) == '>')
334 let mut i = self.offset();
335 while !(should_stop(i)) {
341 fn skip_name_end(&mut self) {
342 self.skip_while(&Self::is_name_letter)
// Consume one line terminator; handles both '\n' and (per the '\r' arm that is
// elided here) "\r\n" — TODO confirm against the full source.
345 fn skip_end_of_line(&mut self) {
346 match self.peek_char(0) {
347 '\n' => self.advance(1),
349 if self.peek_char(1) == '\n' {
// Precondition: current char starts a name (asserted below).
359 fn scan_name_impl(&mut self) {
360 assert!(Self::is_name_nondigit(self.peek_char(0)));
362 self.skip_name_end();
365 fn scan_name(&mut self) -> TokenKind {
366 self.scan_name_impl();
// Precondition: positioned at '$'; scans the variable name that follows.
370 fn scan_variable(&mut self) -> TokenKind {
371 assert_eq!('$', self.peek_char(0));
373 self.scan_name_impl();
// Scan digits allowing separator characters between accepted digits.
377 fn scan_with_underscores(&mut self, accepted_char: impl Fn(char) -> bool) {
378 let n = self.source.length();
379 let peek_def = |i| if i < n { self.peek(i) } else { INVALID };
380 let mut i = self.offset();
382 let ch = self.peek(i);
383 if accepted_char(ch) {
// NOTE(review): comparing against ' ' (space) looks wrong for a function named
// scan_with_underscores — upstream sources use '_' here. Likely corruption in
// this copy; confirm against the canonical file before relying on it.
385 } else if ch == ' ' && accepted_char(peek_def(i + 1)) {
394 fn scan_decimal_digits(&mut self) {
395 self.skip_while(&Self::is_decimal_digit)
398 fn scan_decimal_digits_with_underscores(&mut self) {
399 self.scan_with_underscores(&Self::is_decimal_digit);
402 fn scan_octal_digits(&mut self) {
403 self.skip_while(&Self::is_octal_digit)
406 fn scan_octal_digits_with_underscores(&mut self) {
407 self.scan_with_underscores(&Self::is_octal_digit)
410 fn scan_binary_digits_with_underscores(&mut self) {
411 self.scan_with_underscores(&Self::is_binary_digit)
414 fn scan_hexadecimal_digits(&mut self) {
415 self.skip_while(&Self::is_hexadecimal_digit)
418 fn scan_hexadecimal_digits_with_underscores(&mut self) {
419 self.scan_with_underscores(&Self::is_hexadecimal_digit)
// Scan the digits of a hex literal (caller has consumed the "0x" prefix).
// On a missing first digit, records error0001 but still yields the literal
// kind so the parser can make progress.
422 fn scan_hex_literal(&mut self) -> TokenKind {
423 let ch = self.peek_char(0);
424 if !Self::is_hexadecimal_digit(ch) {
425 self.with_error(Errors::error0001);
426 TokenKind::HexadecimalLiteral
428 self.scan_hexadecimal_digits_with_underscores();
429 TokenKind::HexadecimalLiteral
// Same shape as scan_hex_literal but for "0b" binary literals (error0002).
433 fn scan_binary_literal(&mut self) -> TokenKind {
434 let ch = self.peek_char(0);
435 if !Self::is_binary_digit(ch) {
436 self.with_error(Errors::error0002);
437 TokenKind::BinaryLiteral
439 self.scan_binary_digits_with_underscores();
440 TokenKind::BinaryLiteral
// Positioned at 'e'/'E'; consumes an optional sign then requires digits,
// reporting error0003 when the exponent has none.
444 fn scan_exponent(&mut self) -> TokenKind {
445 let ch = self.peek_char(1);
446 if ch == '+' || ch == '-' {
451 let ch = self.peek_char(0);
452 if !Self::is_decimal_digit(ch) {
453 self.with_error(Errors::error0003);
454 TokenKind::FloatingLiteral
456 self.scan_decimal_digits();
457 TokenKind::FloatingLiteral
// Scan the fractional digits after '.', then an optional exponent part.
461 fn scan_after_decimal_point(&mut self) -> TokenKind {
463 self.scan_decimal_digits();
464 let ch = self.peek_char(0);
465 if ch == 'e' || ch == 'E' {
468 TokenKind::FloatingLiteral
// Disambiguate literals that start with '0': octal, float (09e1 / 09.1), or
// the bare decimal 0. Works by speculatively scanning on CLONED lexers and
// adopting whichever clone's result fits, via continue_from.
472 fn scan_octal_or_float(&mut self) -> TokenKind {
473 // We've scanned a leading zero.
474 // We have an irritating ambiguity here. 09 is not a legal octal or
475 // floating literal, but 09e1 and 09.1 are.
477 let ch = self.peek_char(0);
482 self.scan_after_decimal_point()
489 _ if '0' <= ch && ch <= '9' => {
// Speculatively scan as octal and as decimal; compare how far each got.
491 let mut lexer_oct = self.clone();
492 lexer_oct.scan_octal_digits();
494 let mut lexer_dec = self.clone();
495 lexer_dec.scan_decimal_digits();
496 if (lexer_oct.width()) == (lexer_dec.width()) {
497 // Only octal digits. Could be an octal literal, or could
499 let ch = lexer_oct.peek_char(0);
500 if ch == 'e' || ch == 'E' {
501 self.continue_from(lexer_oct);
503 } else if ch == '.' {
504 self.continue_from(lexer_oct);
505 self.scan_after_decimal_point()
507 // This is irritating - we only want to allow underscores for integer
508 // literals. Deferring the lexing with underscores here allows us to
509 // make sure we're not dealing with floats.
510 self.continue_from(lexer_oct);
511 self.scan_octal_digits_with_underscores();
512 TokenKind::OctalLiteral
515 // We had decimal digits following a leading zero; this is either a
516 // float literal or an octal to be truncated at the first non-octal
518 let ch = lexer_dec.peek_char(0);
519 if ch == 'e' || ch == 'E' {
520 self.continue_from(lexer_dec);
522 } else if ch == '.' {
523 self.continue_from(lexer_dec);
524 self.scan_after_decimal_point()
526 // an octal to be truncated at the first non-octal digit
527 // Again we defer the lexing with underscores here
528 self.scan_decimal_digits_with_underscores();
529 TokenKind::OctalLiteral
534 // 0 is a decimal literal
536 TokenKind::DecimalLiteral
// Same speculative-clone technique for a leading non-zero digit: decide
// between a decimal integer (underscores allowed) and a float.
541 fn scan_decimal_or_float(&mut self) -> TokenKind {
542 // We've scanned a leading non-zero digit.
543 let mut lexer_no_underscores = self.clone();
544 lexer_no_underscores.scan_decimal_digits();
545 let mut lexer_with_underscores = self.clone();
546 lexer_with_underscores.scan_decimal_digits_with_underscores();
547 let ch = lexer_no_underscores.peek_char(0);
552 self.continue_from(lexer_no_underscores);
553 self.scan_after_decimal_point()
558 self.continue_from(lexer_no_underscores);
564 self.continue_from(lexer_with_underscores);
565 TokenKind::DecimalLiteral
// Scan a complete single-quoted string literal (positioned at the opening ').
// Errors are latched into flags during the loop and reported once at the end,
// so at most one error0006 and one error0012 are emitted per literal.
570 fn scan_single_quote_string_literal(&mut self) -> TokenKind {
571 // TODO: What about newlines embedded?
573 // single-quoted-string-literal::
574 // b-opt ' sq-char-sequence-opt '
576 // TODO: What is this b-opt? We don't lex an optional 'b' before a literal.
578 // sq-char-sequence::
580 // sq-char-sequence sq-char
583 // sq-escape-sequence
584 // \opt any character except single-quote (') or backslash (\)
586 // sq-escape-sequence:: one of
588 let n = self.source.length();
589 let peek = |x| self.source.get(x);
591 let mut has_error0012 = false;
592 let mut has_error0006 = false;
// Start just past the opening quote; the loop yields the offset past the
// closing quote.
594 let mut i = 1 + self.offset();
595 let new_offset = loop {
597 has_error0012 = true;
603 has_error0006 = true;
607 '\'' => break (1 + i),
614 self.with_error(Errors::error0006)
617 self.with_error(Errors::error0012)
620 self.with_offset(new_offset);
621 TokenKind::SingleQuotedStringLiteral
// Consume a \xHH escape; positioned at the backslash. Malformed escapes are
// tolerated (errors deliberately left as TODO, mirroring the OCaml original).
624 fn scan_hexadecimal_escape(&mut self) {
625 let ch2 = self.peek_char(2);
626 let ch3 = self.peek_char(3);
627 if !(Self::is_hexadecimal_digit(ch2)) {
628 // TODO: Consider producing an error for a malformed hex escape
629 // let lexer = with_error lexer SyntaxError.error0005 in
631 } else if !(Self::is_hexadecimal_digit(ch3)) {
632 // let lexer = with_error lexer SyntaxError.error0005 in
// Consume a \u{...} escape; positioned at the backslash.
639 fn scan_unicode_escape(&mut self) {
640 // At present the lexer is pointing at \u
641 if self.peek_char(2) == '{' {
642 if self.peek_char(3) == '$' {
643 // We have a malformed unicode escape that contains a possible embedded
644 // expression. Eat the \u and keep on processing the embedded expression.
645 // TODO: Consider producing a warning for a malformed unicode escape.
648 // We have a possibly well-formed escape sequence, and at least we know
649 // that it is not an embedded expression.
650 // TODO: Consider producing an error if the digits are out of range
651 // of legal Unicode characters.
652 // TODO: Consider producing an error if there are no digits.
653 // Skip over the slash, u and brace, and start lexing the number.
655 self.scan_hexadecimal_digits();
656 let ch = self.peek_char(0);
658 // TODO: Consider producing a warning for a malformed unicode escape.
665 // We have a malformed unicode escape sequence. Bail out.
666 // TODO: Consider producing a warning for a malformed unicode escape.
// Fast-forward over characters that cannot start an escape, an embedded
// expression, or the closing delimiter of a double-quote-like literal.
671 fn skip_uninteresting_double_quote_like_string_characters(&mut self, start_char: char) {
672 let is_uninteresting = |ch| match ch {
673 INVALID | '\\' | '$' | '{' | '[' | ']' | '-' => false,
674 ch if '0' <= ch && ch <= '9' => false,
675 ch => ch != start_char && !Self::is_name_nondigit(ch),
677 self.skip_while(&is_uninteresting);
// Scan an integer appearing inside a string (e.g. an array index in "$a[10]").
680 fn scan_integer_literal_in_string(&mut self) -> TokenKind {
681 if self.peek_char(0) == '0' {
682 match self.peek_char(1) {
685 self.scan_hex_literal()
689 self.scan_binary_literal()
692 // An integer literal starting with 0 in a string will actually
693 // always be treated as a string index in HHVM, and not as an octal.
694 // In such a case, HHVM actually scans all decimal digits to create the
695 // token. TODO: (kasper) T40381519 we may want to change this behavior to something more
697 self.scan_decimal_digits_with_underscores();
698 TokenKind::DecimalLiteral
702 self.scan_decimal_digits_with_underscores();
703 TokenKind::DecimalLiteral
// Scan from the opening delimiter of a double-quote-like literal (" or `).
// Returns the whole-literal kind when nothing special occurs before the
// closing delimiter, or the Head kind when an escape/embedded expression is
// found and the literal must continue via scan_string_literal_in_progress.
707 fn scan_double_quote_like_string_literal_from_start(&mut self, start_char: char) -> TokenKind {
708 let literal_token_kind = TokenKind::DoubleQuotedStringLiteral;
709 let head_token_kind = TokenKind::DoubleQuotedStringLiteralHead;
712 // If there's nothing interesting in this double-quoted string then
713 // we can just hand it back as-is.
714 self.skip_uninteresting_double_quote_like_string_characters(start_char);
715 match self.peek_char(0) {
717 // If the string is unterminated then give an error; if this is an
718 // embedded zero character then give an error and recurse; we might
719 // be able to make more progress.
721 self.with_error(Errors::error0012);
722 break literal_token_kind;
724 self.with_error(Errors::error0006);
729 // We made it to the end without finding a special character.
731 break literal_token_kind;
734 // We've found a backslash, dollar or brace.
736 break head_token_kind;
// True when the current position is the closing identifier of a heredoc:
// the name must be immediately preceded by a newline and followed by a
// newline or by ';' then a newline.
742 fn is_heredoc_tail(&self, name: &[u8]) -> bool {
743 // A heredoc tail is the identifier immediately preceded by a newline
744 // and immediately followed by an optional semi and then a newline.
746 // Note that the newline and optional semi are not part of the literal;
747 // the literal's lexeme ends at the end of the name. Either there is
748 // no trivia and the next token is a semi-with-trailing-newline, or
749 // the trailing trivia is a newline.
751 // This odd rule is to ensure that both
761 // . "something else";
764 if !(Self::is_newline(self.peek_back(1))) {
767 let len = name.len();
768 let ch0 = self.peek_char(len);
769 let ch1 = self.peek_char(len + 1);
770 ((Self::is_newline(ch0)) || ch0 == ';' && (Self::is_newline(ch1)))
771 && self.peek_string(len) == name
// Map a literal kind to the token kind of its closing piece.
775 fn get_tail_token_kind(&self, literal_kind: &StringLiteralKind) -> TokenKind {
777 StringLiteralKind::LiteralHeredoc { .. } => TokenKind::HeredocStringLiteralTail,
778 StringLiteralKind::LiteralDoubleQuoted => TokenKind::DoubleQuotedStringLiteralTail,
// For a double-quoted literal the closing quote ends it (Tail); in a heredoc
// a quote character is just body text.
782 fn get_string_literal_body_or_double_quoted_tail(
784 literal_kind: &StringLiteralKind,
786 if literal_kind == &StringLiteralKind::LiteralDoubleQuoted {
787 TokenKind::DoubleQuotedStringLiteralTail
789 TokenKind::StringLiteralBody
// Scan the next piece (body / tail / embedded-expression token) of a
// double-quoted or heredoc literal that was already opened. Dispatches on
// the first interesting character: names, escapes, brackets, ->, digits.
793 fn scan_string_literal_in_progress(&mut self, literal_kind: &StringLiteralKind) -> TokenKind {
794 let (is_heredoc, name): (bool, &[u8]) = match literal_kind {
795 StringLiteralKind::LiteralHeredoc { heredoc } => (true, &heredoc),
798 let start_char = '"';
799 let ch0 = self.peek_char(0);
800 if Self::is_name_nondigit(ch0) {
// A name at the start of a line may be the heredoc terminator.
801 if is_heredoc && (self.is_heredoc_tail(name)) {
802 self.scan_name_impl();
803 TokenKind::HeredocStringLiteralTail
805 self.scan_name_impl();
812 self.with_error(Errors::error0012);
813 self.get_tail_token_kind(literal_kind)
815 self.with_error(Errors::error0006);
817 self.skip_uninteresting_double_quote_like_string_characters(start_char);
818 TokenKind::StringLiteralBody
822 let kind = self.get_string_literal_body_or_double_quoted_tail(literal_kind);
827 if Self::is_name_nondigit(self.peek_char(1)) {
// Backslash escape handling: dispatch on the character after '\'.
839 match self.peek_char(1) {
840 // In these cases we just skip the escape sequence and
841 // keep on scanning for special characters.
842 | '\\' | '"' | '$' | 'e' | 'f' | 'n' | 'r' | 't' | 'v' | '`'
843 // Same in these cases; there might be more octal characters following but
844 // if there are, we'll just eat them as normal characters.
845 | '0' | '1' | '2' | '3' | '4' | '5' | '6' | '7' => {
847 self.skip_uninteresting_double_quote_like_string_characters(start_char);
848 TokenKind::StringLiteralBody}
850 self.scan_hexadecimal_escape();
851 self.skip_uninteresting_double_quote_like_string_characters(start_char);
852 TokenKind::StringLiteralBody }
854 self.scan_unicode_escape();
855 self.skip_uninteresting_double_quote_like_string_characters(start_char);
856 TokenKind::StringLiteralBody }
858 // The rules for escaping open braces in Hack are bizarre. Suppose we
863 // What is the value of $z? Naively you would think that the backslash
864 // escapes the braces, and the variables are embedded, so {123,456}. But
865 // that's not what happens. Yes, the backslash makes the brace no longer
866 // the opening brace of an expression. But the backslash is still part
867 // of the string! This is the string \{123,456\}.
868 // TODO: We might want to fix this because this is very strange.
869 // Eat the backslash and the brace.
871 TokenKind::StringLiteralBody
874 // TODO: A backslash followed by something other than an escape sequence
875 // is legal in hack, and treated as though it was just the backslash
876 // and the character. However we might consider making this a warning.
877 // It is particularly egregious when we have something like:
880 // The author of the code likely means the backslash to mean line
881 // continuation but in fact it just means to put a backslash and newline
884 self.skip_uninteresting_double_quote_like_string_characters(start_char);
885 TokenKind::StringLiteralBody
// Brackets and -> become real tokens inside a literal (subscript access).
891 TokenKind::LeftBracket
895 TokenKind::RightBracket
898 if (self.peek_char(1)) == '>' {
900 TokenKind::MinusGreaterThan
902 // Nothing interesting here. Skip it and find the next
903 // interesting character.
905 self.skip_uninteresting_double_quote_like_string_characters(start_char);
906 TokenKind::StringLiteralBody
// Digits: try to scan an index literal on a clone; adopt the clone only if
// it produced no NEW errors, otherwise treat the digits as plain body text.
909 ch if '0' <= ch && ch <= '9' => {
910 let mut lexer1 = self.clone();
911 let literal = lexer1.scan_integer_literal_in_string();
913 if self.errors.len() == lexer1.errors.len() {
914 self.continue_from(lexer1);
917 // If we failed to scan a literal, do not interpret the literal
918 self.with_offset(lexer1.offset());
919 TokenKind::StringLiteralBody
923 // Nothing interesting here. Skip it and find the next
924 // interesting character.
926 self.skip_uninteresting_double_quote_like_string_characters(start_char);
927 TokenKind::StringLiteralBody
932 // A heredoc string literal has the form
940 // <<< (optional whitespace) name (no whitespace) (newline)
942 // The optional body is:
944 // any characters whatsoever including newlines (newline)
948 // (no whitespace) name (no whitespace) (optional semi) (no whitespace) (newline)
950 // The names must be identical. The trailing semi and newline must be present.
952 // The body is any and all characters, up to the first line that exactly matches
955 // The body may contain embedded expressions.
957 // A nowdoc string literal has the same form except that the first name is
958 // enclosed in single quotes, and it may not contain embedded expressions.
// Scan the bare terminator identifier; on failure reports error0008.
// Returns a slice of the source covering the name.
959 fn scan_docstring_name_actual(&mut self) -> &'a [u8] {
960 let ch = self.peek_char(0);
961 if Self::is_name_nondigit(ch) {
962 let start_offset = self.offset();
964 self.skip_name_end();
965 self.source.sub(start_offset, self.offset() - start_offset)
967 self.with_error(Errors::error0008);
// Scan the (possibly quoted) opening name; single quotes mean nowdoc,
// bare or double-quoted means heredoc.
972 fn scan_docstring_name(&mut self) -> (&'a [u8], TokenKind) {
973 self.skip_whitespace();
974 let ch = self.peek_char(0);
975 let kind = if ch == '\'' {
976 TokenKind::NowdocStringLiteral
978 TokenKind::HeredocStringLiteral
981 let name = if ch == '\'' {
983 let name = self.scan_docstring_name_actual();
984 if (self.peek_char(0)) == '\'' {
988 self.with_error(Errors::error0010);
992 // Starting with PHP 5.3.0, the opening Heredoc identifier
993 // may optionally be enclosed in double quotes:
997 let name = self.scan_docstring_name_actual();
999 // same logic as above, just for double quote
1000 if self.peek_char(0) == '\"' {
1003 self.with_error(Errors::missing_double_quote)
// Consume "<<<" (or "b<<<"), the name, and the mandatory trailing newline.
1011 fn scan_docstring_header(&mut self) -> (&'a [u8], TokenKind) {
1012 let ch = self.peek_char(0);
1013 // Skip 3 for <<< or 4 for b<<<
1014 let skip_count = if ch == 'b' { 4 } else { 3 };
1015 self.advance(skip_count);
1016 let (name, kind) = self.scan_docstring_name();
1017 let ch = self.peek_char(0);
1018 if !Self::is_newline(ch) {
1019 self.with_error(Errors::error0011)
1021 self.skip_to_end_of_line();
1022 self.skip_end_of_line();
// Consume body lines until a line that is exactly `name` (optionally
// followed by ';') ends the docstring; error0011 if EOF is hit first.
1026 fn scan_docstring_remainder(&mut self, name: &[u8]) {
1027 let len = name.len();
1029 let ch0 = self.peek_char(len);
1030 let ch1 = self.peek_char(len + 1);
1031 if ((Self::is_newline(ch0)) || ch0 == ';' && (Self::is_newline(ch1)))
1032 && self.peek_string(len as usize) == name
1034 self.advance(len as usize);
1037 self.skip_to_end_of_line();
1038 let ch = self.peek_char(0);
1039 if Self::is_newline(ch) {
1040 self.skip_end_of_line()
1042 // If we got here then we ran off the end of the file without
1043 // finding a newline. Just bail.
1044 self.with_error(Errors::error0011);
// Entry point: header then remainder; returns heredoc/nowdoc token kind.
1051 fn scan_docstring_literal(&mut self) -> TokenKind {
1052 let (name, kind) = self.scan_docstring_header();
1053 self.scan_docstring_remainder(name);
// Scan one XHP label (same lexical grammar as a Hack name); the returned
// token kind is deliberately discarded.
1057 fn scan_xhp_label(&mut self) {
1058 // An XHP label has the same grammar as a Hack name.
1059 let _: TokenKind = self.scan_name();
// Recursively scan labels separated by ':' or '-'; attribute names do not
// allow ':' separators (the `attribute` flag).
1062 fn scan_xhp_element_name(&mut self, attribute: bool) -> TokenKind {
1063 // An XHP element name is a sequence of one or more XHP labels each separated
1064 // by a single : or -. Note that it is possible for an XHP element name to be
1065 // followed immediately by a : or - that is the next token, so if we find
1066 // a : or - not followed by a label, we need to terminate the token.
1067 self.scan_xhp_label();
1068 let ch0 = self.peek_char(0);
1069 let ch1 = self.peek_char(1);
1070 if (!attribute && ch0 == ':' || ch0 == '-') && Self::is_name_nondigit(ch1) {
1072 self.scan_xhp_element_name(false)
1074 TokenKind::XHPElementName
1078 // Is the next token we're going to lex a possible xhp class name?
1079 fn is_xhp_class_name(&self) -> bool {
1080 (self.peek_char(0) == ':') && (Self::is_name_nondigit(self.peek_char(1)))
1083 fn scan_xhp_class_name(&mut self) -> TokenKind {
1084 // An XHP class name is a colon followed by an xhp name.
1085 if self.is_xhp_class_name() {
1087 self.scan_xhp_element_name(false);
1088 TokenKind::XHPClassName
1090 self.with_error(Errors::error0008);
1092 TokenKind::ErrorToken
// XHP attribute string: scan to the closing quote; embedded newlines legal.
1096 fn scan_xhp_string_literal(&mut self) -> TokenKind {
1097 // XHP string literals are just straight up "find the closing quote"
1098 // strings. Embedded newlines are legal.
1099 let mut offset: usize = 1;
1101 match self.peek_char(offset) {
1103 self.advance(offset);
1105 self.with_error(Errors::error0012);
1106 return TokenKind::XHPStringLiteral;
1108 self.with_error(Errors::error0006);
1113 self.advance(offset + 1);
1114 return TokenKind::XHPStringLiteral;
// Scan a single token inside XHP markup (tags/attributes, not body text).
1121 // Note that this does not scan an XHP body
1122 fn scan_xhp_token(&mut self) -> TokenKind {
1123 // TODO: HHVM requires that there be no trivia between < and name in an
1124 // opening tag, but does allow trivia between </ and name in a closing tag.
1125 // Consider allowing trivia in an opening tag.
1126 let ch0 = self.peek_char(0);
1127 if ch0 == INVALID && self.at_end() {
1128 TokenKind::EndOfFile
1129 } else if Self::is_name_nondigit(ch0) {
1130 self.scan_xhp_element_name(false)
1135 TokenKind::LeftBrace
1139 TokenKind::RightBrace
1146 if (self.peek_char(1)) == '/' {
1148 TokenKind::LessThanSlash
1154 '"' => self.scan_xhp_string_literal(),
1156 if (self.peek_char(1)) == '>' {
1158 TokenKind::SlashGreaterThan
1160 self.with_error(Errors::error0006);
1162 TokenKind::ErrorToken
1167 TokenKind::GreaterThan
1170 self.with_error(Errors::error0006);
1172 TokenKind::ErrorToken
// Consume an XHP comment up to and including "-->"; error0014 on EOF.
1178 fn scan_xhp_comment(&mut self) {
1181 let ch0 = self.peek_char(offset);
1182 let ch1 = self.peek_char(offset + 1);
1183 let ch2 = self.peek_char(offset + 2);
1184 match (ch0, ch1, ch2) {
1185 (INVALID, _, _) => {
1186 self.advance(offset as usize);
1187 return self.with_error(Errors::error0014);
1189 ('-', '-', '>') => return self.advance((offset + 3) as usize),
// Scan one token of XHP body text: braces, tags, comments, or a run of
// body characters; whitespace is significant ("soft") and handled as trivia.
1194 fn scan_xhp_body(&mut self) -> TokenKind {
1195 // Naively you might think that an XHP body is just a bunch of characters,
1196 // terminated by an embedded { } expression or a tag. However, whitespace
1197 // and newlines are relevant in XHP bodies because they are "soft".
1198 // That is, any section of contiguous trivia has the same semantics as a
1199 // single space or newline -- just as in HTML.
1201 // Obviously this is of relevance to code formatters.
1203 // Therefore we detect whitespace and newlines within XHP bodies and treat
1204 // it as trivia surrounding the tokens within the body.
1206 // TODO: Is this also true of whitespace within XHP comments? If so then
1207 // we need to make XHP comments a sequence of tokens, rather than a
1208 // single token as they are now.
1209 let ch0 = self.peek_char(0);
1212 INVALID if self.at_end() => TokenKind::EndOfFile,
1215 TokenKind::LeftBrace
1219 TokenKind::RightBrace
// '<' introduces either a comment ("<!--") or a closing-tag start ("</").
1222 let ch1 = self.peek_char(1);
1223 let ch2 = self.peek_char(2);
1224 let ch3 = self.peek_char(3);
1225 match (ch1, ch2, ch3) {
1226 ('!', '-', '-') => {
1227 self.scan_xhp_comment();
1228 TokenKind::XHPComment
1232 TokenKind::LessThanSlash
// Otherwise accumulate plain body characters until a delimiter or trivia.
1243 let ch = self.peek_char(offset);
1246 self.advance(offset);
1248 self.with_error(Errors::error0013);
1251 self.with_error(Errors::error0006);
1255 '\t' | ' ' | '\r' | '\n' | '{' | '}' | '<' => {
1256 self.advance(offset);
// Disambiguate the tokens that can start with '$': the $ operator, the $$
// pipe variable, and ordinary $name variables. See the long rationale below.
1267 fn scan_dollar_token(&mut self) -> TokenKind {
1268 // We have a problem here. We wish to be able to lexically analyze both
1269 // PHP and Hack, but the introduction of $$ to Hack makes them incompatible.
1270 // "$$x" and "$$ $x" are legal in PHP, but illegal in Hack.
1271 // The rule in PHP seems to be that $ is a prefix operator, it is a token,
1272 // it can be followed by trivia, but the next token has to be another $
1273 // operator, a variable $x, or a {.
1275 // Here's a reasonable compromise. (TODO: Review this decision.)
1277 // $$x lexes as $ $x
1278 // $$$x lexes as $ $ $x
1281 // $$ followed by anything other than a name or a $ lexes as $$.
1283 // This means that lexing a PHP program which contains "$$ $x" is different:
1284 // it will fail at parse time, but I'm willing to live with that.
1286 // This means that lexing a Hack program which contains
1287 // "$x |> $$instanceof Foo" produces an error as well.
1289 // If these decisions are unacceptable then we will need to make the lexer
1290 // be aware of whether it is lexing PHP or Hack; thus far we have not had
1291 // to make this distinction.
1293 // We are already at $.
1294 let ch1 = self.peek_char(1);
1297 let ch2 = self.peek_char(2);
1298 if ch2 == '$' || ch2 == '{' || Self::is_name_nondigit(ch2) {
1300 TokenKind::Dollar // $$x or $$$
1303 TokenKind::DollarDollar // $$
1307 if Self::is_name_nondigit(ch1) {
1308 self.scan_variable() // $x
1311 TokenKind::Dollar // $
// Main token dispatcher: switch on the first character and peek ahead as
// needed to pick the longest matching operator/literal. `in_type` changes
// how '>' sequences and '?:' are split inside generic type argument lists.
1317 fn scan_token(&mut self, in_type: bool) -> TokenKind {
1318 let ch0 = self.peek_char(0);
1322 TokenKind::LeftBracket
1326 TokenKind::RightBracket
1330 TokenKind::LeftParen
1334 TokenKind::RightParen
1338 TokenKind::LeftBrace
1342 TokenKind::RightBrace
// '.': could be DotDotDot, a float like ".5", or plain Dot.
1344 '.' => match self.peek_char(1) {
1349 ch if '0' <= ch && ch <= '9' => self.scan_after_decimal_point(),
1351 if (self.peek_char(2)) == '.' {
1353 TokenKind::DotDotDot
1364 '-' => match self.peek_char(1) {
1367 TokenKind::MinusEqual
1371 TokenKind::MinusMinus
1375 TokenKind::MinusGreaterThan
1382 '+' => match self.peek_char(1) {
1385 TokenKind::PlusEqual
1396 '*' => match (self.peek_char(1), self.peek_char(2)) {
1399 TokenKind::StarEqual
1403 TokenKind::StarStarEqual
1418 '!' => match (self.peek_char(1), self.peek_char(2)) {
1421 TokenKind::ExclamationEqualEqual
1425 TokenKind::ExclamationEqual
1429 TokenKind::Exclamation
1432 '$' => self.scan_dollar_token(),
1434 if (self.peek_char(1)) == '=' {
1436 TokenKind::SlashEqual
1443 if (self.peek_char(1)) == '=' {
1445 TokenKind::PercentEqual
// '<': heredoc opener "<<<", shifts, comparisons, spaceship.
1452 match (self.peek_char(1), self.peek_char(2)) {
1453 ('<', '<') => self.scan_docstring_literal(),
1456 TokenKind::LessThanLessThanEqual
1458 // TODO: We lex and parse the spaceship operator.
1459 // TODO: This is not in the spec at present. We should either make it an
1460 // TODO: error, or add it to the specification.
1463 TokenKind::LessThanEqualGreaterThan
1467 TokenKind::LessThanEqual
1471 TokenKind::LessThanLessThan
1480 match (self.peek_char(1), self.peek_char(2)) {
1481 // If we are parsing a generic type argument list then we might be at the >>
1482 // in `List<List<int>>``, or at the >= of `let x:vec<int>=...`. In that case
1483 // we want to lex two >'s instead of >> / one > and one = instead of >=.
1484 (ch, _) if (ch == '>' || ch == '=') && in_type => {
1486 TokenKind::GreaterThan
1490 TokenKind::GreaterThanGreaterThanEqual
1494 TokenKind::GreaterThanGreaterThan
1498 TokenKind::GreaterThanEqual
1502 TokenKind::GreaterThan
1506 '=' => match (self.peek_char(1), self.peek_char(2)) {
1509 TokenKind::EqualEqualEqual
1513 TokenKind::EqualEqualGreaterThan
1517 TokenKind::EqualEqual
1521 TokenKind::EqualGreaterThan
1529 if (self.peek_char(1)) == '=' {
1531 TokenKind::CaratEqual
1537 '|' => match self.peek_char(1) {
1544 TokenKind::BarGreaterThan
1555 '&' => match self.peek_char(1) {
1558 TokenKind::AmpersandEqual
1562 TokenKind::AmpersandAmpersand
1566 TokenKind::Ampersand
// '?': ?: is only a token outside type context; also ?->, ??=, ??, ?>, ?as.
1569 '?' => match (self.peek_char(1), self.peek_char(2)) {
1570 (':', _) if !in_type => {
1572 TokenKind::QuestionColon
1576 TokenKind::QuestionMinusGreaterThan
1580 TokenKind::QuestionQuestionEqual
1584 TokenKind::QuestionQuestion
1588 TokenKind::QuestionGreaterThan
1590 ('a', 's') if !Self::is_name_nondigit(self.peek_char(3)) => {
1592 TokenKind::QuestionAs
1600 let ch1 = self.peek_char(1);
1604 TokenKind::ColonColon
1605 } else if ch1 == '@' {
1615 TokenKind::Semicolon
// Numeric literals: 0x / 0b prefixes, else the octal/float disambiguator.
1625 '0' => match self.peek_char(1) {
1628 self.scan_hex_literal()
1632 self.scan_binary_literal()
1634 _ => self.scan_octal_or_float(),
1636 ch if '1' <= ch && ch <= '9' => self.scan_decimal_or_float(),
1637 '\'' => self.scan_single_quote_string_literal(),
1638 '`' => self.scan_double_quote_like_string_literal_from_start('`'),
1639 '"' => self.scan_double_quote_like_string_literal_from_start('"'),
1642 TokenKind::Backslash
// 'b' prefix: b"...", b'...' and b<<< are binary string/heredoc openers;
// consume the 'b' and re-dispatch.
1645 let c1 = self.peek_char(1);
1646 let c2 = self.peek_char(2);
1647 let c3 = self.peek_char(3);
1648 c1 == '"' || c1 == '\'' || (c1 == '<' && c2 == '<' && c3 == '<')
1652 self.scan_token(in_type)
1656 if ch0 == INVALID && self.at_end() {
1657 TokenKind::EndOfFile
1658 } else if Self::is_name_nondigit(ch0) {
1661 self.with_error(Errors::error0006);
1663 TokenKind::ErrorToken
// Convenience wrappers selecting the lexing context for `scan_token`.
// Outside a type, `>` / `?:` etc. lex with their expression meanings.
1669 fn scan_token_outside_type(&mut self) -> TokenKind {
1670 self.scan_token(false)
// Inside a type argument list, `in_type` changes how `>` and `?:` are lexed.
1673 fn scan_token_inside_type(&mut self) -> TokenKind {
1674 self.scan_token(true)
1681 // white-space-character::
1683 // Space character (U+0020)
1684 // Horizontal-tab character (U+0009)
1686 // single-line-comment::
1687 // // input-characters-opt
1688 // # input-characters-opt
1691 // Carriage-return character (U+000D)
1692 // Line-feed character (U+000A)
1693 // Carriage-return character followed by line-feed character
// Static helper: given a byte slice and an index positioned on an
// end-of-line sequence, returns the index just past it; "\r\n" is
// treated as a single two-byte EOL. Panics when `i` is not on '\r'/'\n'.
1695 fn str_scan_end_of_line(s: &[u8], i: usize) -> usize {
1696 match s.get(i).map(|x| *x as char) {
1698 Some('\r') => match s.get(i + 1).map(|x| *x as char) {
// "\r\n" -> consume both bytes.
1699 Some('\n') => 2 + i,
1702 Some('\n') => i + 1,
1703 _ => panic!("str_scan_end_of_line called while not on end of line!"),
// Consumes one end-of-line sequence at the current position and returns an
// EOL trivia covering it ("\r\n" yields a single width-2 trivia).
// Panics when the current character is not a newline.
1707 fn scan_end_of_line(&mut self) -> Token::Trivia {
1708 match self.peek_char(0) {
// '\r' optionally followed by '\n' -> width 2, else width 1.
1710 let w = if self.peek_char(1) == '\n' { 2 } else { 1 };
1712 Token::Trivia::make_eol(self.source(), self.start, w)
1716 Token::Trivia::make_eol(self.source(), self.start, 1)
1718 _ => panic!("scan_end_of_line called while not on end of line!"),
// Scans a '#'-style comment to the end of the current line and wraps the
// consumed span as single-line-comment trivia.
1722 fn scan_hash_comment(&mut self) -> Token::Trivia {
1723 self.skip_to_end_of_line();
1724 Token::Trivia::make_single_line_comment(self.source(), self.start, self.width())
// Scans a `//` comment, classifying it as FALLTHROUGH trivia when the text
// after the slashes (ignoring whitespace) begins with "FALLTHROUGH".
1727 fn scan_single_line_comment(&mut self) -> Token::Trivia {
1728 // A fallthrough comment is two slashes, any amount of whitespace,
1729 // FALLTHROUGH, and any characters may follow.
1730 // TODO: Consider allowing lowercase fallthrough.
1733 self.skip_whitespace();
// Clone the lexer after the whitespace so the comment text can be inspected
// without disturbing the primary scan position.
1734 let lexer_ws = self.clone();
1735 self.skip_to_end_of_line_or_end_tag();
1736 let w = self.width();
1737 let remainder = self.offset - lexer_ws.offset;
// At least the 11 bytes of "FALLTHROUGH" must remain in the comment body.
1738 if remainder >= 11 && lexer_ws.peek_string(11) == b"FALLTHROUGH" {
1739 Token::Trivia::make_fallthrough(self.source(), self.start, w)
1741 Token::Trivia::make_single_line_comment(self.source(), self.start, w)
// Advances past the body of a /* ... */ comment, recording error0007 if the
// end of source is reached before the closing "*/".
1745 fn skip_to_end_of_delimited_comment(&mut self) {
1748 let ch0 = self.peek_char(offset);
1750 self.advance(offset);
// Unterminated comment: record the error and stop.
1752 return self.with_error(Errors::error0007);
1754 // TODO: Do we want to give a warning for an embedded zero char
1755 // inside a comment?
1758 } else if ch0 == '*' && (self.peek_char(offset + 1)) == '/' {
1759 return self.advance(offset + 2);
// Scans a /* ... */ comment and classifies it as HH_FIXME, HH_IGNORE_ERROR,
// or a plain delimited comment, based only on the marker text.
1766 fn scan_delimited_comment(&mut self) -> Token::Trivia {
1767 // The original lexer lexes a fixme / ignore error as:
1769 // slash star [whitespace]* HH_FIXME [whitespace or newline]* leftbracket
1770 // [whitespace or newline]* integer [any text]* star slash
1772 // Notice that the original lexer oddly enough does not verify that there
1773 // is a right bracket.
1775 // For our purposes we will just check for HH_FIXME / HH_IGNORE_ERROR;
1776 // a later pass can try to parse out the integer if there is one,
1777 // give a warning if there is not, and so on.
1780 self.skip_whitespace();
// Snapshot after leading whitespace so the marker can be matched in place.
1782 let lexer_ws = self.clone();
1783 self.skip_to_end_of_delimited_comment();
1784 let w = self.width();
1785 if lexer_ws.match_string(b"HH_FIXME") {
1786 Token::Trivia::make_fix_me(self.source(), self.start, w)
1787 } else if lexer_ws.match_string(b"HH_IGNORE_ERROR") {
1788 Token::Trivia::make_ignore_error(self.source(), self.start, w)
1790 Token::Trivia::make_delimited_comment(self.source(), self.start, w)
// Scans one piece of PHP-style trivia (comment, whitespace, or end-of-line)
// at the current position; returns None when the next character begins a
// real token instead.
1794 fn scan_php_trivia(&mut self) -> Option<Token::Trivia> {
1795 // Hack does not support PHP style embedded markup:
1803 // However, ?> is never legal in Hack, so we can treat ?> ... any text ... <?php
1804 // as a comment, and then give an error saying that this feature is not supported
1807 // TODO: Give an error if this appears in a Hack program.
1808 match self.peek_char(0) {
1810 self.start_new_lexeme();
// '#' comment runs to end of line.
1811 Some(self.scan_hash_comment())
1814 self.start_new_lexeme();
1815 match self.peek_char(1) {
1816 '/' => Some(self.scan_single_line_comment()),
1817 '*' => Some(self.scan_delimited_comment()),
// Whitespace run: measure with the static skipper, then record the span.
1822 let new_end = Self::str_skip_whitespace(self.source_text_string(), self.offset);
1823 let new_start = self.offset;
1825 Token::Trivia::make_whitespace(self.source(), new_start, new_end - new_start);
1826 self.with_start_offset(new_start, new_end);
1830 self.start_new_lexeme();
1831 Some(self.scan_end_of_line())
1834 self.start_new_lexeme();
// Scans trivia inside XHP contexts: only whitespace and end-of-line count
// as trivia here; anything else yields no trivia (None).
1841 fn scan_xhp_trivia(&mut self) -> Option<Token::Trivia> {
1842 // TODO: Should XHP comments <!-- --> be their own thing, or a kind of
1843 // trivia associated with a token? Right now they are the former.
1844 let i = self.offset;
1845 let ch = self.peek_char(0);
1848 let j = Self::str_skip_whitespace(self.source_text_string(), i);
1849 self.with_start_offset(i, j);
1850 Some(Token::Trivia::make_whitespace(self.source(), i, j - i))
1853 let j = Self::str_scan_end_of_line(self.source_text_string(), i);
1854 self.with_start_offset(i, j);
1855 Some(Token::Trivia::make_eol(self.source(), i, j - i))
1860 self.start_new_lexeme();
1866 // We divide trivia into "leading" and "trailing" trivia of an associated
1867 // token. This means that we must find a dividing line between the trailing trivia
1868 // following one token and the leading trivia of the following token. Plainly
1870 // we need only find this line while scanning trailing trivia. The heuristics
1871 // in use are:
1871 // * The first newline trivia encountered is the last trailing trivia.
1872 // * The newline which follows a // or # comment is not part of the comment
1873 // but does terminate the trailing trivia.
1874 // * A pragma to turn checks off (HH_FIXME and HH_IGNORE_ERROR) is
1875 // always a leading trivia.
// Repeatedly applies `scanner` and accumulates the trivia it produces;
// leading trivia extends until the scanner yields None.
1876 fn scan_leading_trivia(
1878 scanner: impl Fn(&mut Self) -> Option<Token::Trivia>,
1879 ) -> Vec<Token::Trivia> {
1880 let mut acc = vec![];
1881 while let Some(t) = scanner(self) {
// Leading PHP trivia: comments, whitespace and newlines before a token.
1887 pub fn scan_leading_php_trivia(&mut self) -> Vec<Token::Trivia> {
1888 self.scan_leading_trivia(&Self::scan_php_trivia)
// Leading XHP trivia: whitespace and newlines only.
1891 pub fn scan_leading_xhp_trivia(&mut self) -> Vec<Token::Trivia> {
1892 self.scan_leading_trivia(&Self::scan_xhp_trivia)
// Accumulates trailing trivia after a token. Each piece is scanned on a
// cloned lexer so trivia belonging to the NEXT token (e.g. a FixMe /
// IgnoreError pragma, which is always leading) can be left unconsumed;
// an end-of-line terminates the trailing trivia.
1895 fn scan_trailing_trivia(
1897 scanner: impl Fn(&mut Self) -> Option<Token::Trivia>,
1898 ) -> Vec<Token::Trivia> {
1899 let mut acc = vec![];
1901 let mut lexer1 = self.clone();
1902 match scanner(&mut lexer1) {
1904 self.continue_from(lexer1);
1907 Some(t) => match t.kind() {
1908 TriviaKind::EndOfLine => {
// A newline is the divider: adopt the clone's position and stop.
1909 self.continue_from(lexer1);
1913 TriviaKind::FixMe | TriviaKind::IgnoreError => {
1917 self.continue_from(lexer1);
// PHP / XHP front-ends for the generic trailing-trivia loop above.
1925 pub fn scan_trailing_php_trivia(&mut self) -> Vec<Token::Trivia> {
1926 self.scan_trailing_trivia(&Self::scan_php_trivia)
1929 pub fn scan_trailing_xhp_trivia(&mut self) -> Vec<Token::Trivia> {
1930 self.scan_trailing_trivia(&Self::scan_xhp_trivia)
// Lookahead on a clone: skips leading trivia and reports whether the next
// token starts with a name character.
1933 pub fn is_next_name(&self) -> bool {
1934 let mut lexer = self.clone();
1935 lexer.scan_leading_php_trivia();
1936 Self::is_name_nondigit(lexer.peek_char(0))
// Same lookahead, asking instead whether an XHP class name follows.
1939 pub fn is_next_xhp_class_name(&self) -> bool {
1940 let mut lexer = self.clone();
1941 lexer.scan_leading_php_trivia();
1942 lexer.is_xhp_class_name()
// If `text` is (case-insensitively) one of the keywords listed below,
// returns its lowercase spelling; otherwise None.
1945 fn as_case_insensitive_keyword(&self, text: &str) -> Option<String> {
1946 let lower = text.to_ascii_lowercase();
1947 let res = match lower.as_ref() {
1948 "__halt_compiler" | "abstract" | "and" | "array" | "as" | "bool" | "boolean"
1949 | "break" | "callable" | "case" | "catch" | "class" | "clone" | "const"
1950 | "continue" | "default" | "die" | "do" | "echo" | "else" | "elseif" | "empty"
1951 | "endfor" | "endforeach" | "endif" | "endswitch" | "endwhile" | "eval" | "exit"
1952 | "extends" | "false" | "final" | "finally" | "for" | "foreach" | "function"
1953 | "global" | "goto" | "if" | "implements" | "include" | "include_once" | "inout"
1954 | "instanceof" | "insteadof" | "int" | "integer" | "interface" | "isset" | "list"
1955 | "namespace" | "new" | "null" | "or" | "parent" | "print" | "private"
1956 | "protected" | "public" | "require" | "require_once" | "return" | "self"
1957 | "static" | "string" | "switch" | "throw" | "trait" | "try" | "true" | "unset"
1958 | "use" | "using" | "var" | "void" | "while" | "xor" | "yield" => Some(lower),
1961 res.map(|x| x.to_owned())
// True when a keyword was written in non-canonical (non-lowercase) casing
// that should be reported; "true"/"false"/"null" are exempt.
1964 fn lowercase_error(&self, original_text: &str, lowered_text: &str) -> bool {
1965 match lowered_text {
1966 "true" | "false" | "null" => false,
1967 _ => original_text != lowered_text,
// Re-classifies a Name token as a keyword when its (case-normalized) text
// matches one, recording an uppercase-keyword error for bad casing.
// `let` only counts as a keyword in experimental mode.
1971 fn as_keyword(&mut self, only_reserved: bool, kind: TokenKind) -> TokenKind {
1972 if kind == TokenKind::Name {
1973 let original_text = self.current_text_as_str();
1974 let text_as_lowercase_keyword = self.as_case_insensitive_keyword(original_text);
1975 let text = match text_as_lowercase_keyword.as_ref() {
1977 None => original_text,
1979 match TokenKind::from_string(&text.as_bytes(), only_reserved) {
1980 Some(TokenKind::Let) if (!(self.is_experimental_mode())) => TokenKind::Name,
1982 if self.lowercase_error(original_text, &text) {
1983 let err = Errors::uppercase_kw(original_text);
1984 self.with_error(err);
1988 _ => TokenKind::Name,
// Scans leading trivia and then the token itself, applying keyword
// classification per `as_name`; returns (kind, token width, leading trivia).
1995 fn scan_token_and_leading_trivia(
1997 scanner: impl Fn(&mut Self) -> TokenKind,
1999 ) -> (TokenKind, usize, Vec<Token::Trivia>) {
2000 // Get past the leading trivia
2001 let leading = self.scan_leading_php_trivia();
2002 // Remember where we were when we started this token
2003 self.start_new_lexeme();
2004 let kind = scanner(self);
2005 let kind = match as_name {
2006 KwSet::AllKeywords => kind,
// NOTE(review): NonReservedKeywords maps to as_keyword(true, ..) and
// NoKeywords to as_keyword(false, ..). The bool is `only_reserved`; this
// pairing looks counter-intuitive — confirm against KwSet's intended
// semantics before relying on it.
2007 KwSet::NonReservedKeywords => self.as_keyword(true, kind),
2008 KwSet::NoKeywords => self.as_keyword(false, kind),
2010 let w = self.width();
// Scans a complete token: leading trivia, the token, then trailing trivia.
// String-literal heads and `?>` receive special trailing handling.
2014 fn scan_token_and_trivia(
2016 scanner: &impl Fn(&mut Self) -> TokenKind,
2019 let token_start = self.offset;
2021 let (kind, w, leading) = self.scan_token_and_leading_trivia(scanner, as_name);
2022 let trailing = match kind {
// Inside a double-quoted string literal: attach no trailing trivia.
2023 TokenKind::DoubleQuotedStringLiteralHead => vec![],
2024 TokenKind::QuestionGreaterThan => {
2025 if Self::is_newline(self.peek_char(0)) {
2026 // consume only trailing EOL token after ?> as trailing trivia
2027 vec![self.scan_end_of_line()]
2032 _ => self.scan_trailing_php_trivia(),
2034 Token::make(kind, self.source(), token_start, w, leading, trailing)
// Runs `tokenizer` and asserts the lexer consumed input (or legitimately
// produced EndOfFile at the very end); otherwise panics, since a token
// of zero progress would make the parser loop forever.
2037 fn scan_assert_progress(&mut self, tokenizer: impl Fn(&mut Self) -> Token) -> Token {
2038 let original_remaining = self.remaining();
2039 let token = tokenizer(self);
2040 let new_remaining = self.remaining();
2041 if new_remaining < original_remaining
2042 || original_remaining == 0
2043 && new_remaining == 0
2044 && (token.kind()) == TokenKind::EndOfFile
2048 panic!("failed to make progress at {}\n", self.offset)
// (The fn header of scan_next_token falls outside this excerpt.)
// Combines scan_token_and_trivia with the progress assertion.
2054 scanner: impl Fn(&mut Self) -> TokenKind,
2057 let tokenizer = |x: &mut Self| x.scan_token_and_trivia(&scanner, as_name);
2058 self.scan_assert_progress(&tokenizer)
// Variants of scan_next_token with a fixed keyword policy.
2061 fn scan_next_token_as_name(&mut self, scanner: impl Fn(&mut Self) -> TokenKind) -> Token {
2062 self.scan_next_token(scanner, KwSet::AllKeywords)
2065 fn scan_next_token_as_keyword(&mut self, scanner: impl Fn(&mut Self) -> TokenKind) -> Token {
2066 self.scan_next_token(scanner, KwSet::NoKeywords)
2069 fn scan_next_token_nonreserved_as_name(
2071 scanner: impl Fn(&mut Self) -> TokenKind,
2073 self.scan_next_token(scanner, KwSet::NonReservedKeywords)
// Uncached token scan; selects the in-type or outside-type scanner (the
// selecting condition falls outside this excerpt — presumably `self.in_type`).
2076 fn next_token_impl(&mut self) -> Token {
2078 self.scan_next_token_as_keyword(&Self::scan_token_inside_type)
2080 self.scan_next_token_as_keyword(&Self::scan_token_outside_type)
// One-token lookahead. Serves from the shared cache when the cached
// pre-snapshot matches this lexer's state; otherwise scans on a clone
// and stores pre/post snapshots plus the token for later reuse.
2085 pub fn peek_next_token(&self) -> Token {
2087 let cache = self.cache.borrow();
2088 if let Some(cache) = cache.as_ref() {
// Cache hit: PartialEq compares start/offset/in_type (see snapshot impl).
2089 if cache.0 == *self {
2090 return cache.1.clone();
2095 let mut lexer = self.clone();
// Errors are cleared so the post-snapshot records only newly added ones
// (scanning forward must only ever add errors — see module invariant).
2096 lexer.errors = vec![];
2097 let before = lexer.to_lexer_pre_snapshot();
2098 let token = lexer.next_token_impl();
2099 let after = lexer.into_lexer_post_snapshot();
2101 .replace(Some(LexerCache(before, token.clone(), after)));
// Consuming counterpart of peek_next_token: on a cache hit, fast-forwards
// this lexer to the cached post-snapshot, appends any errors recorded
// during the cached scan, and returns the cached token; otherwise scans.
2105 pub fn next_token(&mut self) -> Token {
2107 let mut cache = self.cache.borrow_mut();
2108 if let Some(ref mut cache) = cache.deref_mut() {
2109 if cache.0 == *self {
// Adopt the post-snapshot position/state.
2110 self.start = (cache.2).start;
2111 self.offset = (cache.2).offset;
2112 self.in_type = (cache.2).in_type;
2113 if !(cache.2).errors.is_empty() {
2114 self.errors.append(&mut (cache.2).errors.clone());
2116 return cache.1.clone();
2120 self.next_token_impl()
// Scans the next token but attaches no trailing trivia (leading only).
2123 pub fn next_token_no_trailing(&mut self) -> Token {
2124 let tokenizer = |x: &mut Self| {
2125 let token_start = x.offset;
2126 let (kind, w, leading) =
2127 x.scan_token_and_leading_trivia(&Self::scan_token_outside_type, KwSet::NoKeywords);
// Empty vec: the trailing trivia is deliberately left for the next token.
2128 Token::make(kind, x.source(), token_start, w, leading, vec![])
2130 self.scan_assert_progress(&tokenizer)
// Continues lexing inside a string literal; no leading trivia is scanned,
// and trailing trivia is taken only once the string has terminated.
2133 pub fn next_token_in_string(&mut self, literal_kind: &StringLiteralKind) -> Token {
2134 let token_start = self.offset;
2135 self.start_new_lexeme();
2136 // We're inside a string. Do not scan leading trivia.
2137 let kind = self.scan_string_literal_in_progress(literal_kind);
2138 let w = self.width();
2139 // Only scan trailing trivia if we've finished the string.
2140 let trailing = match kind {
2141 TokenKind::DoubleQuotedStringLiteralTail | TokenKind::HeredocStringLiteralTail => {
2142 self.scan_trailing_php_trivia()
2146 Token::make(kind, self.source(), token_start, w, vec![], trailing)
// Scans a heredoc header (<<<NAME), returning the header token and the
// heredoc name bytes; trailing trivia is deliberately not scanned.
2149 pub fn next_docstring_header(&mut self) -> (Token, &'a [u8]) {
2150 // We're at the beginning of a heredoc string literal. Scan leading
2151 // trivia but not trailing trivia.
2152 let token_start = self.offset;
2153 let leading = self.scan_leading_php_trivia();
2154 self.start_new_lexeme();
2155 let (name, _) = self.scan_docstring_header();
2156 let w = self.width();
2157 let token = Token::make(
2158 TokenKind::HeredocStringLiteralHead,
// Public wrappers over the generic scanner with fixed keyword policies.
2168 pub fn next_token_as_name(&mut self) -> Token {
2169 self.scan_next_token_as_name(&Self::scan_token_outside_type)
2172 pub fn next_token_non_reserved_as_name(&mut self) -> Token {
2173 self.scan_next_token_nonreserved_as_name(&Self::scan_token_outside_type)
// Scans an XHP element token, optionally suppressing trailing trivia after
// `>` / `/>` so that following whitespace becomes leading trivia of the
// XHP body. Returns the token together with its source text slice.
2176 pub fn next_xhp_element_token(&mut self, no_trailing: bool) -> (Token, &[u8]) {
2177 // XHP elements have whitespace, newlines and Hack comments.
2178 let tokenizer = |lexer: &mut Self| {
2179 let token_start = lexer.offset;
2180 let (kind, w, leading) =
2181 lexer.scan_token_and_leading_trivia(&Self::scan_xhp_token, KwSet::AllKeywords);
2182 // We do not scan trivia after an XHPOpen's >. If that is the beginning of
2183 // an XHP body then we want any whitespace or newlines to be leading trivia
2184 // of the body token.
2186 TokenKind::GreaterThan | TokenKind::SlashGreaterThan if no_trailing => {
2187 Token::make(kind, lexer.source(), token_start, w, leading, vec![])
2190 let trailing = lexer.scan_trailing_php_trivia();
2191 Token::make(kind, lexer.source(), token_start, w, leading, trailing)
2195 let token = self.scan_assert_progress(&tokenizer);
2196 let token_width = token.width();
2197 let trailing_width = token.trailing_width();
// Recover the token's start by backing out the trailing trivia and the
// token width from the current offset.
2198 let token_start_offset = (self.offset) - trailing_width - token_width;
2199 let token_text = self.source.sub(token_start_offset, token_width);
// Scans an XHP body token; trivia is significant inside XHP bodies, so
// trailing trivia is kept only on genuine XHPBody tokens.
2203 pub fn next_xhp_body_token(&mut self) -> Token {
2204 let scanner = |lexer: &mut Self| {
2205 let token_start = lexer.offset;
2206 let leading = lexer.scan_leading_xhp_trivia();
2207 lexer.start_new_lexeme();
2208 let kind = lexer.scan_xhp_body();
2209 let w = lexer.width();
2211 // Trivia (leading and trailing) is semantically
2212 // significant for XHPBody tokens. When we find elements or
2213 // braced expressions inside the body, the trivia should be
2214 // seen as leading the next token, but we should certainly
2215 // keep it trailing if this is an XHPBody token.
2216 if kind == TokenKind::XHPBody
2217 { lexer.scan_trailing_xhp_trivia() }
2219 Token::make(kind, lexer.source(), token_start, w, leading, trailing)
2221 self.scan_assert_progress(&scanner)
// Scans the next token as an XHP class name / XHP element name; neither
// treats any spelling as a keyword.
2224 pub fn next_xhp_class_name(&mut self) -> Token {
2225 self.scan_token_and_trivia(&Self::scan_xhp_class_name, KwSet::NoKeywords)
2228 pub fn next_xhp_name(&mut self) -> Token {
2229 let scanner = |x: &mut Self| x.scan_xhp_element_name(false);
2230 self.scan_token_and_trivia(&scanner, KwSet::NoKeywords)
// Builds the markup token for leading inline text (body outside excerpt).
2233 fn make_markup_token(&self) -> Token {
// Parameters/body of make_long_tag (header partially outside excerpt):
// constructs the `<?hh` / `<?php`-style suffix, attaching trailing trivia
// to the language-name token.
2246 name_token_offset: usize,
2249 less_than_question_token: Token,
2250 ) -> (Token, Option<(Token, Option<Token>)>) {
2253 // single line comments that follow the language in leading markup_text
2254 // determine the file check mode, read the trailing trivia and attach it
2255 // to the language token
2256 let trailing = self.scan_trailing_php_trivia();
2257 let name = Token::make(
2265 (markup_text, Some((less_than_question_token, Some(name))))
// After the markup text, builds the `<?` token and classifies the suffix:
// `<?hh`, `<?php`, `<?=` (an Equal token), or a bare `<?`.
2268 fn make_markup_and_suffix(&mut self) -> (Token, Option<(Token, Option<Token>)>) {
2269 let markup_text = self.make_markup_token();
2270 let less_than_question_token = Token::make(
2271 TokenKind::LessThanQuestion,
2280 let name_token_offset = self.offset;
// Language names are matched case-insensitively.
2281 let ch0 = self.peek_char(0).to_ascii_lowercase();
2282 let ch1 = self.peek_char(1).to_ascii_lowercase();
2283 let ch2 = self.peek_char(2).to_ascii_lowercase();
2284 match (ch0, ch1, ch2) {
2286 self.make_long_tag(name_token_offset, 2, markup_text, less_than_question_token)
2288 ('p', 'h', 'p') => {
2289 self.make_long_tag(name_token_offset, 3, markup_text, less_than_question_token)
2294 let equal = Token::make(
2303 (markup_text, Some((less_than_question_token, Some(equal))))
2305 _ => (markup_text, Some((less_than_question_token, (None)))),
// Finds where the leading markup section ends: skips an optional `#!`
// shebang line and leading whitespace, then checks for a `<?` open tag.
2309 fn skip_to_end_of_markup(&mut self) -> (Token, Option<(Token, Option<Token>)>) {
2310 let start_offset = {
2311 // if leading section starts with #! - it should span the entire line
2312 let index = self.offset;
// Header lexing only makes sense at offset 0.
2314 panic!("Should only try to lex header at start of document")
2316 if self.peek_def(index, INVALID) == '#' && self.peek_def(index + 1, INVALID) == '!' {
// Shebang: consume the whole first line (+1 steps past the newline).
2317 self.skip_while_to_offset(&Self::not_newline) + 1
2319 // this should really just be `index` - but, skip whitespace as the FFP
2320 // tests use magic comments in leading markup to set flags, but blank
2321 // them out before parsing; the newlines are kept to provide correct line
2322 // numbers in errors
2323 self.skip_while_to_offset(&|x| {
2324 Self::is_newline(x) || Self::is_whitespace_no_newline(x)
2328 if self.peek(start_offset) == '<' && self.peek_def(start_offset + 1, INVALID) == '?' {
2329 self.with_offset(start_offset);
2330 self.make_markup_and_suffix()
2332 (self.make_markup_token(), None)
// Entry point for lexing the file header (markup text + optional `<?` suffix).
2336 pub fn scan_header(&mut self) -> (Token, Option<(Token, Option<Token>)>) {
2337 self.start_new_lexeme();
2338 self.skip_to_end_of_markup()
// Lookahead on a clone: is the next token an XHP category name, i.e. a
// '%' immediately followed by a name character?
2341 pub fn is_next_xhp_category_name(&self) -> bool {
2342 let mut lexer = self.clone();
2343 let _ = lexer.scan_leading_php_trivia();
2344 // An XHP category is an xhp element name preceded by a %.
2345 let ch0 = lexer.peek_char(0);
2346 let ch1 = lexer.peek_char(1);
2347 ch0 == '%' && Self::is_name_nondigit(ch1)
// Scans an XHP category name ('%' + element name) when one is next;
// otherwise falls back to ordinary (outside-type) token scanning.
2350 fn scan_xhp_category_name(&mut self) -> TokenKind {
2351 if self.is_next_xhp_category_name() {
2353 let _ = self.scan_xhp_element_name(false);
2354 TokenKind::XHPCategoryName
2356 self.scan_token(false)
// Public wrapper adding trivia handling around the category-name scanner.
2360 pub fn next_xhp_category_name(&mut self) -> Token {
2361 self.scan_token_and_trivia(&Self::scan_xhp_category_name, KwSet::NoKeywords)
// __halt_compiler handling: packs everything remaining in the source into
// a single "after halt compiler" trailing trivia attached to `last_token`,
// then moves the lexer to end of input.
2364 pub fn rescan_halt_compiler(&mut self, last_token: Token) -> Token {
2365 // __halt_compiler stops parsing of the file.
2366 // In order to preserve the full-fidelity aspect of the parser
2367 // we pack everything that follows __halt_compiler as
2368 // separate opaque kind of trivia - it will be attached as a trailing trivia
2369 // to the last_token and existing trailing trivia will be merged in.
2371 // This is incorrect for minimal token
2372 let leading_start_offset = last_token.leading_start_offset().unwrap_or(0);
// End of the token proper = leading start + leading width + token width.
2373 let start_offset = leading_start_offset + last_token.leading_width() + last_token.width();
2375 let length = self.source.length();
2376 let trailing = Token::Trivia::make_after_halt_compiler(
2379 length - start_offset,
2381 self.with_offset(length);
2382 last_token.with_trailing(vec![trailing])