1 // Copyright (c) 2019, Facebook, Inc.
2 // All rights reserved.
4 // This source code is licensed under the MIT license found in the
5 // LICENSE file in the "hack" directory of this source tree.
7 use parser_core_types::{
8 lexable_token::LexableToken,
9 lexable_trivia::{LexableTrivia, LexableTrivium},
10 source_text::{SourceText, INVALID},
11 syntax_error::{self as Errors, Error, SyntaxError},
12 token_factory::{TokenFactory, Trivia, Trivium},
13 token_kind::TokenKind,
14 trivia_factory::TriviaFactory,
15 trivia_kind::TriviaKind,
17 use static_assertions::*;
19 use std::cell::RefCell;
20 use std::ops::DerefMut;
24 struct LexerPreSnapshot {
31 struct LexerPostSnapshot {
35 errors: Vec<SyntaxError>,
38 impl<'a, TF> PartialEq<Lexer<'a, TF>> for LexerPreSnapshot
42 fn eq(&self, other: &Lexer<'a, TF>) -> bool {
43 self.start == other.start && self.offset == other.offset && self.in_type == other.in_type
50 One token look ahead in parser is implemented by `parser.peek_token()` ... `parser.next_token()`.
51 Re-scanning in next_token can be avoided by caching the result of `peek_token`, consecutive
52 `peek_token`s can also get improved.
54 `Lexer.peek_next_token()` checks cache first if cache misses it will clone of the current lexer and
55 call next_token on cloned lexer. To cache the result, it takes a snapshot of lexer state before and
56 after calling next_token, and store them in current lexer.
58 Clone trait of Lexer is derived automatically, therefore `cache: Rc<...>` is also cloned. `Rc` ensures
59 cloned lexer and original lexer share the same cache, this is intended! Other than one token look
60 ahead still clones parser, therefore lexer get cloned, sharing cache allows cloned lexer uses
61 cache from original lexer and vise versa. It is measured that 2% faster than not sharing cache.
63 NOTE: There is an invariant assumed by this caching mechanism. `errors` in `Lexer` can only add new errors
64 and must not remove any error when scanning forward! `Lexer.peek_next_token()` clones a new `Lexer` and
65 reset `errors` to empty, look ahead may accumulate new errors and these errors will be appended to the original
66 `Lexer`. The reason we need this invariant is that between `peek_next_token` and `next_token` we can not
67 prove no new error added. Actually it is observed that new errors are added between these two calls.
70 struct LexerCache<Token>(LexerPreSnapshot, Token, LexerPostSnapshot);
72 #[derive(Debug, Clone)]
73 pub struct Lexer<'a, TF>
77 source: SourceText<'a>,
80 errors: Vec<SyntaxError>,
83 cache: Rc<RefCell<Option<LexerCache<TF::Token>>>>,
86 #[derive(Debug, PartialEq)]
87 pub enum StringLiteralKind {
89 LiteralHeredoc { heredoc: Vec<u8> },
92 #[derive(Debug, Copy, Clone)]
99 macro_rules! as_case_insensitive_keyword {
100 ($size:tt, $size_type:ty $(, $keyword:tt)+) => {
101 fn as_case_insensitive_keyword(&self, text: &str) -> Option<(&'static str, bool)> {
102 use heapless::consts::*;
104 // - The $size should be greater than or equal to the each length of keyword
105 // - The $size should be equal to at least one of the length of a keyword
106 // Therefore, $size is equal to the length of the longest keyword.
108 const_assert!($size >= $keyword.len());
112 $size == $keyword.len() ||
117 if text.len() > $size {
120 let mut t: heapless::String<$size_type> = text.into();
121 let t: &mut str = t.as_mut_str();
122 t.make_ascii_lowercase();
123 let has_upper = t != text;
124 let t: &str = t as &str;
127 $keyword => Some(($keyword, has_upper)),
136 impl<'a, TF> Lexer<'a, TF>
140 fn to_lexer_pre_snapshot(&self) -> LexerPreSnapshot {
144 in_type: self.in_type,
148 fn into_lexer_post_snapshot(self) -> LexerPostSnapshot {
152 in_type: self.in_type,
157 pub fn make_at(source: &SourceText<'a>, offset: usize, token_factory: TF) -> Self {
159 source: source.clone(),
164 cache: Rc::new(RefCell::new(None)),
169 pub fn make(source: &SourceText<'a>, token_factory: TF) -> Self {
170 Self::make_at(source, 0, token_factory)
173 fn continue_from(&mut self, l: Lexer<'a, TF>) {
174 self.start = l.start;
175 self.offset = l.offset;
176 self.errors = l.errors
179 pub fn start(&self) -> usize {
183 pub fn offset(&self) -> usize {
187 pub fn errors(&self) -> &[SyntaxError] {
191 fn with_error(&mut self, error: Error) {
192 let error = SyntaxError::make(self.start(), self.offset(), error);
193 self.errors.push(error)
196 fn with_offset(&mut self, offset: usize) {
200 fn with_start_offset(&mut self, start: usize, offset: usize) {
202 self.offset = offset;
205 fn start_new_lexeme(&mut self) {
206 self.start = self.offset
209 pub fn advance(&mut self, i: usize) {
213 pub fn set_in_type(&mut self, in_type: bool) {
214 self.in_type = in_type
217 pub fn source(&self) -> &SourceText<'a> {
221 fn source_text_string(&self) -> &[u8] {
227 pub fn peek_char(&self, index: usize) -> char {
228 self.source.get(self.offset() + index)
231 fn peek_string(&self, size: usize) -> &[u8] {
232 self.source.sub(self.offset, size)
235 fn match_string(&self, s: &[u8]) -> bool {
236 s == self.peek_string(s.len())
239 fn width(&self) -> usize {
240 self.offset - self.start
243 fn current_text(&self) -> &[u8] {
244 self.source.sub(self.start, self.width())
247 fn current_text_as_str(&self) -> &str {
248 unsafe { std::str::from_utf8_unchecked(self.current_text()) }
251 fn at_end(&self) -> bool {
252 self.offset() >= self.source.length()
255 fn remaining(&self) -> usize {
256 let r = (self.source.length() as isize) - (self.offset as isize);
257 if r < 0 { 0 } else { r as usize }
260 fn peek(&self, i: usize) -> char {
264 fn peek_back(&self, index: usize) -> char {
265 self.source.get(self.offset() - index)
268 fn peek_def(&self, index: usize, default: char) -> char {
269 if index >= self.source.length() {
272 self.source.get(index)
276 // Character classification
278 fn is_whitespace_no_newline(c: char) -> bool {
285 fn is_newline(ch: char) -> bool {
292 fn is_binary_digit(ch: char) -> bool {
/// True for ASCII octal digits `'0'..='7'`.
fn is_octal_digit(c: char) -> bool {
    ('0'..='7').contains(&c)
}
/// True for ASCII decimal digits `'0'..='9'`.
fn is_decimal_digit(ch: char) -> bool {
    // is_ascii_digit is exactly the '0'..='9' range test.
    ch.is_ascii_digit()
}
/// True for ASCII hexadecimal digits: `0-9`, `a-f`, `A-F`.
fn is_hexadecimal_digit(c: char) -> bool {
    // is_ascii_hexdigit covers exactly 0-9 | a-f | A-F.
    c.is_ascii_hexdigit()
}
/// Can `c` begin a name? Underscore, ASCII letters, and every character
/// at or above `\x7f` (DEL and all non-ASCII) qualify.
fn is_name_nondigit(c: char) -> bool {
    // is_ascii_alphabetic covers exactly a-z | A-Z.
    c == '_' || c.is_ascii_alphabetic() || '\x7f' <= c
}
315 fn is_name_letter(c: char) -> bool {
317 || ('0'..='9').contains(&c)
318 || ('a'..='z').contains(&c)
319 || ('A'..='Z').contains(&c)
325 fn skip_while_to_offset(&self, p: impl Fn(char) -> bool) -> usize {
326 let n = self.source.length();
327 let mut i = self.offset();
328 while i < n && p(self.peek(i)) {
334 // advance offset as long as the predicate is true
335 fn skip_while(&mut self, p: impl Fn(char) -> bool) {
336 self.with_offset(self.skip_while_to_offset(p))
/// Slice analogue of `skip_while_to_offset`: starting at index `i`,
/// advance over bytes of `s` (each viewed as a `char`) while `p` holds;
/// returns the first index where it fails, or `s.len()`.
fn str_skip_while(s: &[u8], mut i: usize, p: impl Fn(char) -> bool) -> usize {
    let n = s.len();
    while i < n && p(s[i] as char) {
        i += 1;
    }
    i
}
350 fn skip_whitespace(&mut self) {
351 self.skip_while(&Self::is_whitespace_no_newline);
354 fn str_skip_whitespace(s: &[u8], i: usize) -> usize {
355 Self::str_skip_while(s, i, &Self::is_whitespace_no_newline)
358 fn not_newline(ch: char) -> bool {
359 !(Self::is_newline(ch))
362 fn skip_to_end_of_line(&mut self) {
363 self.skip_while(&Self::not_newline)
366 fn skip_name_end(&mut self) {
367 self.skip_while(&Self::is_name_letter)
370 fn skip_end_of_line(&mut self) {
371 match self.peek_char(0) {
372 '\n' => self.advance(1),
374 if self.peek_char(1) == '\n' {
384 fn scan_name_impl(&mut self) {
385 assert!(Self::is_name_nondigit(self.peek_char(0)));
387 self.skip_name_end();
390 fn scan_name(&mut self) -> TokenKind {
391 self.scan_name_impl();
395 fn scan_variable(&mut self) -> TokenKind {
396 assert_eq!('$', self.peek_char(0));
398 self.scan_name_impl();
402 fn scan_with_underscores(&mut self, accepted_char: impl Fn(char) -> bool) {
403 let n = self.source.length();
404 let peek_def = |i| if i < n { self.peek(i) } else { INVALID };
405 let mut i = self.offset();
407 let ch = self.peek(i);
408 if accepted_char(ch) {
410 } else if ch == ' ' && accepted_char(peek_def(i + 1)) {
419 fn scan_decimal_digits(&mut self) {
420 self.skip_while(&Self::is_decimal_digit)
423 fn scan_decimal_digits_with_underscores(&mut self) {
424 self.scan_with_underscores(&Self::is_decimal_digit);
427 fn scan_octal_digits(&mut self) {
428 self.skip_while(&Self::is_octal_digit)
431 fn scan_octal_digits_with_underscores(&mut self) {
432 self.scan_with_underscores(&Self::is_octal_digit)
435 fn scan_binary_digits_with_underscores(&mut self) {
436 self.scan_with_underscores(&Self::is_binary_digit)
439 fn scan_hexadecimal_digits(&mut self) {
440 self.skip_while(&Self::is_hexadecimal_digit)
443 fn scan_hexadecimal_digits_with_underscores(&mut self) {
444 self.scan_with_underscores(&Self::is_hexadecimal_digit)
447 fn scan_hex_literal(&mut self) -> TokenKind {
448 let ch = self.peek_char(0);
449 if !Self::is_hexadecimal_digit(ch) {
450 self.with_error(Errors::error0001);
451 TokenKind::HexadecimalLiteral
453 self.scan_hexadecimal_digits_with_underscores();
454 TokenKind::HexadecimalLiteral
458 fn scan_binary_literal(&mut self) -> TokenKind {
459 let ch = self.peek_char(0);
460 if !Self::is_binary_digit(ch) {
461 self.with_error(Errors::error0002);
462 TokenKind::BinaryLiteral
464 self.scan_binary_digits_with_underscores();
465 TokenKind::BinaryLiteral
469 fn scan_exponent(&mut self) -> TokenKind {
470 let ch = self.peek_char(1);
471 if ch == '+' || ch == '-' {
476 let ch = self.peek_char(0);
477 if !Self::is_decimal_digit(ch) {
478 self.with_error(Errors::error0003);
479 TokenKind::FloatingLiteral
481 self.scan_decimal_digits();
482 TokenKind::FloatingLiteral
486 fn scan_after_decimal_point(&mut self) -> TokenKind {
488 self.scan_decimal_digits();
489 let ch = self.peek_char(0);
490 if ch == 'e' || ch == 'E' {
493 TokenKind::FloatingLiteral
497 fn scan_octal_or_float(&mut self) -> TokenKind {
498 // We've scanned a leading zero.
499 // We have an irritating ambiguity here. 09 is not a legal octal or
500 // floating literal, but 09e1 and 09.1 are.
502 let ch = self.peek_char(0);
507 self.scan_after_decimal_point()
514 _ if ('0'..='9').contains(&ch) => {
516 let mut lexer_oct = self.clone();
517 lexer_oct.scan_octal_digits();
519 let mut lexer_dec = self.clone();
520 lexer_dec.scan_decimal_digits();
521 if (lexer_oct.width()) == (lexer_dec.width()) {
522 // Only octal digits. Could be an octal literal, or could
524 let ch = lexer_oct.peek_char(0);
525 if ch == 'e' || ch == 'E' {
526 self.continue_from(lexer_oct);
528 } else if ch == '.' {
529 self.continue_from(lexer_oct);
530 self.scan_after_decimal_point()
532 // This is irritating - we only want to allow underscores for integer
533 // literals. Deferring the lexing with underscores here allows us to
534 // make sure we're not dealing with floats.
535 self.continue_from(lexer_oct);
536 self.scan_octal_digits_with_underscores();
537 TokenKind::OctalLiteral
540 // We had decimal digits following a leading zero; this is either a
541 // float literal or an octal to be truncated at the first non-octal
543 let ch = lexer_dec.peek_char(0);
544 if ch == 'e' || ch == 'E' {
545 self.continue_from(lexer_dec);
547 } else if ch == '.' {
548 self.continue_from(lexer_dec);
549 self.scan_after_decimal_point()
551 // an octal to be truncated at the first non-octal digit
552 // Again we differ the lexing with underscores here
553 self.scan_decimal_digits_with_underscores();
554 TokenKind::OctalLiteral
559 // 0 is a decimal literal
561 TokenKind::DecimalLiteral
566 fn scan_decimal_or_float(&mut self) -> TokenKind {
567 // We've scanned a leading non-zero digit.
568 let mut lexer_no_underscores = self.clone();
569 lexer_no_underscores.scan_decimal_digits();
570 let mut lexer_with_underscores = self.clone();
571 lexer_with_underscores.scan_decimal_digits_with_underscores();
572 let ch = lexer_no_underscores.peek_char(0);
577 self.continue_from(lexer_no_underscores);
578 self.scan_after_decimal_point()
583 self.continue_from(lexer_no_underscores);
589 self.continue_from(lexer_with_underscores);
590 TokenKind::DecimalLiteral
595 fn scan_single_quote_string_literal(&mut self) -> TokenKind {
596 // TODO: What about newlines embedded?
598 // single-quoted-string-literal::
599 // b-opt ' sq-char-sequence-opt '
601 // TODO: What is this b-opt? We don't lex an optional 'b' before a literal.
603 // sq-char-sequence::
605 // sq-char-sequence sq-char
608 // sq-escape-sequence
609 // \opt any character except single-quote (') or backslash (\)
611 // sq-escape-sequence:: one of
613 let n = self.source.length();
614 let peek = |x| self.source.get(x);
616 let mut has_error0012 = false;
617 let mut has_error0006 = false;
619 let mut i = 1 + self.offset();
620 let new_offset = loop {
622 has_error0012 = true;
628 has_error0006 = true;
632 '\'' => break (1 + i),
639 self.with_error(Errors::error0006)
642 self.with_error(Errors::error0012)
645 self.with_offset(new_offset);
646 TokenKind::SingleQuotedStringLiteral
649 fn scan_hexadecimal_escape(&mut self) {
650 let ch2 = self.peek_char(2);
651 let ch3 = self.peek_char(3);
652 if !(Self::is_hexadecimal_digit(ch2)) {
653 // TODO: Consider producing an error for a malformed hex escape
654 // let lexer = with_error lexer SyntaxError.error0005 in
656 } else if !(Self::is_hexadecimal_digit(ch3)) {
657 // let lexer = with_error lexer SyntaxError.error0005 in
664 fn scan_unicode_escape(&mut self) {
665 // At present the lexer is pointing at \u
666 if self.peek_char(2) == '{' {
667 if self.peek_char(3) == '$' {
668 // We have a malformed unicode escape that contains a possible embedded
669 // expression. Eat the \u and keep on processing the embedded expression.
670 // TODO: Consider producing a warning for a malformed unicode escape.
673 // We have a possibly well-formed escape sequence, and at least we know
674 // that it is not an embedded expression.
675 // TODO: Consider producing an error if the digits are out of range
676 // of legal Unicode characters.
677 // TODO: Consider producing an error if there are no digits.
678 // Skip over the slash, u and brace, and start lexing the number.
680 self.scan_hexadecimal_digits();
681 let ch = self.peek_char(0);
683 // TODO: Consider producing a warning for a malformed unicode escape.
690 // We have a malformed unicode escape sequence. Bail out.
691 // TODO: Consider producing a warning for a malformed unicode escape.
696 fn skip_uninteresting_double_quote_like_string_characters(&mut self) {
697 let is_uninteresting = |ch| match ch {
698 INVALID | '\\' | '$' | '{' | '[' | ']' | '-' => false,
699 ch if ('0'..='9').contains(&ch) => false,
700 ch => ch != '"' && !Self::is_name_nondigit(ch),
702 self.skip_while(&is_uninteresting);
705 fn scan_integer_literal_in_string(&mut self) -> TokenKind {
706 if self.peek_char(0) == '0' {
707 match self.peek_char(1) {
710 self.scan_hex_literal()
714 self.scan_binary_literal()
717 // An integer literal starting with 0 in a string will actually
718 // always be treated as a string index in HHVM, and not as an octal.
719 // In such a case, HHVM actually scans all decimal digits to create the
720 // token. TODO: (kasper) T40381519 we may want to change this behavior to something more
722 self.scan_decimal_digits_with_underscores();
723 TokenKind::DecimalLiteral
727 self.scan_decimal_digits_with_underscores();
728 TokenKind::DecimalLiteral
732 fn scan_double_quote_like_string_literal_from_start(&mut self) -> TokenKind {
733 let literal_token_kind = TokenKind::DoubleQuotedStringLiteral;
734 let head_token_kind = TokenKind::DoubleQuotedStringLiteralHead;
737 // If there's nothing interesting in this double-quoted string then
738 // we can just hand it back as-is.
739 self.skip_uninteresting_double_quote_like_string_characters();
740 match self.peek_char(0) {
742 // If the string is unterminated then give an error; if this is an
743 // embedded zero character then give an error and recurse; we might
744 // be able to make more progress.
746 self.with_error(Errors::error0012);
747 break literal_token_kind;
749 self.with_error(Errors::error0006);
754 // We made it to the end without finding a special character.
756 break literal_token_kind;
759 // We've found a backslash, dollar or brace.
761 break head_token_kind;
767 fn is_heredoc_tail(&self, name: &[u8]) -> bool {
768 // A heredoc tail is the identifier immediately preceded by a newline
769 // and immediately followed by an optional semi and then a newline.
771 // Note that the newline and optional semi are not part of the literal;
772 // the literal's lexeme ends at the end of the name. Either there is
773 // no trivia and the next token is a semi-with-trailing-newline, or
774 // the trailing trivia is a newline.
776 // This odd rule is to ensure that both
786 // . "something else";
789 if !(Self::is_newline(self.peek_back(1))) {
792 let len = name.len();
793 let ch0 = self.peek_char(len);
794 let ch1 = self.peek_char(len + 1);
795 ((Self::is_newline(ch0)) || ch0 == ';' && (Self::is_newline(ch1)))
796 && self.peek_string(len) == name
800 fn get_tail_token_kind(&self, literal_kind: &StringLiteralKind) -> TokenKind {
802 StringLiteralKind::LiteralHeredoc { .. } => TokenKind::HeredocStringLiteralTail,
803 StringLiteralKind::LiteralDoubleQuoted => TokenKind::DoubleQuotedStringLiteralTail,
807 fn get_string_literal_body_or_double_quoted_tail(
809 literal_kind: &StringLiteralKind,
811 if literal_kind == &StringLiteralKind::LiteralDoubleQuoted {
812 TokenKind::DoubleQuotedStringLiteralTail
814 TokenKind::StringLiteralBody
818 fn scan_string_literal_in_progress(&mut self, literal_kind: &StringLiteralKind) -> TokenKind {
819 let (is_heredoc, name): (bool, &[u8]) = match literal_kind {
820 StringLiteralKind::LiteralHeredoc { heredoc } => (true, heredoc),
823 let ch0 = self.peek_char(0);
824 if Self::is_name_nondigit(ch0) {
825 if is_heredoc && (self.is_heredoc_tail(name)) {
826 self.scan_name_impl();
827 TokenKind::HeredocStringLiteralTail
829 self.scan_name_impl();
836 self.with_error(Errors::error0012);
837 self.get_tail_token_kind(literal_kind)
839 self.with_error(Errors::error0006);
841 self.skip_uninteresting_double_quote_like_string_characters();
842 TokenKind::StringLiteralBody
846 let kind = self.get_string_literal_body_or_double_quoted_tail(literal_kind);
851 if Self::is_name_nondigit(self.peek_char(1)) {
863 match self.peek_char(1) {
864 // In these cases we just skip the escape sequence and
865 // keep on scanning for special characters.
866 | '\\' | '"' | '$' | 'e' | 'f' | 'n' | 'r' | 't' | 'v' | '`'
867 // Same in these cases; there might be more octal characters following but
868 // if there are, we'll just eat them as normal characters.
869 | '0' | '1' | '2' | '3' | '4' | '5' | '6' | '7' => {
871 self.skip_uninteresting_double_quote_like_string_characters();
872 TokenKind::StringLiteralBody}
874 self.scan_hexadecimal_escape();
875 self.skip_uninteresting_double_quote_like_string_characters();
876 TokenKind::StringLiteralBody }
878 self.scan_unicode_escape();
879 self.skip_uninteresting_double_quote_like_string_characters();
880 TokenKind::StringLiteralBody }
882 // The rules for escaping open braces in Hack are bizarre. Suppose we
887 // What is the value of $z? Naively you would think that the backslash
888 // escapes the braces, and the variables are embedded, so {123,456}. But
889 // that's not what happens. Yes, the backslash makes the brace no longer
890 // the opening brace of an expression. But the backslash is still part
891 // of the string! This is the string \{123,456\}.
892 // TODO: We might want to fix this because this is very strange.
893 // Eat the backslash and the brace.
895 TokenKind::StringLiteralBody
898 // TODO: A backslash followed by something other than an escape sequence
899 // is legal in hack, and treated as though it was just the backslash
900 // and the character. However we might consider making this a warning.
901 // It is particularly egregious when we have something like:
904 // The author of the code likely means the backslash to mean line
905 // continuation but in fact it just means to put a backslash and newline
908 self.skip_uninteresting_double_quote_like_string_characters();
909 TokenKind::StringLiteralBody
915 TokenKind::LeftBracket
919 TokenKind::RightBracket
922 if (self.peek_char(1)) == '>' {
924 TokenKind::MinusGreaterThan
926 // Nothing interesting here. Skip it and find the next
927 // interesting character.
929 self.skip_uninteresting_double_quote_like_string_characters();
930 TokenKind::StringLiteralBody
933 ch if ('0'..='9').contains(&ch) => {
934 let mut lexer1 = self.clone();
935 let literal = lexer1.scan_integer_literal_in_string();
937 if self.errors.len() == lexer1.errors.len() {
938 self.continue_from(lexer1);
941 // If we failed to scan a literal, do not interpret the literal
942 self.with_offset(lexer1.offset());
943 TokenKind::StringLiteralBody
947 // Nothing interesting here. Skip it and find the next
948 // interesting character.
950 self.skip_uninteresting_double_quote_like_string_characters();
951 TokenKind::StringLiteralBody
956 // A heredoc string literal has the form
964 // <<< (optional whitespace) name (no whitespace) (newline)
966 // The optional body is:
968 // any characters whatsoever including newlines (newline)
972 // (no whitespace) name (no whitespace) (optional semi) (no whitespace) (newline)
974 // The names must be identical. The trailing semi and newline must be present.
976 // The body is any and all characters, up to the first line that exactly matches
979 // The body may contain embedded expressions.
981 // A nowdoc string literal has the same form except that the first name is
982 // enclosed in single quotes, and it may not contain embedded expressions.
983 fn scan_docstring_name_actual(&mut self) -> &'a [u8] {
984 let ch = self.peek_char(0);
985 if Self::is_name_nondigit(ch) {
986 let start_offset = self.offset();
988 self.skip_name_end();
989 self.source.sub(start_offset, self.offset() - start_offset)
991 self.with_error(Errors::error0008);
996 fn scan_docstring_name(&mut self) -> (&'a [u8], TokenKind) {
997 self.skip_whitespace();
998 let ch = self.peek_char(0);
999 let kind = if ch == '\'' {
1000 TokenKind::NowdocStringLiteral
1002 TokenKind::HeredocStringLiteral
1005 let name = if ch == '\'' {
1007 let name = self.scan_docstring_name_actual();
1008 if (self.peek_char(0)) == '\'' {
1012 self.with_error(Errors::error0010);
1016 // Starting with PHP 5.3.0, the opening Heredoc identifier
1017 // may optionally be enclosed in double quotes:
1021 let name = self.scan_docstring_name_actual();
1023 // same logic as above, just for double quote
1024 if self.peek_char(0) == '\"' {
1027 self.with_error(Errors::missing_double_quote)
1035 fn scan_docstring_header(&mut self) -> (&'a [u8], TokenKind) {
1036 let ch = self.peek_char(0);
1037 // Skip 3 for <<< or 4 for b<<<
1038 let skip_count = if ch == 'b' { 4 } else { 3 };
1039 self.advance(skip_count);
1040 let (name, kind) = self.scan_docstring_name();
1041 let ch = self.peek_char(0);
1042 if !Self::is_newline(ch) {
1043 self.with_error(Errors::error0011)
1045 self.skip_to_end_of_line();
1046 self.skip_end_of_line();
1050 fn scan_docstring_remainder(&mut self, name: &[u8]) {
1051 let len = name.len();
1053 let ch0 = self.peek_char(len);
1054 let ch1 = self.peek_char(len + 1);
1055 if ((Self::is_newline(ch0)) || ch0 == ';' && (Self::is_newline(ch1)))
1056 && self.peek_string(len as usize) == name
1058 self.advance(len as usize);
1061 self.skip_to_end_of_line();
1062 let ch = self.peek_char(0);
1063 if Self::is_newline(ch) {
1064 self.skip_end_of_line()
1066 // If we got here then we ran off the end of the file without
1067 // finding a newline. Just bail.
1068 self.with_error(Errors::error0011);
1075 fn scan_docstring_literal(&mut self) -> TokenKind {
1076 let (name, kind) = self.scan_docstring_header();
1077 self.scan_docstring_remainder(name);
1081 fn scan_xhp_label(&mut self) {
1083 self.skip_name_end();
1086 fn scan_xhp_element_name(&mut self, attribute: bool) -> TokenKind {
1087 // An XHP element name is a sequence of one or more XHP labels each separated
1088 // by a single : or -. Note that it is possible for an XHP element name to be
1089 // followed immediately by a : or - that is the next token, so if we find
1090 // a : or - not followed by a label, we need to terminate the token.
1091 self.scan_xhp_label();
1092 let ch0 = self.peek_char(0);
1093 let ch1 = self.peek_char(1);
1094 if (!attribute && ch0 == ':' || ch0 == '-') && Self::is_name_nondigit(ch1) {
1096 self.scan_xhp_element_name(false)
1098 TokenKind::XHPElementName
1102 fn scan_xhp_class_no_dash(&mut self) -> TokenKind {
1103 self.scan_xhp_label();
1104 let ch0 = self.peek_char(0);
1105 let ch1 = self.peek_char(1);
1106 if ch0 == ':' && Self::is_name_nondigit(ch1) {
1108 self.scan_xhp_class_no_dash()
1110 TokenKind::XHPElementName
1114 // Is the next token we're going to lex a possible xhp class name?
1115 fn is_xhp_class_name(&self) -> bool {
1116 (self.peek_char(0) == ':') && (Self::is_name_nondigit(self.peek_char(1)))
1119 fn scan_xhp_class_name(&mut self) -> TokenKind {
1120 // An XHP class name is a colon followed by an xhp name.
1121 if self.is_xhp_class_name() {
1123 self.scan_xhp_element_name(false);
1124 TokenKind::XHPClassName
1126 self.with_error(Errors::error0008);
1128 TokenKind::ErrorToken
1132 // To support xhp class style class definitions we don't require a : prefix
1133 fn scan_xhp_modifier_class_name(&mut self) -> TokenKind {
1134 // we don't want to allow xhp names with a : prefix here
1135 if self.peek_char(0) == ':' {
1136 self.with_error(Errors::error0008);
1137 TokenKind::ErrorToken
1139 self.scan_xhp_class_no_dash();
1140 TokenKind::XHPClassName
1144 fn scan_xhp_string_literal(&mut self) -> TokenKind {
1145 // XHP string literals are just straight up "find the closing quote"
1146 // strings. Embedded newlines are legal.
1147 let mut offset: usize = 1;
1149 match self.peek_char(offset) {
1151 self.advance(offset);
1153 self.with_error(Errors::error0012);
1154 return TokenKind::XHPStringLiteral;
1156 self.with_error(Errors::error0006);
1161 self.advance(offset + 1);
1162 return TokenKind::XHPStringLiteral;
1169 // Note that this does not scan an XHP body
1170 fn scan_xhp_token(&mut self) -> TokenKind {
1171 // TODO: HHVM requires that there be no trivia between < and name in an
1172 // opening tag, but does allow trivia between </ and name in a closing tag.
1173 // Consider allowing trivia in an opening tag.
1174 let ch0 = self.peek_char(0);
1175 if ch0 == INVALID && self.at_end() {
1176 TokenKind::EndOfFile
1177 } else if self.is_xhp_class_name() || Self::is_name_nondigit(ch0) {
1178 self.scan_xhp_element_name(false)
1183 TokenKind::LeftBrace
1187 TokenKind::RightBrace
1194 if (self.peek_char(1)) == '/' {
1196 TokenKind::LessThanSlash
1202 '"' => self.scan_xhp_string_literal(),
1204 if (self.peek_char(1)) == '>' {
1206 TokenKind::SlashGreaterThan
1208 self.with_error(Errors::error0006);
1210 TokenKind::ErrorToken
1215 TokenKind::GreaterThan
1218 self.with_error(Errors::error0006);
1220 TokenKind::ErrorToken
1226 fn scan_xhp_comment(&mut self) {
1229 let ch0 = self.peek_char(offset);
1230 let ch1 = self.peek_char(offset + 1);
1231 let ch2 = self.peek_char(offset + 2);
1232 match (ch0, ch1, ch2) {
1233 (INVALID, _, _) => {
1234 self.advance(offset as usize);
1235 return self.with_error(Errors::error0014);
1237 ('-', '-', '>') => return self.advance((offset + 3) as usize),
1242 fn scan_xhp_body(&mut self) -> TokenKind {
1243 // Naively you might think that an XHP body is just a bunch of characters,
1244 // terminated by an embedded { } expression or a tag. However, whitespace
1245 // and newlines are relevant in XHP bodies because they are "soft".
1246 // That is, any section of contiguous trivia has the same semantics as a
1247 // single space or newline -- just as in HTML.
1249 // Obviously this is of relevance to code formatters.
1251 // Therefore we detect whitespace and newlines within XHP bodies and treat
1252 // it as trivia surrounding the tokens within the body.
1254 // TODO: Is this also true of whitespace within XHP comments? If so then
1255 // we need to make XHP comments a sequence of tokens, rather than a
1256 // single token as they are now.
1257 let ch0 = self.peek_char(0);
1260 INVALID if self.at_end() => TokenKind::EndOfFile,
1263 TokenKind::LeftBrace
1267 TokenKind::RightBrace
1270 let ch1 = self.peek_char(1);
1271 let ch2 = self.peek_char(2);
1272 let ch3 = self.peek_char(3);
1273 match (ch1, ch2, ch3) {
1274 ('!', '-', '-') => {
1275 self.scan_xhp_comment();
1276 TokenKind::XHPComment
1280 TokenKind::LessThanSlash
1291 let ch = self.peek_char(offset);
1294 self.advance(offset);
1296 self.with_error(Errors::error0013);
1299 self.with_error(Errors::error0006);
1303 '\t' | ' ' | '\r' | '\n' | '{' | '}' | '<' => {
1304 self.advance(offset);
1315 fn scan_dollar_token(&mut self) -> TokenKind {
1316 // We have a problem here. We wish to be able to lexically analyze both
1317 // PHP and Hack, but the introduction of $$ to Hack makes them incompatible.
1318 // "$$x" and "$$ $x" are legal in PHP, but illegal in Hack.
1319 // The rule in PHP seems to be that $ is a prefix operator, it is a token,
1320 // it can be followed by trivia, but the next token has to be another $
1321 // operator, a variable $x, or a {.
1323 // Here's a reasonable compromise. (TODO: Review this decision.)
1325 // $$x lexes as $ $x
1326 // $$$x lexes as $ $ $x
1329 // $$ followed by anything other than a name or a $ lexes as $$.
1331 // This means that lexing a PHP program which contains "$$ $x" is different
1332 // will fail at parse time, but I'm willing to live with that.
1334 // This means that lexing a Hack program which contains
1335 // "$x |> $$instanceof Foo" produces an error as well.
1337 // If these decisions are unacceptable then we will need to make the lexer
1338 // be aware of whether it is lexing PHP or Hack; thus far we have not had
1339 // to make this distinction.
1341 // We are already at $.
1342 let ch1 = self.peek_char(1);
1345 let ch2 = self.peek_char(2);
1346 if ch2 == '$' || ch2 == '{' || Self::is_name_nondigit(ch2) {
1348 TokenKind::Dollar // $$x or $$$
1351 TokenKind::DollarDollar // $$
1355 if Self::is_name_nondigit(ch1) {
1356 self.scan_variable() // $x
1359 TokenKind::Dollar // $
// Scans one token starting at the current offset and returns its kind,
// advancing the lexer past the token. `in_type` adjusts lexing of `>` so
// that generic type argument lists (`List<List<int>>`, `vec<int>=`) lex as
// separate `>` tokens instead of `>>` / `>=`.
// NOTE(review): this view of the source is non-contiguous — many match
// patterns and advance() calls between the visible lines are elided.
1365 fn scan_token(&mut self, in_type: bool) -> TokenKind {
1366 let ch0 = self.peek_char(0);
1370 TokenKind::LeftBracket
1374 TokenKind::RightBracket
1378 TokenKind::LeftParen
1382 TokenKind::RightParen
1386 TokenKind::LeftBrace
1390 TokenKind::RightBrace
// `.` may start a float literal (`.5`), `...`, or be a plain dot.
1392 '.' => match self.peek_char(1) {
1397 ch if ('0'..='9').contains(&ch) => self.scan_after_decimal_point(),
1399 if (self.peek_char(2)) == '.' {
1401 TokenKind::DotDotDot
1412 '-' => match self.peek_char(1) {
1415 TokenKind::MinusEqual
1419 TokenKind::MinusMinus
1423 TokenKind::MinusGreaterThan
1430 '+' => match self.peek_char(1) {
1433 TokenKind::PlusEqual
1444 '*' => match (self.peek_char(1), self.peek_char(2)) {
1447 TokenKind::StarEqual
1451 TokenKind::StarStarEqual
1466 '!' => match (self.peek_char(1), self.peek_char(2)) {
1469 TokenKind::ExclamationEqualEqual
1473 TokenKind::ExclamationEqual
1477 TokenKind::Exclamation
1480 '$' => self.scan_dollar_token(),
1482 if (self.peek_char(1)) == '=' {
1484 TokenKind::SlashEqual
1491 if (self.peek_char(1)) == '=' {
1493 TokenKind::PercentEqual
// `<` introduces heredocs (`<<<`), shifts, and comparison operators.
1500 match (self.peek_char(1), self.peek_char(2)) {
1501 ('<', '<') => self.scan_docstring_literal(),
1504 TokenKind::LessThanLessThanEqual
1506 // TODO: We lex and parse the spaceship operator.
1507 // TODO: This is not in the spec at present. We should either make it an
1508 // TODO: error, or add it to the specification.
1511 TokenKind::LessThanEqualGreaterThan
1515 TokenKind::LessThanEqual
1519 TokenKind::LessThanLessThan
1528 match (self.peek_char(1), self.peek_char(2)) {
1529 // If we are parsing a generic type argument list then we might be at the >>
1530 // in `List<List<int>>`, or at the >= of `let x:vec<int>=...`. In that case
1531 // we want to lex two >'s instead of >> / one > and one = instead of >=.
1532 (ch, _) if (ch == '>' || ch == '=') && in_type => {
1534 TokenKind::GreaterThan
1538 TokenKind::GreaterThanGreaterThanEqual
1542 TokenKind::GreaterThanGreaterThan
1546 TokenKind::GreaterThanEqual
1550 TokenKind::GreaterThan
1554 '=' => match (self.peek_char(1), self.peek_char(2)) {
1557 TokenKind::EqualEqualEqual
1561 TokenKind::EqualEqualGreaterThan
1565 TokenKind::EqualEqual
1569 TokenKind::EqualGreaterThan
1577 if (self.peek_char(1)) == '=' {
1579 TokenKind::CaratEqual
1585 '|' => match self.peek_char(1) {
1592 TokenKind::BarGreaterThan
1603 '&' => match self.peek_char(1) {
1606 TokenKind::AmpersandEqual
1610 TokenKind::AmpersandAmpersand
1614 TokenKind::Ampersand
1617 '?' => match (self.peek_char(1), self.peek_char(2)) {
// `?:` is the Elvis operator only outside type context (`?` in a type
// annotation followed by `:` must lex separately).
1618 (':', _) if !in_type => {
1620 TokenKind::QuestionColon
1624 TokenKind::QuestionMinusGreaterThan
1628 TokenKind::QuestionQuestionEqual
1632 TokenKind::QuestionQuestion
// `?as` only when not followed by a name character (so `?asdf` is not `?as`).
1634 ('a', 's') if !Self::is_name_nondigit(self.peek_char(3)) => {
1636 TokenKind::QuestionAs
1644 let ch1 = self.peek_char(1);
1648 TokenKind::ColonColon
1656 TokenKind::Semicolon
// Numeric literals: a leading 0 may start hex (0x), binary (0b), or octal.
1666 '0' => match self.peek_char(1) {
1669 self.scan_hex_literal()
1673 self.scan_binary_literal()
1675 _ => self.scan_octal_or_float(),
1677 ch if ('1'..='9').contains(&ch) => self.scan_decimal_or_float(),
1678 '\'' => self.scan_single_quote_string_literal(),
1679 '"' => self.scan_double_quote_like_string_literal_from_start(),
1686 TokenKind::Backslash
// Check for prefixed string literals: a quote or heredoc opener after ch0.
1693 let c1 = self.peek_char(1);
1694 let c2 = self.peek_char(2);
1695 let c3 = self.peek_char(3);
1696 c1 == '"' || c1 == '\'' || (c1 == '<' && c2 == '<' && c3 == '<')
1700 self.scan_token(in_type)
// INVALID is the sentinel returned past the end of the text; only treat it
// as EOF when we really are at the end (an embedded NUL is an error token).
1704 if ch0 == INVALID && self.at_end() {
1705 TokenKind::EndOfFile
1706 } else if Self::is_name_nondigit(ch0) {
1709 self.with_error(Errors::error0006);
1711 TokenKind::ErrorToken
// Scans a token in normal (non-type) context; see `scan_token`.
1717 fn scan_token_outside_type(&mut self) -> TokenKind {
1718 self.scan_token(false)
// Scans a token inside a type annotation context (affects `>` lexing);
// see `scan_token`.
1721 fn scan_token_inside_type(&mut self) -> TokenKind {
1722 self.scan_token(true)
1729 // white-space-character::
1731 // Space character (U+0020)
1732 // Horizontal-tab character (U+0009)
1734 // single-line-comment::
1735 // // input-characters-opt
1736 // # input-characters-opt
1739 // Carriage-return character (U+000D)
1740 // Line-feed character (U+000A)
1741 // Carriage-return character followed by line-feed character
// Given a byte slice with an end-of-line sequence starting at index `i`,
// returns the index just past it: "\r\n" advances 2, a lone "\r" or "\n"
// advances 1. Panics if `s[i]` does not start an end-of-line sequence.
1743 fn str_scan_end_of_line(s: &[u8], i: usize) -> usize {
1744 match s.get(i).map(|x| *x as char) {
1746 Some('\r') => match s.get(i + 1).map(|x| *x as char) {
1747 Some('\n') => 2 + i,
1750 Some('\n') => i + 1,
1751 _ => panic!("str_scan_end_of_line called while not on end of line!"),
// Consumes the end-of-line sequence at the current position and returns an
// end-of-line trivium: "\r\n" has width 2, lone "\r" or "\n" width 1.
// Panics when the current character is not an end-of-line character.
1755 fn scan_end_of_line(&mut self) -> Trivium<TF> {
1756 match self.peek_char(0) {
1758 let w = if self.peek_char(1) == '\n' { 2 } else { 1 };
1760 Trivia::<TF>::make_eol(self.start, w)
1764 Trivia::<TF>::make_eol(self.start, 1)
1766 _ => panic!("scan_end_of_line called while not on end of line!"),
// Scans a single-line comment up to (not including) the end of the line.
// If, after skipping whitespace, at least 11 characters remain and the text
// starts with "FALLTHROUGH", the comment becomes a fallthrough trivium;
// otherwise a plain single-line-comment trivium.
1770 fn scan_single_line_comment(&mut self) -> Trivium<TF> {
1771 // A fallthrough comment is two slashes, any amount of whitespace,
1772 // FALLTHROUGH, and any characters may follow.
1773 // TODO: Consider allowing lowercase fallthrough.
1776 self.skip_whitespace();
// Snapshot the lexer right after the whitespace so the comment text can be
// inspected once we know where the line ends.
1777 let lexer_ws = self.clone();
1778 self.skip_to_end_of_line();
1779 let w = self.width();
1780 let remainder = self.offset - lexer_ws.offset;
1781 if remainder >= 11 && lexer_ws.peek_string(11) == b"FALLTHROUGH" {
1782 Trivia::<TF>::make_fallthrough(self.start, w)
1784 Trivia::<TF>::make_single_line_comment(self.start, w)
// Advances the lexer past the body of a /* ... */ comment, stopping just
// after the closing "*/". If the end of the source is reached first,
// reports error0007 (presumably "unterminated comment" — confirm against
// the syntax_error definitions) and stops at the end.
1788 fn skip_to_end_of_delimited_comment(&mut self) {
1791 let ch0 = self.peek_char(offset);
// NOTE(review): the EOF / embedded-NUL condition lines are elided in this
// view; this branch appears to handle running off the end of the text.
1793 self.advance(offset);
1795 return self.with_error(Errors::error0007);
1797 // TODO: Do we want to give a warning for an embedded zero char
1798 // inside a comment?
1801 } else if ch0 == '*' && (self.peek_char(offset + 1)) == '/' {
1802 return self.advance(offset + 2);
// Scans a /* ... */ comment and classifies it: bodies beginning with
// HH_FIXME or HH_IGNORE_ERROR (after whitespace) become dedicated trivium
// kinds; anything else is an ordinary delimited-comment trivium.
1809 fn scan_delimited_comment(&mut self) -> Trivium<TF> {
1810 // The original lexer lexes a fixme / ignore error as:
1812 // slash star [whitespace]* HH_FIXME [whitespace or newline]* leftbracket
1813 // [whitespace or newline]* integer [any text]* star slash
1815 // Notice that the original lexer oddly enough does not verify that there
1816 // is a right bracket.
1818 // For our purposes we will just check for HH_FIXME / HH_IGNORE_ERROR;
1819 // a later pass can try to parse out the integer if there is one,
1820 // give a warning if there is not, and so on.
1823 self.skip_whitespace();
// Snapshot after leading whitespace so the pragma name can be matched
// against the comment body.
1825 let lexer_ws = self.clone();
1826 self.skip_to_end_of_delimited_comment();
1827 let w = self.width();
1828 if lexer_ws.match_string(b"HH_FIXME") {
1829 Trivia::<TF>::make_fix_me(self.start, w)
1830 } else if lexer_ws.match_string(b"HH_IGNORE_ERROR") {
1831 Trivia::<TF>::make_ignore_error(self.start, w)
1833 Trivia::<TF>::make_delimited_comment(self.start, w)
// Scans one piece of PHP/Hack trivia at the current position — a comment
// (`//`, `/* */`, and apparently `#` per the grammar comment above),
// whitespace, or an end of line — returning None when the next character
// begins a real token.
1837 fn scan_php_trivium(&mut self) -> Option<Trivium<TF>> {
1838 match self.peek_char(0) {
1840 self.start_new_lexeme();
1845 self.start_new_lexeme();
// `/` starts a comment only when followed by another `/` or a `*`.
1846 match self.peek_char(1) {
1847 '/' => Some(self.scan_single_line_comment()),
1848 '*' => Some(self.scan_delimited_comment()),
// Whitespace run: measured directly on the source bytes.
1853 let new_end = Self::str_skip_whitespace(self.source_text_string(), self.offset);
1854 let new_start = self.offset;
1855 let new_trivia = Trivia::<TF>::make_whitespace(new_start, new_end - new_start);
1856 self.with_start_offset(new_start, new_end);
1860 self.start_new_lexeme();
1861 Some(self.scan_end_of_line())
1864 self.start_new_lexeme();
// Scans one piece of XHP trivia (whitespace or end of line) at the current
// position; returns None when the next character is not XHP trivia.
1871 fn scan_xhp_trivium(&mut self) -> Option<Trivium<TF>> {
1872 // TODO: Should XHP comments <!-- --> be their own thing, or a kind of
1873 // trivia associated with a token? Right now they are the former.
1874 let i = self.offset;
1875 let ch = self.peek_char(0);
1878 let j = Self::str_skip_whitespace(self.source_text_string(), i);
1879 self.with_start_offset(i, j);
1880 Some(Trivia::<TF>::make_whitespace(i, j - i))
1883 let j = Self::str_scan_end_of_line(self.source_text_string(), i);
1884 self.with_start_offset(i, j);
1885 Some(Trivia::<TF>::make_eol(i, j - i))
1890 self.start_new_lexeme();
1896 // We divide trivia into "leading" and "trailing" trivia of an associated
1897 // token. This means that we must find a dividing line between the trailing trivia
1898 // following one token and the leading trivia of the following token. Plainly
1899 // we need only find this line while scanning trailing trivia. The heuristics
1901 // * The first newline trivia encountered is the last trailing trivia.
1902 // * The newline which follows a // or # comment is not part of the comment
1903 // but does terminate the trailing trivia.
1904 // * A pragma to turn checks off (HH_FIXME and HH_IGNORE_ERROR) is
1905 // always a leading trivia.
// Repeatedly applies `scanner` (a trivium scanner such as scan_php_trivium)
// and accumulates the results until it returns None; returns the collected
// leading trivia.
1906 fn scan_leading_trivia(
1908 scanner: impl Fn(&mut Self) -> Option<Trivium<TF>>,
1910 let mut acc = self.token_factory.trivia_factory_mut().make();
1911 while let Some(t) = scanner(self) {
// Like `scan_leading_trivia`, but scans within a known width: characters the
// trivium scanner does not recognize are folded into "extra token error"
// trivia (one trivium per contiguous run) instead of ending the scan.
1917 fn scan_leading_trivia_with_width(
1919 scanner: impl Fn(&mut Self) -> Option<Trivium<TF>>,
1922 let mut acc = self.token_factory.trivia_factory_mut().make();
// Width/offset of the current run of unrecognized characters, flushed as a
// single extra-token-error trivium when a real trivium (or the end) is hit.
1923 let mut extra_token_error_width = 0;
1924 let mut extra_token_error_offset = self.offset();
1927 if extra_token_error_width > 0 {
1928 acc.push(Trivia::<TF>::make_extra_token_error(
1929 extra_token_error_offset,
1930 extra_token_error_width,
1935 if let Some(t) = scanner(self) {
// Flush the pending error run before pushing the recognized trivium.
1936 if extra_token_error_width > 0 {
1937 acc.push(Trivia::<TF>::make_extra_token_error(
1938 extra_token_error_offset,
1939 extra_token_error_width,
1941 extra_token_error_width = 0;
1942 extra_token_error_offset = self.start();
// Unrecognized character: extend the current error run by one.
1949 extra_token_error_width += 1;
// Scans up to `width` characters of leading PHP trivia, tolerating
// unrecognized characters (see scan_leading_trivia_with_width).
1954 pub fn scan_leading_php_trivia_with_width(
1957 ) -> <TF::Token as LexableToken>::Trivia {
1958 self.scan_leading_trivia_with_width(&Self::scan_php_trivium, width)
// Scans up to `width` characters of leading XHP trivia, tolerating
// unrecognized characters (see scan_leading_trivia_with_width).
1961 pub fn scan_leading_xhp_trivia_with_width(
1964 ) -> <TF::Token as LexableToken>::Trivia {
1965 self.scan_leading_trivia_with_width(&Self::scan_xhp_trivium, width)
// Scans all leading PHP trivia at the current position.
1968 pub(crate) fn scan_leading_php_trivia(&mut self) -> <TF::Token as LexableToken>::Trivia {
1969 self.scan_leading_trivia(&Self::scan_php_trivium)
// Scans all leading XHP trivia at the current position.
1972 pub(crate) fn scan_leading_xhp_trivia(&mut self) -> <TF::Token as LexableToken>::Trivia {
1973 self.scan_leading_trivia(&Self::scan_xhp_trivium)
// Collects trailing trivia for the token just scanned, per the heuristics
// documented above: the first end-of-line trivium is the last trailing
// trivium, and HH_FIXME / HH_IGNORE_ERROR pragmas are left for the next
// token's leading trivia. Each trivium is scanned on a clone so the main
// lexer only advances (continue_from) when the trivium is accepted.
1976 fn scan_trailing_trivia(
1978 scanner: impl Fn(&mut Self) -> Option<Trivium<TF>>,
1979 ) -> <TF::Token as LexableToken>::Trivia {
1980 let mut acc = self.token_factory.trivia_factory_mut().make();
1982 let mut lexer1 = self.clone();
1983 match scanner(&mut lexer1) {
1985 self.continue_from(lexer1);
1988 Some(t) => match t.kind() {
// End of line terminates trailing trivia (but is included in it).
1989 TriviaKind::EndOfLine => {
1990 self.continue_from(lexer1);
// Pragmas always lead the next token: do not advance past them here.
1994 TriviaKind::FixMe | TriviaKind::IgnoreError => {
1998 self.continue_from(lexer1);
// Scans trailing PHP trivia (see scan_trailing_trivia for the heuristics).
2006 pub fn scan_trailing_php_trivia(&mut self) -> <TF::Token as LexableToken>::Trivia {
2007 self.scan_trailing_trivia(&Self::scan_php_trivium)
// Scans trailing XHP trivia (see scan_trailing_trivia for the heuristics).
2010 pub fn scan_trailing_xhp_trivia(&mut self) -> <TF::Token as LexableToken>::Trivia {
2011 self.scan_trailing_trivia(&Self::scan_xhp_trivium)
// Returns true when the next non-trivia character can start a name.
// Works on a clone, so the lexer itself is not advanced.
2014 pub fn is_next_name(&self) -> bool {
2015 let mut lexer = self.clone();
2016 lexer.scan_leading_php_trivia();
2017 Self::is_name_nondigit(lexer.peek_char(0))
// Returns true when the next non-trivia text looks like an XHP class name.
// Works on a clone, so the lexer itself is not advanced.
2020 pub fn is_next_xhp_class_name(&self) -> bool {
2021 let mut lexer = self.clone();
2022 lexer.scan_leading_php_trivia();
2023 lexer.is_xhp_class_name()
// Macro invocation (body elided in this view): generates
// `as_case_insensitive_keyword`, used by `as_keyword` below. Based on its
// use there, it appears to return Some((canonical lowercase text, whether
// the original contained uppercase)) for recognized keywords — confirm
// against the macro definition.
2026 as_case_insensitive_keyword!(
// If `kind` is Name, tries to reinterpret the current lexeme as a keyword
// (restricted to reserved keywords when `only_reserved` is true). A keyword
// spelled with uppercase letters — other than true/false/null — produces an
// uppercase-keyword error but still lexes as the keyword. Non-keywords stay
// TokenKind::Name; non-Name kinds pass through (handling elided here).
2106 fn as_keyword(&mut self, only_reserved: bool, kind: TokenKind) -> TokenKind {
2107 if kind == TokenKind::Name {
2108 let original_text = self.current_text_as_str();
// Normalize case-insensitive keyword spellings to their canonical form.
2109 let (text, has_upper) = self
2110 .as_case_insensitive_keyword(original_text)
2111 .unwrap_or((original_text, false));
2112 match TokenKind::from_string(text.as_bytes(), only_reserved) {
2114 if has_upper && text != "true" && text != "false" && text != "null" {
2115 let err = Errors::uppercase_kw(original_text);
2116 self.with_error(err);
2120 _ => TokenKind::Name,
// Scans leading trivia, then one token via `scanner`, applying keyword
// reinterpretation per `as_name`. Returns (kind, token width, leading
// trivia); trailing trivia is the caller's responsibility.
2127 fn scan_token_and_leading_trivia(
2129 scanner: impl Fn(&mut Self) -> TokenKind,
2131 ) -> (TokenKind, usize, <TF::Token as LexableToken>::Trivia) {
2132 // Get past the leading trivia
2133 let leading = self.scan_leading_php_trivia();
2134 // Remember where we were when we started this token
2135 self.start_new_lexeme();
2136 let kind = scanner(self);
2137 let kind = match as_name {
2138 KwSet::AllKeywords => kind,
2139 KwSet::NonReservedKeywords => self.as_keyword(true, kind),
2140 KwSet::NoKeywords => self.as_keyword(false, kind),
2142 let w = self.width();
// Scans leading trivia, one token, and trailing trivia, and builds the
// token. A double-quoted string literal head gets no trailing trivia: the
// string body follows immediately.
2146 fn scan_token_and_trivia(
2148 scanner: &impl Fn(&mut Self) -> TokenKind,
2151 let token_start = self.offset;
2153 let (kind, w, leading) = self.scan_token_and_leading_trivia(scanner, as_name);
2154 let trailing = match kind {
2155 TokenKind::DoubleQuotedStringLiteralHead => {
2156 self.token_factory.trivia_factory_mut().make()
2158 _ => self.scan_trailing_php_trivia(),
2161 .make(kind, token_start, w, leading, trailing)
// Runs `tokenizer` and asserts the lexer made progress: either some input
// was consumed, or we were already at the end and legitimately produced
// EndOfFile. Otherwise the panic below fires, which guards the parser
// against infinite loops on a stuck lexer.
2164 fn scan_assert_progress(&mut self, tokenizer: impl Fn(&mut Self) -> TF::Token) -> TF::Token {
2165 let original_remaining = self.remaining();
2166 let token = tokenizer(self);
2167 let new_remaining = self.remaining();
2168 if new_remaining < original_remaining
2169 || original_remaining == 0
2170 && new_remaining == 0
2171 && (token.kind()) == TokenKind::EndOfFile
2176 "failed to make progress at {} {} {} {:?}\n",
// (scan_next_token — the `fn` signature line is elided in this view.)
// Scans the next token plus its trivia, routed through scan_assert_progress
// to guarantee the lexer advances.
2187 scanner: impl Fn(&mut Self) -> TokenKind,
2190 let tokenizer = |x: &mut Self| x.scan_token_and_trivia(&scanner, as_name);
2191 self.scan_assert_progress(&tokenizer)
// Scans the next token, keeping all keywords lexed as names.
2194 fn scan_next_token_as_name(&mut self, scanner: impl Fn(&mut Self) -> TokenKind) -> TF::Token {
2195 self.scan_next_token(scanner, KwSet::AllKeywords)
// Scans the next token, reinterpreting names as keywords where possible.
2198 fn scan_next_token_as_keyword(
2200 scanner: impl Fn(&mut Self) -> TokenKind,
2202 self.scan_next_token(scanner, KwSet::NoKeywords)
// Scans the next token, reinterpreting only reserved keywords; non-reserved
// keywords stay lexed as names.
2205 fn scan_next_token_nonreserved_as_name(
2207 scanner: impl Fn(&mut Self) -> TokenKind,
2209 self.scan_next_token(scanner, KwSet::NonReservedKeywords)
// Scans the next token using the in-type or normal scanner; the branch on
// the lexer's `in_type` flag is elided in this view (presumably
// `if self.in_type` — it is the state the snapshots in HEAD record).
2212 fn next_token_impl(&mut self) -> TF::Token {
2214 self.scan_next_token_as_keyword(&Self::scan_token_inside_type)
2216 self.scan_next_token_as_keyword(&Self::scan_token_outside_type)
// One-token lookahead without consuming input. On a cache hit (the cached
// pre-snapshot equals the current lexer state) returns the cached token.
// Otherwise scans the next token on a clone and stores (pre-snapshot,
// token, post-snapshot) in the shared Rc cache (see the module comment in
// HEAD for the design and its error-monotonicity invariant).
2221 pub fn peek_next_token(&self) -> TF::Token {
2223 let cache = self.cache.borrow();
2224 if let Some(cache) = cache.as_ref() {
2225 if cache.0 == *self {
2226 return cache.1.clone();
2231 let mut lexer = self.clone();
// Clear errors on the clone so the post-snapshot captures only errors
// added while scanning this one token.
2232 lexer.errors = vec![];
2233 let before = lexer.to_lexer_pre_snapshot();
2234 let token = lexer.next_token_impl();
2235 let after = lexer.into_lexer_post_snapshot();
2237 .replace(Some(LexerCache(before, token.clone(), after)));
// Consumes the next token. On a cache hit (the pre-snapshot recorded by
// peek_next_token equals the current state) restores start/offset/in_type
// from the post-snapshot, appends the errors recorded during the cached
// scan, and returns the cached token; otherwise scans normally.
2241 pub fn next_token(&mut self) -> TF::Token {
2243 let mut cache = self.cache.borrow_mut();
2244 if let Some(ref mut cache) = cache.deref_mut() {
2245 if cache.0 == *self {
2246 self.start = (cache.2).start;
2247 self.offset = (cache.2).offset;
2248 self.in_type = (cache.2).in_type;
2249 if !(cache.2).errors.is_empty() {
2250 self.errors.append(&mut (cache.2).errors.clone());
2252 return cache.1.clone();
2256 self.next_token_impl()
// Scans the next token with leading trivia but an empty trailing trivia
// list, still asserting forward progress.
2259 pub fn next_token_no_trailing(&mut self) -> TF::Token {
2260 let tokenizer = |x: &mut Self| {
2261 let token_start = x.offset;
2262 let (kind, w, leading) =
2263 x.scan_token_and_leading_trivia(&Self::scan_token_outside_type, KwSet::NoKeywords);
2264 let trailing = x.token_factory.trivia_factory_mut().make();
2266 .make(kind, token_start, w, leading, trailing)
2268 self.scan_assert_progress(&tokenizer)
// Scans the next token while positioned inside a string literal. Leading
// trivia is never scanned; trailing trivia only once the string has ended
// (a double-quoted or heredoc tail token).
2271 pub fn next_token_in_string(&mut self, literal_kind: &StringLiteralKind) -> TF::Token {
2272 let token_start = self.offset;
2273 self.start_new_lexeme();
2274 // We're inside a string. Do not scan leading trivia.
2275 let kind = self.scan_string_literal_in_progress(literal_kind);
2276 let w = self.width();
2277 // Only scan trailing trivia if we've finished the string.
2278 let trailing = match kind {
2279 TokenKind::DoubleQuotedStringLiteralTail | TokenKind::HeredocStringLiteralTail => {
2280 self.scan_trailing_php_trivia()
2282 _ => self.token_factory.trivia_factory_mut().make(),
2284 let leading = self.token_factory.trivia_factory_mut().make();
2286 .make(kind, token_start, w, leading, trailing)
// Scans the `<<<NAME` header of a heredoc string literal. Returns the
// HeredocStringLiteralHead token (leading trivia, no trailing trivia) and
// the heredoc name bytes, which the caller needs to find the closing label.
2289 pub fn next_docstring_header(&mut self) -> (TF::Token, &'a [u8]) {
2290 // We're at the beginning of a heredoc string literal. Scan leading
2291 // trivia but not trailing trivia.
2292 let token_start = self.offset;
2293 let leading = self.scan_leading_php_trivia();
2294 self.start_new_lexeme();
2295 let (name, _) = self.scan_docstring_header();
2296 let w = self.width();
2297 let trailing = self.token_factory.trivia_factory_mut().make();
2298 let token = self.token_factory.make(
2299 TokenKind::HeredocStringLiteralHead,
// Scans the next token with all keywords lexed as names.
2308 pub fn next_token_as_name(&mut self) -> TF::Token {
2309 self.scan_next_token_as_name(&Self::scan_token_outside_type)
// Scans the next token with non-reserved keywords lexed as names.
2312 pub fn next_token_non_reserved_as_name(&mut self) -> TF::Token {
2313 self.scan_next_token_nonreserved_as_name(&Self::scan_token_outside_type)
// Scans the next token inside an XHP element and returns it together with
// its source text (token width only, excluding trailing trivia). With
// `no_trailing`, the `>` / `/>` that closes an XHP open tag gets no
// trailing trivia so whitespace/newlines lead the body token instead.
2316 pub fn next_xhp_element_token(&mut self, no_trailing: bool) -> (TF::Token, &[u8]) {
2317 // XHP elements have whitespace, newlines and Hack comments.
2318 let tokenizer = |lexer: &mut Self| {
2319 let token_start = lexer.offset;
2320 let (kind, w, leading) =
2321 lexer.scan_token_and_leading_trivia(&Self::scan_xhp_token, KwSet::AllKeywords);
2322 // We do not scan trivia after an XHPOpen's >. If that is the beginning of
2323 // an XHP body then we want any whitespace or newlines to be leading trivia
2324 // of the body token.
2326 TokenKind::GreaterThan | TokenKind::SlashGreaterThan if no_trailing => {
2327 let trailing = lexer.token_factory.trivia_factory_mut().make()
2330 .make(kind, token_start, w, leading, trailing)
2333 let trailing = lexer.scan_trailing_php_trivia();
2336 .make(kind, token_start, w, leading, trailing)
2340 let token = self.scan_assert_progress(&tokenizer);
2341 let token_width = token.width();
// Recover the token's own text: back up over the trailing trivia.
2342 let trailing_width = token.trailing_width();
2343 let token_start_offset = (self.offset) - trailing_width - token_width;
2344 let token_text = self.source.sub(token_start_offset, token_width);
// Scans the next token inside an XHP body. Leading trivia is XHP trivia;
// trailing trivia is attached only to XHPBody tokens (see comment below).
2348 pub fn next_xhp_body_token(&mut self) -> TF::Token {
2349 let scanner = |lexer: &mut Self| {
2350 let token_start = lexer.offset;
2351 let leading = lexer.scan_leading_xhp_trivia();
2352 lexer.start_new_lexeme();
2353 let kind = lexer.scan_xhp_body();
2354 let w = lexer.width();
2356 // Trivia (leading and trailing) is semantically
2357 // significant for XHPBody tokens. When we find elements or
2358 // braced expressions inside the body, the trivia should be
2359 // seen as leading the next token, but we should certainly
2360 // keep it trailing if this is an XHPBody token.
2361 if kind == TokenKind::XHPBody {
2362 lexer.scan_trailing_xhp_trivia()
2364 lexer.token_factory.trivia_factory_mut().make()
2368 .make(kind, token_start, w, leading, trailing)
2370 self.scan_assert_progress(&scanner)
2374 // When the xhp modifier is used for declaring xhp classes
2375 // we do not allow colon prefixes or dashes.
2377 // This ensures that the syntax is closer to regular classes.
// Scans an XHP class name declared with the `xhp` modifier (no colon
// prefixes or dashes — see the comment above).
2379 pub fn next_xhp_modifier_class_name(&mut self) -> TF::Token {
2380 self.scan_token_and_trivia(&Self::scan_xhp_modifier_class_name, KwSet::NoKeywords)
// Scans an XHP class name token (with trivia, no keyword reinterpretation).
2383 pub fn next_xhp_class_name(&mut self) -> TF::Token {
2384 self.scan_token_and_trivia(&Self::scan_xhp_class_name, KwSet::NoKeywords)
// Scans an XHP element name token (with trivia, no keyword
// reinterpretation).
2387 pub fn next_xhp_name(&mut self) -> TF::Token {
2388 let scanner = |x: &mut Self| x.scan_xhp_element_name(false);
2389 self.scan_token_and_trivia(&scanner, KwSet::NoKeywords)
// Builds a Hashbang token spanning the rest of the current line, with no
// leading trivia and trailing PHP trivia scanned after the line.
2392 fn make_hashbang_token(&mut self) -> TF::Token {
2393 let leading = self.token_factory.trivia_factory_mut().make();
2394 self.skip_to_end_of_line();
2395 let token_start = self.start;
2396 let token_width = self.width();
2397 let trailing = self.scan_trailing_php_trivia();
2398 self.start_new_lexeme();
2399 self.token_factory.make(
2400 TokenKind::Hashbang,
// (make_long_tag — the `fn` line is elided in this view.) Builds the Name
// token for the language tag that follows "<?" in a markup header (e.g.
// "hh"). Trailing trivia is attached to the language token because
// single-line comments after the language determine the file check mode.
2410 name_token_offset: usize,
2412 less_than_question_token: TF::Token,
2413 ) -> (TF::Token, Option<TF::Token>) {
2416 // single line comments that follow the language in leading markup_text
2417 // determine the file check mode, read the trailing trivia and attach it
2418 // to the language token
2419 let trailing = self.scan_trailing_php_trivia();
2420 let leading = self.token_factory.trivia_factory_mut().make();
2423 .make(TokenKind::Name, name_token_offset, size, leading, trailing);
2424 (less_than_question_token, Some(name))
// Builds the markup suffix tokens: a LessThanQuestion token for "<?", plus
// Some(Name token) when the following language tag is "hh"
// (case-insensitive), or None otherwise.
2427 fn make_markup_suffix(&mut self) -> (TF::Token, Option<TF::Token>) {
2428 let leading = self.token_factory.trivia_factory_mut().make();
2429 let trailing = self.token_factory.trivia_factory_mut().make();
2430 let less_than_question_token = self.token_factory.make(
2431 TokenKind::LessThanQuestion,
2439 let name_token_offset = self.offset;
2440 let ch0 = self.peek_char(0).to_ascii_lowercase();
2441 let ch1 = self.peek_char(1).to_ascii_lowercase();
2443 ('h', 'h') => self.make_long_tag(name_token_offset, 2, less_than_question_token),
2444 _ => (less_than_question_token, (None)),
// Lexes the optional file header at offset 0: an optional "#!" hashbang
// line, then an optional "<?" markup suffix, each preceded by skippable
// whitespace/newlines. Returns (hashbang token?, markup suffix tokens?).
// Panics when not called at the start of the document.
2448 fn skip_to_end_of_header(
2450 ) -> (Option<TF::Token>, Option<(TF::Token, Option<TF::Token>)>) {
2451 let start_offset = {
2452 // if leading section starts with #! - it should span the entire line
2453 if self.offset != 0 {
2454 panic!("Should only try to lex header at start of document")
2456 // this should really just be `self.offset` - but, skip whitespace as the FFP
2457 // tests use magic comments in leading markup to set flags, but blank
2458 // them out before parsing; the newlines are kept to provide correct line
2459 // numbers in errors
2460 self.skip_while_to_offset(&|x| Self::is_newline(x) || Self::is_whitespace_no_newline(x))
2462 let hashbang = if self.peek_def(start_offset, INVALID) == '#'
2463 && self.peek_def(start_offset + 1, INVALID) == '!'
2465 self.with_offset(start_offset);
2466 Some(self.make_hashbang_token())
// Skip whitespace again after the (optional) hashbang line, then look
// for the "<?" markup opener.
2471 let start_offset = self
2472 .skip_while_to_offset(&|x| Self::is_newline(x) || Self::is_whitespace_no_newline(x));
2473 let suffix = if self.peek_def(start_offset, INVALID) == '<'
2474 && self.peek_def(start_offset + 1, INVALID) == '?'
2476 self.with_offset(start_offset);
2477 Some(self.make_markup_suffix())
// Public entry point for lexing the file header (hashbang + markup suffix);
// see skip_to_end_of_header.
2485 pub fn scan_header(&mut self) -> (Option<TF::Token>, Option<(TF::Token, Option<TF::Token>)>) {
2486 self.start_new_lexeme();
2487 self.skip_to_end_of_header()
// Returns true when the next non-trivia text starts an XHP category name
// (`%` followed by a name character). Works on a clone; the lexer itself
// is not advanced.
2490 pub fn is_next_xhp_category_name(&self) -> bool {
2491 let mut lexer = self.clone();
2492 let _ = lexer.scan_leading_php_trivia();
2493 // An XHP category is an xhp element name preceded by a %.
2494 let ch0 = lexer.peek_char(0);
2495 let ch1 = lexer.peek_char(1);
2496 ch0 == '%' && Self::is_name_nondigit(ch1)
// Scans an XHP category name (`%` plus an XHP element name); falls back to
// normal token scanning when the lookahead does not match.
2499 fn scan_xhp_category_name(&mut self) -> TokenKind {
2500 if self.is_next_xhp_category_name() {
2502 let _ = self.scan_xhp_element_name(false);
2503 TokenKind::XHPCategoryName
2505 self.scan_token(false)
// Scans an XHP category name token with its surrounding trivia.
2509 pub fn next_xhp_category_name(&mut self) -> TF::Token {
2510 self.scan_token_and_trivia(&Self::scan_xhp_category_name, KwSet::NoKeywords)