1 // Copyright (c) 2019, Facebook, Inc.
2 // All rights reserved.
4 // This source code is licensed under the MIT license found in the
5 // LICENSE file in the "hack" directory of this source tree.
7 use crate::lexable_token::LexableToken;
8 use crate::lexable_trivia::LexableTrivia;
9 use crate::source_text::{SourceText, INVALID};
10 use crate::syntax_error::{self as Errors, Error, SyntaxError};
11 use crate::token_kind::TokenKind;
12 use crate::trivia_kind::TriviaKind;
14 use std::marker::PhantomData;
// NOTE(review): this excerpt is extraction-damaged — several field/variant
// lines and the closing braces of these declarations are missing. Comments
// describe only what is visible.

// The lexer walks a SourceText, tracking the current lexeme as a
// [start, offset) window and accumulating SyntaxErrors as it scans.
16 #[derive(Debug, Clone)]
17 pub struct Lexer<'a, Token: LexableToken> {
// Backing source buffer being lexed.
18 source: &'a SourceText<'a>,
// Errors discovered so far; appended by with_error.
21 errors: Vec<SyntaxError>,
// Enables experimental-mode-only constructs (e.g. the ':@' atom path
// referenced later in scan_token).
22 is_experimental_mode: bool,
// Token is only used through associated items (Token::Trivia), so it is
// carried via PhantomData rather than stored.
24 _phantom: PhantomData<Token>,

// Flavor of string literal being resumed by scan_string_literal_in_progress;
// heredocs carry the terminator name bytes.
27 #[derive(Debug, PartialEq)]
28 pub enum StringLiteralKind {
30 LiteralHeredoc { heredoc: Vec<u8> },

// NOTE(review): the item this derive attaches to is not visible in this excerpt.
33 #[derive(Debug, Copy, Clone)]
40 impl<'a, Token: LexableToken> Lexer<'a, Token> {
// Constructor parameter lists; the constructor bodies are largely missing
// from this excerpt.
42 source: &'a SourceText<'a>,
43 is_experimental_mode: bool,
53 _phantom: PhantomData,
58 source: &'a SourceText<'a>,
59 is_experimental_mode: bool,

// Adopts another lexer's position and error list — used by the speculative
// scanning pattern where a clone is advanced and then committed
// (see scan_octal_or_float / scan_decimal_or_float).
68 fn continue_from(&mut self, l: Lexer<Token>) {
70 self.offset = l.offset;
71 self.errors = l.errors

// Accessors for the lexeme window and accumulated errors (bodies missing
// here, but presumably trivial field reads — confirm).
74 pub fn start(&self) -> usize {
78 pub fn offset(&self) -> usize {
82 pub fn errors(&self) -> &[SyntaxError] {

// Records a syntax error spanning the current lexeme [start, offset).
86 fn with_error(&mut self, error: Error) {
87 let error = SyntaxError::make(self.start(), self.offset(), error);
88 self.errors.push(error)

// Position mutators (bodies missing from this excerpt).
91 fn with_offset(&mut self, offset: usize) {
95 fn with_start_offset(&mut self, start: usize, offset: usize) {

// Begins a new lexeme at the current cursor position.
100 fn start_new_lexeme(&mut self) {
101 self.start = self.offset
104 pub fn advance(&mut self, i: usize) {
105 self.offset = self.offset + i
// True when experimental-mode-only syntax should be accepted.
108 fn is_experimental_mode(&self) -> bool {
109 self.is_experimental_mode
// Toggles type-context lexing; in_type affects how '>' / '>>' / '>=' are
// split in scan_token when inside generic type argument lists.
112 pub fn set_in_type(&mut self, in_type: bool) {
113 self.in_type = in_type
// Source accessors (bodies missing from this excerpt).
116 pub fn source(&self) -> &SourceText<'a> {
120 fn source_text_string(&self) -> &[u8] {
// Character `index` positions past the current offset; presumably returns
// INVALID past end-of-file — confirm against SourceText::get.
126 pub fn peek_char(&self, index: usize) -> char {
127 self.source.get(self.offset() + index)
130 fn peek_string(&self, size: usize) -> &[u8] {
131 &self.source.sub(self.offset, size)
// True when the upcoming source bytes equal `s`.
134 fn match_string(&self, s: &[u8]) -> bool {
135 s == self.peek_string(s.len())
// Width of the current lexeme window.
138 fn width(&self) -> usize {
139 self.offset - self.start
// Bytes of the current lexeme.
142 fn current_text(&self) -> &[u8] {
143 self.source.sub(self.start, self.width())
// SAFETY-sensitive: assumes the lexeme bytes are valid UTF-8; no check is
// performed here — confirm callers only use this on ASCII-verified lexemes.
146 fn current_text_as_str(&self) -> &str {
147 unsafe { std::str::from_utf8_unchecked(self.current_text()) }
// True once the cursor has reached (or passed) end of source.
150 fn at_end(&self) -> bool {
151 self.offset() >= self.source.length()
// Remaining characters; computed in isize first, presumably clamped to zero
// on the missing lines — confirm.
154 fn remaining(&self) -> usize {
155 let r = (self.source.length() as isize) - (self.offset as isize);
// Absolute-index peeks (bodies partly missing from this excerpt).
163 fn peek(&self, i: usize) -> char {
167 fn peek_back(&self, index: usize) -> char {
168 self.source.get(self.offset() - index)
171 fn peek_def(&self, index: usize, default: char) -> char {
172 if index >= self.source.length() {
175 self.source.get(index)

179 // Character classification
// NOTE(review): the bodies of the next three classifiers are missing from
// this excerpt.
181 fn is_whitespace_no_newline(c: char) -> bool {
188 fn is_newline(ch: char) -> bool {
195 fn is_binary_digit(ch: char) -> bool {
/// True for ASCII octal digits `'0'..='7'`.
fn is_octal_digit(c: char) -> bool {
    // Range-contains reads more directly than the chained comparisons and
    // restores the closing brace dropped from the excerpt.
    ('0'..='7').contains(&c)
}
/// True for ASCII decimal digits `'0'..='9'`.
fn is_decimal_digit(ch: char) -> bool {
    // Equivalent to `'0' <= ch && ch <= '9'`; the std predicate states intent.
    ch.is_ascii_digit()
}
/// True for ASCII hexadecimal digits `0-9`, `a-f`, `A-F`.
fn is_hexadecimal_digit(c: char) -> bool {
    // Exactly the three ranges the original spelled out by hand.
    c.is_ascii_hexdigit()
}
/// True for characters that may begin a name: underscore, ASCII letters,
/// and any character at or above `\x7f` (presumably to admit non-ASCII
/// identifier bytes — confirm against the Hack lexical grammar).
fn is_name_nondigit(c: char) -> bool {
    c == '_' || c.is_ascii_alphabetic() || c >= '\x7f'
}
// True for characters valid *inside* a name: name-nondigits plus decimal
// digits. NOTE(review): the first operand of the disjunction (line 219,
// presumably `Self::is_name_nondigit(c)`) is missing from this excerpt.
218 fn is_name_letter(c: char) -> bool {
220 || ('0' <= c && c <= '9')
221 || ('a' <= c && c <= 'z')
222 || ('A' <= c && c <= 'Z')

// Returns the first offset at or after the cursor whose character fails `p`;
// does not mutate the lexer.
228 fn skip_while_to_offset(&self, p: &Fn(char) -> bool) -> usize {
229 let n = self.source.length();
230 let mut i = self.offset();
231 while i < n && p(self.peek(i)) {

237 // advance offset as long as the predicate is true
238 fn skip_while(&mut self, p: &Fn(char) -> bool) {
239 self.with_offset(self.skip_while_to_offset(p))

// Static variant of skip_while over an arbitrary byte slice.
242 fn str_skip_while(s: &[u8], mut i: usize, p: &Fn(char) -> bool) -> usize {
245 if i < n && p(s[i] as char) {

// Skips spaces/tabs but not newlines.
253 fn skip_whitespace(&mut self) {
254 self.skip_while(&Self::is_whitespace_no_newline);
257 fn str_skip_whitespace(s: &[u8], i: usize) -> usize {
258 Self::str_skip_while(s, i, &Self::is_whitespace_no_newline)

261 fn not_newline(ch: char) -> bool {
262 !(Self::is_newline(ch))
// Consumes everything up to (not including) the next newline.
265 fn skip_to_end_of_line(&mut self) {
266 self.skip_while(&Self::not_newline)

// Like skip_to_end_of_line, but also stops before a '?>' close tag.
269 fn skip_to_end_of_line_or_end_tag(&mut self) {
270 let n = self.source.length();
271 let peek_def = |i| if i < n { self.peek(i) } else { INVALID };
273 let should_stop = |i| {
275 let ch = self.peek(i);
276 Self::is_newline(ch) || (ch == '?' && peek_def(i + 1) == '>')
279 let mut i = self.offset();
280 while !(should_stop(i)) {

// Consumes the tail of a name (letters/digits/underscore).
286 fn skip_name_end(&mut self) {
287 self.skip_while(&Self::is_name_letter)

// Consumes one line terminator: '\n', or '\r' optionally followed by '\n'.
290 fn skip_end_of_line(&mut self) {
291 match self.peek_char(0) {
292 '\n' => self.advance(1),
294 if self.peek_char(1) == '\n' {
// Consumes one name; precondition (asserted) is that the cursor sits on a
// name-nondigit character.
304 fn scan_name_impl(&mut self) {
305 assert!(Self::is_name_nondigit(self.peek_char(0)));
307 self.skip_name_end();

// Scans a name token. NOTE(review): the returned TokenKind line is missing
// from this excerpt.
310 fn scan_name(&mut self) -> TokenKind {
311 self.scan_name_impl();

// Scans a '$'-prefixed variable token; asserts the leading '$'.
315 fn scan_variable(&mut self) -> TokenKind {
316 assert_eq!('$', self.peek_char(0));
318 self.scan_name_impl();
322 fn scan_with_underscores(&mut self, accepted_char: &Fn(char) -> bool) {
323 let n = self.source.length();
324 let peek_def = |i| if i < n { self.peek(i) } else { INVALID };
325 let mut i = self.offset();
327 let ch = self.peek(i);
328 if accepted_char(ch) {
330 } else if ch == ' ' && accepted_char(peek_def(i + 1)) {
// Thin wrappers pairing each digit class with plain or underscore-separated
// scanning.
339 fn scan_decimal_digits(&mut self) {
340 self.skip_while(&Self::is_decimal_digit)
343 fn scan_decimal_digits_with_underscores(&mut self) {
344 self.scan_with_underscores(&Self::is_decimal_digit);
347 fn scan_octal_digits(&mut self) {
348 self.skip_while(&Self::is_octal_digit)
351 fn scan_octal_digits_with_underscores(&mut self) {
352 self.scan_with_underscores(&Self::is_octal_digit)
355 fn scan_binary_digits_with_underscores(&mut self) {
356 self.scan_with_underscores(&Self::is_binary_digit)
359 fn scan_hexadecimal_digits(&mut self) {
360 self.skip_while(&Self::is_hexadecimal_digit)
363 fn scan_hexadecimal_digits_with_underscores(&mut self) {
364 self.scan_with_underscores(&Self::is_hexadecimal_digit)
367 fn scan_hex_literal(&mut self) -> TokenKind {
368 let ch = self.peek_char(0);
369 if !Self::is_hexadecimal_digit(ch) {
370 self.with_error(Errors::error0001);
371 TokenKind::HexadecimalLiteral
373 self.scan_hexadecimal_digits_with_underscores();
374 TokenKind::HexadecimalLiteral
378 fn scan_binary_literal(&mut self) -> TokenKind {
379 let ch = self.peek_char(0);
380 if !Self::is_binary_digit(ch) {
381 self.with_error(Errors::error0002);
382 TokenKind::BinaryLiteral
384 self.scan_binary_digits_with_underscores();
385 TokenKind::BinaryLiteral
// Scans the exponent part of a float ('e'/'E', optional sign, digits).
// NOTE(review): the advance-past-sign lines (392-395) are missing from this
// excerpt.
389 fn scan_exponent(&mut self) -> TokenKind {
390 let ch = self.peek_char(1);
391 if ch == '+' || ch == '-' {
396 let ch = self.peek_char(0);
397 if !Self::is_decimal_digit(ch) {
398 self.with_error(Errors::error0003);
399 TokenKind::FloatingLiteral
401 self.scan_decimal_digits();
402 TokenKind::FloatingLiteral

// Scans the fraction (and optional exponent) after a '.' has been seen.
406 fn scan_after_decimal_point(&mut self) -> TokenKind {
408 self.scan_decimal_digits();
409 let ch = self.peek_char(0);
410 if ch == 'e' || ch == 'E' {
413 TokenKind::FloatingLiteral

417 fn scan_octal_or_float(&mut self) -> TokenKind {
418 // We've scanned a leading zero.
419 // We have an irritating ambiguity here. 09 is not a legal octal or
420 // floating literal, but 09e1 and 09.1 are.
422 let ch = self.peek_char(0);
427 self.scan_after_decimal_point()
434 _ if '0' <= ch && ch <= '9' => {
// Speculatively scan with two cloned lexers: one taking only octal digits,
// one taking all decimal digits, then compare how far each got.
436 let mut lexer_oct = self.clone();
437 lexer_oct.scan_octal_digits();
439 let mut lexer_dec = self.clone();
440 lexer_dec.scan_decimal_digits();
441 if (lexer_oct.width()) == (lexer_dec.width()) {
442 // Only octal digits. Could be an octal literal, or could
444 let ch = lexer_oct.peek_char(0);
445 if ch == 'e' || ch == 'E' {
446 self.continue_from(lexer_oct);
448 } else if ch == '.' {
449 self.continue_from(lexer_oct);
450 self.scan_after_decimal_point()
452 // This is irritating - we only want to allow underscores for integer
453 // literals. Deferring the lexing with underscores here allows us to
454 // make sure we're not dealing with floats.
455 self.continue_from(lexer_oct);
456 self.scan_octal_digits_with_underscores();
457 TokenKind::OctalLiteral
460 // We had decimal digits following a leading zero; this is either a
461 // float literal or an octal to be truncated at the first non-octal
463 let ch = lexer_dec.peek_char(0);
464 if ch == 'e' || ch == 'E' {
465 self.continue_from(lexer_dec);
467 } else if ch == '.' {
468 self.continue_from(lexer_dec);
469 self.scan_after_decimal_point()
471 // an octal to be truncated at the first non-octal digit
472 // Again we defer the lexing with underscores here
473 self.scan_decimal_digits_with_underscores();
474 TokenKind::OctalLiteral
479 // 0 is a decimal literal
481 TokenKind::DecimalLiteral

// Scans a number starting with a non-zero digit; same clone-and-commit
// technique as scan_octal_or_float.
486 fn scan_decimal_or_float(&mut self) -> TokenKind {
487 // We've scanned a leading non-zero digit.
488 let mut lexer_no_underscores = self.clone();
489 lexer_no_underscores.scan_decimal_digits();
490 let mut lexer_with_underscores = self.clone();
491 lexer_with_underscores.scan_decimal_digits_with_underscores();
492 let ch = lexer_no_underscores.peek_char(0);
497 self.continue_from(lexer_no_underscores);
498 self.scan_after_decimal_point()
503 self.continue_from(lexer_no_underscores);
509 self.continue_from(lexer_with_underscores);
510 TokenKind::DecimalLiteral
// Scans a complete single-quoted string literal; the loop (partially
// missing here) walks until the closing quote, recording at most one
// occurrence of each error kind.
515 fn scan_single_quote_string_literal(&mut self) -> TokenKind {
516 // TODO: What about newlines embedded?
518 // single-quoted-string-literal::
519 // b-opt ' sq-char-sequence-opt '
521 // TODO: What is this b-opt? We don't lex an optional 'b' before a literal.
523 // sq-char-sequence::
525 // sq-char-sequence sq-char
528 // sq-escape-sequence
529 // \opt any character except single-quote (') or backslash (\)
531 // sq-escape-sequence:: one of
533 let n = self.source.length();
534 let peek = |x| self.source.get(x);
// Error flags deferred so each error is reported at most once per literal.
536 let mut has_error0012 = false;
537 let mut has_error0006 = false;
539 let mut i = 1 + self.offset();
540 let new_offset = loop {
542 has_error0012 = true;
548 has_error0006 = true;
552 '\'' => break (1 + i),
559 self.with_error(Errors::error0006)
562 self.with_error(Errors::error0012)
565 self.with_offset(new_offset);
566 TokenKind::SingleQuotedStringLiteral

// Consumes a \xHH escape; tolerant of malformed escapes (errors are only
// TODOs, per the comments retained from the original).
569 fn scan_hexadecimal_escape(&mut self) {
570 let ch2 = self.peek_char(2);
571 let ch3 = self.peek_char(3);
572 if !(Self::is_hexadecimal_digit(ch2)) {
573 // TODO: Consider producing an error for a malformed hex escape
574 // let lexer = with_error lexer SyntaxError.error0005 in
576 } else if !(Self::is_hexadecimal_digit(ch3)) {
577 // let lexer = with_error lexer SyntaxError.error0005 in

// Consumes a \u{...} escape, tolerating malformed forms.
584 fn scan_unicode_escape(&mut self) {
585 // At present the lexer is pointing at \u
586 if self.peek_char(2) == '{' {
587 if self.peek_char(3) == '$' {
588 // We have a malformed unicode escape that contains a possible embedded
589 // expression. Eat the \u and keep on processing the embedded expression.
590 // TODO: Consider producing a warning for a malformed unicode escape.
593 // We have a possibly well-formed escape sequence, and at least we know
594 // that it is not an embedded expression.
595 // TODO: Consider producing an error if the digits are out of range
596 // of legal Unicode characters.
597 // TODO: Consider producing an error if there are no digits.
598 // Skip over the slash, u and brace, and start lexing the number.
600 self.scan_hexadecimal_digits();
601 let ch = self.peek_char(0);
603 // TODO: Consider producing a warning for a malformed unicode escape.
610 // We have a malformed unicode escape sequence. Bail out.
611 // TODO: Consider producing a warning for a malformed unicode escape.

// Fast-forwards over characters that cannot start an escape, an embedded
// expression, a digit, a name, or the closing delimiter.
616 fn skip_uninteresting_double_quote_like_string_characters(&mut self, start_char: char) {
617 let is_uninteresting = |ch| match ch {
618 INVALID | '\\' | '$' | '{' | '[' | ']' | '-' => false,
619 ch if '0' <= ch && ch <= '9' => false,
620 ch => ch != start_char && !Self::is_name_nondigit(ch),
622 self.skip_while(&is_uninteresting);

// Scans an integer literal appearing inside a string (e.g. an array index).
625 fn scan_integer_literal_in_string(&mut self) -> TokenKind {
626 if self.peek_char(0) == '0' {
627 match self.peek_char(1) {
630 self.scan_hex_literal()
634 self.scan_binary_literal()
637 // An integer literal starting with 0 in a string will actually
638 // always be treated as a string index in HHVM, and not as an octal.
639 // In such a case, HHVM actually scans all decimal digits to create the
640 // token. TODO: (kasper) T40381519 we may want to change this behavior to something more
642 self.scan_decimal_digits_with_underscores();
643 TokenKind::DecimalLiteral
647 self.scan_decimal_digits_with_underscores();
648 TokenKind::DecimalLiteral

// Scans the opening portion of a double-quoted (or backtick) literal.
652 fn scan_double_quote_like_string_literal_from_start(&mut self, start_char: char) -> TokenKind {
653 let literal_token_kind = TokenKind::DoubleQuotedStringLiteral;
654 let head_token_kind = TokenKind::DoubleQuotedStringLiteralHead;
657 // If there's nothing interesting in this double-quoted string then
658 // we can just hand it back as-is.
659 self.skip_uninteresting_double_quote_like_string_characters(start_char);
660 match self.peek_char(0) {
662 // If the string is unterminated then give an error; if this is an
663 // embedded zero character then give an error and recurse; we might
664 // be able to make more progress.
666 self.with_error(Errors::error0012);
667 break literal_token_kind;
669 self.with_error(Errors::error0006);
674 // We made it to the end without finding a special character.
676 break literal_token_kind;
679 // We've found a backslash, dollar or brace.
681 break head_token_kind;

687 fn is_heredoc_tail(&self, name: &[u8]) -> bool {
688 // A heredoc tail is the identifier immediately preceded by a newline
689 // and immediately followed by an optional semi and then a newline.
691 // Note that the newline and optional semi are not part of the literal;
692 // the literal's lexeme ends at the end of the name. Either there is
693 // no trivia and the next token is a semi-with-trailing-newline, or
694 // the trailing trivia is a newline.
696 // This odd rule is to ensure that both
706 // . "something else";
709 if !(Self::is_newline(self.peek_back(1))) {
712 let len = name.len();
713 let ch0 = self.peek_char(len);
714 let ch1 = self.peek_char(len + 1);
715 ((Self::is_newline(ch0)) || ch0 == ';' && (Self::is_newline(ch1)))
716 && self.peek_string(len) == name

// Maps a literal kind to the token kind used for its terminating piece.
720 fn get_tail_token_kind(&self, literal_kind: &StringLiteralKind) -> TokenKind {
722 StringLiteralKind::LiteralHeredoc { heredoc: _ } => TokenKind::HeredocStringLiteralTail,
723 StringLiteralKind::LiteralDoubleQuoted => TokenKind::DoubleQuotedStringLiteralTail,

727 fn get_string_literal_body_or_double_quoted_tail(
729 literal_kind: &StringLiteralKind,
731 if literal_kind == &StringLiteralKind::LiteralDoubleQuoted {
732 TokenKind::DoubleQuotedStringLiteralTail
734 TokenKind::StringLiteralBody
// Resumes scanning inside a double-quoted or heredoc literal that was split
// by an embedded expression, returning the next literal piece's token kind.
// NOTE(review): many match arms and their pattern lines are missing from
// this excerpt; comments annotate only the visible fragments.
738 fn scan_string_literal_in_progress(&mut self, literal_kind: &StringLiteralKind) -> TokenKind {
739 let (is_heredoc, name): (bool, &[u8]) = match literal_kind {
740 StringLiteralKind::LiteralHeredoc { heredoc } => (true, &heredoc),
741 _ => (false, "".as_bytes()),
743 let start_char = '"';
744 let ch0 = self.peek_char(0);
// A name at the cursor: either the heredoc terminator, or an ordinary name.
745 if Self::is_name_nondigit(ch0) {
746 if is_heredoc && (self.is_heredoc_tail(name)) {
747 self.scan_name_impl();
748 TokenKind::HeredocStringLiteralTail
750 self.scan_name_impl();
757 self.with_error(Errors::error0012);
758 self.get_tail_token_kind(literal_kind)
760 self.with_error(Errors::error0006);
762 self.skip_uninteresting_double_quote_like_string_characters(start_char);
763 TokenKind::StringLiteralBody
767 let kind = self.get_string_literal_body_or_double_quoted_tail(literal_kind);
772 if Self::is_name_nondigit(self.peek_char(1)) {
// Backslash handling: dispatch on the escaped character.
784 match self.peek_char(1) {
785 // In these cases we just skip the escape sequence and
786 // keep on scanning for special characters.
787 | '\\' | '"' | '$' | 'e' | 'f' | 'n' | 'r' | 't' | 'v' | '`'
788 // Same in these cases; there might be more octal characters following but
789 // if there are, we'll just eat them as normal characters.
790 | '0' | '1' | '2' | '3' | '4' | '5' | '6' | '7' => {
792 self.skip_uninteresting_double_quote_like_string_characters(start_char);
793 TokenKind::StringLiteralBody}
795 self.scan_hexadecimal_escape();
796 self.skip_uninteresting_double_quote_like_string_characters(start_char);
797 TokenKind::StringLiteralBody }
799 self.scan_unicode_escape();
800 self.skip_uninteresting_double_quote_like_string_characters(start_char);
801 TokenKind::StringLiteralBody }
803 // The rules for escaping open braces in Hack are bizarre. Suppose we
808 // What is the value of $z? Naively you would think that the backslash
809 // escapes the braces, and the variables are embedded, so {123,456}. But
810 // that's not what happens. Yes, the backslash makes the brace no longer
811 // the opening brace of an expression. But the backslash is still part
812 // of the string! This is the string \{123,456\}.
813 // TODO: We might want to fix this because this is very strange.
814 // Eat the backslash and the brace.
816 TokenKind::StringLiteralBody
819 // TODO: A backslash followed by something other than an escape sequence
820 // is legal in hack, and treated as though it was just the backslash
821 // and the character. However we might consider making this a warning.
822 // It is particularly egregious when we have something like:
825 // The author of the code likely means the backslash to mean line
826 // continuation but in fact it just means to put a backslash and newline
829 self.skip_uninteresting_double_quote_like_string_characters(start_char);
830 TokenKind::StringLiteralBody
// Bracket / arrow tokens surface subscripting and member access inside
// interpolated strings.
836 TokenKind::LeftBracket
840 TokenKind::RightBracket
843 if (self.peek_char(1)) == '>' {
845 TokenKind::MinusGreaterThan
847 // Nothing interesting here. Skip it and find the next
848 // interesting character.
850 self.skip_uninteresting_double_quote_like_string_characters(start_char);
851 TokenKind::StringLiteralBody
// Digits: speculatively scan an integer literal on a clone; commit only if
// it produced no new errors.
854 ch if '0' <= ch && ch <= '9' => {
855 let mut lexer1 = self.clone();
856 let literal = lexer1.scan_integer_literal_in_string();
858 if self.errors.len() == lexer1.errors.len() {
859 self.continue_from(lexer1);
862 // If we failed to scan a literal, do not interpret the literal
863 self.with_offset(lexer1.offset());
864 TokenKind::StringLiteralBody
868 // Nothing interesting here. Skip it and find the next
869 // interesting character.
871 self.skip_uninteresting_double_quote_like_string_characters(start_char);
872 TokenKind::StringLiteralBody
877 // A heredoc string literal has the form
885 // <<< (optional whitespace) name (no whitespace) (newline)
887 // The optional body is:
889 // any characters whatsoever including newlines (newline)
893 // (no whitespace) name (no whitespace) (optional semi) (no whitespace) (newline)
895 // The names must be identical. The trailing semi and newline must be present.
897 // The body is any and all characters, up to the first line that exactly matches
900 // The body may contain embedded expressions.
902 // A nowdoc string literal has the same form except that the first name is
903 // enclosed in single quotes, and it may not contain embedded expressions.
// Scans the heredoc/nowdoc terminator name and returns its bytes; reports
// error0008 when the cursor is not on a valid name start.
904 fn scan_docstring_name_actual(&mut self) -> &'a [u8] {
905 let ch = self.peek_char(0);
906 if Self::is_name_nondigit(ch) {
907 let start_offset = self.offset();
909 self.skip_name_end();
910 let name = self.source.sub(start_offset, self.offset() - start_offset);
913 self.with_error(Errors::error0008);

// Scans the (possibly quoted) docstring name; single quotes select nowdoc,
// otherwise heredoc.
918 fn scan_docstring_name(&mut self) -> (&'a [u8], TokenKind) {
919 self.skip_whitespace();
920 let ch = self.peek_char(0);
921 let kind = if ch == '\'' {
922 TokenKind::NowdocStringLiteral
924 TokenKind::HeredocStringLiteral
927 let name = if ch == '\'' {
929 let name = self.scan_docstring_name_actual();
930 if (self.peek_char(0)) == '\'' {
934 self.with_error(Errors::error0010);
938 // Starting with PHP 5.3.0, the opening Heredoc identifier
939 // may optionally be enclosed in double quotes:
943 let name = self.scan_docstring_name_actual();
945 // same logic as above, just for double quote
946 if self.peek_char(0) == '\"' {
949 self.with_error(Errors::missing_double_quote)

// Scans '<<<' (or 'b<<<'), the name, and the mandatory trailing newline.
957 fn scan_docstring_header(&mut self) -> (&'a [u8], TokenKind) {
958 let ch = self.peek_char(0);
959 // Skip 3 for <<< or 4 for b<<<
960 let skip_count = if ch == 'b' { 4 } else { 3 };
961 self.advance(skip_count);
962 let (name, kind) = self.scan_docstring_name();
963 let ch = self.peek_char(0);
964 if !Self::is_newline(ch) {
965 self.with_error(Errors::error0011)
967 self.skip_to_end_of_line();
968 self.skip_end_of_line();

// Consumes body lines until one starts with `name` (optionally followed by
// ';') and a newline; errors with error0011 on running off the end.
972 fn scan_docstring_remainder(&mut self, name: &[u8]) {
973 let len = name.len();
975 let ch0 = self.peek_char(len);
976 let ch1 = self.peek_char(len + 1);
977 if ((Self::is_newline(ch0)) || ch0 == ';' && (Self::is_newline(ch1)))
978 && self.peek_string(len as usize) == name
980 self.advance(len as usize);
983 self.skip_to_end_of_line();
984 let ch = self.peek_char(0);
985 if Self::is_newline(ch) {
986 self.skip_end_of_line()
988 // If we got here then we ran off the end of the file without
989 // finding a newline. Just bail.
990 self.with_error(Errors::error0011);

// Entry point: header then body/terminator.
997 fn scan_docstring_literal(&mut self) -> TokenKind {
998 let (name, kind) = self.scan_docstring_header();
999 self.scan_docstring_remainder(name);
1003 fn scan_xhp_label(&mut self) {
1004 // An XHP label has the same grammar as a Hack name.
1005 let _: TokenKind = self.scan_name();

1008 fn scan_xhp_element_name(&mut self, attribute: bool) -> TokenKind {
1009 // An XHP element name is a sequence of one or more XHP labels each separated
1010 // by a single : or -. Note that it is possible for an XHP element name to be
1011 // followed immediately by a : or - that is the next token, so if we find
1012 // a : or - not followed by a label, we need to terminate the token.
1013 self.scan_xhp_label();
1014 let ch0 = self.peek_char(0);
1015 let ch1 = self.peek_char(1);
// Recurses to consume further label segments; ':' separators are only
// allowed outside attribute position.
1016 if (!attribute && ch0 == ':' || ch0 == '-') && Self::is_name_nondigit(ch1) {
1018 self.scan_xhp_element_name(false)
1020 TokenKind::XHPElementName

1024 // Is the next token we're going to lex a possible xhp class name?
1025 fn is_xhp_class_name(&self) -> bool {
1026 (self.peek_char(0) == ':') && (Self::is_name_nondigit(self.peek_char(1)))

1029 fn scan_xhp_class_name(&mut self) -> TokenKind {
1030 // An XHP class name is a colon followed by an xhp name.
1031 if self.is_xhp_class_name() {
1033 self.scan_xhp_element_name(false);
1034 TokenKind::XHPClassName
1036 self.with_error(Errors::error0008);
1038 TokenKind::ErrorToken

1042 fn scan_xhp_string_literal(&mut self) -> TokenKind {
1043 // XHP string literals are just straight up "find the closing quote"
1044 // strings. Embedded newlines are legal.
1045 let mut offset: usize = 1;
1047 match self.peek_char(offset) {
1049 self.advance(offset);
1051 self.with_error(Errors::error0012);
1052 return TokenKind::XHPStringLiteral;
1054 self.with_error(Errors::error0006);
1059 self.advance(offset + 1);
1060 return TokenKind::XHPStringLiteral;
1062 _ => offset = offset + 1,

1067 // Note that this does not scan an XHP body
1068 fn scan_xhp_token(&mut self) -> TokenKind {
1069 // TODO: HHVM requires that there be no trivia between < and name in an
1070 // opening tag, but does allow trivia between </ and name in a closing tag.
1071 // Consider allowing trivia in an opening tag.
1072 let ch0 = self.peek_char(0);
1073 if ch0 == INVALID && self.at_end() {
1074 TokenKind::EndOfFile
1075 } else if Self::is_name_nondigit(ch0) {
1076 self.scan_xhp_element_name(false)
1081 TokenKind::LeftBrace
1085 TokenKind::RightBrace
1092 if (self.peek_char(1)) == '/' {
1094 TokenKind::LessThanSlash
1100 '"' => self.scan_xhp_string_literal(),
1102 if (self.peek_char(1)) == '>' {
1104 TokenKind::SlashGreaterThan
1106 self.with_error(Errors::error0006);
1108 TokenKind::ErrorToken
1113 TokenKind::GreaterThan
1116 self.with_error(Errors::error0006);
1118 TokenKind::ErrorToken

// Consumes an XHP comment through its '-->' terminator; error0014 on EOF.
1124 fn scan_xhp_comment(&mut self) {
1127 let ch0 = self.peek_char(offset);
1128 let ch1 = self.peek_char(offset + 1);
1129 let ch2 = self.peek_char(offset + 2);
1130 match (ch0, ch1, ch2) {
1131 (INVALID, _, _) => {
1132 self.advance(offset as usize);
1133 return self.with_error(Errors::error0014);
1135 ('-', '-', '>') => return self.advance((offset + 3) as usize),
1136 _ => offset = offset + 1,

1140 fn scan_xhp_body(&mut self) -> TokenKind {
1141 // Naively you might think that an XHP body is just a bunch of characters,
1142 // terminated by an embedded { } expression or a tag. However, whitespace
1143 // and newlines are relevant in XHP bodies because they are "soft".
1144 // That is, any section of contiguous trivia has the same semantics as a
1145 // single space or newline -- just as in HTML.
1147 // Obviously this is of relevance to code formatters.
1149 // Therefore we detect whitespace and newlines within XHP bodies and treat
1150 // it as trivia surrounding the tokens within the body.
1152 // TODO: Is this also true of whitespace within XHP comments? If so then
1153 // we need to make XHP comments a sequence of tokens, rather than a
1154 // single token as they are now.
1155 let ch0 = self.peek_char(0);
1158 INVALID if self.at_end() => TokenKind::EndOfFile,
1161 TokenKind::LeftBrace
1165 TokenKind::RightBrace
1168 let ch1 = self.peek_char(1);
1169 let ch2 = self.peek_char(2);
1170 let ch3 = self.peek_char(3);
1171 match (ch1, ch2, ch3) {
1172 ('!', '-', '-') => {
1173 self.scan_xhp_comment();
1174 TokenKind::XHPComment
1178 TokenKind::LessThanSlash
1189 let ch = self.peek_char(offset);
1192 self.advance(offset);
1194 self.with_error(Errors::error0013);
1197 self.with_error(Errors::error0006);
1201 '\t' | ' ' | '\r' | '\n' | '{' | '}' | '<' => {
1202 self.advance(offset);
1205 _ => offset = offset + 1,
// Disambiguates '$', '$$' and '$name' per the compromise documented below.
1213 fn scan_dollar_token(&mut self) -> TokenKind {
1214 // We have a problem here. We wish to be able to lexically analyze both
1215 // PHP and Hack, but the introduction of $$ to Hack makes them incompatible.
1216 // "$$x" and "$$ $x" are legal in PHP, but illegal in Hack.
1217 // The rule in PHP seems to be that $ is a prefix operator, it is a token,
1218 // it can be followed by trivia, but the next token has to be another $
1219 // operator, a variable $x, or a {.
1221 // Here's a reasonable compromise. (TODO: Review this decision.)
1223 // $$x lexes as $ $x
1224 // $$$x lexes as $ $ $x
1227 // $$ followed by anything other than a name or a $ lexes as $$.
1229 // This means that lexing a PHP program which contains "$$ $x" is different
1230 // will fail at parse time, but I'm willing to live with that.
1232 // This means that lexing a Hack program which contains
1233 // "$x |> $$instanceof Foo" produces an error as well.
1235 // If these decisions are unacceptable then we will need to make the lexer
1236 // be aware of whether it is lexing PHP or Hack; thus far we have not had
1237 // to make this distinction.
1239 // We are already at $.
1240 let ch1 = self.peek_char(1);
1243 let ch2 = self.peek_char(2);
1244 if ch2 == '$' || ch2 == '{' || Self::is_name_nondigit(ch2) {
1246 TokenKind::Dollar // $$x or $$$
1249 TokenKind::DollarDollar // $$
1253 if Self::is_name_nondigit(ch1) {
1254 self.scan_variable() // $x
1257 TokenKind::Dollar // $
// Main token dispatcher: switches on the first character and hands off to
// the specialized scanners above. `in_type` changes '>'-family and '?:'
// lexing inside generic type argument lists.
// NOTE(review): large portions of this match (patterns, advances, closing
// braces) are missing from this excerpt; only the visible arms are shown.
1263 fn scan_token(&mut self, in_type: bool) -> TokenKind {
1264 let ch0 = self.peek_char(0);
1268 TokenKind::LeftBracket
1272 TokenKind::RightBracket
1276 TokenKind::LeftParen
1280 TokenKind::RightParen
1284 TokenKind::LeftBrace
1288 TokenKind::RightBrace
// '.' may start a float (".5"), '...', or plain '.'.
1290 '.' => match self.peek_char(1) {
1295 ch if '0' <= ch && ch <= '9' => self.scan_after_decimal_point(),
1297 if (self.peek_char(2)) == '.' {
1299 TokenKind::DotDotDot
1310 '-' => match self.peek_char(1) {
1313 TokenKind::MinusEqual
1317 TokenKind::MinusMinus
1321 TokenKind::MinusGreaterThan
1328 '+' => match self.peek_char(1) {
1331 TokenKind::PlusEqual
1342 '*' => match (self.peek_char(1), self.peek_char(2)) {
1345 TokenKind::StarEqual
1349 TokenKind::StarStarEqual
1364 '!' => match (self.peek_char(1), self.peek_char(2)) {
1367 TokenKind::ExclamationEqualEqual
1371 TokenKind::ExclamationEqual
1375 TokenKind::Exclamation
1378 '$' => self.scan_dollar_token(),
1380 if (self.peek_char(1)) == '=' {
1382 TokenKind::SlashEqual
1389 if (self.peek_char(1)) == '=' {
1391 TokenKind::PercentEqual
// '<' family: heredoc opener, shifts, comparisons, spaceship.
1398 match (self.peek_char(1), self.peek_char(2)) {
1399 ('<', '<') => self.scan_docstring_literal(),
1402 TokenKind::LessThanLessThanEqual
1404 // TODO: We lex and parse the spaceship operator.
1405 // TODO: This is not in the spec at present. We should either make it an
1406 // TODO: error, or add it to the specification.
1409 TokenKind::LessThanEqualGreaterThan
1413 TokenKind::LessThanEqual
1417 TokenKind::LessThanLessThan
1426 match (self.peek_char(1), self.peek_char(2)) {
1427 // If we are parsing a generic type argument list then we might be at the >>
1428 // in `List<List<int>>``, or at the >= of `let x:vec<int>=...`. In that case
1429 // we want to lex two >'s instead of >> / one > and one = instead of >=.
1430 (ch, _) if (ch == '>' || ch == '=') && in_type => {
1432 TokenKind::GreaterThan
1436 TokenKind::GreaterThanGreaterThanEqual
1440 TokenKind::GreaterThanGreaterThan
1444 TokenKind::GreaterThanEqual
1448 TokenKind::GreaterThan
1452 '=' => match (self.peek_char(1), self.peek_char(2)) {
1455 TokenKind::EqualEqualEqual
1459 TokenKind::EqualEqualGreaterThan
1463 TokenKind::EqualEqual
1467 TokenKind::EqualGreaterThan
1475 if (self.peek_char(1)) == '=' {
1477 TokenKind::CaratEqual
1483 '|' => match self.peek_char(1) {
1490 TokenKind::BarGreaterThan
1501 '&' => match self.peek_char(1) {
1504 TokenKind::AmpersandEqual
1508 TokenKind::AmpersandAmpersand
1512 TokenKind::Ampersand
1515 '?' => match (self.peek_char(1), self.peek_char(2)) {
1516 (':', _) if !in_type => {
1518 TokenKind::QuestionColon
1522 TokenKind::QuestionMinusGreaterThan
1526 TokenKind::QuestionQuestionEqual
1530 TokenKind::QuestionQuestion
1534 TokenKind::QuestionGreaterThan
1536 ('a', 's') if !Self::is_name_nondigit(self.peek_char(3)) => {
1538 TokenKind::QuestionAs
1546 // In experimental mode only: try to scan for a pocket universes atom
1548 let ch1 = self.peek_char(1);
1552 TokenKind::ColonColon
1553 } else if self.is_experimental_mode && ch1 == '@' {
1563 TokenKind::Semicolon
// Numeric literals: leading '0' selects hex/binary/octal-or-float.
1573 '0' => match self.peek_char(1) {
1576 self.scan_hex_literal()
1580 self.scan_binary_literal()
1582 _ => self.scan_octal_or_float(),
1584 ch if '1' <= ch && ch <= '9' => self.scan_decimal_or_float(),
1585 '\'' => self.scan_single_quote_string_literal(),
1586 '`' => self.scan_double_quote_like_string_literal_from_start('`'),
1587 '"' => self.scan_double_quote_like_string_literal_from_start('"'),
1590 TokenKind::Backslash
// 'b' prefix: b"..." / b'...' / b<<< binary-string forms are re-dispatched.
1593 let c1 = self.peek_char(1);
1594 let c2 = self.peek_char(2);
1595 let c3 = self.peek_char(3);
1596 c1 == '"' || c1 == '\'' || (c1 == '<' && c2 == '<' && c3 == '<')
1600 self.scan_token(in_type)
1604 if ch0 == INVALID && self.at_end() {
1605 TokenKind::EndOfFile
1606 } else if Self::is_name_nondigit(ch0) {
1609 self.with_error(Errors::error0006);
1611 TokenKind::ErrorToken
// Convenience wrappers fixing the `in_type` context of scan_token.
1617 fn scan_token_outside_type(&mut self) -> TokenKind {
1618 self.scan_token(false)
1621 fn scan_token_inside_type(&mut self) -> TokenKind {
1622 self.scan_token(true)
1629 // white-space-character::
1631 // Space character (U+0020)
1632 // Horizontal-tab character (U+0009)
1634 // single-line-comment::
1635 // // input-characters-opt
1636 // # input-characters-opt
1639 // Carriage-return character (U+000D)
1640 // Line-feed character (U+000A)
1641 // Carriage-return character followed by line-feed character
// Returns the index just past the end-of-line sequence beginning at `i`:
// "\r\n" counts as one two-byte end of line; a lone "\n" as one byte.
// Panics if `i` does not point at an end-of-line character (caller contract).
1643 fn str_scan_end_of_line(s: &[u8], i: usize) -> usize {
1644 match s.get(i).map(|x| *x as char) {
1646 Some('\r') => match s.get(i + 1).map(|x| *x as char) {
1647 Some('\n') => 2 + i,
// NOTE(review): the arm for a lone '\r' is not visible in this excerpt;
// presumably it yields i + 1 — confirm against the full source.
1650 Some('\n') => i + 1,
1651 _ => panic!("str_scan_end_of_line called while not on end of line!"),
// Builds an end-of-line trivium for the EOL sequence at the current offset.
// Panics if the current character is not an end-of-line character.
1655 fn scan_end_of_line(&mut self) -> Token::Trivia {
1656 match self.peek_char(0) {
// '\r' optionally followed by '\n' forms a single EOL of width 1 or 2.
1658 let w = if self.peek_char(1) == '\n' { 2 } else { 1 };
1660 Token::Trivia::make_eol(self.source(), self.start, w)
1664 Token::Trivia::make_eol(self.source(), self.start, 1)
1666 _ => panic!("scan_end_of_line called while not on end of line!"),
// Scans a '#'-style single-line comment: consumes up to (not including)
// the end of line, then wraps the lexeme as single-line-comment trivia.
1670 fn scan_hash_comment(&mut self) -> Token::Trivia {
1671 self.skip_to_end_of_line();
1672 Token::Trivia::make_single_line_comment(self.source(), self.start, self.width())
// Scans a `//` comment, distinguishing the special FALLTHROUGH marker
// comment (used to suppress switch-fallthrough errors) from an ordinary one.
1675 fn scan_single_line_comment(&mut self) -> Token::Trivia {
1676 // A fallthrough comment is two slashes, any amount of whitespace,
1677 // FALLTHROUGH, and any characters may follow.
1678 // TODO: Consider allowing lowercase fallthrough.
1681 self.skip_whitespace();
// Snapshot the lexer right after the post-slash whitespace so we can
// later test whether the comment body starts with "FALLTHROUGH".
1682 let lexer_ws = self.clone();
1683 self.skip_to_end_of_line_or_end_tag();
1684 let w = self.width();
// `remainder` = number of bytes in the comment body after the whitespace;
// it must be at least 11 ("FALLTHROUGH".len()) for the marker to fit.
1685 let remainder = self.offset - lexer_ws.offset;
1686 if remainder >= 11 && lexer_ws.peek_string(11) == "FALLTHROUGH".as_bytes() {
1687 Token::Trivia::make_fallthrough(self.source(), self.start, w)
1689 Token::Trivia::make_single_line_comment(self.source(), self.start, w)
// Advances past the closing "*/" of a delimited comment; if end of file is
// reached first, records error0007 (unterminated comment) instead.
1693 fn skip_to_end_of_delimited_comment(&mut self) {
1696 let ch0 = self.peek_char(offset);
// INVALID char here means either true end-of-file (unterminated comment)
// or an embedded NUL; the at_end branch below distinguishes the two.
1698 self.advance(offset);
1700 return self.with_error(Errors::error0007);
1702 // TODO: Do we want to give a warning for an embedded zero char
1703 // inside a comment?
// "*/" closes the comment: consume through both characters and return.
1706 } else if ch0 == '*' && (self.peek_char(offset + 1)) == '/' {
1707 return self.advance(offset + 2);
// Scans a /* ... */ comment and classifies it as an HH_FIXME pragma, an
// HH_IGNORE_ERROR pragma, or a plain delimited comment, based on the text
// that follows the opening "/*" and any whitespace.
1714 fn scan_delimited_comment(&mut self) -> Token::Trivia {
1715 // The original lexer lexes a fixme / ignore error as:
1717 // slash star [whitespace]* HH_FIXME [whitespace or newline]* leftbracket
1718 // [whitespace or newline]* integer [any text]* star slash
1720 // Notice that the original lexer oddly enough does not verify that there
1721 // is a right bracket.
1723 // For our purposes we will just check for HH_FIXME / HH_IGNORE_ERROR;
1724 // a later pass can try to parse out the integer if there is one,
1725 // give a warning if there is not, and so on.
1728 self.skip_whitespace();
// Snapshot taken after the leading whitespace so the pragma keyword can be
// matched even though `self` then consumes the rest of the comment.
1730 let lexer_ws = self.clone();
1731 self.skip_to_end_of_delimited_comment();
1732 let w = self.width();
1733 if lexer_ws.match_string("HH_FIXME".as_bytes()) {
1734 Token::Trivia::make_fix_me(self.source(), self.start, w)
1735 } else if lexer_ws.match_string("HH_IGNORE_ERROR".as_bytes()) {
1736 Token::Trivia::make_ignore_error(self.source(), self.start, w)
1738 Token::Trivia::make_delimited_comment(self.source(), self.start, w)
// Scans at most one piece of Hack trivia (comment, whitespace, or end of
// line) at the current position; returns None when the next character
// begins a real token (the fall-through arms are not visible here).
1742 fn scan_php_trivia(&mut self) -> Option<Token::Trivia> {
1743 // Hack does not support PHP style embedded markup:
1751 // However, ?> is never legal in Hack, so we can treat ?> ... any text ... <?php
1752 // as a comment, and then give an error saying that this feature is not supported
1755 // TODO: Give an error if this appears in a Hack program.
1756 match self.peek_char(0) {
// '#'-style comment.
1758 self.start_new_lexeme();
1759 Some(self.scan_hash_comment())
// '/': either a // single-line or a /* */ delimited comment.
1762 self.start_new_lexeme();
1763 match self.peek_char(1) {
1764 '/' => Some(self.scan_single_line_comment()),
1765 '*' => Some(self.scan_delimited_comment()),
// Run of horizontal whitespace: measure it without per-char advancing.
1770 let new_end = Self::str_skip_whitespace(self.source_text_string(), self.offset);
1771 let new_start = self.offset;
1773 Token::Trivia::make_whitespace(self.source(), new_start, new_end - new_start);
1774 self.with_start_offset(new_start, new_end);
// End-of-line trivia.
1778 self.start_new_lexeme();
1779 Some(self.scan_end_of_line())
1782 self.start_new_lexeme();
// Scans at most one piece of XHP trivia — whitespace or an end of line.
// XHP bodies do not have comments-as-trivia (see TODO below).
1789 fn scan_xhp_trivia(&mut self) -> Option<Token::Trivia> {
1790 // TODO: Should XHP comments <!-- --> be their own thing, or a kind of
1791 // trivia associated with a token? Right now they are the former.
1792 let i = self.offset;
1793 let ch = self.peek_char(0);
// Whitespace run: compute its end in one pass over the raw source bytes.
1796 let j = Self::str_skip_whitespace(self.source_text_string(), i);
1797 self.with_start_offset(i, j);
1798 Some(Token::Trivia::make_whitespace(self.source(), i, j - i))
// End-of-line sequence ("\r\n", "\r", or "\n").
1801 let j = Self::str_scan_end_of_line(self.source_text_string(), i);
1802 self.with_start_offset(i, j);
1803 Some(Token::Trivia::make_eol(self.source(), i, j - i))
// Anything else is not trivia; presumably the missing arm returns None.
1808 self.start_new_lexeme();
1814 // We divide trivia into "leading" and "trailing" trivia of an associated
1815 // token. This means that we must find a dividing line between the trailing trivia
1816 // following one token and the leading trivia of the following token. Plainly
1817 // we need only find this line while scanning trailing trivia. The heuristics
1819 // * The first newline trivia encountered is the last trailing trivia.
1820 // * The newline which follows a // or # comment is not part of the comment
1821 // but does terminate the trailing trivia.
1822 // * A pragma to turn checks off (HH_FIXME and HH_IGNORE_ERROR) is
1823 // always a leading trivia.
// Repeatedly applies `scanner` and accumulates the trivia it produces,
// stopping when it yields None (loop construct not visible in this excerpt).
1824 fn scan_leading_trivia(
1826 scanner: &Fn(&mut Self) -> Option<Token::Trivia>,
1827 ) -> Vec<Token::Trivia> {
1828 let mut acc = vec![];
1830 match scanner(self) {
1832 Some(t) => acc.push(t),
// Collects all leading trivia before a Hack token.
1837 pub fn scan_leading_php_trivia(&mut self) -> Vec<Token::Trivia> {
1838 self.scan_leading_trivia(&Self::scan_php_trivia)
// Collects all leading trivia before an XHP body token.
1841 pub fn scan_leading_xhp_trivia(&mut self) -> Vec<Token::Trivia> {
1842 self.scan_leading_trivia(&Self::scan_xhp_trivia)
// Collects trailing trivia after a token, applying the dividing-line rules
// documented above: each piece is scanned speculatively on a cloned lexer
// and only committed (via continue_from) when it belongs to the trailing set.
1845 fn scan_trailing_trivia(
1847 scanner: &Fn(&mut Self) -> Option<Token::Trivia>,
1848 ) -> Vec<Token::Trivia> {
1849 let mut acc = vec![];
// Speculative scan: `lexer1` advances; `self` commits only on acceptance.
1851 let mut lexer1 = self.clone();
1852 match scanner(&mut lexer1) {
1854 self.continue_from(lexer1);
1857 Some(t) => match t.kind() {
// First newline terminates the trailing trivia (it is still included).
1858 TriviaKind::EndOfLine => {
1859 self.continue_from(lexer1);
// FixMe / IgnoreError pragmas always belong to the NEXT token's leading
// trivia; the (not fully visible) handling here must leave them unconsumed.
1863 TriviaKind::FixMe | TriviaKind::IgnoreError => {
1867 self.continue_from(lexer1);
// Collects trailing trivia after a Hack token.
1875 pub fn scan_trailing_php_trivia(&mut self) -> Vec<Token::Trivia> {
1876 self.scan_trailing_trivia(&Self::scan_php_trivia)
// Collects trailing trivia after an XHP body token.
1879 pub fn scan_trailing_xhp_trivia(&mut self) -> Vec<Token::Trivia> {
1880 self.scan_trailing_trivia(&Self::scan_xhp_trivia)
// Lookahead: true if, after any leading trivia, the next character can
// start a name. Works on a clone so `self` is left unchanged.
1883 pub fn is_next_name(&self) -> bool {
1884 let mut lexer = self.clone();
1885 lexer.scan_leading_php_trivia();
1886 Self::is_name_nondigit(lexer.peek_char(0))
// Lookahead: true if, after any leading trivia, an XHP class name follows.
// Non-mutating: operates on a clone of the lexer.
1889 pub fn is_next_xhp_class_name(&self) -> bool {
1890 let mut lexer = self.clone();
1891 lexer.scan_leading_php_trivia();
1892 lexer.is_xhp_class_name()
// If `text` is (case-insensitively) one of the keywords below, returns its
// lowercase form; otherwise (missing arm) presumably None.
1895 fn as_case_insensitive_keyword(&self, text: &str) -> Option<String> {
1896 let lower = text.to_ascii_lowercase();
1897 let res = match lower.as_ref() {
1898 "__halt_compiler" | "abstract" | "and" | "array" | "as" | "bool" | "boolean"
1899 | "break" | "callable" | "case" | "catch" | "class" | "clone" | "const"
1900 | "continue" | "default" | "die" | "do" | "echo" | "else" | "elseif"
1901 | "empty" | "endfor" | "endforeach" | "endif" | "endswitch"
1902 | "endwhile" | "eval" | "exit" | "extends" | "false" | "final" | "finally" | "for"
1903 | "foreach" | "function" | "global" | "goto" | "if" | "implements" | "include"
1904 | "include_once" | "inout" | "instanceof" | "insteadof" | "int" | "integer" | "interface"
1905 | "isset" | "list" | "namespace" | "new" | "null" | "or" | "parent" | "print"
1906 | "private" | "protected" | "public" | "require" | "require_once" | "return"
1907 | "self" | "static" | "string" | "switch" | "throw" | "trait" | "try" | "true"
1908 | "unset" | "use" | "using" | "var" | "void" | "while" | "xor" | "yield" => Some(lower),
// NOTE(review): `res` is already Option<String>, so `.to_owned()` clones a
// String that could be returned directly — consider `res` alone.
1911 res.map(|x| x.to_owned())
// True when a keyword was written with non-lowercase letters and that
// spelling should be reported. "true"/"false"/"null" are exempt: they are
// legal in any case.
1914 fn lowercase_error(&self, original_text: &str, lowered_text: &str) -> bool {
1915 match lowered_text {
1916 "true" | "false" | "null" => false,
1917 _ => original_text != lowered_text,
// Reclassifies a Name token as a keyword when its (case-normalized) text
// matches one; `only_reserved` restricts the match to reserved keywords.
// Records an error for keywords written in the wrong case.
1921 fn as_keyword(&mut self, only_reserved: bool, kind: TokenKind) -> TokenKind {
1922 if kind == TokenKind::Name {
1923 let original_text = self.current_text_as_str();
1924 let text_as_lowercase_keyword = self.as_case_insensitive_keyword(original_text);
1925 let text = match text_as_lowercase_keyword.as_ref() {
1927 None => original_text,
1929 match TokenKind::from_string(&text.as_bytes(), only_reserved) {
// `let` is a keyword only under experimental mode; otherwise stay a Name.
1930 Some(TokenKind::Let) if (!(self.is_experimental_mode())) => TokenKind::Name,
1932 if self.lowercase_error(original_text, &text) {
1933 let err = Errors::uppercase_kw(original_text);
1934 self.with_error(err);
1938 _ => TokenKind::Name,
// Scans leading trivia, then one token via `scanner`, applying the keyword
// policy selected by `as_name`. Returns (kind, width, leading trivia).
1945 fn scan_token_and_leading_trivia(
1947 scanner: &Fn(&mut Self) -> TokenKind,
1949 ) -> (TokenKind, usize, Vec<Token::Trivia>) {
1950 // Get past the leading trivia
1951 let leading = self.scan_leading_php_trivia();
1952 // Remember where we were when we started this token
1953 self.start_new_lexeme();
1954 let kind = scanner(self);
// Map the raw kind through the requested keyword policy; the bool passed to
// as_keyword is its `only_reserved` flag.
1955 let kind = match as_name {
1956 KwSet::AllKeywords => kind,
1957 KwSet::NonReservedKeywords => self.as_keyword(true, kind),
1958 KwSet::NoKeywords => self.as_keyword(false, kind),
1960 let w = self.width();
// Scans a full token: leading trivia, the token itself, and trailing trivia.
// Trailing trivia is suppressed for string-literal heads (the string scanner
// owns what follows) and limited to one EOL after a `?>` close tag.
1964 fn scan_token_and_trivia(
1966 scanner: &Fn(&mut Self) -> TokenKind,
1969 let token_start = self.offset;
1971 let (kind, w, leading) = self.scan_token_and_leading_trivia(scanner, as_name);
1972 let trailing = match kind {
1973 TokenKind::DoubleQuotedStringLiteralHead => vec![],
1974 TokenKind::QuestionGreaterThan => {
1975 if Self::is_newline(self.peek_char(0)) {
1976 // consume only trailing EOL token after ?> as trailing trivia
1977 vec![self.scan_end_of_line()]
1982 _ => self.scan_trailing_php_trivia(),
1984 Token::make(kind, token_start, w, leading, trailing)
// Runs `tokenizer` and asserts it consumed input (or legitimately produced
// EndOfFile at the end), panicking otherwise — a guard against the lexer
// looping forever without advancing.
1987 fn scan_assert_progress(&mut self, tokenizer: &Fn(&mut Self) -> Token) -> Token {
1988 let original_remaining = self.remaining();
1989 let token = tokenizer(self);
1990 let new_remaining = self.remaining();
// Note precedence: `||` binds looser than `&&`, so this reads as
// "consumed something, OR (was at EOF, still at EOF, and produced EndOfFile)".
1991 if new_remaining < original_remaining
1992 || original_remaining == 0
1993 && new_remaining == 0
1994 && (token.kind()) == TokenKind::EndOfFile
1998 panic!("failed to make progress at {}\n", self.offset)
// Scans the next token (with trivia) under the given keyword policy,
// wrapped in the no-progress assertion.
2002 fn scan_next_token(&mut self, scanner: &Fn(&mut Self) -> TokenKind, as_name: KwSet) -> Token {
2003 let tokenizer = |x: &mut Self| x.scan_token_and_trivia(scanner, as_name);
2004 self.scan_assert_progress(&tokenizer)
// Scans the next token, leaving every keyword-spelled identifier as a Name.
2007 fn scan_next_token_as_name(&mut self, scanner: &Fn(&mut Self) -> TokenKind) -> Token {
2008 self.scan_next_token(scanner, KwSet::AllKeywords)
// Scans the next token with full keyword recognition (KwSet::NoKeywords
// policy — see scan_token_and_leading_trivia for the mapping).
2011 fn scan_next_token_as_keyword(&mut self, scanner: &Fn(&mut Self) -> TokenKind) -> Token {
2012 self.scan_next_token(scanner, KwSet::NoKeywords)
// Scans the next token, recognizing only reserved keywords; non-reserved
// keyword spellings stay Names.
2015 fn scan_next_token_nonreserved_as_name(
2017 scanner: &Fn(&mut Self) -> TokenKind,
2019 self.scan_next_token(scanner, KwSet::NonReservedKeywords)
// Public entry point: next token with keyword recognition. Dispatches on
// the lexer's type-context flag (the branching condition — presumably
// `self.in_type` — is not visible in this excerpt).
2024 pub fn next_token(&mut self) -> Token {
2026 self.scan_next_token_as_keyword(&Self::scan_token_inside_type)
2028 self.scan_next_token_as_keyword(&Self::scan_token_outside_type)
// Next token with leading trivia only; trailing trivia is deliberately left
// unscanned (callers use this when the following text must not be consumed).
2032 pub fn next_token_no_trailing(&mut self) -> Token {
2033 let tokenizer = |x: &mut Self| {
2034 let token_start = x.offset;
2035 let (kind, w, leading) =
2036 x.scan_token_and_leading_trivia(&Self::scan_token_outside_type, KwSet::NoKeywords);
// Empty trailing-trivia vector by construction.
2037 Token::make(kind, token_start, w, leading, vec![])
2039 self.scan_assert_progress(&tokenizer)
// Next token while positioned inside a string literal of the given kind.
// No leading trivia is scanned (it would be string content), and trailing
// trivia is scanned only once the literal's tail token has been produced.
2042 pub fn next_token_in_string(&mut self, literal_kind: &StringLiteralKind) -> Token {
2043 let token_start = self.offset;
2044 self.start_new_lexeme();
2045 // We're inside a string. Do not scan leading trivia.
2046 let kind = self.scan_string_literal_in_progress(literal_kind);
2047 let w = self.width();
2048 // Only scan trailing trivia if we've finished the string.
2049 let trailing = match kind {
2050 TokenKind::DoubleQuotedStringLiteralTail | TokenKind::HeredocStringLiteralTail => {
2051 self.scan_trailing_php_trivia()
2055 Token::make(kind, token_start, w, vec![], trailing)
// Scans the opening line of a heredoc ("<<<NAME") and returns the header
// token together with the heredoc name, which the caller needs to find the
// matching terminator. Trailing trivia is not scanned.
2058 pub fn next_docstring_header(&mut self) -> (Token, &'a [u8]) {
2059 // We're at the beginning of a heredoc string literal. Scan leading
2060 // trivia but not trailing trivia.
2061 let token_start = self.offset;
2062 let leading = self.scan_leading_php_trivia();
2063 self.start_new_lexeme();
2064 let (name, _) = self.scan_docstring_header();
2065 let w = self.width();
2066 let token = Token::make(
2067 TokenKind::HeredocStringLiteralHead,
// Next token with all keyword spellings kept as Names.
2076 pub fn next_token_as_name(&mut self) -> Token {
2077 self.scan_next_token_as_name(&Self::scan_token_outside_type)
// Next token recognizing only reserved keywords.
2080 pub fn next_token_non_reserved_as_name(&mut self) -> Token {
2081 self.scan_next_token_nonreserved_as_name(&Self::scan_token_outside_type)
// Next token inside an XHP element, returned together with its source text.
// With `no_trailing`, the `>` / `/>` that closes an opener keeps no trailing
// trivia so the following whitespace leads the body token instead.
2084 pub fn next_xhp_element_token(&mut self, no_trailing: bool) -> (Token, &[u8]) {
2085 // XHP elements have whitespace, newlines and Hack comments.
2086 let tokenizer = |lexer: &mut Self| {
2087 let token_start = lexer.offset;
2088 let (kind, w, leading) =
2089 lexer.scan_token_and_leading_trivia(&Self::scan_xhp_token, KwSet::AllKeywords);
2090 // We do not scan trivia after an XHPOpen's >. If that is the beginning of
2091 // an XHP body then we want any whitespace or newlines to be leading trivia
2092 // of the body token.
2094 TokenKind::GreaterThan | TokenKind::SlashGreaterThan if no_trailing => {
2095 Token::make(kind, token_start, w, leading, vec![])
2098 let trailing = lexer.scan_trailing_php_trivia();
2099 Token::make(kind, token_start, w, leading, trailing)
2103 let token = self.scan_assert_progress(&tokenizer);
2104 let token_width = token.width();
2105 let trailing_width = token.trailing_width();
// Recover the token's start by backing off the widths just consumed, then
// slice its exact text (excluding trivia) out of the source buffer.
2106 let token_start_offset = (self.offset) - trailing_width - token_width;
2107 let token_text = self.source.sub(token_start_offset, token_width);
// Next token inside an XHP body. Trivia is significant here, so trailing
// trivia is kept only on actual XHPBody tokens — anything else (an element
// or brace) lets the trivia lead the following token.
2111 pub fn next_xhp_body_token(&mut self) -> Token {
2112 let scanner = |lexer: &mut Self| {
2113 let token_start = lexer.offset;
2114 let leading = lexer.scan_leading_xhp_trivia();
2115 lexer.start_new_lexeme();
2116 let kind = lexer.scan_xhp_body();
2117 let w = lexer.width();
2119 // Trivia (leading and trailing) is semantically
2120 // significant for XHPBody tokens. When we find elements or
2121 // braced expressions inside the body, the trivia should be
2122 // seen as leading the next token, but we should certainly
2123 // keep it trailing if this is an XHPBody token.
2124 if kind == TokenKind::XHPBody
2125 { lexer.scan_trailing_xhp_trivia() }
2127 Token::make(kind, token_start, w, leading, trailing)
2129 self.scan_assert_progress(&scanner)
// Next token scanned specifically as an XHP class name (keywords disabled).
2132 pub fn next_xhp_class_name(&mut self) -> Token {
2133 self.scan_token_and_trivia(&Self::scan_xhp_class_name, KwSet::NoKeywords)
// Next token scanned as an XHP element name (the `false` argument's meaning
// is defined by scan_xhp_element_name, not visible here).
2136 pub fn next_xhp_name(&mut self) -> Token {
2137 let scanner = |x: &mut Self| x.scan_xhp_element_name(false);
2138 self.scan_token_and_trivia(&scanner, KwSet::NoKeywords)
// Wraps the current lexeme (start..offset) as a Markup token with no trivia.
2141 fn make_markup_token(&self) -> Token {
2142 Token::make(TokenKind::Markup, self.start, self.width(), vec![], vec![])
// NOTE(review): the opening of this signature (the fn name — presumably a
// long-open-tag helper — and its earlier parameters) is not visible in this
// excerpt; the fragment builds the language-name token of `<?hh` / `<?php`.
2147 name_token_offset: usize,
2150 less_than_question_token: Token,
2151 ) -> (Token, Option<(Token, Option<Token>)>) {
2154 // single line comments that follow the language in leading markup_text
2155 // determine the file check mode, read the trailing trivia and attach it
2156 // to the language token
2157 let trailing = self.scan_trailing_php_trivia();
2158 let name = Token::make(TokenKind::Name, name_token_offset, size, vec![], trailing);
2159 (markup_text, Some((less_than_question_token, Some(name))))
// Builds the markup token plus its `<?` suffix: the LessThanQuestion token
// and, depending on what follows, a language name ("hh"/"php"), an `=`
// (short echo tag), or no suffix token at all.
2162 fn make_markup_and_suffix(&mut self) -> (Token, Option<(Token, Option<Token>)>) {
2163 let markup_text = self.make_markup_token();
2164 let less_than_question_token =
2165 Token::make(TokenKind::LessThanQuestion, self.offset, 2, vec![], vec![]);
2168 let name_token_offset = self.offset;
// The language id after `<?` is matched case-insensitively.
2169 let ch0 = self.peek_char(0).to_ascii_lowercase();
2170 let ch1 = self.peek_char(1).to_ascii_lowercase();
2171 let ch2 = self.peek_char(2).to_ascii_lowercase();
2172 match (ch0, ch1, ch2) {
// Two-character language tag (the match pattern is not visible; size 2).
2174 self.make_long_tag(name_token_offset, 2, markup_text, less_than_question_token)
2176 ('p', 'h', 'p') => {
2177 self.make_long_tag(name_token_offset, 3, markup_text, less_than_question_token)
// `<?=` short echo tag: the suffix token is a width-1 Equal.
2182 let equal = Token::make(TokenKind::Equal, name_token_offset, 1, vec![], vec![]);
2184 (markup_text, Some((less_than_question_token, Some(equal))))
2187 (markup_text, Some((less_than_question_token, (None))))
// Skips the leading markup section of a file (shebang line and/or leading
// whitespace) and, if a `<?` open tag follows, hands off to
// make_markup_and_suffix; otherwise the whole span is one Markup token.
2192 fn skip_to_end_of_markup(&mut self) -> (Token, Option<(Token, Option<Token>)>) {
2193 let start_offset = {
2194 // if leading section starts with #! - it should span the entire line
2195 let index = self.offset;
// Header scanning is only valid at offset 0 (the panic's condition line is
// not visible in this excerpt).
2197 panic!("Should only try to lex header at start of document")
2199 if self.peek_def(index, INVALID) == '#' && self.peek_def(index + 1, INVALID) == '!' {
// Shebang: consume to end of line, +1 to step past the newline itself.
2200 self.skip_while_to_offset(&Self::not_newline) + 1
2202 // this should really just be `index` - but, skip whitespace as the FFP
2203 // tests use magic comments in leading markup to set flags, but blank
2204 // them out before parsing; the newlines are kept to provide correct line
2205 // numbers in errors
2206 self.skip_while_to_offset(&|x| {
2207 Self::is_newline(x) || Self::is_whitespace_no_newline(x)
2211 if self.peek(start_offset) == '<' && self.peek_def(start_offset + 1, INVALID) == '?' {
2212 self.with_offset(start_offset);
2213 self.make_markup_and_suffix()
2215 (self.make_markup_token(), None)
// Public entry point for lexing the file header: markup token plus optional
// `<?` open-tag tokens. Must be called at the start of the document.
2219 pub fn scan_header(&mut self) -> (Token, Option<(Token, Option<Token>)>) {
2220 self.start_new_lexeme();
2221 self.skip_to_end_of_markup()
// Lookahead: true if, after leading trivia, a '%' followed by a name start
// character appears — the shape of an XHP category name. Non-mutating.
2224 pub fn is_next_xhp_category_name(&self) -> bool {
2225 let mut lexer = self.clone();
2226 let _ = lexer.scan_leading_php_trivia();
2227 // An XHP category is an xhp element name preceded by a %.
2228 let ch0 = lexer.peek_char(0);
2229 let ch1 = lexer.peek_char(1);
2230 ch0 == '%' && Self::is_name_nondigit(ch1)
// Scans an XHP category name (%name) when one is next; otherwise falls back
// to ordinary token scanning. The element-name result is discarded — only
// the consumed span matters for the XHPCategoryName token.
2233 fn scan_xhp_category_name(&mut self) -> TokenKind {
2234 if self.is_next_xhp_category_name() {
2236 let _ = self.scan_xhp_element_name(false);
2237 TokenKind::XHPCategoryName
2239 self.scan_token(false)
// Next token scanned as an XHP category name, with trivia.
2243 pub fn next_xhp_category_name(&mut self) -> Token {
2244 self.scan_token_and_trivia(&Self::scan_xhp_category_name, KwSet::NoKeywords)
// After __halt_compiler, packs the entire rest of the source into a single
// after-halt-compiler trivium attached to `last_token`, and moves the lexer
// to end of file.
2247 pub fn rescan_halt_compiler(&mut self, last_token: Token) -> Token {
2248 // __halt_compiler stops parsing of the file.
2249 // In order to preserve fill fidelity aspect of the parser
2250 // we pack everything that follows __halt_compiler as
2251 // separate opaque kind of trivia - it will be attached as a trailing trivia
2252 // to the last_token and existing trailing trivia will be merged in.
2254 // This is incorrect for minimal token
// Compute where last_token's text ends: leading start + leading + token width.
2255 let leading_start_offset = last_token.leading_start_offset().unwrap_or(0);
2256 let start_offset = leading_start_offset + last_token.leading_width() + last_token.width();
2258 let length = self.source.length();
// The trivium spans from the end of the token to the end of the source.
2259 let trailing = Token::Trivia::make_after_halt_compiler(
2262 length - start_offset,
// Park the lexer at EOF so no further tokens are produced.
2264 self.with_offset(length);
2265 last_token.with_trailing(vec![trailing])