(* Copyright (c) 2016, Facebook, Inc.
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the "hack" directory of this source tree. An additional grant
 * of patent rights can be found in the PATENTS file in the same directory.
 *)
11 module TriviaKind
= Full_fidelity_trivia_kind
12 module TokenKind
= Full_fidelity_token_kind
13 module SourceText
= Full_fidelity_source_text
14 module SyntaxError
= Full_fidelity_syntax_error
19 start
: int; (* Both start and offset are absolute offsets in the text. *)
21 errors
: SyntaxError.t list
23 val make
: SourceText.t
-> t
25 val source
: t
-> SourceText.t
26 val errors
: t
-> SyntaxError.t list
29 val with_error
: t
-> string -> t
30 val with_offset
: t
-> int -> t
31 val with_offset_errors
: t
-> int -> SyntaxError.t list
-> t
32 val start_new_lexeme
: t
-> t
33 val advance
: t
-> int -> t
34 val with_start_offset
: t
-> int -> int -> t
37 let padding = String.make
100 '
\x00'
39 (* text consists of a pair consisting of a string, padded by a certain, fixed
40 * amount of null bytes, and then the rest of the source text *)
43 start
: int; (* Both start and offset are absolute offsets in the text. *)
45 errors
: SyntaxError.t list
49 let text'
= SourceText.append_padding
text padding in
50 { text = text'
; start
= 0; offset
= 0; errors
= [] }
(* [errors x] returns the syntax errors accumulated so far. *)
let errors { errors; _ } = errors
(* [offset x] returns the current absolute offset into the text. *)
let offset { offset; _ } = offset
(* [with_error lexer message] records a syntax error located at the current
   lexeme (the [start]..[offset] span) and returns the updated lexer. *)
let with_error lexer message =
  let err = SyntaxError.make lexer.start lexer.offset message in
  { lexer with errors = err :: lexer.errors }
(* [with_offset lexer offset] moves the lexer to absolute position [offset]. *)
let with_offset lexer offset = { lexer with offset }
(* [with_start_offset lexer start offset] sets both the lexeme start and the
   current offset in one update. *)
let with_start_offset lexer start offset = { lexer with start; offset }
(* [with_offset_errors lexer offset errors] replaces both the current offset
   and the whole error list in one update.
   Fix: the record update's closing brace was lost in the garbled source;
   restored here (with field punning). *)
let with_offset_errors lexer offset errors =
  { lexer with offset; errors }
(* [start_new_lexeme lexer] begins a fresh token at the current offset. *)
let start_new_lexeme lexer =
  let start = lexer.offset in
  { lexer with start }
(* [advance lexer index] moves the current offset forward by [index]
   characters; the lexeme start is unchanged. *)
let advance lexer index =
  let offset = lexer.offset + index in
  { lexer with offset }
76 module WithToken
(Token
: Lexable_token_sig.LexableToken_S
) = struct
78 module Trivia
= Token.Trivia
(* Re-export the base [Lexer] operations so the rest of this functor body
   can use them unqualified. *)
let start = Lexer.start
let source = Lexer.source
let errors = Lexer.errors
let offset = Lexer.offset
let with_error = Lexer.with_error
let with_offset = Lexer.with_offset
let start_new_lexeme = Lexer.start_new_lexeme
let advance = Lexer.advance
let with_offset_errors = Lexer.with_offset_errors
let with_start_offset = Lexer.with_start_offset

(* Intent-revealing aliases: a lexeme's text spans start_offset..end_offset. *)
let start_offset = start
let end_offset = offset
100 let empty = make SourceText.empty
(* Entire source text being lexed, as a plain string. *)
let source_text_string (l : lexer) = source l |> SourceText.text
(* Which kind of interpolated string literal is currently being scanned.
   [Literal_heredoc] carries the terminator name (see [is_heredoc_tail]). *)
type string_literal_kind =
  | Literal_execution_string (* backtick-delimited *)
  | Literal_double_quoted
  | Literal_heredoc of string
(* Character [index] positions past the current offset.  NOTE(review):
   appears to rely on the null-byte padding appended by [Lexer.make] to stay
   in bounds near end-of-file — confirm before reading far past the end. *)
let peek_char lexer index =
  let text = lexer.Lexer.text.SourceText.text in
  text.[offset lexer + index]
(* The [size] characters starting at the current offset, as a string. *)
let peek_string lexer size =
  let text = lexer.Lexer.text.SourceText.text in
  String.sub text (offset lexer) size
(* Does the text at the current offset begin with exactly [s]? *)
let match_string lexer s =
  peek_string lexer (String.length s) = s
(* Build a syntax error spanning the current lexeme. *)
let make_error_with_location (l : lexer) (msg : string) =
  SyntaxError.make (start l) (offset l) msg
124 (offset lexer
) - (start lexer
)
(* Text of the lexeme currently being scanned. *)
let current_text lexer =
  SourceText.sub (source lexer) (start lexer) (width lexer)
(* [length] characters of the current lexeme starting [relative_start]
   characters past its beginning. *)
let current_text_at lexer length relative_start =
  let absolute_start = start lexer + relative_start in
  SourceText.sub (source lexer) absolute_start length
133 (offset lexer
) >= SourceText.length
(source lexer
)
(* Is absolute position [index] at or past the end of the source text? *)
let at_end_index lexer index =
  index >= SourceText.length (source lexer)
(* Characters left between the current offset and end of text, clamped so
   it is never negative. *)
let remaining lexer =
  let len = SourceText.length (source lexer) in
  max 0 (len - offset lexer)
(* Total length of the source text. *)
let text_len (l : lexer) = source l |> SourceText.length
(* Character at absolute position [i].  Unlike [peek_char], this is not
   relative to the current offset. *)
let peek (l : lexer) i = SourceText.get (source l) i
148 let peek_def (l
: lexer
) i ~def
=
149 if i
>= SourceText.length
(source l
) then
152 SourceText.get
(source l
) i
154 (* Character classification *)
155 let is_whitespace_no_newline : char
-> bool = function
159 let is_newline = function
160 | '
\r'
| '
\n'
-> true
163 let is_binary_digit = function
167 let is_octal_digit = function
171 let is_decimal_digit = function
175 let is_hexadecimal_digit = function
176 | '
0'
.. '
9'
| 'a'
.. 'f'
| 'A'
.. 'F'
-> true
179 let is_name_nondigit = function
183 | '
\x7f'
.. '
\xff'
-> true
186 let is_name_letter = function
191 | '
\x7f'
.. '
\xff'
-> true
196 let skip_while_to_offset l p
=
197 let n = SourceText.length
(source l
) in
199 if i
< n && p
(peek l i
) then aux (i
+ 1) else i
in
(* Return a lexer whose offset has been advanced past every leading
   character satisfying predicate [p]. *)
let skip_while (l : lexer) (p : char -> bool) =
  skip_while_to_offset l p |> with_offset l
207 let str_skip_while ~str ~i ~p
=
208 let n = String.length str
in
210 if i
< n && p str
.[i
] then aux (i
+ 1) else i
in
(* Skip spaces and tabs, but not newlines. *)
let skip_whitespace (l : lexer) = skip_while l is_whitespace_no_newline
(* Plain-string counterpart of [skip_whitespace]: returns the first index at
   or after [i] in [str] that is not non-newline whitespace. *)
let str_skip_whitespace ~str ~i =
  str_skip_while ~str ~i ~p:is_whitespace_no_newline
(* Negation of [is_newline], for use as a skip predicate. *)
let not_newline ch = not (is_newline ch)
(* Advance to (but not past) the next newline or end of text. *)
let skip_to_end_of_line (l : lexer) = skip_while l not_newline
224 let skip_to_end_of_line_or_end_tag (l
: lexer
) =
225 let n = text_len l
in
226 let peek_def i
= if i
< n then peek l i
else invalid in
230 (is_newline ch) || (ch = '?'
&& peek_def (succ i
) = '
>'
)
232 let i = ref (offset l
) in
233 while (not
(should_stop !i)) do incr
i done;
(* Skip the remainder of a name: every character after the first only needs
   to satisfy [is_name_letter]. *)
let skip_name_end (l : lexer) = skip_while l is_name_letter
(* Consume one line terminator: \n, \r, or the two-character \r\n.
   Fix: the garbled source dropped the '\r' match arm header and the
   fall-through arm; restored so a non-terminator leaves the lexer
   unchanged. *)
let skip_end_of_line lexer =
  match peek_char lexer 0 with
  | '\n' -> advance lexer 1
  | '\r' ->
    if (peek_char lexer 1) = '\n' then advance lexer 2 else advance lexer 1
  | _ -> lexer
(* Scan a name.  Precondition: the current character is a legal name
   starter (asserted). *)
let scan_name_impl lexer =
  assert (is_name_nondigit (peek_char lexer 0));
  advance lexer 1 |> skip_name_end
(* Scan a name and tag it with the Name token kind. *)
let scan_name lexer = (scan_name_impl lexer, TokenKind.Name)
(* Scan a variable: a '$' (asserted) followed by a name. *)
let scan_variable lexer =
  assert (peek_char lexer 0 = '$');
  (scan_name_impl (advance lexer 1), TokenKind.Variable)
259 let scan_with_underscores (l
: lexer) accepted_char
=
260 let n = text_len l
in
261 let peek_def i = if i < n then peek l
i else invalid in
264 else let ch = peek l
i in
265 if accepted_char
ch then aux (succ
i)
266 else if ch = ' '
&& accepted_char
(peek_def (succ
i)) then
269 with_offset l
(aux (offset l
))
(* Digit-run scanners, one per numeric base.  The [_with_underscores]
   variants delegate to [scan_with_underscores], which also accepts
   separator characters between accepted digits. *)
let scan_decimal_digits (l : lexer) = skip_while l is_decimal_digit

let scan_decimal_digits_with_underscores lexer =
  scan_with_underscores lexer is_decimal_digit

let scan_octal_digits (l : lexer) = skip_while l is_octal_digit

let scan_octal_digits_with_underscores (l : lexer) =
  scan_with_underscores l is_octal_digit

let scan_binary_digits_with_underscores (l : lexer) =
  scan_with_underscores l is_binary_digit

let scan_hexadecimal_digits (l : lexer) = skip_while l is_hexadecimal_digit

let scan_hexadecimal_digits_with_underscores (l : lexer) =
  scan_with_underscores l is_hexadecimal_digit
(* Scan the digits of a hex literal (the 0x prefix is already consumed).
   An empty digit run is an error but still yields a HexadecimalLiteral
   token so parsing can continue.
   Fix: the [else] dropped from the garbled source is restored. *)
let scan_hex_literal lexer =
  let ch = peek_char lexer 0 in
  if not (is_hexadecimal_digit ch) then
    let lexer = with_error lexer SyntaxError.error0001 in
    (lexer, TokenKind.HexadecimalLiteral)
  else
    (scan_hexadecimal_digits_with_underscores lexer,
      TokenKind.HexadecimalLiteral)
(* Scan the digits of a binary literal (the 0b prefix is already consumed).
   An empty digit run is an error but still yields a BinaryLiteral token.
   Fix: the [else] dropped from the garbled source is restored. *)
let scan_binary_literal lexer =
  let ch = peek_char lexer 0 in
  if not (is_binary_digit ch) then
    let lexer = with_error lexer SyntaxError.error0002 in
    (lexer, TokenKind.BinaryLiteral)
  else
    (scan_binary_digits_with_underscores lexer, TokenKind.BinaryLiteral)
(* Scan an exponent: we are positioned on the 'e'/'E'; consume it plus an
   optional sign, then require at least one decimal digit.
   Fix: the [else] dropped from the garbled source is restored. *)
let scan_exponent lexer =
  let ch = peek_char lexer 1 in
  let lexer =
    if ch = '+' || ch = '-' then advance lexer 2 else advance lexer 1 in
  let ch = peek_char lexer 0 in
  if not (is_decimal_digit ch) then
    let lexer = with_error lexer SyntaxError.error0003 in
    (lexer, TokenKind.FloatingLiteral)
  else
    (scan_decimal_digits lexer, TokenKind.FloatingLiteral)
(* We are on the '.' of a float: consume it, the fractional digits, and an
   optional exponent.
   Fix: the then-branch call to [scan_exponent] and the [else] were dropped
   in the garbled source; restored. *)
let scan_after_decimal_point lexer =
  let lexer = advance lexer 1 in
  let lexer = scan_decimal_digits lexer in
  let ch = peek_char lexer 0 in
  if ch = 'e' || ch = 'E' then
    scan_exponent lexer
  else
    (lexer, TokenKind.FloatingLiteral)
(* We've scanned a leading zero.  We have an irritating ambiguity here:
   09 is not a legal octal or floating literal, but 09e1 and 09.1 are.
   Fix: the garbled source dropped the [match] header, the digit arm
   pattern, and the branch delimiters; restored from the surviving arms. *)
let scan_octal_or_float lexer =
  let lexer = advance lexer 1 in
  let ch = peek_char lexer 0 in
  match ch with
  | '.' -> (* 0. *) scan_after_decimal_point lexer
  | 'e' | 'E' -> (* 0e *) scan_exponent lexer
  | '0' .. '9' ->
    (* 05 *)
    let lexer_oct = scan_octal_digits lexer in
    let lexer_dec = scan_decimal_digits lexer in
    if (width lexer_oct) = (width lexer_dec) then
      begin
        (* Only octal digits.  Could be an octal literal, or could be the
           integer part of a float. *)
        let ch = peek_char lexer_oct 0 in
        if ch = 'e' || ch = 'E' then scan_exponent lexer_oct
        else if ch = '.' then scan_after_decimal_point lexer_oct
        else
          (* This is irritating - we only want to allow underscores for
             integer literals.  Deferring the lexing with underscores here
             lets us make sure we're not dealing with floats. *)
          let lexer_oct_with_underscores =
            scan_octal_digits_with_underscores lexer in
          (lexer_oct_with_underscores, TokenKind.OctalLiteral)
      end
    else
      begin
        (* We had decimal digits following a leading zero; this is either a
           float literal or an octal to be truncated at the first non-octal
           digit. *)
        let ch = peek_char lexer_dec 0 in
        if ch = 'e' || ch = 'E' then
          scan_exponent lexer_dec
        else if ch = '.' then
          scan_after_decimal_point lexer_dec
        else
          (* An octal to be truncated at the first non-octal digit.
             Again we defer the lexing with underscores here. *)
          let lexer_dec_with_underscores =
            scan_decimal_digits_with_underscores lexer in
          (lexer_dec_with_underscores, TokenKind.OctalLiteral)
      end
  | _ -> (* 0 *) (lexer, TokenKind.OctalLiteral)
(* We've scanned a leading non-zero digit: the literal is a decimal integer
   or a float (123. / 123e...).
   Fix: the [match ch with] header dropped in the garbled source is
   restored. *)
let scan_decimal_or_float lexer =
  let lexer_no_underscores = scan_decimal_digits lexer in
  let lexer_with_underscores = scan_decimal_digits_with_underscores lexer in
  let ch = peek_char lexer_no_underscores 0 in
  match ch with
  | '.' -> (* 123. *) scan_after_decimal_point lexer_no_underscores
  | 'e' | 'E' -> (* 123e *) scan_exponent lexer_no_underscores
  | _ -> (* 123 *) (lexer_with_underscores, TokenKind.DecimalLiteral)
384 let scan_single_quote_string_literal (l
: lexer) =
385 (* TODO: What about newlines embedded? *)
387 single-quoted-string-literal::
388 b-opt ' sq-char-sequence-opt '
390 TODO: What is this b-opt? We don't lex an optional 'b' before a literal.
394 sq-char-sequence sq-char
398 \opt any character except single-quote (') or backslash (\)
400 sq-escape-sequence:: one of
405 let n = SourceText.length
(source l
) in
406 let peek = SourceText.get
(source l
) in
408 let has_error0012 = ref false in
409 let has_error0006 = ref false in
413 (has_error0012 := true; n - 1)
417 | '
\000'
-> (has_error0006 := true; stepper (1+i))
418 | '
\\'
-> stepper (2+i)
423 let new_offset = stepper (1 + (offset l
)) in
426 let err msg
= make_error_with_location l msg
in
427 match (!has_error0006, !has_error0012) with
428 | (true, true) -> (err SyntaxError.error0006
:: err SyntaxError.error0012
:: (errors l
))
429 | (true, false) -> (err SyntaxError.error0006
:: (errors l
))
430 | (false, true) -> (err SyntaxError.error0012
:: (errors l
))
431 | (false, false) -> (errors l
) in
433 let res = with_offset_errors l
new_offset new_errors in
434 (res, TokenKind.SingleQuotedStringLiteral
)
436 let scan_hexadecimal_escape lexer =
437 let ch2 = peek_char lexer 2 in
438 let ch3 = peek_char lexer 3 in
439 if not
(is_hexadecimal_digit ch2) then
440 (* TODO: Consider producing an error for a malformed hex escape *)
441 (* let lexer = with_error lexer SyntaxError.error0005 in *)
443 else if not
(is_hexadecimal_digit ch3) then
444 (* let lexer = with_error lexer SyntaxError.error0005 in *)
449 let scan_unicode_escape lexer =
450 (* At present the lexer is pointing at \u *)
451 if (peek_char lexer 2) = '
{'
then
452 if (peek_char lexer 3) = '$'
then
453 (* We have a malformed unicode escape that contains a possible embedded
454 expression. Eat the \u and keep on processing the embedded expression. *)
455 (* TODO: Consider producing a warning for a malformed unicode escape. *)
458 (* We have a possibly well-formed escape sequence, and at least we know
459 that it is not an embedded expression. *)
460 (* TODO: Consider producing an error if the digits are out of range
461 of legal Unicode characters. *)
462 (* TODO: Consider producing an error if there are no digits. *)
463 (* Skip over the slash, u and brace, and start lexing the number. *)
464 let lexer = advance lexer 3 in
465 let lexer = scan_hexadecimal_digits lexer in
466 let ch = peek_char lexer 0 in
468 (* TODO: Consider producing a warning for a malformed unicode escape. *)
473 (* We have a malformed unicode escape sequence. Bail out. *)
474 (* TODO: Consider producing a warning for a malformed unicode escape. *)
477 let skip_uninteresting_double_quote_like_string_characters (l
: lexer) start_char
=
478 let is_uninteresting ch =
480 | '
\000'
| '
\\'
| '$'
| '
{'
| '
['
| '
]'
| '
-'
481 | '
0'
.. '
9'
-> false
482 | ch -> ch <> start_char
&& not
(is_name_nondigit ch) in
483 skip_while l
is_uninteresting
(* Scan an integer literal appearing inside a string body.
   Fix: the fall-through match arm and the [else] were dropped in the
   garbled source; restored — both paths scan decimal digits, since per the
   surviving comment HHVM treats a 0-prefixed integer in a string as a
   string index (all decimal digits), not as an octal. *)
let scan_integer_literal_in_string lexer =
  if (peek_char lexer 0) = '0' then
    match peek_char lexer 1 with
    | 'x' | 'X' -> scan_hex_literal (advance lexer 2)
    | 'b' | 'B' -> scan_binary_literal (advance lexer 2)
    | _ ->
      (* An integer literal starting with 0 in a string will actually
         always be treated as a string index in HHVM, and not as an octal.
         In such a case, HHVM actually scans all decimal digits to create
         the token. *)
      (scan_decimal_digits_with_underscores lexer, TokenKind.DecimalLiteral)
  else
    (scan_decimal_digits_with_underscores lexer, TokenKind.DecimalLiteral)
500 (* scans double quoted or execution string literals - they have similar rules
501 for content interpretation except for \"" character - it is escaped in
502 double quoted string and remain intact in execution string literals *)
503 let scan_double_quote_like_string_literal_from_start lexer start_char
=
504 let literal_token_kind =
505 if start_char
= '`'
then TokenKind.ExecutionStringLiteral
506 else TokenKind.DoubleQuotedStringLiteral
in
507 let head_token_kind =
508 if start_char
= '`'
then TokenKind.ExecutionStringLiteralHead
509 else TokenKind.DoubleQuotedStringLiteralHead
in
511 (* If there's nothing interesting in this double-quoted string then
512 we can just hand it back as-is. *)
514 skip_uninteresting_double_quote_like_string_characters lexer start_char
in
515 match peek_char lexer 0 with
517 (* If the string is unterminated then give an error; if this is an
518 embedded zero character then give an error and recurse; we might
519 be able to make more progress. *)
521 let lexer = with_error lexer SyntaxError.error0012
in
522 (lexer, literal_token_kind)
524 let lexer = with_error lexer SyntaxError.error0006
in
525 aux (advance lexer 1)
527 (* We made it to the end without finding a special character. *)
528 (advance lexer 1, literal_token_kind)
529 | _ -> (* We've found a backslash, dollar or brace. *)
530 (lexer, head_token_kind) in
531 aux (advance lexer 1)
(* A heredoc tail is the identifier immediately preceded by a newline and
   immediately followed by an optional semi and then a newline.  The newline
   and optional semi are not part of the literal; the lexeme ends at the end
   of the name.
   Fix: the [false]/[else] lines dropped in the garbled source are
   restored. *)
let is_heredoc_tail lexer name =
  if not (is_newline (peek_char lexer (-1))) then
    false
  else
    let len = String.length name in
    let ch0 = peek_char lexer len in
    let ch1 = peek_char lexer (len + 1) in
    ((is_newline ch0) || ch0 = ';' && (is_newline ch1)) &&
    (peek_string lexer len) = name
(* Token kind that terminates each kind of interpolated string literal. *)
let get_tail_token_kind = function
  | Literal_heredoc _ -> TokenKind.HeredocStringLiteralTail
  | Literal_execution_string -> TokenKind.ExecutionStringLiteralTail
  | Literal_double_quoted -> TokenKind.DoubleQuotedStringLiteralTail
(* A closing '"' ends a double-quoted literal; in any other literal kind it
   is just part of the body. *)
let get_string_literal_body_or_double_quoted_tail literal_kind =
  match literal_kind with
  | Literal_double_quoted -> TokenKind.DoubleQuotedStringLiteralTail
  | Literal_execution_string | Literal_heredoc _ -> TokenKind.StringLiteralBody
577 let scan_string_literal_in_progress lexer literal_kind =
578 let is_heredoc, name =
579 match literal_kind with
580 | Literal_heredoc name -> true, name
583 if literal_kind = Literal_execution_string then '`'
585 let ch0 = peek_char lexer 0 in
586 if is_name_nondigit ch0 then
587 if is_heredoc && (is_heredoc_tail lexer name
) then
588 (scan_name_impl lexer, TokenKind.HeredocStringLiteralTail
)
590 (scan_name_impl lexer, TokenKind.Name
)
595 let lexer = with_error lexer SyntaxError.error0012
in
596 (lexer, get_tail_token_kind literal_kind
)
598 let lexer = with_error lexer SyntaxError.error0006
in
599 let lexer = advance lexer 1 in
601 skip_uninteresting_double_quote_like_string_characters
604 (lexer, TokenKind.StringLiteralBody
)
605 | '`'
when literal_kind
= Literal_execution_string
->
606 (* '`' terminates execution string *)
607 (advance lexer 1, TokenKind.ExecutionStringLiteralTail
)
609 let kind = get_string_literal_body_or_double_quoted_tail literal_kind in
610 (advance lexer 1, kind)
612 if is_name_nondigit (peek_char lexer 1) then scan_variable lexer
613 else (advance lexer 1, TokenKind.Dollar)
614 | '{' -> (advance lexer 1, TokenKind.LeftBrace)
616 match peek_char lexer 1 with
617 (* In these cases we just skip the escape sequence and
618 keep on scanning for special characters. *)
619 | '\\' | '"'
| '$'
| 'e'
| 'f'
| '
n'
| '
r'
| 't'
| 'v'
| '`'
620 (* Same in these cases; there might be more octal characters following but
621 if there are, we'll just eat them as normal characters. *)
623 let lexer = advance lexer 2 in
625 skip_uninteresting_double_quote_like_string_characters
627 (lexer, TokenKind.StringLiteralBody
)
629 let lexer = scan_hexadecimal_escape lexer in
631 skip_uninteresting_double_quote_like_string_characters
633 (lexer, TokenKind.StringLiteralBody
)
635 let lexer = scan_unicode_escape lexer in
637 skip_uninteresting_double_quote_like_string_characters
639 (lexer, TokenKind.StringLiteralBody
)
641 (* The rules for escaping open braces in Hack are bizarre. Suppose we
646 What is the value of $z? Naively you would think that the backslash
647 escapes the braces, and the variables are embedded, so {123,456}. But
648 that's not what happens. Yes, the backslash makes the brace no longer
649 the opening brace of an expression. But the backslash is still part
650 of the string! This is the string \{123,456\}.
651 TODO: We might want to fix this because this is very strange. *)
652 (* Eat the backslash and the brace. *)
653 let lexer = advance lexer 2 in
654 (lexer, TokenKind.StringLiteralBody
)
656 (* TODO: A backslash followed by something other than an escape sequence
657 is legal in hack, and treated as though it was just the backslash
658 and the character. However we might consider making this a warning.
659 It is particularly egregious when we have something like:
662 The author of the code likely means the backslash to mean line
663 continuation but in fact it just means to put a backslash and newline
666 let lexer = advance lexer 1 in
668 skip_uninteresting_double_quote_like_string_characters
670 (lexer, TokenKind.StringLiteralBody
)
673 let lexer = advance lexer 1 in
674 (lexer, TokenKind.LeftBracket
)
676 let lexer = advance lexer 1 in
677 (lexer, TokenKind.RightBracket
)
679 if (peek_char lexer 1) = '
>'
then
680 let lexer = advance lexer 2 in
681 (lexer, TokenKind.MinusGreaterThan
)
683 (* Nothing interesting here. Skip it and find the next
684 interesting character. *)
685 let lexer = advance lexer 1 in
687 skip_uninteresting_double_quote_like_string_characters
689 (lexer, TokenKind.StringLiteralBody
)
691 let (lexer1
, _
) as literal
= scan_integer_literal_in_string lexer in
692 if errors lexer == errors lexer1
then literal
else
693 (* If we failed to scan a literal, do not interpret the literal *)
694 (with_offset lexer (offset lexer1
), TokenKind.StringLiteralBody
)
696 (* Nothing interesting here. Skip it and find the next
697 interesting character. *)
698 let lexer = advance lexer 1 in
700 skip_uninteresting_double_quote_like_string_characters
702 (lexer, TokenKind.StringLiteralBody
)
704 (* A heredoc string literal has the form
712 <<< (optional whitespace) name (no whitespace) (newline)
714 The optional body is:
716 any characters whatsoever including newlines (newline)
720 (no whitespace) name (no whitespace) (optional semi) (no whitespace) (newline)
722 The names must be identical. The trailing semi and newline must be present.
724 The body is any and all characters, up to the first line that exactly matches
727 The body may contain embedded expressions.
729 A nowdoc string literal has the same form except that the first name is
730 enclosed in single quotes, and it may not contain embedded expressions.
734 let scan_docstring_name_actual lexer =
735 let ch = peek_char lexer 0 in
736 if is_name_nondigit ch then
737 let end_lexer = skip_name_end (advance lexer 1) in
738 let name = SourceText.sub
739 (source lexer) (offset lexer) (offset end_lexer - offset lexer) in
742 let lexer = with_error lexer SyntaxError.error0008
in
745 let scan_docstring_name lexer =
746 let lexer = skip_whitespace lexer in
747 let ch = peek_char lexer 0 in
749 if ch = '
\''
then TokenKind.NowdocStringLiteral
750 else TokenKind.HeredocStringLiteral
in
753 let (lexer, name) = scan_docstring_name_actual (advance lexer 1) in
754 if (peek_char lexer 0) = '
\''
then
755 (advance lexer 1, name)
757 (with_error lexer SyntaxError.error0010
, name)
759 (* Starting with PHP 5.3.0, the opening Heredoc identifier
760 may optionally be enclosed in double quotes:*)
761 let lexer = if ch = '
"' then advance lexer 1 else lexer in
762 let lexer, name = scan_docstring_name_actual lexer in
764 if ch = '"'
&& peek_char lexer 0 = '
\"'
then advance lexer 1 else lexer
770 let scan_docstring_header lexer =
771 let ch = peek_char lexer 0 in
772 (* Skip 3 for <<< or 4 for b<<< *)
773 let skip_count = if ch = 'b'
then 4 else 3 in
774 let lexer = advance lexer skip_count in
775 let (lexer, name, kind) = scan_docstring_name lexer in
776 let ch = peek_char lexer 0 in
778 if is_newline ch then lexer
779 else with_error lexer SyntaxError.error0011
in
780 let lexer = skip_to_end_of_line lexer in
781 let lexer = skip_end_of_line lexer in
784 let scan_docstring_remainder name lexer =
785 let len = String.length
name in
787 let ch0 = peek_char lexer len in
788 let ch1 = peek_char lexer (len + 1) in
789 if ((is_newline ch0) || ch0 = '
;'
&& (is_newline ch1)) &&
790 (peek_string lexer len) = name then
793 let lexer = skip_to_end_of_line lexer in
794 let ch = peek_char lexer 0 in
795 if is_newline ch then
796 aux (skip_end_of_line lexer)
798 (* If we got here then we ran off the end of the file without
799 finding a newline. Just bail. *)
800 with_error lexer SyntaxError.error0011
in
803 let scan_docstring_literal lexer =
804 let (lexer, name, kind) = scan_docstring_header lexer in
805 let lexer = scan_docstring_remainder name lexer in
(* An XHP label has the same grammar as a Hack name; the token kind is
   discarded.
   Fix: the trailing [lexer] return line dropped in the garbled source is
   restored. *)
let scan_xhp_label lexer =
  let (lexer, _) = scan_name lexer in
  lexer
(* An XHP element name is a sequence of one or more XHP labels each
   separated by a single : or -.  It is possible for an XHP element name to
   be followed immediately by a : or - that is the next token, so if we find
   a : or - not followed by a label, we terminate the token.
   Fix: the [else] dropped in the garbled source is restored. *)
let rec scan_xhp_element_name ?(attribute = false) lexer =
  let lexer = scan_xhp_label lexer in
  let ch0 = peek_char lexer 0 in
  let ch1 = peek_char lexer 1 in
  if (not attribute && ch0 = ':' || ch0 = '-') && is_name_nondigit ch1 then
    scan_xhp_element_name (advance lexer 1)
  else
    (lexer, TokenKind.XHPElementName)
(* Is the next token we're going to lex a possible xhp class name? *)
let is_xhp_class_name lexer =
  peek_char lexer 0 = ':' && is_name_nondigit (peek_char lexer 1)
(* An XHP class name is a colon followed by an xhp element name; anything
   else is an error token.
   Fix: the [else] dropped in the garbled source is restored. *)
let scan_xhp_class_name lexer =
  if is_xhp_class_name lexer then
    let (lexer, _) = scan_xhp_element_name (advance lexer 1) in
    (lexer, TokenKind.XHPClassName)
  else
    let lexer = with_error lexer SyntaxError.error0008 in
    (advance lexer 1, TokenKind.ErrorToken)
839 let scan_xhp_string_literal lexer =
840 (* XHP string literals are just straight up "find the closing quote"
841 strings. Embedded newlines are legal. *)
842 let rec aux lexer offset =
843 match peek_char lexer offset with
845 let lexer = advance lexer offset in
847 let lexer = with_error lexer SyntaxError.error0012
in
848 (lexer, TokenKind.XHPStringLiteral
)
850 let lexer = with_error lexer SyntaxError.error0006
in
852 | '
"' -> (advance lexer (offset + 1), TokenKind.XHPStringLiteral)
853 | _ -> aux lexer (offset + 1) in
856 (* Note that this does not scan an XHP body *)
857 let scan_xhp_token lexer =
858 (* TODO: HHVM requires that there be no trivia between < and name in an
859 opening tag, but does allow trivia between </ and name in a closing tag.
860 Consider allowing trivia in an opening tag. *)
861 let ch0 = peek_char lexer 0 in
862 if ch0 = invalid && at_end lexer then
863 (lexer, TokenKind.EndOfFile)
864 else if is_name_nondigit ch0 then
865 scan_xhp_element_name lexer
867 | '{' -> (advance lexer 1, TokenKind.LeftBrace)
868 | '}' -> (advance lexer 1, TokenKind.RightBrace)
869 | '=' -> (advance lexer 1, TokenKind.Equal)
871 if (peek_char lexer 1) = '/' then
872 (advance lexer 2, TokenKind.LessThanSlash)
874 (advance lexer 1, TokenKind.LessThan)
875 | '"'
-> scan_xhp_string_literal lexer
877 if (peek_char lexer 1) = '
>'
then
878 (advance lexer 2, TokenKind.SlashGreaterThan
)
880 let lexer = with_error lexer SyntaxError.error0006
in
881 (advance lexer 1, TokenKind.ErrorToken
)
882 | '
>'
-> (advance lexer 1, TokenKind.GreaterThan
)
884 let lexer = with_error lexer SyntaxError.error0006
in
885 (advance lexer 1, TokenKind.ErrorToken
)
887 let scan_xhp_comment lexer =
888 let rec aux lexer offset =
889 let ch0 = peek_char lexer offset in
890 let ch1 = peek_char lexer (offset + 1) in
891 let ch2 = peek_char lexer (offset + 2) in
892 match (ch0, ch1, ch2) with
893 | ('
\000'
, _
, _
) -> with_error (advance lexer offset) SyntaxError.error0014
894 | ('
-'
, '
-'
, '
>'
) -> (advance lexer (offset + 3))
895 | _
-> aux lexer (offset + 1) in
898 let scan_xhp_body lexer =
899 (* Naively you might think that an XHP body is just a bunch of characters,
900 terminated by an embedded { } expression or a tag. However, whitespace
901 and newlines are relevant in XHP bodies because they are "soft".
902 That is, any section of contiguous trivia has the same semantics as a
903 single space or newline -- just as in HTML.
905 Obviously this is of relevance to code formatters.
907 Therefore we detect whitespace and newlines within XHP bodies and treat
908 it as trivia surrounding the tokens within the body.
910 TODO: Is this also true of whitespace within XHP comments? If so then
911 we need to make XHP comments a sequence of tokens, rather than a
912 single token as they are now.
914 let rec aux lexer offset =
915 let ch = peek_char lexer offset in
918 let lexer = advance lexer offset in
920 let lexer = with_error lexer SyntaxError.error0013
in
923 let lexer = with_error lexer SyntaxError.error0006
in
925 | '
\t'
| ' '
| '
\r'
| '
\n'
| '
{'
| '
}'
| '
<'
-> advance lexer offset
926 | _
-> aux lexer (offset + 1) in
927 let ch0 = peek_char lexer 0 in
929 | '
\000'
when at_end lexer -> (lexer, TokenKind.EndOfFile
)
930 | '
{'
-> (advance lexer 1, TokenKind.LeftBrace
)
931 | '
}'
-> (advance lexer 1, TokenKind.RightBrace
)
933 let ch1 = peek_char lexer 1 in
934 let ch2 = peek_char lexer 2 in
935 let ch3 = peek_char lexer 3 in
936 match (ch1, ch2, ch3) with
937 | ('
!'
, '
-'
, '
-'
) -> (scan_xhp_comment lexer, TokenKind.XHPComment
)
938 | ('
/'
, _
, _
) -> (advance lexer 2, TokenKind.LessThanSlash
)
939 | _
-> (advance lexer 1, TokenKind.LessThan
)
941 | _
-> ((aux lexer 0), TokenKind.XHPBody
)
943 let scan_dollar_token lexer =
945 We have a problem here. We wish to be able to lexically analyze both
946 PHP and Hack, but the introduction of $$ to Hack makes them incompatible.
947 "$$x" and "$$ $x" are legal in PHP, but illegal in Hack.
948 The rule in PHP seems to be that $ is a prefix operator, it is a token,
949 it can be followed by trivia, but the next token has to be another $
950 operator, a variable $x, or a {.
952 Here's a reasonable compromise. (TODO: Review this decision.)
958 $$ followed by anything other than a name or a $ lexes as $$.
960 This means that lexing a PHP program which contains "$$ $x" is different
961 will fail at parse time, but I'm willing to live with that.
963 This means that lexing a Hack program which contains
964 "$x |> $$instanceof Foo" produces an error as well.
966 If these decisions are unacceptable then we will need to make the lexer
967 be aware of whether it is lexing PHP or Hack; thus far we have not had
968 to make this distinction.
971 (* We are already at $. *)
972 let ch1 = peek_char lexer 1 in
975 let ch2 = peek_char lexer 2 in
976 if ch2 = '$'
|| is_name_nondigit ch2 then
977 (advance lexer 1, TokenKind.Dollar
) (* $$x or $$$*)
979 (advance lexer 2, TokenKind.DollarDollar
) (* $$ *)
981 if is_name_nondigit ch1 then scan_variable lexer (* $x *)
982 else (advance lexer 1, TokenKind.Dollar
) (* $ *)
984 let rec scan_token_impl : bool -> lexer -> (lexer * TokenKind.t
) =
986 let ch0 = peek_char lexer 0 in
988 | '
['
-> (advance lexer 1, TokenKind.LeftBracket
)
989 | '
]'
-> (advance lexer 1, TokenKind.RightBracket
)
990 | '
('
-> (advance lexer 1, TokenKind.LeftParen
)
991 | '
)'
-> (advance lexer 1, TokenKind.RightParen
)
992 | '
{'
-> (advance lexer 1, TokenKind.LeftBrace
)
993 | '
}'
-> (advance lexer 1, TokenKind.RightBrace
)
995 match peek_char lexer 1 with
996 | '
='
-> (advance lexer 2, TokenKind.DotEqual
)
998 scan_after_decimal_point lexer
1000 if (peek_char lexer 2) = '
.'
then (advance lexer 3, TokenKind.DotDotDot
)
1001 else (advance lexer 1, TokenKind.Dot
)
1002 | _
-> (advance lexer 1, TokenKind.Dot
)
1005 match peek_char lexer 1 with
1006 | '
='
-> (advance lexer 2, TokenKind.MinusEqual
)
1007 | '
-'
-> (advance lexer 2, TokenKind.MinusMinus
)
1008 | '
>'
-> (advance lexer 2, TokenKind.MinusGreaterThan
)
1009 | _
-> (advance lexer 1, TokenKind.Minus
)
1012 match peek_char lexer 1 with
1013 | '
='
-> (advance lexer 2, TokenKind.PlusEqual
)
1014 | '
+'
-> (advance lexer 2, TokenKind.PlusPlus
)
1015 | _
-> (advance lexer 1, TokenKind.Plus
)
1018 match (peek_char lexer 1, peek_char lexer 2) with
1019 | ('
='
, _
) -> (advance lexer 2, TokenKind.StarEqual
)
1020 | ('
*'
, '
='
) -> (advance lexer 3, TokenKind.StarStarEqual
)
1021 | ('
*'
, _
) -> (advance lexer 2, TokenKind.StarStar
)
1022 | _
-> (advance lexer 1, TokenKind.Star
)
1024 | '~'
-> (advance lexer 1, TokenKind.Tilde
)
1026 match (peek_char lexer 1, peek_char lexer 2) with
1027 | ('
='
, '
='
) -> (advance lexer 3, TokenKind.ExclamationEqualEqual
)
1028 | ('
='
, _
) -> (advance lexer 2, TokenKind.ExclamationEqual
)
1029 | _
-> (advance lexer 1, TokenKind.Exclamation
)
1031 | '$'
-> scan_dollar_token lexer
1033 if (peek_char lexer 1) = '
='
then (advance lexer 2, TokenKind.SlashEqual
)
1034 else (advance lexer 1, TokenKind.Slash
)
1036 if (peek_char lexer 1) = '
='
then (advance lexer 2, TokenKind.PercentEqual
)
1037 else (advance lexer 1, TokenKind.Percent
)
1039 match (peek_char lexer 1, peek_char lexer 2) with
1040 | ('
<'
, '
<'
) -> scan_docstring_literal lexer
1041 | ('
<'
, '
='
) -> (advance lexer 3, TokenKind.LessThanLessThanEqual
)
1042 (* TODO: We lex and parse the spaceship operator.
1043 TODO: This is not in the spec at present. We should either make it an
1044 TODO: error, or add it to the specification. *)
1045 | ('
='
, '
>'
) -> (advance lexer 3, TokenKind.LessThanEqualGreaterThan
)
1046 | ('
>'
, _
) -> (advance lexer 2, TokenKind.LessThanGreaterThan
)
1047 | ('
='
, _
) -> (advance lexer 2, TokenKind.LessThanEqual
)
1048 | ('
<'
, _
) -> (advance lexer 2, TokenKind.LessThanLessThan
)
1049 | _
-> (advance lexer 1, TokenKind.LessThan
)
1052 match (peek_char lexer 1, peek_char lexer 2) with
1053 | ('
>'
, '
='
) -> (advance lexer 3, TokenKind.GreaterThanGreaterThanEqual
)
1055 (* If we are parsing a generic type argument list then we might be
1056 at the >> in List<List<int>>. In that case we want to lex two
1059 (advance lexer 1, TokenKind.GreaterThan
)
1061 (advance lexer 2, TokenKind.GreaterThanGreaterThan
)
1062 | ('
='
, _
) -> (advance lexer 2, TokenKind.GreaterThanEqual
)
1063 | _
-> (advance lexer 1, TokenKind.GreaterThan
)
1066 match (peek_char lexer 1, peek_char lexer 2) with
1067 | ('
='
, '
='
) -> (advance lexer 3, TokenKind.EqualEqualEqual
)
1068 | ('
='
, '
>'
) -> (advance lexer 3, TokenKind.EqualEqualGreaterThan
)
1069 | ('
='
, _
) -> (advance lexer 2, TokenKind.EqualEqual
)
1070 | ('
>'
, _
) -> (advance lexer 2, TokenKind.EqualGreaterThan
)
1071 | _
-> (advance lexer 1, TokenKind.Equal
)
1074 if (peek_char lexer 1) = '
='
then (advance lexer 2, TokenKind.CaratEqual
)
1075 else (advance lexer 1, TokenKind.Carat
)
1077 match peek_char lexer 1 with
1078 | '
='
-> (advance lexer 2, TokenKind.BarEqual
)
1079 | '
>'
-> (advance lexer 2, TokenKind.BarGreaterThan
)
1080 | '
|'
-> (advance lexer 2, TokenKind.BarBar
)
1081 | _
-> (advance lexer 1, TokenKind.Bar
)
1084 match peek_char lexer 1 with
1085 | '
='
-> (advance lexer 2, TokenKind.AmpersandEqual
)
1086 | '
&'
-> (advance lexer 2, TokenKind.AmpersandAmpersand
)
1087 | _
-> (advance lexer 1, TokenKind.Ampersand
)
1090 match (peek_char lexer 1, peek_char lexer 2) with
1091 | ('
:'
, _
) when not in_type
-> (advance lexer 2, TokenKind.QuestionColon
)
1092 | ('
-'
, '
>'
) -> (advance lexer 3, TokenKind.QuestionMinusGreaterThan
)
1093 | ('?'
, _
) -> (advance lexer 2, TokenKind.QuestionQuestion
)
1094 | ('
>'
, _
) -> (advance lexer 2, TokenKind.QuestionGreaterThan
)
1095 | _
-> (advance lexer 1, TokenKind.Question
)
1098 if (peek_char lexer 1) = '
:'
then (advance lexer 2, TokenKind.ColonColon
)
1099 else (advance lexer 1, TokenKind.Colon
)
1100 | '
;'
-> (advance lexer 1, TokenKind.Semicolon
)
1101 | '
,'
-> (advance lexer 1, TokenKind.Comma
)
1102 | '
@'
-> (advance lexer 1, TokenKind.At
)
1104 match peek_char lexer 1 with
1105 | 'x'
| 'X'
-> scan_hex_literal (advance lexer 2)
1106 | 'b'
| 'B'
-> scan_binary_literal (advance lexer 2)
1107 | _
-> scan_octal_or_float lexer
1110 scan_decimal_or_float lexer
1111 | '
\''
-> scan_single_quote_string_literal lexer
1112 | '`'
-> scan_double_quote_like_string_literal_from_start lexer '`'
1113 | '
"' -> scan_double_quote_like_string_literal_from_start lexer '"'
1114 | '
\\'
-> (advance lexer 1, TokenKind.Backslash
)
1115 | 'b'
when let c1 = peek_char lexer 1 in
1116 let c2 = peek_char lexer 2 in
1117 let c3 = peek_char lexer 3 in
1118 c1 = '
"' || c1 = '\'' || (c1 = '<' && c2 = '<' && c3 = '<') ->
1119 let lexer = advance lexer 1 in scan_token_impl in_type lexer
1122 if ch0 = invalid && at_end lexer then
1123 (lexer, TokenKind.EndOfFile)
1124 else if is_name_nondigit ch0 then
1127 let lexer = with_error lexer SyntaxError.error0006 in
1128 (advance lexer 1, TokenKind.ErrorToken)
(* Scan a single token, wrapping the call in the stats timer when a stats
   instance is registered. [in_type] selects type-context lexing. *)
let scan_token : bool -> lexer -> lexer * TokenKind.t =
  fun in_type lexer ->
    let run () = scan_token_impl in_type lexer in
    Stats_container.wrap_nullary_fn_timing
      ?stats:(Stats_container.get_instance ())
      ~key:"full_fidelity_lexer:scan_token"
      ~f:run
(* Specializations of [scan_token] for type and non-type contexts. *)
let scan_token_inside_type lexer = scan_token true lexer
let scan_token_outside_type lexer = scan_token false lexer
1144 * white-space-character::
1146 * Space character (U+0020)
1147 * Horizontal-tab character (U+0009)
1149 * single-line-comment::
1150 * // input-characters-opt
1151 * # input-characters-opt
1154 * Carriage-return character (U+000D)
1155 * Line-feed character (U+000A)
1156 * Carriage-return character followed by line-feed character
(* Given that [str.[i]] is the start of an end-of-line sequence, return the
   offset just past it.  "\r\n" counts as one sequence; a lone '\r' or '\n'
   advances by one.  Running off the end of the string is not an error.
   Raises [Failure] if [str.[i]] is not an end-of-line character. *)
let str_scan_end_of_line ~str ~i =
  match str.[i] with
  | '\r' ->
    begin match str.[succ i] with
    | '\n' -> 2 + i
    | _ -> succ i
    | exception Invalid_argument _ -> succ i
    end
  | '\n' -> succ i
  | _ -> failwith "str_scan_end_of_line called while not on end of line!"
  | exception Invalid_argument _ -> succ i
(* Consume one end-of-line sequence ("\r", "\n" or "\r\n") and pair the
   advanced lexer with an end-of-line trivium of the consumed width.
   Raises [Failure] if the lexer is not positioned on an end of line. *)
let scan_end_of_line lexer =
  match peek_char lexer 0 with
  | '\r' ->
    let len = if peek_char lexer 1 = '\n' then 2 else 1 in
    (advance lexer len, Trivia.make_eol (source lexer) (start lexer) len)
  | '\n' -> (advance lexer 1, Trivia.make_eol (source lexer) (start lexer) 1)
  | _ -> failwith "scan_end_of_line called while not on end of line!"
(* Scan a '#' comment running to the end of the line and wrap it as
   single-line-comment trivia. *)
let scan_hash_comment lexer =
  let lexer = skip_to_end_of_line lexer in
  let trivium =
    Trivia.make_single_line_comment
      (source lexer) (start lexer) (width lexer) in
  (lexer, trivium)
(* Scan a '//' comment to the end of the line (or an end tag).
   A fallthrough comment is two slashes, any amount of whitespace,
   FALLTHROUGH, and the end of the line.  An unsafe comment is two slashes,
   any amount of whitespace, UNSAFE, and then any characters may follow.
   TODO: Consider allowing trailing space for fallthrough.
   TODO: Consider allowing lowercase fallthrough. *)
let scan_single_line_comment lexer =
  let lexer = advance lexer 2 in
  let lexer_ws = skip_whitespace lexer in
  let lexer = skip_to_end_of_line_or_end_tag lexer_ws in
  let w = width lexer in
  (* Distance from the end of the leading whitespace to the end of line
     discriminates the special comment forms. *)
  let remainder = offset lexer - offset lexer_ws in
  let trivium =
    if remainder = 11 && peek_string lexer_ws 11 = "FALLTHROUGH" then
      Trivia.make_fallthrough (source lexer) (start lexer) w
    else if remainder >= 6 && peek_string lexer_ws 6 = "UNSAFE" then
      Trivia.make_unsafe (source lexer) (start lexer) w
    else
      Trivia.make_single_line_comment (source lexer) (start lexer) w in
  (lexer, trivium)
(* Advance past the closing "*/" of a delimited comment.  An unterminated
   comment (end of file reached first) records error0007 on the lexer. *)
let skip_to_end_of_delimited_comment lexer =
  let rec aux lexer offset =
    let ch0 = peek_char lexer offset in
    if ch0 = invalid then
      let lexer = advance lexer offset in
      if at_end lexer then
        with_error lexer SyntaxError.error0007
      else
        (* TODO: Do we want to give a warning for an embedded zero char
           inside a comment? *)
        aux lexer 1
    else if ch0 = '*' && (peek_char lexer (offset + 1)) = '/' then
      advance lexer (offset + 2)
    else aux lexer (offset + 1) in
  aux lexer 0
(* Scan a slash-star delimited comment.  An unsafe expression comment is a
   delimited comment that begins with any whitespace, followed by
   UNSAFE_EXPR, followed by any text.

   The original lexer lexes a fixme / ignore error as:

   slash star [whitespace]* HH_FIXME [whitespace or newline]* leftbracket
   [whitespace or newline]* integer [any text]* star slash

   Notice that the original lexer oddly enough does not verify that the
   integer is present.

   For our purposes we will just check for HH_FIXME / HH_IGNORE_ERROR;
   a later pass can try to parse out the integer if there is one,
   give a warning if there is not, and so on. *)
let scan_delimited_comment lexer =
  let lexer = advance lexer 2 in
  let lexer_ws = skip_whitespace lexer in
  let lexer = skip_to_end_of_delimited_comment lexer_ws in
  let w = width lexer in
  let trivium =
    if match_string lexer_ws "UNSAFE_EXPR" then
      Trivia.make_unsafe_expression (source lexer) (start lexer) w
    else if match_string lexer_ws "HH_FIXME" then
      Trivia.make_fix_me (source lexer) (start lexer) w
    else if match_string lexer_ws "HH_IGNORE_ERROR" then
      Trivia.make_ignore_error (source lexer) (start lexer) w
    else
      Trivia.make_delimited_comment (source lexer) (start lexer) w in
  (lexer, trivium)
(* Scan one piece of PHP-context trivia (comment, whitespace or end of
   line) if present; returns [None] when the current character begins a
   real token.  Hack does not support PHP style embedded markup; ?> is
   never legal in Hack, so it can be treated as a comment elsewhere.
   TODO: Give an error if this appears in a Hack program. *)
let scan_php_trivia lexer =
  match peek_char lexer 0 with
  | '#' ->
    let lexer = start_new_lexeme lexer in
    let (lexer, c) = scan_hash_comment lexer in
    (lexer, Some c)
  | '/' -> begin
    let lexer = start_new_lexeme lexer in
    match peek_char lexer 1 with
    | '/' ->
      let (lexer, c) = scan_single_line_comment lexer in
      (lexer, Some c)
    | '*' ->
      let (lexer, c) = scan_delimited_comment lexer in
      (lexer, Some c)
    | _ -> (lexer, None)
    end
  | ' ' | '\t' ->
    let new_end =
      str_skip_whitespace ~str:(source_text_string lexer) ~i:(offset lexer) in
    let new_start = offset lexer in
    let new_trivia =
      Trivia.make_whitespace (source lexer) new_start (new_end - new_start) in
    (with_start_offset lexer new_start new_end, Some new_trivia)
  | '\r' | '\n' ->
    let lexer = start_new_lexeme lexer in
    let (lexer, e) = scan_end_of_line lexer in
    (lexer, Some e)
  | _ ->
    (* Not trivia *)
    let lexer = start_new_lexeme lexer in
    (lexer, None)
(* Scan one piece of XHP-context trivia (whitespace or end of line) if
   present; returns [None] otherwise.
   TODO: Should XHP comments <!-- --> be their own thing, or a kind of
   trivia associated with a token? Right now they are the former. *)
let scan_xhp_trivia lexer =
  let i = offset lexer in
  match peek_char lexer 0 with
  | ' ' | '\t' ->
    let i' = str_skip_whitespace ~str:(source_text_string lexer) ~i in
    let lexer = with_start_offset lexer i i' in
    let trivium = Trivia.make_whitespace (source lexer) i (i' - i) in
    (lexer, Some trivium)
  | '\r' | '\n' ->
    let i' = str_scan_end_of_line ~str:(source_text_string lexer) ~i in
    let lexer = with_start_offset lexer i i' in
    let trivium = Trivia.make_eol (source lexer) i (i' - i) in
    (lexer, Some trivium)
  | _ ->
    (* Not trivia *)
    let lexer = start_new_lexeme lexer in
    (lexer, None)
1319 We divide trivia into "leading
" and "trailing
" trivia of an associated
1320 token. This means that we must find a dividing line between the trailing trivia
1321 following one token and the leading trivia of the following token. Plainly
1322 we need only find this line while scanning trailing trivia. The heuristics
1324 * The first newline trivia encountered is the last trailing trivia.
1325 * The newline which follows a // or # comment is not part of the comment
1326 but does terminate the trailing trivia.
1327 * A pragma to turn checks off (HH_FIXME, HH_IGNORE_ERROR and UNSAFE_EXPR) is
1328 * always a leading trivia.
(* Repeatedly apply [scanner] and collect every trivium it produces, in
   source order, stopping at the first [None]. *)
let scan_leading_trivia scanner lexer =
  let rec gather lexer acc =
    match scanner lexer with
    | (lexer, None) -> (lexer, acc)
    | (lexer, Some t) -> gather lexer (t :: acc) in
  let (lexer, rev_trivia) = gather lexer [] in
  (lexer, List.rev rev_trivia)
(* Leading-trivia scanners specialized to PHP and XHP contexts. *)
let scan_leading_php_trivia lexer = scan_leading_trivia scan_php_trivia lexer
let scan_leading_xhp_trivia lexer = scan_leading_trivia scan_xhp_trivia lexer
(* Collect trailing trivia for a token.  An end-of-line trivium is kept and
   terminates the run; a suppression pragma (FixMe / IgnoreError /
   UnsafeExpression) is NOT consumed — it belongs to the next token's
   leading trivia, so the pre-scan lexer is returned for it. *)
let scan_trailing_trivia scanner lexer =
  let rec gather lexer acc =
    let (lexer1, trivia) = scanner lexer in
    match trivia with
    | None -> (lexer1, acc)
    | Some t ->
      begin match Trivia.kind t with
      | TriviaKind.EndOfLine -> (lexer1, t :: acc)
      | TriviaKind.FixMe
      | TriviaKind.IgnoreError
      | TriviaKind.UnsafeExpression -> (lexer, acc)
      | _ -> gather lexer1 (t :: acc)
      end in
  let (lexer, rev_trivia) = gather lexer [] in
  (lexer, List.rev rev_trivia)
(* Trailing-trivia scanners specialized to PHP and XHP contexts. *)
let scan_trailing_php_trivia lexer = scan_trailing_trivia scan_php_trivia lexer
let scan_trailing_xhp_trivia lexer = scan_trailing_trivia scan_xhp_trivia lexer
(* After skipping leading trivia, does a name start here? *)
let is_next_name lexer =
  let (lexer, _) = scan_leading_php_trivia lexer in
  is_name_nondigit (peek_char lexer 0)
(* After skipping leading trivia, does an XHP class name start here? *)
let is_next_xhp_class_name lexer =
  let (lexer, _) = scan_leading_php_trivia lexer in
  is_xhp_class_name lexer
(* Some keywords are case-insensitive in Hack or PHP: return the lowercased
   form when [text] is (case-insensitively) one of them, otherwise return
   [text] unchanged.
   TODO: Consider making non-lowercase versions of these keywords errors
   in strict mode.
   TODO: Consider making these illegal, period, and code-modding away all
   non-lower versions in our codebase. *)
let as_case_insensitive_keyword text =
  let lower = String.lowercase_ascii text in
  match lower with
  | "eval" | "isset" | "unset" | "empty" | "const" | "new"
  | "and" | "or" | "xor" | "as" | "print" | "throw"
  | "true" | "false" | "null" | "array" | "instanceof"
  | "trait" | "class" | "interface" | "using" | "static" | "inout"
  | "self" | "parent" | "__halt_compiler" | "foreach" | "echo" -> lower
  | _ -> text
(* If [kind] is [Name] and the current lexeme spells a keyword (modulo the
   case-insensitive set), return the keyword kind; otherwise return [kind]
   unchanged. *)
let as_keyword kind lexer =
  if kind = TokenKind.Name then
    let text = as_case_insensitive_keyword (current_text lexer) in
    match TokenKind.from_string text with
    | Some keyword -> keyword
    | None -> TokenKind.Name
  else
    kind
1401 (* scanner takes a lexer, returns a lexer and a kind *)
(* [scanner] takes a lexer and returns a lexer and a kind.  Scans leading
   trivia, then the token itself; when [as_name] is false, names are
   re-interpreted as keywords.  Returns the lexer, the (possibly
   keyword-ized) kind, the token width and the leading trivia. *)
let scan_token_and_leading_trivia scanner as_name lexer =
  (* Get past the leading trivia *)
  let (lexer, leading) = scan_leading_php_trivia lexer in
  (* Remember where we were when we started this token *)
  let lexer = start_new_lexeme lexer in
  let (lexer, kind) = scanner lexer in
  let kind = if as_name then kind else as_keyword kind lexer in
  let w = width lexer in
  (lexer, kind, w, leading)
1412 (* scanner takes a lexer, returns a lexer and a kind *)
(* [scanner] takes a lexer and returns a lexer and a kind.  Scans a full
   token: leading trivia, the token, then trailing trivia — except that a
   double-quoted string head takes no trailing trivia, and ?> takes at most
   the end-of-line that immediately follows it. *)
let scan_token_and_trivia scanner as_name lexer =
  let token_start = offset lexer in
  let (lexer, kind, w, leading) =
    scan_token_and_leading_trivia scanner as_name lexer in
  let (lexer, trailing) =
    match kind with
    | TokenKind.DoubleQuotedStringLiteralHead -> (lexer, [])
    | TokenKind.QuestionGreaterThan ->
      if is_newline (peek_char lexer 0) then
        (* consume only trailing EOL token after ?> as trailing trivia *)
        let (lexer, eol) = scan_end_of_line lexer in
        (lexer, [eol])
      else
        (lexer, [])
    | _ -> scan_trailing_php_trivia lexer in
  (lexer, Token.make kind (source lexer) token_start w leading trailing)
1430 (* tokenizer takes a lexer, returns a lexer and a token *)
(* [tokenizer] takes a lexer and returns a lexer and a token.  Runs it and
   fails loudly if it consumed no input, unless we are at end of file and
   produced the EndOfFile token — this guards the parser's main loop
   against livelock. *)
let scan_assert_progress tokenizer lexer =
  let original_remaining = remaining lexer in
  let (lexer, token) = tokenizer lexer in
  let new_remaining = remaining lexer in
  if (new_remaining < original_remaining ||
      original_remaining = 0 &&
      new_remaining = 0 &&
      (Token.kind token) = TokenKind.EndOfFile) then
    (lexer, token)
  else begin
    Printf.kprintf failwith
      "failed to make progress at %d\n" (offset lexer)
  end
(* Scan the next full token (with trivia), asserting forward progress. *)
let scan_next_token ~as_name scanner lexer =
  let tokenizer = scan_token_and_trivia scanner as_name in
  scan_assert_progress tokenizer lexer

let scan_next_token_as_name = scan_next_token ~as_name:true
let scan_next_token_as_keyword = scan_next_token ~as_name:false
1453 (* TODO: Instead of passing Boolean flags, create a flags enum? *)
1455 (* This function is the inner loop of the parser, is pure, and
1456 is frequently called twice in a row with the same lexer due to the
1457 design of the parser. We get a big win by memoizing it. *)
(* Takes a lexer, returns a (lexer, token).  This is the inner loop of the
   parser, is pure, and is frequently called twice in a row with the same
   lexer due to the design of the parser, so the result is memoized. *)
let next_token =
  let cache =
    Little_cache.make empty
      (empty, Token.make TokenKind.EndOfFile SourceText.empty 0 0 [] []) in
  Little_cache.memoize cache
    (scan_next_token_as_keyword scan_token_outside_type)
(* Like [next_token] but attaches no trailing trivia to the token. *)
let next_token_no_trailing lexer =
  let tokenizer lexer =
    let token_start = offset lexer in
    let (lexer, kind, w, leading) =
      scan_token_and_leading_trivia scan_token_outside_type false lexer in
    (lexer, Token.make kind (source lexer) token_start w leading []) in
  scan_assert_progress tokenizer lexer
(* Scan the next token while inside a string literal of [literal_kind]. *)
let next_token_in_string lexer literal_kind =
  let token_start = offset lexer in
  let lexer = start_new_lexeme lexer in
  (* We're inside a string. Do not scan leading trivia. *)
  let (lexer, kind) = scan_string_literal_in_progress lexer literal_kind in
  let w = width lexer in
  (* Only scan trailing trivia if we've finished the string. *)
  let (lexer, trailing) =
    match kind with
    | TokenKind.DoubleQuotedStringLiteralTail
    | TokenKind.HeredocStringLiteralTail -> scan_trailing_php_trivia lexer
    | _ -> (lexer, []) in
  let token = Token.make kind (source lexer) token_start w [] trailing in
  (lexer, token)
(* We're at the beginning of a heredoc string literal. Scan leading trivia
   but not trailing trivia; also returns the heredoc's delimiter name. *)
let next_docstring_header lexer =
  let token_start = offset lexer in
  let (lexer, leading) = scan_leading_php_trivia lexer in
  let lexer = start_new_lexeme lexer in
  let (lexer, name, _) = scan_docstring_header lexer in
  let w = width lexer in
  let token =
    Token.make TokenKind.HeredocStringLiteralHead
      (source lexer) token_start w leading [] in
  (lexer, token, name)
(* Context-specific token scanners: names kept as names, and type-context
   lexing respectively. *)
let next_token_as_name lexer =
  scan_next_token_as_name scan_token_outside_type lexer

let next_token_in_type lexer =
  scan_next_token_as_keyword scan_token_inside_type lexer
(* Scan the next token of an XHP element, returning its text as well.
   XHP elements have whitespace, newlines and Hack comments. *)
let next_xhp_element_token ~no_trailing lexer =
  let tokenizer lexer =
    let token_start = offset lexer in
    let (lexer, kind, w, leading) =
      scan_token_and_leading_trivia scan_xhp_token true lexer in
    (* We do not scan trivia after an XHPOpen's >. If that is the beginning
       of an XHP body then we want any whitespace or newlines to be leading
       trivia of the body token. *)
    match kind with
    | TokenKind.GreaterThan
    | TokenKind.SlashGreaterThan when no_trailing ->
      (lexer, Token.make kind (source lexer) token_start w leading [])
    | _ ->
      let (lexer, trailing) = scan_trailing_php_trivia lexer in
      (lexer, Token.make kind (source lexer) token_start w leading trailing) in
  let (lexer, token) = scan_assert_progress tokenizer lexer in
  let token_width = Token.width token in
  let trailing_width = Token.trailing_width token in
  let token_start_offset = (offset lexer) - trailing_width - token_width in
  let token_text =
    SourceText.sub (source lexer) token_start_offset token_width in
  (lexer, token, token_text)
(* Scan the next token inside an XHP body. *)
let next_xhp_body_token lexer =
  let scanner lexer =
    let token_start = offset lexer in
    let (lexer, leading) = scan_leading_xhp_trivia lexer in
    let lexer = start_new_lexeme lexer in
    let (lexer, kind) = scan_xhp_body lexer in
    let w = width lexer in
    let (lexer, trailing) =
      (* Trivia (leading and trailing) is semantically
         significant for XHPBody tokens. When we find elements or
         braced expressions inside the body, the trivia should be
         seen as leading the next token, but we should certainly
         keep it trailing if this is an XHPBody token. *)
      if kind = TokenKind.XHPBody
      then scan_trailing_xhp_trivia lexer
      else (lexer, []) in
    (lexer, Token.make kind (source lexer) token_start w leading trailing) in
  scan_assert_progress scanner lexer
(* Tokenizers for XHP class names and XHP element names. *)
let next_xhp_class_name lexer =
  scan_token_and_trivia scan_xhp_class_name false lexer

let next_xhp_name lexer =
  scan_token_and_trivia scan_xhp_element_name false lexer
(* Build a Markup token covering the current lexeme, with no trivia. *)
let make_markup_token lexer =
  Token.make TokenKind.Markup (source lexer) (start lexer) (width lexer) [] []
(* Skip over leading markup text up to a <? open tag.  Returns the lexer,
   the markup token, and — when an open tag was found — the <? token plus
   an optional language token ("hh"/"php" name, or "=" for short echo).
   NOTE(review): several dropped lines in this region were reconstructed;
   confirm against the canonical lexer before relying on exact behavior. *)
let skip_to_end_of_markup lexer ~is_leading_section =
  let make_markup_and_suffix lexer =
    let markup_text = make_markup_token lexer in
    let less_than_question_token =
      Token.make TokenKind.LessThanQuestion (source lexer) (offset lexer) 2 [] []
    in
    (* skip <? *)
    let lexer = advance lexer 2 in
    let name_token_offset = offset lexer in
    let make_long_tag lexer size =
      (* skip the language name *)
      let lexer = advance lexer size in
      (* single line comments that follow the language in leading markup_text
         determine the file check mode, read the trailing trivia and attach it
         to the language token *)
      let lexer, trailing =
        if is_leading_section then scan_trailing_php_trivia lexer
        else lexer, []
      in
      let name = Token.make TokenKind.Name
        (source lexer) name_token_offset size [] trailing in
      lexer, markup_text, Some (less_than_question_token, Some name)
    in
    let ch0 = peek_char lexer 0 in
    let ch1 = peek_char lexer 1 in
    let ch2 = peek_char lexer 2 in
    match ch0, ch1, ch2 with
    | ('H' | 'h'), ('H' | 'h'), _ -> make_long_tag lexer 2
    | ('P' | 'p'), ('H' | 'h'), ('P' | 'p') -> make_long_tag lexer 3
    | '=', _, _ ->
      (* skip = *)
      let lexer = advance lexer 1 in
      let equal = Token.make TokenKind.Equal
        (source lexer) name_token_offset 1 [] [] in
      lexer, markup_text, Some (less_than_question_token, Some equal)
    | _ ->
      lexer, markup_text, Some (less_than_question_token, None)
  in
  let rec aux lexer index =
    (* It's not an error to run off the end of one of these. *)
    if at_end_index lexer index then
      let lexer' = with_offset lexer index in
      lexer', (make_markup_token lexer'), None
    else
      let ch = peek lexer index in
      if ch = '<' && peek_def lexer (succ index) ~def:'\x00' = '?' then
        (* Found a beginning tag that delimits markup from the script *)
        make_markup_and_suffix (with_offset lexer index)
      else
        aux lexer (succ index)
  in
  let start_offset =
    if is_leading_section then begin
      (* if leading section starts with #! - it should span the entire line *)
      let index = offset lexer in
      if peek_def ~def:'\x00' lexer index = '#' &&
         peek_def ~def:'\x00' lexer (succ index) = '!'
      then skip_while_to_offset lexer not_newline
      else index
    end
    else offset lexer in
  aux lexer start_offset
(* Entry point for scanning a markup section from the current offset. *)
let scan_markup lexer ~is_leading_section =
  let lexer = start_new_lexeme lexer in
  skip_to_end_of_markup lexer ~is_leading_section
(* An XHP category is an xhp element name preceded by a %. *)
let is_next_xhp_category_name lexer =
  let (lexer, _) = scan_leading_php_trivia lexer in
  let ch0 = peek_char lexer 0 in
  let ch1 = peek_char lexer 1 in
  ch0 = '%' && is_name_nondigit ch1
(* Scan %name as an XHP category name; otherwise fall back to an ordinary
   (non-type-context) token scan. *)
let scan_xhp_category_name lexer =
  if is_next_xhp_category_name lexer then
    let (lexer, _) = scan_xhp_element_name (advance lexer 1) in
    (lexer, TokenKind.XHPCategoryName)
  else
    scan_token false lexer
(* Tokenizer for XHP category names, with trivia. *)
let next_xhp_category_name lexer =
  scan_token_and_trivia scan_xhp_category_name false lexer
(* __halt_compiler stops parsing of the file.  In order to preserve the
   full-fidelity aspect of the parser we pack everything that follows
   __halt_compiler as a separate opaque kind of trivia - it is attached as
   trailing trivia to [last_token], replacing its existing trailing
   trivia.  The lexer is advanced to the end of the source. *)
let rescan_halt_compiler lexer last_token =
  let start_offset =
    Token.leading_start_offset last_token +
    Token.leading_width last_token +
    Token.width last_token in
  let source = source lexer in
  let length = SourceText.length source in
  let trailing =
    Trivia.make_after_halt_compiler
      source
      start_offset
      (length - start_offset) in
  Lexer.with_offset lexer length, Token.with_trailing [trailing] last_token