hphp/hack/src/parser/full_fidelity_lexer.ml
1 (**
2 * Copyright (c) 2016, Facebook, Inc.
3 * All rights reserved.
5 * This source code is licensed under the BSD-style license found in the
6 * LICENSE file in the "hack" directory of this source tree. An additional grant
7 * of patent rights can be found in the PATENTS file in the same directory.
9 *)
11 module TriviaKind = Full_fidelity_trivia_kind
12 module TokenKind = Full_fidelity_token_kind
13 module SourceText = Full_fidelity_source_text
14 module SyntaxError = Full_fidelity_syntax_error
16 module Lexer : sig
17 type t = {
18 text : SourceText.t;
19 start : int; (* Both start and offset are absolute offsets in the text. *)
20 offset : int;
21 errors : SyntaxError.t list
}
23 val make : SourceText.t -> t
24 val start : t -> int
25 val source : t -> SourceText.t
26 val errors : t -> SyntaxError.t list
27 val offset : t -> int
29 val with_error : t -> string -> t
30 val with_offset : t -> int -> t
31 val with_offset_errors : t -> int -> SyntaxError.t list -> t
32 val start_new_lexeme : t -> t
33 val advance : t -> int -> t
34 val with_start_offset : t -> int -> int -> t
35 end = struct
37 let padding = String.make 100 '\x00'
39 (* text consists of a pair consisting of a string, padded by a certain, fixed
40 * amount of null bytes, and then the rest of the source text *)
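(* Why the padding matters (informal note): peek_char below indexes the padded
   string directly, so looking a character or two past the end of the real
   source yields '\x00' rather than raising, and the scanning code can peek
   ahead without explicit bounds checks. The '\x00' byte also doubles as the
   end-of-input / invalid marker used by the scanners in WithToken below. *)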
41 type t = {
42 text : SourceText.t;
43 start : int; (* Both start and offset are absolute offsets in the text. *)
44 offset : int;
45 errors : SyntaxError.t list
}
48 let make text =
49 let text' = SourceText.append_padding text padding in
50 { text = text'; start = 0; offset = 0; errors = [] }
52 let start x = x.start
53 let source x = x.text
54 let errors x = x.errors
55 let offset x = x.offset
57 let with_error lexer message =
58 let error = SyntaxError.make lexer.start lexer.offset message in
59 { lexer with errors = error :: lexer.errors }
61 let with_offset lexer offset = {lexer with offset = offset}
63 let with_start_offset lexer start offset = {lexer with start = start; offset = offset}
65 let with_offset_errors lexer offset errors = {
66 lexer with offset = offset; errors = errors
}
69 let start_new_lexeme lexer =
70 { lexer with start = lexer.offset }
72 let advance lexer index =
73 { lexer with offset = lexer.offset + index }
74 end
76 module WithToken(Token: Lexable_token_sig.LexableToken_S) = struct
78 module Trivia = Token.Trivia
80 type lexer = Lexer.t
81 type t = lexer
83 let make = Lexer.make
84 let start = Lexer.start
85 let source = Lexer.source
86 let errors = Lexer.errors
87 let offset = Lexer.offset
88 let with_error = Lexer.with_error
89 let with_offset = Lexer.with_offset
90 let start_new_lexeme = Lexer.start_new_lexeme
91 let advance = Lexer.advance
92 let with_offset_errors = Lexer.with_offset_errors
93 let with_start_offset = Lexer.with_start_offset
95 let start_offset = start
96 let end_offset = offset
98 let invalid = '\000'
100 let empty = make SourceText.empty
102 let source_text_string (l : lexer) = SourceText.text (source l)
104 type string_literal_kind =
105 | Literal_execution_string
106 | Literal_double_quoted
107 | Literal_heredoc of string
109 (* Housekeeping *)
111 let peek_char lexer index =
112 lexer.Lexer.text.SourceText.text.[offset lexer + index]
114 let peek_string lexer size =
115 String.sub lexer.Lexer.text.SourceText.text (offset lexer) size
117 let match_string lexer s =
118 s = peek_string lexer (String.length s)
120 let make_error_with_location (l : lexer) (msg : string) =
121 SyntaxError.make (start l) (offset l) msg
123 let width lexer =
124 (offset lexer) - (start lexer)
126 let current_text lexer =
127 SourceText.sub (source lexer) (start lexer) (width lexer)
129 let current_text_at lexer length relative_start =
130 SourceText.sub (source lexer) ((start lexer) + relative_start) length
132 let at_end lexer =
133 (offset lexer) >= SourceText.length (source lexer)
135 let at_end_index lexer index =
136 index >= SourceText.length (source lexer)
138 let remaining lexer =
139 let r = (SourceText.length (source lexer)) - offset lexer in
140 if r < 0 then 0 else r
142 let text_len (l : lexer) =
143 SourceText.length (source l)
145 let peek (l : lexer) i =
146 SourceText.get (source l) i
148 let peek_def (l: lexer) i ~def =
149 if i >= SourceText.length (source l) then
def
151 else
152 SourceText.get (source l) i
154 (* Character classification *)
155 let is_whitespace_no_newline : char -> bool = function
156 | ' ' | '\t' -> true
157 | _ -> false
159 let is_newline = function
160 | '\r' | '\n' -> true
161 | _ -> false
163 let is_binary_digit = function
164 | '0' | '1' -> true
165 | _ -> false
167 let is_octal_digit = function
168 | '0' .. '7' -> true
169 | _ -> false
171 let is_decimal_digit = function
172 | '0' .. '9' -> true
173 | _ -> false
175 let is_hexadecimal_digit = function
176 | '0' .. '9' | 'a' .. 'f' | 'A' .. 'F' -> true
177 | _ -> false
179 let is_name_nondigit = function
180 | '_' -> true
181 | 'a' .. 'z' -> true
182 | 'A' .. 'Z' -> true
183 | '\x7f' .. '\xff' -> true
184 | _ -> false
186 let is_name_letter = function
187 | '_' -> true
188 | '0' .. '9' -> true
189 | 'a' .. 'z' -> true
190 | 'A' .. 'Z' -> true
191 | '\x7f' .. '\xff' -> true
192 | _ -> false
194 (* Lexing *)
196 let skip_while_to_offset l p =
197 let n = SourceText.length (source l) in
198 let rec aux i =
199 if i < n && p (peek l i) then aux (i + 1) else i in
200 aux (offset l)
202 (* create a new lexer where the offset is advanced as
203 * long as the predicate is true *)
204 let skip_while (l : lexer) (p : char -> bool) =
205 with_offset l (skip_while_to_offset l p)
207 let str_skip_while ~str ~i ~p =
208 let n = String.length str in
209 let rec aux i =
210 if i < n && p str.[i] then aux (i + 1) else i in
211 aux i
213 let skip_whitespace (l : lexer) =
214 skip_while l is_whitespace_no_newline
216 let str_skip_whitespace ~str ~i =
217 str_skip_while ~str ~i ~p:is_whitespace_no_newline
219 let not_newline ch = not (is_newline ch)
221 let skip_to_end_of_line (l : lexer) =
222 skip_while l not_newline
224 let skip_to_end_of_line_or_end_tag (l : lexer) =
225 let n = text_len l in
226 let peek_def i = if i < n then peek l i else invalid in
227 let should_stop i =
228 (i >= n) || begin
229 let ch = peek l i in
230 (is_newline ch) || (ch = '?' && peek_def (succ i) = '>')
231 end in
232 let i = ref (offset l) in
233 while (not (should_stop !i)) do incr i done;
234 with_offset l !i
236 let skip_name_end (l : lexer) =
237 skip_while l is_name_letter
239 let skip_end_of_line lexer =
240 match peek_char lexer 0 with
241 | '\n' -> advance lexer 1
242 | '\r' ->
243 if (peek_char lexer 1) = '\n' then advance lexer 2 else advance lexer 1
244 | _ -> lexer
246 let scan_name_impl lexer =
247 assert (is_name_nondigit (peek_char lexer 0));
248 skip_name_end (advance lexer 1)
250 let scan_name lexer =
251 let lexer = scan_name_impl lexer in
252 (lexer, TokenKind.Name)
254 let scan_variable lexer =
255 assert('$' = peek_char lexer 0);
256 let lexer = scan_name_impl (advance lexer 1) in
257 (lexer, TokenKind.Variable)
259 let scan_with_underscores (l : lexer) accepted_char =
260 let n = text_len l in
261 let peek_def i = if i < n then peek l i else invalid in
262 let rec aux i =
263 if i >= n then i
264 else let ch = peek l i in
265 if accepted_char ch then aux (succ i)
266 else if ch = '_' && accepted_char (peek_def (succ i)) then
267 aux (2 + i)
268 else i in
269 with_offset l (aux (offset l))
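(* Worked example (informal): with accepted_char = is_decimal_digit, scanning
   1_000_000 accepts every digit and accepts each underscore only because the
   character after it is again a digit, so the whole lexeme is consumed. For
   1__000 the character after the first underscore is another underscore, not
   a digit, so scanning stops right before the first underscore. *)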
271 let scan_decimal_digits (l : lexer) =
272 skip_while l is_decimal_digit
274 let scan_decimal_digits_with_underscores lexer =
275 scan_with_underscores lexer is_decimal_digit
277 let scan_octal_digits (l : lexer) =
278 skip_while l is_octal_digit
280 let scan_octal_digits_with_underscores (l : lexer) =
281 scan_with_underscores l is_octal_digit
283 let scan_binary_digits_with_underscores (l : lexer) =
284 scan_with_underscores l is_binary_digit
286 let scan_hexadecimal_digits (l : lexer) =
287 skip_while l is_hexadecimal_digit
289 let scan_hexadecimal_digits_with_underscores (l : lexer) =
290 scan_with_underscores l is_hexadecimal_digit
292 let scan_hex_literal lexer =
293 let ch = peek_char lexer 0 in
294 if not (is_hexadecimal_digit ch) then
295 let lexer = with_error lexer SyntaxError.error0001 in
296 (lexer, TokenKind.HexadecimalLiteral)
297 else
298 (scan_hexadecimal_digits_with_underscores lexer, TokenKind.HexadecimalLiteral)
300 let scan_binary_literal lexer =
301 let ch = peek_char lexer 0 in
302 if not (is_binary_digit ch) then
303 let lexer = with_error lexer SyntaxError.error0002 in
304 (lexer, TokenKind.BinaryLiteral)
305 else
306 (scan_binary_digits_with_underscores lexer, TokenKind.BinaryLiteral)
308 let scan_exponent lexer =
309 let ch = peek_char lexer 1 in
310 let lexer = if ch = '+' || ch = '-' then (advance lexer 2)
311 else (advance lexer 1) in
312 let ch = peek_char lexer 0 in
313 if not (is_decimal_digit ch) then
314 let lexer = with_error lexer SyntaxError.error0003 in
315 (lexer, TokenKind.FloatingLiteral)
316 else
317 (scan_decimal_digits lexer, TokenKind.FloatingLiteral)
319 let scan_after_decimal_point lexer =
320 let lexer = advance lexer 1 in
321 let lexer = scan_decimal_digits lexer in
322 let ch = peek_char lexer 0 in
323 if ch = 'e' || ch = 'E' then
324 scan_exponent lexer
325 else
326 (lexer, TokenKind.FloatingLiteral)
328 let scan_octal_or_float lexer =
329 (* We've scanned a leading zero. *)
330 (* We have an irritating ambiguity here. 09 is not a legal octal or
331 * floating literal, but 09e1 and 09.1 are. *)
332 let lexer = advance lexer 1 in
333 let ch = peek_char lexer 0 in
334 match ch with
335 | '.' -> (* 0. *) scan_after_decimal_point lexer
336 | 'e' | 'E' -> (* 0e *) scan_exponent lexer
337 | '0' .. '9' ->
338 (* 05 *)
339 let lexer_oct = scan_octal_digits lexer in
340 let lexer_dec = scan_decimal_digits lexer in
341 if (width lexer_oct) = (width lexer_dec) then
342 begin
343 (* Only octal digits. Could be an octal literal, or could
344 be a float. *)
345 let ch = peek_char lexer_oct 0 in
346 if ch = 'e' || ch = 'E' then scan_exponent lexer_oct
347 else if ch = '.' then scan_after_decimal_point lexer_oct
348 else
349 (* This is irritating - we only want to allow underscores for integer
350 literals. Deferring the lexing with underscores here allows us to
351 make sure we're not dealing with floats. *)
352 let lexer_oct_with_underscores =
353 scan_octal_digits_with_underscores lexer in
354 (lexer_oct_with_underscores, TokenKind.OctalLiteral)
end
356 else
357 begin
358 (* We had decimal digits following a leading zero; this is either a
359 float literal or an octal to be truncated at the first non-octal
360 digit. *)
361 let ch = peek_char lexer_dec 0 in
362 if ch = 'e' || ch = 'E' then
363 scan_exponent lexer_dec
364 else if ch = '.' then
365 scan_after_decimal_point lexer_dec
366 else (* an octal to be truncated at the first non-octal digit *)
367 (* Again we differ the lexing with underscores here *)
368 let lexer_dec_with_underscores =
369 scan_decimal_digits_with_underscores lexer in
370 (lexer_dec_with_underscores, TokenKind.OctalLiteral)
end
372 | _ -> (* 0 *) (lexer, TokenKind.OctalLiteral)
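(* Worked examples (informal) for the leading-zero cases above:
   017  -> only octal digits follow and no '.', 'e' or 'E': OctalLiteral
   0.5  -> '.' right after the zero: FloatingLiteral via scan_after_decimal_point
   09e1 -> decimal digits beyond the octal run, then an exponent: FloatingLiteral
   09   -> decimal digits but no '.' or exponent: lexed as an OctalLiteral to be
           truncated at the first non-octal digit, as described above. *)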
374 let scan_decimal_or_float lexer =
375 (* We've scanned a leading non-zero digit. *)
376 let lexer_no_underscores = scan_decimal_digits lexer in
377 let lexer_with_underscores = scan_decimal_digits_with_underscores lexer in
378 let ch = peek_char lexer_no_underscores 0 in
379 match ch with
380 | '.' -> (* 123. *) scan_after_decimal_point lexer_no_underscores
381 | 'e' | 'E' -> (* 123e *) scan_exponent lexer_no_underscores
382 | _ -> (* 123 *) (lexer_with_underscores, TokenKind.DecimalLiteral)
384 let scan_single_quote_string_literal (l : lexer) =
385 (* TODO: What about newlines embedded? *)
386 (* SPEC:
387 single-quoted-string-literal::
388 b-opt ' sq-char-sequence-opt '
390 TODO: What is this b-opt? We don't lex an optional 'b' before a literal.
392 sq-char-sequence::
393 sq-char
394 sq-char-sequence sq-char
396 sq-char::
397 sq-escape-sequence
398 \opt any character except single-quote (') or backslash (\)
400 sq-escape-sequence:: one of
401 \' \\
*)
405 let n = SourceText.length (source l) in
406 let peek = SourceText.get (source l) in
408 let has_error0012 = ref false in
409 let has_error0006 = ref false in
411 let rec stepper i =
412 if i >= n then
413 (has_error0012 := true; n - 1)
414 else begin
415 let ch = peek i in
416 match ch with
417 | '\000' -> (has_error0006 := true; stepper (1+i))
418 | '\\' -> stepper (2+i)
419 | '\'' -> (1+i)
420 | _ -> stepper (1+i)
421 end in
423 let new_offset = stepper (1 + (offset l)) in
425 let new_errors =
426 let err msg = make_error_with_location l msg in
427 match (!has_error0006, !has_error0012) with
428 | (true, true) -> (err SyntaxError.error0006 :: err SyntaxError.error0012 :: (errors l))
429 | (true, false) -> (err SyntaxError.error0006 :: (errors l))
430 | (false, true) -> (err SyntaxError.error0012 :: (errors l))
431 | (false, false) -> (errors l) in
433 let res = with_offset_errors l new_offset new_errors in
434 (res, TokenKind.SingleQuotedStringLiteral)
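(* Example (informal): for the input 'ab\'c' the stepper starts just past the
   opening quote, jumps two characters at each backslash (so the escaped quote
   does not terminate the literal), and stops one past the closing quote,
   producing a single SingleQuotedStringLiteral token. *)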
436 let scan_hexadecimal_escape lexer =
437 let ch2 = peek_char lexer 2 in
438 let ch3 = peek_char lexer 3 in
439 if not (is_hexadecimal_digit ch2) then
440 (* TODO: Consider producing an error for a malformed hex escape *)
441 (* let lexer = with_error lexer SyntaxError.error0005 in *)
442 advance lexer 2
443 else if not (is_hexadecimal_digit ch3) then
444 (* let lexer = with_error lexer SyntaxError.error0005 in *)
445 advance lexer 3
446 else
447 advance lexer 4
449 let scan_unicode_escape lexer =
450 (* At present the lexer is pointing at \u *)
451 if (peek_char lexer 2) = '{' then
452 if (peek_char lexer 3) = '$' then
453 (* We have a malformed unicode escape that contains a possible embedded
454 expression. Eat the \u and keep on processing the embedded expression. *)
455 (* TODO: Consider producing a warning for a malformed unicode escape. *)
456 advance lexer 2
457 else
458 (* We have a possibly well-formed escape sequence, and at least we know
459 that it is not an embedded expression. *)
460 (* TODO: Consider producing an error if the digits are out of range
461 of legal Unicode characters. *)
462 (* TODO: Consider producing an error if there are no digits. *)
463 (* Skip over the slash, u and brace, and start lexing the number. *)
464 let lexer = advance lexer 3 in
465 let lexer = scan_hexadecimal_digits lexer in
466 let ch = peek_char lexer 0 in
467 if ch != '}' then
468 (* TODO: Consider producing a warning for a malformed unicode escape. *)
469 lexer
470 else
471 advance lexer 1
472 else
473 (* We have a malformed unicode escape sequence. Bail out. *)
474 (* TODO: Consider producing a warning for a malformed unicode escape. *)
475 advance lexer 2
477 let skip_uninteresting_double_quote_like_string_characters (l : lexer) start_char =
478 let is_uninteresting ch =
479 match ch with
480 | '\000' | '\\' | '$' | '{' | '[' | ']' | '-'
481 | '0' .. '9' -> false
482 | ch -> ch <> start_char && not (is_name_nondigit ch) in
483 skip_while l is_uninteresting
485 let scan_integer_literal_in_string lexer =
486 if (peek_char lexer 0) = '0' then
487 match peek_char lexer 1 with
488 | 'x' | 'X' -> scan_hex_literal (advance lexer 2)
489 | 'b' | 'B' -> scan_binary_literal (advance lexer 2)
490 | _ ->
491 (* An integer literal starting with 0 in a string will actually
492 always be treated as a string index in HHVM, and not as an octal.
493 In such a case, HHVM actually scans all decimal digits to create the
494 token. TODO: we may want to change this behavior to something more
495 sensible *)
496 (scan_decimal_digits_with_underscores lexer, TokenKind.DecimalLiteral)
497 else
498 (scan_decimal_digits_with_underscores lexer, TokenKind.DecimalLiteral)
500 (* scans double quoted or execution string literals - they have similar rules
501 for content interpretation except for \"" character - it is escaped in
502 double quoted string and remain intact in execution string literals *)
503 let scan_double_quote_like_string_literal_from_start lexer start_char =
504 let literal_token_kind =
505 if start_char = '`' then TokenKind.ExecutionStringLiteral
506 else TokenKind.DoubleQuotedStringLiteral in
507 let head_token_kind =
508 if start_char = '`' then TokenKind.ExecutionStringLiteralHead
509 else TokenKind.DoubleQuotedStringLiteralHead in
510 let rec aux lexer =
511 (* If there's nothing interesting in this double-quoted string then
512 we can just hand it back as-is. *)
513 let lexer =
514 skip_uninteresting_double_quote_like_string_characters lexer start_char in
515 match peek_char lexer 0 with
516 | '\000' ->
517 (* If the string is unterminated then give an error; if this is an
518 embedded zero character then give an error and recurse; we might
519 be able to make more progress. *)
520 if at_end lexer then
521 let lexer = with_error lexer SyntaxError.error0012 in
522 (lexer, literal_token_kind)
523 else
524 let lexer = with_error lexer SyntaxError.error0006 in
525 aux (advance lexer 1)
526 | '`' | '"' ->
527 (* We made it to the end without finding a special character. *)
528 (advance lexer 1, literal_token_kind)
529 | _ -> (* We've found a backslash, dollar or brace. *)
530 (lexer, head_token_kind) in
531 aux (advance lexer 1)
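(* Example (informal): for "plain text" with no backslash, dollar, brace or
   bracket, the scan above runs straight to the closing quote and returns one
   DoubleQuotedStringLiteral. For "abc $x" it stops at the '$', returns a
   DoubleQuotedStringLiteralHead covering the opening quote and the text before
   the '$', and the rest of the literal is lexed piecewise by
   scan_string_literal_in_progress below. *)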
533 let is_heredoc_tail lexer name =
534 (* A heredoc tail is the identifier immediately preceded by a newline
535 and immediately followed by an optional semi and then a newline.
537 Note that the newline and optional semi are not part of the literal;
538 the literal's lexeme ends at the end of the name. Either there is
539 no trivia and the next token is a semi-with-trailing-newline, or
540 the trailing trivia is a newline.
542 This odd rule is to ensure that both
543 $x = <<<HERE
544 something
545 HERE;
and
549 $x = <<<HERE
550 something
551 HERE
552 . "something else";
554 are legal.
*)
557 if not (is_newline (peek_char lexer (-1))) then
558 false
559 else
560 let len = String.length name in
561 let ch0 = peek_char lexer len in
562 let ch1 = peek_char lexer (len + 1) in
563 ((is_newline ch0) || ch0 = ';' && (is_newline ch1)) &&
564 (peek_string lexer len) = name
566 let get_tail_token_kind literal_kind =
567 match literal_kind with
568 | Literal_heredoc _-> TokenKind.HeredocStringLiteralTail
569 | Literal_execution_string -> TokenKind.ExecutionStringLiteralTail
570 | Literal_double_quoted -> TokenKind.DoubleQuotedStringLiteralTail
572 let get_string_literal_body_or_double_quoted_tail literal_kind =
573 if literal_kind = Literal_double_quoted
574 then TokenKind.DoubleQuotedStringLiteralTail
575 else TokenKind.StringLiteralBody
577 let scan_string_literal_in_progress lexer literal_kind =
578 let is_heredoc, name =
579 match literal_kind with
580 | Literal_heredoc name -> true, name
581 | _ -> false, "" in
582 let start_char =
583 if literal_kind = Literal_execution_string then '`'
584 else '"' in
585 let ch0 = peek_char lexer 0 in
586 if is_name_nondigit ch0 then
587 if is_heredoc && (is_heredoc_tail lexer name) then
588 (scan_name_impl lexer, TokenKind.HeredocStringLiteralTail)
589 else
590 (scan_name_impl lexer, TokenKind.Name)
591 else
592 match ch0 with
593 | '\000' ->
594 if at_end lexer then
595 let lexer = with_error lexer SyntaxError.error0012 in
596 (lexer, get_tail_token_kind literal_kind)
597 else
598 let lexer = with_error lexer SyntaxError.error0006 in
599 let lexer = advance lexer 1 in
600 let lexer =
601 skip_uninteresting_double_quote_like_string_characters
602 lexer
603 start_char in
604 (lexer, TokenKind.StringLiteralBody)
605 | '`' when literal_kind = Literal_execution_string ->
606 (* '`' terminates execution string *)
607 (advance lexer 1, TokenKind.ExecutionStringLiteralTail)
608 | '"' ->
609 let kind = get_string_literal_body_or_double_quoted_tail literal_kind in
610 (advance lexer 1, kind)
611 | '$' ->
612 if is_name_nondigit (peek_char lexer 1) then scan_variable lexer
613 else (advance lexer 1, TokenKind.Dollar)
614 | '{' -> (advance lexer 1, TokenKind.LeftBrace)
615 | '\\' -> begin
616 match peek_char lexer 1 with
617 (* In these cases we just skip the escape sequence and
618 keep on scanning for special characters. *)
619 | '\\' | '"' | '$' | 'e' | 'f' | 'n' | 'r' | 't' | 'v' | '`'
620 (* Same in these cases; there might be more octal characters following but
621 if there are, we'll just eat them as normal characters. *)
622 | '0' .. '7' ->
623 let lexer = advance lexer 2 in
624 let lexer =
625 skip_uninteresting_double_quote_like_string_characters
626 lexer start_char in
627 (lexer, TokenKind.StringLiteralBody)
628 | 'x' ->
629 let lexer = scan_hexadecimal_escape lexer in
630 let lexer =
631 skip_uninteresting_double_quote_like_string_characters
632 lexer start_char in
633 (lexer, TokenKind.StringLiteralBody)
634 | 'u' ->
635 let lexer = scan_unicode_escape lexer in
636 let lexer =
637 skip_uninteresting_double_quote_like_string_characters
638 lexer start_char in
639 (lexer, TokenKind.StringLiteralBody)
640 | '{' ->
641 (* The rules for escaping open braces in Hack are bizarre. Suppose we
642 have
643 $x = 123;
644 $y = 456;
645 $z = "\{$x,$y\}";
646 What is the value of $z? Naively you would think that the backslash
647 escapes the braces, and the variables are embedded, so {123,456}. But
648 that's not what happens. Yes, the backslash makes the brace no longer
649 the opening brace of an expression. But the backslash is still part
650 of the string! This is the string \{123,456\}.
651 TODO: We might want to fix this because this is very strange. *)
652 (* Eat the backslash and the brace. *)
653 let lexer = advance lexer 2 in
654 (lexer, TokenKind.StringLiteralBody)
655 | _ ->
656 (* TODO: A backslash followed by something other than an escape sequence
657 is legal in hack, and treated as though it was just the backslash
658 and the character. However we might consider making this a warning.
659 It is particularly egregious when we have something like:
660 $x = "abcdef \
661 ghi";
662 The author of the code likely means the backslash to mean line
663 continuation but in fact it just means to put a backslash and newline
664 in the string.
*)
666 let lexer = advance lexer 1 in
667 let lexer =
668 skip_uninteresting_double_quote_like_string_characters
669 lexer start_char in
670 (lexer, TokenKind.StringLiteralBody)
672 | '[' ->
673 let lexer = advance lexer 1 in
674 (lexer, TokenKind.LeftBracket)
675 | ']' ->
676 let lexer = advance lexer 1 in
677 (lexer, TokenKind.RightBracket)
678 | '-' ->
679 if (peek_char lexer 1) = '>' then
680 let lexer = advance lexer 2 in
681 (lexer, TokenKind.MinusGreaterThan)
682 else
683 (* Nothing interesting here. Skip it and find the next
684 interesting character. *)
685 let lexer = advance lexer 1 in
686 let lexer =
687 skip_uninteresting_double_quote_like_string_characters
688 lexer start_char in
689 (lexer, TokenKind.StringLiteralBody)
690 | '0' .. '9' ->
691 let (lexer1, _) as literal = scan_integer_literal_in_string lexer in
692 if errors lexer == errors lexer1 then literal else
693 (* If we failed to scan a literal, do not interpret the literal *)
694 (with_offset lexer (offset lexer1), TokenKind.StringLiteralBody)
695 | _ ->
696 (* Nothing interesting here. Skip it and find the next
697 interesting character. *)
698 let lexer = advance lexer 1 in
699 let lexer =
700 skip_uninteresting_double_quote_like_string_characters
701 lexer start_char in
702 (lexer, TokenKind.StringLiteralBody)
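(* Worked example (informal): re-lexing the remainder of "a $x b" after its
   head token produces Variable for $x, StringLiteralBody or Name tokens for
   the plain runs of text, and finally a DoubleQuotedStringLiteralTail for the
   closing quote. *)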
704 (* A heredoc string literal has the form
706 header
707 optional body
708 trailer
710 The header is:
712 <<< (optional whitespace) name (no whitespace) (newline)
714 The optional body is:
716 any characters whatsoever including newlines (newline)
718 The trailer is:
720 (no whitespace) name (no whitespace) (optional semi) (no whitespace) (newline)
722 The names must be identical. The trailing semi and newline must be present.
724 The body is any and all characters, up to the first line that exactly matches
725 the trailer.
727 The body may contain embedded expressions.
729 A nowdoc string literal has the same form except that the first name is
730 enclosed in single quotes, and it may not contain embedded expressions.
*)
734 let scan_docstring_name_actual lexer =
735 let ch = peek_char lexer 0 in
736 if is_name_nondigit ch then
737 let end_lexer = skip_name_end (advance lexer 1) in
738 let name = SourceText.sub
739 (source lexer) (offset lexer) (offset end_lexer - offset lexer) in
740 (end_lexer, name)
741 else
742 let lexer = with_error lexer SyntaxError.error0008 in
743 (lexer, "")
745 let scan_docstring_name lexer =
746 let lexer = skip_whitespace lexer in
747 let ch = peek_char lexer 0 in
748 let kind =
749 if ch = '\'' then TokenKind.NowdocStringLiteral
750 else TokenKind.HeredocStringLiteral in
751 let (lexer, name) =
752 if ch = '\'' then
753 let (lexer, name) = scan_docstring_name_actual (advance lexer 1) in
754 if (peek_char lexer 0) = '\'' then
755 (advance lexer 1, name)
756 else
757 (with_error lexer SyntaxError.error0010, name)
758 else
759 (* Starting with PHP 5.3.0, the opening Heredoc identifier
760 may optionally be enclosed in double quotes:*)
761 let lexer = if ch = '"' then advance lexer 1 else lexer in
762 let lexer, name = scan_docstring_name_actual lexer in
763 let lexer =
764 if ch = '"' && peek_char lexer 0 = '\"' then advance lexer 1 else lexer
in
766 lexer, name
in
768 (lexer, name, kind)
770 let scan_docstring_header lexer =
771 let ch = peek_char lexer 0 in
772 (* Skip 3 for <<< or 4 for b<<< *)
773 let skip_count = if ch = 'b' then 4 else 3 in
774 let lexer = advance lexer skip_count in
775 let (lexer, name, kind) = scan_docstring_name lexer in
776 let ch = peek_char lexer 0 in
777 let lexer =
778 if is_newline ch then lexer
779 else with_error lexer SyntaxError.error0011 in
780 let lexer = skip_to_end_of_line lexer in
781 let lexer = skip_end_of_line lexer in
782 (lexer, name, kind)
784 let scan_docstring_remainder name lexer =
785 let len = String.length name in
786 let rec aux lexer =
787 let ch0 = peek_char lexer len in
788 let ch1 = peek_char lexer (len + 1) in
789 if ((is_newline ch0) || ch0 = ';' && (is_newline ch1)) &&
790 (peek_string lexer len) = name then
791 advance lexer len
792 else
793 let lexer = skip_to_end_of_line lexer in
794 let ch = peek_char lexer 0 in
795 if is_newline ch then
796 aux (skip_end_of_line lexer)
797 else
798 (* If we got here then we ran off the end of the file without
799 finding a newline. Just bail. *)
800 with_error lexer SyntaxError.error0011 in
801 aux lexer
803 let scan_docstring_literal lexer =
804 let (lexer, name, kind) = scan_docstring_header lexer in
805 let lexer = scan_docstring_remainder name lexer in
806 (lexer, kind)
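(* Example (informal) of the shape handled by the two functions above:

     $x = <<<EOT
     some text, possibly with $interpolation
     EOT;

   scan_docstring_header consumes <<<EOT plus the rest of that line and
   returns the name EOT; scan_docstring_remainder then advances line by line
   until it reaches a line beginning with EOT followed by a newline, or by a
   semicolon and a newline. *)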
808 let scan_xhp_label lexer =
809 (* An XHP label has the same grammar as a Hack name. *)
810 let (lexer, _) = scan_name lexer in
811 lexer
813 let rec scan_xhp_element_name ?(attribute=false) lexer =
814 (* An XHP element name is a sequence of one or more XHP labels each separated
815 by a single : or -. Note that it is possible for an XHP element name to be
816 followed immediately by a : or - that is the next token, so if we find
817 a : or - not followed by a label, we need to terminate the token. *)
818 let lexer = scan_xhp_label lexer in
819 let ch0 = peek_char lexer 0 in
820 let ch1 = peek_char lexer 1 in
821 if (not attribute && ch0 = ':' || ch0 = '-') && is_name_nondigit ch1 then
822 scan_xhp_element_name (advance lexer 1)
823 else
824 (lexer, TokenKind.XHPElementName)
826 (* Is the next token we're going to lex a possible xhp class name? *)
827 let is_xhp_class_name lexer =
828 (peek_char lexer 0 = ':') && (is_name_nondigit (peek_char lexer 1))
830 let scan_xhp_class_name lexer =
831 (* An XHP class name is a colon followed by an xhp name. *)
832 if is_xhp_class_name lexer then
833 let (lexer, _) = scan_xhp_element_name (advance lexer 1) in
834 (lexer, TokenKind.XHPClassName)
835 else
836 let lexer = with_error lexer SyntaxError.error0008 in
837 (advance lexer 1, TokenKind.ErrorToken)
839 let scan_xhp_string_literal lexer =
840 (* XHP string literals are just straight up "find the closing quote"
841 strings. Embedded newlines are legal. *)
842 let rec aux lexer offset =
843 match peek_char lexer offset with
844 | '\000' ->
845 let lexer = advance lexer offset in
846 if at_end lexer then
847 let lexer = with_error lexer SyntaxError.error0012 in
848 (lexer, TokenKind.XHPStringLiteral)
849 else
850 let lexer = with_error lexer SyntaxError.error0006 in
851 aux lexer 1
852 | '"' -> (advance lexer (offset + 1), TokenKind.XHPStringLiteral)
853 | _ -> aux lexer (offset + 1) in
854 aux lexer 1
856 (* Note that this does not scan an XHP body *)
857 let scan_xhp_token lexer =
858 (* TODO: HHVM requires that there be no trivia between < and name in an
859 opening tag, but does allow trivia between </ and name in a closing tag.
860 Consider allowing trivia in an opening tag. *)
861 let ch0 = peek_char lexer 0 in
862 if ch0 = invalid && at_end lexer then
863 (lexer, TokenKind.EndOfFile)
864 else if is_name_nondigit ch0 then
865 scan_xhp_element_name lexer
866 else match ch0 with
867 | '{' -> (advance lexer 1, TokenKind.LeftBrace)
868 | '}' -> (advance lexer 1, TokenKind.RightBrace)
869 | '=' -> (advance lexer 1, TokenKind.Equal)
870 | '<' ->
871 if (peek_char lexer 1) = '/' then
872 (advance lexer 2, TokenKind.LessThanSlash)
873 else
874 (advance lexer 1, TokenKind.LessThan)
875 | '"' -> scan_xhp_string_literal lexer
876 | '/' ->
877 if (peek_char lexer 1) = '>' then
878 (advance lexer 2, TokenKind.SlashGreaterThan)
879 else
880 let lexer = with_error lexer SyntaxError.error0006 in
881 (advance lexer 1, TokenKind.ErrorToken)
882 | '>' -> (advance lexer 1, TokenKind.GreaterThan)
883 | _ ->
884 let lexer = with_error lexer SyntaxError.error0006 in
885 (advance lexer 1, TokenKind.ErrorToken)
887 let scan_xhp_comment lexer =
888 let rec aux lexer offset =
889 let ch0 = peek_char lexer offset in
890 let ch1 = peek_char lexer (offset + 1) in
891 let ch2 = peek_char lexer (offset + 2) in
892 match (ch0, ch1, ch2) with
893 | ('\000', _, _) -> with_error (advance lexer offset) SyntaxError.error0014
894 | ('-', '-', '>') -> (advance lexer (offset + 3))
895 | _ -> aux lexer (offset + 1) in
896 aux lexer 4
898 let scan_xhp_body lexer =
899 (* Naively you might think that an XHP body is just a bunch of characters,
900 terminated by an embedded { } expression or a tag. However, whitespace
901 and newlines are relevant in XHP bodies because they are "soft".
902 That is, any section of contiguous trivia has the same semantics as a
903 single space or newline -- just as in HTML.
905 Obviously this is of relevance to code formatters.
907 Therefore we detect whitespace and newlines within XHP bodies and treat
908 it as trivia surrounding the tokens within the body.
910 TODO: Is this also true of whitespace within XHP comments? If so then
911 we need to make XHP comments a sequence of tokens, rather than a
912 single token as they are now.
*)
914 let rec aux lexer offset =
915 let ch = peek_char lexer offset in
916 match ch with
917 | '\000' ->
918 let lexer = advance lexer offset in
919 if at_end lexer then
920 let lexer = with_error lexer SyntaxError.error0013 in
921 lexer
922 else
923 let lexer = with_error lexer SyntaxError.error0006 in
924 aux lexer 1
925 | '\t' | ' ' | '\r' | '\n' | '{' | '}' | '<' -> advance lexer offset
926 | _ -> aux lexer (offset + 1) in
927 let ch0 = peek_char lexer 0 in
928 match ch0 with
929 | '\000' when at_end lexer -> (lexer, TokenKind.EndOfFile)
930 | '{' -> (advance lexer 1, TokenKind.LeftBrace)
931 | '}' -> (advance lexer 1, TokenKind.RightBrace)
932 | '<' -> begin
933 let ch1 = peek_char lexer 1 in
934 let ch2 = peek_char lexer 2 in
935 let ch3 = peek_char lexer 3 in
936 match (ch1, ch2, ch3) with
937 | ('!', '-', '-') -> (scan_xhp_comment lexer, TokenKind.XHPComment)
938 | ('/', _, _) -> (advance lexer 2, TokenKind.LessThanSlash)
939 | _ -> (advance lexer 1, TokenKind.LessThan)
end
941 | _ -> ((aux lexer 0), TokenKind.XHPBody)
943 let scan_dollar_token lexer =
(*
945 We have a problem here. We wish to be able to lexically analyze both
946 PHP and Hack, but the introduction of $$ to Hack makes them incompatible.
947 "$$x" and "$$ $x" are legal in PHP, but illegal in Hack.
948 The rule in PHP seems to be that $ is a prefix operator, it is a token,
949 it can be followed by trivia, but the next token has to be another $
950 operator, a variable $x, or a {.
952 Here's a reasonable compromise. (TODO: Review this decision.)
954 $$x lexes as $ $x
955 $$$x lexes as $ $ $x
956 and so on.
958 $$ followed by anything other than a name or a $ lexes as $$.
960 This means that lexing a PHP program which contains "$$ $x" differs from
961 what PHP does: such a program will fail at parse time, but I'm willing to live with that.
963 This means that lexing a Hack program which contains
964 "$x |> $$instanceof Foo" produces an error as well.
966 If these decisions are unacceptable then we will need to make the lexer
967 be aware of whether it is lexing PHP or Hack; thus far we have not had
968 to make this distinction.
*)
971 (* We are already at $. *)
972 let ch1 = peek_char lexer 1 in
973 match ch1 with
974 | '$' ->
975 let ch2 = peek_char lexer 2 in
976 if ch2 = '$' || is_name_nondigit ch2 then
977 (advance lexer 1, TokenKind.Dollar) (* $$x or $$$*)
978 else
979 (advance lexer 2, TokenKind.DollarDollar) (* $$ *)
980 | _ ->
981 if is_name_nondigit ch1 then scan_variable lexer (* $x *)
982 else (advance lexer 1, TokenKind.Dollar) (* $ *)
984 let rec scan_token_impl : bool -> lexer -> (lexer * TokenKind.t) =
985 fun in_type lexer ->
986 let ch0 = peek_char lexer 0 in
987 match ch0 with
988 | '[' -> (advance lexer 1, TokenKind.LeftBracket)
989 | ']' -> (advance lexer 1, TokenKind.RightBracket)
990 | '(' -> (advance lexer 1, TokenKind.LeftParen)
991 | ')' -> (advance lexer 1, TokenKind.RightParen)
992 | '{' -> (advance lexer 1, TokenKind.LeftBrace)
993 | '}' -> (advance lexer 1, TokenKind.RightBrace)
994 | '.' -> begin
995 match peek_char lexer 1 with
996 | '=' -> (advance lexer 2, TokenKind.DotEqual)
997 | '0' .. '9' ->
998 scan_after_decimal_point lexer
999 | '.' ->
1000 if (peek_char lexer 2) = '.' then (advance lexer 3, TokenKind.DotDotDot)
1001 else (advance lexer 1, TokenKind.Dot)
1002 | _ -> (advance lexer 1, TokenKind.Dot)
end
1004 | '-' -> begin
1005 match peek_char lexer 1 with
1006 | '=' -> (advance lexer 2, TokenKind.MinusEqual)
1007 | '-' -> (advance lexer 2, TokenKind.MinusMinus)
1008 | '>' -> (advance lexer 2, TokenKind.MinusGreaterThan)
1009 | _ -> (advance lexer 1, TokenKind.Minus)
end
1011 | '+' -> begin
1012 match peek_char lexer 1 with
1013 | '=' -> (advance lexer 2, TokenKind.PlusEqual)
1014 | '+' -> (advance lexer 2, TokenKind.PlusPlus)
1015 | _ -> (advance lexer 1, TokenKind.Plus)
end
1017 | '*' -> begin
1018 match (peek_char lexer 1, peek_char lexer 2) with
1019 | ('=', _) -> (advance lexer 2, TokenKind.StarEqual)
1020 | ('*', '=') -> (advance lexer 3, TokenKind.StarStarEqual)
1021 | ('*', _) -> (advance lexer 2, TokenKind.StarStar)
1022 | _ -> (advance lexer 1, TokenKind.Star)
end
1024 | '~' -> (advance lexer 1, TokenKind.Tilde)
1025 | '!' -> begin
1026 match (peek_char lexer 1, peek_char lexer 2) with
1027 | ('=', '=') -> (advance lexer 3, TokenKind.ExclamationEqualEqual)
1028 | ('=', _) -> (advance lexer 2, TokenKind.ExclamationEqual)
1029 | _ -> (advance lexer 1, TokenKind.Exclamation)
end
1031 | '$' -> scan_dollar_token lexer
1032 | '/' ->
1033 if (peek_char lexer 1) = '=' then (advance lexer 2, TokenKind.SlashEqual)
1034 else (advance lexer 1, TokenKind.Slash)
1035 | '%' ->
1036 if (peek_char lexer 1) = '=' then (advance lexer 2, TokenKind.PercentEqual)
1037 else (advance lexer 1, TokenKind.Percent)
1038 | '<' -> begin
1039 match (peek_char lexer 1, peek_char lexer 2) with
1040 | ('<', '<') -> scan_docstring_literal lexer
1041 | ('<', '=') -> (advance lexer 3, TokenKind.LessThanLessThanEqual)
1042 (* TODO: We lex and parse the spaceship operator.
1043 TODO: This is not in the spec at present. We should either make it an
1044 TODO: error, or add it to the specification. *)
1045 | ('=', '>') -> (advance lexer 3, TokenKind.LessThanEqualGreaterThan)
1046 | ('>', _) -> (advance lexer 2, TokenKind.LessThanGreaterThan)
1047 | ('=', _) -> (advance lexer 2, TokenKind.LessThanEqual)
1048 | ('<', _) -> (advance lexer 2, TokenKind.LessThanLessThan)
1049 | _ -> (advance lexer 1, TokenKind.LessThan)
end
1051 | '>' -> begin
1052 match (peek_char lexer 1, peek_char lexer 2) with
1053 | ('>', '=') -> (advance lexer 3, TokenKind.GreaterThanGreaterThanEqual)
1054 | ('>', _) ->
1055 (* If we are parsing a generic type argument list then we might be
1056 at the >> in List<List<int>>. In that case we want to lex two
1057 >'s, not one >>. *)
1058 if in_type then
1059 (advance lexer 1, TokenKind.GreaterThan)
1060 else
1061 (advance lexer 2, TokenKind.GreaterThanGreaterThan)
1062 | ('=', _) -> (advance lexer 2, TokenKind.GreaterThanEqual)
1063 | _ -> (advance lexer 1, TokenKind.GreaterThan)
end
1065 | '=' -> begin
1066 match (peek_char lexer 1, peek_char lexer 2) with
1067 | ('=', '=') -> (advance lexer 3, TokenKind.EqualEqualEqual)
1068 | ('=', '>') -> (advance lexer 3, TokenKind.EqualEqualGreaterThan)
1069 | ('=', _) -> (advance lexer 2, TokenKind.EqualEqual)
1070 | ('>', _) -> (advance lexer 2, TokenKind.EqualGreaterThan)
1071 | _ -> (advance lexer 1, TokenKind.Equal)
end
1073 | '^' ->
1074 if (peek_char lexer 1) = '=' then (advance lexer 2, TokenKind.CaratEqual)
1075 else (advance lexer 1, TokenKind.Carat)
1076 | '|' -> begin
1077 match peek_char lexer 1 with
1078 | '=' -> (advance lexer 2, TokenKind.BarEqual)
1079 | '>' -> (advance lexer 2, TokenKind.BarGreaterThan)
1080 | '|' -> (advance lexer 2, TokenKind.BarBar)
1081 | _ -> (advance lexer 1, TokenKind.Bar)
end
1083 | '&' -> begin
1084 match peek_char lexer 1 with
1085 | '=' -> (advance lexer 2, TokenKind.AmpersandEqual)
1086 | '&' -> (advance lexer 2, TokenKind.AmpersandAmpersand)
1087 | _ -> (advance lexer 1, TokenKind.Ampersand)
end
1089 | '?' -> begin
1090 match (peek_char lexer 1, peek_char lexer 2) with
1091 | (':', _) when not in_type -> (advance lexer 2, TokenKind.QuestionColon)
1092 | ('-', '>') -> (advance lexer 3, TokenKind.QuestionMinusGreaterThan)
1093 | ('?', _) -> (advance lexer 2, TokenKind.QuestionQuestion)
1094 | ('>', _) -> (advance lexer 2, TokenKind.QuestionGreaterThan)
1095 | _ -> (advance lexer 1, TokenKind.Question)
end
1097 | ':' ->
1098 if (peek_char lexer 1) = ':' then (advance lexer 2, TokenKind.ColonColon)
1099 else (advance lexer 1, TokenKind.Colon)
1100 | ';' -> (advance lexer 1, TokenKind.Semicolon)
1101 | ',' -> (advance lexer 1, TokenKind.Comma)
1102 | '@' -> (advance lexer 1, TokenKind.At)
1103 | '0' -> begin
1104 match peek_char lexer 1 with
1105 | 'x' | 'X' -> scan_hex_literal (advance lexer 2)
1106 | 'b' | 'B' -> scan_binary_literal (advance lexer 2)
1107 | _ -> scan_octal_or_float lexer
end
1109 | '1' .. '9' ->
1110 scan_decimal_or_float lexer
1111 | '\'' -> scan_single_quote_string_literal lexer
1112 | '`' -> scan_double_quote_like_string_literal_from_start lexer '`'
1113 | '"' -> scan_double_quote_like_string_literal_from_start lexer '"'
1114 | '\\' -> (advance lexer 1, TokenKind.Backslash)
1115 | 'b' when let c1 = peek_char lexer 1 in
1116 let c2 = peek_char lexer 2 in
1117 let c3 = peek_char lexer 3 in
1118 c1 = '"' || c1 = '\'' || (c1 = '<' && c2 = '<' && c3 = '<') ->
1119 let lexer = advance lexer 1 in scan_token_impl in_type lexer
1120 (* Names *)
1121 | _ ->
1122 if ch0 = invalid && at_end lexer then
1123 (lexer, TokenKind.EndOfFile)
1124 else if is_name_nondigit ch0 then
1125 scan_name lexer
1126 else
1127 let lexer = with_error lexer SyntaxError.error0006 in
1128 (advance lexer 1, TokenKind.ErrorToken)
1130 let scan_token : bool -> lexer -> lexer * TokenKind.t =
1131 fun in_type lexer ->
1132 Stats_container.wrap_nullary_fn_timing
1133 ?stats:(Stats_container.get_instance ())
1134 ~key:"full_fidelity_lexer:scan_token"
1135 ~f:(fun () -> scan_token_impl in_type lexer)
1137 let scan_token_inside_type = scan_token true
1138 let scan_token_outside_type = scan_token false
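(* Example (informal): when the parser is inside a generic type argument list
   such as Vector<Vector<int>> it uses scan_token_inside_type, so the '>' case
   above returns two separate GreaterThan tokens for the trailing >> instead of
   a single GreaterThanGreaterThan shift token. *)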
1140 (* Lexing trivia *)
1142 (* SPEC:
1144 * white-space-character::
1145 * new-line
1146 * Space character (U+0020)
1147 * Horizontal-tab character (U+0009)
1149 * single-line-comment::
1150 * // input-characters-opt
1151 * # input-characters-opt
1153 * new-line::
1154 * Carriage-return character (U+000D)
1155 * Line-feed character (U+000A)
1156 * Carriage-return character followed by line-feed character
*)
1159 let str_scan_end_of_line ~str ~i =
1160 match str.[i] with
1161 | '\r' -> begin
1162 match str.[succ i] with
1163 | '\n' -> 2 + i
1164 | _ -> succ i
1165 | exception Invalid_argument _ -> succ i
end
1167 | '\n' -> succ i
1168 | _ -> failwith "str_scan_end_of_line called while not on end of line!"
1169 | exception Invalid_argument _ -> succ i
1171 let scan_end_of_line lexer =
1172 match peek_char lexer 0 with
1173 | '\r' ->
1174 let w = if peek_char lexer 1 = '\n' then 2 else 1 in
1175 advance lexer w, Trivia.make_eol (source lexer) (start lexer) w
1176 | '\n' -> (advance lexer 1, Trivia.make_eol (source lexer) (start lexer) 1)
1177 | _ -> failwith "scan_end_of_line called while not on end of line!"
1179 let scan_hash_comment lexer =
1180 let lexer = skip_to_end_of_line lexer in
1181 let c = Trivia.make_single_line_comment
1182 (source lexer) (start lexer) (width lexer) in
1183 (lexer, c)
1185 let scan_single_line_comment lexer =
1186 (* A fallthrough comment is two slashes, any amount of whitespace,
1187 FALLTHROUGH, and the end of the line.
1188 An unsafe comment is two slashes, any amount of whitespace,
1189 UNSAFE, and then any characters may follow.
1190 TODO: Consider allowing trailing space for fallthrough.
1191 TODO: Consider allowing lowercase fallthrough.
*)
1193 let lexer = advance lexer 2 in
1194 let lexer_ws = skip_whitespace lexer in
1195 let lexer = skip_to_end_of_line_or_end_tag lexer_ws in
1196 let w = width lexer in
1197 let remainder = offset lexer - offset lexer_ws in
1198 let c =
1199 if remainder = 11 && peek_string lexer_ws 11 = "FALLTHROUGH" then
1200 Trivia.make_fallthrough (source lexer) (start lexer) w
1201 else if remainder >= 6 && peek_string lexer_ws 6 = "UNSAFE" then
1202 Trivia.make_unsafe (source lexer) (start lexer) w
1203 else
1204 Trivia.make_single_line_comment (source lexer) (start lexer) w in
1205 (lexer, c)
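(* Examples (informal): a line consisting of // FALLTHROUGH (with nothing after
   the keyword) yields fallthrough trivia, because exactly the 11 characters
   FALLTHROUGH remain between the skipped whitespace and the end of the line;
   a line starting with // UNSAFE may be followed by anything, because only the
   first six characters after the whitespace are compared. Anything else
   becomes ordinary single-line comment trivia. *)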
1207 let skip_to_end_of_delimited_comment lexer =
1208 let rec aux lexer offset =
1209 let ch0 = peek_char lexer offset in
1210 if ch0 = invalid then
1211 let lexer = advance lexer offset in
1212 if at_end lexer then
1213 with_error lexer SyntaxError.error0007
1214 else
1215 (* TODO: Do we want to give a warning for an embedded zero char
1216 inside a comment? *)
1217 aux lexer 1
1218 else if ch0 = '*' && (peek_char lexer (offset + 1)) = '/' then
1219 advance lexer (offset + 2)
1220 else aux lexer (offset + 1) in
1221 aux lexer 0
1223 let scan_delimited_comment lexer =
1224 (* An unsafe expression comment is a delimited comment that begins with any
1225 whitespace, followed by UNSAFE_EXPR, followed by any text.
1227 The original lexer lexes a fixme / ignore error as:
1229 slash star [whitespace]* HH_FIXME [whitespace or newline]* leftbracket
1230 [whitespace or newline]* integer [any text]* star slash
1232 Notice that the original lexer oddly enough does not verify that there
1233 is a right bracket.
1235 For our purposes we will just check for HH_FIXME / HH_IGNORE_ERROR;
1236 a later pass can try to parse out the integer if there is one,
1237 give a warning if there is not, and so on. *)
1239 let lexer = advance lexer 2 in
1240 let lexer_ws = skip_whitespace lexer in
1241 let lexer = skip_to_end_of_delimited_comment lexer_ws in
1242 let w = width lexer in
1243 let c =
1244 if match_string lexer_ws "UNSAFE_EXPR" then
1245 Trivia.make_unsafe_expression (source lexer) (start lexer) w
1246 else if match_string lexer_ws "HH_FIXME" then
1247 Trivia.make_fix_me (source lexer) (start lexer) w
1248 else if match_string lexer_ws "HH_IGNORE_ERROR" then
1249 Trivia.make_ignore_error (source lexer) (start lexer) w
1250 else
1251 Trivia.make_delimited_comment (source lexer) (start lexer) w in
1252 (lexer, c)
1255 let scan_php_trivia lexer =
1256 (* Hack does not support PHP style embedded markup:
1257 <?php
1258 if (x) {
1260 <foo>bar</foo>
1261 <?php
1262 } else { ... }
1264 However, ?> is never legal in Hack, so we can treat ?> ... any text ... <?php
1265 as a comment, and then give an error saying that this feature is not supported
1266 in Hack.
1268 TODO: Give an error if this appears in a Hack program.
*)
1270 match peek_char lexer 0 with
1271 | '#' ->
1272 let lexer = start_new_lexeme lexer in
1273 let (lexer, c) = scan_hash_comment lexer in
1274 (lexer, Some c)
1275 | '/' -> begin
1276 let lexer = start_new_lexeme lexer in
1277 match peek_char lexer 1 with
1278 | '/' ->
1279 let (lexer, c) = scan_single_line_comment lexer in
1280 (lexer, Some c)
1281 | '*' ->
1282 let (lexer, c) = scan_delimited_comment lexer in
1283 (lexer, Some c)
1284 | _ -> (lexer, None)
end
1286 | ' ' | '\t' ->
1287 let new_end = str_skip_whitespace ~str:(source_text_string lexer) ~i:(offset lexer) in
1288 let new_start = offset lexer in
1289 let new_trivia = Trivia.make_whitespace (source lexer) new_start (new_end - new_start) in
1290 (with_start_offset lexer new_start new_end, Some new_trivia)
1291 | '\r' | '\n' ->
1292 let lexer = start_new_lexeme lexer in
1293 let (lexer, e) = scan_end_of_line lexer in
1294 (lexer, Some e)
1295 | _ ->
1296 let lexer = start_new_lexeme lexer in
1297 (* Not trivia *) (lexer, None)
1299 let scan_xhp_trivia lexer =
1300 (* TODO: Should XHP comments <!-- --> be their own thing, or a kind of
1301 trivia associated with a token? Right now they are the former. *)
1302 let i = offset lexer in
1303 let ch = peek_char lexer 0 in
1304 match ch with
1305 | ' ' | '\t' ->
1306 let i' = str_skip_whitespace ~str:(source_text_string lexer) ~i in
1307 let lexer = with_start_offset lexer i i' in
1308 let trivium = Trivia.make_whitespace (source lexer) i (i' - i) in
1309 (lexer, Some trivium)
1310 | '\r' | '\n' ->
1311 let i' = str_scan_end_of_line ~str:(source_text_string lexer) ~i in
1312 let lexer = with_start_offset lexer i i' in
1313 let trivium = Trivia.make_eol (source lexer) i (i' - i) in
1314 (lexer, Some trivium)
1315 | _ -> (* Not trivia *)
1316 let lexer = start_new_lexeme lexer in (lexer, None)
(*
1319 We divide trivia into "leading" and "trailing" trivia of an associated
1320 token. This means that we must find a dividing line between the trailing trivia
1321 following one token and the leading trivia of the following token. Plainly
1322 we need only find this line while scanning trailing trivia. The heuristics
1323 we use are:
1324 * The first newline trivia encountered is the last trailing trivia.
1325 * The newline which follows a // or # comment is not part of the comment
1326 but does terminate the trailing trivia.
1327 * A pragma to turn checks off (HH_FIXME, HH_IGNORE_ERROR and UNSAFE_EXPR) is
1328 * always a leading trivia.
*)
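(* Example (informal): in

     $x = 1;  // note
     $y = 2;

   the space, the // note comment and the newline that ends the first line are
   trailing trivia of the semicolon token, the newline being the last trailing
   trivium; any indentation before $y is leading trivia of the $y token. *)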
1331 let scan_leading_trivia scanner lexer =
1332 let rec aux lexer acc =
1333 let (lexer, trivia) = scanner lexer in
1334 match trivia with
1335 | None -> (lexer, acc)
1336 | Some t -> aux lexer (t :: acc) in
1337 let (lexer, trivia_list) = aux lexer [] in
1338 (lexer, List.rev trivia_list)
1340 let scan_leading_php_trivia lexer =
1341 scan_leading_trivia scan_php_trivia lexer
1343 let scan_leading_xhp_trivia lexer =
1344 scan_leading_trivia scan_xhp_trivia lexer
1346 let scan_trailing_trivia scanner lexer =
1347 let rec aux lexer acc =
1348 let (lexer1, trivia) = scanner lexer in
1349 match trivia with
1350 | None -> (lexer1, acc)
1351 | Some t -> begin
1352 match Trivia.kind t with
1353 | TriviaKind.EndOfLine -> (lexer1, t :: acc)
1354 | TriviaKind.FixMe
1355 | TriviaKind.IgnoreError
1356 | TriviaKind.UnsafeExpression
1357 -> (lexer, acc)
1358 | _ -> aux lexer1 (t :: acc)
1359 end in
1360 let (lexer, trivia_list) = aux lexer [] in
1361 (lexer, List.rev trivia_list)
1363 let scan_trailing_php_trivia lexer =
1364 scan_trailing_trivia scan_php_trivia lexer
1366 let scan_trailing_xhp_trivia lexer =
1367 scan_trailing_trivia scan_xhp_trivia lexer
1369 let is_next_name lexer =
1370 let (lexer, _) = scan_leading_php_trivia lexer in
1371 is_name_nondigit (peek_char lexer 0)
1373 let is_next_xhp_class_name lexer =
1374 let (lexer, _) = scan_leading_php_trivia lexer in
1375 is_xhp_class_name lexer
1377 let as_case_insensitive_keyword text =
1378 (* Some keywords are case-insensitive in Hack or PHP. *)
1379 (* TODO: Consider making non-lowercase versions of these keywords errors
1380 in strict mode. *)
1381 (* TODO: Consider making these illegal, period, and code-modding away all
1382 non-lower versions in our codebase. *)
1383 let lower = String.lowercase_ascii text in
1384 match lower with
1385 | "eval" | "isset" | "unset" | "empty" | "const" | "new"
1386 | "and" | "or" | "xor" | "as" | "print" | "throw"
1387 | "true" | "false" | "null" | "array" | "instanceof"
1388 | "trait" | "class" | "interface" | "using" | "static" | "inout"
1389 | "self" | "parent" | "__halt_compiler" | "foreach" | "echo" -> lower
1390 | _ -> text
1392 let as_keyword kind lexer =
1393 if kind = TokenKind.Name then
1394 let text = as_case_insensitive_keyword (current_text lexer) in
1395 match TokenKind.from_string text with
1396 | Some keyword -> keyword
1397 | _ -> TokenKind.Name
1398 else
1399 kind
1401 (* scanner takes a lexer, returns a lexer and a kind *)
1402 let scan_token_and_leading_trivia scanner as_name lexer =
1403 (* Get past the leading trivia *)
1404 let (lexer, leading) = scan_leading_php_trivia lexer in
1405 (* Remember where we were when we started this token *)
1406 let lexer = start_new_lexeme lexer in
1407 let (lexer, kind) = scanner lexer in
1408 let kind = if as_name then kind else as_keyword kind lexer in
1409 let w = width lexer in
1410 (lexer, kind, w, leading)
1412 (* scanner takes a lexer, returns a lexer and a kind *)
1413 let scan_token_and_trivia scanner as_name lexer =
1414 let token_start = offset lexer in
1415 let (lexer, kind, w, leading) =
1416 scan_token_and_leading_trivia scanner as_name lexer in
1417 let (lexer, trailing) =
1418 match kind with
1419 | TokenKind.DoubleQuotedStringLiteralHead -> (lexer, [])
1420 | TokenKind.QuestionGreaterThan ->
1421 if is_newline (peek_char lexer 0) then
1422 (* consume only trailing EOL token after ?> as trailing trivia *)
1423 let (lexer, eol) = scan_end_of_line lexer in
1424 (lexer, [eol])
1425 else
1426 (lexer, [])
1427 | _ -> scan_trailing_php_trivia lexer in
1428 (lexer, Token.make kind (source lexer) token_start w leading trailing)
1430 (* tokenizer takes a lexer, returns a lexer and a token *)
1431 let scan_assert_progress tokenizer lexer =
1432 let original_remaining = remaining lexer in
1433 let (lexer, token) = tokenizer lexer in
1434 let new_remaining = remaining lexer in
1435 if (new_remaining < original_remaining ||
1436 original_remaining = 0 &&
1437 new_remaining = 0 &&
1438 (Token.kind token) = TokenKind.EndOfFile) then
1439 (lexer, token)
1440 else begin
1441 Printf.kprintf failwith
1442 "failed to make progress at %d\n" (offset lexer)
1445 let scan_next_token ~as_name scanner lexer =
1446 let tokenizer = scan_token_and_trivia scanner as_name in
1447 scan_assert_progress tokenizer lexer
1449 let scan_next_token_as_name = scan_next_token ~as_name:true
1450 let scan_next_token_as_keyword = scan_next_token ~as_name:false
1452 (* Entrypoints *)
1453 (* TODO: Instead of passing Boolean flags, create a flags enum? *)
1455 (* This function is the inner loop of the parser, is pure, and
1456 is frequently called twice in a row with the same lexer due to the
1457 design of the parser. We get a big win by memoizing it. *)
1460 let next_token = (* takes a lexer, returns a (lexer, token) *)
1461 let next_token_cache = Little_cache.make empty
1462 (empty, Token.make TokenKind.EndOfFile SourceText.empty 0 0 [] []) in
1463 Little_cache.memoize next_token_cache
1464 (scan_next_token_as_keyword scan_token_outside_type)
1466 let next_token_no_trailing lexer =
1467 let tokenizer lexer =
1468 let token_start = offset lexer in
1469 let (lexer, kind, w, leading) =
1470 scan_token_and_leading_trivia scan_token_outside_type false lexer in
1471 (lexer, Token.make kind (source lexer) token_start w leading []) in
1472 scan_assert_progress tokenizer lexer
1474 let next_token_in_string lexer literal_kind =
1475 let token_start = offset lexer in
1476 let lexer = start_new_lexeme lexer in
1477 (* We're inside a string. Do not scan leading trivia. *)
1478 let (lexer, kind) = scan_string_literal_in_progress lexer literal_kind in
1479 let w = width lexer in
1480 (* Only scan trailing trivia if we've finished the string. *)
1481 let (lexer, trailing) =
1482 match kind with
1483 | TokenKind.DoubleQuotedStringLiteralTail
1484 | TokenKind.HeredocStringLiteralTail -> scan_trailing_php_trivia lexer
1485 | _ -> (lexer, []) in
1486 let token = Token.make kind (source lexer) token_start w [] trailing in
1487 (lexer, token)
1489 let next_docstring_header lexer =
1490 (* We're at the beginning of a heredoc string literal. Scan leading
1491 trivia but not trailing trivia. *)
1492 let token_start = offset lexer in
1493 let (lexer, leading) = scan_leading_php_trivia lexer in
1494 let lexer = start_new_lexeme lexer in
1495 let (lexer, name, _) = scan_docstring_header lexer in
1496 let w = width lexer in
1497 let token = Token.make TokenKind.HeredocStringLiteralHead
1498 (source lexer) token_start w leading [] in
1499 (lexer, token, name)
1501 let next_token_as_name lexer =
1502 scan_next_token_as_name scan_token_outside_type lexer
1504 let next_token_in_type lexer =
1505 scan_next_token_as_keyword scan_token_inside_type lexer
1507 let next_xhp_element_token ~no_trailing lexer =
1508 (* XHP elements have whitespace, newlines and Hack comments. *)
1509 let tokenizer lexer =
1510 let token_start = offset lexer in
1511 let (lexer, kind, w, leading) =
1512 scan_token_and_leading_trivia scan_xhp_token true lexer in
1513 (* We do not scan trivia after an XHPOpen's >. If that is the beginning of
1514 an XHP body then we want any whitespace or newlines to be leading trivia
1515 of the body token. *)
1516 match kind with
1517 | TokenKind.GreaterThan
1518 | TokenKind.SlashGreaterThan when no_trailing ->
1519 (lexer, Token.make kind (source lexer) token_start w leading [])
1520 | _ ->
1521 let (lexer, trailing) = scan_trailing_php_trivia lexer in
1522 (lexer, Token.make
1523 kind (source lexer) token_start w leading trailing) in
1524 let (lexer, token) = scan_assert_progress tokenizer lexer in
1525 let token_width = Token.width token in
1526 let trailing_width = Token.trailing_width token in
1527 let token_start_offset = (offset lexer) - trailing_width - token_width in
1528 let token_text = SourceText.sub (source lexer) token_start_offset token_width in
1529 (lexer, token, token_text)
1531 let next_xhp_body_token lexer =
1532 let scanner lexer =
1533 let token_start = offset lexer in
1534 let (lexer, leading) = scan_leading_xhp_trivia lexer in
1535 let lexer = start_new_lexeme lexer in
1536 let (lexer, kind) = scan_xhp_body lexer in
1537 let w = width lexer in
1538 let (lexer, trailing) =
1539 (* Trivia (leading and trailing) is semantically
1540 significant for XHPBody tokens. When we find elements or
1541 braced expressions inside the body, the trivia should be
1542 seen as leading the next token, but we should certainly
1543 keep it trailing if this is an XHPBody token. *)
1544 if kind = TokenKind.XHPBody
1545 then scan_trailing_xhp_trivia lexer
1546 else (lexer, [])
in
1548 (lexer, Token.make kind (source lexer) token_start w leading trailing) in
1549 scan_assert_progress scanner lexer
1551 let next_xhp_class_name lexer =
1552 scan_token_and_trivia scan_xhp_class_name false lexer
1554 let next_xhp_name lexer =
1555 scan_token_and_trivia scan_xhp_element_name false lexer
1557 let make_markup_token lexer =
1558 Token.make TokenKind.Markup (source lexer) (start lexer) (width lexer) [] []
1560 let skip_to_end_of_markup lexer ~is_leading_section =
1561 let make_markup_and_suffix lexer =
1562 let markup_text = make_markup_token lexer in
1563 let less_than_question_token =
1564 Token.make TokenKind.LessThanQuestion (source lexer) (offset lexer) 2 [] []
in
1566 (* skip <? *)
1567 let lexer = advance lexer 2 in
1568 let name_token_offset = offset lexer in
1569 let make_long_tag lexer size =
1570 (* skip name*)
1571 let lexer = advance lexer size in
1572 (* single line comments that follow the language in leading markup_text
1573 determine the file check mode, read the trailing trivia and attach it
1574 to the language token *)
1575 let lexer, trailing =
1576 if is_leading_section then scan_trailing_php_trivia lexer
1577 else lexer, []
in
1579 let name = Token.make TokenKind.Name
1580 (source lexer) name_token_offset size [] trailing in
1581 lexer, markup_text, Some (less_than_question_token, Some name)
in
1583 let ch0 = peek_char lexer 0 in
1584 let ch1 = peek_char lexer 1 in
1585 let ch2 = peek_char lexer 2 in
1586 match ch0, ch1, ch2 with
1587 | ('H' | 'h'), ('H' | 'h'), _ -> make_long_tag lexer 2
1588 | ('P' | 'p'), ('H' | 'h'), ('P' | 'p') -> make_long_tag lexer 3
1589 | '=', _, _ ->
1590 begin
1591 (* skip = *)
1592 let lexer = advance lexer 1 in
1593 let equal = Token.make TokenKind.Equal
1594 (source lexer) name_token_offset 1 [] [] in
1595 lexer, markup_text, Some (less_than_question_token, Some equal)
end
1597 | _ ->
1598 lexer, markup_text, Some (less_than_question_token, None)
in
1600 let rec aux lexer index =
1601 (* It's not an error to run off the end of one of these. *)
1602 if at_end_index lexer index then
1603 let lexer' = with_offset lexer index in
1604 lexer', (make_markup_token lexer'), None
1605 else begin
1606 let ch = peek lexer index in
1607 if ch = '<' && peek_def lexer (succ index) ~def:'\x00' = '?' then
1608 (* Found a beginning tag that delimits markup from the script *)
1609 make_markup_and_suffix (with_offset lexer index)
1610 else
1611 aux lexer (succ index)
end
in
1614 let start_offset =
1615 if is_leading_section
1616 then begin
1617 (* if leading section starts with #! - it should span the entire line *)
1618 let index = offset lexer in
1619 if peek_def ~def:'\x00' lexer index = '#' &&
1620 peek_def ~def:'\x00' lexer (succ index) = '!'
1621 then skip_while_to_offset lexer not_newline
1622 else index
end
1624 else offset lexer in
1625 aux lexer start_offset
1627 let scan_markup lexer ~is_leading_section =
1628 let lexer = start_new_lexeme lexer in
1629 skip_to_end_of_markup lexer ~is_leading_section
1631 let is_next_xhp_category_name lexer =
1632 let (lexer, _) = scan_leading_php_trivia lexer in
1633 (* An XHP category is an xhp element name preceded by a %. *)
1634 let ch0 = peek_char lexer 0 in
1635 let ch1 = peek_char lexer 1 in
1636 ch0 = '%' && is_name_nondigit ch1
1638 let scan_xhp_category_name lexer =
1639 if is_next_xhp_category_name lexer then
1640 let (lexer, _) = scan_xhp_element_name (advance lexer 1) in
1641 (lexer, TokenKind.XHPCategoryName)
1642 else
1643 scan_token false lexer
1645 let next_xhp_category_name lexer =
1646 scan_token_and_trivia scan_xhp_category_name false lexer
1648 let rescan_halt_compiler lexer last_token =
1649 (* __halt_compiler stops parsing of the file.
1650 In order to preserve fill fidelity aspect of the parser
1651 we pack everything that follows __halt_compiler as
1652 separate opaque kind of trivia - it will be attached as a trailing trivia
1653 to the last_token and existing trailing trivia will be merged in. *)
1654 let start_offset =
1655 Token.leading_start_offset last_token +
1656 Token.leading_width last_token +
1657 Token.width last_token in
1658 let source = source lexer in
1659 let length = SourceText.length source in
1660 let trailing =
1661 Trivia.make_after_halt_compiler
1662 source
1663 start_offset
1664 (length - start_offset) in
1665 Lexer.with_offset lexer length, Token.with_trailing [trailing] last_token
1667 end (* WithToken *)