1 %{ /* -*- mode: c++ -*- */
2 #include "hphp/parser/scanner.h"
// Types the generated scanner uses for a token's value and for its source
// location (Scanner::scan() hands m_token and m_loc to yylex()).
5 #define YYSTYPE HPHP::ScannerToken
6 #define YYLTYPE HPHP::Location
// Per-scanner user data is the owning HPHP::Scanner; Scanner::init() passes
// `this` to yylex_init_extra().
7 #define YY_EXTRA_TYPE HPHP::Scanner*
// Shorthand used throughout the rule actions for that Scanner pointer.
8 #define _scanner yyextra
// Route the scanner's input requests through Scanner::read().
9 #define YY_INPUT(buf,result,max) _scanner->read(buf,result,max)
10 #define YY_FATAL_ERROR(msg) \
12 struct yyguts_t *yyg = (struct yyguts_t *)yyscanner; \
13 _scanner->error(msg); \
16 #undef YY_READ_BUF_SIZE
/*
 * Buffer sizes for the generated scanner, in bytes.
 *
 * The expansions are parenthesized so that each macro behaves as a single
 * value wherever it is used: without the parentheses an expression such as
 * `x / YY_READ_BUF_SIZE` or `x % YY_BUF_SIZE` would bind to the leading
 * `1024` only and silently compute the wrong result.
 */
#define YY_READ_BUF_SIZE (1024 * 128) /* for reading from input */
#define YY_BUF_SIZE (1024 * 64) /* for pattern matching */
// Declares `cursor` as a reference to the scanner's current position
// (yyg->yy_c_buf_p), so advancing `cursor` advances the scanner itself, and
// writes the saved yy_hold_char back at that position. Needs `yyg` in scope.
21 #define DECLARE_YYCURSOR \
22 char *&cursor = yyg->yy_c_buf_p; *cursor = yyg->yy_hold_char;
// Declares `limit`, pointing yy_n_chars bytes past the start of the current
// buffer's character array. Needs `yyg` in scope.
23 #define DECLARE_YYLIMIT \
24 char *limit = YY_CURRENT_BUFFER->yy_ch_buf + yyg->yy_n_chars;
// Name under which the hand-written scanning code refers to `cursor`.
25 #define YYCURSOR cursor
// Inverse of the write-back done by DECLARE_YYCURSOR: saves the character
// under the cursor into yy_hold_char and overwrites it with '\0'.
// NOTE: expands to two statements, so it must not be used as the unbraced
// body of an if/else/loop.
27 #define RESET_YYCURSOR yyg->yy_hold_char = *YYCURSOR; *YYCURSOR = '\0';
30 #define RETTOKEN(t) do {_scanner->setToken(yytext, yyleng, t); return t;} \
32 #define RETSTEP(t) do {_scanner->stepPos(yytext, yyleng, t); return t;} \
// Non-returning counterparts of RETTOKEN / RETSTEP: hand the current match
// (yytext, yyleng) and token id `t` to the Scanner but leave the `return` to
// the rule action, for actions that still have work to do (e.g. a state
// change) before returning the token.
34 #define SETTOKEN(t) _scanner->setToken(yytext, yyleng, t)
35 #define STEPPOS(t) _scanner->stepPos(yytext, yyleng, t)
37 #define XHP_ONLY_KEYWORD(tok) do { \
38 RETTOKEN(_scanner->isXHPSyntaxEnabled() ? (tok) : T_STRING); \
41 #define HH_ONLY_KEYWORD(tok) do { \
42 RETTOKEN(_scanner->isHHSyntaxEnabled() ? (tok) : T_STRING); \
/*
 * IS_LABEL_START(c): true when the byte `c` may begin a label, i.e. it is an
 * ASCII letter, '_' or any byte in the range 0x7F-0xFF. This is the same set
 * as the leading character class of the LABEL pattern.
 *
 * `c` is read through a plain `char *` cursor. Where plain char is signed,
 * bytes 0x80-0xFF arrive as negative values and would never satisfy a bare
 * `(c) >= 0x7F`, so the value is converted to unsigned char before the range
 * test. The argument is treated as a single byte and is evaluated more than
 * once; pass an expression without side effects.
 */
#define IS_LABEL_START(c) \
  (((c) >= 'a' && (c) <= 'z') || ((c) >= 'A' && (c) <= 'Z') || \
   (c) == '_' || static_cast<unsigned char>(c) >= 0x7F)
50 * "Next token" types tell us how to treat a token based on the previous
51 * token for the purpose of recognizing XHP tags, XHP class names, XHP
52 * category names, and type lists.
54 * '<' will be treated as the start of an XHP tag
56 * '<' will be treated as possibly being the start of an XHP tag;
57 * we will scan ahead looking at subsequent characters to figure
58 * out if '<' is definitely the start of an XHP tag
60 * ':' will be treated as the start of an XHP class name
62 * '%' will be treated as the start of an XHP category name
64 * '<' should be recognized as possibly being the start of a type list;
65 * this will be resolved by inspecting subsequent tokens
// Bit flags describing how the token that follows should be scanned.
// getNextTokenType() returns them individually or OR-ed together, and the
// rule actions test them with `&`.
67 namespace NextTokenType {
// None of the special treatments below; returned on its own by
// getNextTokenType().
68 static const int Normal = 0x1;
// A following '<' is the start of an XHP tag.
69 static const int XhpTag = 0x2;
// A following '<' may start an XHP tag; the scanner looks ahead at the
// subsequent characters to decide.
70 static const int XhpTagMaybe = 0x4;
// A following ':' starts an XHP class name.
71 static const int XhpClassName = 0x8;
// A following '%' starts an XHP category name.
72 static const int XhpCategoryName = 0x10;
// A following '<' may start a type list; resolved by inspecting later tokens.
73 static const int TypeListMaybe = 0x20;
76 static int getNextTokenType(int t) {
78 case '=': case '.': case '+': case '-': case '*': case '/': case '%':
79 case '!': case '~': case '&': case '^': case '<': case '>': case '?':
80 case ':': case '[': case '{': case ';': case '@': case -1:
91 case T_IS_NOT_IDENTICAL:
92 case T_IS_SMALLER_OR_EQUAL:
93 case T_IS_GREATER_OR_EQUAL:
124 case T_UNRESOLVED_LT:
126 return NextTokenType::XhpTag |
127 NextTokenType::XhpClassName;
128 case ',': case '(': case '|':
129 return NextTokenType::XhpTag |
130 NextTokenType::XhpClassName |
131 NextTokenType::XhpCategoryName;
133 return NextTokenType::XhpTagMaybe |
134 NextTokenType::XhpClassName;
137 return NextTokenType::XhpTagMaybe;
144 return NextTokenType::XhpClassName;
150 return NextTokenType::TypeListMaybe;
151 case T_XHP_ATTRIBUTE:
152 return NextTokenType::XhpClassName |
153 NextTokenType::TypeListMaybe;
155 return NextTokenType::XhpCategoryName |
156 NextTokenType::TypeListMaybe;
158 return NextTokenType::Normal;
172 %x ST_LOOKING_FOR_PROPERTY
173 %x ST_LOOKING_FOR_VARNAME
174 %x ST_LOOKING_FOR_COLON
179 %x ST_ONE_LINE_COMMENT
182 %x ST_XHP_END_SINGLETON_TAG
183 %x ST_XHP_END_CLOSE_TAG
190 DNUM ([0-9]*[\.][0-9]+)|([0-9]+[\.][0-9]*)
191 EXPONENT_DNUM (({LNUM}|{DNUM})[eE][+-]?{LNUM})
192 HNUM "0x"[0-9a-fA-F]+
193 LABEL [a-zA-Z_\x7f-\xff][a-zA-Z0-9_\x7f-\xff]*
194 WHITESPACE [ \n\r\t]+
195 TABS_AND_SPACES [ \t]*
196 TOKENS [;:,.\[\]()|^&+\-*/=%!~$<>?@]
198 NEWLINE ("\r"|"\n"|"\r\n")
199 XHPLABEL {LABEL}([:-]{LABEL})*
/*
 * COMMENT_REGEX matches one complete comment: either a block comment or a
 * one-line comment (introduced by two slashes or a hash) including its
 * terminating newline.
 *
 * Block comment body: any non-star character, or a run of stars followed by
 * a character that is neither a star nor a slash; the comment then closes
 * with a run of one or more stars and a slash. Handling runs of stars this
 * way lets comments that end in several stars match, and stops the match at
 * the first terminator instead of running on to a later one.
 */
COMMENT_REGEX ("/*"([^*]|"*"+[^*/])*"*"+"/"|("//"|"#")[^\r\n]*{NEWLINE})
201 WHITESPACE_AND_COMMENTS ([ \n\r\t]|({COMMENT_REGEX}))+
204 * LITERAL_DOLLAR matches unescaped $ that aren't followed by a label character
205 * or a { and therefore will be taken literally. The case of literal $ before
206 * a variable or "${" is handled in a rule for each string type
208 DOUBLE_QUOTES_LITERAL_DOLLAR ("$"+([^a-zA-Z_\x7f-\xff$\"\\{]|("\\"{ANY_CHAR})))
209 BACKQUOTE_LITERAL_DOLLAR ("$"+([^a-zA-Z_\x7f-\xff$`\\{]|("\\"{ANY_CHAR})))
212 * CHARS matches everything up to a variable or "{$"
213 * {'s are matched as long as they aren't followed by a $
214 * The case of { before "{$" is handled in a rule for each string type
216 * For heredocs, matching continues across/after newlines if/when it's known
217 * that the next line doesn't contain a possible ending label
219 DOUBLE_QUOTES_CHARS ("{"*([^$\"\\{]|("\\"{ANY_CHAR}))|{DOUBLE_QUOTES_LITERAL_DOLLAR})
220 BACKQUOTE_CHARS ("{"*([^$`\\{]|("\\"{ANY_CHAR}))|{BACKQUOTE_LITERAL_DOLLAR})
224 <ST_IN_SCRIPTING>"exit" { RETTOKEN(T_EXIT);} /* reserved words: each rule records the match and returns its token */
225 <ST_IN_SCRIPTING>"die" { RETTOKEN(T_EXIT);} /* same token as "exit" */
226 <ST_IN_SCRIPTING>"function" { RETTOKEN(T_FUNCTION);}
227 <ST_IN_SCRIPTING>"const" { RETTOKEN(T_CONST);}
228 <ST_IN_SCRIPTING>"return" { RETTOKEN(T_RETURN); }
229 <ST_IN_SCRIPTING>"yield" { RETTOKEN(T_YIELD);}
230 <ST_IN_SCRIPTING>"try" { RETTOKEN(T_TRY);}
231 <ST_IN_SCRIPTING>"catch" { RETTOKEN(T_CATCH);}
232 <ST_IN_SCRIPTING>"finally" { RETTOKEN(T_FINALLY);}
233 <ST_IN_SCRIPTING>"throw" { RETTOKEN(T_THROW);}
234 <ST_IN_SCRIPTING>"if" { RETTOKEN(T_IF);}
235 <ST_IN_SCRIPTING>"elseif" { RETTOKEN(T_ELSEIF);}
236 <ST_IN_SCRIPTING>"endif" { RETTOKEN(T_ENDIF);}
237 <ST_IN_SCRIPTING>"else" { RETTOKEN(T_ELSE);}
238 <ST_IN_SCRIPTING>"while" { RETTOKEN(T_WHILE);}
239 <ST_IN_SCRIPTING>"endwhile" { RETTOKEN(T_ENDWHILE);}
240 <ST_IN_SCRIPTING>"do" { RETTOKEN(T_DO);}
241 <ST_IN_SCRIPTING>"for" { RETTOKEN(T_FOR);}
242 <ST_IN_SCRIPTING>"endfor" { RETTOKEN(T_ENDFOR);}
243 <ST_IN_SCRIPTING>"foreach" { RETTOKEN(T_FOREACH);}
244 <ST_IN_SCRIPTING>"endforeach" { RETTOKEN(T_ENDFOREACH);}
245 <ST_IN_SCRIPTING>"declare" { RETTOKEN(T_DECLARE);}
246 <ST_IN_SCRIPTING>"enddeclare" { RETTOKEN(T_ENDDECLARE);}
247 <ST_IN_SCRIPTING>"instanceof" { RETTOKEN(T_INSTANCEOF);}
248 <ST_IN_SCRIPTING>"as" { RETTOKEN(T_AS);}
249 <ST_IN_SCRIPTING>"switch" { RETTOKEN(T_SWITCH);}
250 <ST_IN_SCRIPTING>"endswitch" { RETTOKEN(T_ENDSWITCH);}
251 <ST_IN_SCRIPTING>"case" { RETTOKEN(T_CASE);}
252 <ST_IN_SCRIPTING>"default" { RETTOKEN(T_DEFAULT);}
253 <ST_IN_SCRIPTING>"break" { RETTOKEN(T_BREAK);}
254 <ST_IN_SCRIPTING>"continue" { RETTOKEN(T_CONTINUE);}
255 <ST_IN_SCRIPTING>"goto" { RETTOKEN(T_GOTO);}
256 <ST_IN_SCRIPTING>"echo" { RETTOKEN(T_ECHO);}
257 <ST_IN_SCRIPTING>"print" { RETTOKEN(T_PRINT);}
258 <ST_IN_SCRIPTING>"class" { RETTOKEN(T_CLASS);}
259 <ST_IN_SCRIPTING>"interface" { RETTOKEN(T_INTERFACE);}
260 <ST_IN_SCRIPTING>"trait" { RETTOKEN(T_TRAIT);}
261 <ST_IN_SCRIPTING>"insteadof" { RETTOKEN(T_INSTEADOF);}
262 <ST_IN_SCRIPTING>"extends" { RETTOKEN(T_EXTENDS);}
263 <ST_IN_SCRIPTING>"implements" { RETTOKEN(T_IMPLEMENTS);}
264 <ST_IN_SCRIPTING>"attribute" { XHP_ONLY_KEYWORD(T_XHP_ATTRIBUTE); } /* this and the next four: keyword only when XHP syntax is enabled, otherwise T_STRING */
265 <ST_IN_SCRIPTING>"category" { XHP_ONLY_KEYWORD(T_XHP_CATEGORY); }
266 <ST_IN_SCRIPTING>"children" { XHP_ONLY_KEYWORD(T_XHP_CHILDREN); }
267 <ST_IN_SCRIPTING>"required" { XHP_ONLY_KEYWORD(T_XHP_REQUIRED); }
268 <ST_IN_SCRIPTING>"enum" { XHP_ONLY_KEYWORD(T_XHP_ENUM); }
270 <ST_IN_SCRIPTING>"->" {
271 STEPPOS(T_OBJECT_OPERATOR);
272 yy_push_state(ST_LOOKING_FOR_PROPERTY, yyscanner);
273 return T_OBJECT_OPERATOR;
276 <ST_LOOKING_FOR_PROPERTY>"->" {
277 RETSTEP(T_OBJECT_OPERATOR);
280 <ST_LOOKING_FOR_PROPERTY>{LABEL} {
282 yy_pop_state(yyscanner);
286 <ST_LOOKING_FOR_PROPERTY>{WHITESPACE} {
287 RETSTEP(T_WHITESPACE);
290 <ST_LOOKING_FOR_PROPERTY>{ANY_CHAR} {
292 yy_pop_state(yyscanner);
295 <ST_IN_SCRIPTING>"::" { RETSTEP(T_DOUBLE_COLON);}
296 <ST_IN_SCRIPTING>"\\" { RETTOKEN(T_NS_SEPARATOR);}
297 <ST_IN_SCRIPTING>"new" { RETTOKEN(T_NEW);}
298 <ST_IN_SCRIPTING>"clone" { RETTOKEN(T_CLONE);}
299 <ST_IN_SCRIPTING>"var" { RETTOKEN(T_VAR);}
301 <ST_IN_SCRIPTING>"("{TABS_AND_SPACES}("int"|"integer"){TABS_AND_SPACES}")" {
302 if (_scanner->lastToken() != T_FUNCTION || !_scanner->isHHSyntaxEnabled()) {
309 <ST_IN_SCRIPTING>"("{TABS_AND_SPACES}("real"|"double"|"float"){TABS_AND_SPACES}")" {
310 if (_scanner->lastToken() != T_FUNCTION || !_scanner->isHHSyntaxEnabled()) {
311 RETSTEP(T_DOUBLE_CAST);
317 <ST_IN_SCRIPTING>"("{TABS_AND_SPACES}("string"|"binary"){TABS_AND_SPACES}")" {
318 if (_scanner->lastToken() != T_FUNCTION || !_scanner->isHHSyntaxEnabled()) {
319 RETSTEP(T_STRING_CAST);
325 <ST_IN_SCRIPTING>"("{TABS_AND_SPACES}"array"{TABS_AND_SPACES}")" {
326 if (_scanner->lastToken() != T_FUNCTION || !_scanner->isHHSyntaxEnabled()) {
327 RETSTEP(T_ARRAY_CAST);
333 <ST_IN_SCRIPTING>"("{TABS_AND_SPACES}"object"{TABS_AND_SPACES}")" {
334 if (_scanner->lastToken() != T_FUNCTION || !_scanner->isHHSyntaxEnabled()) {
335 RETSTEP(T_OBJECT_CAST);
341 <ST_IN_SCRIPTING>"("{TABS_AND_SPACES}("bool"|"boolean"){TABS_AND_SPACES}")" {
342 if (_scanner->lastToken() != T_FUNCTION || !_scanner->isHHSyntaxEnabled()) {
343 RETSTEP(T_BOOL_CAST);
349 <ST_IN_SCRIPTING>"("{TABS_AND_SPACES}("unset"){TABS_AND_SPACES}")" {
350 if (_scanner->lastToken() != T_FUNCTION || !_scanner->isHHSyntaxEnabled()) {
351 RETSTEP(T_UNSET_CAST);
357 <ST_IN_SCRIPTING>"eval" { RETTOKEN(T_EVAL);}
358 <ST_IN_SCRIPTING>"include" { RETTOKEN(T_INCLUDE);}
359 <ST_IN_SCRIPTING>"include_once" { RETTOKEN(T_INCLUDE_ONCE);}
360 <ST_IN_SCRIPTING>"require" { RETTOKEN(T_REQUIRE);}
361 <ST_IN_SCRIPTING>"require_once" { RETTOKEN(T_REQUIRE_ONCE);}
362 <ST_IN_SCRIPTING>"namespace" { RETTOKEN(T_NAMESPACE);}
363 <ST_IN_SCRIPTING>"use" { RETTOKEN(T_USE);}
364 <ST_IN_SCRIPTING>"global" { RETTOKEN(T_GLOBAL);}
365 <ST_IN_SCRIPTING>"isset" { RETTOKEN(T_ISSET);}
366 <ST_IN_SCRIPTING>"empty" { RETTOKEN(T_EMPTY);}
367 <ST_IN_SCRIPTING>"__halt_compiler" { RETTOKEN(T_HALT_COMPILER);}
368 <ST_IN_SCRIPTING>"__compiler_halt_offset__" { RETTOKEN(T_COMPILER_HALT_OFFSET);}
369 <ST_IN_SCRIPTING>"static" { RETTOKEN(T_STATIC);}
370 <ST_IN_SCRIPTING>"abstract" { RETTOKEN(T_ABSTRACT);}
371 <ST_IN_SCRIPTING>"final" { RETTOKEN(T_FINAL);}
372 <ST_IN_SCRIPTING>"private" { RETTOKEN(T_PRIVATE);}
373 <ST_IN_SCRIPTING>"protected" { RETTOKEN(T_PROTECTED);}
374 <ST_IN_SCRIPTING>"public" { RETTOKEN(T_PUBLIC);}
375 <ST_IN_SCRIPTING>"unset" { RETTOKEN(T_UNSET);}
376 <ST_IN_SCRIPTING>"=>" { RETSTEP(T_DOUBLE_ARROW);}
377 <ST_IN_SCRIPTING>"list" { RETTOKEN(T_LIST);}
378 <ST_IN_SCRIPTING>"array" { RETTOKEN(T_ARRAY);}
379 <ST_IN_SCRIPTING>"++" { RETSTEP(T_INC);}
380 <ST_IN_SCRIPTING>"--" { RETSTEP(T_DEC);}
381 <ST_IN_SCRIPTING>"===" { RETSTEP(T_IS_IDENTICAL);}
382 <ST_IN_SCRIPTING>"!==" { RETSTEP(T_IS_NOT_IDENTICAL);}
383 <ST_IN_SCRIPTING>"==" { RETSTEP(T_IS_EQUAL);}
384 <ST_IN_SCRIPTING>"!="|"<>" { RETSTEP(T_IS_NOT_EQUAL);}
385 <ST_IN_SCRIPTING>"<=" { RETSTEP(T_IS_SMALLER_OR_EQUAL);}
386 <ST_IN_SCRIPTING>">=" { RETSTEP(T_IS_GREATER_OR_EQUAL);}
387 <ST_IN_SCRIPTING>"+=" { RETSTEP(T_PLUS_EQUAL);}
388 <ST_IN_SCRIPTING>"-=" { RETSTEP(T_MINUS_EQUAL);}
389 <ST_IN_SCRIPTING>"*=" { RETSTEP(T_MUL_EQUAL);}
390 <ST_IN_SCRIPTING>"/=" { RETSTEP(T_DIV_EQUAL);}
391 <ST_IN_SCRIPTING>".=" { RETSTEP(T_CONCAT_EQUAL);}
392 <ST_IN_SCRIPTING>"%=" { RETSTEP(T_MOD_EQUAL);}
393 <ST_IN_SCRIPTING>"<<=" { RETSTEP(T_SL_EQUAL);}
394 <ST_IN_SCRIPTING>">>=" { RETSTEP(T_SR_EQUAL);}
395 <ST_IN_SCRIPTING>"&=" { RETSTEP(T_AND_EQUAL);}
396 <ST_IN_SCRIPTING>"|=" { RETSTEP(T_OR_EQUAL);}
397 <ST_IN_SCRIPTING>"^=" { RETSTEP(T_XOR_EQUAL);}
398 <ST_IN_SCRIPTING>"||" { RETSTEP(T_BOOLEAN_OR);}
399 <ST_IN_SCRIPTING>"&&" { RETSTEP(T_BOOLEAN_AND);}
400 <ST_IN_SCRIPTING>"OR" { RETTOKEN(T_LOGICAL_OR);}
401 <ST_IN_SCRIPTING>"AND" { RETTOKEN(T_LOGICAL_AND);}
402 <ST_IN_SCRIPTING>"XOR" { RETTOKEN(T_LOGICAL_XOR);}
403 <ST_IN_SCRIPTING>"<<" { RETSTEP(T_SL);}
405 <ST_IN_SCRIPTING>"shape" { HH_ONLY_KEYWORD(T_SHAPE); }
406 <ST_IN_SCRIPTING>"type" { HH_ONLY_KEYWORD(T_UNRESOLVED_TYPE); }
407 <ST_IN_SCRIPTING>"newtype" { HH_ONLY_KEYWORD(T_UNRESOLVED_NEWTYPE); }
408 <ST_IN_SCRIPTING>"await" { HH_ONLY_KEYWORD(T_AWAIT);}
409 <ST_IN_SCRIPTING>"async"/{WHITESPACE_AND_COMMENTS}[a-zA-Z0-9_\x7f-\xff] {
410 HH_ONLY_KEYWORD(T_ASYNC);
413 <ST_IN_SCRIPTING>"tuple"/("("|{WHITESPACE_AND_COMMENTS}"(") {
414 HH_ONLY_KEYWORD(T_TUPLE);
417 <ST_IN_SCRIPTING>"?"/":"[a-zA-Z_\x7f-\xff] {
418 int ntt = getNextTokenType(_scanner->lastToken());
419 if (!_scanner->isXHPSyntaxEnabled() ||
420 ((ntt & NextTokenType::XhpClassName) && _scanner->lastToken() != '}')) {
423 /* If XHP is enabled and "?:" occurs in a place where an XHP class name is
424 not expected or it occurs after "}", drop into the ST_LOOKING_FOR_COLON
425 state to avoid potentially treating ":" as the beginning of an XHP class
427 BEGIN(ST_LOOKING_FOR_COLON);
431 <ST_LOOKING_FOR_COLON>":" {
432 BEGIN(ST_IN_SCRIPTING);
436 <ST_IN_SCRIPTING>"..." {
437 if (!_scanner->isHHSyntaxEnabled()) {
444 <ST_IN_SCRIPTING>">>" {
445 if (_scanner->getLookaheadLtDepth() < 2) {
452 <ST_IN_SCRIPTING>"<"[a-zA-Z_\x7f-\xff] {
453 if (!_scanner->isXHPSyntaxEnabled()) {
454 assert(!_scanner->isHHSyntaxEnabled());
458 int ntt = getNextTokenType(_scanner->lastToken());
459 if (ntt & NextTokenType::XhpTag) {
461 STEPPOS(T_XHP_TAG_LT);
462 yy_push_state(ST_XHP_IN_TAG, yyscanner);
465 if (ntt & NextTokenType::XhpTagMaybe) {
466 // Shift to state ST_LT_CHECK to do a more extensive check to
467 // determine if this is the beginning of an XHP tag.
473 if (_scanner->isHHSyntaxEnabled() && (ntt & NextTokenType::TypeListMaybe)) {
474 // Return T_UNRESOLVED_LT; the scanner will inspect subsequent tokens
476 RETSTEP(T_UNRESOLVED_LT);
481 <ST_IN_SCRIPTING>"<" {
482 if (_scanner->isHHSyntaxEnabled()) {
483 int ntt = getNextTokenType(_scanner->lastToken());
484 if (ntt & NextTokenType::TypeListMaybe) {
485 // Return T_UNRESOLVED_LT; the scanner will inspect subsequent tokens
487 RETSTEP(T_UNRESOLVED_LT);
493 <ST_LT_CHECK>"<"{XHPLABEL}(">"|"/>"|{WHITESPACE_AND_COMMENTS}(">"|"/>"|[a-zA-Z_\x7f-\xff])) {
494 BEGIN(ST_IN_SCRIPTING);
496 STEPPOS(T_XHP_TAG_LT);
497 yy_push_state(ST_XHP_IN_TAG, yyscanner);
502 BEGIN(ST_IN_SCRIPTING);
506 <ST_IN_SCRIPTING>":"{XHPLABEL} {
507 if (_scanner->isXHPSyntaxEnabled()) {
508 int ntt = getNextTokenType(_scanner->lastToken());
509 if (ntt & NextTokenType::XhpClassName) {
510 yytext++; yyleng--; // skipping the first colon
511 RETTOKEN(T_XHP_LABEL);
518 <ST_IN_SCRIPTING>"%"{XHPLABEL} {
519 if (_scanner->isXHPSyntaxEnabled()) {
520 int ntt = getNextTokenType(_scanner->lastToken());
521 if (ntt & NextTokenType::XhpCategoryName) {
522 yytext++; yyleng--; // skipping "%"
523 RETTOKEN(T_XHP_CATEGORY_LABEL);
530 <ST_IN_SCRIPTING>{TOKENS} {RETSTEP(yytext[0]);}
532 <ST_IN_SCRIPTING>"{" {
534 yy_push_state(ST_IN_SCRIPTING, yyscanner);
538 <ST_DOUBLE_QUOTES,ST_BACKQUOTE,ST_HEREDOC>"${" {
539 STEPPOS(T_DOLLAR_OPEN_CURLY_BRACES);
540 yy_push_state(ST_LOOKING_FOR_VARNAME, yyscanner);
541 return T_DOLLAR_OPEN_CURLY_BRACES;
544 <ST_IN_SCRIPTING>"}"/":"[a-zA-Z_\x7f-\xff] {
546 // We need to be robust against a '}' in PHP code with
547 // no corresponding '{'
548 struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
549 if (yyg->yy_start_stack_ptr) {
550 yy_pop_state(yyscanner);
551 if (YY_START == ST_IN_SCRIPTING) {
552 /* If XHP is enabled and "}:" occurs (and "}" does not cause us
553 to transition to some state other than ST_IN_SCRIPTING), drop
554 into the ST_LOOKING_FOR_COLON state to avoid potentially
555 treating ":" as the beginning of an XHP class name */
556 BEGIN(ST_LOOKING_FOR_COLON);
562 <ST_IN_SCRIPTING>"}" {
564 // We need to be robust against a '}' in PHP code with
565 // no corresponding '{'
566 struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
567 if (yyg->yy_start_stack_ptr) yy_pop_state(yyscanner);
571 <ST_LOOKING_FOR_VARNAME>{LABEL} {
572 SETTOKEN(T_STRING_VARNAME);
573 // Change state to IN_SCRIPTING; current state will be popped
574 // when we encounter '}'
575 BEGIN(ST_IN_SCRIPTING);
576 return T_STRING_VARNAME;
579 <ST_LOOKING_FOR_VARNAME>{ANY_CHAR} {
581 // Change state to IN_SCRIPTING; current state will be popped
582 // when we encounter '}'
583 BEGIN(ST_IN_SCRIPTING);
586 <ST_IN_SCRIPTING,ST_XHP_IN_TAG>{LNUM} {
588 long ret = strtoll(yytext, NULL, 0);
589 if (errno == ERANGE || ret < 0) {
590 _scanner->error("Dec number is too big: %s", yytext);
591 if (_scanner->isHHFile()) {
592 RETTOKEN(T_HH_ERROR);
598 <ST_IN_SCRIPTING,ST_XHP_IN_TAG>{HNUM} {
600 long ret = strtoull(yytext, NULL, 16);
601 if (errno == ERANGE || ret < 0) {
602 _scanner->error("Hex number is too big: %s", yytext);
603 if (_scanner->isHHFile()) {
604 RETTOKEN(T_HH_ERROR);
610 <ST_VAR_OFFSET>0|([1-9][0-9]*) { /* Offset could be treated as a long */
612 long ret = strtoll(yytext, NULL, 0);
613 if (ret == LLONG_MAX && errno == ERANGE) {
614 _scanner->error("Offset number is too big: %s", yytext);
615 if (_scanner->isHHFile()) {
616 RETTOKEN(T_HH_ERROR);
619 RETTOKEN(T_NUM_STRING);
622 <ST_VAR_OFFSET>{LNUM}|{HNUM} { /* Offset must be treated as a string */
623 RETTOKEN(T_NUM_STRING);
626 <ST_IN_SCRIPTING,ST_XHP_IN_TAG>{DNUM}|{EXPONENT_DNUM} {
630 <ST_IN_SCRIPTING>"__CLASS__" { RETTOKEN(T_CLASS_C); } /* double-underscore constants: one dedicated token each */
631 <ST_IN_SCRIPTING>"__TRAIT__" { RETTOKEN(T_TRAIT_C); }
632 <ST_IN_SCRIPTING>"__FUNCTION__" { RETTOKEN(T_FUNC_C); }
633 <ST_IN_SCRIPTING>"__METHOD__" { RETTOKEN(T_METHOD_C);}
634 <ST_IN_SCRIPTING>"__LINE__" { RETTOKEN(T_LINE); }
635 <ST_IN_SCRIPTING>"__FILE__" { RETTOKEN(T_FILE); }
636 <ST_IN_SCRIPTING>"__DIR__" { RETTOKEN(T_DIR); }
637 <ST_IN_SCRIPTING>"__NAMESPACE__" { RETTOKEN(T_NS_C); }
639 <INITIAL>"#"[^\n]*"\n" {
640 _scanner->setHashBang(yytext, yyleng, T_INLINE_HTML);
641 BEGIN(ST_IN_SCRIPTING);
642 yy_push_state(ST_AFTER_HASHBANG, yyscanner);
643 return T_INLINE_HTML;
646 <INITIAL>(([^<#]|"<"[^?%s<]){1,400})|"<s"|"<" {
647 SETTOKEN(T_INLINE_HTML);
648 BEGIN(ST_IN_SCRIPTING);
649 yy_push_state(ST_IN_HTML, yyscanner);
650 return T_INLINE_HTML;
653 <ST_IN_HTML,ST_AFTER_HASHBANG>(([^<]|"<"[^?%s<]){1,400})|"<s"|"<" {
654 SETTOKEN(T_INLINE_HTML);
656 return T_INLINE_HTML;
659 <INITIAL,ST_IN_HTML,ST_AFTER_HASHBANG>"<?"|("<?php"([ \t]|{NEWLINE}))|"<script"{WHITESPACE}+"language"{WHITESPACE}*"="{WHITESPACE}*("php"|"\"php\""|"\'php\'"){WHITESPACE}*">" {
660 if (_scanner->shortTags() || yyleng > 2) {
661 SETTOKEN(T_OPEN_TAG);
662 if (YY_START == INITIAL) {
663 BEGIN(ST_IN_SCRIPTING);
665 yy_pop_state(yyscanner);
669 SETTOKEN(T_INLINE_HTML);
670 if (YY_START == INITIAL) {
671 BEGIN(ST_IN_SCRIPTING);
672 yy_push_state(ST_IN_HTML, yyscanner);
673 } else if (YY_START == ST_AFTER_HASHBANG) {
676 return T_INLINE_HTML;
680 <INITIAL,ST_IN_HTML,ST_AFTER_HASHBANG>"<%="|"<?=" {
681 if ((yytext[1]=='%' && _scanner->aspTags()) ||
682 (yytext[1]=='?' && _scanner->shortTags())) {
683 if (YY_START == INITIAL) {
684 BEGIN(ST_IN_SCRIPTING);
686 yy_pop_state(yyscanner);
688 RETTOKEN(T_ECHO); //return T_OPEN_TAG_WITH_ECHO;
690 if (YY_START == INITIAL) {
691 BEGIN(ST_IN_SCRIPTING);
692 yy_push_state(ST_IN_HTML, yyscanner);
693 } else if (YY_START == ST_AFTER_HASHBANG) {
696 RETTOKEN(T_INLINE_HTML);
700 <INITIAL,ST_IN_HTML,ST_AFTER_HASHBANG>"<%" {
701 if (_scanner->aspTags()) {
702 if (YY_START == INITIAL) {
703 BEGIN(ST_IN_SCRIPTING);
705 yy_pop_state(yyscanner);
707 RETTOKEN(T_OPEN_TAG);
709 if (YY_START == INITIAL) {
710 BEGIN(ST_IN_SCRIPTING);
711 yy_push_state(ST_IN_HTML, yyscanner);
712 } else if (YY_START == ST_AFTER_HASHBANG) {
715 RETTOKEN(T_INLINE_HTML);
719 <INITIAL,ST_IN_HTML,ST_AFTER_HASHBANG>"<?hh"([ \t]|{NEWLINE}) {
720 if (YY_START == INITIAL) {
721 BEGIN(ST_IN_SCRIPTING);
722 } else if (YY_START == ST_AFTER_HASHBANG) {
723 yy_pop_state(yyscanner);
725 _scanner->error("HH mode: content before <?hh");
729 _scanner->setHHFile();
733 <ST_IN_SCRIPTING,ST_DOUBLE_QUOTES,ST_HEREDOC,ST_BACKQUOTE,ST_VAR_OFFSET>"$"{LABEL} {
734 _scanner->setToken(yytext, yyleng, yytext+1, yyleng-1, T_VARIABLE);
738 <ST_DOUBLE_QUOTES,ST_HEREDOC,ST_BACKQUOTE>"$"{LABEL}"->"[a-zA-Z_\x7f-\xff] {
740 yy_push_state(ST_LOOKING_FOR_PROPERTY, yyscanner);
741 _scanner->setToken(yytext, yyleng, yytext+1, yyleng-1, T_VARIABLE);
745 <ST_DOUBLE_QUOTES,ST_HEREDOC,ST_BACKQUOTE>"$"{LABEL}"[" {
747 yy_push_state(ST_VAR_OFFSET, yyscanner);
748 _scanner->setToken(yytext, yyleng, yytext+1, yyleng-1, T_VARIABLE);
753 yy_pop_state(yyscanner);
757 <ST_VAR_OFFSET>{TOKENS}|[{}\"`] {
758 /* Only '[' can be valid, but returning other tokens will allow
759 a more explicit parse error */
763 <ST_VAR_OFFSET>[ \n\r\t\\\'#] {
764 /* Invalid rule to return a more explicit parse error with proper
767 yy_pop_state(yyscanner);
768 RETSTEP(T_ENCAPSED_AND_WHITESPACE);
771 <ST_IN_SCRIPTING,ST_VAR_OFFSET>{LABEL} {
775 <ST_IN_SCRIPTING,ST_XHP_IN_TAG>{WHITESPACE} {
776 RETSTEP(T_WHITESPACE);
779 <ST_IN_SCRIPTING,ST_XHP_IN_TAG>"#"|"//" {
780 yy_push_state(ST_ONE_LINE_COMMENT, yyscanner);
784 <ST_ONE_LINE_COMMENT>"?"|"%"|">" {
788 <ST_ONE_LINE_COMMENT>[^\n\r?%>]*{ANY_CHAR} {
789 switch (yytext[yyleng-1]) {
798 yy_pop_state(yyscanner);
803 <ST_ONE_LINE_COMMENT>{NEWLINE} {
805 yy_pop_state(yyscanner);
809 <ST_ONE_LINE_COMMENT>"?>"|"%>" {
810 if (_scanner->isHHFile()) {
811 _scanner->error("HH mode: ?> not allowed");
814 if (_scanner->aspTags() || yytext[yyleng-2] != '%') {
815 _scanner->setToken(yytext, yyleng-2, yytext, yyleng-2, T_COMMENT);
817 yy_pop_state(yyscanner);
824 <ST_IN_SCRIPTING,ST_XHP_IN_TAG>"/**"{WHITESPACE} {
825 yy_push_state(ST_DOC_COMMENT, yyscanner);
829 <ST_IN_SCRIPTING,ST_XHP_IN_TAG>"/*" {
830 yy_push_state(ST_COMMENT, yyscanner);
834 <ST_COMMENT,ST_DOC_COMMENT>[^*]+ {
838 <ST_DOC_COMMENT>"*/" {
839 SETTOKEN(T_DOC_COMMENT);
840 yy_pop_state(yyscanner);
841 return T_DOC_COMMENT;
846 yy_pop_state(yyscanner);
850 <ST_COMMENT,ST_DOC_COMMENT>"*" {
854 <ST_XHP_COMMENT>[^-]+ {
858 <ST_XHP_COMMENT>"-->" {
860 yy_pop_state(yyscanner);
864 <ST_XHP_COMMENT>"-" {
868 <ST_IN_SCRIPTING>"?>"{NEWLINE}? {
869 if (_scanner->isHHFile()) {
870 _scanner->error("HH mode: ?> not allowed");
873 yy_push_state(ST_IN_HTML, yyscanner);
874 if (_scanner->full()) {
875 RETSTEP(T_CLOSE_TAG);
881 <ST_IN_SCRIPTING>"</script"{WHITESPACE}*">"{NEWLINE}? {
882 yy_push_state(ST_IN_HTML, yyscanner);
883 if (_scanner->full()) {
884 RETSTEP(T_CLOSE_TAG);
890 <ST_IN_SCRIPTING>"%>"{NEWLINE}? {
891 if (_scanner->aspTags()) {
892 yy_push_state(ST_IN_HTML, yyscanner);
893 if (_scanner->full()) {
894 RETSTEP(T_CLOSE_TAG);
900 _scanner->setToken(yytext, 1, yytext, 1);
905 <ST_IN_SCRIPTING>(b?[\"]{DOUBLE_QUOTES_CHARS}*("{"*|"$"*)[\"]) {
906 int bprefix = (yytext[0] != '"') ? 1 : 0;
908 _scanner->escape(yytext + bprefix + 1,
909 yyleng - bprefix - 2, '"');
910 _scanner->setToken(yytext, yyleng, strval.c_str(), strval.length());
911 return T_CONSTANT_ENCAPSED_STRING;
914 <ST_IN_SCRIPTING>(b?[\']([^\'\\]|("\\"{ANY_CHAR}))*[\']?) {
915 int bprefix = (yytext[0] != '\'') ? 1 : 0;
916 int closed = (yytext[yyleng - 1] == '\'');
918 _scanner->escape(yytext + bprefix + 1,
919 yyleng - bprefix - 2, '\'');
920 _scanner->setToken(yytext, yyleng, strval.c_str(), strval.length());
921 return closed ? T_CONSTANT_ENCAPSED_STRING : T_ENCAPSED_AND_WHITESPACE;
924 <ST_IN_SCRIPTING>b?[\"] {
925 int bprefix = (yytext[0] != '"') ? 1 : 0;
926 _scanner->setToken(yytext, yyleng, yytext + bprefix, yyleng - bprefix);
927 BEGIN(ST_DOUBLE_QUOTES);
931 <ST_IN_SCRIPTING>b?"<<<"{TABS_AND_SPACES}({LABEL}|[']{LABEL}[']|["]{LABEL}["]){NEWLINE} {
932 int bprefix = (yytext[0] != '<') ? 1 : 0;
933 int label_len = yyleng-bprefix-3-1-(yytext[yyleng-2]=='\r'?1:0);
934 char *s = yytext+bprefix+3;
935 while ((*s == ' ') || (*s == '\t')) {
950 _scanner->setHeredocLabel(s, label_len);
951 _scanner->setToken(yytext, yyleng, s, label_len);
952 return T_START_HEREDOC;
955 <ST_IN_SCRIPTING>[`] {
961 <ST_XHP_IN_TAG>{XHPLABEL} {
962 RETTOKEN(T_XHP_LABEL);
969 <ST_XHP_IN_TAG>["][^"]*["] {
970 _scanner->setToken(yytext, yyleng, yytext+1, yyleng-2);
976 yy_push_state(ST_IN_SCRIPTING, yyscanner);
981 STEPPOS(T_XHP_TAG_GT);
986 <ST_XHP_IN_TAG>"/>" {
987 BEGIN(ST_XHP_END_SINGLETON_TAG);
992 <ST_XHP_IN_TAG>{ANY_CHAR} {
993 // This rule ensures we get a reasonable syntax error message
994 // when unexpected characters occur inside XHP tags
996 _scanner->error("Unexpected character in input: '%c' (ASCII=%d)",
997 yytext[0], yytext[0]);
1001 <ST_XHP_END_SINGLETON_TAG>">" {
1002 STEPPOS(T_XHP_TAG_GT);
1003 yy_pop_state(yyscanner);
1004 return T_XHP_TAG_GT;
1007 <ST_XHP_CHILD>"<!--" {
1008 yy_push_state(ST_XHP_COMMENT, yyscanner);
1012 <ST_XHP_CHILD>[^{<]+ {
1013 RETTOKEN(T_XHP_TEXT);
1018 yy_push_state(ST_IN_SCRIPTING, yyscanner);
1022 <ST_XHP_CHILD>"</" {
1023 BEGIN(ST_XHP_END_CLOSE_TAG);
1025 RETSTEP(T_XHP_TAG_LT);
1028 <ST_XHP_END_CLOSE_TAG>"/" {
1032 <ST_XHP_END_CLOSE_TAG>{XHPLABEL} {
1033 RETTOKEN(T_XHP_LABEL);
1036 <ST_XHP_END_CLOSE_TAG>">" {
1037 STEPPOS(T_XHP_TAG_GT);
1038 yy_pop_state(yyscanner);
1039 return T_XHP_TAG_GT;
1043 STEPPOS(T_XHP_TAG_LT);
1044 yy_push_state(ST_XHP_IN_TAG, yyscanner);
1045 return T_XHP_TAG_LT;
1048 <ST_HEREDOC,ST_NOWDOC>{ANY_CHAR} {
1049 int refillResult = EOB_ACT_CONTINUE_SCAN;
1050 std::vector<std::string> docPieces;
1051 size_t totalDocSize = 0;
1052 std::string entireDoc;
1053 int docLabelLen = _scanner->getHeredocLabelLen();
1054 bool isHeredoc = (YYSTATE == ST_HEREDOC);
1060 // The rules that lead to this state all consume an end-of-line.
1061 bool lookingForEndLabel = true;
1063 while (refillResult == EOB_ACT_CONTINUE_SCAN) {
1064 while (YYCURSOR < YYLIMIT) {
1065 switch (*YYCURSOR++) {
1067 lookingForEndLabel = true;
1070 lookingForEndLabel = true;
1073 lookingForEndLabel = false;
1075 if (YYCURSOR == YYLIMIT) {
1077 goto doc_scan_get_more_buffer;
1079 if (IS_LABEL_START(*YYCURSOR) || *YYCURSOR == '{') {
1086 lookingForEndLabel = false;
1088 if (YYCURSOR == YYLIMIT) {
1090 goto doc_scan_get_more_buffer;
1092 if (*YYCURSOR == '$') {
1099 lookingForEndLabel = false;
1101 if (YYCURSOR == YYLIMIT) {
1103 goto doc_scan_get_more_buffer;
1105 if (*YYCURSOR != '\n' && *YYCURSOR != '\r') {
1111 if (lookingForEndLabel) {
1112 lookingForEndLabel = false;
1114 // Check for ending label on this line.
1115 if (!IS_LABEL_START(YYCURSOR[-1])) continue;
1117 // Adjust cursor to the start of the potential label.
1118 // If a label is recognized, we want the cursor pointing at it.
1121 if ((docLabelLen + 2) > (YYLIMIT - YYCURSOR)) {
1122 lookingForEndLabel = true;
1123 goto doc_scan_get_more_buffer;
1126 if (!memcmp(YYCURSOR, _scanner->getHeredocLabel(), docLabelLen)) {
1127 const char *end = YYCURSOR + docLabelLen;
1131 if (*end == '\n' || *end == '\r') {
1132 BEGIN(ST_END_HEREDOC);
1136 ++YYCURSOR; // No label found, consume this character.
1142 doc_scan_get_more_buffer:
1143 // We ran off the end of the buffer, but no end label has been found.
1144 // Save off the string we have so far, re-fill the buffer, and repeat.
1145 yyleng = YYCURSOR - yytext;
1146 docPieces.emplace_back(yytext, yyleng);
1147 if (totalDocSize >= entireDoc.max_size() - yyleng) {
1148 _scanner->error("%sdoc too large", isHeredoc ? "Here" : "Now");
1151 totalDocSize += yyleng;
1153 // yy_get_next_buffer() needs the text pointing at the data we want to keep
1154 // in the buffer, and the cursor pointing off the end. It will move what's
1155 // at yytext (if anything) to the beginning of the buffer and fill the rest
1157 yytext = yytext + yyleng;
1159 YYCURSOR = YYLIMIT + 1;
1160 refillResult = yy_get_next_buffer(yyscanner);
1162 // Point to the beginning of the (possibly new) buffer.
1163 YYCURSOR = yyg->yy_c_buf_p = yytext;
1164 YYLIMIT = YY_CURRENT_BUFFER->yy_ch_buf + yyg->yy_n_chars;
1167 _scanner->error("Unterminated %sdoc at end of file",
1168 isHeredoc ? "here" : "now");
1172 yyleng = YYCURSOR - yytext;
1173 totalDocSize += yyleng;
1176 if (totalDocSize > 0) {
1177 entireDoc.reserve(totalDocSize);
1179 for (const auto& piece: docPieces) {
1180 entireDoc.append(piece);
1184 entireDoc.append(yytext, yyleng);
1187 // Newline before label will be subtracted from returned text, but
1188 // raw text will include it, for zend_highlight/strip, tokenizer, etc.
1190 bool endLabelFound = (YYSTATE == ST_END_HEREDOC);
1191 if (endLabelFound && (entireDoc.length() > 0)) {
1192 auto it = entireDoc.end();
1193 if (*--it == '\n') {
1195 if ((entireDoc.length() > 1) && (*--it == '\r')) {
1202 std::string escapedDoc = _scanner->escape(entireDoc.c_str(),
1203 entireDoc.length() - newline,
1205 _scanner->setToken(entireDoc.c_str(), entireDoc.length(),
1206 escapedDoc.c_str(), escapedDoc.length());
1208 _scanner->setToken(entireDoc.c_str(), entireDoc.length(),
1209 entireDoc.c_str(), entireDoc.length() - newline);
1211 return T_ENCAPSED_AND_WHITESPACE;
1213 // No data before the label means we just go right to ST_END_HEREDOC
1214 // without forming a new token.
1218 <ST_END_HEREDOC>{LABEL} {
1219 BEGIN(ST_IN_SCRIPTING);
1220 RETSTEP(T_END_HEREDOC);
1223 <ST_DOUBLE_QUOTES,ST_BACKQUOTE,ST_HEREDOC>"{$" {
1224 _scanner->setToken(yytext, 1, yytext, 1);
1225 yy_push_state(ST_IN_SCRIPTING, yyscanner);
1227 return T_CURLY_OPEN;
1230 <ST_DOUBLE_QUOTES>{DOUBLE_QUOTES_CHARS}+ {
1231 std::string strval = _scanner->escape(yytext, yyleng, '"');
1232 _scanner->setToken(yytext, yyleng, strval.c_str(), strval.length());
1233 return T_ENCAPSED_AND_WHITESPACE;
1236 <ST_DOUBLE_QUOTES>{DOUBLE_QUOTES_CHARS}*("{"{2,}|"$"{2,}|(("{"+|"$"+)[\"])) {
1238 std::string strval = _scanner->escape(yytext, yyleng, '"');
1239 _scanner->setToken(yytext, yyleng, strval.c_str(), strval.length());
1240 return T_ENCAPSED_AND_WHITESPACE;
1243 <ST_BACKQUOTE>{BACKQUOTE_CHARS}+ {
1244 std::string strval = _scanner->escape(yytext, yyleng, '`');
1245 _scanner->setToken(yytext, yyleng, strval.c_str(), strval.length());
1246 return T_ENCAPSED_AND_WHITESPACE;
1249 <ST_BACKQUOTE>{BACKQUOTE_CHARS}*("{"{2,}|"$"{2,}|(("{"+|"$"+)[`])) {
1251 std::string strval = _scanner->escape(yytext, yyleng, '`');
1252 _scanner->setToken(yytext, yyleng, strval.c_str(), strval.length());
1253 return T_ENCAPSED_AND_WHITESPACE;
1256 <ST_DOUBLE_QUOTES>[\"] {
1257 BEGIN(ST_IN_SCRIPTING);
1261 <ST_BACKQUOTE>[\`] {
1262 BEGIN(ST_IN_SCRIPTING);
1266 <ST_COMMENT,ST_DOC_COMMENT><<EOF>> {
1267 _scanner->error("Unterminated comment at end of file");
1272 _scanner->error("Unexpected character in input: '%c' (ASCII=%d)",
1273 yytext[0], yytext[0]);
1279 void Scanner::init() {
1280 yylex_init_extra(this, &m_yyscanner);
1281 struct yyguts_t *yyg = (struct yyguts_t *)m_yyscanner;
1285 int Scanner::scan() {
1286 return yylex(m_token, m_loc, m_yyscanner);
1289 void Scanner::reset() {
1290 void *yyscanner = (void *)m_yyscanner;
1291 struct yyguts_t *yyg = (struct yyguts_t *)m_yyscanner;
1293 yylex_destroy(m_yyscanner);
1296 static void suppress_unused_errors() {
1299 suppress_unused_errors();
1304 int yywrap(yyscan_t yyscanner) {